feadulta/scripts/minimax_tts.py

#!/usr/bin/env python3
"""TTS con MiniMax (clonación de voz + síntesis de calidad). Issue #76.

Credenciales en /home/rafa/Feadulta/minimax.txt:
  - la API key (línea que empieza por 'sk-api-')
  - el GroupId (línea 'GroupId=...' o 'group_id ...' o un número suelto)

Subcomandos:
  clone <audio.wav> <voice_id>                  sube y clona (voice_id: >=8 chars, letras+números)
  carta <post_id> <voice_id> [model] [nombre]   locuta una carta entera
  text  "<texto>" <voice_id> [model] [nombre]   locuta texto suelto
models: speech-2.8-turbo (barato) | speech-2.8-hd (calidad)
"""
import html
import json
import os
import re
import subprocess
import sys
from pathlib import Path

import requests

# Path of the credentials file parsed by creds().
CRED = "/home/rafa/Feadulta/minimax.txt"
# Base URL that every MiniMax endpoint used in this script is appended to.
BASE = "https://api.minimax.io/v1"
# Output directory for generated audio, resolved relative to the directory
# two levels above this script file.
OUT = Path(__file__).resolve().parent.parent / "wordpress/wp-content/uploads/tts-samples"
# Docker container that get_post_text() targets with `docker exec` / `docker cp`.
CONTAINER = "wordpress-web"


def creds(path=None):
    """Read the MiniMax API key and GroupId from the credentials file.

    Lines are matched independently, in this order:
      * a line starting with ``sk-`` is the API key; when several are
        present the last one wins (assumed to be the most recent);
      * a line containing ``groupid`` or ``group_id`` (case-insensitive)
        carries the GroupId after a ``=``, ``:`` or whitespace separator;
      * a line made only of digits is taken as the GroupId.

    Args:
        path: credentials file to read; defaults to the module-level CRED.

    Returns:
        A ``(key, group_id)`` tuple. Either element is None when it was not
        found, and both are None when the file cannot be opened, so the
        caller can report the missing key instead of crashing at import.
    """
    path = CRED if path is None else path
    try:
        # utf-8-sig also accepts a plain UTF-8 file and drops a leading BOM,
        # which would otherwise hide an "sk-" key sitting on the first line.
        with open(path, encoding="utf-8-sig") as fh:
            lines = [ln.strip() for ln in fh]
    except OSError:
        return None, None

    key = gid = None
    for ln in lines:
        if not ln:
            continue
        if ln.startswith("sk-"):
            key = ln
        elif "groupid" in ln.lower() or "group_id" in ln.lower():
            fields = re.split(r"[=:\s]+", ln, maxsplit=1)
            # A bare "GroupId" label with no value must not raise IndexError.
            if len(fields) == 2 and fields[1].strip():
                gid = fields[1].strip()
        elif ln.isdigit():
            gid = ln
    return key, gid


# Credentials are loaded once, when the module is imported; main() aborts
# with a message when KEY is empty.
KEY, GID = creds()
# Headers shared by the JSON endpoints called below (voice_clone, t2a_v2).
H_JSON = {"Authorization": f"Bearer {KEY}", "Content-Type": "application/json"}


def _q(url):
    """Return *url* with a ``GroupId`` query string appended when GID is set.

    When no GroupId was found in the credentials file the URL is returned
    untouched.
    """
    if not GID:
        return url
    return f"{url}?GroupId={GID}"


def upload(path, purpose="voice_clone"):
    """Upload a local file to MiniMax and return its ``file_id``.

    Args:
        path: local file to send as the multipart ``file`` field.
        purpose: value of the ``purpose`` form field.

    Returns:
        The ``file_id`` reported under ``file`` in the JSON response.

    Exits the process (``sys.exit``) with a diagnostic when the response is
    not JSON or carries no ``file_id``.
    """
    # Context manager: the handle is closed even if the request raises.
    with open(path, "rb") as fh:
        r = requests.post(_q(f"{BASE}/files/upload"),
                          headers={"Authorization": f"Bearer {KEY}"},
                          data={"purpose": purpose},
                          files={"file": fh},
                          # (connect, read) seconds; without it a stalled
                          # connection would hang the script forever.
                          timeout=(10, 600))
    try:
        j = r.json()
    except ValueError:
        # Gateway/HTML error pages are not JSON; report them like any other
        # upload failure instead of dying with a traceback.
        sys.exit(f"upload falló: HTTP {r.status_code}: {r.text[:400]}")
    fid = (j.get("file") or {}).get("file_id")
    if not fid:
        sys.exit(f"upload falló: {json.dumps(j)[:400]}")
    print(f"  file_id={fid}")
    return fid


def clone(audio, voice_id):
    """Upload *audio* and ask MiniMax to clone it under *voice_id*.

    The (truncated) JSON answer of the clone request is printed; nothing is
    returned.
    """
    print(f"Subiendo {audio}…", flush=True)
    file_id = upload(audio, "voice_clone")

    print(f"Clonando como voice_id={voice_id}…", flush=True)
    payload = {"file_id": file_id, "voice_id": voice_id, "model": "speech-2.8-hd"}
    response = requests.post(_q(f"{BASE}/voice_clone"), headers=H_JSON, json=payload)

    # Only the first 500 characters of the answer are shown.
    answer = json.dumps(response.json(), ensure_ascii=False)
    print(answer[:500])


def get_post_text(pid):
    """Fetch a post from the WordPress container and return it as plain text.

    Runs ``/tmp/fea_post_io.php get <pid>`` inside the container, copies
    ``/tmp/fea_es.json`` out of it and reads the ``title`` and ``content``
    keys of that JSON file.
    NOTE(review): the PHP helper is assumed to write /tmp/fea_es.json inside
    the container — confirm against that script.

    Args:
        pid: post id handed to the PHP helper.

    Returns:
        ``(title, text)`` where *text* holds the cleaned paragraphs joined
        by blank lines and cut after the author's signature line.

    Raises:
        subprocess.CalledProcessError: when either docker command fails.
    """
    subprocess.run(["docker", "exec", CONTAINER, "php", "/tmp/fea_post_io.php", "get", str(pid)],
                   check=True, capture_output=True)
    subprocess.run(["docker", "cp", f"{CONTAINER}:/tmp/fea_es.json", "/tmp/fea_es.json"], check=True)
    # Close the handle deterministically and decode as UTF-8 (the JSON
    # interchange encoding) rather than whatever the locale happens to be.
    with open("/tmp/fea_es.json", encoding="utf-8") as fh:
        d = json.load(fh)
    # Block-level closers become line breaks before the remaining tags go.
    raw = re.sub(r"(?i)</p>|<br\s*/?>|</h[1-6]>", "\n", d["content"])
    raw = re.sub(r"<[^>]+>", "", raw)
    # Drop [bracketed] spans such as shortcodes.
    raw = re.sub(r"\[[^\]]+\]", "", raw)
    raw = html.unescape(raw)
    # Collapse inner whitespace; lines of at most one character are noise.
    paras = [re.sub(r"\s+", " ", p).strip() for p in raw.split("\n") if len(p.strip()) > 1]
    paras = trim_after_author_signature(paras)
    return d["title"], "\n\n".join(paras)


def is_author_signature(text):
    """Heuristically decide whether *text* is the author's closing signature.

    A signature is a short line (at most 80 characters, 2 to 6 words) with
    no digits and none of ``: ; http www. @``, in which every word either
    starts with an uppercase letter or is one of the Spanish particles
    de/del/la/las/los/y/e. The caller keeps this line and drops everything
    after it (URLs, notes, annexes or extra blocks).
    """
    candidate = text.strip()
    if not candidate:
        return False
    if len(candidate) > 80:
        return False
    if any(ch.isdigit() for ch in candidate):
        return False

    forbidden = (":", ";", "http", "www.", "@")
    if any(token in candidate for token in forbidden):
        return False

    tokens = candidate.split()
    if not 2 <= len(tokens) <= 6:
        return False

    particles = {"de", "del", "la", "las", "los", "y", "e"}

    def looks_like_name_part(token):
        # Strip punctuation; a token that was nothing but punctuation fails.
        bare = re.sub(r"[^\wÁÉÍÓÚÜÑáéíóúüñ-]", "", token)
        if not bare:
            return False
        return bare.lower() in particles or bare[0].isupper()

    return all(looks_like_name_part(token) for token in tokens)


def trim_after_author_signature(paras):
    """Return the paragraphs up to and including the first signature line.

    When no paragraph is recognised as a signature, all of them are
    returned. The result is always a new list.
    """
    items = list(paras)
    cut = next((idx for idx, para in enumerate(items) if is_author_signature(para)), None)
    return items if cut is None else items[:cut + 1]


def _sent_pause(n_words, short, long_):
    """Pausa (s) tras un punto, proporcional a la longitud de la frase que cierra:
    frase corta → pausa corta; frase larga → el narrador 'respira' más."""
    if n_words < short:
        return os.environ.get("FEA_PAUSE_SHORT", "0.1")
    if n_words <= long_:
        return os.environ.get("FEA_PAUSE_MID", "0.2")
    return os.environ.get("FEA_PAUSE_LONG", "0.3")


def ensure_terminal_punctuation(block):
    """Return *block* stripped and guaranteed to end in closing punctuation.

    A block that ends "dry" (a title, a paragraph without a final stop) gets
    a period appended so the synthesized intonation is closed. Blocks that
    already end in one of ``. ! ? … : ;`` are returned as they are, and a
    blank block yields the empty string.
    """
    trimmed = block.strip()
    if not trimmed:
        return ""
    if trimmed.endswith((".", "!", "?", "…", ":", ";")):
        return trimmed
    return trimmed + "."


def expand_bible_abbreviations(text):
    """Expand Spanish Bible-book abbreviations that appear as citations.

    Examples:
    - Mt 5, 1-12    -> Mateo 5, 1-12
    - Lc. 2, 10     -> Lucas 2, 10
    - 1 Cor 13, 4   -> Primera carta a los Corintios 13, 4
    - 1Jn 4, 8      -> Primera carta de Juan 4, 8

    An abbreviation is only expanded when it is followed (after an optional
    period) by whitespace and a digit, i.e. a chapter/verse, so that
    non-biblical uses of the same letters are left alone. Books with a
    leading ordinal are accepted both glued ("1Cor") and spaced ("1 Cor");
    the gap may not span a line break, so a paragraph ending in "1" is never
    merged with a following paragraph that starts with a citation.

    All abbreviations are matched in a single pass, so text produced by one
    expansion is never re-scanned by another.
    """
    books = [
        ("1Cor", "Primera carta a los Corintios"),
        ("2Cor", "Segunda carta a los Corintios"),
        ("1Co", "Primera carta a los Corintios"),
        ("2Co", "Segunda carta a los Corintios"),
        ("1Tes", "Primera carta a los Tesalonicenses"),
        ("2Tes", "Segunda carta a los Tesalonicenses"),
        ("1Ts", "Primera carta a los Tesalonicenses"),
        ("2Ts", "Segunda carta a los Tesalonicenses"),
        ("1Tim", "Primera carta a Timoteo"),
        ("2Tim", "Segunda carta a Timoteo"),
        ("1Pe", "Primera carta de Pedro"),
        ("2Pe", "Segunda carta de Pedro"),
        ("1P", "Primera carta de Pedro"),
        ("2P", "Segunda carta de Pedro"),
        ("1Jn", "Primera carta de Juan"),
        ("2Jn", "Segunda carta de Juan"),
        ("3Jn", "Tercera carta de Juan"),
        ("1Mac", "Primer libro de los Macabeos"),
        ("2Mac", "Segundo libro de los Macabeos"),
        ("1Sam", "Primer libro de Samuel"),
        ("2Sam", "Segundo libro de Samuel"),
        ("1Sm", "Primer libro de Samuel"),
        ("2Sm", "Segundo libro de Samuel"),
        ("1Re", "Primer libro de los Reyes"),
        ("2Re", "Segundo libro de los Reyes"),
        ("1Cr", "Primer libro de las Crónicas"),
        ("2Cr", "Segundo libro de las Crónicas"),
        ("Hch", "Hechos de los Apóstoles"),
        ("Rom", "Romanos"),
        ("Rm", "Romanos"),
        ("Gal", "Gálatas"),
        ("Gál", "Gálatas"),
        ("Ef", "Efesios"),
        ("Flp", "Filipenses"),
        ("Fil", "Filipenses"),
        ("Col", "Colosenses"),
        ("Tit", "Tito"),
        ("Flm", "Filemón"),
        ("Heb", "Hebreos"),
        ("Sant", "Santiago"),
        ("St", "Santiago"),
        ("Sto", "Santiago"),
        ("Jud", "Judas"),
        ("Ap", "Apocalipsis"),
        ("Mt", "Mateo"),
        ("Mc", "Marcos"),
        ("Lc", "Lucas"),
        ("Jn", "Juan"),
        ("Gn", "Génesis"),
        ("Gen", "Génesis"),
        ("Ex", "Éxodo"),
        ("Lv", "Levítico"),
        ("Lev", "Levítico"),
        ("Nm", "Números"),
        ("Num", "Números"),
        ("Dt", "Deuteronomio"),
        ("Jos", "Josué"),
        ("Jue", "Jueces"),
        ("Rut", "Rut"),
        ("Esd", "Esdras"),
        ("Neh", "Nehemías"),
        ("Tob", "Tobías"),
        ("Jdt", "Judit"),
        ("Est", "Ester"),
        ("Job", "Job"),
        ("Sal", "Salmos"),
        ("Prov", "Proverbios"),
        ("Cant", "Cantar de los Cantares"),
        ("Sab", "Sabiduría"),
        ("Eclo", "Eclesiástico"),
        ("Sir", "Eclesiástico"),
        # "Ecl" is Ecclesiastes (Qohelet); Sirach is "Eclo"/"Sir" above.
        ("Ecl", "Eclesiastés"),
        ("Isa", "Isaías"),
        ("Is", "Isaías"),
        ("Jer", "Jeremías"),
        ("Jr", "Jeremías"),
        ("Lam", "Lamentaciones"),
        ("Bar", "Baruc"),
        ("Eze", "Ezequiel"),
        ("Ez", "Ezequiel"),
        ("Dan", "Daniel"),
        ("Dn", "Daniel"),
        ("Os", "Oseas"),
        ("Joel", "Joel"),
        ("Am", "Amós"),
        ("Abd", "Abdías"),
        ("Jon", "Jonás"),
        ("Miq", "Miqueas"),
        ("Nah", "Nahúm"),
        ("Hab", "Habacuc"),
        ("Sof", "Sofonías"),
        ("Ag", "Ageo"),
        ("Zac", "Zacarías"),
        ("Mal", "Malaquías"),
    ]
    names = dict(books)

    def alternative(short):
        # "1Cor" must also match "1 Cor": allow horizontal whitespace (never
        # a newline) between the ordinal digit and the book letters.
        if short[0].isdigit():
            return short[0] + r"[^\S\r\n]*" + re.escape(short[1:])
        return re.escape(short)

    # The trailing lookahead forces the whole abbreviation to be followed by
    # "<space><digit>", so a short entry ("Is") can never steal a match from
    # a longer one ("Isa"); alternation order is therefore irrelevant.
    citation = re.compile(
        r"\b(" + "|".join(alternative(short) for short in names) + r")\.?(?=\s+\d)"
    )
    # Whitespace inside the match ("1 Cor") is removed to recover the key.
    return citation.sub(lambda m: names["".join(m.group(1).split())], text)


def add_pauses(text, para=None):
    """Insert MiniMax ``<#seconds#>`` pause markers into *text*.

    - Bible-book abbreviations are expanded first.
    - Paragraphs (separated by blank lines) that lack final punctuation get
      a period, so the synthesized intonation is closed.
    - After every sentence end a pause proportional to the number of words
      of that sentence is inserted (see ``_sent_pause``; thresholds come
      from FEA_SHORT_WORDS / FEA_LONG_WORDS, defaults 6 and 12).
    - Paragraphs are joined with the paragraph pause *para* (default: the
      FEA_PARA_PAUSE environment variable, else "0.7").

    A sentence end is a run of ``. ! ? …`` — plus any closing quotes or
    brackets attached to it — followed by whitespace or the end of the
    paragraph. Dots inside numbers, verse lists or URLs ("1.000", "12.14",
    "www.x.com") are therefore not treated as sentence ends and never get a
    marker injected in the middle.

    Args:
        text: plain text with paragraphs separated by blank lines.
        para: pause (seconds) between paragraphs; None to use the default.

    Returns:
        A single string with all paragraphs joined by pause markers.

    Raises:
        ValueError: if FEA_SHORT_WORDS or FEA_LONG_WORDS is not an integer.
    """
    para = para if para is not None else os.environ.get("FEA_PARA_PAUSE", "0.7")
    short = int(os.environ.get("FEA_SHORT_WORDS", "6"))
    long_ = int(os.environ.get("FEA_LONG_WORDS", "12"))
    text = expand_bible_abbreviations(text)
    # Closing quotes/brackets stay glued to the punctuation so the pause is
    # placed after them rather than between "." and "»".
    sentence_end = re.compile(r"([.!?…]+[\"'»”’)\]]*)(?=\s|$)")
    out = []
    for p in text.split("\n\n"):
        p = ensure_terminal_punctuation(p)
        if not p:
            continue
        # With one capturing group, split() alternates sentence / sign.
        parts = sentence_end.split(p)
        pieces = []
        for i in range(0, len(parts), 2):
            frase = parts[i]
            sign = parts[i + 1] if i + 1 < len(parts) else ""
            pieces.append(frase + sign)
            if sign and frase.strip():
                pieces.append(f" <#{_sent_pause(len(frase.split()), short, long_)}#> ")
        rebuilt = "".join(pieces)
        # Drop the last sentence pause: the paragraph separator brings its own.
        rebuilt = re.sub(r"\s*<#[\d.]+#>\s*$", "", rebuilt)
        out.append(rebuilt.strip())
    return f" <#{para}#> ".join(out)


# MiniMax limita a 10.000 car por petición; dejamos margen porque las pausas
# <#seg#> y el language_boost también cuentan.
CHAR_LIMIT = 8000


def _split_for_tts(text, limit=CHAR_LIMIT):
    """Trocea respetando las pausas <#..#> (frase/párrafo). Fallback por palabras
    si una frase suelta supera el límite."""
    if len(text) <= limit:
        return [text]
    parts = re.split(r"(\s*<#[\d.]+#>\s*)", text)
    chunks, cur = [], ""
    for seg in parts:
        if not seg:
            continue
        if len(cur) + len(seg) <= limit:
            cur += seg
            continue
        if cur.strip():
            chunks.append(cur.strip())
        if len(seg) > limit:                       # frase gigantesca: parte por palabras
            cur = ""
            for w in seg.split(" "):
                if len(cur) + len(w) + 1 <= limit:
                    cur += (" " if cur else "") + w
                else:
                    if cur:
                        chunks.append(cur)
                    cur = w
        else:
            cur = seg
    if cur.strip():
        chunks.append(cur.strip())
    return chunks


def _synth_chunk(text, voice_id, model):
    """Send one t2a_v2 request.

    Args:
        text: text to synthesize (pause markers included).
        voice_id: MiniMax voice identifier.
        model: MiniMax speech model name.

    Returns:
        ``(audio_bytes, 0, usage_chars)`` on success, or ``(None, rc, 0)``
        on failure, where *rc* is the API's ``base_resp.status_code`` when
        available and ``-1`` otherwise. *rc* is never 0 on failure. Network
        errors, non-JSON bodies and undecodable audio are reported through
        this failure tuple instead of being raised.
    """
    body = {
        "model": model,
        "text": text,
        "voice_setting": {"voice_id": voice_id, "speed": 1.0, "vol": 1.0, "pitch": 0},
        "audio_setting": {"sample_rate": 32000, "bitrate": 128000, "format": "mp3", "channel": 1},
        "language_boost": "Spanish",
    }
    try:
        # (connect, read) seconds: generous for long texts, but finite so a
        # stalled connection cannot hang the run forever.
        r = requests.post(f"{BASE}/t2a_v2", headers=H_JSON, json=body, timeout=(10, 600))
        j = r.json()
    except (requests.RequestException, ValueError) as exc:
        print(f"t2a falló: {exc}")
        return None, -1, 0
    audio_hex = (j.get("data") or {}).get("audio")
    if not audio_hex:
        rc = (j.get("base_resp") or {}).get("status_code") or -1
        print(f"t2a falló: {json.dumps(j, ensure_ascii=False)[:300]}")
        return None, rc, 0
    try:
        # The audio payload arrives hex-encoded.
        audio = bytes.fromhex(audio_hex)
    except ValueError as exc:
        print(f"t2a falló: {exc}")
        return None, -1, 0
    usage = (j.get("extra_info") or {}).get("usage_characters", 0)
    return audio, 0, usage


def t2a(text, voice_id, model, name):
    """Synthesize *text* and write the finished audio to ``OUT/<name>.mp3``.

    The text is split into request-sized chunks. A single chunk is written
    directly; several chunks are synthesized one by one (sleeping
    FEA_CHUNK_PAUSE seconds, default 35, between requests) and concatenated
    with ffmpeg. A finishing ffmpeg pass then mixes in low-level brown noise
    and applies a fade in/out.

    Args:
        text: text to synthesize, pause markers included.
        voice_id: MiniMax voice identifier.
        model: MiniMax speech model name.
        name: base name (no extension) of the output file inside OUT.

    Returns:
        0 on success. On failure, the code reported by ``_synth_chunk`` or
        the non-zero exit status of the ffmpeg step that failed. Audio that
        was already synthesized is left on disk when an ffmpeg step fails.

    Raises:
        ValueError: if FEA_CHUNK_PAUSE is not an integer.
    """
    import time  # not imported at module level

    chunks = _split_for_tts(text)
    print(f"Sintetizando {len(text)} car con {model} / {voice_id} "
          f"({len(chunks)} petición/es)…", flush=True)
    # Make sure the target directory exists before any request is paid for.
    OUT.mkdir(parents=True, exist_ok=True)
    raw = OUT / f"{name}.raw.mp3"
    if len(chunks) == 1:
        audio, rc, _ = _synth_chunk(chunks[0], voice_id, model)
        if audio is None:
            return rc
        raw.write_bytes(audio)
    else:
        # Parsed up front so a malformed value fails before the first request.
        chunk_pause = int(os.environ.get("FEA_CHUNK_PAUSE", "35"))
        parts = []
        for k, ch in enumerate(chunks):
            if k > 0:
                time.sleep(chunk_pause)  # respect MiniMax's TPM rate limit
            print(f"  trozo {k + 1}/{len(chunks)} ({len(ch)} car)…", flush=True)
            audio, rc, _ = _synth_chunk(ch, voice_id, model)
            if audio is None:
                for p in parts:
                    p.unlink(missing_ok=True)
                return rc
            p = OUT / f"{name}.part{k}.mp3"
            p.write_bytes(audio)
            parts.append(p)
        args = ["ffmpeg", "-y"]
        for p in parts:
            args += ["-i", str(p)]
        n = len(parts)
        filt = "".join(f"[{k}:a]" for k in range(n)) + f"concat=n={n}:v=0:a=1[a]"
        args += ["-filter_complex", filt, "-map", "[a]", "-b:a", "128k", str(raw)]
        joined = subprocess.run(args, capture_output=True)
        if joined.returncode != 0:
            # Keep the parts: they are the audio that was already paid for.
            err = joined.stderr.decode(errors="replace")[-300:]
            print(f"ffmpeg (concat) falló; se conservan los trozos {name}.part*.mp3: {err}")
            return joined.returncode
        for p in parts:
            p.unlink(missing_ok=True)
    # Finishing pass: brown comfort noise + fade in/out (removes the final "bump").
    probe = subprocess.run(["ffprobe", "-v", "error", "-show_entries", "format=duration",
                            "-of", "default=noprint_wrappers=1:nokey=1", str(raw)],
                           capture_output=True, text=True)
    try:
        dur = float(probe.stdout.strip() or "0")
    except ValueError:
        dur = 0.0  # ffprobe printed something that is not a number (e.g. "N/A")
    st = max(0.0, dur - 0.5)  # the fade-out covers the last half second
    mp3 = OUT / f"{name}.mp3"
    done = subprocess.run(["ffmpeg", "-y", "-i", str(raw), "-filter_complex",
                           "anoisesrc=color=brown:amplitude=0.004:sample_rate=32000[n];"
                           "[n]highpass=f=120,lowpass=f=3800[nf];"
                           "[0:a][nf]amix=inputs=2:duration=first:dropout_transition=0:normalize=0[m];"
                           f"[m]afade=t=in:st=0:d=0.08,afade=t=out:st={st:.2f}:d=0.5[a]",
                           "-map", "[a]", "-b:a", "128k", str(mp3)], capture_output=True)
    if done.returncode != 0:
        # Do not report success, and do not delete the unprocessed audio.
        err = done.stderr.decode(errors="replace")[-300:]
        print(f"ffmpeg (acabado) falló; se conserva {raw}: {err}")
        return done.returncode
    raw.unlink(missing_ok=True)
    print(f"OK -> {mp3}  ({dur:.0f}s)")
    return 0


def main():
    """Command-line entry point: dispatch the clone / carta / text subcommands.

    The module docstring is printed as usage help when the subcommand is
    missing or unknown, when its two mandatory arguments are not given, or
    when the post id of ``carta`` is not an integer. The process exits with
    status 1 when synthesis fails.
    """
    argv = sys.argv
    if len(argv) < 2:
        sys.exit(__doc__)
    cmd = argv[1]
    if not KEY:
        sys.exit("No encuentro la API key en " + CRED)
    # Every subcommand takes at least two positional arguments.
    if cmd not in ("clone", "carta", "text") or len(argv) < 4:
        sys.exit(__doc__)

    if cmd == "clone":
        clone(argv[2], argv[3])
        return

    voice_id = argv[3]
    model = argv[4] if len(argv) > 4 else "speech-2.8-turbo"
    if cmd == "carta":
        pid = argv[2]
        try:
            post_id = int(pid)
        except ValueError:
            sys.exit(__doc__)
        title, text = get_post_text(post_id)
        # Default name ends with the model's last dash-separated component.
        name = argv[5] if len(argv) > 5 else f"carta-minimax-{pid}-{model.split('-')[-1]}"
        text = add_pauses(text)
        print(f"Post #{pid}: «{title}» ({len(text)} car con pausas)")
    else:  # "text": the text is synthesized as given, without added pauses
        text = argv[2]
        name = argv[5] if len(argv) > 5 else "minimax-text"

    # t2a returns 0 only on success; anything else (including None) is a failure.
    if t2a(text, voice_id, model, name) != 0:
        sys.exit(1)


if __name__ == "__main__":
    main()