431 lines
15 KiB
Python
431 lines
15 KiB
Python
#!/usr/bin/env python3
|
|
"""TTS con MiniMax (clonación de voz + síntesis de calidad). Issue #76.
|
|
|
|
Credenciales en /home/rafa/Feadulta/minimax.txt:
|
|
- la API key (línea que empieza por 'sk-api-')
|
|
- el GroupId (línea 'GroupId=...' o 'group_id ...' o un número suelto)
|
|
|
|
Subcomandos:
|
|
clone <audio.wav> <voice_id> sube y clona (voice_id: >=8 chars, letras+números)
|
|
carta <post_id> <voice_id> [model] [nombre] locuta una carta entera
|
|
text "<texto>" <voice_id> [model] [nombre] locuta texto suelto
|
|
models: speech-2.8-turbo (barato) | speech-2.8-hd (calidad)
|
|
"""
|
|
import html
|
|
import json
|
|
import os
|
|
import re
|
|
import subprocess
|
|
import sys
|
|
from pathlib import Path
|
|
|
|
import requests
|
|
|
|
# Plain-text file that holds the MiniMax API key and GroupId.
CRED = "/home/rafa/Feadulta/minimax.txt"

# Root URL of the MiniMax REST API.
BASE = "https://api.minimax.io/v1"

# Generated audio goes under the directory one level above this script's folder.
_SCRIPT_DIR = Path(__file__).resolve().parent
OUT = _SCRIPT_DIR.parent / "wordpress/wp-content/uploads/tts-samples"

# Container name handed to `docker exec` / `docker cp`.
CONTAINER = "wordpress-web"
|
|
|
|
|
|
def creds(path=None):
    """Read the MiniMax API key and GroupId from the credentials file.

    Blank lines are skipped. A line starting with ``sk-`` is taken as the API
    key; when several are present the last one wins. A line containing
    ``groupid`` or ``group_id`` (case-insensitive) supplies the GroupId as the
    text after the first ``=``, ``:`` or whitespace run. A line made only of
    digits is also taken as the GroupId.

    Args:
        path: Credentials file to read. Defaults to the module-level ``CRED``.

    Returns:
        A ``(key, group_id)`` tuple. Either element is ``None`` when it was not
        found; both are ``None`` when the file does not exist.
    """
    if path is None:
        path = CRED
    try:
        # utf-8-sig tolerates a BOM, which would otherwise hide a leading "sk-".
        with open(path, encoding="utf-8-sig") as fh:
            lines = [raw.strip() for raw in fh]
    except FileNotFoundError:
        # Report "not found" through the return value instead of raising.
        return None, None

    key = gid = None
    for ln in lines:
        if not ln:
            continue
        if ln.startswith("sk-"):
            key = ln  # keep the last key in the file (the most recent one)
        elif "groupid" in ln.lower() or "group_id" in ln.lower():
            fields = re.split(r"[=:\s]+", ln, maxsplit=1)
            # A label with no value ("GroupId") must not crash nor clear gid.
            if len(fields) == 2 and fields[1].strip():
                gid = fields[1].strip()
        elif ln.isdigit():
            gid = ln
    return key, gid
|
|
|
|
|
|
# Credentials are read once, when the module is imported.
KEY, GID = creds()

# Headers for JSON requests; the API key travels as a bearer token.
H_JSON = {
    "Authorization": f"Bearer {KEY}",
    "Content-Type": "application/json",
}
|
|
|
|
|
|
def _q(url):
    """Return ``url`` with ``?GroupId=<GID>`` appended, or unchanged when GID is falsy."""
    if not GID:
        return url
    return f"{url}?GroupId={GID}"
|
|
|
|
|
|
def upload(path, purpose="voice_clone"):
    """Upload a local file to the MiniMax files endpoint and return its file id.

    Args:
        path: Path of the file to send.
        purpose: Value of the ``purpose`` form field.

    Returns:
        The ``file_id`` found under ``file`` in the JSON response.

    Exits the process with an error message when the response is not JSON or
    carries no ``file_id``.
    """
    # The context manager closes the handle even if the request raises.
    with open(path, "rb") as fh:
        r = requests.post(_q(f"{BASE}/files/upload"),
                          headers={"Authorization": f"Bearer {KEY}"},
                          data={"purpose": purpose},
                          files={"file": fh},
                          timeout=(10, 300))  # (connect, read) seconds; never hang forever
    try:
        j = r.json()
    except ValueError:
        # Non-JSON body (e.g. a gateway error page): show it instead of a traceback.
        sys.exit(f"upload falló: HTTP {r.status_code}: {r.text[:400]}")
    fid = (j.get("file") or {}).get("file_id")
    if not fid:
        sys.exit(f"upload falló: {json.dumps(j)[:400]}")
    print(f" file_id={fid}")
    return fid
|
|
|
|
|
|
def clone(audio, voice_id):
    """Upload an audio sample and ask MiniMax to clone it as ``voice_id``.

    Prints progress and the first 500 characters of the JSON reply. Exits with
    a non-zero status when the reply is not JSON or when its
    ``base_resp.status_code`` is non-zero.

    Args:
        audio: Path of the audio file to upload.
        voice_id: Identifier to give the cloned voice.
    """
    print(f"Subiendo {audio}…", flush=True)
    fid = upload(audio, "voice_clone")
    print(f"Clonando como voice_id={voice_id}…", flush=True)
    r = requests.post(_q(f"{BASE}/voice_clone"), headers=H_JSON,
                      json={"file_id": fid, "voice_id": voice_id, "model": "speech-2.8-hd"},
                      timeout=(10, 300))  # (connect, read) seconds; never hang forever
    try:
        j = r.json()
    except ValueError:
        sys.exit(f"voice_clone falló: HTTP {r.status_code}: {r.text[:400]}")
    print(json.dumps(j, ensure_ascii=False)[:500])
    # Surface an API-reported failure in the exit status, not only in the output.
    if (j.get("base_resp") or {}).get("status_code"):
        sys.exit(1)
|
|
|
|
|
|
def get_post_text(pid):
    """Fetch a post through the WordPress container and return it as plain text.

    Runs ``php /tmp/fea_post_io.php get <pid>`` inside the container, copies
    ``/tmp/fea_es.json`` from the container to the host and reads its
    ``title`` and ``content`` fields. The content is converted to paragraphs:
    ``</p>``, ``<br>`` and closing heading tags become line breaks, every other
    tag and every ``[...]`` span is removed, HTML entities are unescaped,
    whitespace is collapsed, and lines of one character or less are dropped.
    Anything after the author's signature is discarded.

    Args:
        pid: Post id; passed to the PHP helper as a string.

    Returns:
        ``(title, text)`` where ``text`` has paragraphs separated by blank lines.

    Raises:
        subprocess.CalledProcessError: If either docker command fails.
    """
    subprocess.run(["docker", "exec", CONTAINER, "php", "/tmp/fea_post_io.php", "get", str(pid)],
                   check=True, capture_output=True)
    subprocess.run(["docker", "cp", f"{CONTAINER}:/tmp/fea_es.json", "/tmp/fea_es.json"], check=True)
    # Binary mode lets json detect the encoding itself (independent of the
    # locale), and the context manager closes the handle.
    with open("/tmp/fea_es.json", "rb") as fh:
        d = json.load(fh)
    raw = re.sub(r"(?i)</p>|<br\s*/?>|</h[1-6]>", "\n", d["content"])
    raw = re.sub(r"<[^>]+>", "", raw)
    raw = re.sub(r"\[[^\]]+\]", "", raw)
    raw = html.unescape(raw)
    paras = [re.sub(r"\s+", " ", p).strip() for p in raw.split("\n") if len(p.strip()) > 1]
    paras = trim_after_author_signature(paras)
    return d["title"], "\n\n".join(paras)
|
|
|
|
|
|
def is_author_signature(text):
    """Heuristically decide whether a paragraph is the author's closing signature.

    The signature line itself is meant to be kept while everything after it
    (URLs, notes, appendices or extra blocks) is cut, without mistaking
    in-article headings for it.

    A paragraph qualifies when, after stripping, it:
      * is non-empty, at most 80 characters long and contains no digit;
      * contains none of ``:``, ``;``, ``http``, ``www.`` or ``@``;
      * has between 2 and 6 whitespace-separated words;
      * consists only of capitalized words and the lowercase connectors
        de/del/la/las/los/y/e, with at least one capitalized word (connectors
        alone, such as "de la", are not a name).

    Args:
        text: One paragraph of plain text.

    Returns:
        True if the paragraph looks like a signature, False otherwise.
    """
    text = text.strip()
    if not text or len(text) > 80 or any(ch.isdigit() for ch in text):
        return False
    if any(mark in text for mark in (":", ";", "http", "www.", "@")):
        return False
    words = text.split()
    if not 2 <= len(words) <= 6:
        return False
    connectors = {"de", "del", "la", "las", "los", "y", "e"}
    has_name_word = False
    for word in words:
        # Drop punctuation glued to the word before inspecting it.
        clean = re.sub(r"[^\wÁÉÍÓÚÜÑáéíóúüñ-]", "", word)
        if not clean:
            return False
        if clean.lower() in connectors:
            continue
        if not clean[0].isupper():
            return False
        has_name_word = True
    return has_name_word
|
|
|
|
|
|
def trim_after_author_signature(paras):
    """Return the paragraphs up to and including the first author signature.

    When no paragraph is recognized as a signature, all paragraphs are
    returned. The result is always a new list.
    """
    items = list(paras)
    cut = next((idx for idx, para in enumerate(items) if is_author_signature(para)), None)
    return items if cut is None else items[:cut + 1]
|
|
|
|
|
|
def _sent_pause(n_words, short, long_):
|
|
"""Pausa (s) tras un punto, proporcional a la longitud de la frase que cierra:
|
|
frase corta → pausa corta; frase larga → el narrador 'respira' más."""
|
|
if n_words < short:
|
|
return os.environ.get("FEA_PAUSE_SHORT", "0.1")
|
|
if n_words <= long_:
|
|
return os.environ.get("FEA_PAUSE_MID", "0.2")
|
|
return os.environ.get("FEA_PAUSE_LONG", "0.3")
|
|
|
|
|
|
def ensure_terminal_punctuation(block):
    """Return ``block`` stripped, with a final period added if it lacks closing punctuation.

    MiniMax leaves the intonation open when a title or paragraph ends abruptly.
    Blocks already ending in one of ``. ! ? … : ;`` are returned as they are
    (stripped); an empty or whitespace-only block yields ``""``.
    """
    stripped = block.strip()
    if stripped and not stripped.endswith(tuple(".!?…:;")):
        stripped += "."
    return stripped
|
|
|
|
|
|
# (abbreviation, spoken name) pairs. A leading digit is the ordinal of a
# numbered book; in the text it may be separated from the rest by spaces.
_BIBLE_BOOKS = (
    ("1Cor", "Primera carta a los Corintios"),
    ("2Cor", "Segunda carta a los Corintios"),
    ("1Co", "Primera carta a los Corintios"),
    ("2Co", "Segunda carta a los Corintios"),
    ("1Tes", "Primera carta a los Tesalonicenses"),
    ("2Tes", "Segunda carta a los Tesalonicenses"),
    ("1Ts", "Primera carta a los Tesalonicenses"),
    ("2Ts", "Segunda carta a los Tesalonicenses"),
    ("1Tim", "Primera carta a Timoteo"),
    ("2Tim", "Segunda carta a Timoteo"),
    ("1Pe", "Primera carta de Pedro"),
    ("2Pe", "Segunda carta de Pedro"),
    ("1P", "Primera carta de Pedro"),
    ("2P", "Segunda carta de Pedro"),
    ("1Jn", "Primera carta de Juan"),
    ("2Jn", "Segunda carta de Juan"),
    ("3Jn", "Tercera carta de Juan"),
    ("1Mac", "Primer libro de los Macabeos"),
    ("2Mac", "Segundo libro de los Macabeos"),
    ("1Sam", "Primer libro de Samuel"),
    ("2Sam", "Segundo libro de Samuel"),
    ("1Sm", "Primer libro de Samuel"),
    ("2Sm", "Segundo libro de Samuel"),
    ("1Re", "Primer libro de los Reyes"),
    ("2Re", "Segundo libro de los Reyes"),
    ("1Cr", "Primer libro de las Crónicas"),
    ("2Cr", "Segundo libro de las Crónicas"),
    ("Hch", "Hechos de los Apóstoles"),
    ("Rom", "Romanos"),
    ("Rm", "Romanos"),
    ("Gal", "Gálatas"),
    ("Gál", "Gálatas"),
    ("Ef", "Efesios"),
    ("Flp", "Filipenses"),
    ("Fil", "Filipenses"),
    ("Col", "Colosenses"),
    ("Tit", "Tito"),
    ("Flm", "Filemón"),
    ("Heb", "Hebreos"),
    ("Sant", "Santiago"),
    ("St", "Santiago"),
    ("Sto", "Santiago"),
    ("Jud", "Judas"),
    ("Ap", "Apocalipsis"),
    ("Mt", "Mateo"),
    ("Mc", "Marcos"),
    ("Lc", "Lucas"),
    ("Jn", "Juan"),
    ("Gn", "Génesis"),
    ("Gen", "Génesis"),
    ("Ex", "Éxodo"),
    ("Lv", "Levítico"),
    ("Lev", "Levítico"),
    ("Nm", "Números"),
    ("Num", "Números"),
    ("Dt", "Deuteronomio"),
    ("Jos", "Josué"),
    ("Jue", "Jueces"),
    ("Rut", "Rut"),
    ("Esd", "Esdras"),
    ("Neh", "Nehemías"),
    ("Tob", "Tobías"),
    ("Jdt", "Judit"),
    ("Est", "Ester"),
    ("Job", "Job"),
    ("Sal", "Salmos"),
    ("Prov", "Proverbios"),
    ("Cant", "Cantar de los Cantares"),
    ("Sab", "Sabiduría"),
    ("Eclo", "Eclesiástico"),
    ("Sir", "Eclesiástico"),
    # "Ecl" is Ecclesiastes (Qohelet); Sirach/Ecclesiasticus is "Eclo"/"Sir".
    ("Ecl", "Eclesiastés"),
    ("Isa", "Isaías"),
    ("Is", "Isaías"),
    ("Jer", "Jeremías"),
    ("Jr", "Jeremías"),
    ("Lam", "Lamentaciones"),
    ("Bar", "Baruc"),
    ("Eze", "Ezequiel"),
    ("Ez", "Ezequiel"),
    ("Dan", "Daniel"),
    ("Dn", "Daniel"),
    ("Os", "Oseas"),
    ("Joel", "Joel"),
    ("Am", "Amós"),
    ("Abd", "Abdías"),
    ("Jon", "Jonás"),
    ("Miq", "Miqueas"),
    ("Nah", "Nahúm"),
    ("Hab", "Habacuc"),
    ("Sof", "Sofonías"),
    ("Ag", "Ageo"),
    ("Zac", "Zacarías"),
    ("Mal", "Malaquías"),
)
_BIBLE_NAMES = dict(_BIBLE_BOOKS)


def _bible_abbrev_pattern(short):
    """Return the regex fragment matching one abbreviation from the table."""
    if short[0].isdigit():
        # "1Cor" also matches "1 Cor"; [^\S\n] is whitespace except newline, so
        # a number ending one paragraph never fuses with the next paragraph.
        return short[0] + r"[^\S\n]*" + re.escape(short[1:])
    return re.escape(short)


# One alternation for the whole table, longest abbreviations first. The
# optional period is consumed; the look-ahead requires a chapter number.
_BIBLE_RE = re.compile(
    r"\b("
    + "|".join(_bible_abbrev_pattern(s) for s in sorted(_BIBLE_NAMES, key=len, reverse=True))
    + r")\.?(?=\s+\d)"
)


def expand_bible_abbreviations(text):
    """Expand Bible book abbreviations that appear as part of a citation.

    Examples:
      - Mt 5, 1-12   -> Mateo 5, 1-12
      - Lc 2, 10     -> Lucas 2, 10
      - Jn 3, 16     -> Juan 3, 16
      - 1 Cor 13, 4  -> Primera carta a los Corintios 13, 4

    Only abbreviations followed by whitespace and a digit (chapter/verse) are
    expanded, so other uses of the same letters in the text are left alone. An
    optional period after the abbreviation is dropped. Matching is
    case-sensitive. Numbered books are recognized with or without a space
    after the ordinal ("1Jn" and "1 Jn").

    Args:
        text: Text to process.

    Returns:
        The text with the recognized abbreviations replaced by the book names.
    """
    def _spoken_name(match):
        # Normalize "1 Cor" to the table key "1Cor".
        return _BIBLE_NAMES[re.sub(r"\s+", "", match.group(1))]

    return _BIBLE_RE.sub(_spoken_name, text)
|
|
|
|
|
|
def add_pauses(text, para=None):
    """Insert MiniMax ``<#seconds#>`` pause markers sized by sentence length.

    - After each sentence end (a run of ``. ! ? …``) a pause is inserted whose
      length depends on the number of words in that sentence (see
      ``_sent_pause``; thresholds come from ``FEA_SHORT_WORDS``, default 6,
      and ``FEA_LONG_WORDS``, default 12).
    - A punctuation run directly followed by a digit is treated as part of a
      number ("10.000", "3.5", "10.30") and gets no pause.
    - Paragraphs or titles without final punctuation get a period so that
      MiniMax closes the intonation.
    - Paragraphs (separated by blank lines) are joined with the ``para`` pause;
      the sentence pause at the very end of each paragraph is dropped because
      the paragraph separator already provides one.

    Args:
        text: Plain text with paragraphs separated by blank lines.
        para: Paragraph pause in seconds. Defaults to ``FEA_PARA_PAUSE`` or "0.7".

    Returns:
        The text with pause markers, as a single string.
    """
    para = para if para is not None else os.environ.get("FEA_PARA_PAUSE", "0.7")
    short = int(os.environ.get("FEA_SHORT_WORDS", "6"))
    long_ = int(os.environ.get("FEA_LONG_WORDS", "12"))
    text = expand_bible_abbreviations(text)
    # The look-ahead rejects a run followed by a digit, and also rejects the
    # shorter runs the regex would otherwise try by backtracking ("...5").
    sentence_end = re.compile(r"([.!?…]+)(?![.!?…\d])")
    out = []
    for p in text.split("\n\n"):
        p = ensure_terminal_punctuation(p)
        if not p:
            continue
        # With one capture group, split() alternates sentence text and sign.
        parts = sentence_end.split(p)
        pieces = []
        for frase, sign in zip(parts[::2], parts[1::2] + [""]):
            pieces.append(frase + sign)
            if sign and frase.strip():
                pieces.append(f" <#{_sent_pause(len(frase.split()), short, long_)}#> ")
        # Drop the trailing sentence pause: the paragraph separator adds its own.
        rebuilt = re.sub(r"\s*<#[\d.]+#>\s*$", "", "".join(pieces))
        out.append(rebuilt.strip())
    return f" <#{para}#> ".join(out)
|
|
|
|
|
|
# MiniMax limita a 10.000 car por petición; dejamos margen porque las pausas
|
|
# <#seg#> y el language_boost también cuentan.
|
|
CHAR_LIMIT = 8000
|
|
|
|
|
|
def _split_for_tts(text, limit=CHAR_LIMIT):
|
|
"""Trocea respetando las pausas <#..#> (frase/párrafo). Fallback por palabras
|
|
si una frase suelta supera el límite."""
|
|
if len(text) <= limit:
|
|
return [text]
|
|
parts = re.split(r"(\s*<#[\d.]+#>\s*)", text)
|
|
chunks, cur = [], ""
|
|
for seg in parts:
|
|
if not seg:
|
|
continue
|
|
if len(cur) + len(seg) <= limit:
|
|
cur += seg
|
|
continue
|
|
if cur.strip():
|
|
chunks.append(cur.strip())
|
|
if len(seg) > limit: # frase gigantesca: parte por palabras
|
|
cur = ""
|
|
for w in seg.split(" "):
|
|
if len(cur) + len(w) + 1 <= limit:
|
|
cur += (" " if cur else "") + w
|
|
else:
|
|
if cur:
|
|
chunks.append(cur)
|
|
cur = w
|
|
else:
|
|
cur = seg
|
|
if cur.strip():
|
|
chunks.append(cur.strip())
|
|
return chunks
|
|
|
|
|
|
def _synth_chunk(text, voice_id, model):
    """Send one ``t2a_v2`` request and decode the audio it returns.

    Args:
        text: Text to synthesize (pause markers included).
        voice_id: Voice to use.
        model: MiniMax model name.

    Returns:
        ``(audio_bytes, 0, usage_characters)`` on success, or
        ``(None, rc, 0)`` on failure, where ``rc`` is the response's
        ``base_resp.status_code`` or ``-1`` when no such code is available
        (network error, non-JSON body, missing or zero code).
    """
    body = {
        "model": model,
        "text": text,
        "voice_setting": {"voice_id": voice_id, "speed": 1.0, "vol": 1.0, "pitch": 0},
        "audio_setting": {"sample_rate": 32000, "bitrate": 128000, "format": "mp3", "channel": 1},
        "language_boost": "Spanish",
    }
    try:
        # (connect, read) timeouts in seconds, so a stalled connection cannot hang forever.
        r = requests.post(f"{BASE}/t2a_v2", headers=H_JSON, json=body, timeout=(10, 600))
        j = r.json()
    except (requests.RequestException, ValueError) as exc:
        # Report through the return value so the caller's failure path runs.
        print(f"t2a falló: {exc}")
        return None, -1, 0
    audio_hex = (j.get("data") or {}).get("audio")
    if not audio_hex:
        # Never hand back a falsy code for a failure.
        rc = (j.get("base_resp") or {}).get("status_code") or -1
        print(f"t2a falló: {json.dumps(j, ensure_ascii=False)[:300]}")
        return None, rc, 0
    usage = (j.get("extra_info") or {}).get("usage_characters", 0)
    return bytes.fromhex(audio_hex), 0, usage
|
|
|
|
|
|
def _concat_mp3(parts, dest):
    """Join the audio files in ``parts`` into ``dest`` with ffmpeg; return True on success."""
    args = ["ffmpeg", "-y"]
    for p in parts:
        args += ["-i", str(p)]
    n = len(parts)
    filt = "".join(f"[{k}:a]" for k in range(n)) + f"concat=n={n}:v=0:a=1[a]"
    args += ["-filter_complex", filt, "-map", "[a]", "-b:a", "128k", str(dest)]
    done = subprocess.run(args, capture_output=True, text=True, errors="replace")
    if done.returncode != 0:
        print(f"ffmpeg (concat) falló: {done.stderr.strip()[-300:]}")
        return False
    return True


def _probe_duration(path):
    """Return the duration in seconds that ffprobe reports for ``path``, or 0.0 if unavailable."""
    probe = subprocess.run(["ffprobe", "-v", "error", "-show_entries", "format=duration",
                            "-of", "default=noprint_wrappers=1:nokey=1", str(path)],
                           capture_output=True, text=True, errors="replace")
    try:
        return float(probe.stdout.strip() or "0")
    except ValueError:
        return 0.0


def t2a(text, voice_id, model, name):
    """Synthesize ``text`` and write the finished ``<name>.mp3`` under ``OUT``.

    The text is split into request-sized chunks. With several chunks, each is
    synthesized to a temporary ``<name>.part<k>.mp3`` (waiting
    ``FEA_CHUNK_PAUSE`` seconds, default 35, between requests) and the parts
    are concatenated with ffmpeg. The result is then mixed with low-level
    brown noise and given a fade in/out. Temporary part files are always
    removed; the intermediate ``<name>.raw.mp3`` is removed on success.

    Args:
        text: Text to synthesize (pause markers included).
        voice_id: Voice to use.
        model: MiniMax model name.
        name: Base name of the output file, without extension.

    Returns:
        0 on success; the code reported by ``_synth_chunk`` when a request
        fails; 1 when an ffmpeg step fails.
    """
    import time  # not among the module-level imports

    chunks = _split_for_tts(text)
    print(f"Sintetizando {len(text)} car con {model} / {voice_id} "
          f"({len(chunks)} petición/es)…", flush=True)
    OUT.mkdir(parents=True, exist_ok=True)
    raw = OUT / f"{name}.raw.mp3"
    if len(chunks) == 1:
        audio, rc, _ = _synth_chunk(chunks[0], voice_id, model)
        if audio is None:
            return rc
        raw.write_bytes(audio)
    else:
        parts = []
        wait = int(os.environ.get("FEA_CHUNK_PAUSE", "35"))
        try:
            for k, ch in enumerate(chunks):
                if k > 0:
                    time.sleep(wait)  # respect MiniMax's TPM limit
                print(f" trozo {k + 1}/{len(chunks)} ({len(ch)} car)…", flush=True)
                audio, rc, _ = _synth_chunk(ch, voice_id, model)
                if audio is None:
                    return rc
                p = OUT / f"{name}.part{k}.mp3"
                p.write_bytes(audio)
                parts.append(p)
            if not _concat_mp3(parts, raw):
                return 1
        finally:
            # Part files are temporary whatever the outcome.
            for p in parts:
                p.unlink(missing_ok=True)

    # Finish: brown comfort noise + fade in/out (removes the final "bump").
    dur = _probe_duration(raw)
    st = max(0.0, dur - 0.5)
    mp3 = OUT / f"{name}.mp3"
    done = subprocess.run(["ffmpeg", "-y", "-i", str(raw), "-filter_complex",
                           "anoisesrc=color=brown:amplitude=0.004:sample_rate=32000[n];"
                           "[n]highpass=f=120,lowpass=f=3800[nf];"
                           "[0:a][nf]amix=inputs=2:duration=first:dropout_transition=0:normalize=0[m];"
                           f"[m]afade=t=in:st=0:d=0.08,afade=t=out:st={st:.2f}:d=0.5[a]",
                           "-map", "[a]", "-b:a", "128k", str(mp3)],
                          capture_output=True, text=True, errors="replace")
    if done.returncode != 0:
        # Keep the raw file so the synthesized audio is not lost.
        print(f"ffmpeg (acabado) falló; se conserva {raw}: {done.stderr.strip()[-300:]}")
        return 1
    raw.unlink(missing_ok=True)
    print(f"OK -> {mp3} ({dur:.0f}s)")
    return 0
|
|
|
|
|
|
def main():
    """Command-line entry point: dispatch the ``clone``, ``carta`` and ``text`` subcommands.

    Prints the module docstring as usage and exits when the subcommand is
    missing or unknown, or when fewer than the two required arguments follow
    it. Exits with an error when no API key was loaded or when the ``carta``
    post id is not an integer. Exits with status 1 when synthesis reports a
    failure.
    """
    argv = sys.argv
    if len(argv) < 2:
        sys.exit(__doc__)
    cmd = argv[1]
    if not KEY:
        sys.exit("No encuentro la API key en " + CRED)
    # Every subcommand takes at least two positional arguments.
    if cmd not in ("clone", "carta", "text") or len(argv) < 4:
        sys.exit(__doc__)

    if cmd == "clone":
        clone(argv[2], argv[3])
        return

    model = argv[4] if len(argv) > 4 else "speech-2.8-turbo"
    if cmd == "carta":
        pid, voice_id = argv[2], argv[3]
        try:
            post_id = int(pid)
        except ValueError:
            sys.exit(f"post_id no válido: {pid}")
        title, text = get_post_text(post_id)
        name = argv[5] if len(argv) > 5 else f"carta-minimax-{pid}-{model.split('-')[-1]}"
        text = add_pauses(text)
        print(f"Post #{pid}: «{title}» ({len(text)} car con pausas)")
        rc = t2a(text, voice_id, model, name)
    else:  # cmd == "text"
        name = argv[5] if len(argv) > 5 else "minimax-text"
        rc = t2a(argv[2], argv[3], model, name)

    # API status codes can exceed the 0-255 exit range, so collapse any
    # failure to 1 instead of passing the code through.
    if rc != 0:
        sys.exit(1)


if __name__ == "__main__":
    main()
|