100 lines
3.6 KiB
Python
100 lines
3.6 KiB
Python
#!/usr/bin/env python3
|
|
"""Narrate a whole feadulta letter/article with the cloned voice (XTTS-v2 + GPU).

Fetches the text of the ES post, splits it into paragraphs, narrates it with
the reference voice (computing the speaker latents only ONCE), concatenates
the pieces with pauses and adds comfort noise. Issue #76.

Usage:
    tts_carta.py <post_id> <voice_sample.wav> [output_name]
"""
|
|
import html
|
|
import json
|
|
import os
|
|
import re
|
|
import subprocess
|
|
import sys
|
|
from pathlib import Path
|
|
|
|
# Deliberately set before the TTS import further down (which is why those
# imports carry `# noqa: E402`). `setdefault` keeps any value the caller has
# already exported.
# NOTE(review): presumably this pre-accepts the Coqui terms-of-service prompt
# so the model load does not block waiting for input — confirm.
os.environ.setdefault("COQUI_TOS_AGREED", "1")
|
|
|
|
import numpy as np # noqa: E402
|
|
import soundfile as sf # noqa: E402
|
|
import torch # noqa: E402
|
|
from TTS.api import TTS # noqa: E402
|
|
|
|
# Inference device: CUDA when torch reports it as available, unless the
# FEA_CPU environment variable is set to any non-empty value.
DEVICE = "cpu" if not torch.cuda.is_available() or os.environ.get("FEA_CPU") else "cuda"

# Directory that receives the generated audio, resolved relative to the
# directory one level above this script's own directory.
OUT = Path(__file__).resolve().parent.parent.joinpath("wordpress/wp-content/uploads/tts-samples")

# Sample rate (Hz) used when writing and post-processing the audio.
SR = 24_000

# Name of the docker container the post text is read from.
CONTAINER = "wordpress-web"
|
|
|
|
|
|
def _html_to_paragraphs(raw):
    """Convert post HTML into a list of whitespace-normalised text paragraphs.

    Closing ``</p>`` / ``</h1>``..``</h6>`` tags and ``<br>`` become paragraph
    breaks; every other tag and every ``[bracketed]`` span is removed; HTML
    entities are decoded. Fragments of one character or less are discarded.
    """
    # Keep paragraph boundaries before the tags are stripped.
    raw = re.sub(r"(?i)</p>|<br\s*/?>|</h[1-6]>", "\n", raw)
    raw = re.sub(r"<[^>]+>", "", raw)  # drop the remaining tags
    # Drop shortcodes. Note this also removes any literal "[...]" in the text.
    raw = re.sub(r"\[[^\]]+\]", "", raw)
    raw = html.unescape(raw)
    collapsed = (re.sub(r"\s+", " ", chunk).strip() for chunk in raw.split("\n"))
    return [para for para in collapsed if len(para) > 1]


def get_post_text(pid):
    """Fetch a post from the WordPress container and return its plain text.

    Runs ``/tmp/fea_post_io.php get <pid>`` inside ``CONTAINER``, copies
    ``/tmp/fea_es.json`` out of the container to the same path on the host and
    parses it. (Presumably the PHP helper is what writes that JSON file —
    confirm against the helper.)

    Args:
        pid: Post id; passed to the helper as ``str(pid)``.

    Returns:
        A ``(title, paragraphs)`` tuple: the JSON ``"title"`` value and the
        list of cleaned paragraphs extracted from the JSON ``"content"`` value.

    Raises:
        subprocess.CalledProcessError: if either docker command fails.
        KeyError: if the JSON lacks ``"title"`` or ``"content"``.
    """
    subprocess.run(
        ["docker", "exec", CONTAINER, "php", "/tmp/fea_post_io.php", "get", str(pid)],
        check=True,
        capture_output=True,
    )
    subprocess.run(
        ["docker", "cp", f"{CONTAINER}:/tmp/fea_es.json", "/tmp/fea_es.json"],
        check=True,
    )
    # Close the handle deterministically and decode as UTF-8 (the JSON
    # interchange encoding) instead of whatever the locale default is.
    with open("/tmp/fea_es.json", encoding="utf-8") as fh:
        d = json.load(fh)
    return d["title"], _html_to_paragraphs(d["content"])
|
|
|
|
|
|
def _run_ffmpeg(args):
    """Run ``ffmpeg -y <args>`` quietly; exit with ffmpeg's stderr on failure."""
    proc = subprocess.run(["ffmpeg", "-y", *args], capture_output=True)
    if proc.returncode != 0:
        detail = proc.stderr.decode("utf-8", errors="replace").strip()
        sys.exit(f"ffmpeg falló (código {proc.returncode}):\n{detail}")


def main():
    """CLI entry point: narrate one post with the cloned voice and save it.

    Reads ``<post_id> <voice_sample.wav> [output_name]`` from ``sys.argv``,
    fetches the post paragraphs, synthesises each one with XTTS-v2 using
    speaker latents computed once, joins them with 0.35 s pauses, mixes in
    low-level filtered brown noise with ffmpeg and writes ``<name>.wav`` and
    ``<name>.mp3`` under ``OUT``.

    Exits via ``sys.exit`` with a message on bad arguments, a missing voice
    sample, a post without text, or an ffmpeg failure.
    """
    import time  # local import: not part of the module-level import block

    if len(sys.argv) < 3:
        sys.exit("uso: tts_carta.py <post_id> <muestra_voz.wav> [nombre_salida]")
    try:
        pid = int(sys.argv[1])
    except ValueError:
        sys.exit(f"post_id no válido: {sys.argv[1]!r}")
    spk = sys.argv[2]
    # Fail fast: a wrong path would otherwise only surface after the slow
    # model load.
    if not Path(spk).is_file():
        sys.exit(f"no existe la muestra de voz: {spk}")
    name = sys.argv[3] if len(sys.argv) > 3 else f"carta-{pid}"

    title, paras = get_post_text(pid)
    print(f"Post #{pid}: «{title}» ({len(paras)} párrafos, {sum(len(p) for p in paras)} car)")
    if not paras:
        # np.concatenate([]) would raise further down; stop with a clear message.
        sys.exit(f"El post #{pid} no tiene texto que locutar")

    print(f"Cargando XTTS-v2 en {DEVICE}…", flush=True)
    tts = TTS("tts_models/multilingual/multi-dataset/xtts_v2").to(DEVICE)
    model = tts.synthesizer.tts_model
    print("Calculando timbre del hablante (1 vez)…", flush=True)
    # Computed once and reused for every paragraph.
    gpt_cond, spk_emb = model.get_conditioning_latents(audio_path=[spk])

    # Silence appended after every paragraph (0.35 s at SR).
    pause = np.zeros(int(SR * 0.35), dtype=np.float32)
    pieces = []
    t0 = time.perf_counter()
    for i, para in enumerate(paras, 1):
        out = model.inference(
            para, "es", gpt_cond, spk_emb,
            temperature=0.65, repetition_penalty=5.0, top_k=50, top_p=0.85,
            enable_text_splitting=True,
        )
        pieces.append(np.asarray(out["wav"], dtype=np.float32))
        pieces.append(pause)
        print(f"  párrafo {i}/{len(paras)} ({len(para)} car) ok", flush=True)
    audio = np.concatenate(pieces)
    dt = time.perf_counter() - t0
    dur = len(audio) / SR
    speed = dur / dt if dt > 0 else float("inf")
    print(f"Síntesis: {dt:.1f}s para {dur:.1f}s de audio (x{speed:.1f} tiempo real) en {DEVICE}")

    OUT.mkdir(parents=True, exist_ok=True)
    raw = OUT / f"{name}.raw.wav"
    wav = OUT / f"{name}.wav"
    mp3 = OUT / f"{name}.mp3"

    sf.write(raw, audio, SR)
    try:
        # Mix band-limited brown noise under the speech. `duration=first`
        # bounds the noise source to the length of the speech input, and
        # `normalize=0` leaves the input levels unscaled.
        _run_ffmpeg([
            "-i", str(raw), "-filter_complex",
            f"anoisesrc=color=brown:amplitude=0.004:sample_rate={SR}[n];"
            "[n]highpass=f=120,lowpass=f=3800[nf];"
            "[0:a][nf]amix=inputs=2:duration=first:dropout_transition=0:normalize=0[a]",
            "-map", "[a]", "-ar", str(SR), str(wav),
        ])
    finally:
        # The unprocessed take is only an intermediate; remove it either way.
        raw.unlink(missing_ok=True)
    _run_ffmpeg(["-i", str(wav), "-b:a", "96k", str(mp3)])
    print(f"OK -> {mp3} ({dur:.0f}s de audio)")
|
|
|
|
|
|
# Run the CLI only when executed as a script, never on import.
if __name__ == "__main__":
    sys.exit(main())
|