#!/usr/bin/env python3
"""Narrate an entire feadulta letter/article with the cloned voice (XTTS-v2 + GPU).

Fetches the text of the ES post, splits it into paragraphs, synthesizes each
paragraph with the reference voice (computing the speaker latents only ONCE),
concatenates the pieces with pauses and adds comfort noise. Issue #76.

Usage: tts_carta.py <post_id> <reference_voice.wav> [output_name]
"""
import html
import json
import os
import re
import subprocess
import sys
import time
from pathlib import Path

# Set before the TTS import below (hence the E402 suppressions).
# NOTE(review): presumably pre-accepts the Coqui licence prompt -- confirm.
os.environ.setdefault("COQUI_TOS_AGREED", "1")

import numpy as np  # noqa: E402
import soundfile as sf  # noqa: E402
import torch  # noqa: E402
from TTS.api import TTS  # noqa: E402

# Use the GPU when available unless FEA_CPU is set to a non-empty value.
DEVICE = "cuda" if torch.cuda.is_available() and not os.environ.get("FEA_CPU") else "cpu"
OUT = Path(__file__).resolve().parent.parent / "wordpress/wp-content/uploads/tts-samples"
SR = 24000  # sample rate (Hz) used for the written audio and the ffmpeg stages
CONTAINER = "wordpress-web"

_USAGE = "uso: tts_carta.py <post_id> <voz_ref.wav> [nombre_salida]"
_PAUSE_SECONDS = 0.35  # silence appended after every paragraph
_POST_JSON = "/tmp/fea_es.json"

# NOTE(review): the original pattern was lost (its HTML tags were stripped from
# the source); this is a reconstruction of "closing block tags and <br> mark a
# paragraph boundary". Confirm against the version-controlled original.
_PARA_BREAK_RE = re.compile(
    r"(?i)</p\s*>|<br\s*/?>|</(?:div|h[1-6]|li|blockquote)\s*>"
)
_TAG_RE = re.compile(r"<[^>]+>")
_SHORTCODE_RE = re.compile(r"\[[^\]]+\]")
_WS_RE = re.compile(r"\s+")


def _html_to_paragraphs(raw):
    """Convert post HTML into a list of plain-text paragraphs.

    Paragraph boundaries are turned into newlines *before* the tags are
    removed, otherwise adjacent paragraphs would be glued together.
    Paragraphs of one character or less are dropped.
    """
    raw = _PARA_BREAK_RE.sub("\n", raw)
    raw = _TAG_RE.sub("", raw)  # strip remaining tags
    raw = _SHORTCODE_RE.sub("", raw)  # strip [shortcodes]
    raw = html.unescape(raw)
    cleaned = (_WS_RE.sub(" ", p).strip() for p in raw.split("\n"))
    return [p for p in cleaned if len(p) > 1]


def get_post_text(pid):
    """Return ``(title, paragraphs)`` for post *pid*.

    Runs ``/tmp/fea_post_io.php get <pid>`` inside the ``CONTAINER`` docker
    container, copies ``/tmp/fea_es.json`` out of it and reads the ``title``
    and ``content`` keys from that JSON file.

    Raises:
        subprocess.CalledProcessError: if either docker command fails.
    """
    subprocess.run(
        ["docker", "exec", CONTAINER, "php", "/tmp/fea_post_io.php", "get", str(pid)],
        check=True,
        capture_output=True,
    )
    subprocess.run(
        ["docker", "cp", f"{CONTAINER}:{_POST_JSON}", _POST_JSON],
        check=True,
    )
    # Explicit encoding and a context manager: do not depend on the locale
    # and do not leak the file handle.
    with open(_POST_JSON, encoding="utf-8") as fh:
        d = json.load(fh)
    return d["title"], _html_to_paragraphs(d["content"])


def _synthesize(model, paras, gpt_cond, spk_emb):
    """Synthesize every paragraph and return one float32 array.

    Each paragraph's audio is followed by ``_PAUSE_SECONDS`` of silence.
    """
    pause = np.zeros(int(SR * _PAUSE_SECONDS), dtype=np.float32)
    pieces = []
    for i, para in enumerate(paras, 1):
        out = model.inference(
            para,
            "es",
            gpt_cond,
            spk_emb,
            temperature=0.65,
            repetition_penalty=5.0,
            top_k=50,
            top_p=0.85,
            enable_text_splitting=True,
        )
        pieces.append(np.asarray(out["wav"], dtype=np.float32))
        pieces.append(pause)
        print(f"  párrafo {i}/{len(paras)} ({len(para)} car) ok", flush=True)
    return np.concatenate(pieces)


def _run_ffmpeg(args):
    """Run ``ffmpeg -y <args>`` and abort the script if it fails.

    The output is captured; on a non-zero exit status the tail of stderr is
    reported instead of silently continuing.
    """
    proc = subprocess.run(
        ["ffmpeg", "-y", *args],
        capture_output=True,
        text=True,
        errors="replace",
    )
    if proc.returncode != 0:
        sys.exit(f"ffmpeg falló (código {proc.returncode}):\n{proc.stderr[-2000:]}")


def main():
    """Command-line entry point: fetch the post, synthesize it, write wav + mp3."""
    if len(sys.argv) < 3:
        sys.exit(_USAGE)
    try:
        pid = int(sys.argv[1])
    except ValueError:
        sys.exit(_USAGE)
    spk = sys.argv[2]
    name = sys.argv[3] if len(sys.argv) > 3 else f"carta-{pid}"

    # Fail fast on cheap checks before the expensive model load.
    if not Path(spk).is_file():
        sys.exit(f"no existe el audio de referencia: {spk}")
    OUT.mkdir(parents=True, exist_ok=True)

    title, paras = get_post_text(pid)
    print(f"Post #{pid}: «{title}» ({len(paras)} párrafos, {sum(len(p) for p in paras)} car)")
    if not paras:
        # np.concatenate([]) would raise after loading the model for nothing.
        sys.exit(f"el post #{pid} no tiene texto que locutar")

    print(f"Cargando XTTS-v2 en {DEVICE}…", flush=True)
    tts = TTS("tts_models/multilingual/multi-dataset/xtts_v2").to(DEVICE)
    model = tts.synthesizer.tts_model

    print("Calculando timbre del hablante (1 vez)…", flush=True)
    gpt_cond, spk_emb = model.get_conditioning_latents(audio_path=[spk])

    t0 = time.perf_counter()
    audio = _synthesize(model, paras, gpt_cond, spk_emb)
    dt = time.perf_counter() - t0
    dur = len(audio) / SR
    print(f"Síntesis: {dt:.1f}s para {dur:.1f}s de audio (x{dur/dt:.1f} tiempo real) en {DEVICE}")

    raw_path = OUT / f"{name}.raw.wav"
    wav = OUT / f"{name}.wav"
    mp3 = OUT / f"{name}.mp3"
    try:
        sf.write(str(raw_path), audio, SR)
        # Mix band-limited brown noise under the voice; the noise source is
        # endless, so duration=first cuts the mix at the end of the speech.
        _run_ffmpeg([
            "-i", str(raw_path),
            "-filter_complex",
            f"anoisesrc=color=brown:amplitude=0.004:sample_rate={SR}[n];"
            "[n]highpass=f=120,lowpass=f=3800[nf];"
            "[0:a][nf]amix=inputs=2:duration=first:dropout_transition=0:normalize=0[a]",
            "-map", "[a]",
            "-ar", str(SR),
            str(wav),
        ])
    finally:
        # The intermediate file is never kept, even when a step fails.
        raw_path.unlink(missing_ok=True)

    _run_ffmpeg(["-i", str(wav), "-b:a", "96k", str(mp3)])
    print(f"OK -> {mp3} ({dur:.0f}s de audio)")


if __name__ == "__main__":
    main()