#!/usr/bin/env python3
"""Reprocess BROKEN translations with Claude Haiku 4.5 (direct API).

Takes the original ES post, translates it with Haiku and OVERWRITES the
already existing translation (in place, no duplicates). Broken translations
are detected by length ratio.

Usage:
  reprocess_en_haiku.py --auto --langs en --limit 100       # broken EN
  reprocess_en_haiku.py --auto --langs fr,it,pt --limit 50  # other languages
  reprocess_en_haiku.py --ids 44205 --langs en              # specific sources
  add --apply to WRITE to the DB (without it = dry-run).
"""
import argparse
import json
import os
import re
import subprocess
import sys

# Make the sibling module importable when the script is run from elsewhere.
sys.path.insert(0, os.path.dirname(os.path.abspath(__file__)))
from translate_haiku import translate  # noqa: E402

# JSON state file; its "done" mapping is read as "<src>:<lang>" -> id of the
# translated post.
STATE = "/tmp/feadulta-translate-state.json"
# Docker container in which the PHP helper is executed.
CONTAINER = "wordpress-web"
# A translation is flagged as broken when its tag-stripped length divided by
# the tag-stripped length of the source falls below this ratio.
RATIO_BROKEN = 0.45


def dexec(args):
    """Run ``args`` inside CONTAINER via ``docker exec``.

    Returns the ``subprocess.CompletedProcess`` with stdout/stderr captured
    as text. A non-zero exit status does not raise; callers inspect
    ``returncode`` themselves.
    """
    return subprocess.run(
        ["docker", "exec", CONTAINER, *args],
        capture_output=True,
        text=True,
    )


def dcp(remote, local):
    """Copy ``remote`` (a path inside CONTAINER) to the host path ``local``.

    Raises ``subprocess.CalledProcessError`` if ``docker cp`` fails.
    """
    subprocess.run(["docker", "cp", f"{CONTAINER}:{remote}", local], check=True)


def dcp_to(local, remote):
    """Copy the host path ``local`` to ``remote`` inside CONTAINER.

    Raises ``subprocess.CalledProcessError`` if ``docker cp`` fails.
    """
    subprocess.run(["docker", "cp", local, f"{CONTAINER}:{remote}"], check=True)


def get_post(pid):
    """Fetch post ``pid`` through the PHP helper and return the parsed JSON.

    Runs ``php /tmp/fea_post_io.php get <pid>`` in the container, copies
    ``/tmp/fea_es.json`` out to the same path on the host and parses it.
    Callers in this file read the ``"content"`` and ``"title"`` keys of the
    result.

    Raises:
        RuntimeError: the PHP helper exited with a non-zero status.
        subprocess.CalledProcessError: ``docker cp`` failed.
    """
    r = dexec(["php", "/tmp/fea_post_io.php", "get", str(pid)])
    if r.returncode != 0:
        raise RuntimeError(f"get {pid}: {r.stderr.strip()}")
    dcp("/tmp/fea_es.json", "/tmp/fea_es.json")
    # Explicit UTF-8 and a context manager: the handle is closed
    # deterministically and decoding does not depend on the host locale.
    with open("/tmp/fea_es.json", encoding="utf-8") as fh:
        return json.load(fh)


def strip_len(html):
    """Return the length of ``html`` after removing every ``<...>`` tag."""
    return len(re.sub(r"<[^>]*>", "", html))


def find_broken(state, langs, limit):
    """Return ``[(src, lang), ...]`` for translations that look broken.

    Walks ``state["done"]`` (keys of the form ``"<src>:<lang>"``, values the
    translated post id), keeps only languages in ``langs`` and compares the
    tag-stripped length of the translation with that of the source. Sources
    shorter than 40 stripped characters are ignored; a pair is reported when
    the ratio is below RATIO_BROKEN. Scanning stops once ``limit`` pairs
    have been collected.

    Posts that the PHP helper cannot fetch are skipped, with a notice on
    stderr so the omission is visible.
    """
    out = []
    for key, tid in state["done"].items():
        src, lang = key.split(":")
        if lang not in langs:
            continue
        try:
            es = get_post(int(src))
            tr = get_post(int(tid))
        except RuntimeError as exc:
            # Best-effort scan: keep going, but do not hide the failure.
            print(f"[{src}:{lang}] no se pudo leer; salto ({exc})", file=sys.stderr)
            continue
        olen = strip_len(es["content"])
        if olen < 40:
            # Too short for the length ratio to be meaningful.
            continue
        if strip_len(tr["content"]) / olen < RATIO_BROKEN:
            out.append((int(src), lang))
            if len(out) >= limit:
                break
    return out


def main():
    """Command-line entry point.

    Builds the list of ``(src, lang)`` pairs (auto-detected with ``--auto``,
    otherwise the cross product of ``--ids`` and ``--langs``), re-translates
    title and body of each source post and, only with ``--apply``, pushes
    the result to the existing translated post through the PHP helper.
    Prints a per-post summary and the accumulated token usage and cost.
    """
    ap = argparse.ArgumentParser()
    ap.add_argument("--ids", nargs="*", type=int, default=[])
    ap.add_argument("--langs", default="en")
    ap.add_argument("--auto", action="store_true")
    ap.add_argument("--limit", type=int, default=100)
    ap.add_argument("--apply", action="store_true")
    args = ap.parse_args()

    langs = [code.strip() for code in args.langs.split(",") if code.strip()]
    with open(STATE, encoding="utf-8") as fh:
        state = json.load(fh)

    if args.auto:
        print(f"Autodetectando rotos (ratio<{RATIO_BROKEN}) en {langs}…")
        pairs = find_broken(state, langs, args.limit)
        print(f"Encontrados: {pairs}")
    else:
        pairs = [(src, lang) for src in args.ids for lang in langs]

    tot_in = tot_out = 0.0
    for src, lang in pairs:
        tid = state["done"].get(f"{src}:{lang}")
        if not tid:
            # Nothing to overwrite: this script only updates existing posts.
            print(f"[{src}:{lang}] sin traducción en state; salto")
            continue

        es = get_post(src)
        # translate() returns (text, usage); usage exposes input_tokens and
        # output_tokens, which are accumulated for the cost summary.
        body, u1 = translate(es["content"], lang)
        title, u2 = translate(es["title"], lang, is_title=True)
        tot_in += u1.input_tokens + u2.input_tokens
        tot_out += u1.output_tokens + u2.output_tokens

        print(f"\n===== src #{src} [{lang}] -> #{tid} =====")
        print(f"TÍTULO {lang}: {title}")
        print(f"cuerpo ES={strip_len(es['content'])} -> {lang}={strip_len(body)}")

        if args.apply:
            # Stage title and body as UTF-8 files, copy them into the
            # container and let the PHP helper update the post from them.
            with open("/tmp/fea_title.txt", "w", encoding="utf-8") as fh:
                fh.write(title)
            with open("/tmp/fea_body.txt", "w", encoding="utf-8") as fh:
                fh.write(body)
            dcp_to("/tmp/fea_title.txt", "/tmp/fea_title.txt")
            dcp_to("/tmp/fea_body.txt", "/tmp/fea_body.txt")
            r = dexec(["php", "/tmp/fea_post_io.php", "update", str(tid),
                       "/tmp/fea_title.txt", "/tmp/fea_body.txt"])
            print(("APLICADO: " + r.stdout.strip()) if r.returncode == 0
                  else ("FALLO: " + r.stderr.strip()))
        else:
            print("(dry-run)")

    # NOTE(review): 1.0 and 5.0 are presumably USD per million input/output
    # tokens for the model in use — confirm against current pricing.
    cost = tot_in / 1e6 * 1.0 + tot_out / 1e6 * 5.0
    print(f"\nTOTAL tokens: in={int(tot_in)} out={int(tot_out)} coste=${cost:.4f}")
    print("MODO: " + ("APLICADO a BD" if args.apply else "DRY-RUN"))


if __name__ == "__main__":
    main()