Files
feadulta/scripts/reprocess_en_haiku.py

129 lines
4.2 KiB
Python

#!/usr/bin/env python3
"""Reprocesa traducciones ROTAS con Claude Haiku 4.5 (API directa).
Coge el ES original, lo traduce con Haiku y SOBRESCRIBE la traducción ya
existente (in-place, sin duplicar). Detecta los rotos por ratio de longitud.
Uso:
reprocess_en_haiku.py --auto --langs en --limit 100 # EN rotos
reprocess_en_haiku.py --auto --langs fr,it,pt --limit 50 # otros idiomas
reprocess_en_haiku.py --ids 44205 --langs en # src concretos
añade --apply para ESCRIBIR en la BD (sin él = dry-run).
"""
import argparse
import json
import os
import re
import subprocess
import sys
sys.path.insert(0, os.path.dirname(os.path.abspath(__file__)))
from translate_haiku import translate # noqa: E402
# Path of the JSON state file. Its "done" mapping goes from
# "<source_post_id>:<lang>" to the ID of the already-created translated post.
STATE = "/tmp/feadulta-translate-state.json"
# Name of the Docker container targeted by every `docker exec` / `docker cp`.
CONTAINER = "wordpress-web"
# A translation whose tag-stripped length is below this fraction of the
# Spanish source's tag-stripped length is treated as broken.
RATIO_BROKEN = 0.45
def dexec(args):
    """Run a command inside ``CONTAINER`` through ``docker exec``.

    Args:
        args: Iterable of command-line tokens to run in the container.

    Returns:
        The ``subprocess.CompletedProcess`` with stdout and stderr captured
        as text. The exit status is not checked here; callers inspect
        ``returncode`` themselves.
    """
    command = ["docker", "exec", CONTAINER]
    command.extend(args)
    return subprocess.run(command, capture_output=True, text=True)
def dcp(remote, local):
    """Copy ``remote`` from inside ``CONTAINER`` to ``local`` on the host.

    Raises:
        subprocess.CalledProcessError: if ``docker cp`` exits non-zero.
    """
    container_path = f"{CONTAINER}:{remote}"
    copy_cmd = ["docker", "cp", container_path, local]
    subprocess.run(copy_cmd, check=True)
def dcp_to(local, remote):
    """Copy ``local`` from the host to ``remote`` inside ``CONTAINER``.

    Raises:
        subprocess.CalledProcessError: if ``docker cp`` exits non-zero.
    """
    container_path = f"{CONTAINER}:{remote}"
    copy_cmd = ["docker", "cp", local, container_path]
    subprocess.run(copy_cmd, check=True)
def get_post(pid):
    """Fetch one post from the container and return it as parsed JSON.

    Runs ``php /tmp/fea_post_io.php get <pid>`` inside the container, copies
    ``/tmp/fea_es.json`` out to the same path on the host and parses it.
    (Presumably the PHP helper dumps the post into that file; the helper is
    not part of this script, so verify against it.)

    Args:
        pid: ID of the post to fetch; converted with ``str()`` for the CLI.

    Returns:
        The decoded JSON document. Callers in this file read its
        ``"content"`` and ``"title"`` keys.

    Raises:
        RuntimeError: if the PHP helper exits with a non-zero status.
        subprocess.CalledProcessError: if ``docker cp`` fails.
    """
    r = dexec(["php", "/tmp/fea_post_io.php", "get", str(pid)])
    if r.returncode != 0:
        raise RuntimeError(f"get {pid}: {r.stderr.strip()}")
    dcp("/tmp/fea_es.json", "/tmp/fea_es.json")
    # Close the handle deterministically and decode as UTF-8 (the JSON
    # interchange encoding) instead of whatever the host locale defaults to.
    with open("/tmp/fea_es.json", encoding="utf-8") as fh:
        return json.load(fh)
def strip_len(html):
    """Return the character count of *html* once ``<...>`` tags are removed.

    Only spans matching ``<[^>]*>`` are dropped; entities and whitespace are
    counted exactly as they appear.
    """
    without_tags = re.compile(r"<[^>]*>").sub("", html)
    return len(without_tags)
def find_broken(state, langs, limit):
    """Return ``[(src, lang), ...]`` for translations that look broken.

    Walks ``state["done"]`` (keys ``"<src>:<lang>"``, values the translated
    post ID), fetches both posts and compares their tag-stripped lengths. A
    translation is reported when its length is below ``RATIO_BROKEN`` times
    the length of the Spanish source.

    Args:
        state: Parsed state document containing a ``"done"`` mapping.
        langs: Language codes to consider; other entries are skipped.
        limit: Maximum number of pairs to return. A value of zero or less
            yields an empty list without touching the container.

    Returns:
        A list of ``(source_post_id, lang)`` tuples, at most ``limit`` long.
    """
    out = []
    # Checking the limit up front avoids fetching posts (and returning one
    # result anyway) when the caller asked for none.
    if limit <= 0:
        return out
    for key, tid in state["done"].items():
        src, lang = key.split(":")
        if lang not in langs:
            continue
        try:
            es = get_post(int(src))
            tr = get_post(int(tid))
        except RuntimeError as exc:
            # Best-effort scan: keep going, but say which entry was skipped
            # instead of dropping it silently.
            print(f"[{key}] no se pudo leer; salto ({exc})", file=sys.stderr)
            continue
        olen = strip_len(es["content"])
        # Sources under 40 visible characters are never flagged (presumably
        # because the length ratio is too noisy on very short text).
        if olen < 40:
            continue
        if strip_len(tr["content"]) / olen < RATIO_BROKEN:
            out.append((int(src), lang))
            if len(out) >= limit:
                break
    return out
def _write_text(path, text):
    """Write *text* to *path* as UTF-8, closing the file before returning."""
    with open(path, "w", encoding="utf-8") as fh:
        fh.write(text)


def main():
    """Command-line entry point: re-translate posts and optionally save them.

    Options:
        --ids     Source post IDs to reprocess (used when --auto is absent).
        --langs   Comma-separated target language codes (default ``en``).
        --auto    Detect broken translations with ``find_broken`` instead of
                  using --ids.
        --limit   Maximum number of pairs picked up by --auto (default 100).
        --apply   Write the new title/body to the existing translated post;
                  without it the run only prints what it would do.

    For every ``(src, lang)`` pair with an entry in the state file, the
    Spanish source is fetched, its body and title are translated, and a
    summary is printed. Token usage is accumulated and reported with an
    estimated cost at the end.
    """
    ap = argparse.ArgumentParser()
    ap.add_argument("--ids", nargs="*", type=int, default=[])
    ap.add_argument("--langs", default="en")
    ap.add_argument("--auto", action="store_true")
    ap.add_argument("--limit", type=int, default=100)
    ap.add_argument("--apply", action="store_true")
    args = ap.parse_args()
    langs = [code.strip() for code in args.langs.split(",") if code.strip()]
    with open(STATE, encoding="utf-8") as fh:
        state = json.load(fh)
    if args.auto:
        print(f"Autodetectando rotos (ratio<{RATIO_BROKEN}) en {langs}…")
        pairs = find_broken(state, langs, args.limit)
        print(f"Encontrados: {pairs}")
    else:
        pairs = [(src, lang) for src in args.ids for lang in langs]
    tot_in = tot_out = 0.0
    for src, lang in pairs:
        # Only overwrite translations that already exist; never create new ones.
        tid = state["done"].get(f"{src}:{lang}")
        if not tid:
            print(f"[{src}:{lang}] sin traducción en state; salto")
            continue
        es = get_post(src)
        body, u1 = translate(es["content"], lang)
        title, u2 = translate(es["title"], lang, is_title=True)
        tot_in += u1.input_tokens + u2.input_tokens
        tot_out += u1.output_tokens + u2.output_tokens
        print(f"\n===== src #{src} [{lang}] -> #{tid} =====")
        print(f"TÍTULO {lang}: {title}")
        print(f"cuerpo ES={strip_len(es['content'])} -> {lang}={strip_len(body)}")
        if args.apply:
            # The files must be fully written and closed before `docker cp`
            # reads them, and translated text must be encoded as UTF-8
            # regardless of the host locale.
            _write_text("/tmp/fea_title.txt", title)
            _write_text("/tmp/fea_body.txt", body)
            dcp_to("/tmp/fea_title.txt", "/tmp/fea_title.txt")
            dcp_to("/tmp/fea_body.txt", "/tmp/fea_body.txt")
            r = dexec(["php", "/tmp/fea_post_io.php", "update", str(tid),
                       "/tmp/fea_title.txt", "/tmp/fea_body.txt"])
            print(("APLICADO: " + r.stdout.strip()) if r.returncode == 0
                  else ("FALLO: " + r.stderr.strip()))
        else:
            print("(dry-run)")
    # Hard-coded rates: 1.0 per million input tokens and 5.0 per million
    # output tokens (presumably the model's USD list prices; verify before
    # relying on the estimate).
    cost = tot_in / 1e6 * 1.0 + tot_out / 1e6 * 5.0
    print(f"\nTOTAL tokens: in={int(tot_in)} out={int(tot_out)} coste=${cost:.4f}")
    print("MODO: " + ("APLICADO a BD" if args.apply else "DRY-RUN"))


# Run the CLI only when executed as a script, not when imported.
if __name__ == "__main__":
    main()