129 lines
4.2 KiB
Python
129 lines
4.2 KiB
Python
#!/usr/bin/env python3
|
|
"""Reprocesa traducciones ROTAS con Claude Haiku 4.5 (API directa).
|
|
|
|
Coge el ES original, lo traduce con Haiku y SOBRESCRIBE la traducción ya
|
|
existente (in-place, sin duplicar). Detecta los rotos por ratio de longitud.
|
|
|
|
Uso:
|
|
reprocess_en_haiku.py --auto --langs en --limit 100 # EN rotos
|
|
reprocess_en_haiku.py --auto --langs fr,it,pt --limit 50 # otros idiomas
|
|
reprocess_en_haiku.py --ids 44205 --langs en # src concretos
|
|
añade --apply para ESCRIBIR en la BD (sin él = dry-run).
|
|
"""
|
|
import argparse
|
|
import json
|
|
import os
|
|
import re
|
|
import subprocess
|
|
import sys
|
|
|
|
sys.path.insert(0, os.path.dirname(os.path.abspath(__file__)))
|
|
from translate_haiku import translate # noqa: E402
|
|
|
|
# Host-side JSON state file. main() loads it and reads its "done" mapping,
# whose keys have the form "<src_id>:<lang>" and whose values are the ids of
# the already-existing translated posts.
STATE = "/tmp/feadulta-translate-state.json"
|
|
# Name of the Docker container targeted by every `docker exec` / `docker cp`
# call issued by this script.
CONTAINER = "wordpress-web"
|
|
# A translation is reported as broken when its tag-stripped length divided by
# the tag-stripped length of the original is below this ratio.
RATIO_BROKEN = 0.45
|
|
|
|
|
|
def dexec(args):
    """Run a command inside the container named by ``CONTAINER``.

    Args:
        args: Iterable of command tokens appended after
            ``docker exec <CONTAINER>``.

    Returns:
        The ``subprocess.CompletedProcess`` of the call, with stdout and
        stderr captured as text. The exit status is not checked here;
        callers inspect ``returncode`` themselves.
    """
    command = ["docker", "exec", CONTAINER]
    command.extend(args)
    return subprocess.run(command, capture_output=True, text=True)
|
|
|
|
|
|
def dcp(remote, local):
    """Copy a file from the container to the host with ``docker cp``.

    Args:
        remote: Path of the file inside ``CONTAINER``.
        local: Destination path on the host.

    Raises:
        subprocess.CalledProcessError: If ``docker cp`` exits non-zero.
    """
    container_path = f"{CONTAINER}:{remote}"
    subprocess.run(["docker", "cp", container_path, local], check=True)
|
|
|
|
|
|
def dcp_to(local, remote):
    """Copy a file from the host into the container with ``docker cp``.

    Args:
        local: Path of the file on the host.
        remote: Destination path inside ``CONTAINER``.

    Raises:
        subprocess.CalledProcessError: If ``docker cp`` exits non-zero.
    """
    container_path = f"{CONTAINER}:{remote}"
    subprocess.run(["docker", "cp", local, container_path], check=True)
|
|
|
|
|
|
def get_post(pid):
    """Fetch a post from the container and return it as parsed JSON.

    Runs ``php /tmp/fea_post_io.php get <pid>`` inside the container, copies
    ``/tmp/fea_es.json`` from the container to the same path on the host and
    parses it. Presumably the PHP helper writes the requested post to that
    file; the helper itself is not part of this module.

    Args:
        pid: Post id; converted with ``str()`` before being passed to PHP.

    Returns:
        The decoded JSON document. Callers in this file read its
        ``"content"`` and ``"title"`` entries.

    Raises:
        RuntimeError: If the PHP helper exits with a non-zero status.
        subprocess.CalledProcessError: If ``docker cp`` fails.
    """
    r = dexec(["php", "/tmp/fea_post_io.php", "get", str(pid)])
    if r.returncode != 0:
        raise RuntimeError(f"get {pid}: {r.stderr.strip()}")
    dcp("/tmp/fea_es.json", "/tmp/fea_es.json")
    # Read as bytes so json detects the UTF-8/16/32 encoding itself instead of
    # depending on the host locale, and close the handle deterministically.
    with open("/tmp/fea_es.json", "rb") as fh:
        return json.load(fh)
|
|
|
|
|
|
def strip_len(html):
    """Return the length of *html* after removing every ``<...>`` span.

    Only text between a ``<`` and the next ``>`` is dropped; HTML entities
    are not decoded and count as written.
    """
    without_tags = re.sub(r"<[^>]*>", "", html)
    return len(without_tags)
|
|
|
|
|
|
def find_broken(state, langs, limit):
    """Return ``[(src, lang), ...]`` for translations that look broken.

    Walks ``state["done"]`` (keys ``"<src>:<lang>"``, values translated post
    ids), fetches both the original and the translated post, and reports the
    pair when the tag-stripped length of the translation divided by that of
    the original is below ``RATIO_BROKEN``.

    Args:
        state: Mapping with a ``"done"`` dict as described above.
        langs: Container of language codes to inspect; other languages are
            skipped without fetching anything.
        limit: Maximum number of pairs to return. A value of zero or less
            yields an empty list.

    Returns:
        A list of ``(src_id, lang)`` tuples with at most ``limit`` entries.
    """
    out = []
    # Without this guard the limit is only checked after the first append, so
    # a non-positive limit would still return one pair.
    if limit <= 0:
        return out
    for key, tid in state["done"].items():
        src, lang = key.split(":")
        if lang not in langs:
            continue
        try:
            es = get_post(int(src))
            tr = get_post(int(tid))
        except (RuntimeError, subprocess.CalledProcessError):
            # Best-effort scan: a post that cannot be fetched (PHP helper or
            # docker cp failure) is skipped instead of aborting the whole run.
            continue
        olen = strip_len(es["content"])
        # Very short originals are skipped; this also avoids dividing by zero.
        if olen < 40:
            continue
        if strip_len(tr["content"]) / olen < RATIO_BROKEN:
            out.append((int(src), lang))
            if len(out) >= limit:
                break
    return out
|
|
|
|
|
|
def main():
    """Command-line entry point: re-translate posts and optionally write them back.

    Options:
        --ids     Source post ids to reprocess (used when ``--auto`` is absent).
        --langs   Comma-separated language codes (default ``en``).
        --auto    Detect broken translations with ``find_broken`` instead of
                  using ``--ids``.
        --limit   Maximum number of pairs returned by the auto-detection.
        --apply   Write the new title and body into the existing translated
                  post; without it nothing is written (dry-run).

    For each ``(src, lang)`` pair that has an entry in the state file, the
    source post is fetched and its content and title are translated. With
    ``--apply`` the results are written to temp files, copied into the
    container and handed to the PHP helper's ``update`` command. A token and
    cost summary is printed at the end.
    """
    ap = argparse.ArgumentParser()
    ap.add_argument("--ids", nargs="*", type=int, default=[])
    ap.add_argument("--langs", default="en")
    ap.add_argument("--auto", action="store_true")
    ap.add_argument("--limit", type=int, default=100)
    ap.add_argument("--apply", action="store_true")
    args = ap.parse_args()

    langs = [code.strip() for code in args.langs.split(",") if code.strip()]
    with open(STATE, encoding="utf-8") as fh:
        state = json.load(fh)

    if args.auto:
        print(f"Autodetectando rotos (ratio<{RATIO_BROKEN}) en {langs}…")
        pairs = find_broken(state, langs, args.limit)
        print(f"Encontrados: {pairs}")
    else:
        pairs = [(src, lang) for src in args.ids for lang in langs]

    tot_in = tot_out = 0.0
    for src, lang in pairs:
        tid = state["done"].get(f"{src}:{lang}")
        if not tid:
            # Only existing translations are overwritten; nothing is created.
            print(f"[{src}:{lang}] sin traducción en state; salto")
            continue
        es = get_post(src)
        body, u1 = translate(es["content"], lang)
        title, u2 = translate(es["title"], lang, is_title=True)
        tot_in += u1.input_tokens + u2.input_tokens
        tot_out += u1.output_tokens + u2.output_tokens

        print(f"\n===== src #{src} [{lang}] -> #{tid} =====")
        print(f"TÍTULO {lang}: {title}")
        print(f"cuerpo ES={strip_len(es['content'])} -> {lang}={strip_len(body)}")

        if args.apply:
            # Close (and therefore flush) both files before `docker cp` reads
            # them, and write UTF-8 regardless of the host locale so
            # non-ASCII translated text is preserved.
            with open("/tmp/fea_title.txt", "w", encoding="utf-8") as fh:
                fh.write(title)
            with open("/tmp/fea_body.txt", "w", encoding="utf-8") as fh:
                fh.write(body)
            dcp_to("/tmp/fea_title.txt", "/tmp/fea_title.txt")
            dcp_to("/tmp/fea_body.txt", "/tmp/fea_body.txt")
            r = dexec(["php", "/tmp/fea_post_io.php", "update", str(tid),
                       "/tmp/fea_title.txt", "/tmp/fea_body.txt"])
            print(("APLICADO: " + r.stdout.strip()) if r.returncode == 0
                  else ("FALLO: " + r.stderr.strip()))
        else:
            print("(dry-run)")

    # Cost uses the rates hard-coded here: 1.0 per million input tokens and
    # 5.0 per million output tokens.
    # NOTE(review): confirm these match the current pricing of the model used.
    cost = tot_in / 1e6 * 1.0 + tot_out / 1e6 * 5.0
    print(f"\nTOTAL tokens: in={int(tot_in)} out={int(tot_out)} coste=${cost:.4f}")
    print("MODO: " + ("APLICADO a BD" if args.apply else "DRY-RUN"))


if __name__ == "__main__":
    main()
|