Añadir mu-plugins y scripts de feadulta
This commit is contained in:
@@ -0,0 +1,128 @@
|
||||
#!/usr/bin/env python3
|
||||
"""Reprocesa traducciones ROTAS con Claude Haiku 4.5 (API directa).
|
||||
|
||||
Coge el ES original, lo traduce con Haiku y SOBRESCRIBE la traducción ya
|
||||
existente (in-place, sin duplicar). Detecta los rotos por ratio de longitud.
|
||||
|
||||
Uso:
|
||||
reprocess_en_haiku.py --auto --langs en --limit 100 # EN rotos
|
||||
reprocess_en_haiku.py --auto --langs fr,it,pt --limit 50 # otros idiomas
|
||||
reprocess_en_haiku.py --ids 44205 --langs en # src concretos
|
||||
añade --apply para ESCRIBIR en la BD (sin él = dry-run).
|
||||
"""
|
||||
import argparse
|
||||
import json
|
||||
import os
|
||||
import re
|
||||
import subprocess
|
||||
import sys
|
||||
|
||||
sys.path.insert(0, os.path.dirname(os.path.abspath(__file__)))
|
||||
from translate_haiku import translate # noqa: E402
|
||||
|
||||
# JSON state file; its "done" map links "<src_id>:<lang>" keys to the ID of
# the post that holds the translation.
STATE = "/tmp/feadulta-translate-state.json"
# Docker container targeted by every ``docker exec`` / ``docker cp`` call.
CONTAINER = "wordpress-web"
# A translation whose tag-stripped length is below this fraction of the
# original's tag-stripped length is considered broken.
RATIO_BROKEN = 0.45
|
||||
|
||||
|
||||
def dexec(args):
    """Run a command inside the container through ``docker exec``.

    Args:
        args: Iterable with the command and its arguments, e.g.
            ``["php", "/tmp/script.php", "get", "42"]``.

    Returns:
        The ``subprocess.CompletedProcess`` of the call, with stdout and
        stderr captured as text. A non-zero exit status does not raise;
        callers inspect ``returncode`` themselves.
    """
    command = ["docker", "exec", CONTAINER]
    command.extend(args)
    return subprocess.run(command, capture_output=True, text=True)
|
||||
|
||||
|
||||
def dcp(remote, local):
    """Copy a file out of the container onto the host with ``docker cp``.

    Args:
        remote: Path of the file inside the container.
        local: Destination path on the host.

    Raises:
        subprocess.CalledProcessError: If ``docker cp`` exits non-zero.
    """
    source = f"{CONTAINER}:{remote}"
    subprocess.run(["docker", "cp", source, local], check=True)
|
||||
|
||||
|
||||
def dcp_to(local, remote):
    """Copy a host file into the container with ``docker cp``.

    Args:
        local: Path of the file on the host.
        remote: Destination path inside the container.

    Raises:
        subprocess.CalledProcessError: If ``docker cp`` exits non-zero.
    """
    destination = f"{CONTAINER}:{remote}"
    subprocess.run(["docker", "cp", local, destination], check=True)
|
||||
|
||||
|
||||
def get_post(pid):
    """Fetch a post from the container and return it as parsed JSON.

    Runs ``php /tmp/fea_post_io.php get <pid>`` inside the container, copies
    ``/tmp/fea_es.json`` from the container to the same path on the host and
    decodes it. The same host path is reused for every post, so the result
    must be consumed before the next call.

    Args:
        pid: ID of the post to fetch (converted with ``str``).

    Returns:
        The decoded JSON document. Callers in this file read its
        ``"content"`` and ``"title"`` keys.

    Raises:
        RuntimeError: If the PHP helper exits with a non-zero status.
        subprocess.CalledProcessError: If ``docker cp`` fails.
    """
    r = dexec(["php", "/tmp/fea_post_io.php", "get", str(pid)])
    if r.returncode != 0:
        raise RuntimeError(f"get {pid}: {r.stderr.strip()}")
    dcp("/tmp/fea_es.json", "/tmp/fea_es.json")
    # Close the handle deterministically and decode as UTF-8 (the JSON
    # interchange encoding) instead of whatever the host locale defaults to.
    with open("/tmp/fea_es.json", encoding="utf-8") as fh:
        return json.load(fh)
|
||||
|
||||
|
||||
def strip_len(html):
    """Return the length of ``html`` once every ``<...>`` tag is removed.

    Only angle-bracket tags are dropped; entities and whitespace are counted
    as they appear in the input.
    """
    text_only = re.sub(r"<[^>]*>", "", html)
    return len(text_only)
|
||||
|
||||
|
||||
def find_broken(state, langs, limit):
    """Return ``[(src, lang), ...]`` for translations that look broken.

    Walks ``state["done"]``, whose keys are ``"<src>:<lang>"`` and whose
    values are the IDs of the translated posts. For every entry in one of
    the requested languages, both posts are fetched and their tag-stripped
    content lengths compared; the pair is reported when
    ``translation / original < RATIO_BROKEN``.

    Args:
        state: Parsed state document with a ``"done"`` mapping.
        langs: Collection of language codes to inspect.
        limit: Maximum number of pairs to return. A non-positive value
            yields an empty list without fetching anything.

    Returns:
        A list of ``(source_post_id, lang)`` tuples with at most ``limit``
        items, in the iteration order of ``state["done"]``.
    """
    out = []
    if limit <= 0:
        # Nothing was asked for: do not fetch posts only to discard them.
        return out
    for key, tid in state["done"].items():
        src, lang = key.split(":")
        if lang not in langs:
            continue
        try:
            es = get_post(int(src))
            tr = get_post(int(tid))
        except (RuntimeError, subprocess.CalledProcessError,
                json.JSONDecodeError) as exc:
            # Best-effort scan: one unreadable post must not abort the run,
            # but report it so a broken container is not mistaken for
            # "nothing to fix".
            print(f"[{key}] no se pudo leer; salto: {exc}", file=sys.stderr)
            continue
        olen = strip_len(es["content"])
        if olen < 40:
            # Originals this short make the length ratio meaningless.
            continue
        if strip_len(tr["content"]) / olen < RATIO_BROKEN:
            out.append((int(src), lang))
            if len(out) >= limit:
                break
    return out
|
||||
|
||||
|
||||
def main():
    """Command-line entry point: re-translate posts and optionally save them.

    Options:
        --ids    Source post IDs to reprocess (used when --auto is absent).
        --langs  Comma-separated target language codes (default "en").
        --auto   Detect broken translations with ``find_broken`` instead of
                 using --ids.
        --limit  Maximum number of pairs --auto may return (default 100).
        --apply  Write the new title and body to the translated post;
                 without it the run only prints what it would do.

    For every ``(src, lang)`` pair that has a translated post recorded in
    the state file, the original post is fetched and its content and title
    are translated. With --apply the results are copied into the container
    and ``fea_post_io.php update`` is run on the translated post. Token
    totals and an estimated cost are printed at the end.
    """
    ap = argparse.ArgumentParser()
    ap.add_argument("--ids", nargs="*", type=int, default=[])
    ap.add_argument("--langs", default="en")
    ap.add_argument("--auto", action="store_true")
    ap.add_argument("--limit", type=int, default=100)
    ap.add_argument("--apply", action="store_true")
    args = ap.parse_args()

    langs = [code.strip() for code in args.langs.split(",") if code.strip()]
    # Explicit UTF-8 plus a context manager: no leaked handle and no
    # dependence on the host locale.
    with open(STATE, encoding="utf-8") as fh:
        state = json.load(fh)

    if args.auto:
        print(f"Autodetectando rotos (ratio<{RATIO_BROKEN}) en {langs}…")
        pairs = find_broken(state, langs, args.limit)
        print(f"Encontrados: {pairs}")
    else:
        pairs = [(src, lang) for src in args.ids for lang in langs]

    tot_in = tot_out = 0.0
    for src, lang in pairs:
        tid = state["done"].get(f"{src}:{lang}")
        if not tid:
            print(f"[{src}:{lang}] sin traducción en state; salto")
            continue
        es = get_post(src)
        body, u1 = translate(es["content"], lang)
        title, u2 = translate(es["title"], lang, is_title=True)
        tot_in += u1.input_tokens + u2.input_tokens
        tot_out += u1.output_tokens + u2.output_tokens

        print(f"\n===== src #{src} [{lang}] -> #{tid} =====")
        print(f"TÍTULO {lang}: {title}")
        print(f"cuerpo ES={strip_len(es['content'])} -> {lang}={strip_len(body)}")

        if args.apply:
            # The files must be flushed and closed before ``docker cp``
            # reads them, and translated text is routinely non-ASCII, so
            # write it as UTF-8 regardless of the locale.
            with open("/tmp/fea_title.txt", "w", encoding="utf-8") as fh:
                fh.write(title)
            with open("/tmp/fea_body.txt", "w", encoding="utf-8") as fh:
                fh.write(body)
            dcp_to("/tmp/fea_title.txt", "/tmp/fea_title.txt")
            dcp_to("/tmp/fea_body.txt", "/tmp/fea_body.txt")
            r = dexec(["php", "/tmp/fea_post_io.php", "update", str(tid),
                       "/tmp/fea_title.txt", "/tmp/fea_body.txt"])
            print(("APLICADO: " + r.stdout.strip()) if r.returncode == 0
                  else ("FALLO: " + r.stderr.strip()))
        else:
            print("(dry-run)")

    # Hard-coded rates: 1.0 per million input tokens, 5.0 per million output
    # tokens, reported in dollars.
    cost = tot_in / 1e6 * 1.0 + tot_out / 1e6 * 5.0
    print(f"\nTOTAL tokens: in={int(tot_in)} out={int(tot_out)} coste=${cost:.4f}")
    print("MODO: " + ("APLICADO a BD" if args.apply else "DRY-RUN"))
|
||||
|
||||
|
||||
# Run the CLI only when the file is executed as a script, not on import.
if __name__ == "__main__":
    main()
|
||||
Reference in New Issue
Block a user