Files
feadulta/scripts/retranslate_failures.py

234 lines
8.7 KiB
Python

#!/usr/bin/env python3
"""
retranslate_failures.py
Re-translates posts where content is in the wrong language.
Reads the audit CSV (/tmp/audit_clean.csv), fetches Spanish originals,
retranslates both the title and the content, and updates the DB.
Uses a clean prompt WITHOUT few-shot examples to avoid contamination.
"""
import pymysql
import json
import re
import html
import urllib.request
import urllib.error
import time
import sys
import csv
from langdetect import detect, LangDetectException, DetectorFactory
# langdetect is non-deterministic unless seeded; a fixed seed makes detect() repeatable.
DetectorFactory.seed = 0

# Chat-completions HTTP endpoint and model name sent with every translation request.
JAN_URL = "http://172.19.128.1:1337/v1/chat/completions"
JAN_MODEL = "gemma-3-12b-it-Q4_K_M"

# Keyword arguments for pymysql.connect(); DictCursor returns each row as a dict.
DB = {
    'host': '172.18.0.2',
    'port': 3306,
    'user': 'wordpress_user',
    'password': 'wordpress_pass',
    'database': 'wordpress_db',
    'charset': 'utf8mb4',
    'cursorclass': pymysql.cursors.DictCursor,
}

# Target-language slug -> English language name interpolated into the prompts.
LANG_NAMES = dict(en="English", fr="French", it="Italian", pt="Portuguese")

# Detector output -> code used for comparison; 'ca' and 'gl' are counted as Spanish.
LANG_NORM = {
    'es': 'es',
    'pt': 'pt',
    'fr': 'fr',
    'en': 'en',
    'it': 'it',
    'ca': 'es',
    'gl': 'es',
}

# Footer appended to translated content when it is not already present.
AI_FOOTER = "\n<p><em>Traducido con IA</em></p>"
def strip_html(text):
    """Reduce an HTML fragment to plain text.

    Tags are replaced by a space, HTML entities are decoded, and every run of
    whitespace is collapsed to a single space with the ends trimmed.
    Falsy input (``None`` or ``''``) yields ``''``.
    """
    if not text:
        return ''
    without_tags = re.sub(r'<[^>]+>', ' ', text)
    decoded = html.unescape(without_tags)
    collapsed = re.sub(r'\s+', ' ', decoded)
    return collapsed.strip()
def detect_lang(text, min_len=80):
    """Guess the language of an HTML fragment.

    Only the first 600 characters of the tag-stripped text are examined.

    Args:
        text: HTML or plain text to classify (may be ``None`` or empty).
        min_len: minimum number of plain-text characters required; shorter
            samples are not classified.

    Returns:
        The detected language code mapped through ``LANG_NORM`` (codes absent
        from the map are returned unchanged), or ``None`` when the sample is
        too short or the detector cannot classify it.
    """
    sample = strip_html(text)[:600].strip()
    if len(sample) < min_len:
        return None
    try:
        # Run the detector once and reuse the result for both lookup and fallback.
        code = detect(sample)
    except LangDetectException:
        # Catch only the detector's own failure; a bare except would also
        # swallow KeyboardInterrupt and hide real bugs.
        return None
    return LANG_NORM.get(code, code)
def call_jan(messages, max_tokens=4096, temperature=0.3, timeout=300):
    """POST a chat-completion request to ``JAN_URL`` and return the reply text.

    Args:
        messages: list of ``{"role": ..., "content": ...}`` dicts.
        max_tokens: value sent as ``max_tokens`` in the request body.
        temperature: value sent as ``temperature`` in the request body.
        timeout: socket timeout in seconds passed to ``urlopen``.

    Returns:
        The ``content`` of the first choice's message, stripped of surrounding
        whitespace.
    """
    body = {
        "model": JAN_MODEL,
        "messages": messages,
        "temperature": temperature,
        "max_tokens": max_tokens,
    }
    request_headers = {"Content-Type": "application/json", "Authorization": "Bearer dummy"}
    request = urllib.request.Request(
        JAN_URL,
        data=json.dumps(body).encode("utf-8"),
        headers=request_headers,
        method="POST",
    )
    with urllib.request.urlopen(request, timeout=timeout) as resp:
        raw = resp.read()
    reply = json.loads(raw)
    first_choice = reply["choices"][0]
    return first_choice["message"]["content"].strip()
def translate_content(title, content, lang_code, lang_name):
    """Translate title + content using a clean prompt (no few-shot contamination).

    Args:
        title: Spanish title, sent to the model on a leading ``Title:`` line.
        content: Spanish post content (HTML), sent after a blank line.
        lang_code: target language slug. Not used by this function; kept so
            the signature stays compatible with existing callers.
        lang_name: English name of the target language, used in the prompt.

    Returns:
        ``(translated_title, translated_content)``. When the reply starts
        with ``Title:`` that marker is dropped from the first line; otherwise
        the first line is taken as the title verbatim. If the reply has no
        marker and is a single line, that line is returned as both title and
        content.
    """
    system = (
        "You are a professional translator specializing in theological and religious texts. "
        f"Translate from Spanish to {lang_name}. "
        "Rules: preserve all HTML tags exactly; translate the title literally in ALL CAPS; "
        "maintain formal theological register; translate standard religious proper nouns (e.g. 'Jesús' → 'Jesus' in English); "
        "keep person/place names as-is; return ONLY the translation starting with 'Title:'"
    )
    user = f"Title: {title}\n\n{content}"
    response = call_jan([
        {"role": "system", "content": system},
        {"role": "user", "content": user},
    ])

    first_line, newline, rest = response.partition("\n")
    marker = "Title:"
    if first_line.startswith(marker):
        # Remove only the leading marker; str.replace would also delete any
        # later "Title:" that legitimately appears inside the title itself.
        return first_line[len(marker):].strip(), rest.strip()
    # No marker: treat the first line as the title. With a single-line reply
    # there is no separate body, so the whole response is the content.
    return first_line.strip(), (rest.strip() if newline else response)
def translate_title_only(title, lang_name):
    """Translate just a title from Spanish into *lang_name*.

    The model is asked for an ALL CAPS rendering; the reply is returned with
    surrounding whitespace and any wrapping double or single quotes removed.
    """
    system_msg = {
        "role": "system",
        "content": "You are a translator. Respond ONLY with the translated text, nothing else.",
    }
    user_msg = {
        "role": "user",
        "content": f"Translate from Spanish to {lang_name}, ALL CAPS:\n\n{title}",
    }
    reply = call_jan([system_msg, user_msg], max_tokens=120, temperature=0.1, timeout=30)
    without_double_quotes = reply.strip().strip('"')
    return without_double_quotes.strip("'")
def _load_failed_ids(audit_path):
    """Return the set of integer post IDs found in the ``id`` column of the audit CSV."""
    # newline='' is what the csv module asks for when it is handed a file object.
    with open(audit_path, newline='') as f:
        return {int(row['id']) for row in csv.DictReader(f)}


def _fetch_posts(failed_ids):
    """Load the flagged published posts together with their Spanish originals.

    Each returned dict is the queried row (``ID``, ``post_title``,
    ``post_content``, ``lang``, ``group_desc``) extended with ``es_id``,
    ``es_title`` and ``es_content``. Rows whose translation-group description
    names no Spanish post, or whose Spanish post does not exist, are dropped.
    *failed_ids* must be non-empty.
    """
    ids = sorted(failed_ids)
    # The IDs are passed as query parameters instead of being spliced into the SQL.
    placeholders = ','.join(['%s'] * len(ids))
    db = pymysql.connect(**DB)
    try:
        c = db.cursor()
        c.execute(f"""
            SELECT DISTINCT p.ID, p.post_title, p.post_content,
                   t_lang.slug as lang,
                   ttg.description as group_desc
            FROM wp_posts p
            JOIN wp_term_relationships trl ON p.ID=trl.object_id
            JOIN wp_term_taxonomy ttl ON trl.term_taxonomy_id=ttl.term_taxonomy_id AND ttl.taxonomy='language'
            JOIN wp_terms t_lang ON ttl.term_id=t_lang.term_id
            JOIN wp_term_relationships trg ON p.ID=trg.object_id
            JOIN wp_term_taxonomy ttg ON trg.term_taxonomy_id=ttg.term_taxonomy_id AND ttg.taxonomy='post_translations'
            WHERE p.ID IN ({placeholders}) AND p.post_type='post' AND p.post_status='publish'
        """, ids)
        raw_posts = c.fetchall()

        posts = []
        es_cache = {}  # Spanish post ID -> row (or None), so each original is read once
        for p in raw_posts:
            desc = p['group_desc'] or ''
            # The description looks like a PHP-serialized map of language slug -> post ID;
            # pull out the entry for "es".
            m = re.search(r's:2:"es";i:(\d+);', desc)
            if not m:
                continue
            es_id = int(m.group(1))
            if es_id not in es_cache:
                c.execute("SELECT ID, post_title, post_content FROM wp_posts WHERE ID=%s", (es_id,))
                es_cache[es_id] = c.fetchone()
            es = es_cache[es_id]
            if es:
                posts.append({**p, 'es_id': es_id, 'es_title': es['post_title'], 'es_content': es['post_content']})
        return posts
    finally:
        # Close the connection even when a query fails.
        db.close()


def _save_translation(post_id, title, content):
    """Write the translated title and content to ``wp_posts`` for *post_id*."""
    # A short-lived connection per update, as in the original flow; presumably so that
    # no connection sits idle across the slow translation calls.
    conn = pymysql.connect(**DB)
    try:
        cur = conn.cursor()
        cur.execute("UPDATE wp_posts SET post_title=%s, post_content=%s WHERE ID=%s",
                    (title, content, post_id))
        conn.commit()
    finally:
        conn.close()


def main():
    """Retranslate every post listed in the audit CSV and store the result.

    Steps:
      1. Read post IDs from ``/tmp/audit_clean.csv``; exit with status 1 if the
         file is missing, return early if it lists no IDs.
      2. Fetch those posts and their Spanish originals from the database.
      3. For each post, translate the Spanish original into the post's
         language, check the result with ``detect_lang``, retry once with a
         simpler prompt if the detected language is wrong, append the AI
         footer and update the post.
      4. Print a summary of retranslated / failed / skipped posts.

    A post whose translation comes back empty is counted as an error and left
    unchanged. A translation that is still in the wrong language after the
    retry is saved anyway and flagged in the output.
    """
    # Load audit results
    audit_path = '/tmp/audit_clean.csv'
    try:
        failed_ids = _load_failed_ids(audit_path)
    except FileNotFoundError:
        print(f"ERROR: {audit_path} not found. Run audit_translations.py first.")
        sys.exit(1)
    print(f"Loaded {len(failed_ids)} post IDs with issues from audit")
    if not failed_ids:
        # An empty ID list would produce "IN ()", which is a SQL syntax error.
        print("Nothing to retranslate.")
        return

    posts = _fetch_posts(failed_ids)
    print(f"Fetched {len(posts)} posts to retranslate\n")

    # Group by Spanish original so all of its translations are handled together.
    by_es = {}
    for p in posts:
        by_es.setdefault(p['es_id'], []).append(p)

    done = errors = skipped = 0
    total = len(posts)
    n = 0
    for es_id, translations in sorted(by_es.items()):
        es_title = translations[0]['es_title']
        es_content = translations[0]['es_content'] or ''
        content_len = len(strip_html(es_content))
        if content_len < 50:
            print(f" ES:{es_id} — SKIPPING (content too short: {content_len} chars)")
            skipped += len(translations)
            n += len(translations)
            continue
        print(f"\nES:{es_id}{(es_title or '')[:50]} ({content_len} chars)")
        for p in translations:
            post_id = p['ID']
            lang = p['lang']
            lang_name = LANG_NAMES.get(lang, lang)
            n += 1
            try:
                t0 = time.time()
                t_title, t_content = translate_content(es_title or '', es_content, lang, lang_name)
                elapsed = time.time() - t0

                # Validate: content should now be in the target language.
                # None means "could not tell" and is accepted.
                content_lang = detect_lang(t_content, min_len=80)
                ok = (content_lang == lang) or content_lang is None

                # If still wrong language, retry with simpler prompt
                if not ok:
                    print(f" [{lang}] ⚠ Content still {content_lang}, retrying...")
                    retry_response = call_jan([
                        {"role": "system", "content": f"You are a professional translator. Translate the following Spanish text to {lang_name}. Preserve all HTML tags. Return ONLY the translated text, no preamble, no explanation."},
                        {"role": "user", "content": es_content}
                    ])
                    t_content = retry_response
                    content_lang2 = detect_lang(t_content, min_len=80)
                    if content_lang2 == lang or content_lang2 is None:
                        print(f" [{lang}] ✓ Retry succeeded ({content_lang2})")
                        ok = True
                    else:
                        print(f" [{lang}] ✗ Retry still {content_lang2}, saving anyway")

                # An empty reply passes the language check (detect_lang returns None for
                # short text); saving it would wipe the post, so treat it as a failure.
                if not strip_html(t_content):
                    raise ValueError("model returned an empty translation; post left unchanged")
                # Never blank out the title: keep the stored one if the model gave none.
                if not t_title:
                    t_title = p['post_title'] or ''

                # Add AI footer if not present
                if AI_FOOTER.strip() not in t_content:
                    t_content = t_content + AI_FOOTER

                _save_translation(post_id, t_title, t_content)
                status = "✓" if ok else "⚠"
                print(f" [{lang}] {status} {post_id}: {t_title[:50]} ({elapsed:.0f}s)")
                done += 1
            except Exception as e:
                # Per-post boundary: report the failure and carry on with the next post.
                print(f" [{lang}] ✗ ERROR on {post_id}: {e}")
                errors += 1

    print(f"\n{'='*50}")
    print(f"Done: {done} retranslated, {errors} errors, {skipped} skipped")
    print(f"Total processed: {n}/{total}")
# Run the retranslation only when this file is executed as a script, not when it is imported.
if __name__ == "__main__":
    main()