#!/usr/bin/env python3
"""
retranslate_failures.py

Re-translates posts where content is in the wrong language.

Reads the audit CSV (/tmp/audit_clean.csv), fetches Spanish originals,
retranslates content (and title if needed), and updates the DB.
Uses a clean prompt WITHOUT few-shot examples to avoid contamination.
"""

import csv
import html
import json
import re
import sys
import time
import urllib.error
import urllib.request

import pymysql
from langdetect import detect, LangDetectException, DetectorFactory

# Fixed seed so langdetect gives the same answer for the same text on every run.
DetectorFactory.seed = 0

JAN_URL = "http://172.19.128.1:1337/v1/chat/completions"
JAN_MODEL = "gemma-3-12b-it-Q4_K_M"
DB = dict(host='172.18.0.2', port=3306, user='wordpress_user',
          password='wordpress_pass', database='wordpress_db',
          charset='utf8mb4', cursorclass=pymysql.cursors.DictCursor)

LANG_NAMES = {"en": "English", "fr": "French", "it": "Italian", "pt": "Portuguese"}
# Detected codes are folded onto the site's languages; 'ca' and 'gl' are mapped to 'es'.
LANG_NORM = {'es': 'es', 'pt': 'pt', 'fr': 'fr', 'en': 'en', 'it': 'it', 'ca': 'es', 'gl': 'es'}

# NOTE(review): the literal was split across physical lines in the reviewed
# source and may originally have carried HTML markup around the text (e.g. a
# paragraph wrapper) — confirm against the deployed script before running.
AI_FOOTER = "\n\nTraducido con IA\n"

TITLE_PREFIX = "Title:"


def strip_html(text):
    """Return *text* with tags removed, entities unescaped and whitespace collapsed.

    Falsy input (``None`` or ``''``) yields ``''``.
    """
    if not text:
        return ''
    text = re.sub(r'<[^>]+>', ' ', text)
    text = html.unescape(text)
    return re.sub(r'\s+', ' ', text).strip()


def detect_lang(text, min_len=80):
    """Detect the language of the first 600 visible characters of *text*.

    Returns the normalized language code (via ``LANG_NORM``; unknown codes are
    returned unchanged), or ``None`` when the plain text is shorter than
    *min_len* or langdetect cannot classify it.
    """
    sample = strip_html(text)[:600].strip()
    if len(sample) < min_len:
        return None
    try:
        code = detect(sample)
    except LangDetectException:
        # Raised by langdetect when the sample has no usable features.
        return None
    return LANG_NORM.get(code, code)


def call_jan(messages, max_tokens=4096, temperature=0.3, timeout=300):
    """POST a chat-completion request to ``JAN_URL`` and return the reply text.

    *messages* is the list of ``{"role", "content"}`` dicts sent as-is.
    Network and HTTP errors from ``urllib`` propagate to the caller.
    """
    payload = json.dumps({
        "model": JAN_MODEL,
        "messages": messages,
        "temperature": temperature,
        "max_tokens": max_tokens,
    }).encode("utf-8")
    req = urllib.request.Request(
        JAN_URL,
        data=payload,
        headers={"Content-Type": "application/json", "Authorization": "Bearer dummy"},
        method="POST"
    )
    with urllib.request.urlopen(req, timeout=timeout) as r:
        result = json.loads(r.read())
    return result["choices"][0]["message"]["content"].strip()


def translate_content(title, content, lang_code, lang_name):
    """Translate title + content using a clean prompt (no few-shot contamination).

    *lang_code* is accepted for interface compatibility but is not used here;
    the prompt is built from *lang_name*.

    Returns ``(translated_title, translated_content)``. The first line of the
    model's reply is taken as the title (with a leading ``Title:`` marker
    removed when present); everything after the first newline is the content.
    If the reply has no ``Title:`` marker and no newline, the whole reply is
    returned as the content as well.
    """
    system = (
        f"You are a professional translator specializing in theological and religious texts. "
        f"Translate from Spanish to {lang_name}. "
        f"Rules: preserve all HTML tags exactly; translate the title literally in ALL CAPS; "
        f"maintain formal theological register; translate standard religious proper nouns (e.g. 'Jesús' → 'Jesus' in English); "
        f"keep person/place names as-is; return ONLY the translation starting with 'Title:'"
    )
    user = f"Title: {title}\n\n{content}"
    response = call_jan([
        {"role": "system", "content": system},
        {"role": "user", "content": user}
    ])

    first_line, newline, rest = response.partition("\n")
    if first_line.startswith(TITLE_PREFIX):
        # Drop only the leading marker; a "Title:" inside the title text stays.
        t_title = first_line[len(TITLE_PREFIX):].strip()
        t_content = rest.strip()
    else:
        t_title = first_line.strip()
        t_content = rest.strip() if newline else response
    return t_title, t_content


def translate_title_only(title, lang_name):
    """Translate just *title* from Spanish to *lang_name* (ALL CAPS), unquoted."""
    response = call_jan([
        {"role": "system", "content": "You are a translator. Respond ONLY with the translated text, nothing else."},
        {"role": "user", "content": f"Translate from Spanish to {lang_name}, ALL CAPS:\n\n{title}"}
    ], max_tokens=120, temperature=0.1, timeout=30)
    return response.strip().strip('"').strip("'")


def _load_failed_ids(audit_path):
    """Read the audit CSV and return the set of integer post IDs in its ``id`` column."""
    failed_ids = set()
    with open(audit_path, newline='', encoding='utf-8') as f:
        for row in csv.DictReader(f):
            failed_ids.add(int(row['id']))
    return failed_ids


def _fetch_posts(failed_ids):
    """Fetch the failed posts plus the title/content of their Spanish original.

    Posts whose translation group has no ``es`` entry, or whose Spanish post
    row is missing, are left out. The connection is closed before returning,
    even on error.
    """
    ids = sorted(failed_ids)
    # One %s per ID so the driver does the quoting instead of string interpolation.
    placeholders = ','.join(['%s'] * len(ids))
    query = f"""
        SELECT DISTINCT p.ID, p.post_title, p.post_content,
               t_lang.slug as lang, ttg.description as group_desc
        FROM wp_posts p
        JOIN wp_term_relationships trl ON p.ID=trl.object_id
        JOIN wp_term_taxonomy ttl ON trl.term_taxonomy_id=ttl.term_taxonomy_id
             AND ttl.taxonomy='language'
        JOIN wp_terms t_lang ON ttl.term_id=t_lang.term_id
        JOIN wp_term_relationships trg ON p.ID=trg.object_id
        JOIN wp_term_taxonomy ttg ON trg.term_taxonomy_id=ttg.term_taxonomy_id
             AND ttg.taxonomy='post_translations'
        WHERE p.ID IN ({placeholders})
          AND p.post_type='post' AND p.post_status='publish'
    """

    posts = []
    es_cache = {}
    db = pymysql.connect(**DB)
    try:
        c = db.cursor()
        c.execute(query, ids)
        raw_posts = c.fetchall()

        # Extract Spanish ID from group description and fetch Spanish content
        for p in raw_posts:
            desc = p['group_desc'] or ''
            m = re.search(r's:2:"es";i:(\d+);', desc)
            if not m:
                continue
            es_id = int(m.group(1))
            if es_id not in es_cache:
                c.execute("SELECT ID, post_title, post_content FROM wp_posts WHERE ID=%s", (es_id,))
                es_cache[es_id] = c.fetchone()
            es = es_cache[es_id]
            if es:
                posts.append({**p, 'es_id': es_id,
                              'es_title': es['post_title'],
                              'es_content': es['post_content']})
    finally:
        db.close()
    return posts


def _update_post(post_id, title, content):
    """Write the new title and content for *post_id* using a short-lived connection.

    A fresh connection is opened per update (as the original script did) and is
    always closed, including when the UPDATE or commit raises.
    """
    db = pymysql.connect(**DB)
    try:
        c = db.cursor()
        c.execute("UPDATE wp_posts SET post_title=%s, post_content=%s WHERE ID=%s",
                  (title, content, post_id))
        db.commit()
    finally:
        db.close()


def main():
    """Retranslate every post listed in the audit CSV and save the result.

    Exits with status 1 when the audit file is missing. Per-post failures are
    reported and counted, and do not stop the run.
    """
    # Load audit results
    audit_path = '/tmp/audit_clean.csv'
    try:
        failed_ids = _load_failed_ids(audit_path)
    except FileNotFoundError:
        print(f"ERROR: {audit_path} not found. Run audit_translations.py first.")
        sys.exit(1)
    print(f"Loaded {len(failed_ids)} post IDs with issues from audit")

    if not failed_ids:
        # An empty ID list would otherwise produce an invalid "IN ()" query.
        print("Nothing to retranslate.")
        return

    posts = _fetch_posts(failed_ids)
    print(f"Fetched {len(posts)} posts to retranslate\n")

    # Group by Spanish original to avoid redundant API calls
    by_es = {}
    for p in posts:
        by_es.setdefault(p['es_id'], []).append(p)

    done = errors = skipped = 0
    total = len(posts)
    n = 0

    for es_id, translations in sorted(by_es.items()):
        es_title = translations[0]['es_title']
        es_content = translations[0]['es_content'] or ''
        content_len = len(strip_html(es_content))

        if content_len < 50:
            print(f"  ES:{es_id} — SKIPPING (content too short: {content_len} chars)")
            skipped += len(translations)
            n += len(translations)
            continue

        print(f"\nES:{es_id} — {(es_title or '')[:50]} ({content_len} chars)")

        for p in translations:
            post_id = p['ID']
            lang = p['lang']
            lang_name = LANG_NAMES.get(lang, lang)
            n += 1

            if post_id == es_id:
                # Never overwrite the Spanish source with a translation of itself.
                print(f"  [{lang}] SKIPPING {post_id} (this is the Spanish original)")
                skipped += 1
                continue

            try:
                t0 = time.time()
                t_title, t_content = translate_content(es_title or '', es_content, lang, lang_name)
                elapsed = time.time() - t0

                # Validate: content should now be in target language
                # (None means "too short / undetectable" and is accepted).
                content_lang = detect_lang(t_content, min_len=80)
                ok = content_lang is None or content_lang == lang

                # If still wrong language, retry with simpler prompt
                if not ok:
                    print(f"  [{lang}] ⚠ Content still {content_lang}, retrying...")
                    t_content = call_jan([
                        {"role": "system", "content": f"You are a professional translator. Translate the following Spanish text to {lang_name}. Preserve all HTML tags. Return ONLY the translated text, no preamble, no explanation."},
                        {"role": "user", "content": es_content}
                    ])
                    content_lang2 = detect_lang(t_content, min_len=80)
                    if content_lang2 == lang or content_lang2 is None:
                        print(f"  [{lang}] ✓ Retry succeeded ({content_lang2})")
                        ok = True
                    else:
                        print(f"  [{lang}] ✗ Retry still {content_lang2}, saving anyway")

                if not strip_html(t_content):
                    # Saving would wipe the post body; report it as an error instead.
                    raise ValueError("model returned an empty translation")
                if not t_title:
                    # Keep the stored title rather than blanking it.
                    t_title = p['post_title'] or ''

                # Add AI footer if not present
                if AI_FOOTER.strip() not in t_content:
                    t_content = t_content + AI_FOOTER

                # Update DB
                _update_post(post_id, t_title, t_content)

                status = "✓" if ok else "⚠"
                print(f"  [{lang}] {status} {post_id}: {t_title[:50]} ({elapsed:.0f}s)")
                done += 1

            except Exception as e:
                # Top-level per-post boundary: report and move on to the next post.
                print(f"  [{lang}] ✗ ERROR on {post_id}: {e}")
                errors += 1

    print(f"\n{'='*50}")
    print(f"Done: {done} retranslated, {errors} errors, {skipped} skipped")
    print(f"Total processed: {n}/{total}")


if __name__ == "__main__":
    main()