# feadulta/scripts/retranslate_failures.py

#!/usr/bin/env python3
"""
retranslate_failures.py

Re-translates posts where content is in the wrong language.
Reads the audit CSV (/tmp/audit_clean.csv), fetches the Spanish originals,
retranslates both title and content, and updates the DB.

Uses a clean prompt WITHOUT few-shot examples to avoid contamination.
"""

import pymysql
import json
import re
import html
import urllib.request
import urllib.error
import time
import sys
import csv
from langdetect import detect, LangDetectException, DetectorFactory
# Fix langdetect's seed so the same text yields the same detection on every run.
DetectorFactory.seed = 0

# Chat-completions endpoint and model name sent by call_jan().
JAN_URL   = "http://172.19.128.1:1337/v1/chat/completions"
JAN_MODEL = "gemma-3-12b-it-Q4_K_M"

# Keyword arguments for pymysql.connect(); DictCursor returns rows as dicts.
# NOTE(review): host and credentials are hard-coded — consider reading them
# from the environment instead of committing them.
DB = dict(host='172.18.0.2', port=3306, user='wordpress_user',
          password='wordpress_pass', database='wordpress_db', charset='utf8mb4',
          cursorclass=pymysql.cursors.DictCursor)

# Target-language slug -> language name interpolated into the prompts.
LANG_NAMES = {"en": "English", "fr": "French", "it": "Italian", "pt": "Portuguese"}
# Normalisation applied to langdetect codes before comparing with a post's
# language slug. 'ca' and 'gl' are folded into 'es' — presumably because
# Spanish text is sometimes detected as Catalan/Galician; confirm.
LANG_NORM  = {'es':'es','pt':'pt','fr':'fr','en':'en','it':'it','ca':'es','gl':'es'}

# HTML footer ("Translated with AI") appended to translated content when absent.
AI_FOOTER = "\n<p><em>Traducido con IA</em></p>"


def strip_html(text):
    """Return *text* as plain text.

    Tags are removed, HTML entities are decoded and every run of whitespace
    is collapsed to a single space. Falsy input (``None``, ``''``) yields
    an empty string.
    """
    if not text:
        return ''
    # Each tag becomes a space (rather than nothing) so neighbouring words
    # separated only by markup do not fuse together.
    without_tags = re.sub(r'<[^>]+>', ' ', text)
    # Entities are decoded *after* tag removal, so an escaped "&lt;b&gt;"
    # survives as literal text.
    decoded = html.unescape(without_tags)
    collapsed = re.sub(r'\s+', ' ', decoded)
    return collapsed.strip()


def detect_lang(text, min_len=80):
    """Detect the language of *text* and return its normalised code.

    HTML is stripped first and only the first 600 characters of the plain
    text are examined. The detected code is mapped through ``LANG_NORM``;
    codes without an entry there are returned unchanged.

    Args:
        text: Raw (possibly HTML) text; may be ``None`` or empty.
        min_len: Minimum plain-text length required to attempt detection.

    Returns:
        The language code, or ``None`` when the sample is shorter than
        *min_len* or langdetect cannot classify it.
    """
    sample = strip_html(text)[:600].strip()
    if len(sample) < min_len:
        return None
    try:
        # Run detection once and reuse the result for both the lookup key
        # and the fallback value.
        code = detect(sample)
    except LangDetectException:
        # Only langdetect's own "cannot classify" failure means "unknown";
        # KeyboardInterrupt and genuine bugs must propagate.
        return None
    return LANG_NORM.get(code, code)


def call_jan(messages, max_tokens=4096, temperature=0.3, timeout=300):
    """POST a chat-completion request to ``JAN_URL`` and return the reply text.

    Args:
        messages: List of ``{"role": ..., "content": ...}`` dicts.
        max_tokens: Value sent as ``max_tokens`` in the request.
        temperature: Value sent as ``temperature`` in the request.
        timeout: Socket timeout in seconds passed to ``urlopen``.

    Returns:
        The stripped ``content`` of the first choice in the JSON response.

    Raises:
        Whatever ``urllib``/``json`` raise on network or decoding failure,
        and ``KeyError``/``IndexError`` if the response lacks the expected
        structure; nothing is caught here.
    """
    body = {
        "model": JAN_MODEL,
        "messages": messages,
        "temperature": temperature,
        "max_tokens": max_tokens,
    }
    request = urllib.request.Request(
        JAN_URL,
        data=json.dumps(body).encode("utf-8"),
        headers={"Content-Type": "application/json", "Authorization": "Bearer dummy"},
        method="POST",
    )
    with urllib.request.urlopen(request, timeout=timeout) as resp:
        raw = resp.read()
    reply = json.loads(raw)
    return reply["choices"][0]["message"]["content"].strip()


def translate_content(title, content, lang_code, lang_name):
    """Translate title + content using a clean prompt (no few-shot contamination).

    The model is asked to answer with a first line of the form
    ``Title: <translated title>`` followed by the translated body.

    Args:
        title: Spanish title.
        content: Spanish body (HTML).
        lang_code: Target language slug. Not used by this function; kept so
            the signature stays stable for callers.
        lang_name: Target language name interpolated into the prompt.

    Returns:
        ``(t_title, t_content)``. If the reply does not start with
        ``Title:``, its first line is still taken as the title; if the reply
        is a single line without the marker, that line is returned as both
        title and content.
    """
    marker = "Title:"
    system = (
        "You are a professional translator specializing in theological and religious texts. "
        f"Translate from Spanish to {lang_name}. "
        "Rules: preserve all HTML tags exactly; translate the title literally in ALL CAPS; "
        "maintain formal theological register; translate standard religious proper nouns (e.g. 'Jesús' → 'Jesus' in English); "
        "keep person/place names as-is; return ONLY the translation starting with 'Title:'"
    )
    user = f"Title: {title}\n\n{content}"
    response = call_jan([
        {"role": "system", "content": system},
        {"role": "user", "content": user},
    ])

    # Everything up to the first newline is the title line; the rest is body.
    first_line, newline, remainder = response.partition("\n")
    if first_line.startswith(marker):
        # Drop only the leading marker: str.replace would also delete any
        # later "Title:" that legitimately occurs inside the title text.
        t_title = first_line[len(marker):].strip()
        t_content = remainder.strip()
    else:
        t_title = first_line.strip()
        # No newline at all: the whole reply doubles as the content.
        t_content = remainder.strip() if newline else response

    return t_title, t_content


def translate_title_only(title, lang_name):
    """Translate a single Spanish title into *lang_name* in ALL CAPS.

    Uses a short, low-temperature request and returns the reply with
    surrounding whitespace and wrapping quote characters removed.
    """
    system_msg = {
        "role": "system",
        "content": "You are a translator. Respond ONLY with the translated text, nothing else.",
    }
    user_msg = {
        "role": "user",
        "content": f"Translate from Spanish to {lang_name}, ALL CAPS:\n\n{title}",
    }
    reply = call_jan([system_msg, user_msg], max_tokens=120, temperature=0.1, timeout=30)

    cleaned = reply.strip()
    # Peel double quotes first, then single quotes, in that order.
    for quote in ('"', "'"):
        cleaned = cleaned.strip(quote)
    return cleaned


def _load_failed_ids(audit_path):
    """Return the set of integer post IDs found in the ``id`` column of the audit CSV."""
    # newline='' lets the csv module do its own line-ending handling.
    with open(audit_path, newline='') as f:
        return {int(row['id']) for row in csv.DictReader(f)}


def _fetch_posts(failed_ids):
    """Fetch the flagged published posts together with their Spanish originals.

    Each returned dict is the selected row (``ID``, ``post_title``,
    ``post_content``, ``lang``, ``group_desc``) extended with ``es_id``,
    ``es_title`` and ``es_content``. Rows whose translation-group description
    has no ``es`` entry, or whose Spanish post cannot be found, are dropped.
    """
    # IDs were produced by int(), so interpolating them into the IN list is safe.
    id_list = ','.join(str(i) for i in sorted(failed_ids))
    # The group description is a PHP-serialised map; pull out the "es" post ID.
    es_pattern = re.compile(r's:2:"es";i:(\d+);')

    posts = []
    es_cache = {}
    db = pymysql.connect(**DB)
    try:
        c = db.cursor()
        c.execute(f"""
            SELECT DISTINCT p.ID, p.post_title, p.post_content,
                   t_lang.slug as lang,
                   ttg.description as group_desc
            FROM wp_posts p
            JOIN wp_term_relationships trl ON p.ID=trl.object_id
            JOIN wp_term_taxonomy ttl ON trl.term_taxonomy_id=ttl.term_taxonomy_id AND ttl.taxonomy='language'
            JOIN wp_terms t_lang ON ttl.term_id=t_lang.term_id
            JOIN wp_term_relationships trg ON p.ID=trg.object_id
            JOIN wp_term_taxonomy ttg ON trg.term_taxonomy_id=ttg.term_taxonomy_id AND ttg.taxonomy='post_translations'
            WHERE p.ID IN ({id_list}) AND p.post_type='post' AND p.post_status='publish'
        """)
        for p in c.fetchall():
            m = es_pattern.search(p['group_desc'] or '')
            if not m:
                continue
            es_id = int(m.group(1))
            if es_id == p['ID']:
                # The flagged post IS the Spanish original; "retranslating"
                # it would overwrite the source text with model output.
                print(f"  {p['ID']} — SKIPPING (post is the Spanish original)")
                continue
            if es_id not in es_cache:
                c.execute("SELECT ID, post_title, post_content FROM wp_posts WHERE ID=%s", (es_id,))
                es_cache[es_id] = c.fetchone()
            es = es_cache[es_id]
            if es:
                posts.append({**p, 'es_id': es_id, 'es_title': es['post_title'], 'es_content': es['post_content']})
    finally:
        # Release the connection even if a query fails.
        db.close()
    return posts


def _save_translation(post_id, title, content):
    """Write the translated *title* and *content* to ``wp_posts`` for *post_id*."""
    # A fresh connection per write, as before — presumably so no connection
    # sits idle across minutes-long model calls (confirm).
    db = pymysql.connect(**DB)
    try:
        c = db.cursor()
        c.execute("UPDATE wp_posts SET post_title=%s, post_content=%s WHERE ID=%s",
                  (title, content, post_id))
        db.commit()
    finally:
        db.close()


def main():
    """Retranslate every post listed in the audit CSV and store the result.

    Steps: load the flagged post IDs, fetch each post with its Spanish
    original, translate per (original, target language), check the output
    language with ``detect_lang`` (retrying once with a simpler prompt on a
    mismatch), append the AI footer and update ``wp_posts``.

    Content that still fails the language check after the retry is written
    anyway and reported with a ⚠ marker. A blank translation is never
    written; it is counted as an error instead.

    Exits with status 1 if the audit CSV is missing.
    """
    # Load audit results
    audit_path = '/tmp/audit_clean.csv'
    try:
        failed_ids = _load_failed_ids(audit_path)
    except FileNotFoundError:
        print(f"ERROR: {audit_path} not found. Run audit_translations.py first.")
        sys.exit(1)
    print(f"Loaded {len(failed_ids)} post IDs with issues from audit")

    if not failed_ids:
        # An empty ID list would render as "IN ()", which is invalid SQL.
        print("Nothing to retranslate.")
        return

    posts = _fetch_posts(failed_ids)
    print(f"Fetched {len(posts)} posts to retranslate\n")

    # Group by Spanish original so each source is reported once
    by_es = {}
    for p in posts:
        by_es.setdefault(p['es_id'], []).append(p)

    done = errors = skipped = 0
    total = len(posts)
    n = 0

    for es_id, translations in sorted(by_es.items()):
        es_title = translations[0]['es_title']
        es_content = translations[0]['es_content'] or ''
        content_len = len(strip_html(es_content))

        if content_len < 50:
            print(f"  ES:{es_id} — SKIPPING (content too short: {content_len} chars)")
            skipped += len(translations)
            n += len(translations)
            continue

        print(f"\nES:{es_id} — {(es_title or '')[:50]} ({content_len} chars)")

        for p in translations:
            post_id = p['ID']
            lang = p['lang']
            lang_name = LANG_NAMES.get(lang, lang)
            n += 1

            try:
                t0 = time.time()
                t_title, t_content = translate_content(es_title or '', es_content, lang, lang_name)
                elapsed = time.time() - t0

                # Validate: content should now be in target language.
                # None means "too short / undetectable" and is accepted.
                content_lang = detect_lang(t_content, min_len=80)
                ok = (content_lang == lang) or content_lang is None

                # If still wrong language, retry with simpler prompt
                if not ok and content_lang:
                    print(f"  [{lang}] ⚠ Content still {content_lang}, retrying...")
                    retry_response = call_jan([
                        {"role": "system", "content": f"You are a professional translator. Translate the following Spanish text to {lang_name}. Preserve all HTML tags. Return ONLY the translated text, no preamble, no explanation."},
                        {"role": "user", "content": es_content}
                    ])
                    t_content = retry_response
                    content_lang2 = detect_lang(t_content, min_len=80)
                    if content_lang2 == lang or content_lang2 is None:
                        print(f"  [{lang}] ✓ Retry succeeded ({content_lang2})")
                        ok = True
                    else:
                        print(f"  [{lang}] ✗ Retry still {content_lang2}, saving anyway")

                # Never replace a post with nothing but the footer.
                if not t_content.strip():
                    raise ValueError("model returned an empty translation")
                # Keep the current title rather than blanking it.
                if not t_title:
                    t_title = p['post_title'] or ''

                # Add AI footer if not present
                if AI_FOOTER.strip() not in t_content:
                    t_content = t_content + AI_FOOTER

                # Update DB
                _save_translation(post_id, t_title, t_content)

                status = "✓" if ok else "⚠"
                print(f"  [{lang}] {status} {post_id}: {t_title[:50]} ({elapsed:.0f}s)")
                done += 1

            except Exception as e:
                # Per-post boundary: report and carry on with the next post.
                print(f"  [{lang}] ✗ ERROR on {post_id}: {e}")
                errors += 1

    print(f"\n{'='*50}")
    print(f"Done: {done} retranslated, {errors} errors, {skipped} skipped")
    print(f"Total processed: {n}/{total}")


# Run the retranslation only when executed as a script, not when imported.
if __name__ == "__main__":
    main()