# feadulta/scripts/retranslate_chunks.py

#!/usr/bin/env python3
"""
retranslate_chunks.py

Re-translates posts where content is in the wrong language.
Splits post_content into chunks of ~800 chars (at </p> boundaries)
and translates each chunk independently to avoid model drift.
"""

import pymysql
import json
import re
import html
import urllib.request
import time
import sys
import csv
from langdetect import detect, LangDetectException, DetectorFactory
DetectorFactory.seed = 0

# Chat-completions endpoint and model name sent with every translation request.
JAN_URL   = "http://172.19.128.1:1337/v1/chat/completions"
JAN_MODEL = "gemma-3-12b-it-Q4_K_M"

# Keyword arguments passed to pymysql.connect(); DictCursor makes rows come
# back as dicts keyed by column name.
# NOTE(review): host and credentials are hard-coded — consider reading them
# from the environment before this script is shared or committed.
DB = dict(host='172.18.0.2', port=3306, user='wordpress_user',
          password='wordpress_pass', database='wordpress_db', charset='utf8mb4',
          cursorclass=pymysql.cursors.DictCursor)

# Language slug -> English language name interpolated into the model prompts.
LANG_NAMES = {"en": "English", "fr": "French", "it": "Italian", "pt": "Portuguese"}
# Normalisation applied to detected language codes; 'ca' and 'gl'
# (presumably Catalan/Galician — verify) are folded into 'es'.
LANG_NORM  = {'es':'es','pt':'pt','fr':'fr','en':'en','it':'it','ca':'es','gl':'es'}

# HTML footer appended to translated content (Spanish for "Translated with AI").
AI_FOOTER  = "\n<p><em>Traducido con IA</em></p>"
CHUNK_SIZE = 800   # max chars per translation chunk
# Number of attempts translate_chunk makes per chunk before giving up.
MAX_RETRIES = 2


def strip_html(text):
    """Return *text* as plain text.

    Every HTML tag is replaced by a space, HTML entities are decoded, and
    runs of whitespace are collapsed to a single space. Falsy input
    (``None`` or an empty string) yields ``''``.
    """
    if not text:
        return ''
    without_tags = re.sub(r'<[^>]+>', ' ', text)
    # Entities are decoded only after tags are removed, so an escaped tag
    # such as "&lt;b&gt;" survives as literal text.
    decoded = html.unescape(without_tags)
    collapsed = re.sub(r'\s+', ' ', decoded)
    return collapsed.strip()


def detect_lang(text, min_len=60):
    """Detect the language of *text* and return a normalised language code.

    The text is reduced to plain text with ``strip_html`` and only its first
    600 characters are sampled. The detected code is mapped through
    ``LANG_NORM``; codes not present there are returned unchanged.

    Args:
        text: HTML or plain text to inspect (may be ``None`` or empty).
        min_len: Minimum length of the plain-text sample; anything shorter
            is considered too small to classify.

    Returns:
        The normalised language code, or ``None`` when the sample is shorter
        than *min_len* or langdetect cannot classify it.
    """
    sample = strip_html(text)[:600].strip()
    if len(sample) < min_len:
        return None
    try:
        # Run the detector once and reuse its answer for both the lookup key
        # and the fallback value.
        code = detect(sample)
    except LangDetectException:
        # Raised by langdetect when the sample has no usable features.
        return None
    return LANG_NORM.get(code, code)


def call_jan(messages, max_tokens=1200, temperature=0.2, timeout=120):
    """POST a chat-completion request to ``JAN_URL`` and return the reply text.

    Args:
        messages: List of ``{"role": ..., "content": ...}`` dicts.
        max_tokens: Value sent as ``max_tokens`` in the request body.
        temperature: Value sent as ``temperature`` in the request body.
        timeout: Timeout in seconds passed to ``urlopen``.

    Returns:
        The ``content`` of the first choice's message, stripped of
        surrounding whitespace.

    Errors from the HTTP call, JSON decoding or an unexpected response
    structure are not caught here and propagate to the caller.
    """
    body = {
        "model": JAN_MODEL,
        "messages": messages,
        "temperature": temperature,
        "max_tokens": max_tokens,
    }
    request = urllib.request.Request(
        JAN_URL,
        data=json.dumps(body).encode("utf-8"),
        headers={"Content-Type": "application/json", "Authorization": "Bearer dummy"},
        method="POST",
    )
    with urllib.request.urlopen(request, timeout=timeout) as response:
        reply = json.loads(response.read())
    first_choice = reply["choices"][0]
    return first_choice["message"]["content"].strip()


def translate_chunk(chunk, lang_name):
    """Translate a single HTML chunk from Spanish into *lang_name*.

    Makes up to ``MAX_RETRIES`` attempts. An attempt is accepted when:

    * the chunk has fewer than 40 plain-text characters and the result
      differs from the source (or it is already a retry), or
    * the chunk is longer and ``detect_lang`` either cannot classify the
      result or reports the expected target language.

    After a rejected attempt the system prompt is replaced by a more
    explicit one before retrying.

    Args:
        chunk: HTML fragment to translate.
        lang_name: Target language name as used in the prompt (normally a
            value of ``LANG_NAMES``, e.g. ``"Portuguese"``).

    Returns:
        The translated text, or ``None`` when every attempt failed or was
        rejected.
    """
    system = (
        f"You are a professional translator. Translate the following Spanish text to {lang_name}. "
        f"Preserve all HTML tags exactly as they are. "
        f"Return ONLY the translated text, nothing else. No preamble, no explanation."
    )
    plain = strip_html(chunk).strip()
    plain_len = len(plain)

    # Resolve the language code the detector should report. Taking the first
    # two letters of the name is wrong for "Portuguese" ("po" instead of
    # "pt"), so look the name up in LANG_NAMES and only fall back to the
    # prefix for names that are not listed there.
    name_to_code = {name: code for code, name in LANG_NAMES.items()}
    expected_lang = name_to_code.get(lang_name, lang_name[:2].lower())

    for attempt in range(MAX_RETRIES):
        try:
            result = call_jan([
                {"role": "system", "content": system},
                {"role": "user",   "content": chunk}
            ])
            # For short chunks (headings, short phrases) langdetect is unreliable —
            # accept the result as long as it changed from the original Spanish
            if plain_len < 40:
                changed = strip_html(result).strip().lower() != plain.lower()
                if changed or attempt > 0:
                    return result
            else:
                lang = detect_lang(result, min_len=40)
                if lang is None or lang == expected_lang:
                    return result
            # Wrong language — retry with more explicit prompt
            system = (
                f"Translate from Spanish to {lang_name}. "
                f"Your response must be entirely in {lang_name}. "
                f"Preserve HTML tags. Return ONLY the translation."
            )
        except Exception:
            # Best effort: a failed request is retried after a short pause;
            # the last failure is reported to the caller as None.
            if attempt == MAX_RETRIES - 1:
                return None
            time.sleep(2)
    return None  # all retries failed


def translate_title(title, lang_name):
    """Translate a post title from Spanish into *lang_name*.

    The prompt asks the model for an ALL CAPS translation. Surrounding
    whitespace and quote characters are removed from the reply.

    Args:
        title: Spanish title text.
        lang_name: Target language name inserted into the prompt.

    Returns:
        The cleaned translated title, or ``None`` if the request failed.
    """
    try:
        result = call_jan([
            {"role": "system", "content": "You are a translator. Respond ONLY with the translated text, nothing else."},
            {"role": "user",   "content": f"Translate from Spanish to {lang_name}, ALL CAPS:\n\n{title}"}
        ], max_tokens=120, temperature=0.1, timeout=30)
        return result.strip().strip('"').strip("'")
    except Exception:
        # Best effort: the caller keeps the existing title on None.
        # KeyboardInterrupt/SystemExit are deliberately not swallowed so a
        # long batch run can still be aborted.
        return None


def split_into_chunks(content, max_size=CHUNK_SIZE):
    """Split HTML content into chunks of roughly *max_size* characters.

    The content is cut after closing ``</p>``, ``</li>``, ``</h1>``-``</h6>``
    and ``</blockquote>`` tags, and consecutive segments are packed together
    while they fit. A single segment longer than *max_size* is further split
    at sentence boundaries (whitespace following ``.``, ``!`` or ``?``), with
    the sentences re-joined by single spaces.

    *max_size* is a soft limit: a single sentence longer than it is kept
    whole. Whitespace-only chunks are dropped from the result.
    """
    # With one capturing group, re.split alternates text and closing tag, so
    # even positions are bodies and odd positions are their closers.
    tokens = re.split(r'(</p>|</li>|</h[1-6]>|</blockquote>)', content)
    bodies = tokens[0::2]
    closers = tokens[1::2] + [""]  # the trailing body has no closing tag

    out = []
    buf = ""
    for body, closer in zip(bodies, closers):
        block = body + closer

        # Common case: the block still fits into the chunk being built.
        if len(buf) + len(block) <= max_size:
            buf += block
            continue

        # It does not fit: flush what has been collected so far.
        if buf:
            out.append(buf)

        if len(block) <= max_size:
            buf = block
            continue

        # The block alone is too large; fall back to sentence packing.
        buf = ""
        for sentence in re.split(r'(?<=[.!?])\s+', block):
            if len(buf) + len(sentence) > max_size:
                if buf:
                    out.append(buf.strip())
                buf = sentence + " "
            else:
                buf += sentence + " "

    if buf:
        out.append(buf)

    return [piece for piece in out if piece.strip()]


def translate_content_chunked(content, lang_name):
    """Translate a whole post body chunk by chunk.

    The content is split with ``split_into_chunks`` and every chunk that
    contains text is passed to ``translate_chunk``. Chunks made up only of
    markup or whitespace are copied through untouched, and a chunk whose
    translation fails is kept in its original form so no content is lost.
    The pieces are joined with newlines.

    Returns:
        ``(translated_content, success_ratio)`` where ``success_ratio`` is
        ``1.0`` minus the share of chunks whose translation failed. Empty or
        whitespace-only content is returned unchanged with a ratio of ``1.0``.
    """
    if not content or not content.strip():
        return content, 1.0

    chunks = split_into_chunks(content)
    pieces = []
    failures = 0

    for chunk in chunks:
        has_text = bool(strip_html(chunk).strip())
        # Markup-only chunks are never sent to the model.
        translated = translate_chunk(chunk, lang_name) if has_text else chunk
        if translated is None:
            # Translation failed: fall back to the source chunk.
            failures += 1
            translated = chunk
        pieces.append(translated)

    if chunks:
        success_ratio = 1.0 - (failures / len(chunks))
    else:
        success_ratio = 1.0
    return "\n".join(pieces), success_ratio


def _load_audit_ids(audit_path):
    """Return the set of integer post IDs found in the 'id' column of the audit CSV."""
    ids = set()
    # newline='' is what the csv module recommends for files handed to a reader.
    with open(audit_path, newline='') as f:
        for row in csv.DictReader(f):
            ids.add(int(row['id']))
    return ids


def _fetch_posts(failed_ids):
    """Load the flagged published posts together with their Spanish source post.

    Returns a list of row dicts extended with ``es_id``, ``es_title`` and
    ``es_content``. Posts whose translation group has no ``es`` entry, or
    whose Spanish post cannot be found, are left out.
    """
    db = pymysql.connect(**DB)
    try:
        c = db.cursor()

        # The IDs were converted with int() when loaded, so interpolating
        # them into the IN (...) list cannot inject SQL.
        id_list = ','.join(str(i) for i in sorted(failed_ids))
        c.execute(f"""
        SELECT DISTINCT p.ID, p.post_title, p.post_content,
               t_lang.slug as lang,
               ttg.description as group_desc
        FROM wp_posts p
        JOIN wp_term_relationships trl ON p.ID=trl.object_id
        JOIN wp_term_taxonomy ttl ON trl.term_taxonomy_id=ttl.term_taxonomy_id AND ttl.taxonomy='language'
        JOIN wp_terms t_lang ON ttl.term_id=t_lang.term_id
        JOIN wp_term_relationships trg ON p.ID=trg.object_id
        JOIN wp_term_taxonomy ttg ON trg.term_taxonomy_id=ttg.term_taxonomy_id AND ttg.taxonomy='post_translations'
        WHERE p.ID IN ({id_list}) AND p.post_type='post' AND p.post_status='publish'
    """)
        raw_posts = c.fetchall()

        # Fetch Spanish originals
        posts = []
        es_cache = {}
        for p in raw_posts:
            desc = p['group_desc'] or ''
            # Pull the Spanish post ID out of the group description
            # (presumably a PHP-serialized language -> post-ID map — verify).
            m = re.search(r's:2:"es";i:(\d+);', desc)
            if not m:
                continue
            es_id = int(m.group(1))
            if es_id not in es_cache:
                c.execute("SELECT ID, post_title, post_content FROM wp_posts WHERE ID=%s", (es_id,))
                es_cache[es_id] = c.fetchone()
            es = es_cache[es_id]
            if es:
                posts.append({**p, 'es_id': es_id, 'es_title': es['post_title'], 'es_content': es['post_content']})
        return posts
    finally:
        # Close the connection even when a query fails.
        db.close()


def _save_translation(post_id, title, content):
    """Write the translated title and content of one post using a fresh connection."""
    # A new connection per update is kept from the original design
    # (presumably because translating can take long enough for an idle
    # connection to time out — verify).
    db = pymysql.connect(**DB)
    try:
        cur = db.cursor()
        cur.execute("UPDATE wp_posts SET post_title=%s, post_content=%s WHERE ID=%s",
                    (title, content, post_id))
        db.commit()
    finally:
        db.close()


def main():
    """Re-translate every post listed in the audit CSV from its Spanish source.

    Steps:
      1. Load the flagged post IDs from ``/tmp/audit_clean.csv``; exit with
         status 1 if the file is missing.
      2. Fetch those posts and their Spanish originals from the database.
      3. For each Spanish original, translate title and content into the
         language of each flagged translation, append the AI footer and
         write the result back to ``wp_posts``.
      4. Print a per-post status line and a final summary.

    Spanish originals with fewer than 50 plain-text characters are skipped.
    The translated content is written even when the overall language check
    fails; such posts are reported with the warning symbol.
    """
    audit_path = '/tmp/audit_clean.csv'
    try:
        failed_ids = _load_audit_ids(audit_path)
        print(f"Loaded {len(failed_ids)} post IDs with issues from audit")
    except FileNotFoundError:
        print(f"ERROR: {audit_path} not found. Run audit_translations.py first.")
        sys.exit(1)

    if not failed_ids:
        # An empty ID list would produce the invalid SQL "IN ()".
        print("No post IDs in audit file; nothing to retranslate.")
        return

    posts = _fetch_posts(failed_ids)
    print(f"Fetched {len(posts)} posts to retranslate\n")

    # Group the translations by their Spanish source so each source is
    # handled once.
    by_es = {}
    for p in posts:
        by_es.setdefault(p['es_id'], []).append(p)

    done = errors = skipped = partial = 0
    total = len(posts)
    n = 0

    for es_id, translations in sorted(by_es.items()):
        es_title   = translations[0]['es_title'] or ''
        es_content = translations[0]['es_content'] or ''
        content_len = len(strip_html(es_content))

        if content_len < 50:
            print(f"  ES:{es_id} — SKIPPING (too short: {content_len} chars)")
            skipped += len(translations)
            n += len(translations)
            continue

        # Show chunk count for visibility
        chunks = split_into_chunks(es_content)
        print(f"\nES:{es_id} — {es_title[:50]} ({content_len} chars, {len(chunks)} chunks)")

        for p in translations:
            post_id   = p['ID']
            lang      = p['lang']
            lang_name = LANG_NAMES.get(lang, lang)
            n += 1

            try:
                t0 = time.time()

                # Translate title
                t_title = translate_title(es_title, lang_name) if es_title else ''
                if not t_title or t_title.upper() == es_title.upper():
                    t_title = p['post_title']  # keep existing if translation failed

                # Translate content chunk by chunk
                t_content, ratio = translate_content_chunked(es_content, lang_name)
                elapsed = time.time() - t0

                # Validate overall content language
                content_lang = detect_lang(t_content, min_len=80)
                lang_ok = (content_lang == lang) or content_lang is None

                # Add AI footer
                if AI_FOOTER.strip() not in t_content:
                    t_content = t_content + AI_FOOTER

                # Update DB
                _save_translation(post_id, t_title, t_content)

                # Keep the counters in step with the printed symbol:
                # ⚠ = wrong language, ~ = some chunks failed, ✓ = all good.
                if not lang_ok:
                    status = "⚠"
                    errors += 1
                elif ratio < 1.0:
                    status = "~"
                    partial += 1
                else:
                    status = "✓"
                    done += 1

                print(f"  [{lang}] {status} {post_id}: {t_title[:50]} ({elapsed:.0f}s, {ratio:.0%} ok)")

            except Exception as e:
                # Per-post boundary: report and continue with the next post.
                print(f"  [{lang}] ✗ ERROR on {post_id}: {e}")
                errors += 1

    print(f"\n{'='*50}")
    print(f"Done: {done} ✓  partial: {partial} ~  errors/wrong-lang: {errors} ⚠  skipped: {skipped}")
    print(f"Total: {n}/{total}")


if __name__ == "__main__":
    main()