Añadir mu-plugins y scripts de feadulta

2026-06-28 15:10:46 -04:00
parent bce7e42f44
commit b6116b066d
106 changed files with 17600 additions and 2 deletions
@@ -0,0 +1,233 @@
+#!/usr/bin/env python3
+"""
+retranslate_failures.py
+
+Re-translates posts where content is in the wrong language.
+Reads the audit CSV (/tmp/audit_clean.csv), fetches Spanish originals,
+retranslates content (and title if needed), and updates the DB.
+
+Uses a clean prompt WITHOUT few-shot examples to avoid contamination.
+"""
+
+import pymysql
+import json
+import re
+import html
+import urllib.request
+import urllib.error
+import time
+import sys
+import csv
+from langdetect import detect, LangDetectException, DetectorFactory
+DetectorFactory.seed = 0
+
+JAN_URL   = "http://172.19.128.1:1337/v1/chat/completions"
+JAN_MODEL = "gemma-3-12b-it-Q4_K_M"
+
+DB = dict(host='172.18.0.2', port=3306, user='wordpress_user',
+          password='wordpress_pass', database='wordpress_db', charset='utf8mb4',
+          cursorclass=pymysql.cursors.DictCursor)
+
+LANG_NAMES = {"en": "English", "fr": "French", "it": "Italian", "pt": "Portuguese"}
+LANG_NORM  = {'es':'es','pt':'pt','fr':'fr','en':'en','it':'it','ca':'es','gl':'es'}
+
+AI_FOOTER = "\n<p><em>Traducido con IA</em></p>"
+
+
+def strip_html(text):
+    if not text: return ''
+    text = re.sub(r'<[^>]+>', ' ', text)
+    text = html.unescape(text)
+    return re.sub(r'\s+', ' ', text).strip()
+
+
+def detect_lang(text, min_len=80):
+    t = strip_html(text)[:600].strip()
+    if len(t) < min_len: return None
+    try: return LANG_NORM.get(detect(t), detect(t))
+    except: return None
+
+
+def call_jan(messages, max_tokens=4096, temperature=0.3, timeout=300):
+    payload = json.dumps({
+        "model": JAN_MODEL,
+        "messages": messages,
+        "temperature": temperature,
+        "max_tokens": max_tokens,
+    }).encode("utf-8")
+    req = urllib.request.Request(
+        JAN_URL, data=payload,
+        headers={"Content-Type": "application/json", "Authorization": "Bearer dummy"},
+        method="POST"
+    )
+    with urllib.request.urlopen(req, timeout=timeout) as r:
+        result = json.loads(r.read())
+        return result["choices"][0]["message"]["content"].strip()
+
+
+def translate_content(title, content, lang_code, lang_name):
+    """Translate title + content using a clean prompt (no few-shot contamination)."""
+    system = (
+        f"You are a professional translator specializing in theological and religious texts. "
+        f"Translate from Spanish to {lang_name}. "
+        f"Rules: preserve all HTML tags exactly; translate the title literally in ALL CAPS; "
+        f"maintain formal theological register; translate standard religious proper nouns (e.g. 'Jesús' → 'Jesus' in English); "
+        f"keep person/place names as-is; return ONLY the translation starting with 'Title:'"
+    )
+    user = f"Title: {title}\n\n{content}"
+    response = call_jan([
+        {"role": "system", "content": system},
+        {"role": "user", "content": user}
+    ])
+
+    lines = response.split("\n", 2)
+    if lines[0].startswith("Title:"):
+        t_title = lines[0].replace("Title:", "").strip()
+        t_content = "\n".join(lines[1:]).strip() if len(lines) > 1 else ""
+    else:
+        t_title = lines[0].strip()
+        t_content = "\n".join(lines[1:]).strip() if len(lines) > 1 else response
+
+    return t_title, t_content
+
+
+def translate_title_only(title, lang_name):
+    response = call_jan([
+        {"role": "system", "content": "You are a translator. Respond ONLY with the translated text, nothing else."},
+        {"role": "user", "content": f"Translate from Spanish to {lang_name}, ALL CAPS:\n\n{title}"}
+    ], max_tokens=120, temperature=0.1, timeout=30)
+    return response.strip().strip('"').strip("'")
+
+
+def main():
+    # Load audit results
+    audit_path = '/tmp/audit_clean.csv'
+    failed_ids = set()
+    try:
+        with open(audit_path) as f:
+            reader = csv.DictReader(f)
+            for row in reader:
+                failed_ids.add(int(row['id']))
+        print(f"Loaded {len(failed_ids)} post IDs with issues from audit")
+    except FileNotFoundError:
+        print(f"ERROR: {audit_path} not found. Run audit_translations.py first.")
+        sys.exit(1)
+
+    db = pymysql.connect(**DB)
+    c = db.cursor()
+
+    # Fetch failed posts - get lang and translation group description
+    id_list = ','.join(str(i) for i in sorted(failed_ids))
+    c.execute(f"""
+        SELECT DISTINCT p.ID, p.post_title, p.post_content,
+               t_lang.slug as lang,
+               ttg.description as group_desc
+        FROM wp_posts p
+        JOIN wp_term_relationships trl ON p.ID=trl.object_id
+        JOIN wp_term_taxonomy ttl ON trl.term_taxonomy_id=ttl.term_taxonomy_id AND ttl.taxonomy='language'
+        JOIN wp_terms t_lang ON ttl.term_id=t_lang.term_id
+        JOIN wp_term_relationships trg ON p.ID=trg.object_id
+        JOIN wp_term_taxonomy ttg ON trg.term_taxonomy_id=ttg.term_taxonomy_id AND ttg.taxonomy='post_translations'
+        WHERE p.ID IN ({id_list}) AND p.post_type='post' AND p.post_status='publish'
+    """)
+    raw_posts = c.fetchall()
+
+    # Extract Spanish ID from group description and fetch Spanish content
+    import re as _re
+    posts = []
+    es_cache = {}
+    for p in raw_posts:
+        desc = p['group_desc'] or ''
+        m = _re.search(r's:2:"es";i:(\d+);', desc)
+        if not m:
+            continue
+        es_id = int(m.group(1))
+        if es_id not in es_cache:
+            c.execute("SELECT ID, post_title, post_content FROM wp_posts WHERE ID=%s", (es_id,))
+            row = c.fetchone()
+            es_cache[es_id] = row
+        es = es_cache[es_id]
+        if es:
+            posts.append({**p, 'es_id': es_id, 'es_title': es['post_title'], 'es_content': es['post_content']})
+    db.close()
+    print(f"Fetched {len(posts)} posts to retranslate\n")
+
+    # Group by Spanish original to avoid redundant API calls
+    by_es = {}
+    for p in posts:
+        by_es.setdefault(p['es_id'], []).append(p)
+
+    done = errors = skipped = 0
+    total = len(posts)
+    n = 0
+
+    for es_id, translations in sorted(by_es.items()):
+        es_title = translations[0]['es_title']
+        es_content = translations[0]['es_content'] or ''
+        content_len = len(strip_html(es_content))
+
+        if content_len < 50:
+            print(f"  ES:{es_id} — SKIPPING (content too short: {content_len} chars)")
+            skipped += len(translations)
+            n += len(translations)
+            continue
+
+        print(f"\nES:{es_id} — {(es_title or '')[:50]} ({content_len} chars)")
+
+        for p in translations:
+            post_id = p['ID']
+            lang = p['lang']
+            lang_name = LANG_NAMES.get(lang, lang)
+            n += 1
+
+            try:
+                t0 = time.time()
+                t_title, t_content = translate_content(es_title or '', es_content, lang, lang_name)
+                elapsed = time.time() - t0
+
+                # Validate: content should now be in target language
+                content_lang = detect_lang(t_content, min_len=80)
+                ok = (content_lang == lang) or content_lang is None
+
+                # If still wrong language, retry with simpler prompt
+                if not ok and content_lang:
+                    print(f"  [{lang}] ⚠ Content still {content_lang}, retrying...")
+                    retry_response = call_jan([
+                        {"role": "system", "content": f"You are a professional translator. Translate the following Spanish text to {lang_name}. Preserve all HTML tags. Return ONLY the translated text, no preamble, no explanation."},
+                        {"role": "user", "content": es_content}
+                    ])
+                    t_content = retry_response
+                    content_lang2 = detect_lang(t_content, min_len=80)
+                    if content_lang2 == lang or content_lang2 is None:
+                        print(f"  [{lang}] ✓ Retry succeeded ({content_lang2})")
+                        ok = True
+                    else:
+                        print(f"  [{lang}] ✗ Retry still {content_lang2}, saving anyway")
+
+                # Add AI footer if not present
+                if AI_FOOTER.strip() not in t_content:
+                    t_content = t_content + AI_FOOTER
+
+                # Update DB
+                db2 = pymysql.connect(**DB)
+                c2 = db2.cursor()
+                c2.execute("UPDATE wp_posts SET post_title=%s, post_content=%s WHERE ID=%s",
+                           (t_title, t_content, post_id))
+                db2.commit()
+                db2.close()
+
+                status = "✓" if ok else "⚠"
+                print(f"  [{lang}] {status} {post_id}: {t_title[:50]} ({elapsed:.0f}s)")
+                done += 1
+
+            except Exception as e:
+                print(f"  [{lang}] ✗ ERROR on {post_id}: {e}")
+                errors += 1
+
+    print(f"\n{'='*50}")
+    print(f"Done: {done} retranslated, {errors} errors, {skipped} skipped")
+    print(f"Total processed: {n}/{total}")
+
+
+# Run the retranslation only when this file is executed as a script, not on import.
+if __name__ == "__main__":
+    main()