Añadir mu-plugins y scripts de feadulta

2026-06-28 15:10:46 -04:00
parent bce7e42f44
commit b6116b066d
106 changed files with 17600 additions and 2 deletions
@@ -0,0 +1,310 @@
+#!/usr/bin/env python3
+"""
+retranslate_chunks.py
+
+Re-translates posts where content is in the wrong language.
+Splits post_content into chunks of ~800 chars (at </p> boundaries)
+and translates each chunk independently to avoid model drift.
+"""
+
+import pymysql
+import json
+import re
+import html
+import urllib.request
+import time
+import sys
+import csv
+from langdetect import detect, LangDetectException, DetectorFactory
+# Pin langdetect's seed so the same text is classified the same way on every run
+# (per the langdetect documentation, detection is otherwise non-deterministic).
+DetectorFactory.seed = 0
+
+# Chat-completions endpoint and model name that call_jan() sends requests to.
+JAN_URL   = "http://172.19.128.1:1337/v1/chat/completions"
+JAN_MODEL = "gemma-3-12b-it-Q4_K_M"
+
+# Keyword arguments passed verbatim to pymysql.connect(); rows are read by column name.
+# NOTE(review): host and credentials are hard-coded — consider reading them from the environment.
+DB = dict(host='172.18.0.2', port=3306, user='wordpress_user',
+          password='wordpress_pass', database='wordpress_db', charset='utf8mb4',
+          cursorclass=pymysql.cursors.DictCursor)
+
+# Language slug -> English language name that is inserted into the translation prompts.
+LANG_NAMES = {"en": "English", "fr": "French", "it": "Italian", "pt": "Portuguese"}
+# Detected code -> code used for comparisons; 'ca' and 'gl' are folded into 'es'.
+LANG_NORM  = {'es':'es','pt':'pt','fr':'fr','en':'en','it':'it','ca':'es','gl':'es'}
+
+# Footer that main() appends to retranslated post_content when not already present.
+AI_FOOTER  = "\n<p><em>Traducido con IA</em></p>"
+CHUNK_SIZE = 800   # max chars per translation chunk
+MAX_RETRIES = 2  # number of attempts translate_chunk() makes per chunk
+
+
+def strip_html(text):
+    if not text: return ''
+    text = re.sub(r'<[^>]+>', ' ', text)
+    text = html.unescape(text)
+    return re.sub(r'\s+', ' ', text).strip()
+
+
+def detect_lang(text, min_len=60):
+    t = strip_html(text)[:600].strip()
+    if len(t) < min_len: return None
+    try: return LANG_NORM.get(detect(t), detect(t))
+    except: return None
+
+
+def call_jan(messages, max_tokens=1200, temperature=0.2, timeout=120):
+    payload = json.dumps({
+        "model": JAN_MODEL,
+        "messages": messages,
+        "temperature": temperature,
+        "max_tokens": max_tokens,
+    }).encode("utf-8")
+    req = urllib.request.Request(
+        JAN_URL, data=payload,
+        headers={"Content-Type": "application/json", "Authorization": "Bearer dummy"},
+        method="POST"
+    )
+    with urllib.request.urlopen(req, timeout=timeout) as r:
+        result = json.loads(r.read())
+        return result["choices"][0]["message"]["content"].strip()
+
+
+def translate_chunk(chunk, lang_name):
+    """Translate a single HTML chunk. Returns translated text or None on failure."""
+    system = (
+        f"You are a professional translator. Translate the following Spanish text to {lang_name}. "
+        f"Preserve all HTML tags exactly as they are. "
+        f"Return ONLY the translated text, nothing else. No preamble, no explanation."
+    )
+    plain_len = len(strip_html(chunk).strip())
+    for attempt in range(MAX_RETRIES):
+        try:
+            result = call_jan([
+                {"role": "system", "content": system},
+                {"role": "user",   "content": chunk}
+            ])
+            # For short chunks (headings, short phrases) langdetect is unreliable —
+            # accept the result as long as it changed from the original Spanish
+            if plain_len < 40:
+                changed = strip_html(result).strip().lower() != strip_html(chunk).strip().lower()
+                if changed or attempt > 0:
+                    return result
+            else:
+                lang = detect_lang(result, min_len=40)
+                if lang is None or lang == lang_name[:2].lower():
+                    return result
+            # Wrong language — retry with more explicit prompt
+            system = (
+                f"Translate from Spanish to {lang_name}. "
+                f"Your response must be entirely in {lang_name}. "
+                f"Preserve HTML tags. Return ONLY the translation."
+            )
+        except Exception as e:
+            if attempt == MAX_RETRIES - 1:
+                return None
+            time.sleep(2)
+    return None  # all retries failed
+
+
+def translate_title(title, lang_name):
+    try:
+        result = call_jan([
+            {"role": "system", "content": "You are a translator. Respond ONLY with the translated text, nothing else."},
+            {"role": "user",   "content": f"Translate from Spanish to {lang_name}, ALL CAPS:\n\n{title}"}
+        ], max_tokens=120, temperature=0.1, timeout=30)
+        return result.strip().strip('"').strip("'")
+    except:
+        return None
+
+
+def split_into_chunks(content, max_size=CHUNK_SIZE):
+    """Split HTML content at </p> boundaries into chunks <= max_size chars."""
+    # Split at closing block tags
+    parts = re.split(r'(</p>|</li>|</h[1-6]>|</blockquote>)', content)
+
+    chunks = []
+    current = ""
+    for i in range(0, len(parts), 2):
+        piece = parts[i]
+        closer = parts[i+1] if i+1 < len(parts) else ""
+        segment = piece + closer
+
+        if len(current) + len(segment) <= max_size:
+            current += segment
+        else:
+            if current:
+                chunks.append(current)
+            # If a single segment exceeds max_size, split it roughly
+            if len(segment) > max_size:
+                # Split at sentence boundaries
+                sentences = re.split(r'(?<=[.!?])\s+', segment)
+                current = ""
+                for s in sentences:
+                    if len(current) + len(s) <= max_size:
+                        current += s + " "
+                    else:
+                        if current:
+                            chunks.append(current.strip())
+                        current = s + " "
+            else:
+                current = segment
+
+    if current:
+        chunks.append(current)
+
+    return [c for c in chunks if c.strip()]
+
+
+def translate_content_chunked(content, lang_name):
+    """
+    Translate full post_content by splitting into chunks.
+    Returns (translated_content, success_ratio).
+    """
+    if not content or not content.strip():
+        return content, 1.0
+
+    chunks = split_into_chunks(content)
+    translated_chunks = []
+    failed = 0
+
+    for chunk in chunks:
+        # Skip chunks that are only HTML tags / whitespace
+        if not strip_html(chunk).strip():
+            translated_chunks.append(chunk)
+            continue
+
+        result = translate_chunk(chunk, lang_name)
+        if result is None:
+            # Keep original chunk rather than losing it
+            translated_chunks.append(chunk)
+            failed += 1
+        else:
+            translated_chunks.append(result)
+
+    success_ratio = 1.0 - (failed / len(chunks)) if chunks else 1.0
+    return "\n".join(translated_chunks), success_ratio
+
+
+def main():
+    audit_path = '/tmp/audit_clean.csv'
+    failed_ids = set()
+    try:
+        with open(audit_path) as f:
+            reader = csv.DictReader(f)
+            for row in reader:
+                failed_ids.add(int(row['id']))
+        print(f"Loaded {len(failed_ids)} post IDs with issues from audit")
+    except FileNotFoundError:
+        print(f"ERROR: {audit_path} not found. Run audit_translations.py first.")
+        sys.exit(1)
+
+    db = pymysql.connect(**DB)
+    c = db.cursor()
+
+    id_list = ','.join(str(i) for i in sorted(failed_ids))
+    c.execute(f"""
+        SELECT DISTINCT p.ID, p.post_title, p.post_content,
+               t_lang.slug as lang,
+               ttg.description as group_desc
+        FROM wp_posts p
+        JOIN wp_term_relationships trl ON p.ID=trl.object_id
+        JOIN wp_term_taxonomy ttl ON trl.term_taxonomy_id=ttl.term_taxonomy_id AND ttl.taxonomy='language'
+        JOIN wp_terms t_lang ON ttl.term_id=t_lang.term_id
+        JOIN wp_term_relationships trg ON p.ID=trg.object_id
+        JOIN wp_term_taxonomy ttg ON trg.term_taxonomy_id=ttg.term_taxonomy_id AND ttg.taxonomy='post_translations'
+        WHERE p.ID IN ({id_list}) AND p.post_type='post' AND p.post_status='publish'
+    """)
+    raw_posts = c.fetchall()
+
+    # Fetch Spanish originals
+    posts = []
+    es_cache = {}
+    for p in raw_posts:
+        desc = p['group_desc'] or ''
+        m = re.search(r's:2:"es";i:(\d+);', desc)
+        if not m:
+            continue
+        es_id = int(m.group(1))
+        if es_id not in es_cache:
+            c.execute("SELECT ID, post_title, post_content FROM wp_posts WHERE ID=%s", (es_id,))
+            row = c.fetchone()
+            es_cache[es_id] = row
+        es = es_cache[es_id]
+        if es:
+            posts.append({**p, 'es_id': es_id, 'es_title': es['post_title'], 'es_content': es['post_content']})
+    db.close()
+    print(f"Fetched {len(posts)} posts to retranslate\n")
+
+    by_es = {}
+    for p in posts:
+        by_es.setdefault(p['es_id'], []).append(p)
+
+    done = errors = skipped = partial = 0
+    total = len(posts)
+    n = 0
+
+    for es_id, translations in sorted(by_es.items()):
+        es_title   = translations[0]['es_title'] or ''
+        es_content = translations[0]['es_content'] or ''
+        content_len = len(strip_html(es_content))
+
+        if content_len < 50:
+            print(f"  ES:{es_id} — SKIPPING (too short: {content_len} chars)")
+            skipped += len(translations)
+            n += len(translations)
+            continue
+
+        # Show chunk count for visibility
+        chunks = split_into_chunks(es_content)
+        print(f"\nES:{es_id} — {es_title[:50]} ({content_len} chars, {len(chunks)} chunks)")
+
+        for p in translations:
+            post_id   = p['ID']
+            lang      = p['lang']
+            lang_name = LANG_NAMES.get(lang, lang)
+            n += 1
+
+            try:
+                t0 = time.time()
+
+                # Translate title
+                t_title = translate_title(es_title, lang_name) if es_title else ''
+                if not t_title or t_title.upper() == es_title.upper():
+                    t_title = p['post_title']  # keep existing if translation failed
+
+                # Translate content chunk by chunk
+                t_content, ratio = translate_content_chunked(es_content, lang_name)
+                elapsed = time.time() - t0
+
+                # Validate overall content language
+                content_lang = detect_lang(t_content, min_len=80)
+                lang_ok = (content_lang == lang) or content_lang is None
+
+                # Add AI footer
+                if AI_FOOTER.strip() not in t_content:
+                    t_content = t_content + AI_FOOTER
+
+                # Update DB
+                db2 = pymysql.connect(**DB)
+                c2 = db2.cursor()
+                c2.execute("UPDATE wp_posts SET post_title=%s, post_content=%s WHERE ID=%s",
+                           (t_title, t_content, post_id))
+                db2.commit()
+                db2.close()
+
+                status = "✓" if (lang_ok and ratio == 1.0) else ("~" if lang_ok else "⚠")
+                if ratio < 1.0:
+                    partial += 1
+                elif lang_ok:
+                    done += 1
+                else:
+                    errors += 1
+
+                print(f"  [{lang}] {status} {post_id}: {t_title[:50]} ({elapsed:.0f}s, {ratio:.0%} ok)")
+
+            except Exception as e:
+                print(f"  [{lang}] ✗ ERROR on {post_id}: {e}")
+                errors += 1
+
+    print(f"\n{'='*50}")
+    print(f"Done: {done} ✓  partial: {partial} ~  errors/wrong-lang: {errors} ⚠  skipped: {skipped}")
+    print(f"Total: {n}/{total}")
+
+
+# Run the retranslation only when executed as a script, not when imported.
+if __name__ == "__main__":
+    main()