Añadir mu-plugins y scripts de feadulta

2026-06-28 15:10:46 -04:00
parent bce7e42f44
commit b6116b066d
106 changed files with 17600 additions and 2 deletions
@@ -0,0 +1,230 @@
+#!/usr/bin/env python3
+"""
+retranslate_en_all.py
+
+Retranslates ALL English posts (ID > 42760) from their Spanish originals.
+Uses chunk-based translation (~800 chars per chunk) to avoid model drift.
+Sequential, single process.
+"""
+
+import pymysql, json, re, html, urllib.request, time, sys
+from langdetect import detect, LangDetectException, DetectorFactory
# Pin langdetect's seed (the library's documented switch for repeatable results).
DetectorFactory.seed = 0

# Chat-completions endpoint and the model name sent with every request.
JAN_URL   = "http://172.19.128.1:1337/v1/chat/completions"
JAN_MODEL = "gemma-3-12b-it-Q4_K_M"

# Keyword arguments passed verbatim to pymysql.connect(); DictCursor makes
# every fetched row a dict keyed by column name.
# NOTE(review): host and credentials are hard-coded — confirm this is a
# development-only database before reusing the script elsewhere.
DB = {
    'host': '172.18.0.2',
    'port': 3306,
    'user': 'wordpress_user',
    'password': 'wordpress_pass',
    'database': 'wordpress_db',
    'charset': 'utf8mb4',
    'cursorclass': pymysql.cursors.DictCursor,
}

# Soft upper bound, in characters, for one translation chunk.
CHUNK_SIZE  = 800
# Not referenced by any function in this file.
MAX_RETRIES = 2
# Marker appended to translated content when it is not already present.
AI_FOOTER   = "\n<p><em>Traducido con IA</em></p>"
+
+
def strip_html(text):
    """Return *text* reduced to plain text.

    Tags are replaced by a space, HTML entities are decoded, and every run
    of whitespace is collapsed to a single space with the ends trimmed.
    Falsy input (``None`` or an empty string) yields ``''``.
    """
    if not text:
        return ''
    without_tags = re.sub(r'<[^>]+>', ' ', text)
    decoded = html.unescape(without_tags)
    collapsed = re.sub(r'\s+', ' ', decoded)
    return collapsed.strip()
+
+
def detect_lang(text, min_len=40):
    """Best-effort language detection on the visible text of *text*.

    Only the first 400 characters of the tag-stripped text are examined.

    Args:
        text: HTML or plain text to classify.
        min_len: Minimum number of plain-text characters required before
            detection is attempted.

    Returns:
        The language code reported by langdetect (e.g. ``'en'``), or
        ``None`` when the sample is shorter than *min_len* or detection
        fails.
    """
    sample = strip_html(text)[:400].strip()
    if len(sample) < min_len:
        return None
    try:
        # Uses the module-level `detect` import; no need to re-import here.
        return detect(sample)
    except Exception:
        # Detection stays best-effort, but unlike a bare `except:` this
        # lets KeyboardInterrupt / SystemExit stop a long-running batch.
        return None
+
+
def call_jan(messages, max_tokens=1200, temperature=0.2, timeout=120):
    """POST a chat-completion request to JAN_URL and return the reply text.

    Args:
        messages: List of ``{"role": ..., "content": ...}`` dicts.
        max_tokens: Value sent as ``max_tokens`` in the request body.
        temperature: Value sent as ``temperature`` in the request body.
        timeout: Socket timeout in seconds for the HTTP call.

    Returns:
        The stripped ``content`` of the first choice's message.

    Raises:
        Whatever ``urllib.request.urlopen`` or JSON decoding raise, plus
        ``KeyError`` / ``IndexError`` if the response lacks the expected
        ``choices[0].message.content`` path.
    """
    body = {
        "model": JAN_MODEL,
        "messages": messages,
        "temperature": temperature,
        "max_tokens": max_tokens,
    }
    request = urllib.request.Request(
        JAN_URL,
        data=json.dumps(body).encode("utf-8"),
        headers={"Content-Type": "application/json", "Authorization": "Bearer dummy"},
        method="POST",
    )
    with urllib.request.urlopen(request, timeout=timeout) as response:
        reply = json.loads(response.read())
    return reply["choices"][0]["message"]["content"].strip()
+
+
def translate_chunk(chunk, attempt=0):
    """Translate one Spanish HTML chunk to English through the model.

    Args:
        chunk: The Spanish source fragment (may contain HTML tags).
        attempt: Selects the system prompt; values past the last prompt
            reuse the last (most insistent) one.

    Returns:
        The model's reply. On a first attempt (``attempt == 0``), if the
        chunk's plain text is shorter than 40 characters and the reply's
        plain text equals it case-insensitively (i.e. nothing was
        translated), the chunk is sent once more with the second prompt
        and that reply is returned instead.
    """
    system_prompts = [
        "You are a professional translator. Translate the following Spanish text to English. Preserve all HTML tags exactly. Return ONLY the translated text, no preamble, no explanation.",
        "Translate from Spanish to English. Your entire response must be in English. Preserve HTML tags. Return ONLY the translation, nothing else.",
    ]

    def ask(prompt_index):
        # One round trip to the model with the selected system prompt.
        return call_jan([
            {"role": "system", "content": system_prompts[prompt_index]},
            {"role": "user",   "content": chunk}
        ])

    reply = ask(min(attempt, len(system_prompts)-1))

    source_plain = strip_html(chunk).strip().lower()
    reply_plain = strip_html(reply).strip().lower()
    untranslated_short = len(source_plain) < 40 and source_plain == reply_plain
    if not (untranslated_short and attempt == 0):
        return reply
    # Short chunk came back unchanged: one retry with the stricter prompt.
    return ask(1)
+
+
def translate_title(es_title):
    """Translate a Spanish post title to English (requested in ALL CAPS).

    The call is best-effort: the original title is returned unchanged
    whenever a usable translation cannot be obtained.

    Args:
        es_title: The Spanish title.

    Returns:
        The translated title with surrounding quotes removed, or
        *es_title* itself when it is empty/blank, when the model call
        fails, when the model returns nothing, or when the reply equals
        the original ignoring case.
    """
    # Nothing to translate; asking the model with an empty prompt would
    # only invite invented text.
    if not es_title or not es_title.strip():
        return es_title

    try:
        reply = call_jan([
            {"role": "system", "content": "You are a translator. Respond ONLY with the translated text, nothing else."},
            {"role": "user",   "content": f"Translate from Spanish to English, ALL CAPS:\n\n{es_title}"}
        ], max_tokens=150, temperature=0.1, timeout=30)
    except Exception:
        # Best-effort fallback; KeyboardInterrupt / SystemExit still propagate.
        return es_title

    candidate = reply.strip().strip('"').strip("'")
    # Reject an empty reply (would blank the stored title) and a reply
    # identical to the original (model did not translate).
    if not candidate or candidate.upper() == es_title.upper():
        return es_title
    return candidate
+
+
def split_chunks(content, chunk_size=None):
    """Split HTML *content* into translation-sized chunks.

    The content is cut after each closing ``</p>``, ``</li>``,
    ``</h1>``-``</h6>`` or ``</blockquote>`` tag and consecutive segments
    are packed together while they fit in *chunk_size* characters. A
    single segment longer than *chunk_size* is split again at sentence
    ends (whitespace following ``.``, ``!`` or ``?``); in that path the
    whitespace between sentences is replaced by one space. A single
    sentence longer than *chunk_size* is not split further and is emitted
    as one oversized chunk.

    Args:
        content: HTML source text. ``None`` or ``''`` yields ``[]``.
        chunk_size: Soft maximum chunk length in characters. Defaults to
            the module-level ``CHUNK_SIZE``.

    Returns:
        List of chunk strings in document order; chunks with no visible
        text after tag stripping are dropped.
    """
    if not content:
        return []
    if chunk_size is None:
        chunk_size = CHUNK_SIZE

    # With one capture group re.split alternates text, closing tag, text, ...
    # and always ends on a text part, so pad the tag list by one.
    parts = re.split(r'(</p>|</li>|</h[1-6]>|</blockquote>)', content)
    bodies = parts[::2]
    closers = parts[1::2] + [""]

    chunks, current = [], ""
    for body, closer in zip(bodies, closers):
        segment = body + closer
        if len(current) + len(segment) <= chunk_size:
            current += segment
            continue

        # Segment does not fit: flush what has been accumulated so far.
        if current:
            chunks.append(current)

        if len(segment) <= chunk_size:
            current = segment
            continue

        # Oversized segment: fall back to sentence boundaries.
        current = ""
        for sentence in re.split(r'(?<=[.!?])\s+', segment):
            if len(current) + len(sentence) <= chunk_size:
                current += sentence + " "
            else:
                if current:
                    chunks.append(current.strip())
                current = sentence + " "

    if current:
        chunks.append(current)
    return [c for c in chunks if strip_html(c).strip()]
+
+
def _fetch_en_posts(cursor):
    """Return published EN posts with ID > 42760 and their translation-group description."""
    cursor.execute("""
        SELECT DISTINCT p.ID, p.post_title,
               ttg.description as group_desc
        FROM wp_posts p
        JOIN wp_term_relationships trl ON p.ID=trl.object_id
        JOIN wp_term_taxonomy ttl ON trl.term_taxonomy_id=ttl.term_taxonomy_id AND ttl.taxonomy='language'
        JOIN wp_terms t_lang ON ttl.term_id=t_lang.term_id AND t_lang.slug='en'
        JOIN wp_term_relationships trg ON p.ID=trg.object_id
        JOIN wp_term_taxonomy ttg ON trg.term_taxonomy_id=ttg.term_taxonomy_id AND ttg.taxonomy='post_translations'
        WHERE p.ID > 42760 AND p.post_type='post' AND p.post_status='publish'
        ORDER BY p.ID
    """)
    return cursor.fetchall()


def _translate_chunks(chunks):
    """Translate every chunk, retrying once when the output is not detected as English.

    Returns ``(translated, chunk_ok, chunk_bad)``. A chunk whose
    translation raises is kept in its original (untranslated) form and
    counted as bad.
    """
    translated = []
    chunk_ok = chunk_bad = 0
    for i, chunk in enumerate(chunks):
        try:
            result = translate_chunk(chunk, attempt=0)
            lang = detect_lang(result, min_len=40)

            if lang and lang != 'en' and len(strip_html(result)) >= 40:
                # First output is confidently not English: retry with the
                # stricter prompt and keep the retry only if it looks
                # English (or is too short to classify).
                retry = translate_chunk(chunk, attempt=1)
                retry_lang = detect_lang(retry, min_len=40)
                if retry_lang == 'en' or retry_lang is None:
                    result = retry
                    chunk_ok += 1
                else:
                    chunk_bad += 1
            else:
                chunk_ok += 1
            translated.append(result)
        except Exception as e:
            print(f"  chunk {i+1} ERROR: {e}", flush=True)
            translated.append(chunk)
            chunk_bad += 1
    return translated, chunk_ok, chunk_bad


def _save_translation(post_id, title, content):
    """Write the translated title and content to ``wp_posts`` on a fresh connection.

    The connection is closed even when the UPDATE or the commit raises.
    """
    # NOTE(review): a separate connection per save mirrors the original
    # code — presumably to avoid relying on the long-lived main connection
    # after a slow translation; confirm before consolidating.
    conn = pymysql.connect(**DB)
    try:
        with conn.cursor() as cur:
            cur.execute("UPDATE wp_posts SET post_title=%s, post_content=%s WHERE ID=%s",
                        (title, content, post_id))
        conn.commit()
    finally:
        conn.close()


def main():
    """Retranslate every published EN post (ID > 42760) from its Spanish original.

    For each post the Spanish source is located through the
    ``post_translations`` group description, split into chunks,
    translated chunk by chunk, and written back to the EN post. Progress
    and a final summary are printed to stdout. A post for which no chunk
    could be translated is left untouched and counted as an error.
    """
    db = pymysql.connect(**DB)
    try:
        c = db.cursor()

        posts = _fetch_en_posts(c)
        print(f"Found {len(posts)} EN posts to retranslate\n", flush=True)

        done = errors = skipped = 0
        total = len(posts)

        for n, p in enumerate(posts, 1):
            post_id = p['ID']
            desc = p['group_desc'] or ''
            # The group description is a PHP-serialized map; pull the ES post ID.
            m = re.search(r's:2:"es";i:(\d+);', desc)
            if not m:
                print(f"[{n}/{total}] {post_id} — SKIP (no ES original in group)", flush=True)
                skipped += 1
                continue

            es_id = int(m.group(1))
            c.execute("SELECT post_title, post_content FROM wp_posts WHERE ID=%s", (es_id,))
            es = c.fetchone()
            if not es or not es['post_content']:
                print(f"[{n}/{total}] {post_id} — SKIP (ES:{es_id} empty)", flush=True)
                skipped += 1
                continue

            es_title = es['post_title'] or ''
            es_content = es['post_content']
            plain_len = len(strip_html(es_content))
            chunks = split_chunks(es_content)

            print(f"\n[{n}/{total}] WP:{post_id} ← ES:{es_id} — {es_title[:50]}", flush=True)
            print(f"  {plain_len} chars, {len(chunks)} chunks", flush=True)

            if plain_len < 50:
                print(f"  SKIP (too short)", flush=True)
                skipped += 1
                continue

            try:
                t0 = time.time()

                t_title = translate_title(es_title)
                translated, chunk_ok, chunk_bad = _translate_chunks(chunks)

                if chunk_ok == 0:
                    # Every chunk failed or stayed non-English (e.g. the
                    # model endpoint is down). Saving now would overwrite
                    # the EN post with untranslated text.
                    raise RuntimeError("no chunk could be translated; post left unchanged")

                t_content = "\n".join(translated)
                if AI_FOOTER.strip() not in t_content:
                    t_content += AI_FOOTER

                # Whole-document check only affects the status marker below.
                content_lang = detect_lang(t_content, min_len=80)
                lang_ok = content_lang in ('en', None)
                elapsed = time.time() - t0

                _save_translation(post_id, t_title, t_content)

                status = "✓" if lang_ok else "⚠"
                bad_note = f" ({chunk_bad} chunks bad)" if chunk_bad else ""
                print(f"  {status} {t_title[:60]} ({elapsed:.0f}s){bad_note}", flush=True)
                done += 1

            except Exception as e:
                print(f"  ✗ ERROR: {e}", flush=True)
                errors += 1
    finally:
        # Release the main connection even if the loop aborts.
        db.close()

    print(f"\n{'='*50}")
    print(f"Done: {done} ✓  errors: {errors} ✗  skipped: {skipped}")
    print(f"Total: {total}")


if __name__ == "__main__":
    main()