Añadir mu-plugins y scripts de feadulta

This commit is contained in:
2026-06-28 15:10:46 -04:00
parent bce7e42f44
commit b6116b066d
106 changed files with 17600 additions and 2 deletions
+230
View File
@@ -0,0 +1,230 @@
#!/usr/bin/env python3
"""
retranslate_en_all.py
Retranslates ALL English posts (ID > 42760) from their Spanish originals.
Uses chunk-based translation (~800 chars per chunk) to avoid model drift.
Sequential, single process.
"""
import pymysql, json, re, html, urllib.request, time, sys
from langdetect import detect, LangDetectException, DetectorFactory
DetectorFactory.seed = 0
# Chat-completions endpoint and model name sent by call_jan().
JAN_URL = "http://172.19.128.1:1337/v1/chat/completions"
JAN_MODEL = "gemma-3-12b-it-Q4_K_M"

# Keyword arguments passed to pymysql.connect(); DictCursor makes rows dicts.
# NOTE(review): host and credentials are hard-coded — consider reading them
# from the environment before this script is shared further.
DB = {
    "host": "172.18.0.2",
    "port": 3306,
    "user": "wordpress_user",
    "password": "wordpress_pass",
    "database": "wordpress_db",
    "charset": "utf8mb4",
    "cursorclass": pymysql.cursors.DictCursor,
}

# Upper bound, in characters, used by split_chunks() when grouping content.
CHUNK_SIZE = 800
# NOTE(review): not referenced anywhere in the visible source.
MAX_RETRIES = 2
# Marker paragraph that main() appends when it is not already in the content.
AI_FOOTER = "\n<p><em>Traducido con IA</em></p>"
def strip_html(text):
    """Return *text* as plain text.

    Tags are replaced by spaces, HTML entities are decoded, and every run of
    whitespace is collapsed to a single space. Falsy input (``None`` or an
    empty string) yields ``''``.
    """
    if not text:
        return ''
    # A tag becomes a space (not nothing) so adjacent words do not fuse.
    without_tags = re.sub(r'<[^>]+>', ' ', text)
    decoded = html.unescape(without_tags)
    collapsed = re.sub(r'\s+', ' ', decoded)
    return collapsed.strip()
def detect_lang(text, min_len=40):
    """Guess the language of *text* from a short plain-text sample.

    The sample is the first 400 characters of ``strip_html(text)``.

    Args:
        text: HTML or plain text to inspect.
        min_len: Minimum sample length; shorter samples are not analysed.

    Returns:
        The code returned by ``langdetect.detect`` for the sample, or ``None``
        when the sample is shorter than *min_len* or detection fails.
    """
    sample = strip_html(text)[:400].strip()
    if len(sample) < min_len:
        return None
    try:
        # `detect` is the module-level import; no local re-import is needed.
        return detect(sample)
    except LangDetectException:
        # Only detection failures mean "unknown". A bare `except:` would also
        # swallow KeyboardInterrupt/SystemExit and make the run hard to stop.
        return None
def call_jan(messages, max_tokens=1200, temperature=0.2, timeout=120):
    """POST a chat-completion request to JAN_URL and return the reply text.

    Args:
        messages: List of ``{"role": ..., "content": ...}`` dicts sent as-is.
        max_tokens: Value for the request's ``max_tokens`` field.
        temperature: Value for the request's ``temperature`` field.
        timeout: Timeout in seconds passed to ``urlopen``.

    Returns:
        The stripped ``content`` of the first choice's message.

    Errors from ``urlopen``, JSON decoding, or a response without the
    expected keys propagate to the caller.
    """
    body = {
        "model": JAN_MODEL,
        "messages": messages,
        "temperature": temperature,
        "max_tokens": max_tokens,
    }
    headers = {"Content-Type": "application/json", "Authorization": "Bearer dummy"}
    request = urllib.request.Request(
        JAN_URL,
        data=json.dumps(body).encode("utf-8"),
        headers=headers,
        method="POST",
    )
    with urllib.request.urlopen(request, timeout=timeout) as response:
        reply = json.loads(response.read())
    return reply["choices"][0]["message"]["content"].strip()
def translate_chunk(chunk, attempt=0):
    """Translate one Spanish HTML chunk to English via call_jan.

    *attempt* selects the system prompt (0 = default wording, 1 or higher =
    the stricter fallback wording). On attempt 0 only, a chunk whose plain
    text is under 40 characters and comes back unchanged is sent once more
    with the fallback prompt, and that second answer is returned.
    """
    system_prompts = (
        "You are a professional translator. Translate the following Spanish text to English. Preserve all HTML tags exactly. Return ONLY the translated text, no preamble, no explanation.",
        "Translate from Spanish to English. Your entire response must be in English. Preserve HTML tags. Return ONLY the translation, nothing else.",
    )

    def _ask(prompt_index):
        # Clamp so any attempt past the last prompt reuses the last one.
        prompt = system_prompts[min(prompt_index, len(system_prompts) - 1)]
        return call_jan([
            {"role": "system", "content": prompt},
            {"role": "user", "content": chunk},
        ])

    result = _ask(attempt)
    if attempt != 0:
        return result

    # Short chunks: an output identical to the input means the model did not
    # translate it, so give the stricter prompt one chance.
    source_plain = strip_html(chunk).strip().lower()
    if len(source_plain) >= 40:
        return result
    if source_plain != strip_html(result).strip().lower():
        return result
    return _ask(1)
def translate_title(es_title):
    """Translate a Spanish post title to English (the prompt asks for ALL CAPS).

    Args:
        es_title: The Spanish title.

    Returns:
        The model's answer with surrounding whitespace and quotes removed.
        *es_title* is returned unchanged when it is empty or blank, when the
        request fails, when the model returns nothing, or when the answer
        equals the original ignoring case.
    """
    # Nothing to translate: do not spend a model call on an empty title.
    if not es_title or not es_title.strip():
        return es_title
    try:
        result = call_jan([
            {"role": "system", "content": "You are a translator. Respond ONLY with the translated text, nothing else."},
            {"role": "user", "content": f"Translate from Spanish to English, ALL CAPS:\n\n{es_title}"}
        ], max_tokens=150, temperature=0.1, timeout=30)
    except Exception:
        # Best effort: a failed title request must not abort the post. Unlike a
        # bare `except:`, this still lets Ctrl-C / SystemExit stop the run.
        return es_title
    result = result.strip().strip('"').strip("'")
    # Reject an empty answer or one identical to the original.
    if not result or result.upper() == es_title.upper():
        return es_title
    return result
def split_chunks(content):
    """Split HTML *content* into chunks of roughly CHUNK_SIZE characters.

    Content is cut after closing ``</p>``, ``</li>``, ``</h1>``-``</h6>`` and
    ``</blockquote>`` tags and the pieces are packed greedily. A single piece
    longer than CHUNK_SIZE is further split at sentence-ending punctuation;
    a single sentence longer than CHUNK_SIZE is kept whole. Chunks with no
    visible text after strip_html() are dropped.
    """
    pieces = re.split(r'(</p>|</li>|</h[1-6]>|</blockquote>)', content)
    # With one capture group the result alternates text / closing tag and
    # ends on text, so pad the tag list to pair every text with a closer.
    texts = pieces[0::2]
    closers = pieces[1::2] + [""]

    out = []
    buffer = ""
    for text, closer in zip(texts, closers):
        segment = text + closer
        if len(buffer) + len(segment) <= CHUNK_SIZE:
            buffer += segment
            continue

        # The segment does not fit: flush what has been accumulated so far.
        if buffer:
            out.append(buffer)
        if len(segment) <= CHUNK_SIZE:
            buffer = segment
            continue

        # Oversized segment: repack it sentence by sentence.
        buffer = ""
        for sentence in re.split(r'(?<=[.!?])\s+', segment):
            if len(buffer) + len(sentence) > CHUNK_SIZE:
                if buffer:
                    out.append(buffer.strip())
                buffer = sentence + " "
            else:
                buffer += sentence + " "

    if buffer:
        out.append(buffer)
    return [piece for piece in out if strip_html(piece).strip()]
def _translate_chunks(chunks):
    """Translate each chunk, retrying once when the result does not look English.

    Returns ``(translated, bad_count)``: the per-chunk results in input order
    and the number of chunks that failed or stayed non-English.
    """
    translated = []
    bad = 0
    for idx, chunk in enumerate(chunks, 1):
        try:
            result = translate_chunk(chunk, attempt=0)
            lang = detect_lang(result, min_len=40)
            if lang and lang != 'en' and len(strip_html(result)) >= 40:
                # Detected as non-English: one retry with the stricter prompt.
                retry = translate_chunk(chunk, attempt=1)
                if detect_lang(retry, min_len=40) in ('en', None):
                    result = retry
                else:
                    # Keep the first attempt but count the chunk as bad.
                    bad += 1
            translated.append(result)
        except Exception as e:
            print(f"  chunk {idx} ERROR: {e}", flush=True)
            # Fall back to the untranslated source so no content is lost.
            translated.append(chunk)
            bad += 1
    return translated, bad


def _save_translation(post_id, title, content):
    """Write the translated title and content to wp_posts on a fresh connection."""
    conn = pymysql.connect(**DB)
    try:
        with conn.cursor() as cur:
            cur.execute("UPDATE wp_posts SET post_title=%s, post_content=%s WHERE ID=%s",
                        (title, content, post_id))
        conn.commit()
    finally:
        # Close even when the UPDATE or commit raises, so failures do not leak
        # one connection per post.
        conn.close()


def main():
    """Retranslate every published EN post with ID > 42760 from its ES original.

    For each post the Spanish original's ID is read from the translation
    group's description. Its title and content are translated (content chunk
    by chunk), the AI footer is appended if missing, and the EN post row is
    overwritten. Progress is printed per post, followed by a final summary.
    """
    done = errors = skipped = 0
    db = pymysql.connect(**DB)
    try:
        c = db.cursor()
        # Fetch all EN posts together with their translation-group description.
        c.execute("""
            SELECT DISTINCT p.ID, p.post_title,
            ttg.description as group_desc
            FROM wp_posts p
            JOIN wp_term_relationships trl ON p.ID=trl.object_id
            JOIN wp_term_taxonomy ttl ON trl.term_taxonomy_id=ttl.term_taxonomy_id AND ttl.taxonomy='language'
            JOIN wp_terms t_lang ON ttl.term_id=t_lang.term_id AND t_lang.slug='en'
            JOIN wp_term_relationships trg ON p.ID=trg.object_id
            JOIN wp_term_taxonomy ttg ON trg.term_taxonomy_id=ttg.term_taxonomy_id AND ttg.taxonomy='post_translations'
            WHERE p.ID > 42760 AND p.post_type='post' AND p.post_status='publish'
            ORDER BY p.ID
        """)
        posts = c.fetchall()
        total = len(posts)
        print(f"Found {total} EN posts to retranslate\n", flush=True)

        for n, p in enumerate(posts, 1):
            post_id = p['ID']
            desc = p['group_desc'] or ''
            # The description holds a PHP-serialized map; pick the integer
            # stored under the 2-character key "es".
            # NOTE(review): presumably Polylang's translation map — confirm.
            m = re.search(r's:2:"es";i:(\d+);', desc)
            if not m:
                print(f"[{n}/{total}] {post_id} — SKIP (no ES original in group)", flush=True)
                skipped += 1
                continue
            es_id = int(m.group(1))

            c.execute("SELECT post_title, post_content FROM wp_posts WHERE ID=%s", (es_id,))
            es = c.fetchone()
            if not es or not es['post_content']:
                print(f"[{n}/{total}] {post_id} — SKIP (ES:{es_id} empty)", flush=True)
                skipped += 1
                continue

            es_title = es['post_title'] or ''
            es_content = es['post_content']
            plain_len = len(strip_html(es_content))
            chunks = split_chunks(es_content)
            # A separator keeps the numeric ID from running into the title.
            print(f"\n[{n}/{total}] WP:{post_id} ← ES:{es_id} — {es_title[:50]}", flush=True)
            print(f"  {plain_len} chars, {len(chunks)} chunks", flush=True)
            if plain_len < 50:
                print(f"  SKIP (too short)", flush=True)
                skipped += 1
                continue

            try:
                t0 = time.time()
                t_title = translate_title(es_title)
                translated, chunk_bad = _translate_chunks(chunks)
                t_content = "\n".join(translated)
                if AI_FOOTER.strip() not in t_content:
                    t_content += AI_FOOTER

                # Validate the assembled content as a whole. The result is
                # reported only; the post is saved either way.
                content_lang = detect_lang(t_content, min_len=80)
                lang_ok = content_lang in ('en', None)
                elapsed = time.time() - t0

                _save_translation(post_id, t_title, t_content)

                # Both branches used to be empty strings, which hid the
                # language check from the log entirely.
                status = "✓" if lang_ok else f"⚠ (detected {content_lang})"
                bad_note = f" ({chunk_bad} chunks bad)" if chunk_bad else ""
                print(f"  {status} {t_title[:60]} ({elapsed:.0f}s){bad_note}", flush=True)
                done += 1
            except Exception as e:
                print(f"  ✗ ERROR: {e}", flush=True)
                errors += 1
    finally:
        # Release the read connection even if a query outside the per-post
        # try block raises.
        db.close()

    print(f"\n{'='*50}")
    print(f"Done: {done} ✓ errors: {errors} ✗ skipped: {skipped}")
    print(f"Total: {total}")


if __name__ == "__main__":
    main()