Añadir mu-plugins y scripts de feadulta
This commit is contained in:
@@ -0,0 +1,230 @@
|
||||
#!/usr/bin/env python3
|
||||
"""
|
||||
retranslate_en_all.py
|
||||
|
||||
Retranslates ALL English posts (ID > 42760) from their Spanish originals.
|
||||
Uses chunk-based translation (~800 chars per chunk) to avoid model drift.
|
||||
Sequential, single process.
|
||||
"""
|
||||
|
||||
import pymysql, json, re, html, urllib.request, time, sys
|
||||
from langdetect import detect, LangDetectException, DetectorFactory
|
||||
DetectorFactory.seed = 0
|
||||
|
||||
# Chat-completions HTTP endpoint that call_jan() POSTs to.
JAN_URL = "http://172.19.128.1:1337/v1/chat/completions"
# Model identifier sent in the "model" field of every request.
JAN_MODEL = "gemma-3-12b-it-Q4_K_M"

# Keyword arguments passed to pymysql.connect(); DictCursor makes rows come
# back as dicts (main() reads them by column name).
# NOTE(review): host and credentials are hard-coded here — consider loading
# them from the environment or a config file instead.
DB = dict(host='172.18.0.2', port=3306, user='wordpress_user',
          password='wordpress_pass', database='wordpress_db', charset='utf8mb4',
          cursorclass=pymysql.cursors.DictCursor)

# Character budget per chunk used by split_chunks().
CHUNK_SIZE = 800
# NOTE(review): not referenced anywhere else in this file.
MAX_RETRIES = 2
# HTML footer ("Translated with AI", in Spanish) that main() appends to the
# translated content when it is not already present.
AI_FOOTER = "\n<p><em>Traducido con IA</em></p>"
|
||||
|
||||
|
||||
def strip_html(text):
    """Return *text* reduced to plain text.

    Every ``<...>`` tag is replaced by a space, HTML entities are decoded,
    and each run of whitespace is collapsed to a single space before the
    result is trimmed. Falsy input (``None``, ``''``) yields ``''``.
    """
    if not text:
        return ''
    without_tags = re.sub(r'<[^>]+>', ' ', text)
    decoded = html.unescape(without_tags)
    collapsed = re.sub(r'\s+', ' ', decoded)
    return collapsed.strip()
|
||||
|
||||
|
||||
def detect_lang(text, min_len=40):
    """Best-effort language detection on the plain-text form of *text*.

    Args:
        text: HTML or plain text to classify.
        min_len: Minimum number of plain-text characters required; shorter
            samples are considered too unreliable to classify.

    Returns:
        The language code reported by ``langdetect.detect`` (e.g. ``'en'``),
        or ``None`` when the sample is too short or detection fails.
    """
    # Only the first 400 plain-text characters are sampled.
    sample = strip_html(text)[:400].strip()
    if len(sample) < min_len:
        return None
    try:
        # Uses the module-level ``detect`` import; no need to re-import here.
        return detect(sample)
    except LangDetectException:
        # Detection failure is an expected outcome, not an error. Catching
        # only this type lets KeyboardInterrupt/SystemExit propagate.
        return None
|
||||
|
||||
|
||||
def call_jan(messages, max_tokens=1200, temperature=0.2, timeout=120):
    """POST a chat-completion request to JAN_URL and return the reply text.

    Args:
        messages: List of ``{"role": ..., "content": ...}`` dicts.
        max_tokens: Value sent as ``max_tokens`` in the request body.
        temperature: Value sent as ``temperature`` in the request body.
        timeout: Socket timeout in seconds passed to ``urlopen``.

    Returns:
        The stripped ``content`` of the first choice in the JSON response.

    Raises:
        Whatever ``urlopen`` / ``json.loads`` raise, plus ``KeyError`` /
        ``IndexError`` if the response lacks the expected fields.
    """
    body = {
        "model": JAN_MODEL,
        "messages": messages,
        "temperature": temperature,
        "max_tokens": max_tokens,
    }
    request = urllib.request.Request(
        JAN_URL,
        data=json.dumps(body).encode("utf-8"),
        headers={"Content-Type": "application/json", "Authorization": "Bearer dummy"},
        method="POST",
    )
    with urllib.request.urlopen(request, timeout=timeout) as response:
        reply = json.loads(response.read())
    first_choice = reply["choices"][0]
    return first_choice["message"]["content"].strip()
|
||||
|
||||
|
||||
def translate_chunk(chunk, attempt=0):
    """Translate one Spanish HTML chunk to English via call_jan().

    Args:
        chunk: The Spanish source fragment (may contain HTML).
        attempt: Selects the system prompt; 0 uses the default prompt, any
            higher value uses the stricter "must be in English" prompt.

    Returns:
        The model's reply. On the first attempt only, a short chunk (under
        40 plain-text characters) whose output equals its input, ignoring
        case and markup, is retried once with the stricter prompt.
    """
    system_prompts = (
        "You are a professional translator. Translate the following Spanish text to English. Preserve all HTML tags exactly. Return ONLY the translated text, no preamble, no explanation.",
        "Translate from Spanish to English. Your entire response must be in English. Preserve HTML tags. Return ONLY the translation, nothing else.",
    )
    last_index = len(system_prompts) - 1
    prompt_index = attempt if attempt < last_index else last_index
    conversation = [
        {"role": "system", "content": system_prompts[prompt_index]},
        {"role": "user", "content": chunk},
    ]
    result = call_jan(conversation)

    # Compare the plain-text forms to spot an echoed (untranslated) reply.
    source_plain = strip_html(chunk).strip().lower()
    output_plain = strip_html(result).strip().lower()
    if attempt != 0:
        return result
    if len(source_plain) >= 40:
        return result
    if source_plain != output_plain:
        return result
    return translate_chunk(chunk, attempt=1)
|
||||
|
||||
|
||||
def translate_title(es_title):
    """Translate a Spanish post title to English (upper case requested).

    Args:
        es_title: The Spanish title.

    Returns:
        The translated title with surrounding whitespace and quotes removed.
        The original ``es_title`` is returned unchanged when it is empty or
        blank, when the request fails, or when the model returns nothing or
        echoes the input.
    """
    # Nothing to translate: skip the model call instead of sending an
    # empty prompt and saving whatever it makes up.
    if not es_title or not es_title.strip():
        return es_title

    try:
        result = call_jan([
            {"role": "system", "content": "You are a translator. Respond ONLY with the translated text, nothing else."},
            {"role": "user", "content": f"Translate from Spanish to English, ALL CAPS:\n\n{es_title}"}
        ], max_tokens=150, temperature=0.1, timeout=30)
    except Exception as e:
        # Best-effort: fall back to the Spanish title, but report the failure
        # instead of hiding it. KeyboardInterrupt is deliberately not caught.
        print(f" title ERROR: {e}", flush=True)
        return es_title

    result = result.strip().strip('"').strip("'")
    # Reject an empty reply or one identical to the original.
    if not result or result.upper() == es_title.upper():
        return es_title
    return result
|
||||
|
||||
|
||||
def split_chunks(content):
    """Split HTML *content* into chunks of roughly CHUNK_SIZE characters.

    The content is cut after closing ``</p>``, ``</li>``, ``</h1>``-``</h6>``
    and ``</blockquote>`` tags, and consecutive blocks are packed together
    while they fit in CHUNK_SIZE. A single block larger than CHUNK_SIZE is
    further split on sentence-ending punctuation, with the sentences
    re-joined by single spaces. A sentence that is itself longer than
    CHUNK_SIZE is kept whole. Chunks with no visible text are dropped.
    """
    pieces = re.split(r'(</p>|</li>|</h[1-6]>|</blockquote>)', content)
    # With one capture group re.split yields text, tag, text, tag, ..., text;
    # pair every text with the closing tag that follows it ("" for the last).
    bodies = pieces[0::2]
    closers = pieces[1::2] + [""]

    out = []
    buf = ""
    for body, closer in zip(bodies, closers):
        block = body + closer
        if len(buf) + len(block) <= CHUNK_SIZE:
            buf += block
            continue

        if buf:
            out.append(buf)

        if len(block) <= CHUNK_SIZE:
            buf = block
            continue

        # Oversized block: fall back to sentence boundaries.
        buf = ""
        for sentence in re.split(r'(?<=[.!?])\s+', block):
            if len(buf) + len(sentence) > CHUNK_SIZE:
                if buf:
                    out.append(buf.strip())
                buf = sentence + " "
            else:
                buf += sentence + " "

    if buf:
        out.append(buf)
    return [piece for piece in out if strip_html(piece).strip()]
|
||||
|
||||
|
||||
def _translate_chunks(chunks):
    """Translate each chunk in order; return ``(translated_parts, bad_count)``.

    A chunk whose output is detected as non-English is retried once with the
    stricter prompt. If the retry is still detected as non-English the first
    result is kept and the chunk is counted as bad. A chunk that raises is
    kept untranslated and counted as bad.
    """
    translated = []
    bad = 0
    for i, chunk in enumerate(chunks):
        try:
            result = translate_chunk(chunk, attempt=0)
            # detect_lang() returns None for results under 40 plain-text
            # characters, so short outputs are never retried here.
            lang = detect_lang(result, min_len=40)
            if lang and lang != 'en':
                retry = translate_chunk(chunk, attempt=1)
                if detect_lang(retry, min_len=40) in ('en', None):
                    result = retry
                else:
                    bad += 1
            translated.append(result)
        except Exception as e:
            print(f" chunk {i+1} ERROR: {e}", flush=True)
            translated.append(chunk)
            bad += 1
    return translated, bad


def _save_translation(post_id, title, content):
    """Write the translated title/content to wp_posts on a fresh connection."""
    conn = pymysql.connect(**DB)
    try:
        cur = conn.cursor()
        cur.execute("UPDATE wp_posts SET post_title=%s, post_content=%s WHERE ID=%s",
                    (title, content, post_id))
        conn.commit()
    finally:
        # Always release the connection, even when the UPDATE/commit fails.
        conn.close()


def main():
    """Retranslate every published English post with ID > 42760.

    For each English post, the Spanish original is looked up through the
    ``post_translations`` term description, its title and content are
    translated (content chunk by chunk), the AI footer is appended if
    missing, and the result is written back to ``wp_posts``. Progress and a
    final summary are printed to stdout. Posts with no Spanish original,
    empty Spanish content, or fewer than 50 plain-text characters are
    skipped. A failure while processing one post is logged and counted, and
    the run continues with the next post.
    """
    db = pymysql.connect(**DB)
    try:
        c = db.cursor()

        # Fetch all EN posts with their Spanish originals
        c.execute("""
            SELECT DISTINCT p.ID, p.post_title,
            ttg.description as group_desc
            FROM wp_posts p
            JOIN wp_term_relationships trl ON p.ID=trl.object_id
            JOIN wp_term_taxonomy ttl ON trl.term_taxonomy_id=ttl.term_taxonomy_id AND ttl.taxonomy='language'
            JOIN wp_terms t_lang ON ttl.term_id=t_lang.term_id AND t_lang.slug='en'
            JOIN wp_term_relationships trg ON p.ID=trg.object_id
            JOIN wp_term_taxonomy ttg ON trg.term_taxonomy_id=ttg.term_taxonomy_id AND ttg.taxonomy='post_translations'
            WHERE p.ID > 42760 AND p.post_type='post' AND p.post_status='publish'
            ORDER BY p.ID
        """)
        posts = c.fetchall()
        print(f"Found {len(posts)} EN posts to retranslate\n", flush=True)

        done = errors = skipped = 0
        total = len(posts)

        for n, p in enumerate(posts, 1):
            post_id = p['ID']
            desc = p['group_desc'] or ''
            # The group description is a serialized map of language -> post ID;
            # pull out the ID stored under the "es" key.
            m = re.search(r's:2:"es";i:(\d+);', desc)
            if not m:
                print(f"[{n}/{total}] {post_id} — SKIP (no ES original in group)", flush=True)
                skipped += 1
                continue

            es_id = int(m.group(1))
            c.execute("SELECT post_title, post_content FROM wp_posts WHERE ID=%s", (es_id,))
            es = c.fetchone()
            if not es or not es['post_content']:
                print(f"[{n}/{total}] {post_id} — SKIP (ES:{es_id} empty)", flush=True)
                skipped += 1
                continue

            es_title = es['post_title'] or ''
            es_content = es['post_content']
            plain_len = len(strip_html(es_content))
            chunks = split_chunks(es_content)

            print(f"\n[{n}/{total}] WP:{post_id} ← ES:{es_id} — {es_title[:50]}", flush=True)
            print(f" {plain_len} chars, {len(chunks)} chunks", flush=True)

            if plain_len < 50:
                print(f" SKIP (too short)", flush=True)
                skipped += 1
                continue

            try:
                t0 = time.time()

                # Translate title
                t_title = translate_title(es_title)

                # Translate content chunk by chunk
                translated, chunk_bad = _translate_chunks(chunks)

                t_content = "\n".join(translated)
                if AI_FOOTER.strip() not in t_content:
                    t_content += AI_FOOTER

                # Validate overall; the result only affects the status symbol
                # printed below, the post is saved either way.
                content_lang = detect_lang(t_content, min_len=80)
                lang_ok = content_lang in ('en', None)
                elapsed = time.time() - t0

                # Save
                _save_translation(post_id, t_title, t_content)

                status = "✓" if lang_ok else "⚠"
                bad_note = f" ({chunk_bad} chunks bad)" if chunk_bad else ""
                print(f" {status} {t_title[:60]} ({elapsed:.0f}s){bad_note}", flush=True)
                done += 1

            except Exception as e:
                print(f" ✗ ERROR: {e}", flush=True)
                errors += 1
    finally:
        # Close the read connection even if the run aborts part-way.
        db.close()

    print(f"\n{'='*50}")
    print(f"Done: {done} ✓ errors: {errors} ✗ skipped: {skipped}")
    print(f"Total: {total}")
|
||||
|
||||
|
||||
# Run the retranslation only when this file is executed as a script,
# not when it is imported as a module.
if __name__ == "__main__":
    main()
|
||||
Reference in New Issue
Block a user