Files
feadulta/scripts/retranslate_chunks.py
T

311 lines
11 KiB
Python

#!/usr/bin/env python3
"""
retranslate_chunks.py
Re-translates posts where content is in the wrong language.
Splits post_content into chunks of ~800 chars (at closing block-tag boundaries
such as </p>, </li>, </hN> and </blockquote>) and translates each chunk
independently to avoid model drift.
"""
import pymysql
import json
import re
import html
import urllib.request
import time
import sys
import csv
from langdetect import detect, LangDetectException, DetectorFactory
# Fix langdetect's seed so language detection gives the same answer on every run.
DetectorFactory.seed = 0
# Chat-completions endpoint and model name used by call_jan().
JAN_URL = "http://172.19.128.1:1337/v1/chat/completions"
JAN_MODEL = "gemma-3-12b-it-Q4_K_M"
# Keyword arguments for pymysql.connect(); DictCursor makes rows come back as dicts.
# NOTE(review): host and credentials are hard-coded here — consider reading them
# from the environment instead of keeping them in source.
DB = dict(host='172.18.0.2', port=3306, user='wordpress_user',
password='wordpress_pass', database='wordpress_db', charset='utf8mb4',
cursorclass=pymysql.cursors.DictCursor)
# Language slug -> English language name inserted into the translation prompts.
LANG_NAMES = {"en": "English", "fr": "French", "it": "Italian", "pt": "Portuguese"}
# Normalisation applied to langdetect results: 'ca' and 'gl' are folded into 'es'.
LANG_NORM = {'es':'es','pt':'pt','fr':'fr','en':'en','it':'it','ca':'es','gl':'es'}
# Footer appended to translated post content when it is not already present.
AI_FOOTER = "\n<p><em>Traducido con IA</em></p>"
CHUNK_SIZE = 800 # max chars per translation chunk
# Number of attempts translate_chunk() makes for each chunk.
MAX_RETRIES = 2
def strip_html(text):
    """Return *text* as plain text.

    Every HTML tag is replaced by a space, HTML entities are decoded, and
    runs of whitespace are collapsed to a single space with the ends trimmed.
    Falsy input (``None`` or an empty string) yields ``''``.
    """
    if not text:
        return ''
    # Tags become spaces (not nothing) so adjacent words do not fuse together.
    without_tags = re.sub(r'<[^>]+>', ' ', text)
    decoded = html.unescape(without_tags)
    collapsed = re.sub(r'\s+', ' ', decoded)
    return collapsed.strip()
def detect_lang(text, min_len=60):
    """Detect the language of an HTML/text fragment.

    The text is reduced to plain text and only its first 600 characters are
    examined.

    Args:
        text: HTML or plain text to inspect.
        min_len: Minimum number of plain-text characters required before
            detection is attempted.

    Returns:
        The detected language code, normalised through ``LANG_NORM`` (codes
        not listed there are returned unchanged), or ``None`` when the sample
        is shorter than ``min_len`` or langdetect cannot classify it.
    """
    sample = strip_html(text)[:600].strip()
    if len(sample) < min_len:
        return None
    try:
        # Run detection once and reuse the result for both lookup and fallback.
        code = detect(sample)
    except LangDetectException:
        # langdetect signals "could not classify" with this exception; anything
        # else (including Ctrl-C) is allowed to propagate.
        return None
    return LANG_NORM.get(code, code)
def call_jan(messages, max_tokens=1200, temperature=0.2, timeout=120):
    """POST a chat-completion request to ``JAN_URL`` and return the reply text.

    Args:
        messages: List of ``{"role": ..., "content": ...}`` dicts sent as the
            conversation.
        max_tokens: Value sent as ``max_tokens`` in the request body.
        temperature: Value sent as ``temperature`` in the request body.
        timeout: Timeout in seconds passed to ``urlopen``.

    Returns:
        The ``content`` of the first choice's message, stripped of
        surrounding whitespace.

    Errors from the HTTP call, JSON decoding, or an unexpected response
    shape are not caught here and propagate to the caller.
    """
    body = {
        "model": JAN_MODEL,
        "messages": messages,
        "temperature": temperature,
        "max_tokens": max_tokens,
    }
    request = urllib.request.Request(
        JAN_URL,
        data=json.dumps(body).encode("utf-8"),
        headers={"Content-Type": "application/json", "Authorization": "Bearer dummy"},
        method="POST",
    )
    with urllib.request.urlopen(request, timeout=timeout) as response:
        raw = response.read()
    reply = json.loads(raw)
    first_choice = reply["choices"][0]
    return first_choice["message"]["content"].strip()
def translate_chunk(chunk, lang_name):
    """Translate a single HTML chunk. Returns translated text or None on failure.

    Args:
        chunk: HTML fragment in Spanish.
        lang_name: Target language name as used in the prompt (e.g. "French").

    Returns:
        The model's translation, or ``None`` when every one of the
        ``MAX_RETRIES`` attempts either raised or came back in the wrong
        language.

    Validation depends on the amount of plain text in the chunk:
    short chunks (< 40 chars) are accepted as soon as the text differs from
    the source (or unconditionally on a retry); longer chunks are accepted
    when the detected language matches the target or cannot be determined.
    """
    system = (
        f"You are a professional translator. Translate the following Spanish text to {lang_name}. "
        f"Preserve all HTML tags exactly as they are. "
        f"Return ONLY the translated text, nothing else. No preamble, no explanation."
    )
    # Language code the detector should report for the target language.
    # Look it up from LANG_NAMES rather than slicing the name: "Portuguese"[:2]
    # is "po", which never matches the detector's "pt". The slice remains the
    # fallback for names that are not in LANG_NAMES.
    expected_lang = next(
        (code for code, name in LANG_NAMES.items() if name == lang_name),
        lang_name[:2].lower(),
    )
    source_plain = strip_html(chunk).strip()
    plain_len = len(source_plain)
    for attempt in range(MAX_RETRIES):
        try:
            result = call_jan([
                {"role": "system", "content": system},
                {"role": "user", "content": chunk}
            ])
            # For short chunks (headings, short phrases) langdetect is unreliable —
            # accept the result as long as it changed from the original Spanish
            if plain_len < 40:
                changed = strip_html(result).strip().lower() != source_plain.lower()
                if changed or attempt > 0:
                    return result
            else:
                lang = detect_lang(result, min_len=40)
                if lang is None or lang == expected_lang:
                    return result
            # Wrong language — retry with more explicit prompt
            system = (
                f"Translate from Spanish to {lang_name}. "
                f"Your response must be entirely in {lang_name}. "
                f"Preserve HTML tags. Return ONLY the translation."
            )
        except Exception:
            # Best effort: give up on the last attempt, otherwise pause briefly
            # before retrying.
            if attempt == MAX_RETRIES - 1:
                return None
            time.sleep(2)
    return None  # all retries failed
def translate_title(title, lang_name):
    """Translate a post title from Spanish, asking for an ALL CAPS result.

    Args:
        title: Spanish title text.
        lang_name: Target language name used in the prompt.

    Returns:
        The translated title with surrounding whitespace and quote characters
        removed, or ``None`` if the request fails.
    """
    try:
        result = call_jan([
            {"role": "system", "content": "You are a translator. Respond ONLY with the translated text, nothing else."},
            {"role": "user", "content": f"Translate from Spanish to {lang_name}, ALL CAPS:\n\n{title}"}
        ], max_tokens=120, temperature=0.1, timeout=30)
        return result.strip().strip('"').strip("'")
    except Exception:
        # Best effort: any request/parsing failure means "no translation".
        # Catching Exception (not a bare except) lets Ctrl-C and SystemExit
        # still stop the script.
        return None
def split_into_chunks(content, max_size=CHUNK_SIZE):
    """Split HTML content into chunks of roughly at most ``max_size`` chars.

    Content is first cut after closing block tags (``</p>``, ``</li>``,
    ``</h1>``-``</h6>``, ``</blockquote>``) and consecutive segments are packed
    together while they fit. A single segment larger than ``max_size`` is
    further divided at sentence ends (whitespace following ``.``, ``!`` or
    ``?``); a sentence that is itself too long is emitted as an oversized
    chunk. Whitespace-only chunks are dropped from the result.
    """
    # The capture group keeps each closing tag in the list, so items alternate
    # between text and the closing tag that follows it.
    tokens = re.split(r'(</p>|</li>|</h[1-6]>|</blockquote>)', content)
    out = []
    buffer = ""
    for start in range(0, len(tokens), 2):
        # Text plus its closing tag (the last item may have no closing tag).
        block = "".join(tokens[start:start + 2])
        if len(buffer) + len(block) <= max_size:
            buffer += block
            continue
        # The block does not fit: flush what has been collected so far.
        if buffer:
            out.append(buffer)
        if len(block) <= max_size:
            buffer = block
            continue
        # The block alone is too big — pack it sentence by sentence instead.
        buffer = ""
        for sentence in re.split(r'(?<=[.!?])\s+', block):
            if len(buffer) + len(sentence) > max_size:
                if buffer:
                    out.append(buffer.strip())
                buffer = sentence + " "
            else:
                buffer += sentence + " "
    if buffer:
        out.append(buffer)
    return [piece for piece in out if piece.strip()]
def translate_content_chunked(content, lang_name):
    """
    Translate a whole post body one chunk at a time.

    Empty or whitespace-only content is returned unchanged with a ratio of
    1.0. Otherwise the content is split with split_into_chunks(); chunks
    holding no visible text are passed through untouched, and chunks whose
    translation fails are kept in their original form.

    Returns (translated_content, success_ratio), where the pieces are joined
    with newlines and success_ratio is 1 minus the share of chunks that
    failed to translate.
    """
    if not (content and content.strip()):
        return content, 1.0
    pieces = split_into_chunks(content)
    output = []
    failures = 0
    for piece in pieces:
        if strip_html(piece).strip():
            translated = translate_chunk(piece, lang_name)
            if translated is None:
                # Fall back to the source chunk so no content is lost.
                failures += 1
                translated = piece
            output.append(translated)
        else:
            # Markup/whitespace only — nothing to translate.
            output.append(piece)
    if pieces:
        ratio = 1.0 - (failures / len(pieces))
    else:
        ratio = 1.0
    return "\n".join(output), ratio
def _load_audit_ids(audit_path):
    """Return the set of integer post IDs found in the ``id`` column of the audit CSV."""
    # newline='' is how the csv module expects its input files to be opened.
    with open(audit_path, newline='') as f:
        return {int(row['id']) for row in csv.DictReader(f)}


def _fetch_posts_to_retranslate(post_ids):
    """Load the flagged published posts and attach each one's Spanish original.

    Returns a list of row dicts (ID, post_title, post_content, lang,
    group_desc) extended with ``es_id``, ``es_title`` and ``es_content``.
    Posts whose translation group has no "es" entry, or whose Spanish post
    cannot be found, are left out. ``post_ids`` must be non-empty.
    """
    db = pymysql.connect(**DB)
    try:
        c = db.cursor()
        # Every ID is forced through int(), so inlining the list cannot inject SQL.
        id_list = ','.join(str(int(i)) for i in sorted(post_ids))
        c.execute(f"""
            SELECT DISTINCT p.ID, p.post_title, p.post_content,
            t_lang.slug as lang,
            ttg.description as group_desc
            FROM wp_posts p
            JOIN wp_term_relationships trl ON p.ID=trl.object_id
            JOIN wp_term_taxonomy ttl ON trl.term_taxonomy_id=ttl.term_taxonomy_id AND ttl.taxonomy='language'
            JOIN wp_terms t_lang ON ttl.term_id=t_lang.term_id
            JOIN wp_term_relationships trg ON p.ID=trg.object_id
            JOIN wp_term_taxonomy ttg ON trg.term_taxonomy_id=ttg.term_taxonomy_id AND ttg.taxonomy='post_translations'
            WHERE p.ID IN ({id_list}) AND p.post_type='post' AND p.post_status='publish'
        """)
        raw_posts = c.fetchall()
        # Fetch Spanish originals
        posts = []
        es_cache = {}
        for p in raw_posts:
            desc = p['group_desc'] or ''
            # The group description is a serialized map; pull out the post ID
            # stored under the "es" key.
            m = re.search(r's:2:"es";i:(\d+);', desc)
            if not m:
                continue
            es_id = int(m.group(1))
            if es_id not in es_cache:
                c.execute("SELECT ID, post_title, post_content FROM wp_posts WHERE ID=%s", (es_id,))
                es_cache[es_id] = c.fetchone()
            es = es_cache[es_id]
            if es:
                posts.append({**p, 'es_id': es_id, 'es_title': es['post_title'], 'es_content': es['post_content']})
        return posts
    finally:
        # Close the connection even if a query fails.
        db.close()


def _update_post(post_id, title, content):
    """Write the translated title and content to ``wp_posts`` on a fresh connection."""
    db = pymysql.connect(**DB)
    try:
        c = db.cursor()
        c.execute("UPDATE wp_posts SET post_title=%s, post_content=%s WHERE ID=%s",
                  (title, content, post_id))
        db.commit()
    finally:
        db.close()


def main():
    """Re-translate every post listed in the audit CSV from its Spanish original.

    Steps:
      1. Read post IDs from ``/tmp/audit_clean.csv`` (exits with status 1 if
         the file is missing; returns early if it lists no IDs).
      2. Load those published posts together with their Spanish originals.
      3. For each Spanish original, translate title and content into the
         language of each flagged translation and write the result back.
      4. Print a per-post status line and a final summary.

    Status symbols: ``✓`` fully translated and language check passed,
    ``~`` language check passed but some chunks failed, ``⚠`` language check
    failed, ``✗`` an exception occurred for that post.
    """
    audit_path = '/tmp/audit_clean.csv'
    try:
        failed_ids = _load_audit_ids(audit_path)
    except FileNotFoundError:
        print(f"ERROR: {audit_path} not found. Run audit_translations.py first.")
        sys.exit(1)
    print(f"Loaded {len(failed_ids)} post IDs with issues from audit")
    if not failed_ids:
        # An empty list would render as "IN ()", which is not valid SQL.
        print("Nothing to retranslate.")
        return

    posts = _fetch_posts_to_retranslate(failed_ids)
    print(f"Fetched {len(posts)} posts to retranslate\n")

    # Group the flagged translations by the Spanish post they derive from.
    by_es = {}
    for p in posts:
        by_es.setdefault(p['es_id'], []).append(p)

    done = errors = skipped = partial = 0
    total = len(posts)
    n = 0
    for es_id, translations in sorted(by_es.items()):
        es_title = translations[0]['es_title'] or ''
        es_content = translations[0]['es_content'] or ''
        content_len = len(strip_html(es_content))
        if content_len < 50:
            print(f" ES:{es_id} — SKIPPING (too short: {content_len} chars)")
            skipped += len(translations)
            n += len(translations)
            continue
        # Show chunk count for visibility
        chunks = split_into_chunks(es_content)
        print(f"\nES:{es_id} — {es_title[:50]} ({content_len} chars, {len(chunks)} chunks)")
        for p in translations:
            # NOTE(review): nothing here prevents p from being the Spanish
            # original itself — confirm the audit only lists translations.
            post_id = p['ID']
            lang = p['lang']
            lang_name = LANG_NAMES.get(lang, lang)
            n += 1
            try:
                t0 = time.time()
                # Translate title
                t_title = translate_title(es_title, lang_name) if es_title else ''
                if not t_title or t_title.upper() == es_title.upper():
                    t_title = p['post_title']  # keep existing if translation failed
                # Translate content chunk by chunk
                t_content, ratio = translate_content_chunked(es_content, lang_name)
                elapsed = time.time() - t0
                # Validate overall content language
                content_lang = detect_lang(t_content, min_len=80)
                lang_ok = (content_lang == lang) or content_lang is None
                # Add AI footer
                if AI_FOOTER.strip() not in t_content:
                    t_content = t_content + AI_FOOTER
                # Update DB (a new connection per post; it is closed even on error)
                _update_post(post_id, t_title, t_content)
                status = "✓" if (lang_ok and ratio == 1.0) else ("~" if lang_ok else "⚠")
                if ratio < 1.0:
                    partial += 1
                elif lang_ok:
                    done += 1
                else:
                    errors += 1
                print(f" [{lang}] {status} {post_id}: {t_title[:50]} ({elapsed:.0f}s, {ratio:.0%} ok)")
            except Exception as e:
                # One failing post must not abort the whole batch.
                print(f" [{lang}] ✗ ERROR on {post_id}: {e}")
                errors += 1
    print(f"\n{'='*50}")
    print(f"Done: {done} ✓ partial: {partial} ~ errors/wrong-lang: {errors} ⚠ skipped: {skipped}")
    print(f"Total: {n}/{total}")


if __name__ == "__main__":
    main()