311 lines
11 KiB
Python
311 lines
11 KiB
Python
#!/usr/bin/env python3
|
|
"""
|
|
retranslate_chunks.py
|
|
|
|
Re-translates posts where content is in the wrong language.
|
|
Splits post_content into chunks of ~800 chars (at </p> boundaries)
|
|
and translates each chunk independently to avoid model drift.
|
|
"""
|
|
|
|
import pymysql
|
|
import json
|
|
import re
|
|
import html
|
|
import urllib.request
|
|
import time
|
|
import sys
|
|
import csv
|
|
from langdetect import detect, LangDetectException, DetectorFactory
|
|
# langdetect is non-deterministic unless its factory seed is pinned.
DetectorFactory.seed = 0

# Chat-completions endpoint and model name used for every translation call.
JAN_URL = "http://172.19.128.1:1337/v1/chat/completions"
JAN_MODEL = "gemma-3-12b-it-Q4_K_M"

# Keyword arguments passed verbatim to pymysql.connect().
# NOTE(review): credentials are hard-coded here — consider moving them to
# the environment before this script is shared.
DB = {
    'host': '172.18.0.2',
    'port': 3306,
    'user': 'wordpress_user',
    'password': 'wordpress_pass',
    'database': 'wordpress_db',
    'charset': 'utf8mb4',
    'cursorclass': pymysql.cursors.DictCursor,
}

# Language slug -> English language name used inside the prompts.
LANG_NAMES = {
    "en": "English",
    "fr": "French",
    "it": "Italian",
    "pt": "Portuguese",
}

# Detector output -> slug used by this script; 'ca' and 'gl' detections
# are folded into 'es'.
LANG_NORM = {
    'es': 'es',
    'pt': 'pt',
    'fr': 'fr',
    'en': 'en',
    'it': 'it',
    'ca': 'es',
    'gl': 'es',
}

# Paragraph appended to translated content when it is not already present.
AI_FOOTER = "\n<p><em>Traducido con IA</em></p>"

CHUNK_SIZE = 800  # max chars per translation chunk
MAX_RETRIES = 2   # attempts per chunk before it is reported as failed
|
|
|
|
|
|
def strip_html(text):
    """Return *text* as plain text.

    Tags are replaced by a space, HTML entities are decoded, and every run
    of whitespace is collapsed to a single space. Falsy input (``None`` or
    an empty string) yields ``''``.
    """
    if not text:
        return ''
    without_tags = re.sub(r'<[^>]+>', ' ', text)
    decoded = html.unescape(without_tags)
    collapsed = re.sub(r'\s+', ' ', decoded)
    return collapsed.strip()
|
|
|
|
|
|
def detect_lang(text, min_len=60):
    """Guess the language of an HTML fragment.

    The fragment is reduced to plain text and only its first 600 characters
    are examined.

    Args:
        text: HTML or plain text to inspect (may be empty or ``None``).
        min_len: minimum number of plain-text characters required before a
            detection is attempted.

    Returns:
        The detected language code, mapped through ``LANG_NORM`` when an
        entry exists (otherwise the raw detector code), or ``None`` when
        the sample is too short or the detector cannot decide.
    """
    sample = strip_html(text)[:600].strip()
    if len(sample) < min_len:
        return None
    try:
        # Detect once; the original ran the detector twice per call.
        code = detect(sample)
    except LangDetectException:
        # Raised by langdetect when the sample has no usable features.
        return None
    return LANG_NORM.get(code, code)
|
|
|
|
|
|
def call_jan(messages, max_tokens=1200, temperature=0.2, timeout=120):
    """Send a chat-completion request to ``JAN_URL`` and return the reply text.

    Args:
        messages: list of ``{"role": ..., "content": ...}`` dicts.
        max_tokens: value forwarded as ``max_tokens`` in the request body.
        temperature: value forwarded as ``temperature`` in the request body.
        timeout: socket timeout in seconds passed to ``urlopen``.

    Returns:
        The stripped ``content`` of the first choice in the JSON response.

    Any error raised by ``urlopen``, JSON decoding or the response lookup
    propagates to the caller.
    """
    body = {
        "model": JAN_MODEL,
        "messages": messages,
        "temperature": temperature,
        "max_tokens": max_tokens,
    }
    request = urllib.request.Request(
        JAN_URL,
        data=json.dumps(body).encode("utf-8"),
        headers={"Content-Type": "application/json", "Authorization": "Bearer dummy"},
        method="POST",
    )
    with urllib.request.urlopen(request, timeout=timeout) as response:
        reply = json.loads(response.read())
    first_choice = reply["choices"][0]
    return first_choice["message"]["content"].strip()
|
|
|
|
|
|
def translate_chunk(chunk, lang_name):
    """Translate a single HTML chunk. Returns translated text or None on failure.

    Args:
        chunk: HTML fragment in Spanish.
        lang_name: English name of the target language as used in the
            prompt (e.g. ``"French"``).

    Returns:
        The model output, or ``None`` when every attempt either raised or
        came back in the wrong language.

    Up to ``MAX_RETRIES`` attempts are made. Chunks with fewer than 40
    plain-text characters are accepted as soon as the output differs from
    the input (or unconditionally on a retry); longer chunks are accepted
    when the detected language matches the target or cannot be detected.
    """
    # Resolve the code the detector should report for this language.
    # Taking the first two letters of the name is wrong for "Portuguese"
    # ("po" != "pt"), so look the name up in LANG_NAMES first and keep the
    # two-letter prefix only as a fallback for names not listed there.
    expected_code = next(
        (code for code, name in LANG_NAMES.items() if name == lang_name),
        lang_name[:2].lower(),
    )

    system = (
        f"You are a professional translator. Translate the following Spanish text to {lang_name}. "
        "Preserve all HTML tags exactly as they are. "
        "Return ONLY the translated text, nothing else. No preamble, no explanation."
    )
    source_plain = strip_html(chunk).strip()
    is_short = len(source_plain) < 40

    for attempt in range(MAX_RETRIES):
        try:
            result = call_jan([
                {"role": "system", "content": system},
                {"role": "user", "content": chunk},
            ])
            if is_short:
                # For short chunks (headings, short phrases) langdetect is
                # unreliable — accept the result as long as it changed from
                # the original Spanish, or once we have already retried.
                changed = strip_html(result).strip().lower() != source_plain.lower()
                if changed or attempt > 0:
                    return result
            else:
                lang = detect_lang(result, min_len=40)
                if lang is None or lang == expected_code:
                    return result
            # Wrong language — retry with a more explicit prompt.
            system = (
                f"Translate from Spanish to {lang_name}. "
                f"Your response must be entirely in {lang_name}. "
                "Preserve HTML tags. Return ONLY the translation."
            )
        except Exception:
            # Network / decoding / response-shape errors: back off briefly
            # and retry, giving up after the last attempt.
            if attempt == MAX_RETRIES - 1:
                return None
            time.sleep(2)
    return None  # all retries failed
|
|
|
|
|
|
def translate_title(title, lang_name):
    """Translate a post title from Spanish, asking the model for ALL CAPS.

    Args:
        title: Spanish title text.
        lang_name: English name of the target language used in the prompt.

    Returns:
        The model output with surrounding whitespace and quote characters
        removed, or ``None`` if the request fails for any reason.
    """
    try:
        result = call_jan([
            {"role": "system", "content": "You are a translator. Respond ONLY with the translated text, nothing else."},
            {"role": "user", "content": f"Translate from Spanish to {lang_name}, ALL CAPS:\n\n{title}"}
        ], max_tokens=120, temperature=0.1, timeout=30)
    except Exception:
        # Best-effort: the caller falls back to the existing title.
        # ``Exception`` (not a bare except) so Ctrl-C still stops the batch.
        return None
    return result.strip().strip('"').strip("'")
|
|
|
|
|
|
def split_into_chunks(content, max_size=CHUNK_SIZE):
    """Split HTML content into chunks of roughly ``max_size`` characters.

    Content is cut after ``</p>``, ``</li>``, ``</h1>``-``</h6>`` and
    ``</blockquote>``; consecutive segments are packed together while they
    fit. A single segment longer than ``max_size`` is further split at
    sentence-ending punctuation followed by whitespace. Whitespace-only
    chunks are dropped from the result.
    """
    tokens = re.split(r'(</p>|</li>|</h[1-6]>|</blockquote>)', content)
    # With one capture group the split alternates text, closer, text, ...
    # and always ends on a text item, which therefore has no closer.
    texts = tokens[0::2]
    closers = tokens[1::2] + [""]

    out = []
    buf = ""
    for text, closer in zip(texts, closers):
        segment = text + closer

        # Segment still fits in the chunk being built.
        if len(buf) + len(segment) <= max_size:
            buf += segment
            continue

        # It does not fit: flush what we have so far.
        if buf:
            out.append(buf)

        # A segment that fits on its own starts the next chunk.
        if len(segment) <= max_size:
            buf = segment
            continue

        # Oversized segment: pack it sentence by sentence.
        buf = ""
        for sentence in re.split(r'(?<=[.!?])\s+', segment):
            if len(buf) + len(sentence) > max_size:
                if buf:
                    out.append(buf.strip())
                buf = sentence + " "
            else:
                buf += sentence + " "

    if buf:
        out.append(buf)

    return [piece for piece in out if piece.strip()]
|
|
|
|
|
|
def translate_content_chunked(content, lang_name):
    """Translate a full ``post_content`` chunk by chunk.

    Args:
        content: HTML of the Spanish source post.
        lang_name: English name of the target language.

    Returns:
        ``(translated_content, success_ratio)``. Chunks are joined with a
        newline. ``success_ratio`` is ``1 - failed / total_chunks``; empty
        or whitespace-only content is returned unchanged with ratio 1.0.
    """
    if not content or not content.strip():
        return content, 1.0

    pieces = split_into_chunks(content)
    output = []
    failures = 0

    for piece in pieces:
        # Chunks that are only HTML tags / whitespace pass through untouched.
        has_text = bool(strip_html(piece).strip())
        translated = translate_chunk(piece, lang_name) if has_text else piece
        if translated is None:
            # Keep the original chunk rather than losing it.
            failures += 1
            translated = piece
        output.append(translated)

    ratio = 1.0 - (failures / len(pieces)) if pieces else 1.0
    return "\n".join(output), ratio
|
|
|
|
|
|
def main():
    """Re-translate every post listed in the audit CSV.

    Steps:
      1. Read post IDs from ``/tmp/audit_clean.csv`` (column ``id``); exit
         with status 1 if the file is missing.
      2. Load those published posts together with their language slug and
         translation-group description, and look up the Spanish original
         referenced by each group.
      3. For each Spanish original, translate title and content into the
         language of every listed translation and write the result back to
         ``wp_posts``.
      4. Print a per-post status line and a final summary.
    """
    audit_path = '/tmp/audit_clean.csv'
    failed_ids = set()
    try:
        # newline='' is what the csv module documents for reader input.
        with open(audit_path, newline='') as f:
            for row in csv.DictReader(f):
                failed_ids.add(int(row['id']))
        print(f"Loaded {len(failed_ids)} post IDs with issues from audit")
    except FileNotFoundError:
        print(f"ERROR: {audit_path} not found. Run audit_translations.py first.")
        sys.exit(1)

    if not failed_ids:
        # An empty list would render as "IN ()", which is a SQL syntax error.
        print("Nothing to retranslate: audit file lists no post IDs.")
        return

    ids = sorted(failed_ids)
    placeholders = ','.join(['%s'] * len(ids))

    posts = []
    es_cache = {}
    db = pymysql.connect(**DB)
    try:
        c = db.cursor()
        # IDs are bound as parameters rather than formatted into the SQL.
        c.execute(f"""
            SELECT DISTINCT p.ID, p.post_title, p.post_content,
                   t_lang.slug as lang,
                   ttg.description as group_desc
            FROM wp_posts p
            JOIN wp_term_relationships trl ON p.ID=trl.object_id
            JOIN wp_term_taxonomy ttl ON trl.term_taxonomy_id=ttl.term_taxonomy_id AND ttl.taxonomy='language'
            JOIN wp_terms t_lang ON ttl.term_id=t_lang.term_id
            JOIN wp_term_relationships trg ON p.ID=trg.object_id
            JOIN wp_term_taxonomy ttg ON trg.term_taxonomy_id=ttg.term_taxonomy_id AND ttg.taxonomy='post_translations'
            WHERE p.ID IN ({placeholders}) AND p.post_type='post' AND p.post_status='publish'
        """, ids)
        raw_posts = c.fetchall()

        # Fetch Spanish originals.
        # group_desc is searched for an `s:2:"es";i:<id>;` entry —
        # presumably a PHP-serialized language -> post ID map; TODO confirm.
        for p in raw_posts:
            desc = p['group_desc'] or ''
            m = re.search(r's:2:"es";i:(\d+);', desc)
            if not m:
                continue
            es_id = int(m.group(1))
            if es_id not in es_cache:
                c.execute("SELECT ID, post_title, post_content FROM wp_posts WHERE ID=%s", (es_id,))
                es_cache[es_id] = c.fetchone()
            es = es_cache[es_id]
            if es:
                posts.append({**p, 'es_id': es_id, 'es_title': es['post_title'], 'es_content': es['post_content']})
    finally:
        # Release the connection even if a query fails.
        db.close()
    print(f"Fetched {len(posts)} posts to retranslate\n")

    # Group translations by their Spanish original.
    by_es = {}
    for p in posts:
        by_es.setdefault(p['es_id'], []).append(p)

    done = errors = skipped = partial = 0
    total = len(posts)
    n = 0

    for es_id, translations in sorted(by_es.items()):
        es_title = translations[0]['es_title'] or ''
        es_content = translations[0]['es_content'] or ''
        content_len = len(strip_html(es_content))

        if content_len < 50:
            print(f" ES:{es_id} — SKIPPING (too short: {content_len} chars)")
            skipped += len(translations)
            n += len(translations)
            continue

        # Show chunk count for visibility
        chunks = split_into_chunks(es_content)
        print(f"\nES:{es_id} — {es_title[:50]} ({content_len} chars, {len(chunks)} chunks)")

        for p in translations:
            post_id = p['ID']
            lang = p['lang']
            lang_name = LANG_NAMES.get(lang, lang)
            n += 1

            try:
                t0 = time.time()

                # Translate title
                t_title = translate_title(es_title, lang_name) if es_title else ''
                if not t_title or t_title.upper() == es_title.upper():
                    t_title = p['post_title']  # keep existing if translation failed

                # Translate content chunk by chunk
                t_content, ratio = translate_content_chunked(es_content, lang_name)
                elapsed = time.time() - t0

                # Validate overall content language
                content_lang = detect_lang(t_content, min_len=80)
                lang_ok = (content_lang == lang) or content_lang is None

                # Add AI footer
                if AI_FOOTER.strip() not in t_content:
                    t_content = t_content + AI_FOOTER

                # Update DB on a fresh connection per post.
                # NOTE(review): the row is written even when ratio is 0.0 or
                # the language check fails — confirm that is intended.
                db2 = pymysql.connect(**DB)
                try:
                    c2 = db2.cursor()
                    c2.execute("UPDATE wp_posts SET post_title=%s, post_content=%s WHERE ID=%s",
                               (t_title, t_content, post_id))
                    db2.commit()
                finally:
                    db2.close()

                status = "✓" if (lang_ok and ratio == 1.0) else ("~" if lang_ok else "⚠")
                if ratio < 1.0:
                    partial += 1
                elif lang_ok:
                    done += 1
                else:
                    errors += 1

                print(f" [{lang}] {status} {post_id}: {t_title[:50]} ({elapsed:.0f}s, {ratio:.0%} ok)")

            except Exception as e:
                # One bad post must not stop the batch.
                print(f" [{lang}] ✗ ERROR on {post_id}: {e}")
                errors += 1

    print(f"\n{'='*50}")
    print(f"Done: {done} ✓ partial: {partial} ~ errors/wrong-lang: {errors} ⚠ skipped: {skipped}")
    print(f"Total: {n}/{total}")


if __name__ == "__main__":
    main()
|