#!/usr/bin/env python3 """ retranslate_lang.py Retranslates ALL posts for a given language (ID > 42760) from their Spanish originals. Uses chunk-based translation (~800 chars per chunk) to avoid model drift. Sequential, single process. Usage: python3 retranslate_lang.py fr python3 retranslate_lang.py it python3 retranslate_lang.py pt """ import pymysql, json, re, html, urllib.request, time, sys from langdetect import detect, LangDetectException, DetectorFactory DetectorFactory.seed = 0 JAN_URL = "http://172.19.128.1:1337/v1/chat/completions" JAN_MODEL = "gemma-3-12b-it-Q4_K_M" DB = dict(host='172.18.0.2', port=3306, user='wordpress_user', password='wordpress_pass', database='wordpress_db', charset='utf8mb4', cursorclass=pymysql.cursors.DictCursor) LANG_CONFIG = { "en": {"name": "English", "footer": "
English version translated with AI
"}, "fr": {"name": "French", "footer": "Version française traduite par IA
"}, "it": {"name": "Italian", "footer": "Versione italiana tradotta con IA
"}, "pt": {"name": "Portuguese", "footer": "Versão portuguesa traduzida com IA
"}, } CHUNK_SIZE = 800 MAX_RETRIES = 2 def strip_html(text): if not text: return '' text = re.sub(r'<[^>]+>', ' ', text) text = html.unescape(text) return re.sub(r'\s+', ' ', text).strip() def detect_lang(text, min_len=40): t = strip_html(text)[:400].strip() if len(t) < min_len: return None try: return detect(t) except: return None def call_jan(messages, max_tokens=1200, temperature=0.2, timeout=150): payload = json.dumps({ "model": JAN_MODEL, "messages": messages, "temperature": temperature, "max_tokens": max_tokens, }).encode("utf-8") req = urllib.request.Request( JAN_URL, data=payload, headers={"Content-Type": "application/json", "Authorization": "Bearer dummy"}, method="POST" ) with urllib.request.urlopen(req, timeout=timeout) as r: return json.loads(r.read())["choices"][0]["message"]["content"].strip() def fix_html_structure(content): """Fix common model errors: markdown bold → HTML, orphaned text →wrapped, unclosed
before a new
.""" # **text** →
text
content = re.sub(r'\*\*(.+?)\*\*', lambda m: '' + m.group(1).strip() + '
', content) # Lines of bare text not inside any block tag → wrap inlines = content.split('\n') fixed = [] for line in lines: s = line.strip() if s and not s.startswith('<') and not s.startswith('