#!/usr/bin/env python3
"""
retranslate_en_all.py

Retranslates ALL English posts (ID > 42760) from their Spanish originals.
Uses chunk-based translation (~800 chars per chunk) to avoid model drift.
Sequential, single process.
"""

import html
import json
import re
import sys
import time
import urllib.request

import pymysql
from langdetect import DetectorFactory, LangDetectException, detect

# Seed the detector once at import time so repeated runs see the same
# language guesses for the same text.
DetectorFactory.seed = 0

JAN_URL = "http://172.19.128.1:1337/v1/chat/completions"
JAN_MODEL = "gemma-3-12b-it-Q4_K_M"

# NOTE(review): connection credentials are hard-coded; consider moving them
# to the environment or a config file.
DB = dict(
    host='172.18.0.2',
    port=3306,
    user='wordpress_user',
    password='wordpress_pass',
    database='wordpress_db',
    charset='utf8mb4',
    cursorclass=pymysql.cursors.DictCursor,
)

CHUNK_SIZE = 800
MAX_RETRIES = 2

# NOTE(review): the footer literal reached this file with raw line breaks and
# no markup; any HTML wrapper it originally had (e.g. a paragraph tag) is not
# recoverable from here — confirm the intended value.
AI_FOOTER = "\n\nTraducido con IA\n"

# Delimiters after which content may be cut into chunks. The pattern must keep
# exactly ONE capturing group: split_chunks relies on re.split returning
# alternating (text, delimiter) items.
# NOTE(review): the original alternation was empty ('(|||)'), which made
# re.split cut between every character. This list of block-level closing tags
# is a reconstruction — confirm it against the intended delimiters.
_BLOCK_END_RE = re.compile(
    r'(</(?:p|div|li|ul|ol|blockquote|h[1-6])>)', re.IGNORECASE
)
_SENTENCE_END_RE = re.compile(r'(?<=[.!?])\s+')

# System prompts indexed by attempt number; the last one is reused for any
# further attempt.
_CHUNK_PROMPTS = (
    "You are a professional translator. Translate the following Spanish text to English. Preserve all HTML tags exactly. Return ONLY the translated text, no preamble, no explanation.",
    "Translate from Spanish to English. Your entire response must be in English. Preserve HTML tags. Return ONLY the translation, nothing else.",
)

_EN_POSTS_SQL = """
    SELECT DISTINCT p.ID, p.post_title, ttg.description as group_desc
    FROM wp_posts p
    JOIN wp_term_relationships trl ON p.ID=trl.object_id
    JOIN wp_term_taxonomy ttl ON trl.term_taxonomy_id=ttl.term_taxonomy_id
        AND ttl.taxonomy='language'
    JOIN wp_terms t_lang ON ttl.term_id=t_lang.term_id AND t_lang.slug='en'
    JOIN wp_term_relationships trg ON p.ID=trg.object_id
    JOIN wp_term_taxonomy ttg ON trg.term_taxonomy_id=ttg.term_taxonomy_id
        AND ttg.taxonomy='post_translations'
    WHERE p.ID > 42760 AND p.post_type='post' AND p.post_status='publish'
    ORDER BY p.ID
"""


def strip_html(text):
    """Return *text* with tags removed, entities decoded and whitespace collapsed.

    Falsy input (``None``, ``''``) yields an empty string.
    """
    if not text:
        return ''
    text = re.sub(r'<[^>]+>', ' ', text)
    text = html.unescape(text)
    return re.sub(r'\s+', ' ', text).strip()


def detect_lang(text, min_len=40):
    """Guess the language code of *text*, or return ``None`` when undecidable.

    Only the first 400 characters of the tag-stripped text are examined.
    ``None`` is returned when that sample is shorter than *min_len* or when
    detection fails for any reason (detection is best-effort).
    """
    sample = strip_html(text)[:400].strip()
    if len(sample) < min_len:
        return None
    try:
        return detect(sample)
    except LangDetectException:
        return None
    except Exception:
        # Keep the original best-effort contract without swallowing
        # KeyboardInterrupt/SystemExit the way a bare ``except:`` would.
        return None


def call_jan(messages, max_tokens=1200, temperature=0.2, timeout=120):
    """POST a chat-completion request to JAN_URL and return the reply text.

    Args:
        messages: list of ``{"role": ..., "content": ...}`` dicts.
        max_tokens: completion token limit sent to the server.
        temperature: sampling temperature sent to the server.
        timeout: socket timeout in seconds.

    Returns:
        The stripped ``content`` of the first choice in the response.

    Raises:
        Whatever ``urllib.request.urlopen`` raises on transport errors, and
        ``KeyError``/``IndexError``/``json.JSONDecodeError`` when the response
        does not have the expected shape.
    """
    payload = json.dumps({
        "model": JAN_MODEL,
        "messages": messages,
        "temperature": temperature,
        "max_tokens": max_tokens,
    }).encode("utf-8")
    req = urllib.request.Request(
        JAN_URL,
        data=payload,
        headers={
            "Content-Type": "application/json",
            "Authorization": "Bearer dummy",
        },
        method="POST",
    )
    with urllib.request.urlopen(req, timeout=timeout) as r:
        body = json.loads(r.read())
    return body["choices"][0]["message"]["content"].strip()


def translate_chunk(chunk, attempt=0):
    """Translate one Spanish HTML chunk to English via the model.

    *attempt* selects the system prompt (0 = default, 1+ = stricter wording).
    On the first attempt, a short chunk (< 40 visible characters) that comes
    back unchanged is retried once with the stricter prompt.
    """
    system = _CHUNK_PROMPTS[min(attempt, len(_CHUNK_PROMPTS) - 1)]
    result = call_jan([
        {"role": "system", "content": system},
        {"role": "user", "content": chunk},
    ])

    if attempt == 0:
        # Short chunks: retry if output == input (model didn't translate).
        plain_in = strip_html(chunk).strip().lower()
        plain_out = strip_html(result).strip().lower()
        if len(plain_in) < 40 and plain_in == plain_out:
            return translate_chunk(chunk, attempt=1)
    return result


def translate_title(es_title):
    """Translate a post title to English, falling back to the original.

    The original title is returned unchanged when it is blank, when the model
    call fails, or when the model returns nothing or the same text.
    """
    if not es_title or not es_title.strip():
        # Nothing to translate; do not ask the model to invent a title.
        return es_title
    try:
        result = call_jan(
            [
                {"role": "system", "content": "You are a translator. Respond ONLY with the translated text, nothing else."},
                {"role": "user", "content": f"Translate from Spanish to English, ALL CAPS:\n\n{es_title}"},
            ],
            max_tokens=150,
            temperature=0.1,
            timeout=30,
        )
    except Exception:
        # Best-effort: a failed title must not abort the post. Unlike a bare
        # ``except:``, this still lets Ctrl-C stop the run.
        return es_title

    result = result.strip().strip('"').strip("'")
    # Reject if empty or identical to original.
    if not result or result.upper() == es_title.upper():
        return es_title
    return result


def split_chunks(content, chunk_size=CHUNK_SIZE):
    """Split HTML *content* into chunks of roughly *chunk_size* characters.

    Content is cut after block-level closing tags and the pieces are packed
    greedily. A single piece longer than *chunk_size* is further split at
    sentence boundaries; a single sentence longer than *chunk_size* is kept
    whole. Chunks with no visible text are dropped.
    """
    if not content:
        return []

    parts = _BLOCK_END_RE.split(content)
    # With one capturing group, parts is [text, delim, text, delim, ..., text];
    # glue every text piece to the delimiter that follows it.
    segments = [
        body + closer for body, closer in zip(parts[0::2], parts[1::2] + [""])
    ]

    chunks = []
    current = ""
    for segment in segments:
        if len(current) + len(segment) <= chunk_size:
            current += segment
            continue

        if current:
            chunks.append(current)

        if len(segment) <= chunk_size:
            current = segment
            continue

        # Split long segment at sentence boundaries.
        current = ""
        for sentence in _SENTENCE_END_RE.split(segment):
            if len(current) + len(sentence) <= chunk_size:
                current += sentence + " "
            else:
                if current:
                    chunks.append(current.strip())
                current = sentence + " "

    if current:
        chunks.append(current)
    return [c for c in chunks if strip_html(c).strip()]


def _translate_content(chunks):
    """Translate *chunks* in order; return ``(joined_text, bad_chunk_count)``.

    A chunk whose translation is detected as non-English is retried once with
    the stricter prompt. A chunk whose translation raises is kept in its
    original (untranslated) form and counted as bad.
    """
    translated = []
    chunk_bad = 0
    for i, chunk in enumerate(chunks):
        try:
            result = translate_chunk(chunk, attempt=0)
            lang = detect_lang(result, min_len=40)
            # A non-None language already implies >= 40 visible characters.
            if lang and lang != 'en':
                retry = translate_chunk(chunk, attempt=1)
                if detect_lang(retry, min_len=40) in ('en', None):
                    result = retry
                else:
                    chunk_bad += 1
            translated.append(result)
        except Exception as e:
            print(f" chunk {i+1} ERROR: {e}", flush=True)
            translated.append(chunk)
            chunk_bad += 1

    t_content = "\n".join(translated)
    if AI_FOOTER.strip() not in t_content:
        t_content += AI_FOOTER
    return t_content, chunk_bad


def _save_translation(post_id, t_title, t_content):
    """Write the translated title and content to wp_posts for *post_id*.

    Uses a short-lived connection that is always closed, even when the
    UPDATE or commit fails.
    """
    # NOTE(review): a fresh connection per write is kept from the original —
    # presumably so the write does not depend on a connection that sat idle
    # during a slow translation; confirm.
    conn = pymysql.connect(**DB)
    try:
        with conn.cursor() as cur:
            cur.execute(
                "UPDATE wp_posts SET post_title=%s, post_content=%s WHERE ID=%s",
                (t_title, t_content, post_id),
            )
        conn.commit()
    finally:
        conn.close()


def main():
    """Retranslate every published EN post above the ID cutoff, one at a time.

    For each EN post the Spanish sibling is looked up through the
    ``post_translations`` group description, its content is translated chunk
    by chunk, and the EN post's title and content are overwritten. Progress
    and a final summary are printed to stdout.
    """
    db = pymysql.connect(**DB)
    try:
        c = db.cursor()

        # Fetch all EN posts with their Spanish originals.
        c.execute(_EN_POSTS_SQL)
        posts = c.fetchall()
        print(f"Found {len(posts)} EN posts to retranslate\n", flush=True)

        done = errors = skipped = 0
        total = len(posts)

        for n, p in enumerate(posts, 1):
            post_id = p['ID']
            desc = p['group_desc'] or ''
            # The group description is a PHP-serialized map of language
            # slug -> post ID; pull out the 'es' entry.
            m = re.search(r's:2:"es";i:(\d+);', desc)
            if not m:
                print(f"[{n}/{total}] {post_id} — SKIP (no ES original in group)", flush=True)
                skipped += 1
                continue

            es_id = int(m.group(1))
            c.execute(
                "SELECT post_title, post_content FROM wp_posts WHERE ID=%s",
                (es_id,),
            )
            es = c.fetchone()
            if not es or not es['post_content']:
                print(f"[{n}/{total}] {post_id} — SKIP (ES:{es_id} empty)", flush=True)
                skipped += 1
                continue

            es_title = es['post_title'] or ''
            es_content = es['post_content']
            plain_len = len(strip_html(es_content))
            chunks = split_chunks(es_content)

            print(f"\n[{n}/{total}] WP:{post_id} ← ES:{es_id} — {es_title[:50]}", flush=True)
            print(f" {plain_len} chars, {len(chunks)} chunks", flush=True)

            if plain_len < 50:
                print(f" SKIP (too short)", flush=True)
                skipped += 1
                continue

            try:
                t0 = time.time()
                t_title = translate_title(es_title)
                t_content, chunk_bad = _translate_content(chunks)

                # Validate overall; an undetectable language counts as OK.
                lang_ok = detect_lang(t_content, min_len=80) in ('en', None)
                elapsed = time.time() - t0

                _save_translation(post_id, t_title, t_content)
            except Exception as e:
                print(f" ✗ ERROR: {e}", flush=True)
                errors += 1
                continue

            status = "✓" if lang_ok else "⚠"
            bad_note = f" ({chunk_bad} chunks bad)" if chunk_bad else ""
            print(f" {status} {t_title[:60]} ({elapsed:.0f}s){bad_note}", flush=True)
            done += 1
    finally:
        db.close()

    print(f"\n{'='*50}")
    print(f"Done: {done} ✓ errors: {errors} ✗ skipped: {skipped}")
    print(f"Total: {total}")


if __name__ == "__main__":
    main()