#!/usr/bin/env python3 """ test_5articles.py Translates 5 specific articles ES→EN using chunk approach. Prints per-chunk results so we can verify quality before full batch. """ import pymysql, json, re, html, urllib.request, time from langdetect import detect, LangDetectException, DetectorFactory DetectorFactory.seed = 0 JAN_URL = "http://172.19.128.1:1337/v1/chat/completions" JAN_MODEL = "gemma-3-12b-it-Q4_K_M" DB = dict(host='172.18.0.2', port=3306, user='wordpress_user', password='wordpress_pass', database='wordpress_db', charset='utf8mb4', cursorclass=pymysql.cursors.DictCursor) # (wp_id_EN, es_id) TEST_POSTS = [ (43127, 42557), # ~3k chars (43132, 42547), # ~4k chars (43114, 42570), # ~4k chars (43139, 42536), # ~5k chars (42987, 42535), # ~15k chars ] CHUNK_SIZE = 800 AI_FOOTER = "\n
Traducido con IA
" def strip_html(text): if not text: return '' text = re.sub(r'<[^>]+>', ' ', text) text = html.unescape(text) return re.sub(r'\s+', ' ', text).strip() def detect_lang(text, min_len=40): t = strip_html(text)[:400].strip() if len(t) < min_len: return None try: return detect(t) except: return None def call_jan(messages, max_tokens=1200, temperature=0.2, timeout=120): payload = json.dumps({ "model": JAN_MODEL, "messages": messages, "temperature": temperature, "max_tokens": max_tokens, }).encode("utf-8") req = urllib.request.Request( JAN_URL, data=payload, headers={"Content-Type": "application/json", "Authorization": "Bearer dummy"}, method="POST" ) with urllib.request.urlopen(req, timeout=timeout) as r: return json.loads(r.read())["choices"][0]["message"]["content"].strip() def translate_chunk(chunk, attempt=0): prompts = [ "You are a professional translator. Translate the following Spanish text to English. Preserve all HTML tags exactly. Return ONLY the translated text, no preamble.", "Translate from Spanish to English. Your response must be entirely in English. Preserve HTML tags. Return ONLY the translation.", ] system = prompts[min(attempt, len(prompts)-1)] result = call_jan([ {"role": "system", "content": system}, {"role": "user", "content": chunk} ]) # For very short chunks, retry if result == original (model didn't translate) plain_in = strip_html(chunk).strip().lower() plain_out = strip_html(result).strip().lower() if len(plain_in) < 40 and plain_in == plain_out and attempt == 0: return translate_chunk(chunk, attempt=1) return result def split_chunks(content): parts = re.split(r'(|||)', content) chunks, current = [], "" for i in range(0, len(parts), 2): segment = parts[i] + (parts[i+1] if i+1 < len(parts) else "") if len(current) + len(segment) <= CHUNK_SIZE: current += segment else: if current: chunks.append(current) current = segment if current: chunks.append(current) return [c for c in chunks if strip_html(c).strip()] def main(): db = pymysql.connect(**DB) c = db.cursor() for wp_en_id, es_id in TEST_POSTS: c.execute("SELECT post_title, post_content FROM wp_posts WHERE ID=%s", (es_id,)) es = c.fetchone() if not es: print(f"\n[SKIP] ES:{es_id} not found"); continue es_title = es['post_title'] or '' es_content = es['post_content'] or '' chunks = split_chunks(es_content) plain_len = len(strip_html(es_content)) print(f"\n{'='*60}") print(f"WP:{wp_en_id} ← ES:{es_id}") print(f"Title: {es_title[:60]}") print(f"Content: {plain_len} chars, {len(chunks)} chunks") print(f"{'='*60}") # Translate title try: t0 = time.time() t_title = call_jan([ {"role": "system", "content": "You are a translator. Respond ONLY with the translated text."}, {"role": "user", "content": f"Translate from Spanish to English, ALL CAPS:\n\n{es_title}"} ], max_tokens=120, temperature=0.1, timeout=30) t_title = t_title.strip().strip('"').strip("'") print(f"Title [{detect_lang(t_title) or '?'}]: {t_title[:70]} ({time.time()-t0:.0f}s)") except Exception as e: t_title = es_title print(f"Title ERROR: {e}") # Translate chunks translated = [] ok = bad = 0 for i, chunk in enumerate(chunks): try: t0 = time.time() result = translate_chunk(chunk, attempt=0) lang = detect_lang(result) or '?' if lang not in ('en', None, '?') and len(strip_html(result)) > 40: # Retry result2 = translate_chunk(chunk, attempt=1) lang2 = detect_lang(result2) or '?' if lang2 == 'en' or lang2 in ('?', None): result, lang = result2, lang2 print(f" chunk {i+1}/{len(chunks)} [retry→{lang}] {time.time()-t0:.0f}s ✓") else: print(f" chunk {i+1}/{len(chunks)} [STILL {lang2}] {time.time()-t0:.0f}s ⚠ — keeping anyway") bad += 1 else: print(f" chunk {i+1}/{len(chunks)} [{lang}] {time.time()-t0:.0f}s ✓") ok += 1 translated.append(result) except Exception as e: print(f" chunk {i+1}/{len(chunks)} ERROR: {e}") translated.append(chunk) # keep original bad += 1 t_content = "\n".join(translated) if AI_FOOTER.strip() not in t_content: t_content += AI_FOOTER # Save to DB c.execute("UPDATE wp_posts SET post_title=%s, post_content=%s WHERE ID=%s", (t_title, t_content, wp_en_id)) db.commit() ratio = ok / len(chunks) if chunks else 1.0 print(f" → Saved. {ok}/{len(chunks)} chunks ok ({ratio:.0%})") print(f" → Check: http://localhost:8081/?p={wp_en_id}") db.close() print(f"\n{'='*60}") print("Done. Review the 5 posts in WP admin before running full batch.") print("URLs to check:") for wp_en_id, _ in TEST_POSTS: print(f" http://localhost:8081/?p={wp_en_id}") if __name__ == "__main__": main()