Files
feadulta/scripts/test_5articles.py
T

179 lines
6.4 KiB
Python

#!/usr/bin/env python3
"""
test_5articles.py
Translates 5 specific articles ES→EN using chunk approach.
Prints per-chunk results so we can verify quality before full batch.
"""
import pymysql, json, re, html, urllib.request, time
from langdetect import detect, LangDetectException, DetectorFactory
DetectorFactory.seed = 0
# Chat-completions endpoint and model name sent with every call_jan() request.
JAN_URL = "http://172.19.128.1:1337/v1/chat/completions"
JAN_MODEL = "gemma-3-12b-it-Q4_K_M"

# Keyword arguments unpacked into pymysql.connect() by main().
# NOTE(review): credentials are hard-coded; acceptable for a local test
# script only — confirm this file never ships with real secrets.
DB = {
    'host': '172.18.0.2',
    'port': 3306,
    'user': 'wordpress_user',
    'password': 'wordpress_pass',
    'database': 'wordpress_db',
    'charset': 'utf8mb4',
    # DictCursor so rows can be read by column name (es['post_title']).
    'cursorclass': pymysql.cursors.DictCursor,
}

# Pairs of (wp_id_EN, es_id): the post that gets overwritten with the
# translation, and the Spanish post the text is read from.
TEST_POSTS = [
    (43127, 42557),  # ~3k chars
    (43132, 42547),  # ~4k chars
    (43114, 42570),  # ~4k chars
    (43139, 42536),  # ~5k chars
    (42987, 42535),  # ~15k chars
]

# Upper bound, in characters of raw HTML, that split_chunks() packs into one
# chunk (a single segment longer than this still becomes its own chunk).
CHUNK_SIZE = 800

# Appended to the translated content unless it is already present.
AI_FOOTER = "\n<p><em>Traducido con IA</em></p>"
def strip_html(text):
    """Return *text* reduced to plain text.

    Tags are removed, HTML entities are decoded, and every run of
    whitespace is collapsed to a single space. Falsy input (``None`` or
    ``""``) yields an empty string.
    """
    if not text:
        return ''
    # Each tag becomes a space so words in adjacent elements do not fuse.
    without_tags = re.sub(r'<[^>]+>', ' ', text)
    decoded = html.unescape(without_tags)
    collapsed = re.sub(r'\s+', ' ', decoded)
    return collapsed.strip()
def detect_lang(text, min_len=40):
    """Guess the language of *text* from its first 400 plain-text characters.

    Args:
        text: HTML or plain text; tags are stripped before detection.
        min_len: Minimum number of plain-text characters required; shorter
            samples are not passed to the detector.

    Returns:
        The language code reported by langdetect's ``detect`` (e.g. ``'en'``),
        or ``None`` when the sample is too short or detection fails.
    """
    sample = strip_html(text)[:400].strip()
    if len(sample) < min_len:
        return None
    try:
        return detect(sample)
    except LangDetectException:
        # Catch only the detector's own failure. A bare ``except`` would
        # also swallow KeyboardInterrupt/SystemExit and make a long batch
        # impossible to interrupt cleanly.
        return None
def call_jan(messages, max_tokens=1200, temperature=0.2, timeout=120):
    """POST *messages* to the chat-completions endpoint and return the reply.

    Args:
        messages: List of ``{"role": ..., "content": ...}`` dicts.
        max_tokens: Value sent as ``max_tokens`` in the request body.
        temperature: Value sent as ``temperature`` in the request body.
        timeout: Socket timeout in seconds passed to ``urlopen``.

    Returns:
        The stripped ``content`` of the first choice in the JSON response.

    Errors from ``urlopen``, JSON decoding, or a response that lacks the
    expected keys are not handled here and propagate to the caller.
    """
    body = {
        "model": JAN_MODEL,
        "messages": messages,
        "temperature": temperature,
        "max_tokens": max_tokens,
    }
    request = urllib.request.Request(
        JAN_URL,
        data=json.dumps(body).encode("utf-8"),
        headers={"Content-Type": "application/json", "Authorization": "Bearer dummy"},
        method="POST",
    )
    with urllib.request.urlopen(request, timeout=timeout) as response:
        reply = json.loads(response.read())
        return reply["choices"][0]["message"]["content"].strip()
def translate_chunk(chunk, attempt=0):
    """Translate one HTML chunk from Spanish to English via call_jan().

    Args:
        chunk: HTML fragment to translate.
        attempt: Selects the system prompt; 0 uses the first prompt, 1 or
            more uses the stricter second one.

    Returns:
        The model's reply. On the first attempt only, a short chunk (under
        40 plain-text characters) that comes back unchanged is retried once
        with the stricter prompt and that second reply is returned.
    """
    system_prompts = (
        "You are a professional translator. Translate the following Spanish text to English. Preserve all HTML tags exactly. Return ONLY the translated text, no preamble.",
        "Translate from Spanish to English. Your response must be entirely in English. Preserve HTML tags. Return ONLY the translation.",
    )
    # Clamp so any attempt past the end reuses the last (strictest) prompt.
    prompt_index = min(attempt, len(system_prompts) - 1)
    translation = call_jan([
        {"role": "system", "content": system_prompts[prompt_index]},
        {"role": "user", "content": chunk},
    ])

    # Only the first attempt may trigger the "model echoed the input" retry.
    if attempt != 0:
        return translation

    source_plain = strip_html(chunk).strip().lower()
    if len(source_plain) >= 40:
        return translation

    target_plain = strip_html(translation).strip().lower()
    if source_plain != target_plain:
        return translation

    # Very short chunk came back identical: the model did not translate it.
    return translate_chunk(chunk, attempt=1)
def split_chunks(content):
    """Split HTML *content* into chunks of at most CHUNK_SIZE characters.

    Cuts are made only after a closing ``</p>``, ``</li>``, ``</h1>``-``</h6>``
    or ``</blockquote>`` tag. A single segment longer than CHUNK_SIZE is kept
    whole as its own chunk. Chunks with no visible text are dropped.
    """
    pieces = re.split(r'(</p>|</li>|</h[1-6]>|</blockquote>)', content)
    # With one capture group, re.split alternates text and delimiter and
    # always ends on text, so pad the delimiters to pair every text piece.
    bodies = pieces[0::2]
    closers = pieces[1::2] + [""]

    grouped = []
    buffer = ""
    for body, closer in zip(bodies, closers):
        segment = body + closer
        if len(buffer) + len(segment) > CHUNK_SIZE:
            # Adding this segment would overflow: flush what we have so far.
            if buffer:
                grouped.append(buffer)
            buffer = segment
        else:
            buffer += segment
    if buffer:
        grouped.append(buffer)

    return [piece for piece in grouped if strip_html(piece).strip()]
def _translate_title(es_title):
    """Translate a post title ES→EN; fall back to the Spanish title on error."""
    try:
        t0 = time.time()
        t_title = call_jan([
            {"role": "system", "content": "You are a translator. Respond ONLY with the translated text."},
            {"role": "user", "content": f"Translate from Spanish to English, ALL CAPS:\n\n{es_title}"}
        ], max_tokens=120, temperature=0.1, timeout=30)
        # Models often wrap short answers in quotes; drop them.
        t_title = t_title.strip().strip('"').strip("'")
        print(f"Title [{detect_lang(t_title) or '?'}]: {t_title[:70]} ({time.time()-t0:.0f}s)")
    except Exception as e:
        # Best-effort: a failed title must not abort the whole article.
        t_title = es_title
        print(f"Title ERROR: {e}")
    return t_title


def _translate_chunks(chunks):
    """Translate every chunk, retrying once when the output is not English.

    Returns:
        ``(translated, ok, bad)``: the list of output chunks (the original
        chunk is kept where translation raised), and how many chunks ended
        up accepted vs. suspicious/failed. Every chunk is counted exactly
        once, so ``ok + bad == len(chunks)``.
    """
    translated = []
    ok = bad = 0
    total = len(chunks)
    for n, chunk in enumerate(chunks, start=1):
        try:
            t0 = time.time()
            result = translate_chunk(chunk, attempt=0)
            lang = detect_lang(result) or '?'
            # '?' means "too short / undetectable", which is not worth a retry.
            needs_retry = lang not in ('en', '?') and len(strip_html(result)) > 40
            if not needs_retry:
                print(f" chunk {n}/{total} [{lang}] {time.time()-t0:.0f}s ✓")
                ok += 1
            else:
                result2 = translate_chunk(chunk, attempt=1)
                lang2 = detect_lang(result2) or '?'
                if lang2 in ('en', '?'):
                    result, lang = result2, lang2
                    print(f" chunk {n}/{total} [retry→{lang}] {time.time()-t0:.0f}s ✓")
                    # A chunk rescued by the retry is a success and must be
                    # counted, otherwise the reported ratio is too low.
                    ok += 1
                else:
                    # Keep the first attempt's output but flag the chunk.
                    print(f" chunk {n}/{total} [STILL {lang2}] {time.time()-t0:.0f}s ⚠ — keeping anyway")
                    bad += 1
            translated.append(result)
        except Exception as e:
            print(f" chunk {n}/{total} ERROR: {e}")
            translated.append(chunk)  # keep original
            bad += 1
    return translated, ok, bad


def main():
    """Translate each post in TEST_POSTS and overwrite its English counterpart.

    For every ``(wp_en_id, es_id)`` pair the Spanish post is read, its title
    and chunked content are translated, and the result is written to the
    ``wp_en_id`` row and committed. Progress is printed per chunk. The
    database connection is closed even if a query or translation step raises.
    """
    db = pymysql.connect(**DB)
    try:
        with db.cursor() as c:
            for wp_en_id, es_id in TEST_POSTS:
                c.execute("SELECT post_title, post_content FROM wp_posts WHERE ID=%s", (es_id,))
                es = c.fetchone()
                if not es:
                    print(f"\n[SKIP] ES:{es_id} not found")
                    continue
                es_title = es['post_title'] or ''
                es_content = es['post_content'] or ''
                chunks = split_chunks(es_content)
                plain_len = len(strip_html(es_content))
                print(f"\n{'='*60}")
                print(f"WP:{wp_en_id} ← ES:{es_id}")
                print(f"Title: {es_title[:60]}")
                print(f"Content: {plain_len} chars, {len(chunks)} chunks")
                print(f"{'='*60}")

                t_title = _translate_title(es_title)
                translated, ok, _bad = _translate_chunks(chunks)

                t_content = "\n".join(translated)
                if AI_FOOTER.strip() not in t_content:
                    t_content += AI_FOOTER

                # Save to DB; commit per post so earlier posts survive a
                # failure on a later one.
                c.execute("UPDATE wp_posts SET post_title=%s, post_content=%s WHERE ID=%s",
                          (t_title, t_content, wp_en_id))
                db.commit()
                ratio = ok / len(chunks) if chunks else 1.0
                print(f" → Saved. {ok}/{len(chunks)} chunks ok ({ratio:.0%})")
                print(f" → Check: http://localhost:8081/?p={wp_en_id}")
    finally:
        # Release the connection on every exit path, not only on success.
        db.close()

    print(f"\n{'='*60}")
    print("Done. Review the 5 posts in WP admin before running full batch.")
    print("URLs to check:")
    for wp_en_id, _ in TEST_POSTS:
        print(f" http://localhost:8081/?p={wp_en_id}")
# Run the translation test only when executed as a script, not on import.
if __name__ == "__main__":
    main()