Añadir mu-plugins y scripts de feadulta
This commit is contained in:
@@ -0,0 +1,178 @@
|
||||
#!/usr/bin/env python3
|
||||
"""
|
||||
test_5articles.py
|
||||
|
||||
Translates 5 specific articles ES→EN using chunk approach.
|
||||
Prints per-chunk results so we can verify quality before full batch.
|
||||
"""
|
||||
|
||||
import pymysql, json, re, html, urllib.request, time
|
||||
from langdetect import detect, LangDetectException, DetectorFactory
|
||||
DetectorFactory.seed = 0
|
||||
|
||||
# Chat-completions endpoint and model name sent by call_jan().
JAN_URL = "http://172.19.128.1:1337/v1/chat/completions"
JAN_MODEL = "gemma-3-12b-it-Q4_K_M"

# Keyword arguments for pymysql.connect(); DictCursor makes rows come back as
# dicts keyed by column name.
# NOTE(review): credentials are hard-coded here; consider reading them from
# the environment before this script leaves a local test setup.
DB = {
    'host': '172.18.0.2',
    'port': 3306,
    'user': 'wordpress_user',
    'password': 'wordpress_pass',
    'database': 'wordpress_db',
    'charset': 'utf8mb4',
    'cursorclass': pymysql.cursors.DictCursor,
}
|
||||
|
||||
# Each pair is (ID of the English post to overwrite, ID of the Spanish source
# post). The trailing comment notes the approximate article length.
TEST_POSTS = [
    (43127, 42557),  # ~3k chars
    (43132, 42547),  # ~4k chars
    (43114, 42570),  # ~4k chars
    (43139, 42536),  # ~5k chars
    (42987, 42535),  # ~15k chars
]

# Target size of one translation chunk, measured in characters of raw HTML.
CHUNK_SIZE = 800

# Notice ("Translated with AI") appended to translated content when absent.
AI_FOOTER = "\n<p><em>Traducido con IA</em></p>"
|
||||
|
||||
|
||||
def strip_html(text):
    """Return *text* as plain text.

    Tags are removed, HTML entities are decoded, and every run of
    whitespace is collapsed to a single space. Falsy input (``None`` or
    ``""``) yields an empty string.
    """
    if not text:
        return ''
    # Replace each tag with a space so words on either side stay separated.
    without_tags = re.sub(r'<[^>]+>', ' ', text)
    decoded = html.unescape(without_tags)
    # Squash whitespace runs and trim both ends.
    return re.sub(r'\s+', ' ', decoded).strip()
|
||||
|
||||
|
||||
def detect_lang(text, min_len=40):
    """Guess the language of *text*.

    Only the first 400 characters of the tag-stripped text are examined.

    Args:
        text: HTML or plain text to classify (may be ``None`` or empty).
        min_len: Minimum sample length; shorter samples are not classified.

    Returns:
        The code returned by ``langdetect.detect`` (callers compare it with
        ``'en'``), or ``None`` when the sample is shorter than *min_len* or
        langdetect cannot classify it.
    """
    sample = strip_html(text)[:400].strip()
    if len(sample) < min_len:
        return None
    try:
        return detect(sample)
    except LangDetectException:
        # Only langdetect's own failure is treated as "unknown"; a bare
        # ``except:`` here would also swallow KeyboardInterrupt/SystemExit.
        return None
|
||||
|
||||
|
||||
def call_jan(messages, max_tokens=1200, temperature=0.2, timeout=120):
    """POST a chat-completion request to ``JAN_URL`` and return the reply text.

    Args:
        messages: List of ``{"role": ..., "content": ...}`` dicts.
        max_tokens: Value sent as ``max_tokens`` in the request body.
        temperature: Value sent as ``temperature`` in the request body.
        timeout: Timeout in seconds passed to ``urlopen``.

    Returns:
        The stripped ``content`` of the first choice in the JSON response.

    Errors raised by ``urlopen``, by JSON decoding, or by a response that
    lacks the expected keys propagate to the caller.
    """
    body = {
        "model": JAN_MODEL,
        "messages": messages,
        "temperature": temperature,
        "max_tokens": max_tokens,
    }
    request = urllib.request.Request(
        JAN_URL,
        data=json.dumps(body).encode("utf-8"),
        headers={"Content-Type": "application/json", "Authorization": "Bearer dummy"},
        method="POST",
    )
    with urllib.request.urlopen(request, timeout=timeout) as response:
        reply = json.loads(response.read())
    return reply["choices"][0]["message"]["content"].strip()
|
||||
|
||||
|
||||
def translate_chunk(chunk, attempt=0):
    """Translate one HTML chunk from Spanish to English via ``call_jan``.

    Args:
        chunk: HTML fragment to translate.
        attempt: Index of the system prompt to use; values past the last
            prompt fall back to the last one.

    Returns:
        The model's reply. On the first attempt only, if the chunk has fewer
        than 40 characters of plain text and the reply's plain text equals
        the input's (case-insensitively), the chunk is sent once more with
        the second prompt and that reply is returned instead.
    """
    system_prompts = (
        "You are a professional translator. Translate the following Spanish text to English. Preserve all HTML tags exactly. Return ONLY the translated text, no preamble.",
        "Translate from Spanish to English. Your response must be entirely in English. Preserve HTML tags. Return ONLY the translation.",
    )
    prompt_index = min(attempt, len(system_prompts) - 1)
    conversation = [
        {"role": "system", "content": system_prompts[prompt_index]},
        {"role": "user", "content": chunk},
    ]
    translated = call_jan(conversation)

    # The "model echoed the input" retry applies only to the first attempt
    # and only to very short chunks.
    if attempt != 0:
        return translated
    source_text = strip_html(chunk).strip().lower()
    if len(source_text) >= 40:
        return translated
    if source_text != strip_html(translated).strip().lower():
        return translated
    return translate_chunk(chunk, attempt=1)
|
||||
|
||||
|
||||
def split_chunks(content):
    """Split HTML *content* into chunks of roughly ``CHUNK_SIZE`` characters.

    Content is cut only right after a closing ``</p>``, ``</li>``,
    ``</h1>``-``</h6>`` or ``</blockquote>`` tag. Consecutive segments are
    packed together while their combined length stays within ``CHUNK_SIZE``;
    a single segment longer than that becomes a chunk by itself. Chunks with
    no visible text after tag stripping are dropped from the result.
    """
    pieces = re.split(r'(</p>|</li>|</h[1-6]>|</blockquote>)', content)
    # The capture group makes re.split alternate text and closing tag, so
    # gluing each pair back together yields "text + its closing tag".
    segments = [''.join(pieces[i:i + 2]) for i in range(0, len(pieces), 2)]

    batches, pending = [], ""
    for segment in segments:
        if len(pending) + len(segment) > CHUNK_SIZE:
            if pending:
                batches.append(pending)
            pending = segment
        else:
            pending += segment
    if pending:
        batches.append(pending)

    return [batch for batch in batches if strip_html(batch).strip()]
|
||||
|
||||
|
||||
def _translate_title(es_title):
    """Translate the post title to English; return the Spanish title on error."""
    try:
        t0 = time.time()
        t_title = call_jan([
            {"role": "system", "content": "You are a translator. Respond ONLY with the translated text."},
            {"role": "user", "content": f"Translate from Spanish to English, ALL CAPS:\n\n{es_title}"}
        ], max_tokens=120, temperature=0.1, timeout=30)
        # Models sometimes wrap the answer in quotes; drop them.
        t_title = t_title.strip().strip('"').strip("'")
        print(f"Title [{detect_lang(t_title) or '?'}]: {t_title[:70]} ({time.time()-t0:.0f}s)")
    except Exception as e:
        # Best effort: keep the untranslated title rather than abort the post.
        t_title = es_title
        print(f"Title ERROR: {e}")
    return t_title


def _translate_chunks(chunks):
    """Translate every chunk, printing one status line per chunk.

    Returns ``(translated, ok)``: the output chunks in order, and how many
    of them were accepted (detected as English or too short to classify).
    """
    translated = []
    ok = 0
    for i, chunk in enumerate(chunks):
        try:
            t0 = time.time()
            result = translate_chunk(chunk, attempt=0)
            lang = detect_lang(result) or '?'

            if lang not in ('en', '?') and len(strip_html(result)) > 40:
                # Output is long enough to classify and is not English:
                # retry once with the stricter prompt.
                result2 = translate_chunk(chunk, attempt=1)
                lang2 = detect_lang(result2) or '?'
                if lang2 in ('en', '?'):
                    result, lang = result2, lang2
                    print(f" chunk {i+1}/{len(chunks)} [retry→{lang}] {time.time()-t0:.0f}s ✓")
                    # A chunk rescued by the retry counts as ok, matching
                    # the ✓ printed above.
                    ok += 1
                else:
                    # The first result is kept even though it is not English.
                    print(f" chunk {i+1}/{len(chunks)} [STILL {lang2}] {time.time()-t0:.0f}s ⚠ — keeping anyway")
            else:
                print(f" chunk {i+1}/{len(chunks)} [{lang}] {time.time()-t0:.0f}s ✓")
                ok += 1

            translated.append(result)
        except Exception as e:
            # Best effort: keep the original Spanish chunk so the post
            # stays complete.
            print(f" chunk {i+1}/{len(chunks)} ERROR: {e}")
            translated.append(chunk)
    return translated, ok


def main():
    """Translate the posts listed in ``TEST_POSTS`` and save them to the DB.

    For each ``(wp_en_id, es_id)`` pair the Spanish post's title and content
    are read from ``wp_posts``, translated, and written over the
    ``post_title`` and ``post_content`` of post ``wp_en_id``. Missing source
    posts are skipped. The post is saved even when some chunks failed; those
    chunks keep their Spanish text. ``AI_FOOTER`` is appended to the content
    unless it is already present.

    The database connection is closed even if a query or commit raises.
    """
    db = pymysql.connect(**DB)
    try:
        c = db.cursor()

        for wp_en_id, es_id in TEST_POSTS:
            c.execute("SELECT post_title, post_content FROM wp_posts WHERE ID=%s", (es_id,))
            es = c.fetchone()
            if not es:
                print(f"\n[SKIP] ES:{es_id} not found")
                continue

            es_title = es['post_title'] or ''
            es_content = es['post_content'] or ''
            chunks = split_chunks(es_content)
            plain_len = len(strip_html(es_content))

            print(f"\n{'='*60}")
            print(f"WP:{wp_en_id} ← ES:{es_id}")
            print(f"Title: {es_title[:60]}")
            print(f"Content: {plain_len} chars, {len(chunks)} chunks")
            print(f"{'='*60}")

            t_title = _translate_title(es_title)
            translated, ok = _translate_chunks(chunks)

            t_content = "\n".join(translated)
            if AI_FOOTER.strip() not in t_content:
                t_content += AI_FOOTER

            # Save to DB
            c.execute("UPDATE wp_posts SET post_title=%s, post_content=%s WHERE ID=%s",
                      (t_title, t_content, wp_en_id))
            db.commit()

            ratio = ok / len(chunks) if chunks else 1.0
            print(f" → Saved. {ok}/{len(chunks)} chunks ok ({ratio:.0%})")
            print(f" → Check: http://localhost:8081/?p={wp_en_id}")
    finally:
        db.close()

    print(f"\n{'='*60}")
    print("Done. Review the 5 posts in WP admin before running full batch.")
    print("URLs to check:")
    for wp_en_id, _ in TEST_POSTS:
        print(f" http://localhost:8081/?p={wp_en_id}")
|
||||
|
||||
|
||||
# Run the test translation only when this file is executed as a script.
if __name__ == "__main__":
    main()
|
||||
Reference in New Issue
Block a user