Files
feadulta/scripts/translate_cartas.py
T

402 lines
18 KiB
Python

#!/usr/bin/env python3
"""
translate_cartas.py
Translates the Spanish articles of the 2026 weekly letters (see SPANISH_IDS) using Jan (Gemma 12B).
Creates the translated posts in the local WordPress (Docker) and links them with Polylang.
Usage:
    1. Start Jan with Gemma 12B
    2. python3 translate_cartas.py --check-api   # check the connection to Jan
    3. python3 translate_cartas.py --dry-run     # show what would be translated
    4. python3 translate_cartas.py               # translate everything
    5. python3 translate_cartas.py --lang en     # a single language
    6. python3 translate_cartas.py --id 42579    # a single article
"""
import subprocess
import json
import re
import sys
import time
import argparse
import pymysql
# ── Configuration ─────────────────────────────────────────────────────────────
# Jan chat-completions endpoint; get_jan_model() derives the /models URL from it.
JAN_URL = "http://172.19.128.1:1337/v1/chat/completions"
# Default model id; main() overwrites it with the value returned by get_jan_model().
JAN_MODEL = "gemma-3-12b-it-Q4_K_M"
# WordPress MySQL connection settings (consumed by get_db()).
DB_HOST = "172.18.0.2"
DB_PORT = 3306
DB_NAME = "wordpress_db"
DB_USER = "wordpress_user"
DB_PASS = "wordpress_pass"
# Docker container in which WP-CLI is executed (see create_wp_post()).
WP_CONTAINER = "wordpress-web"
# Target languages: language code -> English language name used in the prompts.
TARGET_LANGS = {
    "en": "English",
    "fr": "French",
    "it": "Italian",
    "pt": "Portuguese",
}
# IDs of the Spanish articles from all the 2026 weekly letters.
# NOTE: 26899 (reported as ~42k chars, too long for Jan) is still listed below;
# main() skips any article whose content exceeds MAX_CONTENT_LEN.
SPANISH_IDS = [
    # Letter 2026-03-05 (Agua Viva) — the last 2 are already translated and will be skipped automatically
    42732, 42731, 42730, 42729, 42728, 42727, 42726, 42590,
    42579, 42578, 42577, 42576, 42575, 42574, 42573, 42572, 42571,
    42570, 42569, 42568, 42567, 42566, 42565, 42564, 42563, 42562,
    42561, 42560, 42559, 42558, 42557, 42556,
    # Letter 2026-02-26 (¿Creemos en el evangelio?)
    42594, 42555, 42554, 42553, 42552, 42551, 42550, 42549, 42548, 42547,
    42546, 42545, 42544, 42543, 42542, 42541, 42540, 42539, 42538,
    42537, 42536, 42535, 42534, 42533, 42532, 42531, 42530, 42529,
    42528, 42527, 42526, 42525, 42524, 42523,
    # Letter 2026-02-19 (Seres limitados)
    42589, 42517, 42516, 42515, 42514, 42513, 42512, 42511,
    42510, 42509, 42508, 42507, 42506, 42518, 42505, 42504, 42503,
    42502, 42501,
    # Letter 2026-02-12 (Más allá de la ley)
    42588, 42500, 42499, 42498, 42497, 42496, 42495, 42490,
    42489, 42488, 42487, 42486, 42485, 42484, 42587, 42478,
    # Letter 2026-02-05 (Ser sal, ser luz)
    42477, 42476, 42475, 42474, 42473, 42472, 42471, 42470,
    42469, 42468, 42467, 42466, 42465, 42464, 42586, 42479,
    # Letter 2026-01-29 (Bienaventurados)
    42459, 42458, 42457, 42456, 42455, 42454, 42453, 42452,
    42451, 42585, 42450, 42463, 42462, 42461, 42460, 42445, 42444,
    # Letter 2026-01-22 (Nuevos caminos)
    42584, 42443, 42442, 42441, 42440, 42439, 42438, 42437,
    42436, 42431, 42430, 42429, 42428, 42427, 42426, 42425, 42424,
    # Letter 2026-01-15 (La ley del Oeste)
    26899,  # 42k chars — will be skipped because of its size
    26898, 26897, 26896, 26895, 26894, 26893, 26892,
    26714, 26713, 26712, 26711, 26710, 26717, 26887, 26716, 26886, 26715,
    # Letter 2026-01-08 (Hakuna / Avivando ilusiones)
    26885, 26884, 26883, 26882, 26881, 26880, 26875, 26708,
    26707, 26706, 26705, 26704, 26703, 26702, 26874, 26873,
    26872, 26871, 26870, 26869, 26868, 26867, 26866, 26865,
    # Letter 2026-01-01
    26864, 26863, 26862, 26861, 26860, 26859, 26858, 26857,
    26856, 26855, 26709,
]
# Maximum content size (chars) accepted for automatic translation.
MAX_CONTENT_LEN = 35000
# HTML footer appended to every translated post by create_wp_post().
AI_FOOTER = "\n<p><em>Traducido con IA</em></p>"
# ── Detect the Jan model ──────────────────────────────────────────────────────
def get_jan_model():
    """Return the id of the first model listed by Jan's ``/models`` endpoint.

    Returns ``"gemma"`` when Jan answers with an empty model list. On any
    failure while querying or reading the answer, prints a diagnostic and
    terminates the process with exit status 1.
    """
    import urllib.request

    models_url = JAN_URL.replace("/chat/completions", "/models")
    auth_headers = {"Authorization": "Bearer dummy"}
    try:
        request = urllib.request.Request(models_url, headers=auth_headers)
        with urllib.request.urlopen(request, timeout=5) as response:
            listing = json.loads(response.read())
        available = listing.get("data", [])
        if not available:
            return "gemma"
        return available[0]["id"]
    except Exception as e:
        print(f"ERROR: No se puede conectar a Jan en {JAN_URL}")
        print(f" {e}")
        print(" Asegúrate de que Jan está corriendo con Gemma 12B cargado.")
        sys.exit(1)
# ── Translation via Jan ───────────────────────────────────────────────────────
# Few-shot title pairs taken from existing human translations (Pagola); they
# are appended to the system prompt to steer the style of translated titles.
_FEW_SHOT_TITLES = {
    "en": [
        ("NO SABEMOS SABOREAR LA FE", "WE DON'T KNOW HOW TO SAVOR FAITH"),
        ("ESCUCHAR A JESÚS EN LA SOCIEDAD ACTUAL", "LISTENING TO JESUS IN TODAY'S SOCIETY"),
        ("FIELES A JESÚS EN MEDIO DE LAS TENTACIONES", "FAITHFUL TO JESUS IN TEMPTATIONS"),
    ],
    "fr": [
        ("NO SABEMOS SABOREAR LA FE", "NOUS NE SAVONS PAS APPRÉCIER LA FOI"),
        ("ESCUCHAR A JESÚS EN LA SOCIEDAD ACTUAL", "ÉCOUTER JÉSUS DANS LA SOCIÉTÉ ACTUELLE"),
        ("FIELES A JESÚS EN MEDIO DE LAS TENTACIONES", "FIDÈLES À JÉSUS AU MILIEU DES TENTATIONS"),
    ],
    "it": [
        ("NO SABEMOS SABOREAR LA FE", "NON SAPPIAMO ASSAPORARE LA FEDE"),
        ("ESCUCHAR A JESÚS EN LA SOCIEDAD ACTUAL", "ASCOLTARE GESÙ NELLA SOCIETÀ ATTUALE"),
        ("FIELES A JESÚS EN MEDIO DE LAS TENTACIONES", "FEDELI A GESÙ NELLE TENTAZIONI"),
    ],
    "pt": [
        ("NO SABEMOS SABOREAR LA FE", "NÃO SABEMOS SABOREAR A FÉ"),
        ("ESCUCHAR A JESÚS EN LA SOCIEDAD ACTUAL", "OUVIR JESUS NA SOCIEDADE ATUAL"),
        ("FIELES A JESÚS EN MEDIO DE LAS TENTACIONES", "FIÉIS A JESUS NO MEIO DAS TENTAÇÕES"),
    ],
}


def _jan_chat(messages, temperature, max_tokens, timeout):
    """POST one chat-completion request to Jan and return the stripped reply text."""
    import urllib.request

    payload = json.dumps({
        "model": JAN_MODEL,
        "messages": messages,
        "temperature": temperature,
        "max_tokens": max_tokens,
    }).encode("utf-8")
    request = urllib.request.Request(
        JAN_URL,
        data=payload,
        headers={"Content-Type": "application/json", "Authorization": "Bearer dummy"},
        method="POST",
    )
    with urllib.request.urlopen(request, timeout=timeout) as response:
        result = json.loads(response.read())
    return result["choices"][0]["message"]["content"].strip()


def _split_title_and_body(full):
    """Split a model reply into (title, body): first line vs. everything after it."""
    first_line, newline, remainder = full.partition("\n")
    remainder = remainder.strip()
    if first_line.startswith("Title:"):
        # Drop only the leading marker; a later "Title:" inside the title is kept.
        return first_line[len("Title:"):].strip(), remainder
    # No marker: treat the first line as the title. A single-line reply is used
    # as both title and content, so the short-content retry can decide.
    return first_line.strip(), (remainder if newline else full)


def translate(title, content, lang_code, lang_name):
    """Translate an article title and its HTML content from Spanish via Jan.

    Args:
        title: Spanish title.
        content: Spanish post content (HTML).
        lang_code: Target language code; selects the few-shot title examples.
        lang_name: English name of the target language, used in the prompts.

    Returns:
        Tuple ``(translated_title, translated_content)``.

    Raises:
        RuntimeError: if Jan cannot be reached, times out, or answers with a
            payload that is not a well-formed chat completion.
    """
    example_lines = "\n".join(
        f" ES: {e}\n {lang_code.upper()}: {t}"
        for e, t in _FEW_SHOT_TITLES.get(lang_code, [])
    )
    example_block = f"\n\nTitle translation examples (be exactly this literal):\n{example_lines}" if example_lines else ""
    system_prompt = f"""You are a professional translator specializing in theological and religious texts.
Translate from Spanish to {lang_name}.
Rules:
- Preserve all HTML tags exactly as they appear
- Translate the title LITERALLY — never paraphrase or summarize it
- Keep the full title including everything after colons and quoted subtitles
- Titles must be in ALL CAPS
- Maintain formal theological register
- Standard religious proper nouns: translate them (e.g. "Jesús" → "Jesus" in English)
- Other proper nouns (person names, place names): keep as-is
- Return ONLY the translation, starting with 'Title:'{example_block}"""
    try:
        full = _jan_chat(
            [
                {"role": "system", "content": system_prompt},
                {"role": "user", "content": f"Title: {title}\n\n{content}"},
            ],
            temperature=0.3, max_tokens=4096, timeout=300,
        )
        translated_title, translated_content = _split_title_and_body(full)

        # If the title came back identical to the original (untranslated),
        # retry the title alone.
        if translated_title.strip().upper() == title.strip().upper():
            translated_title = _jan_chat(
                [
                    {"role": "user", "content": f"Translate this title from Spanish to {lang_name}. Return ONLY the translated title in ALL CAPS, nothing else: {title}"}
                ],
                temperature=0.2, max_tokens=50, timeout=30,
            ).strip('"')

        # If the translated content is empty or very short while the source
        # is not, retry with a more direct prompt.
        if len(translated_content.strip()) < 50 and len(content.strip()) > 50:
            translated_content = _jan_chat(
                [
                    {"role": "system", "content": f"You are a professional translator. Translate the following text from Spanish to {lang_name}. Preserve all HTML tags. Return ONLY the translated text, no preamble."},
                    {"role": "user", "content": content},
                ],
                temperature=0.3, max_tokens=4096, timeout=300,
            )
        return translated_title, translated_content
    except OSError as e:
        # urllib.error.URLError is an OSError subclass; catching OSError also
        # covers timeouts raised while waiting for or reading the response,
        # which urllib does not wrap in URLError.
        raise RuntimeError(f"Error llamando a Jan: {e}") from e
    except (KeyError, IndexError, TypeError, AttributeError, ValueError) as e:
        # Invalid JSON or a reply without choices[0].message.content.
        raise RuntimeError(f"Respuesta inesperada de Jan: {e!r}") from e
# ── WordPress database ────────────────────────────────────────────────────────
def get_db():
    """Open and return a new pymysql connection to the WordPress database.

    The connection uses the DB_* settings, the utf8mb4 charset and
    ``DictCursor`` as its cursor class.
    """
    connection_settings = {
        "host": DB_HOST,
        "port": DB_PORT,
        "user": DB_USER,
        "password": DB_PASS,
        "database": DB_NAME,
        "charset": "utf8mb4",
        "cursorclass": pymysql.cursors.DictCursor,
    }
    return pymysql.connect(**connection_settings)
def get_article(db, wp_id):
    """Fetch the post with ID *wp_id* together with its category term ids.

    The query selects the post's title, content, author, date and slug, plus
    ``term_ids`` (a GROUP_CONCAT of the term ids reached through the
    'category' taxonomy join).

    Returns whatever ``cursor.fetchone()`` yields: the row, or None when no
    post matches.
    """
    article_query = (
        "\n"
        "SELECT p.ID, p.post_title, p.post_content, p.post_author,\n"
        "p.post_date, p.post_name,\n"
        "GROUP_CONCAT(t.term_id) as term_ids\n"
        "FROM wp_posts p\n"
        "LEFT JOIN wp_term_relationships tr ON p.ID=tr.object_id\n"
        "LEFT JOIN wp_term_taxonomy tt ON tr.term_taxonomy_id=tt.term_taxonomy_id\n"
        "AND tt.taxonomy='category'\n"
        "LEFT JOIN wp_terms t ON tt.term_id=t.term_id\n"
        "WHERE p.ID=%s\n"
        "GROUP BY p.ID\n"
    )
    with db.cursor() as cursor:
        cursor.execute(article_query, (wp_id,))
        row = cursor.fetchone()
    return row
def get_existing_translation(db, original_id, lang_code):
    """Return the WP ID of the translation if it already exists.

    Looks up every other post that shares a 'post_translations' term with
    *original_id* (where Polylang stores translation groups), then returns the
    first of them whose 'language' term slug equals *lang_code*. Returns None
    when there is no such post.
    """
    siblings_query = (
        "\n"
        "SELECT tr2.object_id as translated_id\n"
        "FROM wp_term_relationships tr1\n"
        "JOIN wp_term_relationships tr2 ON tr1.term_taxonomy_id=tr2.term_taxonomy_id\n"
        "JOIN wp_term_taxonomy tt1 ON tr1.term_taxonomy_id=tt1.term_taxonomy_id\n"
        "WHERE tt1.taxonomy='post_translations'\n"
        "AND tr1.object_id=%s AND tr2.object_id!=%s\n"
    )
    language_query = (
        "\n"
        "SELECT t.slug FROM wp_terms t\n"
        "JOIN wp_term_taxonomy tt ON t.term_id=tt.term_id\n"
        "JOIN wp_term_relationships tr ON tt.term_taxonomy_id=tr.term_taxonomy_id\n"
        "WHERE tt.taxonomy='language' AND tr.object_id=%s\n"
    )
    with db.cursor() as cursor:
        cursor.execute(siblings_query, (original_id, original_id))
        sibling_ids = [found['translated_id'] for found in cursor.fetchall()]
        for sibling_id in sibling_ids:
            cursor.execute(language_query, (sibling_id,))
            language_row = cursor.fetchone()
            if not language_row:
                continue
            if language_row['slug'] == lang_code:
                return sibling_id
    return None
# ── Create the post through WP-CLI inside Docker ─────────────────────────────
def _php_single_quoted(value):
    """Render *value* as a PHP single-quoted string literal.

    Inside PHP single quotes only ``\\`` and ``\'`` are escape sequences and
    nothing is interpolated, so ``$`` or ``{`` in translated text stays
    literal (a double-quoted/JSON literal would have PHP expand ``$name``).
    """
    text = str(value)
    return "'" + text.replace("\\", "\\\\").replace("'", "\\'") + "'"


def create_wp_post(article, translated_title, translated_content, lang_code, original_id, dry_run=False):
    """Create the translated post with ``wp eval`` and link it in Polylang.

    Args:
        article: Row of the original post; ``post_author`` and ``post_date``
            are read from it.
        translated_title: Title of the new post.
        translated_content: HTML content; AI_FOOTER is appended to it.
        lang_code: Polylang language code assigned to the new post.
        original_id: WP ID of the Spanish original, registered as the 'es'
            translation; its categories are copied to the new post.
        dry_run: When true, only print what would be created and return 0.

    Returns:
        The new post's WP ID (0 in dry-run mode).

    Raises:
        RuntimeError: if WP-CLI does not report a successfully created post.
    """
    if dry_run:
        print(f" [DRY] Crearía post '{translated_title[:60]}' en {lang_code}")
        return 0

    post_date = article['post_date']
    if hasattr(post_date, 'strftime'):
        post_date = post_date.strftime('%Y-%m-%d %H:%M:%S')

    title_php = _php_single_quoted(translated_title)
    content_php = _php_single_quoted(translated_content + AI_FOOTER)
    date_php = _php_single_quoted(post_date)
    lang_php = _php_single_quoted(lang_code)
    author_id = int(article['post_author'])
    source_id = int(original_id)

    # The second argument (true) makes wp_insert_post return a WP_Error on
    # failure; without it the function returns 0 and is_wp_error() never fires.
    php = f"""
global $wpdb;
$post_id = wp_insert_post([
    'post_title' => {title_php},
    'post_content' => {content_php},
    'post_author' => {author_id},
    'post_status' => 'publish',
    'post_type' => 'post',
    'post_date' => {date_php},
], true);
if (is_wp_error($post_id)) {{ echo 'ERROR: ' . $post_id->get_error_message(); exit; }}
// Asignar idioma Polylang
if (function_exists('pll_set_post_language')) {{
    pll_set_post_language($post_id, {lang_php});
}}
// Vincular traducciones
if (function_exists('pll_save_post_translations')) {{
    $translations = pll_get_post_translations({source_id});
    $translations[{lang_php}] = $post_id;
    $translations['es'] = {source_id};
    pll_save_post_translations($translations);
}}
// Copiar categorías del original (excepto las de idioma)
$cats = wp_get_post_categories({source_id}, ['fields' => 'ids']);
if (!empty($cats)) wp_set_post_categories($post_id, $cats);
echo 'CREATED:' . $post_id;
"""
    cmd = ["docker", "exec", WP_CONTAINER, "wp", "eval", php, "--allow-root"]
    result = subprocess.run(cmd, capture_output=True, text=True)
    # Match only the digits after the marker so trailing output (notices,
    # warnings) cannot break the parse; an id of 0 is treated as a failure.
    created = re.search(r"CREATED:(\d+)", result.stdout)
    if created and int(created.group(1)) > 0:
        return int(created.group(1))
    raise RuntimeError(f"Error creando post: {result.stdout} {result.stderr}")
# ── Main ──────────────────────────────────────────────────────────────────────
def main():
    """Command-line entry point: translate the selected articles and create the posts.

    Detects the Jan model, then for every selected article and target language
    skips translations that already exist, translates the rest and creates the
    WordPress post. With ``--check-api`` it only runs a test translation; with
    ``--dry-run`` it only reports what would be translated. A summary with the
    created / skipped / error counts is printed at the end.
    """
    global JAN_MODEL

    parser = argparse.ArgumentParser()
    parser.add_argument("--check-api", action="store_true", help="Verificar conexión a Jan")
    parser.add_argument("--dry-run", action="store_true", help="Simular sin crear posts")
    # `choices` turns an unknown language into a usage error instead of a KeyError.
    parser.add_argument("--lang", choices=list(TARGET_LANGS),
                        help="Solo traducir a este idioma (en/fr/it/pt)")
    parser.add_argument("--id", type=int, help="Solo traducir este WP ID")
    args = parser.parse_args()

    JAN_MODEL = get_jan_model()
    print(f"Jan API OK — modelo: {JAN_MODEL}")

    if args.check_api:
        print("Probando traducción...")
        t, c = translate("Prueba", "<p>Hola mundo</p>", "en", "English")
        print(f" Título: {t}")
        print(f" Contenido: {c}")
        return

    langs = {args.lang: TARGET_LANGS[args.lang]} if args.lang else TARGET_LANGS
    ids = [args.id] if args.id is not None else SPANISH_IDS
    total = len(ids) * len(langs)
    done = 0
    skipped = 0
    errors = 0

    db = get_db()
    try:
        print(f"\nArtículos: {len(ids)} | Idiomas: {list(langs.keys())} | Total: {total} traducciones\n")
        for wp_id in ids:
            article = get_article(db, wp_id)
            if not article:
                print(f" ⚠ ID {wp_id} no encontrado, saltando")
                continue
            title = article['post_title']
            content = article['post_content']
            print(f"\n[{wp_id}] {title[:70]}")
            if len(content) > MAX_CONTENT_LEN:
                print(f" ⚠ Contenido demasiado largo ({len(content)} chars), saltando")
                skipped += 1
                continue
            for lang_code, lang_name in langs.items():
                existing = get_existing_translation(db, wp_id, lang_code)
                if existing:
                    print(f" → {lang_code.upper()}: ya existe (ID {existing}), saltando")
                    skipped += 1
                    continue
                if args.dry_run:
                    print(f" → {lang_code.upper()}: [DRY] se traduciría y crearía post")
                    done += 1
                    continue
                try:
                    print(f" → {lang_code.upper()}: traduciendo... ", end="", flush=True)
                    t0 = time.time()
                    trans_title, trans_content = translate(title, content, lang_code, lang_name)
                    elapsed = time.time() - t0
                    print(f"{elapsed:.0f}s")
                    print(f" Título: {trans_title[:60]}")
                    new_id = create_wp_post(article, trans_title, trans_content, lang_code, wp_id, False)
                    print(f" Post creado: ID {new_id}")
                    done += 1
                except Exception as e:
                    # Top-level boundary: report the failure and keep going
                    # with the remaining translations.
                    print(f" ERROR: {e}")
                    errors += 1
                # NOTE(review): the original indentation was lost; this pause is
                # assumed to follow every attempted translation — confirm.
                time.sleep(2)
    finally:
        # Close the connection even when a database lookup raises.
        db.close()

    print(f"\n{'='*50}")
    print(f"Completado: {done} creados, {skipped} saltados, {errors} errores")
    if errors:
        print("Puedes volver a ejecutar — los ya creados se saltarán automáticamente.")


if __name__ == "__main__":
    main()