#!/usr/bin/env python3
"""
fix_titles.py

Fixes wrong/contaminated/untranslated titles for translated WordPress posts.
Translates only the title via Jan (fast, ~5s each).
"""
|
|
|
|
import pymysql
|
|
import json
|
|
import urllib.request
|
|
import sys
|
|
import time
|
|
|
|
# Jan chat-completions endpoint and the model asked to translate titles.
JAN_URL = "http://172.19.128.1:1337/v1/chat/completions"
JAN_MODEL = "gemma-3-12b-it-Q4_K_M"

# MySQL connection settings for the WordPress database.
# NOTE(review): credentials are hard-coded here; consider loading them from
# the environment before this script is shared or committed.
DB_HOST = "172.18.0.2"
DB_PORT = 3306
DB_NAME = "wordpress_db"
DB_USER = "wordpress_user"
DB_PASS = "wordpress_pass"

# Target language code -> language name used in the translation prompt.
TARGET_LANGS = {
    "en": "English",
    "fr": "French",
    "it": "Italian",
    "pt": "Portuguese",
}
|
|
|
|
# Spanish originals whose translations need a new title, one row per
# original: (spanish_id, spanish_title, {lang: translated_post_id}).
_FIX_SOURCES = [
    (42523, "LA TENTACIÓN", {"en": 43151, "fr": 43281}),
    (42524, "CUANDO NOS LEEMOS EN CLAVE DE CARENCIA", {"en": 43150, "fr": 43280}),
    (42525, "SE TRATA DE BUSCAR LO MEJOR PARA MÍ, AUNQUE ME CUESTE", {"fr": 43278}),
    (42526, "PARA SER TENTADO", {"it": 43270, "pt": 43269}),
    (42531, "LA MAYOR TENTACIÓN HUMANA", {"en": 43143, "fr": 43263, "it": 43261}),
    (42532, "MIÉRCOLES DE CENIZA", {"fr": 43256, "it": 43260}),
    (42533, "1º DOMINGO DE CUARESMA", {"en": 43141, "it": 43259, "pt": 43251}),
    (42538, "ADÁN, EVA Y JESÚS FRENTE A LA TENTACIÓN", {"en": 43137, "fr": 43240, "pt": 43236}),
    (42544, "LO PROVISIONAL Y LO DEFINITIVO", {"en": 43135, "fr": 43234, "pt": 43228}),
    (42545, "2º DOMINGO DE CUARESMA", {"en": 43134, "fr": 43232, "pt": 43226}),
    (42546, "POR LA RENUNCIA AL TRIUNFO", {"pt": 43225}),
    (42547, "LO DIVINO ES NUESTRA ESENCIA", {"en": 43132, "it": 43233}),
    (42548, "¡QUÉ BUENO ES QUE ESTEMOS AQUÍ!", {"en": 43131, "fr": 43223, "it": 43230}),
    (42549, "¿A QUÉ TRANSFIGURACIÓN NOS ESTAMOS REFIRIENDO?", {"pt": 43216}),
    (42555, "CUANDO NOS LEEMOS EN CLAVE DE PLENITUD", {"en": 43129, "fr": 43211, "it": 43221, "pt": 43212}),
    (42556, "CUARESMA: CREER EN EL EVANGELIO", {"en": 43128, "fr": 43208}),
    (42557, "LA CUARESMA COMO PEDAGOGÍA EN EL TIEMPO", {"en": 43127, "fr": 43206, "it": 43217, "pt": 43205}),
    (42558, "¡NO TENEMOS UN DIOS VENGATIVO!", {"en": 43126}),
    (42560, 'CARLOS AGUIAR: "LA SINODALIDAD HA VENIDO A LA IGLESIA PARA QUEDARSE"', {"en": 43124}),
    (42561, "¿HERENCIA CRISTIANA?", {"en": 43123, "fr": 43196, "pt": 43194}),
    (42562, 'EL PAPA ADVIERTE A LOS CURAS DE LA "PANDEMIA" DEL CLERICALISMO', {"en": 43122}),
    (42564, "MOISÉS, LA SAMARITANA Y EL BORRACHO", {"en": 43120, "pt": 43187}),
    (42565, "EL FINAL DE LA BÚSQUEDA", {"en": 43119, "pt": 43182}),
    (42568, "EN EL POZO DE LA DIGNIDAD LIBERADA", {"fr": 43174, "it": 43183}),
    (42569, "PALABRA Y EUCARISTÍA", {"en": 43115, "fr": 43171, "pt": 43172}),
    (42570, 'MABEL RUIZ: "LA TRADICIÓN HA UTILIZADO A LAS MUJERES PARA QUE SEAN SILENCIADAS"', {"fr": 43167, "pt": 43169}),
    (42571, 'LEÓN XIV, ANTE EL ATAQUE DE EEUU E ISRAEL CONTRA IRÁN: "HAY QUE DETENERLO"', {"en": 43113, "pt": 43166}),
    (42573, 'VICARIO GENERAL DE MOSCÚ: "LA GUERRA EN UCRANIA DEBE TERMINAR"', {"en": 43111, "pt": 43104}),
    (42574, "SERVIR ES UNA FORMA DE LIDERAR", {"pt": 43163}),
    (42576, 'DIARMAID MACCULLOCH, HISTORIADOR: "NO EXISTE UNA ENSEÑANZA UNIFORME SOBRE SEXUALIDAD"', {"pt": 43156}),
    (42577, "3º DOMINGO DE CUARESMA", {"pt": 43155}),
]

# All posts needing a title fix: post_id -> (lang, spanish_id, spanish_title).
# Expanded from the rows above in row order, then language order within a row.
FIXES = {
    post_id: (lang, spanish_id, spanish_title)
    for spanish_id, spanish_title, translations in _FIX_SOURCES
    for lang, post_id in translations.items()
}
|
|
|
|
# Orphaned posts to delete: they have no Polylang link to any Spanish
# original, so there is no source title to translate for them.
ORPHANS_TO_DELETE = [
    42581,
    43130,
    43235,
]
|
|
|
|
|
|
def translate_title(spanish_title, lang_code, lang_name):
    """Translate one Spanish post title into ``lang_name`` via the Jan server.

    POSTs a single-message chat-completion request to ``JAN_URL`` for
    ``JAN_MODEL`` and returns the cleaned-up reply text.

    Args:
        spanish_title: Title to translate.
        lang_code: Target language code. Not used to build the request;
            kept so the signature stays stable for callers.
        lang_name: Target language name, interpolated into the prompt.

    Returns:
        The model's reply with surrounding whitespace removed and, when the
        whole reply is wrapped in a pair of straight quotes, that pair
        removed too.

    Raises:
        OSError: If the HTTP request fails or exceeds the 30 s timeout
            (``urllib.error.URLError`` is a subclass).
        KeyError, IndexError: If the response lacks the expected
            ``choices[0].message.content`` structure.
        ValueError: If the response is not valid JSON, or the title is
            empty after clean-up.
    """
    prompt = (
        f"Translate this title from Spanish to {lang_name}. "
        f"Return ONLY the translated title in ALL CAPS, nothing else: {spanish_title}"
    )
    payload = json.dumps({
        "model": JAN_MODEL,
        "messages": [{"role": "user", "content": prompt}],
        "temperature": 0.2,
        "max_tokens": 100,
    }).encode("utf-8")
    req = urllib.request.Request(
        JAN_URL,
        data=payload,
        headers={"Content-Type": "application/json", "Authorization": "Bearer dummy"},
        method="POST",
    )
    with urllib.request.urlopen(req, timeout=30) as r:
        result = json.loads(r.read())

    title = result["choices"][0]["message"]["content"].strip()

    # Unwrap quotes only when they enclose the WHOLE reply. Stripping quote
    # characters from the ends unconditionally would eat the closing quote of
    # titles that legitimately end in a quotation, e.g.
    #   LEO XIV ...: "IT MUST BE STOPPED"
    # If the Spanish source is itself fully quoted, the quotes are content.
    for quote in ('"', "'"):
        reply_wrapped = len(title) >= 2 and title[0] == quote and title[-1] == quote
        source_wrapped = spanish_title.startswith(quote) and spanish_title.endswith(quote)
        if reply_wrapped and not source_wrapped:
            title = title[1:-1].strip()

    # Never hand back an empty title: the caller writes the result straight
    # into the post, so fail loudly and let it count the error instead.
    if not title:
        raise ValueError(f"Jan returned an empty title for {spanish_title!r}")
    return title
|
|
|
|
|
|
def get_db():
    """Open and return a new PyMySQL connection to the WordPress database.

    Host, port, credentials and schema come from the module-level ``DB_*``
    constants. The connection uses the ``utf8mb4`` charset and
    ``pymysql.cursors.DictCursor`` as its cursor class.
    """
    settings = {
        "host": DB_HOST,
        "port": DB_PORT,
        "user": DB_USER,
        "password": DB_PASS,
        "database": DB_NAME,
        "charset": "utf8mb4",
        "cursorclass": pymysql.cursors.DictCursor,
    }
    return pymysql.connect(**settings)
|
|
|
|
|
|
def main():
    """Delete the orphaned posts, then re-translate and store each title in FIXES.

    Steps:
      1. For every ID in ``ORPHANS_TO_DELETE``, run
         ``wp post delete <id> --force --allow-root`` inside the
         ``wordpress-web`` Docker container and report whether the command
         succeeded.
      2. Group ``FIXES`` by Spanish original, translate each title with
         ``translate_title`` and write it to ``wp_posts.post_title``,
         committing after every row.

    A failed translation is printed, counted and skipped; the remaining
    posts are still processed. Any other exception propagates, but the
    database connection is closed first.
    """
    import subprocess  # only the orphan clean-up needs it

    done = 0
    errors = 0

    db = get_db()
    try:
        c = db.cursor()

        # Delete orphans first
        print("Deleting orphaned posts...")
        for orphan_id in ORPHANS_TO_DELETE:
            cmd = [
                "docker", "exec", "wordpress-web",
                "wp", "post", "delete", str(orphan_id), "--force", "--allow-root",
            ]
            result = subprocess.run(cmd, capture_output=True, text=True)
            output = result.stdout.strip() or result.stderr.strip()
            # Only claim success when the command actually exited cleanly.
            if result.returncode == 0:
                print(f" Deleted {orphan_id}: {output}")
            else:
                print(f" FAILED to delete {orphan_id} (exit {result.returncode}): {output}")

        print(f"\nFixing {len(FIXES)} titles...\n")

        # Group by Spanish original so each source title is printed once,
        # followed by all of its target languages.
        by_spanish = {}
        for post_id, (lang, sp_id, sp_title) in FIXES.items():
            by_spanish.setdefault((sp_id, sp_title), []).append((post_id, lang))

        translated_cache = {}  # (sp_id, lang) -> translated_title

        for (sp_id, sp_title), targets in by_spanish.items():
            print(f"ES:{sp_id} — {sp_title[:50]}")
            for post_id, lang in targets:
                lang_name = TARGET_LANGS[lang]
                cache_key = (sp_id, lang)

                if cache_key not in translated_cache:
                    # Monotonic clock: elapsed time is immune to wall-clock jumps.
                    t0 = time.monotonic()
                    try:
                        new_title = translate_title(sp_title, lang, lang_name)
                    except Exception as e:
                        # Best-effort: one bad translation must not stop the run.
                        print(f" [{lang}] ERROR translating: {e}")
                        errors += 1
                        continue
                    elapsed = time.monotonic() - t0
                    translated_cache[cache_key] = new_title
                    print(f" [{lang}] {new_title[:60]} ({elapsed:.0f}s)")

                new_title = translated_cache[cache_key]
                # Update the post title
                c.execute("UPDATE wp_posts SET post_title=%s WHERE ID=%s", (new_title, post_id))
                db.commit()
                print(f" [{lang}] Updated {post_id}: {new_title[:60]}")
                done += 1
    finally:
        # Release the connection even when a command or query raised.
        db.close()

    print(f"\nDone: {done} fixed, {errors} errors")
|
|
|
|
|
|
# Run the fixes only when this file is executed as a script, not on import.
if __name__ == "__main__":
    main()
|