Files
feadulta/scripts/lecturas_apply.py

85 lines
3.1 KiB
Python

#!/usr/bin/env python3
"""
lecturas_apply.py — Match the untranslated Spanish (ES) readings against the lectionary
index (build_lectionary_index.py) BY biblical reference and dump the translations to create.

Input:  /tmp/lectionary_index.json, /tmp/lecturas_todo.json
Output: /tmp/lecturas_creadas.json (so that a `wp eval` can create + associate + publish)
        /tmp/lecturas_skip.json
Usage:  python3 lecturas_apply.py [--limit N]
"""
import sys, re, json, unicodedata
from collections import Counter
# Book-name aliases: normalized feadulta spelling -> token used by evangelizo
# (the last token of the Spanish full_title). Keys are compared against the
# output of norm(), so they are upper-case, accent-free and letters-only.
ALIAS = {
    # Long names that differ between the two sources.
    "HECHOS": "APOSTOLES",
    "HCH": "APOSTOLES",
    "CANTAR": "CANTARES",
    "APOC": "APOCALIPSIS",
    "AP": "APOCALIPSIS",
    "QOHELET": "ECLESIASTES",
    # Liturgical abbreviations: Gospels.
    "MT": "MATEO",
    "MC": "MARCOS",
    "LC": "LUCAS",
    "JN": "JUAN",
    # Liturgical abbreviations: letters.
    "RM": "ROMANOS",
    "GA": "GALATAS",
    "EF": "EFESIOS",
    "FLP": "FILIPENSES",
    "COL": "COLOSENSES",
    "HB": "HEBREOS",
    "ST": "SANTIAGO",
    # Liturgical abbreviations: Old Testament.
    "IS": "ISAIAS",
    "JR": "JEREMIAS",
    "EZ": "EZEQUIEL",
    "GN": "GENESIS",
    "EX": "EXODO",
    "DT": "DEUTERONOMIO",
    "SAL": "SALMOS",
    "PR": "PROVERBIOS",
}
def norm(s):
    """Reduce a book name to an upper-case, accent-free, letters-only token.

    The text is NFKD-decomposed so accented letters split into a base letter
    plus combining marks; anything that cannot be encoded as ASCII is then
    dropped, the rest is upper-cased, and every character outside A-Z is
    removed.

    Args:
        s: Raw book name, possibly with a leading number, dots or spaces.

    Returns:
        The normalized token; an empty string when no ASCII letters remain.
    """
    decomposed = unicodedata.normalize("NFKD", s)
    ascii_text = decomposed.encode("ascii", "ignore").decode()
    shouted = ascii_text.upper()
    # Letters only: this also discards the book number ("1 Corintios" -> "CORINTIOS").
    return re.sub(r"[^A-Z]", "", shouted)
def title_keys(title):
    """Turn a reading title into a list of ``BOOK|chapter|verse`` index keys.

    The title is split on ``/`` and each part must contain a reference of the
    form ``<optional digit> <book name> <chapter>, <verse>``. The book name is
    normalized with ``norm`` and mapped through ``ALIAS``; chapter and verse
    are converted to ``int``, so leading zeros do not appear in the key.

    Args:
        title: Title text holding one or more ``/``-separated references.

    Returns:
        A list with one key per part, or ``None`` as soon as any part has no
        recognizable reference (the whole title is then left unmatched).
    """
    found = []
    for chunk in re.split(r"\s*/\s*", title):
        hit = re.search(r"([0-9]?\s*[A-Za-zÀ-ÿ][A-Za-zÀ-ÿ.\s]+?)\s+(\d{1,3})\s*,\s*(\d{1,3})", chunk)
        if hit is None:
            # One unparseable part invalidates the whole title.
            return None
        raw_book, chapter, verse = hit.groups()
        token = norm(raw_book)
        token = ALIAS.get(token, token)
        found.append(f"{token}|{int(chapter)}|{int(verse)}")
    if found:
        return found
    return None
def main():
    """Match pending readings against the lectionary index and write the results.

    Reads ``/tmp/lectionary_index.json`` (reference key -> per-language text)
    and ``/tmp/lecturas_todo.json`` (list of readings with ``id`` and
    ``title``). A reading is matched only when its title parses and every one
    of its reference keys is present in the index; otherwise it is recorded
    as skipped together with the reason.

    Writes ``/tmp/lecturas_creadas.json`` (matched readings with the
    concatenated text per language) and ``/tmp/lecturas_skip.json`` (skipped
    readings), then prints a summary to stdout.

    Command line:
        --limit N   Process only the first N readings (0 means no limit).

    Raises:
        SystemExit: If ``--limit`` is given without a valid integer value.
    """
    limit = 0
    if "--limit" in sys.argv:
        try:
            limit = int(sys.argv[sys.argv.index("--limit") + 1])
        except (IndexError, ValueError):
            # Missing or non-numeric value: fail with a usage line, not a traceback.
            sys.exit("Uso: python3 lecturas_apply.py [--limit N]")

    # Explicit UTF-8 and context managers: the data holds non-ASCII text, and
    # the handles must be closed deterministically.
    with open("/tmp/lectionary_index.json", encoding="utf-8") as fh:
        idx = json.load(fh)
    with open("/tmp/lecturas_todo.json", encoding="utf-8") as fh:
        todo = json.load(fh)
    if limit:
        todo = todo[:limit]

    creadas, skip = [], []
    for t in todo:
        keys = title_keys(t["title"])
        if not keys:
            skip.append({**t, "why": "título no parseable"})
            continue
        # Compute the absent keys once; an empty list means full coverage.
        absent = [k for k in keys if k not in idx]
        if absent:
            skip.append({**t, "why": "ref no en índice", "missing": absent})
            continue
        # Concatenate the text of every reference, in title order, per language.
        # NOTE(review): assumes each index entry carries all four languages — confirm.
        langs = {
            wl: "".join(idx[k][wl] for k in keys)
            for wl in ("en", "fr", "it", "pt")
        }
        creadas.append({"es_id": t["id"], "title": t["title"], "langs": langs})

    # ensure_ascii=False emits raw non-ASCII characters, so the output files
    # must be opened as UTF-8 regardless of the platform locale.
    with open("/tmp/lecturas_creadas.json", "w", encoding="utf-8") as fh:
        json.dump(creadas, fh, ensure_ascii=False)
    with open("/tmp/lecturas_skip.json", "w", encoding="utf-8") as fh:
        json.dump(skip, fh, ensure_ascii=False)

    print(f"CASADAS: {len(creadas)} / {len(todo)} SKIP: {len(skip)}")
    print("motivos skip:", dict(Counter(s["why"] for s in skip)))
    # Sample of the books whose references are missing (to extend aliases/range).
    misses_by_book = Counter(
        k.split("|")[0] for s in skip for k in s.get("missing", [])
    )
    print("libros con más misses:", dict(misses_by_book.most_common(12)))
# Run the matcher only when this file is executed as a script, not on import.
if __name__ == "__main__":
    main()