85 lines
3.1 KiB
Python
85 lines
3.1 KiB
Python
#!/usr/bin/env python3
|
|
"""
lecturas_apply.py — Matches the untranslated ES readings against the lectionary index
(build_lectionary_index.py) BY biblical REFERENCE and dumps the translations to create.

Input:  /tmp/lectionary_index.json , /tmp/lecturas_todo.json
Output: /tmp/lecturas_creadas.json (so that a wp eval can create+associate+publish)
        /tmp/lecturas_skip.json

Usage: python3 lecturas_apply.py [--limit N]
"""
|
|
import sys, re, json, unicodedata
|
|
from collections import Counter
|
|
|
|
# Book-name aliases: feadulta spelling -> token used by evangelizo
# (last token of the Spanish full_title). Keys are already in the shape
# produced by norm(): upper-case ASCII letters only.
ALIAS = {
    # alternative book names
    "HECHOS": "APOSTOLES",
    "HCH": "APOSTOLES",
    "CANTAR": "CANTARES",
    "APOC": "APOCALIPSIS",
    "AP": "APOCALIPSIS",
    "QOHELET": "ECLESIASTES",
    # liturgical abbreviations: gospels
    "MT": "MATEO",
    "MC": "MARCOS",
    "LC": "LUCAS",
    "JN": "JUAN",
    # liturgical abbreviations: letters
    "RM": "ROMANOS",
    "GA": "GALATAS",
    "EF": "EFESIOS",
    "FLP": "FILIPENSES",
    "COL": "COLOSENSES",
    "HB": "HEBREOS",
    "ST": "SANTIAGO",
    # liturgical abbreviations: Old Testament
    "IS": "ISAIAS",
    "JR": "JEREMIAS",
    "EZ": "EZEQUIEL",
    "GN": "GENESIS",
    "EX": "EXODO",
    "DT": "DEUTERONOMIO",
    "SAL": "SALMOS",
    "PR": "PROVERBIOS",
}
|
|
|
|
|
|
def norm(s):
    """Return *s* reduced to its upper-case ASCII letters.

    The text is NFKD-decomposed so accented letters split into a base letter
    plus combining marks; encoding to ASCII with ``ignore`` then drops the
    marks (and any other non-ASCII character). Everything that is not A-Z is
    removed afterwards, which also discards a leading book number, dots and
    whitespace.
    """
    decomposed = unicodedata.normalize("NFKD", s)
    ascii_only = decomposed.encode("ascii", "ignore").decode()
    shouted = ascii_only.upper()
    # letters only -> the book number, punctuation and spaces are dropped
    return re.sub(r"[^A-Z]", "", shouted)
|
|
|
|
|
|
def title_keys(title):
    """Build the index lookup keys for every reference found in *title*.

    *title* is split on ``/``; each part must contain a reference of the form
    ``<book> <chapter>, <verse>`` (the book may carry a leading digit). For
    every part a key ``"BOOK|chapter|verse"`` is produced, where BOOK is the
    normalized book name (see ``norm``) mapped through ``ALIAS`` and the
    chapter and verse are rendered as plain integers.

    Returns the list of keys in title order, or ``None`` as soon as one part
    cannot be parsed, so that a partially understood title is never matched.
    """
    reference = r"([0-9]?\s*[A-Za-zÀ-ÿ][A-Za-zÀ-ÿ.\s]+?)\s+(\d{1,3})\s*,\s*(\d{1,3})"
    collected = []
    for chunk in re.split(r"\s*/\s*", title):
        hit = re.search(reference, chunk)
        if hit is None:
            # one unparseable part -> do not match the post at all
            return None
        raw_book, chapter, verse = hit.groups()
        token = norm(raw_book)
        # fall back to the normalized name itself when there is no alias
        token = ALIAS.get(token, token)
        collected.append(f"{token}|{int(chapter)}|{int(verse)}")
    return collected if collected else None
|
|
|
|
|
|
def main():
    """Match the pending ES readings against the lectionary index and dump the results.

    Reads ``/tmp/lectionary_index.json`` (mapping of ``"BOOK|chapter|verse"``
    keys to per-language texts) and ``/tmp/lecturas_todo.json`` (list of
    entries with at least ``id`` and ``title``). An optional ``--limit N``
    command-line argument restricts processing to the first N entries; 0 or
    no flag means no limit.

    For every entry the title is turned into index keys with ``title_keys``:

    * unparseable titles are skipped with reason ``"título no parseable"``;
    * entries with any key absent from the index are skipped with reason
      ``"ref no en índice"`` plus the list of missing keys;
    * otherwise the ``en``/``fr``/``it``/``pt`` texts of all keys are
      concatenated (no separator) per language.

    Writes ``/tmp/lecturas_creadas.json`` and ``/tmp/lecturas_skip.json`` and
    prints a summary to stdout.

    Raises:
        IndexError / ValueError: if ``--limit`` is given without a valid integer.
        KeyError: if an entry lacks ``id``/``title`` or an index record lacks
            one of the four languages.
    """
    limit = 0
    if "--limit" in sys.argv:
        limit = int(sys.argv[sys.argv.index("--limit") + 1])

    # JSON is UTF-8 by definition: be explicit instead of relying on the
    # locale, and close the handles deterministically.
    with open("/tmp/lectionary_index.json", encoding="utf-8") as fh:
        idx = json.load(fh)
    with open("/tmp/lecturas_todo.json", encoding="utf-8") as fh:
        todo = json.load(fh)
    if limit:
        todo = todo[:limit]

    creadas, skip = [], []
    for t in todo:
        keys = title_keys(t["title"])
        if not keys:
            skip.append({**t, "why": "título no parseable"})
            continue
        # A single pass tells us both whether the entry matches and what is missing.
        missing = [k for k in keys if k not in idx]
        if missing:
            skip.append({**t, "why": "ref no en índice", "missing": missing})
            continue
        langs = {
            wl: "".join(idx[k][wl] for k in keys)
            for wl in ("en", "fr", "it", "pt")
        }
        creadas.append({"es_id": t["id"], "title": t["title"], "langs": langs})

    # ensure_ascii=False emits raw non-ASCII characters, so the output files
    # must be opened as UTF-8 or the dump can fail on other locales.
    with open("/tmp/lecturas_creadas.json", "w", encoding="utf-8") as fh:
        json.dump(creadas, fh, ensure_ascii=False)
    with open("/tmp/lecturas_skip.json", "w", encoding="utf-8") as fh:
        json.dump(skip, fh, ensure_ascii=False)

    print(f"CASADAS: {len(creadas)} / {len(todo)} SKIP: {len(skip)}")
    print("motivos skip:", dict(Counter(s["why"] for s in skip)))

    # Sample of the books whose references are missing (to extend alias/range).
    missing_books = Counter(
        k.split("|")[0] for s in skip for k in s.get("missing", [])
    )
    print("libros con más misses:", dict(missing_books.most_common(12)))


if __name__ == "__main__":
    main()
|