feadulta/scripts/lecturas_apply.py

#!/usr/bin/env python3
"""
lecturas_apply.py — Casa las lecturas ES sin traducir contra el índice del leccionario
(build_lectionary_index.py) POR REFERENCIA bíblica y vuelca las traducciones a crear.

Entrada: /tmp/lectionary_index.json , /tmp/lecturas_todo.json
Salida:  /tmp/lecturas_creadas.json (para que un wp eval cree+asocie+publique)
         /tmp/lecturas_skip.json

Uso: python3 lecturas_apply.py [--limit N]
"""
import sys, re, json, unicodedata
from collections import Counter

# Book-name aliases: feadulta spelling -> token used by evangelizo
# (the last token of the Spanish full_title).  Keys are written the way
# norm() leaves a book name: accent-free, letters only, upper case.
ALIAS = {
    # Alternative full names
    "HECHOS": "APOSTOLES",
    "CANTAR": "CANTARES",
    "QOHELET": "ECLESIASTES",
    # Short forms of those same books
    "HCH": "APOSTOLES",
    "APOC": "APOCALIPSIS",
    "AP": "APOCALIPSIS",
    # Liturgical abbreviations
    "MT": "MATEO",
    "MC": "MARCOS",
    "LC": "LUCAS",
    "JN": "JUAN",
    "RM": "ROMANOS",
    "GA": "GALATAS",
    "EF": "EFESIOS",
    "FLP": "FILIPENSES",
    "COL": "COLOSENSES",
    "HB": "HEBREOS",
    "ST": "SANTIAGO",
    "IS": "ISAIAS",
    "JR": "JEREMIAS",
    "EZ": "EZEQUIEL",
    "GN": "GENESIS",
    "EX": "EXODO",
    "DT": "DEUTERONOMIO",
    "SAL": "SALMOS",
    "PR": "PROVERBIOS",
}


def norm(s):
    """Return *s* reduced to its bare upper-case ASCII letters.

    Accents are removed by NFKD decomposition followed by dropping every
    non-ASCII code point; the result is upper-cased and everything that is
    not a letter (digits, dots, spaces, ...) is discarded.  As a consequence
    the leading number of a book ("1 Corintios") does not survive.
    """
    decomposed = unicodedata.normalize("NFKD", s)
    ascii_upper = decomposed.encode("ascii", "ignore").decode().upper()
    # The text is pure ASCII and upper-cased here, so isalpha() is true
    # exactly for the characters A-Z.
    return "".join(ch for ch in ascii_upper if ch.isalpha())


def title_keys(title):
    """Turn a reading title into a list of ``BOOK|chapter|verse`` keys.

    The title is split on ``/``; every part must contain a reference of the
    form "<book> <chapter>, <verse>".  The book name is normalized with
    ``norm`` and then mapped through ``ALIAS`` when an alias exists.

    Returns the list of keys (one per part, in title order), or ``None`` as
    soon as any part has no recognizable reference, so that a partially
    parseable title is never matched.
    """
    ref_pattern = r"([0-9]?\s*[A-Za-zÀ-ÿ][A-Za-zÀ-ÿ.\s]+?)\s+(\d{1,3})\s*,\s*(\d{1,3})"
    found = []
    for chunk in re.split(r"\s*/\s*", title):
        hit = re.search(ref_pattern, chunk)
        if hit is None:
            # One unparseable part invalidates the whole title.
            return None
        raw_book, chapter, verse = hit.groups()
        canonical = norm(raw_book)
        canonical = ALIAS.get(canonical, canonical)
        # int() strips leading zeros so "05" and "5" yield the same key.
        found.append("|".join((canonical, str(int(chapter)), str(int(verse)))))
    return found if found else None


def main():
    """Match untranslated readings against the lectionary index and dump results.

    Reads ``/tmp/lectionary_index.json`` (a mapping from reference key to a
    per-language mapping with ``en``/``fr``/``it``/``pt`` entries) and
    ``/tmp/lecturas_todo.json`` (a list of readings, each used through its
    ``"title"`` and ``"id"`` keys).

    A reading is matched only when its title parses (see ``title_keys``) and
    every resulting key is present in the index; the per-language texts of
    all its keys are then concatenated in title order.  Everything else goes
    to the skip list together with a ``"why"`` reason and, for unknown
    references, the list of ``"missing"`` keys.

    Writes ``/tmp/lecturas_creadas.json`` and ``/tmp/lecturas_skip.json`` and
    prints a short summary.  The optional command-line flag ``--limit N``
    restricts processing to the first N readings (0 means no limit); a
    missing or non-integer N exits with a usage message.
    """
    limit = 0
    if "--limit" in sys.argv:
        try:
            limit = int(sys.argv[sys.argv.index("--limit") + 1])
        except (IndexError, ValueError):
            # Flag given without a usable value: fail with the usage line
            # instead of a raw traceback.
            sys.exit("Uso: python3 lecturas_apply.py [--limit N]")

    # Explicit UTF-8 and context managers: the JSON is written with
    # ensure_ascii=False, so relying on the locale encoding could fail or
    # produce non-UTF-8 output, and the handles must be closed deterministically.
    with open("/tmp/lectionary_index.json", encoding="utf-8") as fh:
        idx = json.load(fh)
    with open("/tmp/lecturas_todo.json", encoding="utf-8") as fh:
        todo = json.load(fh)
    if limit:
        todo = todo[:limit]

    creadas, skip = [], []
    for t in todo:
        keys = title_keys(t["title"])
        if not keys:
            skip.append({**t, "why": "título no parseable"})
            continue
        missing = [k for k in keys if k not in idx]
        if missing:
            # All-or-nothing: a single unknown reference skips the reading.
            skip.append({**t, "why": "ref no en índice", "missing": missing})
            continue
        langs = {
            wl: "".join(idx[k][wl] for k in keys)
            for wl in ("en", "fr", "it", "pt")
        }
        creadas.append({"es_id": t["id"], "title": t["title"], "langs": langs})

    with open("/tmp/lecturas_creadas.json", "w", encoding="utf-8") as fh:
        json.dump(creadas, fh, ensure_ascii=False)
    with open("/tmp/lecturas_skip.json", "w", encoding="utf-8") as fh:
        json.dump(skip, fh, ensure_ascii=False)

    print(f"CASADAS: {len(creadas)} / {len(todo)}   SKIP: {len(skip)}")
    print("motivos skip:", dict(Counter(s["why"] for s in skip)))
    # Sample of the books whose references are missing most often, as a hint
    # for extending the aliases or the index range.
    missing_books = Counter(
        k.split("|")[0] for s in skip for k in s.get("missing", [])
    )
    print("libros con más misses:", dict(missing_books.most_common(12)))


# Run the matching pass only when executed as a script, not when imported.
if __name__ == "__main__":
    main()