Files
feadulta/scripts/build_lectionary_index.py
T

127 lines
4.7 KiB
Python

#!/usr/bin/env python3
"""
build_lectionary_index.py — Download the evangelizo.ws lectionary for a date range
(a full liturgical cycle covers every reading) in es/en/fr/it/pt and build an index
keyed BY biblical REFERENCE, so readings can be matched without depending on dates.
Only references whose text was found in all five languages are written to the index.
Output: /tmp/lectionary_index.json { "BOOK|chapter|verse": {es,en,fr,it,pt: html} }
Per-day/per-language cache in /tmp/evangelizo_cache (resumable).
Usage: python3 build_lectionary_index.py 2023-01-01 2025-12-31
"""
import html
import json
import os
import re
import sys
import time
import unicodedata
import urllib.error
import urllib.request
from datetime import date, timedelta
# Directory holding one cached JSON file per (day, language) download.
CACHE = "/tmp/evangelizo_cache"
# Path of the final reference-keyed index written by main().
INDEX = "/tmp/lectionary_index.json"
# Language code used in the evangelizo URL path -> language key used in the index.
LANGS = {
    "SP": "es",
    "AM": "en",
    "FR": "fr",
    "IT": "it",
    "PT": "pt",
}
# Created at import time so fetch() can write cache files straight away.
os.makedirs(CACHE, exist_ok=True)
def norm_book(full_title):
    """Reduce a full book title to its last word, upper-cased and accent-free.

    Examples: "Libro de Jeremías" -> "JEREMIAS";
    "Carta de san Pablo a los Romanos" -> "ROMANOS".

    Only the final word survives, so titles that differ only in earlier words
    (for instance an ordinal such as "Primera"/"Segunda") normalize to the
    same value. Returns "" when the title contains no letters or digits.
    """
    # NFKD splits accented letters into base + combining mark; the ASCII
    # round-trip then drops the marks (and any other non-ASCII character).
    ascii_title = (
        unicodedata.normalize("NFKD", full_title)
        .encode("ascii", "ignore")
        .decode()
    )
    # Anything that is not A-Z, 0-9 or a space becomes a word separator.
    words = re.sub(r"[^A-Z0-9 ]", " ", ascii_title.upper()).split()
    if not words:
        return ""
    return words[-1]
def clean(raw):
    """Turn a raw reading text into a sequence of ``<p>`` paragraphs.

    HTML entities are unescaped first, then ``[[...]]`` markers are removed.
    Each non-blank line becomes ``<p>line</p>`` followed by a newline.
    ``None`` or an empty string yields "".
    """
    text = re.sub(r"\[\[[^\]]+\]\]", "", html.unescape(raw or ""))
    paragraphs = []
    for line in text.split("\n"):
        stripped = line.strip()
        if stripped:
            paragraphs.append(f"<p>{stripped}</p>\n")
    return "".join(paragraphs)
def _write_cache(path, readings):
    """Store one day's parsed readings as UTF-8 JSON and close the file."""
    with open(path, "w", encoding="utf-8") as fh:
        json.dump(readings, fh, ensure_ascii=False)


def fetch(date_s, lang_code):
    """Return the readings of one day in one language, using the disk cache.

    Args:
        date_s: ISO date string (``YYYY-MM-DD``); used in the URL and in the
            cache file name.
        lang_code: language code placed in the URL path (a key of ``LANGS``).

    Returns:
        A list of dicts with the keys ``code``, ``ref``, ``book`` and
        ``text`` (``text`` already passed through ``clean``). An empty list
        is returned when the download fails.

    Successful downloads are cached. An HTTP 404 is cached as an empty day,
    on the assumption that it means "no data for that date"; that assumption
    is not confirmed against the service. Any other failure is reported on
    stderr and NOT cached, so that a later run retries it instead of
    permanently recording a transient outage as "no readings".
    """
    fp = os.path.join(CACHE, f"{date_s}_{lang_code}.json")
    try:
        with open(fp, encoding="utf-8") as fh:
            return json.load(fh)
    except (OSError, ValueError):
        # Missing, unreadable or corrupt cache file: download it again.
        pass

    url = f"https://publication.evangelizo.ws/{lang_code}/days/{date_s}"
    attempts = 3
    last_error = None
    for attempt in range(attempts):
        try:
            req = urllib.request.Request(url, headers={"User-Agent": "fea-lect/1.0"})
            with urllib.request.urlopen(req, timeout=30) as r:
                data = json.load(r)
            out = []
            for rd in data.get("data", {}).get("readings", []) or []:
                out.append({
                    "code": rd.get("reading_code", ""),
                    "ref": (rd.get("reference_displayed") or "").strip().rstrip("."),
                    # A null title is stored as "" so consumers always get a string.
                    "book": (rd.get("book") or {}).get("full_title") or "",
                    "text": clean(rd.get("text", "")),
                })
        except urllib.error.HTTPError as exc:
            if exc.code == 404:
                _write_cache(fp, [])
                return []
            last_error = exc
        except Exception as exc:
            # Best-effort boundary: network errors, bad JSON and unexpected
            # payload shapes are all retried the same way.
            last_error = exc
        else:
            # Written outside the try so a disk error is not mistaken for a
            # download failure and retried against the server.
            _write_cache(fp, out)
            time.sleep(0.15)  # be polite to the server between requests
            return out
        if attempt < attempts - 1:
            time.sleep(1.0)
    print(
        f"  AVISO: {url} falló tras {attempts} intentos ({last_error!r}); no se cachea",
        file=sys.stderr,
        flush=True,
    )
    return []
def key_from(book_full, ref):
    """Build a ``BOOK|chapter|verse`` key from a book title and a reference.

    ``ref`` must start with "<chapter>,<verse>" (1-3 digits each, optional
    spaces around the comma); anything after that is ignored. Returns
    ``None`` when ``ref`` does not start that way.
    """
    found = re.match(r"(\d{1,3})\s*,\s*(\d{1,3})", ref)
    if found is None:
        return None
    return f"{norm_book(book_full)}|{int(found[1])}|{int(found[2])}"
def _parse_date_range(argv):
    """Return (start, end) dates from ``argv[1]``/``argv[2]`` or exit with usage."""
    usage = "Uso: python3 build_lectionary_index.py 2023-01-01 2025-12-31"
    if len(argv) < 3:
        raise SystemExit(usage)
    try:
        start = date.fromisoformat(argv[1])
        end = date.fromisoformat(argv[2])
    except ValueError as exc:
        raise SystemExit(f"{usage}\n{exc}") from exc
    if end < start:
        # An inverted range would otherwise overwrite the index with {}.
        raise SystemExit(f"{usage}\nLa fecha final es anterior a la inicial.")
    return start, end


def _build_index(code_book, code_text, langs):
    """Map each reference key to its texts, keeping only fully translated ones."""
    index = {}
    for code, key in code_book.items():
        # Several reading codes can share one key (norm_book keeps only the
        # last word of the title); the first complete one wins.
        if key in index:
            continue
        entry = {wl: code_text[wl].get(code) for wl in langs}
        # A reference is only useful when every language has a text for it.
        if all(entry.values()):
            index[key] = entry
    return index


def main():
    """Download every day in the requested range and write the index file.

    Reads the inclusive start and end dates (ISO format) from ``sys.argv``,
    fetches each day in every language of ``LANGS``, and writes to ``INDEX``
    a JSON object mapping ``BOOK|chapter|verse`` to ``{lang: html}``.
    Exits with a usage message when the arguments are missing, are not valid
    dates, or the end date precedes the start date.
    """
    d0, d1 = _parse_date_range(sys.argv)
    days = (d1 - d0).days + 1
    langs = tuple(LANGS.values())
    # Transferred feasts fall on different dates per country/language, so
    # readings cannot be matched within one day. They are matched by
    # reading_code instead (treated here as stable across languages), taking
    # each language's text from ANY day on which the code appears.
    code_text = {wl: {} for wl in langs}  # lang -> {code: text}
    code_book = {}  # code -> "BOOK|chapter|verse" (from the Spanish data)
    chapter_verse = re.compile(r"(\d{1,3})\s*,\s*(\d{1,3})")
    n = 0
    for n in range(1, days + 1):
        ds = (d0 + timedelta(days=n - 1)).isoformat()
        for lc, wl in LANGS.items():
            for rd in fetch(ds, lc):
                code = rd["code"]
                if not code:
                    continue
                if rd["text"]:
                    # First non-empty text seen for a code is kept.
                    code_text[wl].setdefault(code, rd["text"])
                if wl == "es" and code not in code_book:
                    nb = norm_book(rd["book"] or "")
                    m = chapter_verse.search(code)
                    if nb and m:
                        code_book[code] = f"{nb}|{int(m.group(1))}|{int(m.group(2))}"
        if n % 60 == 0:
            print(f" {n}/{days} días codes_es={len(code_text['es'])}", flush=True)
    index = _build_index(code_book, code_text, langs)
    with open(INDEX, "w", encoding="utf-8") as fh:
        json.dump(index, fh, ensure_ascii=False)
    print(f"FIN. {n} días. codes_es={len(code_book)} → índice {len(index)} referencias en {INDEX}")
if __name__ == "__main__":
main()