127 lines
4.7 KiB
Python
127 lines
4.7 KiB
Python
#!/usr/bin/env python3
"""
build_lectionary_index.py — Download the evangelizo.ws lectionary for a range of
dates (a full liturgical cycle covers every reading) in es/en/fr/it/pt and build
an index BY biblical REFERENCE, so readings can be matched without relying on dates.

Output: /tmp/lectionary_index.json  { "BOOK|chapter|verse": {es,en,fr,it,pt: html} }
Per-day/per-language cache in /tmp/evangelizo_cache (resumable).

Usage: python3 build_lectionary_index.py 2023-01-01 2025-12-31
"""
|
|
import sys, os, re, json, time, html, unicodedata, urllib.request
|
|
from datetime import date, timedelta
|
|
|
|
# Maps the language codes used in evangelizo.ws request URLs (and in cache file
# names) to the language tags used as keys in the generated index.
LANGS = {
    "SP": "es",
    "AM": "en",
    "FR": "fr",
    "IT": "it",
    "PT": "pt",
}

# Path of the final index written by main().
INDEX = "/tmp/lectionary_index.json"

# Directory holding one cached JSON file per day and language. It is created
# at import time so fetch() can write into it unconditionally.
CACHE = "/tmp/evangelizo_cache"
os.makedirs(CACHE, exist_ok=True)
|
|
|
|
|
|
def norm_book(full_title):
    """Reduce a book's full title to a single upper-case ASCII word.

    The title is NFKD-decomposed and stripped of everything that is not
    ASCII (which removes accents), upper-cased, and every character other
    than A-Z, 0-9 or space is treated as a word separator. The last word
    that remains is returned, e.g.::

        "Libro de Jeremías"                -> "JEREMIAS"
        "Carta de san Pablo a los Romanos" -> "ROMANOS"

    Returns "" when the title contains no usable word.

    NOTE: only the final word is kept, so titles that differ only in their
    earlier words produce the same result.
    """
    ascii_title = (
        unicodedata.normalize("NFKD", full_title)
        .encode("ascii", "ignore")
        .decode()
    )
    words = re.sub(r"[^A-Z0-9 ]", " ", ascii_title.upper()).split()
    if not words:
        return ""
    return words[-1]
|
|
|
|
|
|
def clean(raw):
    """Convert a raw reading text into a run of ``<p>`` paragraphs.

    HTML entities are unescaped, ``[[...]]`` markers are removed, and every
    non-blank line (trimmed of surrounding whitespace) is emitted as
    ``<p>line</p>`` followed by a newline. ``None`` or empty input gives "".
    """
    text = html.unescape(raw) if raw else ""
    text = re.sub(r"\[\[[^\]]+\]\]", "", text)
    chunks = []
    for line in text.split("\n"):
        line = line.strip()
        if line:
            chunks.append(f"<p>{line}</p>\n")
    return "".join(chunks)
|
|
|
|
|
|
def fetch(date_s, lang_code):
    """Return the readings of one day in one language, using the disk cache.

    Args:
        date_s: Day as it appears in the request URL and in the cache file
            name (main() passes ISO dates).
        lang_code: Language code used in the request URL (a key of LANGS).

    Returns:
        A list of dicts with the keys ``code``, ``ref``, ``book`` and
        ``text`` (``text`` has already been passed through ``clean``).
        An empty list is returned when the response carries no readings or
        when the download could not be completed.

    A readable cache file short-circuits the network entirely. Otherwise the
    URL is requested up to three times, one second apart. Successful results
    are cached. An HTTP 404 is cached as an empty day; any other failure is
    NOT cached, so a later run retries it instead of keeping an empty result
    forever.
    """
    # Function-scope import: the file itself only imports urllib.request.
    import urllib.error

    cache_path = os.path.join(CACHE, f"{date_s}_{lang_code}.json")

    def _store(readings):
        # Explicit UTF-8 because the JSON is written with ensure_ascii=False;
        # relying on the locale encoding can make this write fail on
        # non-ASCII text. The context manager guarantees the file is flushed
        # and closed.
        with open(cache_path, "w", encoding="utf-8") as fh:
            json.dump(readings, fh, ensure_ascii=False)

    try:
        with open(cache_path, encoding="utf-8") as fh:
            return json.load(fh)
    except (OSError, ValueError):
        # Missing, unreadable or corrupt cache entry: download it again.
        pass

    url = f"https://publication.evangelizo.ws/{lang_code}/days/{date_s}"
    req = urllib.request.Request(url, headers={"User-Agent": "fea-lect/1.0"})
    attempts = 3
    for attempt in range(1, attempts + 1):
        try:
            with urllib.request.urlopen(req, timeout=30) as resp:
                data = json.load(resp)
            # `or {}` / `or []` / `or ""` also cover explicit JSON nulls.
            day = data.get("data") or {}
            out = [
                {
                    "code": rd.get("reading_code") or "",
                    "ref": (rd.get("reference_displayed") or "").strip().rstrip("."),
                    "book": (rd.get("book") or {}).get("full_title") or "",
                    "text": clean(rd.get("text", "")),
                }
                for rd in day.get("readings") or []
            ]
        except Exception as exc:
            # Deliberately broad: this is a best-effort download, and any
            # network or payload problem must not abort the whole run.
            if isinstance(exc, urllib.error.HTTPError) and exc.code == 404:
                # The resource does not exist; retrying will not help.
                _store([])
                return []
            if attempt == attempts:
                print(
                    f"fetch: sin datos para {url} tras {attempts} intentos: {exc!r}",
                    file=sys.stderr,
                )
                return []
            time.sleep(1.0)
        else:
            _store(out)
            time.sleep(0.15)  # small pause between successful requests
            return out
|
|
|
|
|
|
def key_from(book_full, ref):
    """Build the ``BOOK|chapter|verse`` key for a displayed reference.

    ``ref`` must begin with ``<chapter>,<verse>`` (one to three digits each,
    optional whitespace around the comma); whatever follows is ignored. The
    book part is ``norm_book(book_full)``.

    Returns None when ``ref`` does not start with such a pair.
    """
    found = re.match(r"(\d{1,3})\s*,\s*(\d{1,3})", ref)
    if found is None:
        return None
    book = norm_book(book_full)
    chapter, verse = (int(group) for group in found.groups())
    return f"{book}|{chapter}|{verse}"
|
|
|
|
|
|
def main():
    """Download every day in the requested range and write the index file.

    Command line: ``<start> <end>``, both ISO dates, both inclusive. Exits
    with a usage message when they are missing; an unparsable date raises
    ValueError from ``date.fromisoformat``.

    For each day and each language in LANGS the readings are fetched and
    grouped by reading code. The index key of a code
    (``BOOK|chapter|verse``) is derived from its first Spanish occurrence:
    the book from the reading's book title, chapter and verse from the first
    ``<n>,<n>`` pair found in the code itself. A key is written to INDEX only
    when the code has text in every language; the first such code wins when
    several codes share a key.
    """
    if len(sys.argv) < 3:
        sys.exit(f"Uso: {sys.argv[0]} AAAA-MM-DD AAAA-MM-DD")
    d0 = date.fromisoformat(sys.argv[1])
    d1 = date.fromisoformat(sys.argv[2])
    days = (d1 - d0).days + 1

    # Transferred feasts fall on different dates depending on country/language,
    # so readings can NOT be matched within the same day. Index by reading_code
    # (stable across languages), accumulating each language's text from ANY
    # day on which the code appears.
    out_langs = list(LANGS.values())
    code_text = {wl: {} for wl in out_langs}  # lang -> {code: text}
    code_book = {}  # code -> index key (taken from the Spanish reading)
    chapter_verse = re.compile(r"(\d{1,3})\s*,\s*(\d{1,3})")

    n = 0
    cur = d0
    while cur <= d1:
        ds = cur.isoformat()
        for lc, wl in LANGS.items():
            for rd in fetch(ds, lc):
                code = rd["code"]
                if not code:
                    continue
                # Keep the first non-empty text seen for this code/language.
                if rd["text"] and code not in code_text[wl]:
                    code_text[wl][code] = rd["text"]
                if wl == "es" and code not in code_book:
                    # `or ""` tolerates a null book stored in a cache file.
                    nb = norm_book(rd["book"] or "")
                    m = chapter_verse.search(code)
                    if nb and m:
                        code_book[code] = f"{nb}|{int(m.group(1))}|{int(m.group(2))}"
        n += 1
        if n % 60 == 0:
            print(f" {n}/{days} días codes_es={len(code_text['es'])}", flush=True)
        cur += timedelta(days=1)

    # Combine: keep a code only when every language has text for it, so an
    # index entry never contains a missing (null) translation.
    index = {}
    for code, key in code_book.items():
        if key in index:
            continue
        entry = {wl: code_text[wl].get(code) for wl in out_langs}
        if all(entry.values()):
            index[key] = entry

    # Explicit UTF-8 (the JSON is not ASCII-escaped) and a context manager so
    # the file is flushed and closed before the summary is printed.
    with open(INDEX, "w", encoding="utf-8") as fh:
        json.dump(index, fh, ensure_ascii=False)
    print(f"FIN. {n} días. codes_es={len(code_book)} → índice {len(index)} referencias en {INDEX}")
|
|
|
|
|
|
# Run the build only when executed as a script, not when imported.
if __name__ == "__main__":
    main()
|