#!/usr/bin/env python3
"""
build_lectionary_index.py — Download the evangelizo.ws lectionary for a date
range (a full liturgical cycle covers every reading) in es/en/fr/it/pt and
build an index BY biblical REFERENCE, so readings can be matched without
relying on dates.

Output: /tmp/lectionary_index.json  { "BOOK|chapter|verse": {es,en,fr,it,pt: html} }
Per-day/per-language cache in /tmp/evangelizo_cache (resumable).

Usage: python3 build_lectionary_index.py 2023-01-01 2025-12-31
"""
import sys
import os
import re
import json
import time
import html
import unicodedata
import urllib.request
from datetime import date, timedelta

# API language code -> language key used in the output index.
LANGS = {"SP": "es", "AM": "en", "FR": "fr", "IT": "it", "PT": "pt"}
CACHE = "/tmp/evangelizo_cache"
os.makedirs(CACHE, exist_ok=True)
INDEX = "/tmp/lectionary_index.json"

# "<chapter>, <verse>" pair as it appears in references / reading codes.
_CHAPTER_VERSE = re.compile(r"(\d{1,3})\s*,\s*(\d{1,3})")
# Inline "[[...]]" markers embedded in the reading text.
_INLINE_MARKER = re.compile(r"\[\[[^\]]+\]\]")
# Number of download attempts per day/language before giving up.
_ATTEMPTS = 3


def norm_book(full_title):
    """Return the last word of *full_title*, ASCII-folded and upper-cased.

    Example: "Libro de Jeremías" -> "JEREMIAS",
    "Carta de san Pablo a los Romanos" -> "ROMANOS".
    Returns "" when the title contains no alphanumeric token.
    """
    folded = (
        unicodedata.normalize("NFKD", full_title)
        .encode("ascii", "ignore")
        .decode()
        .upper()
    )
    tokens = re.sub(r"[^A-Z0-9 ]", " ", folded).split()
    return tokens[-1] if tokens else ""


def clean(raw):
    """Turn raw reading text into HTML, one ``<p>`` per non-empty line.

    HTML entities are unescaped, ``[[...]]`` markers are removed and each
    remaining non-blank line is stripped and wrapped in its own paragraph.
    ``None`` or empty input yields "".
    """
    # NOTE(review): the paragraph wrapper below is reconstructed as
    # "<p>...</p>" (the module docstring says the index stores HTML); the
    # literal was garbled in the copy under review — confirm against the
    # original file.
    text = _INLINE_MARKER.sub("", html.unescape(raw or ""))
    paragraphs = (line.strip() for line in text.split("\n"))
    return "".join(f"<p>{p}</p>\n" for p in paragraphs if p)


def fetch(date_s, lang_code):
    """Return the readings for one day and language, using the disk cache.

    Args:
        date_s: ISO date string (``YYYY-MM-DD``).
        lang_code: API language code (a key of ``LANGS``).

    Returns:
        A list of dicts with keys ``code``, ``ref``, ``book`` and ``text``.
        An empty list is returned (and cached) when every download attempt
        fails, so a later run does not retry that day.
    """
    fp = os.path.join(CACHE, f"{date_s}_{lang_code}.json")
    try:
        with open(fp, encoding="utf-8") as fh:
            return json.load(fh)
    except (OSError, ValueError):
        # Missing, unreadable or corrupt cache entry: download it again.
        pass

    url = f"https://publication.evangelizo.ws/{lang_code}/days/{date_s}"
    for attempt in range(1, _ATTEMPTS + 1):
        try:
            req = urllib.request.Request(url, headers={"User-Agent": "fea-lect/1.0"})
            with urllib.request.urlopen(req, timeout=30) as resp:
                data = json.load(resp)
            out = [
                {
                    "code": rd.get("reading_code", ""),
                    "ref": (rd.get("reference_displayed") or "").strip().rstrip("."),
                    "book": (rd.get("book") or {}).get("full_title", ""),
                    "text": clean(rd.get("text", "")),
                }
                for rd in data.get("data", {}).get("readings", []) or []
            ]
            with open(fp, "w", encoding="utf-8") as fh:
                json.dump(out, fh, ensure_ascii=False)
        except Exception as exc:
            # Deliberately broad: this is a best-effort scraper, and any
            # network, decoding or payload-shape problem is handled the same
            # way (retry, then give up on this day).
            if attempt == _ATTEMPTS:
                print(f"AVISO: sin datos para {date_s}/{lang_code}: {exc}",
                      file=sys.stderr)
                with open(fp, "w", encoding="utf-8") as fh:
                    json.dump([], fh)
                return []
            time.sleep(1.0)
        else:
            time.sleep(0.15)  # be polite to the server between requests
            return out
    return []


def key_from(book_full, ref):
    """Build the ``"BOOK|chapter|verse"`` key from a book title and reference.

    *ref* must start with ``"<chapter>,<verse>"``; otherwise ``None`` is
    returned.
    """
    m = _CHAPTER_VERSE.match(ref)
    if not m:
        return None
    return f"{norm_book(book_full)}|{int(m.group(1))}|{int(m.group(2))}"


def build_index(code_book, code_text):
    """Combine per-language texts into the final reference index.

    Args:
        code_book: mapping ``reading_code -> "BOOK|chapter|verse"`` key.
        code_text: mapping ``lang -> {reading_code: html}`` for every
            language in ``LANGS``.

    Returns:
        ``{key: {lang: html}}`` containing only readings that have text in
        all languages. When several codes map to the same key, the first
        complete one (in ``code_book`` order) wins.
    """
    index = {}
    for code, key in code_book.items():
        if key in index:
            continue
        entry = {wl: code_text[wl].get(code) for wl in LANGS.values()}
        # A reading missing in any language (Spanish included) is discarded.
        if all(entry.values()):
            index[key] = entry
    return index


def main():
    """Download the date range given on the command line and write the index."""
    if len(sys.argv) != 3:
        sys.exit("Uso: python3 build_lectionary_index.py 2023-01-01 2025-12-31")
    d0 = date.fromisoformat(sys.argv[1])
    d1 = date.fromisoformat(sys.argv[2])
    if d1 < d0:
        # Avoid overwriting an existing index with an empty one.
        sys.exit("La fecha final es anterior a la inicial")
    days = (d1 - d0).days + 1

    # Transferred feasts fall on different dates per country/language, so
    # readings cannot be matched within the same day. Index by reading_code
    # (stable across languages), accumulating each language's text from ANY
    # day on which the code appears.
    code_text = {wl: {} for wl in LANGS.values()}  # lang -> {code: text}
    code_book = {}  # code -> "BOOK|chapter|verse" key (from the Spanish data)
    cur, n = d0, 0
    while cur <= d1:
        ds = cur.isoformat()
        for lc, wl in LANGS.items():
            for rd in fetch(ds, lc):
                code = rd["code"]
                if not code:
                    continue
                if rd["text"] and code not in code_text[wl]:
                    code_text[wl][code] = rd["text"]
                if wl == "es" and code not in code_book:
                    nb = norm_book(rd["book"])
                    # Chapter/verse are taken from the reading code itself.
                    m = _CHAPTER_VERSE.search(code)
                    if nb and m:
                        code_book[code] = f"{nb}|{int(m.group(1))}|{int(m.group(2))}"
        n += 1
        if n % 60 == 0:
            print(f" {n}/{days} días codes_es={len(code_text['es'])}", flush=True)
        cur += timedelta(days=1)

    index = build_index(code_book, code_text)
    with open(INDEX, "w", encoding="utf-8") as fh:
        json.dump(index, fh, ensure_ascii=False)
    print(f"FIN. {n} días. codes_es={len(code_book)} → índice {len(index)} referencias en {INDEX}")


if __name__ == "__main__":
    main()