#!/usr/bin/env python3
|
|
"""
|
|
Regenera `clasificacion_articulos.csv` recorriendo las cartas semanales y
|
|
extrayendo los links que cada una agrupa por encabezado (Artículos, Evangelio,
|
|
Eucaristía, Multimedia, EFFA). Paridad con `wp-content/mu-plugins/fea-carta-portada.php`.
|
|
|
|
Output: post_id, post_title, categoria_propuesta, seccion_original, carta_id, carta_titulo, carta_fecha
|
|
|
|
ALCANCE (vs. CSV histórico de marzo 2026):
|
|
- Cubre las 5 secciones estándar: comentario, articulo, eucaristia, multimedia, effa.
|
|
- Sub-clasifica por posición dentro de "Evangelio y comentarios al Evangelio":
|
|
· pos 1 → `lectura` (cita del evangelio)
|
|
· pos 2 → `comentario_editorial`
|
|
· pos 3+ → `comentario`
|
|
(regla del editor, confirmada contra el CSV histórico)
|
|
- NO cubre todavía:
|
|
· `lectura` dentro de eucaristía (lecturas bíblicas — el viejo las separa, aquí van todas a `eucaristia`)
|
|
· `otro` (catch-all del catch-all)
|
|
· `noticia` (subgrupo poco usado, 12 filas en el viejo)
|
|
· Encabezados de fiestas especiales ("Domingo de Resurrección", "Navidad", "Vigilia Pascual", etc.)
|
|
|
|
Para regenerar el CSV con cobertura completa habría que ampliar el mapping en
|
|
SECTION_PATTERNS y SECTION_LABELS con reglas adicionales. El CSV histórico
|
|
existente (raíz del repo) sirve como baseline para esa cobertura granular.
|
|
|
|
Uso:
|
|
python3 regenerar_clasificacion_csv.py [--out /path/clasificacion_articulos.csv] [--diff /path/csv_marzo.csv]
|
|
|
|
Issue: rafa/feadulta#42
|
|
"""
|
|
import argparse, csv, os, re, subprocess, sys
|
|
|
|
try:
|
|
import pymysql
|
|
except ImportError:
|
|
sys.exit('pymysql requerido: pip install --user pymysql')
|
|
|
|
# Section heading -> category slug (must mirror fea-carta-portada.php).
# Each entry pairs a slug with the case-insensitive regex that recognises the
# heading text inside a letter's HTML.
SECTION_PATTERNS = [
    (
        'comentario',
        re.compile(r'Evangelio\s+y\s+comentarios\s+al\s+Evangelio', re.IGNORECASE),
    ),
    (
        'articulo',
        re.compile(r'Art[ií]culos\s+seleccionados\s+para\s+la\s+semana', re.IGNORECASE),
    ),
    (
        'eucaristia',
        re.compile(r'Para\s+unas\s+eucarist[ií]as\s+m[áa]s\s+participativas', re.IGNORECASE),
    ),
    (
        'multimedia',
        re.compile(r'Material\s+multimedia', re.IGNORECASE),
    ),
    (
        'effa',
        re.compile(r'Escuela\s+EFFA', re.IGNORECASE),
    ),
]

# Human-readable section names written to the `seccion_original` column
# (verbatim from the historical CSV).
SECTION_LABELS = {
    'comentario': 'Evangelio y comentarios al Evangelio',
    'articulo': 'Artículos seleccionados para la semana',
    'eucaristia': 'Para unas eucaristías más participativas y actuales',
    'multimedia': 'Material multimedia',
    'effa': 'Escuela EFFA',
}

# Default `categoria_propuesta` for each section slug.  Every section currently
# maps to a category of the same name, so the table is derived from the slugs
# declared in SECTION_PATTERNS (same keys, same order).
CAT_PROPUESTA = {slug: slug for slug, _ in SECTION_PATTERNS}
|
|
|
|
# Positional sub-classification inside the "Evangelio y comentarios al Evangelio" section.
# The editor ALWAYS places: 1st the gospel reading, 2nd the editorial commentary,
# 3rd onward the regular commentaries.  The list is indexed by 0-based position.
SUBCAT_EVANGELIO_BY_POS = ['lectura', 'comentario_editorial']  # anything after these = 'comentario'
|
|
|
|
# Captures the target of every href="..." / href='...' attribute.
HREF_RX = re.compile(r'href=["\']([^"\']+)["\']', re.IGNORECASE)

# Captures the single path segment that follows "fea/" when nothing but an
# optional slash, query string or fragment comes after it.
WP_SLUG_RX = re.compile(r'(?:^|/)fea/([a-z0-9\-]+)/?(?:[?#]|$)', re.IGNORECASE)

# Captures the numeric id of ".../item/<id>-<alias>.html" URLs
# (presumably the legacy K2 permalinks; matched against `_fgj2wp_old_k2_id`).
K2_ITEM_RX = re.compile(r'/item/(\d+)-[^/"]+\.html', re.IGNORECASE)

# Path segments that can follow "fea/" but are never treated as post slugs.
RESERVED_SLUGS = {
    'wp-admin',
    'wp-content',
    'category',
    'tag',
    'author',
    'page',
    'en',
    'fr',
    'it',
    'pt',
}
|
|
|
|
|
|
def get_conn():
    """Open a pymysql connection to the WordPress database container.

    The host address is discovered by asking ``docker inspect`` for the IP
    address(es) of the ``wordpress-mysql`` container.

    Returns:
        A pymysql connection to ``wordpress_db`` using the ``utf8mb4`` charset
        with autocommit enabled.

    Raises:
        subprocess.CalledProcessError: if ``docker inspect`` exits non-zero.
        FileNotFoundError: if the ``docker`` executable cannot be found.
    """
    # The space after {{.IPAddress}} keeps the addresses apart when the
    # container is attached to several networks; without a separator they are
    # glued together into one unusable host string.
    inspect = subprocess.run(
        ['docker', 'inspect', 'wordpress-mysql', '--format',
         '{{range .NetworkSettings.Networks}}{{.IPAddress}} {{end}}'],
        capture_output=True, text=True, check=True,
    )
    addresses = inspect.stdout.split()
    # Use the first reported address.  When none is reported the host stays
    # empty, exactly as it did before the separator was introduced.
    host = addresses[0] if addresses else ''
    return pymysql.connect(
        host=host, user='wordpress_user', password='wordpress_pass',
        database='wordpress_db', charset='utf8mb4', autocommit=True,
    )
|
|
|
|
|
|
def mysql(query, conn=None):
    """Run *query* and return all of its rows as a list.

    Args:
        query: SQL text handed to the cursor's ``execute`` as-is.
        conn: An open connection to reuse.  When omitted, a connection is
            obtained from ``get_conn()`` and closed again before returning.

    Returns:
        ``list(cursor.fetchall())`` -- one entry per result row.
    """
    owns_connection = conn is None
    if owns_connection:
        conn = get_conn()
    try:
        with conn.cursor() as cursor:
            cursor.execute(query)
            fetched = cursor.fetchall()
            return list(fetched)
    finally:
        # Only tear down a connection this call created itself.
        if owns_connection:
            conn.close()
|
|
|
|
|
|
def fetch_cartas(conn):
    """Fetch every published weekly letter together with its content.

    A letter is a published ``post`` attached to category term 6, 21 or 22
    (6 = current, 22 = last week, 21 = the others).

    Args:
        conn: Open database connection passed through to ``mysql``.

    Returns:
        Rows of ``(ID, post_title, DATE(post_date), post_content)`` ordered
        from newest to oldest ``post_date``.
    """
    cartas_sql = """
        SELECT p.ID, p.post_title, DATE(p.post_date), p.post_content
        FROM wp_posts p
        WHERE p.post_status='publish' AND p.post_type='post'
        AND p.ID IN (
            SELECT DISTINCT tr.object_id FROM wp_term_relationships tr
            JOIN wp_term_taxonomy tt ON tt.term_taxonomy_id=tr.term_taxonomy_id
            WHERE tt.term_id IN (6, 21, 22) AND tt.taxonomy='category'
        )
        ORDER BY p.post_date DESC;
    """
    return mysql(cartas_sql, conn)
|
|
|
|
|
|
def build_lookups(conn):
    """Preload the slug -> post_id and k2_id -> post_id tables in two queries.

    Resolving each link against these in-memory dicts avoids one database
    round trip per link.

    When several published posts share a slug, the MOST RECENT one wins
    (same criterion as the fea-carta-portada.php mu-plugin after the bug
    found in #38).  For a repeated old K2 id, the highest post id wins.

    Args:
        conn: Open database connection passed through to ``mysql``.

    Returns:
        ``(slug_to_id, k2_to_id)``, both plain dicts with int post ids as
        values; ``k2_to_id`` is keyed by int.
    """
    print('Cargando lookups (slug y k2_id) ...', file=sys.stderr, flush=True)

    slug_rows = mysql("""
        SELECT p1.post_name, p1.ID
        FROM wp_posts p1
        WHERE p1.post_status='publish' AND p1.post_type='post' AND p1.post_name<>''
        ORDER BY p1.post_date DESC;
    """, conn)
    slug_to_id = {}
    for post_name, post_id in slug_rows:
        # Rows arrive newest first, so a slug already present is an older post.
        if post_name in slug_to_id:
            continue
        slug_to_id[post_name] = int(post_id)

    k2_rows = mysql("""
        SELECT meta_value, MAX(post_id) FROM wp_postmeta
        WHERE meta_key='_fgj2wp_old_k2_id' AND meta_value<>''
        GROUP BY meta_value;
    """, conn)
    k2_to_id = {}
    for old_k2_id, post_id in k2_rows:
        try:
            k2_to_id[int(old_k2_id)] = int(post_id)
        except (ValueError, TypeError):
            # Skip meta values that are not integers.
            continue

    print(f' slugs: {len(slug_to_id)} k2_ids: {len(k2_to_id)}', file=sys.stderr)
    return slug_to_id, k2_to_id
|
|
|
|
|
|
def fetch_titles(ids, conn):
    """Look up the titles of the given posts with a single query.

    Args:
        ids: Collection of post ids; each is rendered with ``str()`` into the
            SQL ``IN (...)`` list.
        conn: Open database connection passed through to ``mysql``.

    Returns:
        Dict mapping int post id to ``post_title``.  An empty *ids* yields an
        empty dict without touching the database.
    """
    if not ids:
        return {}

    ids_str = ','.join(map(str, ids))
    title_rows = mysql(f"SELECT ID, post_title FROM wp_posts WHERE ID IN ({ids_str});", conn)

    titles = {}
    for post_id, title in title_rows:
        titles[int(post_id)] = title
    return titles
|
|
|
|
|
|
def url_to_post_id(url, slug_to_id, k2_to_id):
    """Resolve a link to a post id using the preloaded lookup tables.

    The URL is first tried as a WordPress permalink (slug after ``fea/``); if
    that yields nothing it is tried as an ``/item/<id>-....html`` URL.

    Args:
        url: The href value to resolve.
        slug_to_id: Dict of post slug -> post id.
        k2_to_id: Dict of int old K2 id -> post id.

    Returns:
        The matching post id, or ``None`` when neither form resolves.
    """
    wp_match = WP_SLUG_RX.search(url)
    if wp_match is not None:
        candidate = wp_match.group(1).lower()
        # Reserved path segments are never looked up as slugs.
        if candidate in slug_to_id and candidate not in RESERVED_SLUGS:
            return slug_to_id[candidate]

    k2_match = K2_ITEM_RX.search(url)
    if k2_match is None:
        return None
    return k2_to_id.get(int(k2_match.group(1)))
|
|
|
|
|
|
def extract_sections(html_content):
    """Split a letter's HTML by section heading and collect each section's links.

    Only the first occurrence of each heading in SECTION_PATTERNS is used.  A
    section runs from the start of its heading match up to the start of the
    next heading found (or to the end of the content for the last one).

    Args:
        html_content: The letter's HTML as a string.

    Returns:
        Dict ``{section_slug: [href, ...]}`` in document order, each list
        holding the section's distinct hrefs in order of first appearance.
        The hrefs are raw URLs, not yet resolved to post ids.  Returns ``{}``
        when no heading is found.
    """
    heading_hits = ((slug, rx.search(html_content)) for slug, rx in SECTION_PATTERNS)
    starts = sorted((hit.start(), slug) for slug, hit in heading_hits if hit)
    if not starts:
        return {}

    # Each section ends where the next one begins; the last ends at EOF.
    ends = [offset for offset, _ in starts[1:]] + [len(html_content)]

    sections = {}
    for (begin, slug), end in zip(starts, ends):
        links = HREF_RX.findall(html_content[begin:end])
        # dict.fromkeys drops repeats while keeping first-seen order.
        sections[slug] = list(dict.fromkeys(links))
    return sections
|
|
|
|
|
|
def _collect_rows(cartas, slug_to_id, k2_to_id):
    """Build one CSV row (without title) per post linked from each letter section.

    Args:
        cartas: Rows of ``(carta_id, title, date, content)``.
        slug_to_id: Dict of post slug -> post id.
        k2_to_id: Dict of int old K2 id -> post id.

    Returns:
        ``(rows, n_unresolved)`` where *rows* is a list of dicts and
        *n_unresolved* counts the links that matched no post.
    """
    rows = []
    n_unresolved = 0
    for carta_id, carta_title, carta_fecha, content in cartas:
        if not content:
            continue
        carta_id = int(carta_id)

        for slug, urls in extract_sections(content).items():
            label = SECTION_LABELS[slug]
            default_cat = CAT_PROPUESTA[slug]

            # Keep resolved posts in link order.  Different URL spellings of
            # the same post (trailing slash, old vs. new permalink, ...) must
            # count once: otherwise the repeat takes the next position and
            # shifts the positional sub-classification below.
            resolved = []
            seen_ids = set()
            for url in urls:
                pid = url_to_post_id(url, slug_to_id, k2_to_id)
                if pid is None:
                    n_unresolved += 1
                elif pid not in seen_ids:
                    seen_ids.add(pid)
                    resolved.append(pid)

            for pos, pid in enumerate(resolved):
                # Positional sub-classification applies to the gospel section only.
                if slug == 'comentario' and pos < len(SUBCAT_EVANGELIO_BY_POS):
                    cat = SUBCAT_EVANGELIO_BY_POS[pos]
                else:
                    cat = default_cat
                rows.append({
                    'post_id': pid,
                    'categoria_propuesta': cat,
                    'seccion_original': label,
                    'carta_id': carta_id,
                    'carta_titulo': carta_title,
                    'carta_fecha': carta_fecha,
                })
    return rows, n_unresolved


def _write_csv(path, rows, titles):
    """Write *rows* to *path* as UTF-8 CSV, filling `post_title` from *titles*."""
    cols = ['post_id', 'post_title', 'categoria_propuesta', 'seccion_original',
            'carta_id', 'carta_titulo', 'carta_fecha']
    with open(path, 'w', newline='', encoding='utf-8') as f:
        w = csv.DictWriter(f, fieldnames=cols, quoting=csv.QUOTE_MINIMAL)
        w.writeheader()
        for row in rows:
            # Posts whose title was not found get an empty title.
            w.writerow({**row, 'post_title': titles.get(row['post_id'], '')})


def _print_diff(old_path, new_path):
    """Print how the post -> categories mapping differs between two CSV files."""
    from collections import defaultdict

    def load(path):
        # post_id (as text) -> set of categoria_propuesta values seen for it
        cats_by_post = defaultdict(set)
        with open(path, encoding='utf-8-sig') as fh:
            for row in csv.DictReader(fh):
                pid = row.get('post_id', '')
                if not pid:
                    continue
                cats_by_post[pid].add(row.get('categoria_propuesta', ''))
        return cats_by_post

    old = load(old_path)
    new = load(new_path)
    old_keys = set(old)
    new_keys = set(new)
    common = old_keys & new_keys
    same_cats = sum(1 for k in common if old[k] == new[k])
    diff_cats = len(common) - same_cats

    print('\n=== DIFF ===')
    print(f'posts en CSV viejo: {len(old_keys)}')
    print(f'posts en CSV nuevo: {len(new_keys)}')
    print(f'solo en viejo: {len(old_keys - new_keys)}')
    print(f'solo en nuevo: {len(new_keys - old_keys)}')
    print(f'común con mismas cats: {same_cats}')
    print(f'común con cats distintas: {diff_cats}')


def main():
    """Regenerate the classification CSV and optionally diff it against a reference.

    Command-line options:
        --out   Destination CSV (default ``/tmp/clasificacion_articulos_regen.csv``).
        --diff  Reference CSV to compare the freshly written file against.
    """
    ap = argparse.ArgumentParser()
    ap.add_argument('--out', default='/tmp/clasificacion_articulos_regen.csv')
    ap.add_argument('--diff', help='CSV de referencia para mostrar diff')
    args = ap.parse_args()

    conn = get_conn()
    try:
        slug_to_id, k2_to_id = build_lookups(conn)

        print('Leyendo cartas ...', file=sys.stderr, flush=True)
        cartas = fetch_cartas(conn)
        print(f' cartas: {len(cartas)}', file=sys.stderr)

        rows_out, n_unresolved = _collect_rows(cartas, slug_to_id, k2_to_id)
        print(f'Resueltos: {len(rows_out)} Sin resolver: {n_unresolved}', file=sys.stderr)

        print('Cargando títulos ...', file=sys.stderr, flush=True)
        needed_titles = {row['post_id'] for row in rows_out}
        titles = fetch_titles(list(needed_titles), conn)
    finally:
        # Release the connection even when a query or the parsing fails.
        conn.close()

    _write_csv(args.out, rows_out, titles)
    print(f'Escrito: {args.out} ({len(rows_out)} filas)')

    # Optional diff against a reference CSV.
    if args.diff:
        if os.path.exists(args.diff):
            _print_diff(args.diff, args.out)
        else:
            # Say so instead of silently ignoring a mistyped path.
            print(f'Aviso: no existe {args.diff}; se omite el diff', file=sys.stderr)
|
|
|
|
|
# Run the regeneration only when executed as a script, not when imported.
if __name__ == '__main__':
    main()
|