Files
feadulta/scripts/aplicar_clasificacion_a_bd.py
T

181 lines
6.8 KiB
Python

#!/usr/bin/env python3
"""
Aplica `clasificacion_articulos_regen.csv` a wp_term_relationships.
MODO CONSERVADOR (--mode=add): solo AÑADE las cats nuevas que el CSV indique
y que no estén ya. NO borra cats existentes. Maximiza seguridad — no perdemos
atribuciones legítimas que el CSV viejo o asignaciones manuales pusieran.
MODO ESTRICTO (--mode=replace): para los posts presentes en el CSV, sustituye
el conjunto de cats {1645,1646,1647,1648,1649,1650} por exactamente las que
el CSV indique. Borra las que sobren. Posts NO presentes en CSV no se tocan.
Recalcula `wp_term_taxonomy.count` al final.
Issue: rafa/feadulta#42
Uso: python3 aplicar_clasificacion_a_bd.py [--csv FILE] [--mode add|replace] [--dry-run]
"""
import argparse, csv, subprocess, sys
from collections import defaultdict
try:
import pymysql
except ImportError:
sys.exit('requiere pymysql')
# Maps the category name used in the CSV (`categoria_propuesta` column) to the
# WordPress term_id of the category it should be filed under.
# Not implemented yet: 'noticia' (1651), 'otro' (1652), 'effa' (term id unknown).
CAT_NAME_TO_TERM = {
    'lectura': 1645,
    'comentario_editorial': 1646,
    'comentario': 1647,
    'eucaristia': 1648,
    'multimedia': 1649,
    'articulo': 1650,
}

# The term_ids this script is allowed to add or remove; every other category
# attached to a post is left alone.
MANAGED_TERMS = {*CAT_NAME_TO_TERM.values()}
def get_conn():
    """Open a connection to the WordPress MySQL database running in Docker.

    The address of the ``wordpress-mysql`` container is discovered with
    ``docker inspect``. A container attached to several networks reports one
    address per network; they are emitted space-separated and the first one
    is used, instead of being glued together into an invalid host name.

    Returns:
        A ``pymysql`` connection to ``wordpress_db`` with autocommit disabled,
        so the caller decides when to commit.

    Raises:
        subprocess.CalledProcessError: if ``docker inspect`` fails (for
            example, the container does not exist).
        RuntimeError: if the container reports no IP address at all.
    """
    inspect = subprocess.run(
        ['docker', 'inspect', 'wordpress-mysql', '--format',
         '{{range .NetworkSettings.Networks}}{{.IPAddress}} {{end}}'],
        capture_output=True, text=True, check=True,
    )
    addresses = inspect.stdout.split()
    if not addresses:
        # An empty address would otherwise be handed to pymysql as host=''.
        raise RuntimeError(
            'No se pudo obtener la IP del contenedor wordpress-mysql '
            '(¿está en ejecución?)'
        )
    return pymysql.connect(
        host=addresses[0],
        user='wordpress_user',
        password='wordpress_pass',
        database='wordpress_db',
        charset='utf8mb4',
        autocommit=False,
    )
def get_term_taxonomy_ids(conn, term_ids):
    """Return a dict term_id -> term_taxonomy_id for taxonomy='category'.

    Args:
        conn: open database connection exposing ``cursor()``.
        term_ids: iterable of integer term ids to look up.

    Returns:
        Dict with one entry per term id found in ``wp_term_taxonomy``; ids
        that do not exist as a category are simply absent. An empty input
        yields an empty dict without touching the database.
    """
    ids = [int(t) for t in term_ids]
    if not ids:
        # "IN ()" is a SQL syntax error, so never build the query for it.
        return {}
    placeholders = ','.join(['%s'] * len(ids))
    with conn.cursor() as c:
        # Only the placeholder list is interpolated; values are bound by the
        # driver.
        c.execute(
            "SELECT term_id, term_taxonomy_id FROM wp_term_taxonomy "
            f"WHERE taxonomy='category' AND term_id IN ({placeholders})",
            ids,
        )
        return dict(c.fetchall())
def load_csv(path, valid_cats=None):
    """Read the classification CSV into a mapping post_id -> set(cat_name).

    Only the ``post_id`` and ``categoria_propuesta`` columns are used. Rows
    with an empty post id, or with a category that is not in ``valid_cats``,
    are skipped.

    Args:
        path: path of the CSV file. A leading UTF-8 BOM is tolerated, so the
            first header is still recognised as ``post_id``.
        valid_cats: container of accepted category names. Defaults to the
            module-level ``CAT_NAME_TO_TERM``.

    Returns:
        ``defaultdict(set)`` keyed by integer post id.

    Raises:
        ValueError: if a non-empty ``post_id`` is not an integer.
    """
    if valid_cats is None:
        valid_cats = CAT_NAME_TO_TERM
    out = defaultdict(set)
    # 'utf-8-sig' strips a BOM if present and is otherwise plain UTF-8;
    # newline='' lets the csv module handle line endings itself.
    with open(path, encoding='utf-8-sig', newline='') as f:
        for row in csv.DictReader(f):
            # Short rows yield None for missing columns.
            pid = (row.get('post_id') or '').strip()
            cat = (row.get('categoria_propuesta') or '').strip()
            if pid and cat in valid_cats:
                out[int(pid)].add(cat)
    return out
def current_cats(conn, post_ids, tt_ids):
    """Return, per post, the managed term_ids it is currently attached to.

    Args:
        conn: open database connection exposing ``cursor()``.
        post_ids: iterable of integer post ids (``object_id``) to inspect.
        tt_ids: iterable of integer term_taxonomy_ids to restrict the lookup
            to.

    Returns:
        Mapping post_id -> set(term_id). Posts with none of the given terms
        are absent. If either input is empty the result is an empty dict and
        no query is issued.
    """
    pids = [int(p) for p in post_ids]
    ttids = [int(t) for t in tt_ids]
    if not pids or not ttids:
        # Either empty list would produce "IN ()", which is invalid SQL.
        return {}
    pid_marks = ','.join(['%s'] * len(pids))
    tt_marks = ','.join(['%s'] * len(ttids))
    out = defaultdict(set)
    with conn.cursor() as c:
        # The join is needed to translate term_taxonomy_id back to term_id.
        c.execute(
            f"""
            SELECT tr.object_id, tt.term_id
            FROM wp_term_relationships tr
            JOIN wp_term_taxonomy tt ON tt.term_taxonomy_id=tr.term_taxonomy_id
            WHERE tr.object_id IN ({pid_marks}) AND tt.term_taxonomy_id IN ({tt_marks})
            """,
            pids + ttids,
        )
        for pid, tid in c.fetchall():
            out[pid].add(tid)
    return out
def _plan_changes(desired_by_pid, current_by_pid, term_to_tt, mode):
    """Compute which (object_id, term_taxonomy_id) rows to insert and delete.

    Rows are added for every desired term the post lacks. Rows are deleted
    only when ``mode == 'replace'``, for managed terms the post has but the
    CSV does not list.
    """
    to_add = []
    to_del = []
    for pid, desired_names in desired_by_pid.items():
        desired_tids = {CAT_NAME_TO_TERM[n] for n in desired_names}
        current_tids = current_by_pid.get(pid, set())
        to_add.extend((pid, term_to_tt[tid]) for tid in desired_tids - current_tids)
        if mode == 'replace':
            to_del.extend((pid, term_to_tt[tid]) for tid in current_tids - desired_tids)
    return to_add, to_del


def main():
    """Apply the classification CSV to ``wp_term_relationships``.

    Command-line options:
        --csv FILE        CSV to apply (default ``/tmp/clasif_new.csv``).
        --mode add        only add missing categories (default).
        --mode replace    also remove managed categories not listed in the CSV.
        --dry-run         print the planned changes and exit without writing.

    All writes, including the recalculation of ``wp_term_taxonomy.count`` for
    the managed terms, happen in one transaction that is rolled back on any
    error. The connection is closed on every exit path. Progress and results
    are reported on stderr.
    """
    ap = argparse.ArgumentParser()
    ap.add_argument('--csv', default='/tmp/clasif_new.csv')
    ap.add_argument('--mode', choices=['add', 'replace'], default='add')
    ap.add_argument('--dry-run', action='store_true')
    args = ap.parse_args()

    print(f'CSV: {args.csv}', file=sys.stderr)
    print(f'Mode: {args.mode}{" (DRY)" if args.dry_run else ""}', file=sys.stderr)

    desired_by_pid = load_csv(args.csv)
    print(f'Posts en CSV: {len(desired_by_pid)}', file=sys.stderr)

    conn = get_conn()
    try:
        term_to_tt = get_term_taxonomy_ids(conn, MANAGED_TERMS)
        print(f'Term taxonomy ids: {term_to_tt}', file=sys.stderr)
        missing = set(MANAGED_TERMS) - set(term_to_tt)
        if missing:
            sys.exit(f'No encuentro todos los term_ids: {missing}')

        # Current managed categories of the posts in the CSV, fetched in
        # batches to keep each IN (...) list bounded.
        pids = list(desired_by_pid)
        BATCH = 5000
        current_by_pid = {}
        for i in range(0, len(pids), BATCH):
            chunk = pids[i:i + BATCH]
            current_by_pid.update(current_cats(conn, chunk, term_to_tt.values()))

        to_add, to_del = _plan_changes(desired_by_pid, current_by_pid, term_to_tt, args.mode)
        print(f'A añadir: {len(to_add)}', file=sys.stderr)
        print(f'A quitar: {len(to_del)}', file=sys.stderr)

        if args.dry_run:
            # Show a small sample of each list and leave the database untouched.
            print('\n--- 5 ejemplos añadir ---', file=sys.stderr)
            for x in to_add[:5]:
                print(' ', x, file=sys.stderr)
            print('\n--- 5 ejemplos quitar ---', file=sys.stderr)
            for x in to_del[:5]:
                print(' ', x, file=sys.stderr)
            return

        in_ttids = ','.join(str(t) for t in term_to_tt.values())
        try:
            with conn.cursor() as c:
                # Values interpolated below are integers produced by int() or
                # read back from the database, never raw CSV text.
                for i in range(0, len(to_add), 1000):
                    chunk = to_add[i:i + 1000]
                    vals = ','.join(f'({p},{t})' for p, t in chunk)
                    # INSERT IGNORE: a row that already exists is skipped.
                    c.execute(f'INSERT IGNORE INTO wp_term_relationships (object_id, term_taxonomy_id) VALUES {vals}')
                for i in range(0, len(to_del), 1000):
                    chunk = to_del[i:i + 1000]
                    conds = ' OR '.join(f'(object_id={p} AND term_taxonomy_id={t})' for p, t in chunk)
                    c.execute(f'DELETE FROM wp_term_relationships WHERE {conds}')
                # Recalculate the stored counts of the managed terms.
                # NOTE(review): this counts every relationship row regardless
                # of post status; WordPress core counts only published posts
                # when it maintains this column. Confirm this is intended.
                c.execute(f"""
                    UPDATE wp_term_taxonomy tt
                    SET tt.count = (SELECT COUNT(*) FROM wp_term_relationships tr WHERE tr.term_taxonomy_id=tt.term_taxonomy_id)
                    WHERE tt.term_taxonomy_id IN ({in_ttids})
                """)
            conn.commit()
        except Exception:
            # Leave the database exactly as it was if any statement failed.
            conn.rollback()
            raise
        print('Commit OK.', file=sys.stderr)

        # Final per-category counts, for a quick sanity check.
        with conn.cursor() as c:
            c.execute(f"""
                SELECT t.term_id, t.slug, tt.count FROM wp_term_taxonomy tt
                JOIN wp_terms t USING(term_id)
                WHERE tt.term_taxonomy_id IN ({in_ttids}) ORDER BY t.term_id
            """)
            print('\nCats finales:', file=sys.stderr)
            for row in c.fetchall():
                print(f' {row[0]:5d} {row[1]:30s} {row[2]}', file=sys.stderr)
    finally:
        # Runs on success, on errors and on sys.exit(), so the connection
        # is never leaked.
        conn.close()
# Run the migration only when executed as a script, not when imported.
if __name__ == '__main__':
    main()