#!/usr/bin/env python3
"""
Apply `clasificacion_articulos_regen.csv` to wp_term_relationships.

CONSERVATIVE MODE (--mode=add): only ADDS the categories that the CSV lists
and that the post does not already have. It never deletes existing
categories, so legitimate attributions coming from the old CSV or from manual
assignments are not lost.

STRICT MODE (--mode=replace): for the posts present in the CSV, replaces the
set of managed categories {1645,1646,1647,1648,1649,1650} with exactly the
ones the CSV lists, deleting the surplus ones. Posts NOT present in the CSV
are left untouched.

Recomputes `wp_term_taxonomy.count` at the end.

Issue: rafa/feadulta#42

Usage:
    python3 aplicar_clasificacion_a_bd.py [--csv FILE] [--mode add|replace] [--dry-run]
"""
import argparse
import csv
import subprocess
import sys
from collections import defaultdict

try:
    import pymysql
except ImportError:
    # Do not abort at import time: the pure helpers (e.g. ``load_csv``) stay
    # importable without the driver. ``main`` and ``get_conn`` still exit
    # with the same message before doing any work.
    pymysql = None

# CSV category name -> WordPress term_id of the managed categories.
CAT_NAME_TO_TERM = {
    'lectura': 1645,
    'comentario_editorial': 1646,
    'comentario': 1647,
    'eucaristia': 1648,
    'multimedia': 1649,
    'articulo': 1650,
    # 'noticia': 1651,  # not implemented
    # 'otro': 1652,     # not implemented
    # 'effa': ?,        # not implemented
}
MANAGED_TERMS = set(CAT_NAME_TO_TERM.values())

# Number of post ids per SELECT when reading the current categories.
READ_BATCH = 5000
# Number of (object_id, term_taxonomy_id) pairs per INSERT / DELETE statement.
WRITE_BATCH = 1000


def _require_pymysql():
    """Exit with the script's usual message when the PyMySQL driver is missing."""
    if pymysql is None:
        sys.exit('requiere pymysql')


def _int_list(values):
    """Render *values* as a comma-separated list of integers for an SQL ``IN``.

    Every value goes through ``int()`` so that only numeric literals can ever
    be interpolated into the SQL text.
    """
    return ','.join(str(int(v)) for v in values)


def _chunks(seq, size):
    """Yield consecutive slices of *seq* holding at most *size* items."""
    for start in range(0, len(seq), size):
        yield seq[start:start + size]


def get_conn():
    """Open a connection to the MySQL server of the `wordpress-mysql` container.

    The container IP is resolved with ``docker inspect``. The connection is
    created with ``autocommit=False``, so nothing is persisted until the
    caller commits.

    Raises:
        subprocess.CalledProcessError: if ``docker inspect`` fails.
        SystemExit: if PyMySQL is not installed or no container IP is found.
    """
    _require_pymysql()
    inspect = subprocess.run(
        ['docker', 'inspect', 'wordpress-mysql', '--format',
         # The trailing space separates the addresses when the container is
         # attached to several networks; without it they are concatenated
         # into a single unusable string.
         '{{range .NetworkSettings.Networks}}{{.IPAddress}} {{end}}'],
        capture_output=True, text=True, check=True)
    ips = inspect.stdout.split()
    if not ips:
        sys.exit('No encuentro la IP del contenedor wordpress-mysql')
    return pymysql.connect(host=ips[0], user='wordpress_user',
                           password='wordpress_pass', database='wordpress_db',
                           charset='utf8mb4', autocommit=False)


def get_term_taxonomy_ids(conn, term_ids):
    """Return a dict term_id -> term_taxonomy_id for taxonomy='category'.

    An empty *term_ids* yields ``{}`` without querying (``IN ()`` is not
    valid SQL).
    """
    term_ids = list(term_ids)
    if not term_ids:
        return {}
    with conn.cursor() as c:
        c.execute(f"""
            SELECT term_id, term_taxonomy_id
            FROM wp_term_taxonomy
            WHERE taxonomy='category' AND term_id IN ({_int_list(term_ids)})
        """)
        return dict(c.fetchall())


def load_csv(path):
    """Return a dict post_id -> set(category name) read from the CSV at *path*.

    Reads the `post_id` and `categoria_propuesta` columns. Surrounding
    whitespace is ignored in both. Rows with an empty `post_id` or with a
    category that is not a key of ``CAT_NAME_TO_TERM`` are skipped.

    Raises:
        ValueError: if a non-empty `post_id` is not an integer.
    """
    out = defaultdict(set)
    # 'utf-8-sig' reads plain UTF-8 too, and drops a leading BOM that would
    # otherwise become part of the first header name. newline='' is what the
    # csv module expects from the file object.
    with open(path, encoding='utf-8-sig', newline='') as f:
        for row in csv.DictReader(f):
            pid = (row.get('post_id') or '').strip()
            cat = (row.get('categoria_propuesta') or '').strip()
            if pid and cat in CAT_NAME_TO_TERM:
                out[int(pid)].add(cat)
    return out


def current_cats(conn, post_ids, tt_ids):
    """Return a dict post_id -> set of term_ids currently assigned to the post.

    Only relationships whose term_taxonomy_id is in *tt_ids* are considered.
    Posts without any of them do not appear in the result. An empty
    *post_ids* or *tt_ids* yields ``{}`` without querying.
    """
    post_ids = list(post_ids)
    tt_ids = list(tt_ids)
    if not post_ids or not tt_ids:
        return {}
    out = defaultdict(set)
    with conn.cursor() as c:
        c.execute(f"""
            SELECT tr.object_id, tt.term_id
            FROM wp_term_relationships tr
            JOIN wp_term_taxonomy tt ON tt.term_taxonomy_id=tr.term_taxonomy_id
            WHERE tr.object_id IN ({_int_list(post_ids)})
              AND tt.term_taxonomy_id IN ({_int_list(tt_ids)})
        """)
        for pid, tid in c.fetchall():
            out[pid].add(tid)
    return out


def _plan_changes(desired_by_pid, current_by_pid, term_to_tt, mode):
    """Compute the (object_id, term_taxonomy_id) pairs to insert and to delete.

    Deletions are only planned when *mode* is 'replace'. Both lists are
    sorted so that the plan (and the dry-run sample) is reproducible.
    """
    to_add = []
    to_del = []
    for pid, desired_names in desired_by_pid.items():
        desired_tids = {CAT_NAME_TO_TERM[n] for n in desired_names}
        current_tids = current_by_pid.get(pid, set())
        # Add whatever is desired and not yet assigned.
        to_add.extend((pid, term_to_tt[tid]) for tid in desired_tids - current_tids)
        # Replace mode: drop the managed terms that are assigned but not desired.
        if mode == 'replace':
            to_del.extend((pid, term_to_tt[tid]) for tid in current_tids - desired_tids)
    to_add.sort()
    to_del.sort()
    return to_add, to_del


def _apply_changes(conn, to_add, to_del, in_ttids):
    """Insert/delete the planned relationships, refresh counts and commit."""
    with conn.cursor() as c:
        # Bulk insert; INSERT IGNORE skips pairs that already exist.
        for chunk in _chunks(to_add, WRITE_BATCH):
            vals = ','.join(f'({p},{t})' for p, t in chunk)
            c.execute(f'INSERT IGNORE INTO wp_term_relationships (object_id, term_taxonomy_id) VALUES {vals}')
        for chunk in _chunks(to_del, WRITE_BATCH):
            conds = ' OR '.join(f'(object_id={p} AND term_taxonomy_id={t})' for p, t in chunk)
            c.execute(f'DELETE FROM wp_term_relationships WHERE {conds}')
        # Recompute the counts of the managed terms.
        # NOTE(review): this counts every relationship row; WordPress core's
        # own counter appears to count only published posts -- confirm that
        # counting all rows is intended.
        c.execute(f"""
            UPDATE wp_term_taxonomy tt
            SET tt.count = (SELECT COUNT(*) FROM wp_term_relationships tr
                            WHERE tr.term_taxonomy_id=tt.term_taxonomy_id)
            WHERE tt.term_taxonomy_id IN ({in_ttids})
        """)
    conn.commit()


def _print_final_counts(conn, in_ttids):
    """Print term_id, slug and count of every managed category to stderr."""
    with conn.cursor() as c:
        c.execute(f"""
            SELECT t.term_id, t.slug, tt.count
            FROM wp_term_taxonomy tt
            JOIN wp_terms t USING(term_id)
            WHERE tt.term_taxonomy_id IN ({in_ttids})
            ORDER BY t.term_id
        """)
        print('\nCats finales:', file=sys.stderr)
        for row in c.fetchall():
            print(f' {row[0]:5d} {row[1]:30s} {row[2]}', file=sys.stderr)


def _run(conn, args, desired_by_pid):
    """Plan the changes and, unless in dry-run, apply them through *conn*."""
    term_to_tt = get_term_taxonomy_ids(conn, MANAGED_TERMS)
    print(f'Term taxonomy ids: {term_to_tt}', file=sys.stderr)
    if len(term_to_tt) != len(CAT_NAME_TO_TERM):
        sys.exit(f'No encuentro todos los term_ids: {set(MANAGED_TERMS) - set(term_to_tt)}')

    # Current managed categories of the posts listed in the CSV.
    pids = list(desired_by_pid)
    tt_ids = list(term_to_tt.values())
    current_by_pid = {}
    for chunk in _chunks(pids, READ_BATCH):
        current_by_pid.update(current_cats(conn, chunk, tt_ids))

    to_add, to_del = _plan_changes(desired_by_pid, current_by_pid, term_to_tt, args.mode)
    print(f'A añadir: {len(to_add)}', file=sys.stderr)
    print(f'A quitar: {len(to_del)}', file=sys.stderr)

    if args.dry_run:
        # Show a sample of the plan and leave the database untouched.
        print('\n--- 5 ejemplos añadir ---', file=sys.stderr)
        for x in to_add[:5]:
            print(' ', x, file=sys.stderr)
        print('\n--- 5 ejemplos quitar ---', file=sys.stderr)
        for x in to_del[:5]:
            print(' ', x, file=sys.stderr)
        return

    in_ttids = _int_list(tt_ids)
    _apply_changes(conn, to_add, to_del, in_ttids)
    print('Commit OK.', file=sys.stderr)
    _print_final_counts(conn, in_ttids)


def main():
    """Command-line entry point: parse options, load the CSV, update the DB."""
    # Fail fast, before any output, when the driver is missing.
    _require_pymysql()

    ap = argparse.ArgumentParser()
    ap.add_argument('--csv', default='/tmp/clasif_new.csv')
    ap.add_argument('--mode', choices=['add', 'replace'], default='add')
    ap.add_argument('--dry-run', action='store_true')
    args = ap.parse_args()

    print(f'CSV: {args.csv}', file=sys.stderr)
    print(f'Mode: {args.mode}{" (DRY)" if args.dry_run else ""}', file=sys.stderr)

    desired_by_pid = load_csv(args.csv)
    print(f'Posts en CSV: {len(desired_by_pid)}', file=sys.stderr)

    conn = get_conn()
    try:
        _run(conn, args, desired_by_pid)
    finally:
        # Always release the connection. With autocommit off, closing before
        # commit discards whatever a failed run wrote.
        conn.close()


if __name__ == '__main__':
    main()