# feadulta/scripts/aplicar_clasificacion_a_bd.py

#!/usr/bin/env python3
"""
Aplica `clasificacion_articulos_regen.csv` a wp_term_relationships.

MODO CONSERVADOR (--mode=add): solo AÑADE las cats nuevas que el CSV indique
  y que no estén ya. NO borra cats existentes. Maximiza seguridad — no perdemos
  atribuciones legítimas que el CSV viejo o asignaciones manuales pusieran.

MODO ESTRICTO (--mode=replace): para los posts presentes en el CSV, sustituye
  el conjunto de cats {1645,1646,1647,1648,1649,1650} por exactamente las que
  el CSV indique. Borra las que sobren. Posts NO presentes en CSV no se tocan.

Recalcula `wp_term_taxonomy.count` al final.

Issue: rafa/feadulta#42
Uso: python3 aplicar_clasificacion_a_bd.py [--csv FILE] [--mode add|replace] [--dry-run]
"""
import argparse, csv, subprocess, sys
from collections import defaultdict

try:
    import pymysql
except ImportError:
    sys.exit('requiere pymysql')

# Maps each category name that may appear in the CSV to its WordPress term_id.
# Names missing from this table are not handled by the script.
CAT_NAME_TO_TERM = dict(
    lectura=1645,
    comentario_editorial=1646,
    comentario=1647,
    eucaristia=1648,
    multimedia=1649,
    articulo=1650,
    # noticia=1651,  # not implemented
    # otro=1652,     # not implemented
    # effa=?,        # not implemented
)
# Every term_id this script manages (the values of the table above).
MANAGED_TERMS = {*CAT_NAME_TO_TERM.values()}


def get_conn():
    """Open a MySQL connection to the `wordpress-mysql` Docker container.

    The container's IP address is discovered with `docker inspect`.

    Returns:
        A pymysql connection with autocommit disabled; the caller is
        responsible for committing and closing it.

    Raises:
        subprocess.CalledProcessError: if `docker inspect` fails.
        RuntimeError: if the container reports no IP address.
    """
    # The trailing space inside the template separates the addresses when the
    # container is attached to several networks; without it they would be
    # concatenated into a single, invalid host string.
    result = subprocess.run(
        ['docker', 'inspect', 'wordpress-mysql', '--format',
         '{{range .NetworkSettings.Networks}}{{.IPAddress}} {{end}}'],
        capture_output=True, text=True, check=True)
    addresses = result.stdout.split()
    if not addresses:
        # An empty host would not point at the container at all, so fail
        # loudly instead of handing it to the driver.
        raise RuntimeError(
            'El contenedor wordpress-mysql no tiene dirección IP (¿está arrancado?)')
    # NOTE(review): credentials are hard-coded; consider reading them from the
    # environment if this script leaves the development machine.
    return pymysql.connect(host=addresses[0], user='wordpress_user', password='wordpress_pass',
        database='wordpress_db', charset='utf8mb4', autocommit=False)


def get_term_taxonomy_ids(conn, term_ids):
    """Return a dict term_id -> term_taxonomy_id for taxonomy='category'.

    Args:
        conn: open database connection whose cursors yield row tuples.
        term_ids: iterable of integer term ids to look up.

    Returns:
        A dict with one entry per term id found in wp_term_taxonomy; ids that
        do not exist as a category are simply absent. An empty `term_ids`
        yields an empty dict without querying the database.
    """
    ids = [int(t) for t in term_ids]
    if not ids:
        # `IN ()` is not valid SQL, so there is nothing to ask the server.
        return {}
    # One placeholder per id; the driver performs the value substitution.
    placeholders = ','.join(['%s'] * len(ids))
    with conn.cursor() as c:
        c.execute(
            "SELECT term_id, term_taxonomy_id FROM wp_term_taxonomy "
            f"WHERE taxonomy='category' AND term_id IN ({placeholders})",
            ids,
        )
        return dict(c.fetchall())


def load_csv(path):
    """Return a dict post_id -> set(category name) read from the CSV at `path`.

    Only the `post_id` and `categoria_propuesta` columns are used. Rows with
    an empty post_id, or with a category that is not a key of
    CAT_NAME_TO_TERM, are skipped.

    Raises:
        ValueError: if a row that would otherwise be kept has a post_id that
            is not an integer; the message includes the file and line number.
    """
    out = defaultdict(set)
    # 'utf-8-sig' drops a leading BOM; with plain 'utf-8' the first header
    # would read '\ufeffpost_id' and every row would be skipped silently.
    # newline='' is what the csv module asks for when it is given a file.
    with open(path, encoding='utf-8-sig', newline='') as f:
        reader = csv.DictReader(f)
        for row in reader:
            # Short rows give None for the missing columns.
            pid = (row.get('post_id') or '').strip()
            cat = (row.get('categoria_propuesta') or '').strip()
            if not pid or cat not in CAT_NAME_TO_TERM:
                continue
            try:
                post_id = int(pid)
            except ValueError:
                raise ValueError(
                    f'{path}:{reader.line_num}: post_id no válido: {pid!r}') from None
            out[post_id].add(cat)
    return out


def current_cats(conn, post_ids, tt_ids):
    """Return, per post, the set of term_ids it currently has among `tt_ids`.

    Args:
        conn: open database connection.
        post_ids: post ids (object_id values) to inspect.
        tt_ids: term_taxonomy_id values that restrict the lookup.

    Returns:
        A mapping post_id -> set(term_id). Posts without any matching
        relationship do not appear. An empty `post_ids` gives `{}` without
        touching the database.
    """
    if not post_ids:
        return {}
    tt_list = ','.join(map(str, tt_ids))
    post_list = ','.join(map(str, post_ids))
    found = defaultdict(set)
    with conn.cursor() as cursor:
        query = f"""
            SELECT tr.object_id, tt.term_id
            FROM wp_term_relationships tr
            JOIN wp_term_taxonomy tt ON tt.term_taxonomy_id=tr.term_taxonomy_id
            WHERE tr.object_id IN ({post_list}) AND tt.term_taxonomy_id IN ({tt_list})
        """
        cursor.execute(query)
        rows = cursor.fetchall()
        for object_id, term_id in rows:
            found[object_id].add(term_id)
    return found


# Number of posts per SELECT and number of rows per INSERT/DELETE statement.
_READ_BATCH = 5000
_WRITE_BATCH = 1000


def _plan_changes(desired_by_pid, current_by_pid, term_to_tt, mode):
    """Compute which (object_id, term_taxonomy_id) rows to insert and delete.

    Args:
        desired_by_pid: post_id -> set of category names wanted by the CSV.
        current_by_pid: post_id -> set of managed term_ids the post has now.
        term_to_tt: term_id -> term_taxonomy_id.
        mode: 'add' only inserts; 'replace' also deletes managed terms the
            post has but the CSV does not list.

    Returns:
        A `(to_add, to_del)` pair of lists of (object_id, term_taxonomy_id).
    """
    to_add = []
    to_del = []
    for pid, desired_names in desired_by_pid.items():
        desired_tids = {CAT_NAME_TO_TERM[n] for n in desired_names}
        current_tids = current_by_pid.get(pid, set())
        to_add.extend((pid, term_to_tt[tid]) for tid in desired_tids - current_tids)
        if mode == 'replace':
            to_del.extend((pid, term_to_tt[tid]) for tid in current_tids - desired_tids)
    return to_add, to_del


def _sync(conn, desired_by_pid, mode, dry_run):
    """Apply the CSV classification through `conn` (or preview it if `dry_run`).

    Exits the process if any managed term has no 'category' row in
    wp_term_taxonomy. In a dry run nothing is written. Otherwise the inserts,
    the deletes and the count recalculation are committed together, and the
    final counts are printed to stderr. The connection is not closed here.
    """
    term_to_tt = get_term_taxonomy_ids(conn, MANAGED_TERMS)
    print(f'Term taxonomy ids: {term_to_tt}', file=sys.stderr)
    if len(term_to_tt) != len(CAT_NAME_TO_TERM):
        sys.exit(f'No encuentro todos los term_ids: {set(MANAGED_TERMS) - set(term_to_tt)}')

    # Current managed categories of the CSV posts, read in batches so each
    # IN (...) list stays bounded.
    pids = list(desired_by_pid)
    current_by_pid = {}
    for i in range(0, len(pids), _READ_BATCH):
        chunk = pids[i:i + _READ_BATCH]
        current_by_pid.update(current_cats(conn, chunk, term_to_tt.values()))

    to_add, to_del = _plan_changes(desired_by_pid, current_by_pid, term_to_tt, mode)
    print(f'A añadir: {len(to_add)}', file=sys.stderr)
    print(f'A quitar: {len(to_del)}', file=sys.stderr)

    if dry_run:
        # Show a small sample of the planned changes and stop before writing.
        print('\n--- 5 ejemplos añadir ---', file=sys.stderr)
        for x in to_add[:5]: print(' ', x, file=sys.stderr)
        print('\n--- 5 ejemplos quitar ---', file=sys.stderr)
        for x in to_del[:5]: print(' ', x, file=sys.stderr)
        return

    # Used by both the count recalculation and the final report.
    in_ttids = ','.join(str(t) for t in term_to_tt.values())
    with conn.cursor() as c:
        # Bulk insert; INSERT IGNORE skips rows that already exist.
        for i in range(0, len(to_add), _WRITE_BATCH):
            chunk = to_add[i:i + _WRITE_BATCH]
            vals = ','.join(f'({p},{t})' for p, t in chunk)
            c.execute(f'INSERT IGNORE INTO wp_term_relationships (object_id, term_taxonomy_id) VALUES {vals}')
        for i in range(0, len(to_del), _WRITE_BATCH):
            chunk = to_del[i:i + _WRITE_BATCH]
            conds = ' OR '.join(f'(object_id={p} AND term_taxonomy_id={t})' for p, t in chunk)
            c.execute(f'DELETE FROM wp_term_relationships WHERE {conds}')
        # Recalculate the counts of the managed terms.
        # NOTE(review): this counts every relationship row; WordPress' own
        # recount for post taxonomies appears to count only published posts.
        # Confirm that the difference is acceptable.
        c.execute(f"""
            UPDATE wp_term_taxonomy tt
            SET tt.count = (SELECT COUNT(*) FROM wp_term_relationships tr WHERE tr.term_taxonomy_id=tt.term_taxonomy_id)
            WHERE tt.term_taxonomy_id IN ({in_ttids})
        """)
    conn.commit()
    print('Commit OK.', file=sys.stderr)

    # Final counts.
    with conn.cursor() as c:
        c.execute(f"""
            SELECT t.term_id, t.slug, tt.count FROM wp_term_taxonomy tt
            JOIN wp_terms t USING(term_id)
            WHERE tt.term_taxonomy_id IN ({in_ttids}) ORDER BY t.term_id
        """)
        print('\nCats finales:', file=sys.stderr)
        for row in c.fetchall():
            print(f'  {row[0]:5d}  {row[1]:30s}  {row[2]}', file=sys.stderr)


def main():
    """Command-line entry point: parse options, load the CSV, sync the DB.

    Options: --csv FILE (default /tmp/clasif_new.csv), --mode add|replace
    (default add) and --dry-run. Progress is reported on stderr. The
    connection is always closed, and uncommitted work is rolled back if an
    error interrupts the run.
    """
    ap = argparse.ArgumentParser()
    ap.add_argument('--csv', default='/tmp/clasif_new.csv')
    ap.add_argument('--mode', choices=['add', 'replace'], default='add')
    ap.add_argument('--dry-run', action='store_true')
    args = ap.parse_args()

    print(f'CSV: {args.csv}', file=sys.stderr)
    print(f'Mode: {args.mode}{" (DRY)" if args.dry_run else ""}', file=sys.stderr)

    desired_by_pid = load_csv(args.csv)
    print(f'Posts en CSV: {len(desired_by_pid)}', file=sys.stderr)

    conn = get_conn()
    try:
        _sync(conn, desired_by_pid, args.mode, args.dry_run)
    except Exception:
        # Nothing is committed unless every statement succeeded; discard any
        # partially applied batch explicitly before propagating the error.
        conn.rollback()
        raise
    finally:
        # Also reached on sys.exit() and on the dry-run early return.
        conn.close()


# Run the synchronisation only when executed as a script, not when imported.
if __name__ == '__main__':
    main()