feadulta/scripts/import_public_joomla_delta.py

#!/usr/bin/env python3
"""
Importa a WordPress local el delta visible en Joomla produccion usando HTML publico.

Ruta de contingencia para cuando no hay SSH/DB a produccion. Conserva los IDs
Joomla en `_fgj2wp_old_k2_id` y `_fgj2wp_old_content_id` extraidos de las URLs.

Por defecto es dry-run. Usar `--apply` para escribir en la BD local.
"""
import argparse
import html
import re
import subprocess
import sys
import unicodedata
from dataclasses import dataclass, field
from typing import Optional
from urllib.parse import urljoin

import pymysql


# Address that fetch() pins HOST:443 to through curl's --resolve option.
ORIGIN_IP = "134.0.10.170"
# Public hostname of the Joomla site and the base URL that paths are joined to.
HOST = "www.feadulta.com"
BASE = f"https://{HOST}"
# User, password and schema name handed to pymysql.connect() in conn().
# NOTE(review): credentials are hard-coded here — presumably acceptable because
# the target is a local database; confirm before reusing elsewhere.
WP_DB_USER = "wordpress_user"
WP_DB_PASS = "wordpress_pass"
WP_DB_NAME = "wordpress_db"

# term_id values; load_terms() looks them up in wp_term_taxonomy restricted to
# taxonomy='category' to obtain the matching term_taxonomy_id.
TERM_FEADULTA = 71
TERM_CARTA_SEMANA = 6
TERM_CARTAS_OTRAS = 21
TERM_CARTA_PASADA = 22
TERM_INDICE_MULTIMEDIA = 26
TERM_VIDEOS = 58
TERM_LECTURA = 1645
TERM_COMENTARIO_EDITORIAL = 1646
TERM_COMENTARIO = 1647
TERM_EUCARISTIA = 1648
TERM_MULTIMEDIA = 1649
TERM_ARTICULOS = 1650

# Maps the category label produced by links_by_section() to the term_id that
# build_items() adds to a K2 item.
SECTION_TO_TERM = {
    "lectura": TERM_LECTURA,
    "comentario_editorial": TERM_COMENTARIO_EDITORIAL,
    "comentario": TERM_COMENTARIO,
    "articulo": TERM_ARTICULOS,
    "eucaristia": TERM_EUCARISTIA,
    "multimedia": TERM_MULTIMEDIA,
}

# Pages that build_items() fetches and scans for links. For each entry:
#   content_id -> used as the Item.source_id of the page itself
#   url        -> path fetched from the public site
#   date       -> post date given to the page and to items first found in it
#   cats       -> term_ids attached to the page
CARTAS = [
    {
        "content_id": 9136,
        "url": "/es/ayuda/otras-semanas/9136-uno-y-trino.html",
        "date": "2026-05-28 00:00:00",
        "cats": [TERM_CARTAS_OTRAS, TERM_FEADULTA],
    },
    {
        "content_id": 9143,
        "url": "/es/ayuda/semana-pasada/9143-20-anos-de-fe-adulta.html",
        "date": "2026-06-06 00:00:00",
        "cats": [TERM_CARTAS_OTRAS, TERM_CARTA_PASADA, TERM_FEADULTA],
    },
    {
        "content_id": 9150,
        "url": "/es/ayuda/esta-semana/9150-la-puerta-pequena.html",
        "date": "2026-06-13 00:00:00",
        "cats": [TERM_CARTA_SEMANA, TERM_CARTAS_OTRAS, TERM_FEADULTA],
    },
]


@dataclass
class Item:
    """One page discovered on the public site and queued for import."""

    # "k2" or "content"; insert_item() uses it to choose between the
    # `_fgj2wp_old_k2_id` and `_fgj2wp_old_content_id` meta keys.
    kind: str
    # Numeric id taken from the source URL (or from CARTAS for the letters).
    source_id: int
    # Path or href that fetch() downloads for this item.
    url: str
    # Title and cleaned HTML body; empty until the page has been parsed.
    title: str = ""
    content: str = ""
    # Written to wp_posts.post_name by insert_item().
    slug: str = ""
    # Written to all four wp_posts date columns by insert_item().
    date: str = "2026-06-13 00:00:00"
    # Author name as found in the link text or page markup, if any.
    author_name: Optional[str] = None
    # Category term_ids (not term_taxonomy_ids) to attach to the post.
    term_ids: set[int] = field(default_factory=set)
    # For K2 items: source_id of the carta in which the link was first seen.
    carta_source_id: Optional[int] = None


def wp_ip() -> str:
    """Return the IP address docker reports for the `wordpress-mysql` container.

    Runs `docker inspect` with a Go template that prints the IPAddress of each
    entry under `.NetworkSettings.Networks` back to back, then strips the
    surrounding whitespace from the output.

    Raises:
        subprocess.CalledProcessError: if `docker inspect` exits non-zero.
    """
    template = "{{range .NetworkSettings.Networks}}{{.IPAddress}}{{end}}"
    command = ["docker", "inspect", "wordpress-mysql", "--format", template]
    inspected = subprocess.run(command, capture_output=True, text=True, check=True)
    return inspected.stdout.strip()


def conn():
    """Open a pymysql connection to the local WordPress database.

    The host is resolved through wp_ip(); rows come back as dictionaries and
    autocommit is off, so the caller decides when to commit or roll back.
    """
    settings = {
        "host": wp_ip(),
        "user": WP_DB_USER,
        "password": WP_DB_PASS,
        "database": WP_DB_NAME,
        "charset": "utf8mb4",
        "autocommit": False,
        "cursorclass": pymysql.cursors.DictCursor,
    }
    return pymysql.connect(**settings)


def normalize(text: str) -> str:
    """Return an accent-free, lower-case, single-spaced form of *text*.

    The text is decomposed with NFKD, combining marks are dropped, every run
    of whitespace becomes one space, and the result is stripped and lowered.
    """
    decomposed = unicodedata.normalize("NFKD", text)
    without_marks = "".join(
        filter(lambda ch: not unicodedata.combining(ch), decomposed)
    )
    single_spaced = re.sub(r"\s+", " ", without_marks)
    return single_spaced.strip().lower()


def slug_from_url(path: str) -> str:
    """Derive a post slug from the last path segment of a Joomla URL.

    The query string and fragment are removed first, then the final path
    segment is taken, its leading ``<digits>-`` id prefix is dropped and a
    trailing ``.html`` is removed.

    Args:
        path: Site-relative path or absolute URL.

    Returns:
        The slug, or an empty string when the URL has no usable last segment.
    """
    # Cut at the first "?" or "#" *before* splitting on "/": a query value may
    # itself contain slashes, and a fragment would otherwise hide ".html".
    bare = re.split(r"[?#]", path, maxsplit=1)[0]
    name = bare.rstrip("/").rsplit("/", 1)[-1]
    name = re.sub(r"^\d+-", "", name)
    return re.sub(r"\.html$", "", name)


def id_from_url(path: str) -> Optional[int]:
    """Extract the numeric Joomla id from a URL of the form ``.../<id>-<alias>``.

    The query string and fragment are ignored. When several path segments
    start with ``<digits>-`` the last one is used, so the id comes from the
    same end of the path that slug_from_url() reads the slug from, and an
    id-prefixed parent segment (e.g. ``/20-anos/9143-x.html``) is not mistaken
    for the item id.

    Args:
        path: Site-relative path or absolute URL.

    Returns:
        The id as an int, or None when no segment carries an id prefix.
    """
    bare = re.split(r"[?#]", path, maxsplit=1)[0]
    prefixes = re.findall(r"/(\d+)-[^/]+", bare)
    return int(prefixes[-1]) if prefixes else None


def fetch(path: str) -> str:
    """Download *path* from the public site with curl and return the body.

    The path is joined to BASE, a ``FETCH <path>`` line is written to stderr,
    and curl is invoked with HOST:443 pinned to ORIGIN_IP, certificate checks
    disabled (-k), redirects followed (-L) and a 12 second time limit.

    NOTE(review): curl is run without --fail, so an HTTP error page is
    returned like any other body; only curl-level failures raise.

    Raises:
        subprocess.CalledProcessError: if curl exits non-zero.
    """
    target = urljoin(BASE, path)
    print(f"FETCH {path}", file=sys.stderr, flush=True)
    curl_command = [
        "curl",
        "--resolve", f"{HOST}:443:{ORIGIN_IP}",
        "-k",
        "-L",
        "--max-time", "12",
        "-A", "Mozilla/5.0 Codex Feadulta delta importer",
        "-sS",
        target,
    ]
    completed = subprocess.run(
        curl_command, capture_output=True, text=True, check=True
    )
    return completed.stdout


def clean_fragment(
    fragment: str,
    uploads_dir: str = "/home/rafa/joomla-migration/wordpress/wp-content/uploads",
) -> str:
    """Sanitise an HTML fragment scraped from Joomla for storage in WordPress.

    Steps, in order:
      1. remove ``<script>`` and ``<form>`` elements with their contents;
      2. cut ``?tmpl=component...`` off double-quoted href values;
      3. convert CRLF line endings to LF;
      4. rewrite ``src``/``href`` values under ``/images/`` to
         ``/fea/wp-content/uploads/`` when the same relative path exists as a
         regular file below *uploads_dir*; other values are left unchanged.

    Args:
        fragment: Raw HTML fragment.
        uploads_dir: Local directory checked for already-migrated files.

    Returns:
        The cleaned fragment with surrounding whitespace stripped.
    """
    # Function-scope import: `os` is not imported at module level.
    import os

    fragment = re.sub(r"<script\b.*?</script>", "", fragment, flags=re.I | re.S)
    fragment = re.sub(r"<form\b.*?</form>", "", fragment, flags=re.I | re.S)
    fragment = re.sub(r"\s+href=\"([^\"]*)\?tmpl=component[^\"]*\"", r' href="\1"', fragment)
    fragment = fragment.replace("\r\n", "\n")

    # Joomla image paths -> WP uploads when the file exists locally.
    def repl(m):
        attr, path = m.group(1), m.group(2)
        # Plain concatenation (not os.path.join) so that a path starting with
        # "/" can never discard the uploads_dir prefix. os.path.isfile checks
        # in-process instead of spawning `test -f` once per image.
        if os.path.isfile(f"{uploads_dir}/{path}"):
            return f'{attr}="/fea/wp-content/uploads/{path}"'
        return f'{attr}="/images/{path}"'

    fragment = re.sub(r'(src|href)="/images/([^"]+)"', repl, fragment)
    return fragment.strip()


def extract_title_and_content(doc: str) -> tuple[str, str]:
    """Pull the title and the article body out of a full HTML page.

    Title candidates are tried in order: ``h2.fa-postheader``,
    ``h2.itemTitle``, the ``og:title`` meta tag, then ``<title>``. Body
    candidates are tried in order: ``div.fa-article`` closed by two ``</div>``
    and a ``cleared`` div, ``div.itemFullText``, then ``div.fa-article`` up to
    the first ``</div>``. The chosen body is passed through clean_fragment().

    Returns:
        ``(title, content)``; either part is ``""`` when nothing matched.
    """
    flags = re.I | re.S
    title_patterns = (
        r'<h2 class="fa-postheader">\s*(.*?)\s*</h2>',
        r'<h2 class="itemTitle">\s*(.*?)\s*</h2>',
        r'<meta property="og:title" content="([^"]+)"',
        r"<title>\s*(.*?)\s*</title>",
    )
    content_patterns = (
        r'<div class="fa-article">\s*(.*?)\s*</div>\s*</div>\s*<div class="cleared"',
        r'<div class="itemFullText">\s*(.*?)\s*</div>',
        r'<div class="fa-article">\s*(.*?)\s*</div>',
    )

    def first_hit(patterns):
        # First pattern that matches anywhere in the document wins.
        for pattern in patterns:
            hit = re.search(pattern, doc, flags)
            if hit:
                return hit
        return None

    title = ""
    title_hit = first_hit(title_patterns)
    if title_hit:
        without_tags = re.sub(r"<.*?>", "", title_hit.group(1))
        title = html.unescape(without_tags).strip()

    content = ""
    content_hit = first_hit(content_patterns)
    if content_hit:
        content = clean_fragment(content_hit.group(1))
    return title, content


def extract_author(doc: str) -> Optional[str]:
    """Return the author named in the page, or None when there is none.

    The ``<meta name="author">`` tag is preferred; otherwise the text of the
    first ``<a rel="author">`` link is used with nested tags removed. HTML
    entities are decoded and surrounding whitespace is stripped.
    """
    meta_hit = re.search(r'<meta name="author" content="([^"]+)"', doc, re.I)
    if meta_hit is not None:
        return html.unescape(meta_hit.group(1)).strip()

    link_hit = re.search(r'<a rel="author"[^>]*>\s*(.*?)\s*</a>', doc, re.I | re.S)
    if link_hit is None:
        return None
    link_text = re.sub(r"<.*?>", "", link_hit.group(1))
    return html.unescape(link_text).strip()


def iter_paragraphs(content: str):
    """Yield the inner HTML of each ``<p>...</p>`` element, in document order."""
    paragraph = re.compile(r"<p\b[^>]*>(.*?)</p>", flags=re.I | re.S)
    yield from (hit.group(1) for hit in paragraph.finditer(content))


def links_by_section(carta: Item) -> list[tuple[str, str, str, Optional[str]]]:
    """List the links of a carta together with the section they appear in.

    Paragraphs are scanned in order. A paragraph whose normalised text
    contains one of the known headings switches the current section and is
    itself skipped; links found before the first heading are ignored. Within
    the gospel section the first link is labelled ``lectura``, the second
    ``comentario_editorial`` and every later one ``comentario``; in the other
    sections the label is the section name.

    Returns:
        A list of ``(href, category, link_text, author)`` tuples, where
        *author* is the link text before the first ``:`` or None when the
        text has no colon.
    """
    headings = (
        ("evangelio y comentarios al evangelio", "evangelio"),
        ("articulos seleccionados para la semana", "articulo"),
        ("eucaristias mas participativas", "eucaristia"),
        ("material multimedia", "multimedia"),
    )
    gospel_labels = ("lectura", "comentario_editorial")

    current = None
    gospel_seen = 0
    found = []
    for paragraph in iter_paragraphs(carta.content):
        flat = normalize(re.sub(r"<.*?>", " ", html.unescape(paragraph)))
        heading = next((name for marker, name in headings if marker in flat), None)
        if heading is not None:
            current = heading
            if heading == "evangelio":
                # A new gospel heading restarts the positional labelling.
                gospel_seen = 0
            continue
        if not current:
            continue

        anchors = re.findall(
            r'<a\b[^>]*href="([^"]+)"[^>]*>(.*?)</a>', paragraph, flags=re.I | re.S
        )
        for raw_href, raw_text in anchors:
            href = html.unescape(raw_href)
            label = html.unescape(re.sub(r"<.*?>", " ", raw_text))
            label = re.sub(r"\s+", " ", label).strip()
            if current != "evangelio":
                cat = current
            else:
                if gospel_seen < len(gospel_labels):
                    cat = gospel_labels[gospel_seen]
                else:
                    cat = "comentario"
                gospel_seen += 1
            author = label.split(":", 1)[0].strip() if ":" in label else None
            found.append((href, cat, label, author))
    return found


def load_existing(c, meta_key: str) -> set[int]:
    """Return every wp_postmeta value stored under *meta_key*, as integers.

    Values are cast to UNSIGNED by the database; NULL results are dropped.
    """
    query = "SELECT CAST(meta_value AS UNSIGNED) id FROM wp_postmeta WHERE meta_key=%s"
    with c.cursor() as cur:
        cur.execute(query, (meta_key,))
        found = set()
        for row in cur.fetchall():
            if row["id"] is not None:
                found.add(int(row["id"]))
        return found


def max_existing(ids: set[int]) -> int:
    """Return the largest id in *ids*, or 0 when the set is empty."""
    return max(ids, default=0)


def load_terms(c) -> dict[int, int]:
    """Map each category term_id used by this script to its term_taxonomy_id.

    Only rows of wp_term_taxonomy with taxonomy='category' are considered;
    term ids missing from the database are simply absent from the result.
    """
    wanted = [
        TERM_FEADULTA,
        TERM_CARTA_SEMANA,
        TERM_CARTAS_OTRAS,
        TERM_CARTA_PASADA,
        TERM_INDICE_MULTIMEDIA,
        TERM_VIDEOS,
        TERM_LECTURA,
        TERM_COMENTARIO_EDITORIAL,
        TERM_COMENTARIO,
        TERM_EUCARISTIA,
        TERM_MULTIMEDIA,
        TERM_ARTICULOS,
    ]
    # One %s placeholder per id; the ids themselves are bound by the driver.
    placeholders = ",".join(["%s"] * len(wanted))
    query = (
        "SELECT term_id, term_taxonomy_id FROM wp_term_taxonomy "
        "WHERE taxonomy='category' AND term_id IN (%s)" % placeholders
    )
    with c.cursor() as cur:
        cur.execute(query, wanted)
        mapping = {}
        for row in cur.fetchall():
            mapping[int(row["term_id"])] = int(row["term_taxonomy_id"])
        return mapping


def load_lang_es(c) -> Optional[int]:
    """Return the term_taxonomy_id of the 'language' term with slug 'es'.

    Returns None when no such term exists.
    """
    query = (
        "SELECT tt.term_taxonomy_id FROM wp_terms t "
        "JOIN wp_term_taxonomy tt ON tt.term_id=t.term_id "
        "WHERE tt.taxonomy='language' AND t.slug='es' LIMIT 1"
    )
    with c.cursor() as cur:
        cur.execute(query)
        row = cur.fetchone()
    if not row:
        return None
    return int(row["term_taxonomy_id"])


def load_authors(c) -> dict[str, int]:
    """Build a lookup from normalised user names to wp_users.ID.

    Both display_name and user_login of every user are registered, each
    passed through normalize(). When two entries normalise to the same key
    the one processed later wins.
    """
    with c.cursor() as cur:
        cur.execute("SELECT ID, display_name, user_login FROM wp_users")
        users = cur.fetchall()

    lookup = {}
    for user in users:
        user_id = int(user["ID"])
        for column in ("display_name", "user_login"):
            lookup[normalize(user[column])] = user_id
    return lookup


def resolve_author(author_map: dict[str, int], name: Optional[str]) -> int:
    """Resolve an author name to a WordPress user id.

    An exact match on the normalised name is tried first. Failing that, the
    first entry of *author_map* (in insertion order) whose key contains the
    name, or is contained in it, is used.

    Args:
        author_map: Normalised name -> user id, as built by load_authors().
        name: Author name as scraped, or None.

    Returns:
        The matching user id, or 1 when the name is missing, normalises to an
        empty string, or matches nothing.
        NOTE(review): 1 is presumably the default/admin account — confirm.
    """
    if not name:
        return 1
    n = normalize(name)
    if not n:
        # An empty needle is a substring of every key and would pick an
        # arbitrary user.
        return 1
    if n in author_map:
        return author_map[n]
    for key, uid in author_map.items():
        # Skip empty keys (users without display name): "" is contained in
        # every string and would capture all unmatched authors.
        if key and (n in key or key in n):
            return uid
    return 1


def build_items(c) -> list[Item]:
    """Discover the pages that are on the public site but not yet in WordPress.

    Every entry of CARTAS is fetched and parsed; the carta itself and each
    K2 / multimedia link found in its sections become candidate items. A
    candidate is kept only when its id is greater than the highest id already
    recorded in WordPress for that kind and is not recorded itself. Pages of
    the kept items that still lack a title or content are fetched afterwards.

    Args:
        c: Open database connection, used to read the already-imported ids.

    Returns:
        The new items sorted by (date, kind, source_id).
    """
    existing_k2 = load_existing(c, "_fgj2wp_old_k2_id")
    existing_content = load_existing(c, "_fgj2wp_old_content_id")
    # Highest imported ids act as a high-water mark: anything at or below them
    # is treated as already handled, whether or not it is actually present.
    max_k2 = max_existing(existing_k2)
    max_content = max_existing(existing_content)
    print(
        f"WP existentes: K2={len(existing_k2)} max={max_k2} "
        f"content={len(existing_content)} max={max_content}"
    )

    # Keyed by (kind, source_id) so a page linked from several cartas is
    # created once; the first carta that mentions it sets its date.
    items: dict[tuple[str, int], Item] = {}
    for carta_def in CARTAS:
        doc = fetch(carta_def["url"])
        title, content = extract_title_and_content(doc)
        carta = Item(
            kind="content",
            source_id=carta_def["content_id"],
            url=carta_def["url"],
            title=title,
            content=content,
            slug=slug_from_url(carta_def["url"]),
            date=carta_def["date"],
            term_ids=set(carta_def["cats"]),
        )
        if carta.source_id > max_content and carta.source_id not in existing_content:
            items[(carta.kind, carta.source_id)] = carta

        # Links are scanned even when the carta itself was not queued above.
        for href, cat_name, _text, author in links_by_section(carta):
            if "/buscadoravanzado/item/" in href:
                # K2 item link.
                sid = id_from_url(href)
                if not sid or sid <= max_k2 or sid in existing_k2:
                    continue
                key = ("k2", sid)
                item = items.get(key)
                if not item:
                    item = Item(
                        kind="k2",
                        source_id=sid,
                        url=href,
                        slug=slug_from_url(href),
                        date=carta.date,
                        author_name=author,
                        term_ids={TERM_FEADULTA},
                        carta_source_id=carta.source_id,
                    )
                    items[key] = item
                # Section terms accumulate when the item is linked repeatedly.
                item.term_ids.add(SECTION_TO_TERM[cat_name])
            elif "/indice-multimedia/" in href or "/videos/" in href:
                # Multimedia / video page, stored as a "content" item.
                sid = id_from_url(href)
                if not sid or sid <= max_content or sid in existing_content:
                    continue
                is_video = "/videos/" in href
                key = ("content", sid)
                item = items.get(key)
                if not item:
                    item = Item(
                        kind="content",
                        source_id=sid,
                        url=href,
                        slug=slug_from_url(href),
                        date=carta.date,
                        term_ids={TERM_MULTIMEDIA, TERM_VIDEOS if is_video else TERM_INDICE_MULTIMEDIA},
                    )
                    items[key] = item

    # Fetch item pages after discovery.
    for item in items.values():
        # Cartas parsed above already carry both fields and are not re-fetched.
        if item.title and item.content:
            continue
        doc = fetch(item.url)
        title, content = extract_title_and_content(doc)
        author = extract_author(doc)
        # Fall back to a title derived from the slug when the page has none.
        item.title = title or item.slug.replace("-", " ").title()
        item.content = content
        # The author taken from the carta link text wins over the page's own.
        if author and not item.author_name:
            item.author_name = author
    return sorted(items.values(), key=lambda x: (x.date, x.kind, x.source_id))


def insert_item(c, item: Item, term_to_tt: dict[int, int], lang_es_tt: Optional[int], author_map: dict[str, int], dry_run: bool) -> Optional[int]:
    """Insert one item as a published WordPress post, or describe it in dry-run.

    In dry-run mode a ``[DRY]`` summary line is printed and nothing is
    written. Otherwise a wp_posts row is created (item.date fills both the
    local and the GMT date columns), followed by the Joomla source-id meta,
    an ``Idioma`` meta with value ``1``, one wp_term_relationships row per
    known category and, when available, one for the Spanish language term.
    Nothing is committed here.

    Returns:
        The new post id, or None in dry-run mode.
    """
    author_id = resolve_author(author_map, item.author_name)
    if dry_run:
        print(
            f"[DRY] {item.kind:7s} {item.source_id:5d} "
            f"terms={sorted(item.term_ids)} author={author_id} {item.title[:70]}"
        )
        return None

    if item.kind == "k2":
        source_meta_key = "_fgj2wp_old_k2_id"
    else:
        source_meta_key = "_fgj2wp_old_content_id"

    # Category term_ids without a known term_taxonomy_id are skipped; the
    # language relationship, when there is one, always goes last.
    taxonomy_ids = [
        tt for tt in (term_to_tt.get(term_id) for term_id in sorted(item.term_ids)) if tt
    ]
    if lang_es_tt:
        taxonomy_ids.append(lang_es_tt)

    post_values = (
        author_id,
        item.date,
        item.date,
        item.content,
        item.title,
        item.slug,
        item.date,
        item.date,
    )
    with c.cursor() as cur:
        cur.execute(
            """
            INSERT INTO wp_posts
              (post_author, post_date, post_date_gmt, post_content, post_title,
               post_excerpt, post_status, comment_status, ping_status, post_name,
               post_type, post_modified, post_modified_gmt, comment_count,
               to_ping, pinged, post_content_filtered)
            VALUES
              (%s,%s,%s,%s,%s,'','publish','open','open',%s,
               'post',%s,%s,0,'','','')
            """,
            post_values,
        )
        post_id = cur.lastrowid
        cur.execute(
            "INSERT INTO wp_postmeta (post_id, meta_key, meta_value) VALUES (%s,%s,%s)",
            (post_id, source_meta_key, str(item.source_id)),
        )
        cur.execute(
            "INSERT INTO wp_postmeta (post_id, meta_key, meta_value) VALUES (%s,'Idioma','1')",
            (post_id,),
        )
        for taxonomy_id in taxonomy_ids:
            cur.execute(
                "INSERT IGNORE INTO wp_term_relationships (object_id, term_taxonomy_id) VALUES (%s,%s)",
                (post_id, taxonomy_id),
            )
        return post_id


def refresh_counts(c, term_to_tt: dict[int, int], lang_es_tt: Optional[int]):
    """Recompute wp_term_taxonomy.count for the taxonomies this script touches.

    The count of each affected row is set to the number of
    wp_term_relationships rows that reference it.

    Args:
        c: Open database connection; nothing is committed here.
        term_to_tt: term_id -> term_taxonomy_id, as returned by load_terms().
        lang_es_tt: term_taxonomy_id of the Spanish language term, or None.
    """
    ttids = list(term_to_tt.values())
    if lang_es_tt:
        ttids.append(lang_es_tt)
    if not ttids:
        # Nothing to refresh; an empty "IN ()" list is not valid SQL.
        return
    placeholders = ",".join(["%s"] * len(ttids))
    with c.cursor() as cur:
        cur.execute(
            "UPDATE wp_term_taxonomy tt SET count = ("
            "SELECT COUNT(*) FROM wp_term_relationships tr "
            "WHERE tr.term_taxonomy_id=tt.term_taxonomy_id"
            ") WHERE tt.term_taxonomy_id IN (%s)" % placeholders,
            ttids,
        )


def main():
    """Command-line entry point: discover the delta and optionally import it.

    Without ``--apply`` the run is a dry-run: items are listed and the
    transaction is rolled back. With ``--apply`` the items are inserted, K2
    posts are linked to the carta inserted in the same run through a
    ``_carta_id`` meta, term counts are refreshed and the transaction is
    committed. Any exception rolls back and is re-raised; the connection is
    always closed.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument("--apply", action="store_true", help="escribe en WordPress local")
    dry_run = not parser.parse_args().apply

    c = conn()
    try:
        term_to_tt = load_terms(c)
        lang_es_tt = load_lang_es(c)
        author_map = load_authors(c)
        items = build_items(c)
        print(f"Items nuevos detectados: {len(items)}")
        k2_total = sum(1 for entry in items if entry.kind == "k2")
        content_total = sum(1 for entry in items if entry.kind == "content")
        print("  K2:", k2_total, "content:", content_total)

        # (kind, source_id) -> new WordPress post id, for the linking pass.
        source_to_wp = {}
        for item in items:
            new_id = insert_item(c, item, term_to_tt, lang_es_tt, author_map, dry_run)
            if new_id:
                source_to_wp[(item.kind, item.source_id)] = new_id

        if dry_run:
            c.rollback()
            print("Dry-run: sin cambios.")
        else:
            with c.cursor() as cur:
                for item in items:
                    if item.kind == "k2" and item.carta_source_id:
                        k2_wp_id = source_to_wp.get(("k2", item.source_id))
                        carta_wp_id = source_to_wp.get(("content", item.carta_source_id))
                        # Link only when both posts were created in this run.
                        if k2_wp_id and carta_wp_id:
                            cur.execute(
                                "INSERT IGNORE INTO wp_postmeta (post_id, meta_key, meta_value) VALUES (%s,'_carta_id',%s)",
                                (k2_wp_id, str(carta_wp_id)),
                            )
            refresh_counts(c, term_to_tt, lang_es_tt)
            c.commit()
            print("Import commit OK.")
    except Exception:
        c.rollback()
        raise
    finally:
        c.close()


# Run the importer only when the file is executed as a script, not on import.
if __name__ == "__main__":
    main()