Añadir mu-plugins y scripts de feadulta

2026-06-28 15:10:46 -04:00
parent bce7e42f44
commit b6116b066d
106 changed files with 17600 additions and 2 deletions
@@ -0,0 +1,517 @@
+#!/usr/bin/env python3
+"""
+Importa a WordPress local el delta visible en Joomla produccion usando HTML publico.
+
+Ruta de contingencia para cuando no hay SSH/DB a produccion. Conserva los IDs
+Joomla en `_fgj2wp_old_k2_id` y `_fgj2wp_old_content_id` extraidos de las URLs.
+
+Por defecto es dry-run. Usar `--apply` para escribir en la BD local.
+"""
+import argparse
+import html
+import re
+import subprocess
+import sys
+import unicodedata
+from dataclasses import dataclass, field
+from typing import Optional
+from urllib.parse import urljoin
+
+import pymysql
+
+
+# Address that curl is pinned to (via --resolve) when fetch() requests HOST over HTTPS.
+ORIGIN_IP = "134.0.10.170"
+HOST = "www.feadulta.com"
+# Base URL that fetch() joins every path against.
+BASE = f"https://{HOST}"
+# User, password and schema passed to pymysql.connect() in conn().
+# NOTE(review): credentials are hard-coded in source; consider reading them
+# from the environment instead.
+WP_DB_USER = "wordpress_user"
+WP_DB_PASS = "wordpress_pass"
+WP_DB_NAME = "wordpress_db"
+
+# term_id values that load_terms() looks up in wp_term_taxonomy
+# (taxonomy='category') to obtain their term_taxonomy_id.
+TERM_FEADULTA = 71
+TERM_CARTA_SEMANA = 6
+TERM_CARTAS_OTRAS = 21
+TERM_CARTA_PASADA = 22
+TERM_INDICE_MULTIMEDIA = 26
+TERM_VIDEOS = 58
+TERM_LECTURA = 1645
+TERM_COMENTARIO_EDITORIAL = 1646
+TERM_COMENTARIO = 1647
+TERM_EUCARISTIA = 1648
+TERM_MULTIMEDIA = 1649
+TERM_ARTICULOS = 1650
+
+# Maps the category label produced by links_by_section() to the term_id that
+# build_items() adds to a K2 item found under that label.
+SECTION_TO_TERM = {
+    "lectura": TERM_LECTURA,
+    "comentario_editorial": TERM_COMENTARIO_EDITORIAL,
+    "comentario": TERM_COMENTARIO,
+    "articulo": TERM_ARTICULOS,
+    "eucaristia": TERM_EUCARISTIA,
+    "multimedia": TERM_MULTIMEDIA,
+}
+
+# Letter pages ("cartas") that build_items() fetches and scans for links.
+# Each entry gives the Joomla content id, the public URL path, the post date
+# to store, and the category term_ids to attach to the imported post.
+CARTAS = [
+    {
+        "content_id": 9136,
+        "url": "/es/ayuda/otras-semanas/9136-uno-y-trino.html",
+        "date": "2026-05-28 00:00:00",
+        "cats": [TERM_CARTAS_OTRAS, TERM_FEADULTA],
+    },
+    {
+        "content_id": 9143,
+        "url": "/es/ayuda/semana-pasada/9143-20-anos-de-fe-adulta.html",
+        "date": "2026-06-06 00:00:00",
+        "cats": [TERM_CARTAS_OTRAS, TERM_CARTA_PASADA, TERM_FEADULTA],
+    },
+    {
+        "content_id": 9150,
+        "url": "/es/ayuda/esta-semana/9150-la-puerta-pequena.html",
+        "date": "2026-06-13 00:00:00",
+        "cats": [TERM_CARTA_SEMANA, TERM_CARTAS_OTRAS, TERM_FEADULTA],
+    },
+]
+
+
+@dataclass
+class Item:
+    """One Joomla page (carta, K2 item or multimedia article) to import as a WordPress post."""
+
+    # "k2" or "content"; insert_item() uses it to choose the source-id meta key.
+    kind: str
+    # Joomla id taken from CARTAS or parsed from the URL; stored as post meta.
+    source_id: int
+    # Path or URL handed to fetch() to download the page.
+    url: str
+    # Title and cleaned HTML body extracted from the fetched page.
+    title: str = ""
+    content: str = ""
+    # Written to wp_posts.post_name.
+    slug: str = ""
+    # Used for post_date, post_date_gmt, post_modified and post_modified_gmt.
+    date: str = "2026-06-13 00:00:00"
+    # Author name as found in the link text or page; mapped to a user id by resolve_author().
+    author_name: Optional[str] = None
+    # Category term_ids to relate the post to.
+    term_ids: set[int] = field(default_factory=set)
+    # content id of the carta in which a K2 item was first discovered; main() uses it
+    # to write the `_carta_id` post meta.
+    carta_source_id: Optional[int] = None
+
+
+def wp_ip() -> str:
+    result = subprocess.run(
+        [
+            "docker",
+            "inspect",
+            "wordpress-mysql",
+            "--format",
+            "{{range .NetworkSettings.Networks}}{{.IPAddress}}{{end}}",
+        ],
+        capture_output=True,
+        text=True,
+        check=True,
+    )
+    return result.stdout.strip()
+
+
+def conn():
+    return pymysql.connect(
+        host=wp_ip(),
+        user=WP_DB_USER,
+        password=WP_DB_PASS,
+        database=WP_DB_NAME,
+        charset="utf8mb4",
+        autocommit=False,
+        cursorclass=pymysql.cursors.DictCursor,
+    )
+
+
+def normalize(text: str) -> str:
+    text = unicodedata.normalize("NFKD", text)
+    text = "".join(c for c in text if not unicodedata.combining(c))
+    return re.sub(r"\s+", " ", text).strip().lower()
+
+
+def slug_from_url(path: str) -> str:
+    name = path.rstrip("/").rsplit("/", 1)[-1]
+    name = name.split("?", 1)[0]
+    name = re.sub(r"^\d+-", "", name)
+    return re.sub(r"\.html$", "", name)
+
+
+def id_from_url(path: str) -> Optional[int]:
+    m = re.search(r"/(\d+)-[^/?#]+(?:\.html)?", path)
+    return int(m.group(1)) if m else None
+
+
+def fetch(path: str) -> str:
+    url = urljoin(BASE, path)
+    print(f"FETCH {path}", file=sys.stderr, flush=True)
+    result = subprocess.run(
+        [
+            "curl",
+            "--resolve",
+            f"{HOST}:443:{ORIGIN_IP}",
+            "-k",
+            "-L",
+            "--max-time",
+            "12",
+            "-A",
+            "Mozilla/5.0 Codex Feadulta delta importer",
+            "-sS",
+            url,
+        ],
+        capture_output=True,
+        text=True,
+        check=True,
+    )
+    return result.stdout
+
+
+def clean_fragment(fragment: str) -> str:
+    fragment = re.sub(r"<script\b.*?</script>", "", fragment, flags=re.I | re.S)
+    fragment = re.sub(r"<form\b.*?</form>", "", fragment, flags=re.I | re.S)
+    fragment = re.sub(r"\s+href=\"([^\"]*)\?tmpl=component[^\"]*\"", r' href="\1"', fragment)
+    fragment = fragment.replace("\r\n", "\n")
+    # Rutas de imagen Joomla -> uploads WP cuando el fichero existe localmente.
+    def repl(m):
+        attr, path = m.group(1), m.group(2)
+        local = f"/home/rafa/joomla-migration/wordpress/wp-content/uploads/{path}"
+        try:
+            exists = subprocess.run(["test", "-f", local]).returncode == 0
+        except Exception:
+            exists = False
+        if exists:
+            return f'{attr}="/fea/wp-content/uploads/{path}"'
+        return f'{attr}="/images/{path}"'
+
+    fragment = re.sub(r'(src|href)="/images/([^"]+)"', repl, fragment)
+    return fragment.strip()
+
+
+def extract_title_and_content(doc: str) -> tuple[str, str]:
+    title = ""
+    m = re.search(r'<h2 class="fa-postheader">\s*(.*?)\s*</h2>', doc, re.I | re.S)
+    if not m:
+        m = re.search(r'<h2 class="itemTitle">\s*(.*?)\s*</h2>', doc, re.I | re.S)
+    if not m:
+        m = re.search(r'<meta property="og:title" content="([^"]+)"', doc, re.I | re.S)
+    if not m:
+        m = re.search(r"<title>\s*(.*?)\s*</title>", doc, re.I | re.S)
+    if m:
+        title = html.unescape(re.sub(r"<.*?>", "", m.group(1))).strip()
+    m = re.search(r'<div class="fa-article">\s*(.*?)\s*</div>\s*</div>\s*<div class="cleared"', doc, re.I | re.S)
+    if not m:
+        m = re.search(r'<div class="itemFullText">\s*(.*?)\s*</div>', doc, re.I | re.S)
+    if not m:
+        m = re.search(r'<div class="fa-article">\s*(.*?)\s*</div>', doc, re.I | re.S)
+    content = clean_fragment(m.group(1)) if m else ""
+    return title, content
+
+
+def extract_author(doc: str) -> Optional[str]:
+    m = re.search(r'<meta name="author" content="([^"]+)"', doc, re.I)
+    if m:
+        return html.unescape(m.group(1)).strip()
+    m = re.search(r'<a rel="author"[^>]*>\s*(.*?)\s*</a>', doc, re.I | re.S)
+    if m:
+        return html.unescape(re.sub(r"<.*?>", "", m.group(1))).strip()
+    return None
+
+
+def iter_paragraphs(content: str):
+    for m in re.finditer(r"<p\b[^>]*>(.*?)</p>", content, flags=re.I | re.S):
+        yield m.group(1)
+
+
+def links_by_section(carta: Item) -> list[tuple[str, str, str, Optional[str]]]:
+    section = None
+    evangelio_pos = 0
+    out = []
+    for p in iter_paragraphs(carta.content):
+        plain = normalize(re.sub(r"<.*?>", " ", html.unescape(p)))
+        if "evangelio y comentarios al evangelio" in plain:
+            section = "evangelio"
+            evangelio_pos = 0
+            continue
+        if "articulos seleccionados para la semana" in plain:
+            section = "articulo"
+            continue
+        if "eucaristias mas participativas" in plain:
+            section = "eucaristia"
+            continue
+        if "material multimedia" in plain:
+            section = "multimedia"
+            continue
+        if not section:
+            continue
+        for href, text in re.findall(r'<a\b[^>]*href="([^"]+)"[^>]*>(.*?)</a>', p, flags=re.I | re.S):
+            href = html.unescape(href)
+            text_plain = html.unescape(re.sub(r"<.*?>", " ", text))
+            text_plain = re.sub(r"\s+", " ", text_plain).strip()
+            if section == "evangelio":
+                if evangelio_pos == 0:
+                    cat = "lectura"
+                elif evangelio_pos == 1:
+                    cat = "comentario_editorial"
+                else:
+                    cat = "comentario"
+                evangelio_pos += 1
+            else:
+                cat = section
+            author = text_plain.split(":", 1)[0].strip() if ":" in text_plain else None
+            out.append((href, cat, text_plain, author))
+    return out
+
+
+def load_existing(c, meta_key: str) -> set[int]:
+    with c.cursor() as cur:
+        cur.execute(
+            "SELECT CAST(meta_value AS UNSIGNED) id FROM wp_postmeta WHERE meta_key=%s",
+            (meta_key,),
+        )
+        return {int(r["id"]) for r in cur.fetchall() if r["id"] is not None}
+
+
+def max_existing(ids: set[int]) -> int:
+    return max(ids) if ids else 0
+
+
+def load_terms(c) -> dict[int, int]:
+    term_ids = [
+        TERM_FEADULTA,
+        TERM_CARTA_SEMANA,
+        TERM_CARTAS_OTRAS,
+        TERM_CARTA_PASADA,
+        TERM_INDICE_MULTIMEDIA,
+        TERM_VIDEOS,
+        TERM_LECTURA,
+        TERM_COMENTARIO_EDITORIAL,
+        TERM_COMENTARIO,
+        TERM_EUCARISTIA,
+        TERM_MULTIMEDIA,
+        TERM_ARTICULOS,
+    ]
+    with c.cursor() as cur:
+        cur.execute(
+            "SELECT term_id, term_taxonomy_id FROM wp_term_taxonomy "
+            "WHERE taxonomy='category' AND term_id IN (%s)" % ",".join(["%s"] * len(term_ids)),
+            term_ids,
+        )
+        return {int(r["term_id"]): int(r["term_taxonomy_id"]) for r in cur.fetchall()}
+
+
+def load_lang_es(c) -> Optional[int]:
+    with c.cursor() as cur:
+        cur.execute(
+            "SELECT tt.term_taxonomy_id FROM wp_terms t "
+            "JOIN wp_term_taxonomy tt ON tt.term_id=t.term_id "
+            "WHERE tt.taxonomy='language' AND t.slug='es' LIMIT 1"
+        )
+        row = cur.fetchone()
+        return int(row["term_taxonomy_id"]) if row else None
+
+
+def load_authors(c) -> dict[str, int]:
+    with c.cursor() as cur:
+        cur.execute("SELECT ID, display_name, user_login FROM wp_users")
+        rows = cur.fetchall()
+    authors = {}
+    for r in rows:
+        authors[normalize(r["display_name"])] = int(r["ID"])
+        authors[normalize(r["user_login"])] = int(r["ID"])
+    return authors
+
+
+def resolve_author(author_map: dict[str, int], name: Optional[str]) -> int:
+    if not name:
+        return 1
+    n = normalize(name)
+    if n in author_map:
+        return author_map[n]
+    for key, uid in author_map.items():
+        if n == key or n in key or key in n:
+            return uid
+    return 1
+
+
+def build_items(c) -> list[Item]:
+    """Discover and fetch the Joomla items that are not yet in WordPress.
+
+    Each page in CARTAS is fetched and its body scanned with links_by_section();
+    links to K2 items and to multimedia/video articles become Item objects keyed
+    by (kind, source_id). An id counts as new only when it is greater than the
+    highest id already recorded in wp_postmeta and is not itself recorded.
+
+    Args:
+        c: Open database connection, passed to load_existing().
+
+    Returns:
+        The new items sorted by (date, kind, source_id).
+    """
+    existing_k2 = load_existing(c, "_fgj2wp_old_k2_id")
+    existing_content = load_existing(c, "_fgj2wp_old_content_id")
+    # Highest ids already imported; anything at or below them is skipped as old.
+    max_k2 = max_existing(existing_k2)
+    max_content = max_existing(existing_content)
+    print(
+        f"WP existentes: K2={len(existing_k2)} max={max_k2} "
+        f"content={len(existing_content)} max={max_content}"
+    )
+
+    items: dict[tuple[str, int], Item] = {}
+    for carta_def in CARTAS:
+        doc = fetch(carta_def["url"])
+        title, content = extract_title_and_content(doc)
+        carta = Item(
+            kind="content",
+            source_id=carta_def["content_id"],
+            url=carta_def["url"],
+            title=title,
+            content=content,
+            slug=slug_from_url(carta_def["url"]),
+            date=carta_def["date"],
+            term_ids=set(carta_def["cats"]),
+        )
+        # The carta itself is queued only when it is new, but its links are
+        # scanned either way.
+        if carta.source_id > max_content and carta.source_id not in existing_content:
+            items[(carta.kind, carta.source_id)] = carta
+
+        for href, cat_name, _text, author in links_by_section(carta):
+            if "/buscadoravanzado/item/" in href:
+                # K2 item link.
+                sid = id_from_url(href)
+                if not sid or sid <= max_k2 or sid in existing_k2:
+                    continue
+                key = ("k2", sid)
+                item = items.get(key)
+                if not item:
+                    item = Item(
+                        kind="k2",
+                        source_id=sid,
+                        url=href,
+                        slug=slug_from_url(href),
+                        date=carta.date,
+                        author_name=author,
+                        term_ids={TERM_FEADULTA},
+                        carta_source_id=carta.source_id,
+                    )
+                    items[key] = item
+                # A K2 item linked from several sections or cartas accumulates
+                # one term per section; the other fields keep their first values.
+                item.term_ids.add(SECTION_TO_TERM[cat_name])
+            elif "/indice-multimedia/" in href or "/videos/" in href:
+                # Multimedia or video article (Joomla content id).
+                sid = id_from_url(href)
+                if not sid or sid <= max_content or sid in existing_content:
+                    continue
+                is_video = "/videos/" in href
+                key = ("content", sid)
+                item = items.get(key)
+                if not item:
+                    item = Item(
+                        kind="content",
+                        source_id=sid,
+                        url=href,
+                        slug=slug_from_url(href),
+                        date=carta.date,
+                        term_ids={TERM_MULTIMEDIA, TERM_VIDEOS if is_video else TERM_INDICE_MULTIMEDIA},
+                    )
+                    items[key] = item
+
+    # Fetch item pages after discovery.
+    for item in items.values():
+        # Items that already have both title and content (the cartas fetched
+        # above) are not downloaded again.
+        if item.title and item.content:
+            continue
+        doc = fetch(item.url)
+        title, content = extract_title_and_content(doc)
+        author = extract_author(doc)
+        # Fall back to a title derived from the slug when the page yields none.
+        item.title = title or item.slug.replace("-", " ").title()
+        item.content = content
+        # An author taken from the carta link text wins over the page's own author.
+        if author and not item.author_name:
+            item.author_name = author
+    return sorted(items.values(), key=lambda x: (x.date, x.kind, x.source_id))
+
+
+def insert_item(c, item: Item, term_to_tt: dict[int, int], lang_es_tt: Optional[int], author_map: dict[str, int], dry_run: bool) -> Optional[int]:
+    author_id = resolve_author(author_map, item.author_name)
+    if dry_run:
+        print(
+            f"[DRY] {item.kind:7s} {item.source_id:5d} "
+            f"terms={sorted(item.term_ids)} author={author_id} {item.title[:70]}"
+        )
+        return None
+    with c.cursor() as cur:
+        cur.execute(
+            """
+            INSERT INTO wp_posts
+              (post_author, post_date, post_date_gmt, post_content, post_title,
+               post_excerpt, post_status, comment_status, ping_status, post_name,
+               post_type, post_modified, post_modified_gmt, comment_count,
+               to_ping, pinged, post_content_filtered)
+            VALUES
+              (%s,%s,%s,%s,%s,'','publish','open','open',%s,
+               'post',%s,%s,0,'','','')
+            """,
+            (author_id, item.date, item.date, item.content, item.title, item.slug, item.date, item.date),
+        )
+        post_id = cur.lastrowid
+        meta_key = "_fgj2wp_old_k2_id" if item.kind == "k2" else "_fgj2wp_old_content_id"
+        cur.execute(
+            "INSERT INTO wp_postmeta (post_id, meta_key, meta_value) VALUES (%s,%s,%s)",
+            (post_id, meta_key, str(item.source_id)),
+        )
+        cur.execute(
+            "INSERT INTO wp_postmeta (post_id, meta_key, meta_value) VALUES (%s,'Idioma','1')",
+            (post_id,),
+        )
+        for term_id in sorted(item.term_ids):
+            tt = term_to_tt.get(term_id)
+            if tt:
+                cur.execute(
+                    "INSERT IGNORE INTO wp_term_relationships (object_id, term_taxonomy_id) VALUES (%s,%s)",
+                    (post_id, tt),
+                )
+        if lang_es_tt:
+            cur.execute(
+                "INSERT IGNORE INTO wp_term_relationships (object_id, term_taxonomy_id) VALUES (%s,%s)",
+                (post_id, lang_es_tt),
+            )
+        return post_id
+
+
+def refresh_counts(c, term_to_tt: dict[int, int], lang_es_tt: Optional[int]):
+    ttids = list(term_to_tt.values())
+    if lang_es_tt:
+        ttids.append(lang_es_tt)
+    with c.cursor() as cur:
+        cur.execute(
+            "UPDATE wp_term_taxonomy tt SET count = ("
+            "SELECT COUNT(*) FROM wp_term_relationships tr "
+            "WHERE tr.term_taxonomy_id=tt.term_taxonomy_id"
+            ") WHERE tt.term_taxonomy_id IN (%s)" % ",".join(["%s"] * len(ttids)),
+            ttids,
+        )
+
+
+def main():
+    ap = argparse.ArgumentParser()
+    ap.add_argument("--apply", action="store_true", help="escribe en WordPress local")
+    args = ap.parse_args()
+    dry_run = not args.apply
+
+    c = conn()
+    try:
+        term_to_tt = load_terms(c)
+        lang_es_tt = load_lang_es(c)
+        author_map = load_authors(c)
+        items = build_items(c)
+        print(f"Items nuevos detectados: {len(items)}")
+        print(
+            "  K2:",
+            len([i for i in items if i.kind == "k2"]),
+            "content:",
+            len([i for i in items if i.kind == "content"]),
+        )
+
+        source_to_wp = {}
+        for item in items:
+            wp_id = insert_item(c, item, term_to_tt, lang_es_tt, author_map, dry_run)
+            if wp_id:
+                source_to_wp[(item.kind, item.source_id)] = wp_id
+
+        if not dry_run:
+            with c.cursor() as cur:
+                for item in items:
+                    if item.kind != "k2" or not item.carta_source_id:
+                        continue
+                    wp_id = source_to_wp.get(("k2", item.source_id))
+                    carta_wp_id = source_to_wp.get(("content", item.carta_source_id))
+                    if wp_id and carta_wp_id:
+                        cur.execute(
+                            "INSERT IGNORE INTO wp_postmeta (post_id, meta_key, meta_value) VALUES (%s,'_carta_id',%s)",
+                            (wp_id, str(carta_wp_id)),
+                        )
+            refresh_counts(c, term_to_tt, lang_es_tt)
+            c.commit()
+            print("Import commit OK.")
+        else:
+            c.rollback()
+            print("Dry-run: sin cambios.")
+    except Exception:
+        c.rollback()
+        raise
+    finally:
+        c.close()
+
+
+# Run the importer only when this file is executed as a script, not when imported.
+if __name__ == "__main__":
+    main()