# feadulta/scripts/audit_translations.py

#!/usr/bin/env python3
"""
audit_translations.py

Audits all new translated posts (ID > 42760) to check:
- Assigned Polylang language
- Detected language of the title
- Detected language of the content
Flags mismatches.
"""

import csv
import html
import re

import pymysql
from langdetect import DetectorFactory
from langdetect import detect, LangDetectException

# Keyword arguments handed to pymysql.connect(). DictCursor makes every
# fetched row a dict keyed by column name/alias.
# NOTE(review): credentials are hard-coded here.
DB = {
    'host': '172.18.0.2',
    'port': 3306,
    'user': 'wordpress_user',
    'password': 'wordpress_pass',
    'database': 'wordpress_db',
    'charset': 'utf8mb4',
    'cursorclass': pymysql.cursors.DictCursor,
}

# Translation table from langdetect result codes to Polylang language slugs.
# The five supported languages map to themselves.
LANG_MAP = {code: code for code in ('es', 'pt', 'fr', 'en', 'it')}
# Catalan often confused with Spanish, so a 'ca' detection counts as 'es'.
LANG_MAP['ca'] = 'es'

def strip_html(text):
    """Reduce an HTML fragment to plain, single-spaced text.

    Tags are replaced by a space, HTML entities are decoded afterwards, and
    every run of whitespace is collapsed to one space with the ends trimmed.
    Falsy input (``None`` or an empty string) yields ``''``.
    """
    if text:
        # Tags go first so that entities such as "&lt;b&gt;" survive as literal text.
        without_tags = re.sub(r'<[^>]+>', ' ', text)
        decoded = html.unescape(without_tags)
        return re.sub(r'\s+', ' ', decoded).strip()
    return ''

def detect_lang(text, min_len=50):
    """Return the langdetect language code for *text*, or ``None``.

    Args:
        text: String to classify. ``None`` or an empty string is accepted and
            yields ``None`` (mirrors how ``strip_html`` tolerates falsy input).
        min_len: Minimum length of the stripped text; shorter input is not
            passed to the detector at all and yields ``None``.

    Returns:
        The code returned by ``langdetect.detect``, or ``None`` when the text
        is missing, too short, or the detector raises ``LangDetectException``.
    """
    if not text:
        return None
    text = text.strip()
    if len(text) < min_len:
        return None
    try:
        return detect(text)
    except LangDetectException:
        return None

def _fetch_posts():
    """Load the posts to audit from the WordPress database.

    Selects published posts of type 'post' with ID > 42760 whose 'language'
    taxonomy slug is not 'es'. For each row a correlated subquery looks up the
    title of the 'es' post that shares the same 'post_translations' term
    (exposed as ``es_title``; NULL when none is found).

    Returns:
        The rows from ``fetchall()``; with the DictCursor configured in ``DB``
        each row is a dict with keys ID, post_title, post_content,
        assigned_lang and es_title.
    """
    query = """
        SELECT p.ID, p.post_title, p.post_content,
               t_lang.slug as assigned_lang,
               (
                 SELECT p2.post_title FROM wp_posts p2
                 JOIN wp_term_relationships trl2 ON p2.ID=trl2.object_id
                 JOIN wp_term_taxonomy ttl2 ON trl2.term_taxonomy_id=ttl2.term_taxonomy_id AND ttl2.taxonomy='language'
                 JOIN wp_terms tl2 ON ttl2.term_id=tl2.term_id AND tl2.slug='es'
                 JOIN wp_term_relationships trg2 ON p2.ID=trg2.object_id
                 JOIN wp_term_taxonomy ttg2 ON trg2.term_taxonomy_id=ttg2.term_taxonomy_id AND ttg2.taxonomy='post_translations'
                 WHERE ttg2.term_taxonomy_id = (
                   SELECT ttg3.term_taxonomy_id FROM wp_term_relationships trg3
                   JOIN wp_term_taxonomy ttg3 ON trg3.term_taxonomy_id=ttg3.term_taxonomy_id AND ttg3.taxonomy='post_translations'
                   WHERE trg3.object_id=p.ID LIMIT 1
                 )
                 LIMIT 1
               ) as es_title
        FROM wp_posts p
        JOIN wp_term_relationships trl ON p.ID=trl.object_id
        JOIN wp_term_taxonomy ttl ON trl.term_taxonomy_id=ttl.term_taxonomy_id AND ttl.taxonomy='language'
        JOIN wp_terms t_lang ON ttl.term_id=t_lang.term_id
        WHERE p.ID > 42760 AND p.post_type='post' AND p.post_status='publish'
        AND t_lang.slug != 'es'
        ORDER BY t_lang.slug, p.ID
    """
    db = pymysql.connect(**DB)
    try:
        with db.cursor() as cursor:
            cursor.execute(query)
            return cursor.fetchall()
    finally:
        # Release the connection even when the query fails.
        db.close()


def _audit_post(p):
    """Check one post row and return an issue dict, or ``None`` if it is clean.

    Args:
        p: Row dict with keys ID, post_title, post_content, assigned_lang and
            es_title (any of the text values may be None).

    Returns:
        ``None`` when nothing is flagged, otherwise a dict with keys id,
        assigned, problems (list of strings), title (first 70 characters) and
        content_start (first 80 characters of the stripped content).
    """
    assigned = p['assigned_lang']
    title = p['post_title'] or ''
    es_title = (p['es_title'] or '').strip()
    content = strip_html(p['post_content'] or '')[:600]  # first 600 chars for detection

    content_lang = detect_lang(content, min_len=100)
    content_lang_norm = LANG_MAP.get(content_lang, content_lang)

    # The title counts as untranslated when it equals the Spanish sibling's
    # title (case-insensitively) and that sibling title is not empty.
    title_is_spanish = bool(es_title) and title.strip().lower() == es_title.lower()

    title_lang = detect_lang(title, min_len=30)
    title_lang_norm = LANG_MAP.get(title_lang, title_lang)

    problems = []

    if content_lang_norm and content_lang_norm != assigned:
        es_pt_pair = content_lang_norm in ('es', 'pt') and assigned in ('es', 'pt')
        # Allow es/pt confusion only if the sample is very short.
        if not (es_pt_pair and len(content) < 200):
            problems.append(f"content={content_lang_norm}≠{assigned}")

    if title_is_spanish:
        problems.append("title=ES_ORIGINAL")
    elif title_lang_norm and title_lang_norm != assigned:
        # No extra length check needed: a non-None title detection already
        # implies at least 30 characters (min_len above).
        # Allow es/pt confusion for titles regardless of length.
        if not (title_lang_norm in ('es', 'pt') and assigned in ('es', 'pt')):
            problems.append(f"title_lang={title_lang_norm}≠{assigned}")

    if not problems:
        return None
    return {
        'id': p['ID'],
        'assigned': assigned,
        'problems': problems,
        'title': title[:70],
        'content_start': content[:80],
    }


def _print_report(issues, total):
    """Print the issue count and the issues grouped by assigned language."""
    print('=' * 70)
    print(f"ISSUES FOUND: {len(issues)} out of {total} posts")
    print('=' * 70 + '\n')

    by_lang = {}
    for issue in issues:
        by_lang.setdefault(issue['assigned'], []).append(issue)

    for lang in sorted(by_lang):
        lang_issues = by_lang[lang]
        print(f"--- {lang.upper()} ({len(lang_issues)} issues) ---")
        # Within a language, list issues ordered by their first problem string.
        for i in sorted(lang_issues, key=lambda x: x['problems'][0]):
            print(f"  [{i['id']}] {', '.join(i['problems'])}")
            print(f"    Title:   {i['title']}")
            print(f"    Content: {i['content_start']}")
        print()


def _write_csv(issues, path='/tmp/translation_audit.csv'):
    """Write the issues to *path* as UTF-8 CSV, one row per flagged post."""
    # Explicit UTF-8: rows contain '≠' and non-ASCII titles, which would fail
    # under a non-UTF-8 locale default. newline='' is what the csv module expects.
    with open(path, 'w', encoding='utf-8', newline='') as f:
        writer = csv.writer(f, lineterminator='\n')
        writer.writerow(['id', 'assigned_lang', 'problems', 'title', 'content_start'])
        for i in issues:
            writer.writerow([
                i['id'],
                i['assigned'],
                ','.join(i['problems']),
                i['title'],
                i['content_start'],
            ])


def main():
    """Audit translated posts, print a report and save it as CSV.

    Fetches the non-Spanish posts from the database, flags those whose
    detected content or title language disagrees with the assigned language
    (or whose title is still the Spanish original), prints the findings
    grouped by language and writes them to /tmp/translation_audit.csv.
    """
    # langdetect is non-deterministic unless seeded; fix the seed so repeated
    # audit runs over the same data give the same findings.
    DetectorFactory.seed = 0

    posts = _fetch_posts()
    print(f"Auditing {len(posts)} translated posts...\n")

    issues = [issue for issue in map(_audit_post, posts) if issue is not None]

    _print_report(issues, len(posts))

    # Write CSV for easier review
    _write_csv(issues)
    print("CSV saved to /tmp/translation_audit.csv")

# Run the audit only when this file is executed as a script, not on import.
if __name__ == '__main__':
    main()