#!/usr/bin/env python3
"""
audit_translations.py

Audits all new translated posts (ID > 42760) to check:
  - Assigned Polylang language
  - Detected language of the title
  - Detected language of the content
Flags mismatches, prints a per-language report and writes a CSV summary.
"""
import csv
import html
import re
from typing import Any, Dict, List, Optional

import pymysql
from langdetect import DetectorFactory, LangDetectException, detect

# langdetect is randomised by default; a fixed seed makes repeated audit
# runs report the same posts.
DetectorFactory.seed = 0

# NOTE(review): connection settings (including the password) are hard-coded;
# consider loading them from the environment before sharing this script.
DB = dict(host='172.18.0.2', port=3306,
          user='wordpress_user', password='wordpress_pass',
          database='wordpress_db', charset='utf8mb4',
          cursorclass=pymysql.cursors.DictCursor)

# Map langdetect codes to our Polylang slugs
LANG_MAP = {
    'es': 'es',
    'pt': 'pt',
    'fr': 'fr',
    'en': 'en',
    'it': 'it',
    'ca': 'es',  # Catalan often confused with Spanish
}

# Only posts with an ID strictly greater than this are audited.
MIN_POST_ID = 42760

# Where the CSV summary of flagged posts is written.
CSV_PATH = '/tmp/translation_audit.csv'

# Number of leading plain-text characters of the content used for detection.
CONTENT_SAMPLE_CHARS = 600

# The two languages whose detection results are tolerated as interchangeable.
_ES_PT = ('es', 'pt')

_TAG_RE = re.compile(r'<[^>]+>')
_WHITESPACE_RE = re.compile(r'\s+')

# Selects every published non-Spanish post above the ID threshold together
# with its Polylang language slug and, via the shared 'post_translations'
# term, the title of the Spanish post in the same translation group.
# The single %s placeholder is bound to the minimum post ID by the driver.
_POSTS_QUERY = """
    SELECT p.ID, p.post_title, p.post_content,
           t_lang.slug as assigned_lang,
           (
             SELECT p2.post_title
             FROM wp_posts p2
             JOIN wp_term_relationships trl2 ON p2.ID=trl2.object_id
             JOIN wp_term_taxonomy ttl2 ON trl2.term_taxonomy_id=ttl2.term_taxonomy_id AND ttl2.taxonomy='language'
             JOIN wp_terms tl2 ON ttl2.term_id=tl2.term_id AND tl2.slug='es'
             JOIN wp_term_relationships trg2 ON p2.ID=trg2.object_id
             JOIN wp_term_taxonomy ttg2 ON trg2.term_taxonomy_id=ttg2.term_taxonomy_id AND ttg2.taxonomy='post_translations'
             WHERE ttg2.term_taxonomy_id = (
               SELECT ttg3.term_taxonomy_id
               FROM wp_term_relationships trg3
               JOIN wp_term_taxonomy ttg3 ON trg3.term_taxonomy_id=ttg3.term_taxonomy_id AND ttg3.taxonomy='post_translations'
               WHERE trg3.object_id=p.ID
               LIMIT 1
             )
             LIMIT 1
           ) as es_title
    FROM wp_posts p
    JOIN wp_term_relationships trl ON p.ID=trl.object_id
    JOIN wp_term_taxonomy ttl ON trl.term_taxonomy_id=ttl.term_taxonomy_id AND ttl.taxonomy='language'
    JOIN wp_terms t_lang ON ttl.term_id=t_lang.term_id
    WHERE p.ID > %s
      AND p.post_type='post'
      AND p.post_status='publish'
      AND t_lang.slug != 'es'
    ORDER BY t_lang.slug, p.ID
"""


def strip_html(text):
    """Return *text* with tags removed, entities decoded and whitespace collapsed.

    Falsy input (``None`` or an empty string) yields an empty string.
    """
    if not text:
        return ''
    text = _TAG_RE.sub(' ', text)
    text = html.unescape(text)
    return _WHITESPACE_RE.sub(' ', text).strip()


def detect_lang(text, min_len=50):
    """Return the langdetect code for *text*, or ``None`` when undetectable.

    Text shorter than *min_len* characters (after stripping surrounding
    whitespace) is not passed to the detector, and a detector failure is
    reported as ``None`` rather than raised.
    """
    text = (text or '').strip()
    if len(text) < min_len:
        return None
    try:
        return detect(text)
    except LangDetectException:
        return None


def _normalise(lang_code: Optional[str]) -> Optional[str]:
    """Map a langdetect code to a Polylang slug; unknown codes pass through."""
    return LANG_MAP.get(lang_code, lang_code)


def _is_es_pt_pair(detected: str, assigned: str) -> bool:
    """Return True when both languages belong to the es/pt pair."""
    return detected in _ES_PT and assigned in _ES_PT


def _fetch_posts(min_post_id: int = MIN_POST_ID) -> List[Dict[str, Any]]:
    """Fetch the rows to audit, closing the connection even on failure."""
    db = pymysql.connect(**DB)
    try:
        with db.cursor() as cursor:
            cursor.execute(_POSTS_QUERY, (min_post_id,))
            return list(cursor.fetchall())
    finally:
        db.close()


def _audit_post(post: Dict[str, Any]) -> Optional[Dict[str, Any]]:
    """Check one row and return an issue record, or ``None`` if it looks fine."""
    assigned = post['assigned_lang']
    title = post['post_title'] or ''
    es_title = post['es_title'] or ''
    # Only the beginning of the plain-text content is used for detection.
    content = strip_html(post['post_content'] or '')[:CONTENT_SAMPLE_CHARS]

    problems = []

    # Content language mismatch
    content_lang = _normalise(detect_lang(content, min_len=100))
    if content_lang and content_lang != assigned:
        # Allow es/pt confusion only if the sample is very short
        if not (_is_es_pt_pair(content_lang, assigned) and len(content) < 200):
            problems.append(f"content={content_lang}≠{assigned}")

    # Title identical (ignoring case and surrounding whitespace) to the
    # Spanish post of the same translation group: it was never translated.
    es_key = es_title.strip().lower()
    if es_key and title.strip().lower() == es_key:
        problems.append("title=ES_ORIGINAL")
    else:
        # Titles under 30 characters are not detected at all, which makes a
        # separate length check on the title unnecessary here.
        title_lang = _normalise(detect_lang(title, min_len=30))
        if (title_lang and title_lang != assigned
                and not _is_es_pt_pair(title_lang, assigned)):
            problems.append(f"title_lang={title_lang}≠{assigned}")

    if not problems:
        return None
    return {
        'id': post['ID'],
        'assigned': assigned,
        'problems': problems,
        'title': title[:70],
        'content_start': content[:80],
    }


def _print_report(issues: List[Dict[str, Any]], total: int) -> None:
    """Print the issue count followed by the issues grouped by language."""
    print('=' * 70)
    print(f"ISSUES FOUND: {len(issues)} out of {total} posts")
    print('=' * 70 + "\n")

    by_lang: Dict[str, List[Dict[str, Any]]] = {}
    for issue in issues:
        by_lang.setdefault(issue['assigned'], []).append(issue)

    for lang in sorted(by_lang):
        lang_issues = by_lang[lang]
        print(f"--- {lang.upper()} ({len(lang_issues)} issues) ---")
        # Order by the first problem so similar findings are listed together.
        for issue in sorted(lang_issues, key=lambda x: x['problems'][0]):
            print(f" [{issue['id']}] {', '.join(issue['problems'])}")
            print(f" Title: {issue['title']}")
            print(f" Content: {issue['content_start']}")
        print()


def _write_csv(issues: List[Dict[str, Any]], path: str = CSV_PATH) -> None:
    """Write the issues to *path* as UTF-8 CSV.

    An explicit encoding is required because the problem descriptions
    contain non-ASCII characters; the csv module takes care of quoting.
    """
    with open(path, 'w', encoding='utf-8', newline='') as f:
        writer = csv.writer(f, lineterminator='\n')
        writer.writerow(
            ['id', 'assigned_lang', 'problems', 'title', 'content_start'])
        writer.writerows(
            [issue['id'], issue['assigned'], ','.join(issue['problems']),
             issue['title'], issue['content_start']]
            for issue in issues
        )


def main():
    """Run the audit: fetch posts, flag mismatches, print and save the results."""
    posts = _fetch_posts()
    print(f"Auditing {len(posts)} translated posts...\n")

    issues = []
    for post in posts:
        issue = _audit_post(post)
        if issue is not None:
            issues.append(issue)

    _print_report(issues, len(posts))

    # Write CSV for easier review
    _write_csv(issues)
    print(f"CSV saved to {CSV_PATH}")


if __name__ == '__main__':
    main()