Files
feadulta/scripts/audit_translations.py
T

152 lines
5.5 KiB
Python

#!/usr/bin/env python3
"""
audit_translations.py
Audits all new translated posts (ID > 42760) to check:
- Assigned Polylang language
- Detected language of the title
- Detected language of the content
Flags mismatches.
"""
import csv
import html
import os
import re

import pymysql
from langdetect import DetectorFactory
from langdetect import detect, LangDetectException
# Connection settings passed straight to pymysql.connect().
# Every credential can be overridden through an AUDIT_DB_* environment
# variable so secrets do not have to live in the source; the fallbacks are
# the values that used to be hard-coded, so default behaviour is unchanged.
DB = dict(
    host=os.environ.get('AUDIT_DB_HOST', '172.18.0.2'),
    port=int(os.environ.get('AUDIT_DB_PORT', '3306')),
    user=os.environ.get('AUDIT_DB_USER', 'wordpress_user'),
    password=os.environ.get('AUDIT_DB_PASSWORD', 'wordpress_pass'),
    database=os.environ.get('AUDIT_DB_NAME', 'wordpress_db'),
    charset='utf8mb4',
    # DictCursor: rows are returned as dicts keyed by column name/alias.
    cursorclass=pymysql.cursors.DictCursor,
)
# Translate langdetect codes into our Polylang slugs.
# These codes map onto the identically named slug.
LANG_MAP = {code: code for code in ('es', 'pt', 'fr', 'en', 'it')}
# Catalan is folded into Spanish because it is often confused with it.
LANG_MAP['ca'] = 'es'
def strip_html(text):
    """Convert an HTML fragment to plain, single-spaced text.

    Args:
        text: HTML string, or a falsy value (``None`` / ``''``).

    Returns:
        The text with comments, ``<script>``/``<style>`` elements (including
        their bodies) and all remaining tags removed, HTML entities decoded,
        and every whitespace run collapsed to one space. Falsy input yields
        ``''``.
    """
    if not text:
        return ''
    # Remove whole comments first: a plain tag regex stops at the first '>'
    # and would leave the rest of a comment such as "<!-- a > b -->" (or the
    # XML inside a conditional comment) behind as text.
    text = re.sub(r'<!--.*?-->', ' ', text, flags=re.DOTALL)
    # CSS and JavaScript bodies are not prose; drop the element with its body
    # so they cannot end up in the sample used for language detection.
    text = re.sub(r'<(script|style)\b[^>]*>.*?</\1\s*>', ' ', text,
                  flags=re.DOTALL | re.IGNORECASE)
    # Each remaining tag becomes a space so adjacent words do not fuse.
    text = re.sub(r'<[^>]+>', ' ', text)
    # Decode entities only after tag removal, so an escaped "&lt;b&gt;" stays
    # as literal text instead of being stripped as markup.
    text = html.unescape(text)
    return re.sub(r'\s+', ' ', text).strip()
def detect_lang(text, min_len=50):
    """Return the language code langdetect assigns to *text*, or None.

    Args:
        text: Plain text to classify. ``None`` is treated like an empty string.
        min_len: Minimum length, after stripping surrounding whitespace,
            required before detection is attempted.

    Returns:
        The code returned by ``langdetect.detect``, or ``None`` when the text
        is missing, shorter than *min_len*, or langdetect raises
        ``LangDetectException``.
    """
    # Guard falsy input the same way strip_html does, rather than failing
    # with AttributeError on None.
    if not text:
        return None
    text = text.strip()
    if len(text) < min_len:
        return None
    try:
        return detect(text)
    except LangDetectException:
        return None
# Where the machine-readable copy of the report is written.
_CSV_PATH = '/tmp/translation_audit.csv'
# Language pair for which a detector disagreement is tolerated (see _audit_post).
_ES_PT = ('es', 'pt')


def _fetch_posts():
    """Load the posts to audit and always close the connection afterwards.

    Selects published posts of type 'post' with ID > 42760 whose language
    slug is not 'es'. Each row holds ``ID``, ``post_title``, ``post_content``,
    ``assigned_lang`` (the language slug) and ``es_title`` (title of a post
    tagged 'es' in the same ``post_translations`` group, or None).
    """
    db = pymysql.connect(**DB)
    try:
        with db.cursor() as c:
            c.execute("""
                SELECT p.ID, p.post_title, p.post_content,
                       t_lang.slug as assigned_lang,
                       (
                           SELECT p2.post_title FROM wp_posts p2
                           JOIN wp_term_relationships trl2 ON p2.ID=trl2.object_id
                           JOIN wp_term_taxonomy ttl2 ON trl2.term_taxonomy_id=ttl2.term_taxonomy_id AND ttl2.taxonomy='language'
                           JOIN wp_terms tl2 ON ttl2.term_id=tl2.term_id AND tl2.slug='es'
                           JOIN wp_term_relationships trg2 ON p2.ID=trg2.object_id
                           JOIN wp_term_taxonomy ttg2 ON trg2.term_taxonomy_id=ttg2.term_taxonomy_id AND ttg2.taxonomy='post_translations'
                           WHERE ttg2.term_taxonomy_id = (
                               SELECT ttg3.term_taxonomy_id FROM wp_term_relationships trg3
                               JOIN wp_term_taxonomy ttg3 ON trg3.term_taxonomy_id=ttg3.term_taxonomy_id AND ttg3.taxonomy='post_translations'
                               WHERE trg3.object_id=p.ID LIMIT 1
                           )
                           LIMIT 1
                       ) as es_title
                FROM wp_posts p
                JOIN wp_term_relationships trl ON p.ID=trl.object_id
                JOIN wp_term_taxonomy ttl ON trl.term_taxonomy_id=ttl.term_taxonomy_id AND ttl.taxonomy='language'
                JOIN wp_terms t_lang ON ttl.term_id=t_lang.term_id
                WHERE p.ID > 42760 AND p.post_type='post' AND p.post_status='publish'
                AND t_lang.slug != 'es'
                ORDER BY t_lang.slug, p.ID
            """)
            return c.fetchall()
    finally:
        # Runs even when the query raises, so the connection is never leaked.
        db.close()


def _audit_post(p):
    """Check one post row; return an issue dict, or None when nothing is flagged.

    The issue dict has the keys ``id``, ``assigned``, ``problems`` (list of
    labels), ``title`` (first 70 chars) and ``content_start`` (first 80 chars
    of the stripped content).
    """
    assigned = p['assigned_lang']
    title = p['post_title'] or ''
    es_title = (p['es_title'] or '').strip()
    # Only the first 600 characters of plain text are used for detection.
    content = strip_html(p['post_content'] or '')[:600]

    content_lang = detect_lang(content, min_len=100)
    content_lang = LANG_MAP.get(content_lang, content_lang)
    title_lang = detect_lang(title, min_len=30)
    title_lang = LANG_MAP.get(title_lang, title_lang)

    problems = []

    if content_lang and content_lang != assigned:
        # An es/pt disagreement is ignored only when the sample is very short.
        short_es_pt_mixup = (
            content_lang in _ES_PT and assigned in _ES_PT and len(content) < 200
        )
        if not short_es_pt_mixup:
            # "!=" keeps the detected and assigned codes apart in the label.
            problems.append(f"content={content_lang}!={assigned}")

    # Title identical (ignoring case and outer whitespace) to the Spanish one.
    title_is_spanish = bool(es_title) and title.strip().lower() == es_title.lower()
    if title_is_spanish:
        problems.append("title=ES_ORIGINAL")
    elif title_lang and title_lang != assigned:
        # No extra length check is needed: with min_len=30 above, title_lang
        # is already None for any shorter title.
        # An es/pt disagreement is always tolerated for titles.
        if not (title_lang in _ES_PT and assigned in _ES_PT):
            problems.append(f"title_lang={title_lang}!={assigned}")

    if not problems:
        return None
    return {
        'id': p['ID'],
        'assigned': assigned,
        'problems': problems,
        'title': title[:70],
        'content_start': content[:80],
    }


def _print_report(issues, total):
    """Print the issue count followed by the issues grouped by assigned language."""
    print(f"{'='*70}")
    print(f"ISSUES FOUND: {len(issues)} out of {total} posts")
    print(f"{'='*70}\n")
    by_lang = {}
    for issue in issues:
        by_lang.setdefault(issue['assigned'], []).append(issue)
    for lang in sorted(by_lang):
        lang_issues = by_lang[lang]
        print(f"--- {lang.upper()} ({len(lang_issues)} issues) ---")
        # Within a language, order by the first problem label.
        for i in sorted(lang_issues, key=lambda x: x['problems'][0]):
            labels = ', '.join(i['problems'])
            print(f" [{i['id']}] {labels}")
            print(f" Title: {i['title']}")
            print(f" Content: {i['content_start']}")
        print()


def _write_csv(issues, path):
    """Write the issues to *path* as UTF-8 CSV, one row per flagged post."""
    # Explicit encoding: titles contain non-ASCII text and the platform
    # default may not be able to encode it. newline='' is what csv requires.
    with open(path, 'w', encoding='utf-8', newline='') as f:
        writer = csv.writer(f, lineterminator='\n')
        writer.writerow(['id', 'assigned_lang', 'problems', 'title', 'content_start'])
        for i in issues:
            writer.writerow([
                i['id'],
                i['assigned'],
                ','.join(i['problems']),
                i['title'],
                i['content_start'],
            ])


def main():
    """Audit the translated posts, print a report and save it as CSV.

    Reads the posts from the database configured in ``DB``, compares the
    detected language of each title and content with the assigned language,
    prints the flagged posts to stdout and writes them to
    ``/tmp/translation_audit.csv``.
    """
    # langdetect is non-deterministic unless seeded; a fixed seed makes
    # repeated audits of the same data produce the same report.
    DetectorFactory.seed = 0

    posts = _fetch_posts()
    print(f"Auditing {len(posts)} translated posts...\n")

    issues = [issue for issue in map(_audit_post, posts) if issue is not None]

    _print_report(issues, len(posts))
    _write_csv(issues, _CSV_PATH)
    print(f"CSV saved to {_CSV_PATH}")
# Run the audit only when this file is executed as a script, not on import.
if __name__ == "__main__":
    main()