Files
feadulta/scripts/detect_untranslated.php
T

78 lines
3.0 KiB
PHP

<?php
/**
* Issue #80 — detects translations that still contain untranslated Spanish fragments.
* Signal: an untranslated sentence stays IDENTICAL to the one in the original ES post.
* For each translation (en/fr/it/pt), its sentences are compared against the set of
* sentences of the linked ES post (Polylang), and the ratio (0–1) of characters that
* match literally is computed.
*
* Usage (inside the container): php detect_untranslated.php [threshold] [status]
* threshold: minimum ratio required to flag a post (default 0.12)
* status: draft (default) | publish | any (any = draft + publish only)
*/
// Bootstrap WordPress; the rest of the script relies on $wpdb, get_post() and
// the Polylang pll_* helpers being loaded by this include.
require "/var/www/html/wp-load.php";
global $wpdb;

// CLI arguments: [1] = minimum match ratio to flag a post, [2] = post status filter.
// Reject a non-numeric threshold explicitly: a plain (float) cast would turn it
// into 0.0, which flags every post and dumps every ID for reprocessing.
if (isset($argv[1]) && !is_numeric($argv[1])) {
    fwrite(STDERR, "Umbral inválido: '{$argv[1]}' (se espera un número, p. ej. 0.12)\n");
    exit(1);
}
$THRESH = isset($argv[1]) ? (float)$argv[1] : 0.12;
$STATUS = $argv[2] ?? 'draft';
/**
 * Reduces post HTML to plain text suitable for sentence splitting.
 *
 * Applied in order: closing </p>, </h1>-</h6> and <br> tags become newlines,
 * every remaining tag becomes a space, every [bracketed] span (e.g. a
 * shortcode) becomes a space, and finally HTML entities are decoded.
 *
 * @param string $html Raw post content.
 * @return string Plain text; whitespace is NOT collapsed here.
 */
function norm_text($html) {
    // Order matters: line-breaking tags must be converted before the generic
    // tag stripper runs, otherwise they would be flattened to spaces.
    $rules = [
        '~(?i)</p>|<br\s*/?>|</h[1-6]>~' => "\n",
        '~<[^>]+>~' => ' ',
        '~\[[^\]]+\]~' => ' ',
    ];
    $text = $html;
    foreach ($rules as $pattern => $replacement) {
        $text = preg_replace($pattern, $replacement, $text);
    }
    return html_entity_decode($text, ENT_QUOTES);
}
/**
 * Splits post HTML into normalized sentences of at least 40 characters
 * (shorter ones produce false positives).
 *
 * Text is split after sentence-ending punctuation (. ! ? …) followed by
 * whitespace, and on newlines. Each piece has its whitespace collapsed, is
 * trimmed and lower-cased.
 *
 * @param string $html Raw post content.
 * @return array Map of normalized sentence => its length in characters;
 *               repeated sentences collapse into a single entry.
 */
function sentences($html) {
    $found = [];
    $chunks = preg_split('~(?<=[.!?…])\s+|\n+~u', norm_text($html));
    foreach ($chunks as $chunk) {
        $sentence = mb_strtolower(trim(preg_replace('~\s+~u', ' ', $chunk)));
        $length = mb_strlen($sentence);
        if ($length < 40) {
            continue; // too short to be a reliable signal
        }
        $found[$sentence] = $length;
    }
    return $found;
}
// Collect the IDs of all posts of type 'post' that carry a Polylang language
// term in en/fr/it/pt and have one of the requested statuses.
// 'any' expands to draft + publish only; any other value is used verbatim.
$statuses = $STATUS === 'any' ? ['draft','publish'] : [$STATUS];

// The status comes from the command line, so it is bound through prepare()
// rather than concatenated into the SQL. One %s placeholder per status.
$placeholders = implode(',', array_fill(0, count($statuses), '%s'));

// Table names come from $wpdb so the query also works on installs whose
// table prefix is not the default "wp_".
$ids = $wpdb->get_col(
    $wpdb->prepare(
        "SELECT p.ID FROM {$wpdb->posts} p
        JOIN {$wpdb->term_relationships} tr ON tr.object_id=p.ID
        JOIN {$wpdb->term_taxonomy} tt ON tt.term_taxonomy_id=tr.term_taxonomy_id AND tt.taxonomy='language'
        JOIN {$wpdb->terms} t ON t.term_id=tt.term_id AND t.slug IN ('en','fr','it','pt')
        WHERE p.post_type='post' AND p.post_status IN ($placeholders)
        GROUP BY p.ID",
        $statuses
    )
);
// $by_lang:   per-language counters ('n' = posts analysed, 'bad' = posts flagged).
// $offenders: flagged posts as [id, lang, es_id, ratio, title] tuples.
$by_lang = []; $offenders = [];
foreach ($ids as $id) {
    $lang = pll_get_post_language($id);
    $es = pll_get_post((int)$id, 'es');
    if (!$es) continue; // no linked ES original to compare against

    // Load each post once, and skip the pair if either side cannot be loaded
    // (e.g. the linked ES post no longer exists) instead of dereferencing null.
    $post = get_post($id);
    $es_post = get_post($es);
    if (!$post || !$es_post) continue;

    $tr_s = sentences($post->post_content);
    if (!$tr_s) continue; // no sentence long enough to evaluate
    $es_s = sentences($es_post->post_content);
    if (!$es_s) continue;

    // Ratio = characters in sentences that appear verbatim in the ES text,
    // over all characters in the translation's evaluated sentences.
    // $total is > 0 here because $tr_s is non-empty and every length is >= 40.
    $total = array_sum($tr_s);
    $match = array_sum(array_intersect_key($tr_s, $es_s));
    $ratio = $match / $total;

    $by_lang[$lang]['n'] = ($by_lang[$lang]['n'] ?? 0) + 1;
    if ($ratio >= $THRESH) {
        $by_lang[$lang]['bad'] = ($by_lang[$lang]['bad'] ?? 0) + 1;
        $offenders[] = [$id, $lang, $es, round($ratio, 2), $post->post_title];
    }
}
// Report: flagged posts sorted by ratio (highest first), then a per-language
// summary, then the grand total.
usort($offenders, fn($a, $b) => $b[3] <=> $a[3]);
echo "=== Traducciones con fragmentos ES (ratio >= $THRESH, status=$STATUS) ===\n";
foreach ($offenders as $o) {
    // Tuple layout: [0] id, [1] lang, [2] ES post id, [3] ratio, [4] title.
    printf("#%d [%s] ratio=%.2f es=%d %s\n", $o[0], $o[1], $o[3], $o[2], mb_substr($o[4], 0, 45));
}
echo "\n--- resumen por idioma ---\n";
foreach ($by_lang as $l => $d) {
    printf("%s: %d/%d con fragmentos ES\n", $l, $d['bad'] ?? 0, $d['n']);
}
echo "TOTAL ofensores: " . count($offenders) . "\n";

// Dump the flagged IDs (one per line) for the reprocessing step. A failed
// write must not go unnoticed: a stale file from a previous run would
// otherwise be reprocessed.
$dump_path = '/tmp/untranslated_ids.txt';
if (file_put_contents($dump_path, implode("\n", array_column($offenders, 0))) === false) {
    fwrite(STDERR, "ERROR: no se pudo escribir $dump_path\n");
    exit(1);
}