<?php
/**
 * Issue #80 — detecta traducciones con fragmentos en español sin traducir.
 * Señal: una frase sin traducir queda IDÉNTICA a la del ES original.
 * Para cada traducción (en/fr/it/pt) compara sus frases contra el set de frases
 * del ES enlazado (Polylang) y calcula el % de caracteres que coinciden literal.
 *
 * Uso (en contenedor): php detect_untranslated.php [umbral] [status]
 *   umbral: ratio mínimo para marcar (def 0.12)
 *   status: draft (def) | publish | any
 */
// Load WordPress; the rest of the script relies on $wpdb and the pll_* functions.
require "/var/www/html/wp-load.php";
global $wpdb;

// CLI arguments:
//   $argv[1] minimum matched-character ratio for a post to be flagged (default 0.12)
//   $argv[2] post status filter: a literal status, or 'any' (default 'draft')
$THRESH = (float)($argv[1] ?? 0.12);
$STATUS = isset($argv[2]) ? $argv[2] : 'draft';

/**
 * Reduce post HTML to plain text.
 *
 * Closing </p>, <br> and closing </h1>..</h6> tags become newlines, every other
 * tag and every [bracketed] token becomes a single space, and HTML entities
 * (including both quote styles) are decoded afterwards.
 *
 * @param string $html Raw post content.
 * @return string Plain text; whitespace is not collapsed here.
 */
function norm_text($html) {
    // preg_replace() applies the pattern list in order, each on the previous result.
    $stripped = preg_replace(
        [
            '~(?i)</p>|<br\s*/?>|</h[1-6]>~',
            '~<[^>]+>~',
            '~\[[^\]]+\]~',
        ],
        [
            "\n",
            ' ',
            ' ',
        ],
        $html
    );

    return html_entity_decode($stripped, ENT_QUOTES);
}

/**
 * Split post HTML into normalized sentences of at least 40 characters
 * (shorter ones produce false positives).
 *
 * The text from norm_text() is split after '.', '!', '?' or '…' followed by
 * whitespace, and on newlines. Each piece has its whitespace collapsed, is
 * trimmed and lower-cased.
 *
 * @param string $html Raw post content.
 * @return array Map of normalized sentence => its length in characters.
 */
function sentences($html) {
    $lengths = [];

    foreach (preg_split('~(?<=[.!?…])\s+|\n+~u', norm_text($html)) as $chunk) {
        $sentence = mb_strtolower(trim(preg_replace('~\s+~u', ' ', $chunk)));
        $chars = mb_strlen($sentence);

        if ($chars < 40) {
            continue;
        }

        // Keyed by the sentence text, so a repeated sentence is stored once.
        $lengths[$sentence] = $chars;
    }

    return $lengths;
}

// Collect the IDs of posts whose 'language' taxonomy term is en/fr/it/pt and
// whose status matches the CLI filter ('any' expands to draft + publish).
$statuses = $STATUS === 'any' ? ['draft', 'publish'] : [$STATUS];

// $STATUS comes from the command line, so it is bound through prepare() rather
// than spliced into the SQL text: one %s placeholder per status value.
$placeholders = implode(',', array_fill(0, count($statuses), '%s'));

// Table names come from $wpdb so the query also works with a non-default
// table prefix instead of assuming "wp_".
$ids = $wpdb->get_col(
    $wpdb->prepare(
        "SELECT p.ID FROM {$wpdb->posts} p
         JOIN {$wpdb->term_relationships} tr ON tr.object_id=p.ID
         JOIN {$wpdb->term_taxonomy} tt ON tt.term_taxonomy_id=tr.term_taxonomy_id AND tt.taxonomy='language'
         JOIN {$wpdb->terms} t ON t.term_id=tt.term_id AND t.slug IN ('en','fr','it','pt')
         WHERE p.post_type='post' AND p.post_status IN ($placeholders)
         GROUP BY p.ID",
        $statuses
    )
);

// Compare every translated post against its linked Spanish post.
//   $by_lang[slug] = ['n' => posts compared, 'bad' => posts at or above $THRESH]
//   $offenders[]   = [post ID, language slug, ES post ID, rounded ratio, title]
$by_lang = [];
$offenders = [];

foreach ($ids as $id) {
    $id = (int)$id;

    $es = pll_get_post($id, 'es');
    if (!$es) {
        continue; // no linked Spanish post to compare against
    }

    // get_post() can return null; skip the pair instead of reading
    // ->post_content / ->post_title from null.
    $post = get_post($id);
    $es_post = get_post($es);
    if (!$post || !$es_post) {
        continue;
    }

    $tr_s = sentences($post->post_content);
    $es_s = sentences($es_post->post_content);
    if (!$tr_s || !$es_s) {
        continue; // one side has no sentence long enough to compare
    }

    // Both maps are keyed by sentence text with the character length as value,
    // so the matched character count is the sum over the keys present in both.
    $total = array_sum($tr_s);
    $match = array_sum(array_intersect_key($tr_s, $es_s));
    $ratio = $total ? $match / $total : 0;

    $lang = pll_get_post_language($id);
    $by_lang[$lang]['n'] = ($by_lang[$lang]['n'] ?? 0) + 1;

    // The threshold is applied to the exact ratio; only the stored value is rounded.
    if ($ratio >= $THRESH) {
        $by_lang[$lang]['bad'] = ($by_lang[$lang]['bad'] ?? 0) + 1;
        $offenders[] = [$id, $lang, $es, round($ratio, 2), $post->post_title];
    }
}

// Highest ratio first; index 3 of each offender row is the rounded ratio.
usort($offenders, function ($left, $right) {
    return $right[3] <=> $left[3];
});

// One line per flagged post, title cut to 45 characters.
echo "=== Traducciones con fragmentos ES (ratio >= $THRESH, status=$STATUS) ===\n";
foreach ($offenders as [$postId, $langSlug, $esId, $matchRatio, $postTitle]) {
    printf("#%d [%s] ratio=%.2f es=%d %s\n", $postId, $langSlug, $matchRatio, $esId, mb_substr($postTitle, 0, 45));
}

// Per-language summary: flagged posts / compared posts.
echo "\n--- resumen por idioma ---\n";
foreach ($by_lang as $slug => $counts) {
    printf("%s: %d/%d con fragmentos ES\n", $slug, $counts['bad'] ?? 0, $counts['n']);
}
echo "TOTAL ofensores: ", count($offenders), "\n";

// Dump the flagged post IDs, one per line, for the reprocessing step.
$flaggedIds = array_map(function ($row) {
    return $row[0];
}, $offenders);
file_put_contents('/tmp/untranslated_ids.txt', implode("\n", $flaggedIds));