rockbox/tools/updatelang
Solomon Peachy 73a47a1b5e updatelang: Make sure translated string has the correct format
We do this by parsing out the format specifiers and making sure the
translation has the correct number, type, and order of specifiers.
Percent literals ('%%') are ignored.

Mis-matched formats can lead to much badness, so to be safe, use the
untranslated string instead and flag it as a problem on the translation
site.

Change-Id: Ib48c2e5c3502735b1c724dd3621549faa8b602b7
2024-04-29 22:03:12 -04:00

534 lines
15 KiB
Perl
Executable file

#!/usr/bin/perl -s -w
#             __________               __   ___.
#   Open      \______   \ ____   ____ |  | _\_ |__   _______  ___
#   Source     |       _//  _ \_/ ___\|  |/ /| __ \ /  _ \  \/  /
#   Jukebox    |    |   (  <_> )  \___|    < | \_\ (  <_> > <  <
#   Firmware   |____|_  /\____/ \___  >__|_ \|___  /\____/__/\_ \
#                     \/            \/     \/    \/            \/
#
# Copyright (C) 2020 Solomon Peachy
#
use utf8;
use File::Basename;
# Return a copy of the given string with leading and trailing whitespace
# removed.  The caller's variable is not modified.
sub trim {
    my $text = shift;
    for ($text) {
        s/^\s+//;
        s/\s+$//;
    }
    return $text;
}
# Parse a .lang file into a hash of phrases.
#
# Arguments:
#   $filename - path of the language file to read
#
# Returns a hash (as a flat list) keyed by phrase id.  Every phrase is a hash
# ref holding whichever of the 'phrase', 'source', 'dest' and 'voice' sections
# were present (each a hash ref of key => string), plus 'notes' (initially "")
# and 'new' (initially 0).  Two bookkeeping keys are added:
#   'HEADER' - array ref of the '#' comment lines found outside phrases
#   'ORDER'  - array ref of phrase ids in the order their 'id:' lines appear
#
# Dies if the file cannot be opened.
sub parselangfile {
    my ($filename) = @_;
    my %phrases;
    my @order;
    my %empty = ( #'phrase' => {},
                  #'source' => {},
                  #'dest' => {},
                  #'voice' => {},
                  'notes' => "",
                  'new' => 0
        );
    my %thisphrase = %empty;

    # Three-argument open with a lexical handle: the file name is never
    # interpreted as a mode or a pipe, and no global bareword handle is
    # shared with other code.
    open(my $in, '<', $filename) || die("Can't open $filename: $!");
    my @lines = <$in>;
    close($in);

    my $pos = 'lang';   # section currently being read
    my $id = '';        # id of the phrase currently being read
    my @comments;

    foreach my $line (@lines) {
        $line = trim($line);

        if ($line =~ /^ *###/) {
            # Filter out warnings from prior runs
            next;
        } elsif ($line =~ /^ *#/) {
            # Ordinary comments are only kept while outside a phrase
            push(@comments, "$line\n") if ($pos eq 'lang');
            next;
        } elsif ($pos eq 'phrase' && $line =~ /^([^:]+): ?(.*)$/) {
            # "key: value" line directly inside <phrase>
            my ($key, $value) = ($1, $2);
            $thisphrase{$pos}->{$key} = $value;
            if ($key eq 'id') {
                push(@order, $value);
                $id = $value;
            }
        } elsif ($pos ne 'phrase' && $line =~ /^([^:]+): ?\"?([^\"]*)\"?$/) {
            # "tgt1,tgt2: "string"" line inside a section; every listed
            # target gets the same string.
            my @targets = split(',', $1);
            my $text = $2;

            foreach (@targets) {
                my $target = trim($_);

                # Convert some obsolete keys
                if ($target eq "swcodec") {
                    $target = "*";
                } elsif ($target eq "lcd_bitmap") {
                    $target = "*";
                } elsif ($target eq "recording_swcodec") {
                    $target = "recording";
#               } elsif ($id =~ /USB_MODE/ && $target =~ /ibassodx/) {
#                   $target = "*";
                }

                $thisphrase{$pos}->{$target} = $text;
            }
        }

        # Track which section the following lines belong to
        if ($line eq '</voice>' ||
            $line eq '</dest>' ||
            $line eq '</source>' ||
            $line eq '<phrase>') {
            $pos = 'phrase';
        } elsif ($line eq '</phrase>') {
            # Store a copy of the finished phrase and start afresh
            my %copy = %thisphrase;
            $phrases{$id} = \%copy;
            %thisphrase = %empty;
            $pos = 'lang';
            $id = '';
        } elsif ($line eq '<source>') {
            $pos = 'source';
        } elsif ($line eq '<dest>') {
            $pos = 'dest';
        } elsif ($line eq '<voice>') {
            $pos = 'voice';
        }
    }

    $phrases{'HEADER'} = \@comments;
    $phrases{'ORDER'} = \@order;
    return %phrases;
}
# Merge targets that carry the same string into one comma-separated key.
#
# Takes a flat target => string hash and returns a flat hash in which every
# group of targets sharing a string becomes a single "tgt1,tgt2,..." key
# (targets in sorted order) mapped to that string.  The '*' fallback is never
# merged with anything and is always copied through under its own key.
sub combinetgts {
    my (%tgtmap) = (@_);
    my %targets_for;    # string => [ targets using that string ]

    # Reverse-map things
    foreach my $tgt (sort(keys(%tgtmap))) {
        next if ($tgt eq '*'); # Do not combine anything with fallback
        push(@{$targets_for{$tgtmap{$tgt}}}, $tgt);
    }

    # Start from the default/fallback, which the loop above skipped
    my %combined = ('*' => $tgtmap{'*'});
    while (my ($str, $tgts) = each(%targets_for)) {
        $combined{join(',', @{$tgts})} = $str;
    }

    return %combined;
}
# Reduce a printf-style string to the signature of its format specifiers.
#
# Arguments:
#   $in - the string to scan (an undefined value is treated as empty)
#
# Returns the characters of every specifier (flags, width, length modifiers
# and conversion character) concatenated in order, without the '%' signs.
# Literal percents ('%%') contribute nothing.  Two strings with the same
# number, type and order of specifiers reduce to the same result.
sub reduceformat($) {
    my ($in) = @_;
    my $out = "";
    my $infmt = 0;

    return $out if (!defined($in));

    foreach my $c (split(//, $in)) {
        if (!$infmt) {
            # Outside a specifier only a '%' is interesting: it opens one
            $infmt = 1 if ($c eq '%');
            next;
        }

        # Ignore literal %, otherwise dump specifier over
        $out .= $c if ($c ne '%');

        # Look for a terminating field.  Besides the original set this also
        # recognises the remaining standard conversions (c i o p e E f F g G);
        # without them such a specifier never ended, so the text after it
        # leaked into the result and the next '%' was swallowed.
        $infmt = 0 if ($c =~ tr/sSdDuUxXzZciopeEfFgG%//);
    }

    return $out;
}
##################
# Exactly three arguments are required: master file, language file, output.
if ($#ARGV != 2) {
    print "Usage: updatelang <english.lang> <otherlang> <outfile|->\n";
    exit;
}

# Parse master file
my %english = parselangfile($ARGV[0]);
my @englishorder = @{$english{'ORDER'}};

# Parse secondary file
my %lang = parselangfile($ARGV[1]);
my @langorder = @{$lang{'ORDER'}};
my @langheader = @{$lang{'HEADER'}};

# Clean up: from here on both hashes hold nothing but phrases
delete $english{'ORDER'};
delete $english{'HEADER'};
delete $lang{'ORDER'};
delete $lang{'HEADER'};

# Extract language names: the base file name up to its first '.'
my ($f1) = split(/\./, basename($ARGV[0]));
my ($f2) = split(/\./, basename($ARGV[1]));

# Read in ignore list, which lives in the same directory as this script.
# Three-argument open with a lexical handle keeps the path from being
# interpreted as a mode and avoids a shared global handle; $! says why the
# open failed.
my $igname = dirname($0) . "/langignorelist.txt";
open(my $igfh, '<', $igname) || die("Can't open $igname: $!");
my @ignorelist = <$igfh>;
close($igfh);
# Tell whether a key is absent from the ignore list.
#
# Returns 0 if $key equals one of the lines of @ignorelist, 1 otherwise.
# Lines are chomped in place as they are visited.
sub not_ignorelist {
    my $key = shift;

    for my $entry (@ignorelist) {
        chomp($entry);
        return 0 if ($entry eq $key);
    }

    return 1;
}
# The ignore list's file name is not needed any more
undef $igname;

# Do we care about notes?
# Ignore all notes for master language (both files share a base name)
my $printnotes = ($f1 eq $f2) ? 0 : 1;
# Ignore duplicates for sub-languages (master name occurs in the other name)
my $ignoredups = (index($f2, $f1) > -1) ? 1 : 0;

# work out the missing phrases: start with every English id ...
my %missing = map { ($_ => 1) } @englishorder;
my @missingorder;

# ... and tick off the ones the language file already has
for my $known (@langorder) {
    if (defined($english{$known})) {
        delete($missing{$known});
    } else {
        # Not in English (any more): drop it from the language
#       print "#!! '$known' no longer needed\n";
        delete($lang{$known});
    }
}

# Whatever is left, in English order
@missingorder = grep { defined($missing{$_}) } @englishorder;

# And add them to the phrase list.
for my $absent (@missingorder) {
#   print "#!! '$absent' missing\n";
    push(@langorder, $absent);
    $lang{$absent} = $english{$absent};
    $lang{$absent}{'notes'} .= "### This phrase is missing entirely, copying from english!\n";
    $lang{$absent}{'new'} = 1;
}

undef @missingorder;
undef %missing;
# Sanity-check a few things: the 'desc' and 'user' fields of every phrase
# must agree with English.  Differences are recorded in the phrase's notes
# and the English value is taken over.
foreach my $id (@langorder) {
    next if (!defined($english{$id}));

    my %ep = %{$english{$id}{'phrase'}};
    my %lp = %{$lang{$id}{'phrase'}};

    # A missing field compares like an empty string, but without raising
    # "uninitialized value" warnings under -w.
    my $edesc = defined($ep{'desc'}) ? $ep{'desc'} : '';
    my $ldesc = defined($lp{'desc'}) ? $lp{'desc'} : '';

    if ($edesc eq 'deprecated') {
        # Nuke all deprecated targets; just copy from English
#       print "#!! '$id' deprecated, deleting\n";
        $lang{$id} = $english{$id};
    } elsif ($ldesc ne $edesc) {
        $lang{$id}{'notes'} .= "### The 'desc' field for '$id' differs from the english!\n### the previously used desc is commented below:\n### desc: $ldesc\n";
        $lang{$id}{'phrase'}{'desc'} = $english{$id}{'phrase'}{'desc'};
#       print "#!! '$id' changed description\n";
    }

    my $euser = $ep{'user'};
    my $luser = $lp{'user'};

    # If English has no user, the language's value is compared as 'core'.
    # NOTE(review): that makes such a phrase always report a difference;
    # possibly the default was meant for the English side - confirm.
    if (!defined($euser) || length($euser) == 0) {
        $luser = 'core';
    }

    if (!defined($luser) || $luser ne (defined($euser) ? $euser : '')) {
        my $shown = defined($luser) ? $luser : '';
        # This note is about the 'user' field, so label it as such
        $lang{$id}{'notes'} .= "### The 'user' field for '$id' differs from the english!\n### the previously used user is commented below:\n### user: $shown\n";
        $lang{$id}{'phrase'}{'user'} = $english{$id}{'phrase'}{'user'};
#       print "#!! '$id' changed user\n";
    }
}
# Check sources: the <source> section of every phrase must equal English.
# Targets unknown to English are removed, missing or differing ones are
# replaced by the English string, and a note records what happened.
foreach my $id (@langorder) {
    next if (!defined($english{$id}));

    my %ep = %{$english{$id}{'source'}};
    my %lp = defined($lang{$id}{'source'}) ? %{$lang{$id}{'source'}} : ();

    foreach my $tgt (keys(%lp)) {
        if (!defined($ep{$tgt})) {
            # Delete any targets that have been nuked in master
            delete($lang{$id}{'source'}{$tgt});
        }
    }

    # Sorted so that notes are written in the same order on every run
    foreach my $tgt (sort(keys(%ep))) {
        if (!defined($lp{$tgt})) {
            # If it doesn't exist in the language, copy it from English
            if ($ep{$tgt} ne 'none' && $ep{$tgt} ne '' ) {
                $lang{$id}{'notes'} .= "### The <source> section for '$id:$tgt' is missing! Copying from english!\n";
#               print "#!! '$id:$tgt' source missing\n";
            }
            $lang{$id}{'source'}{$tgt} = $english{$id}{'source'}{$tgt};
        } elsif ($lp{$tgt} ne $ep{$tgt}) {
            # If the source string differs, complain, and copy from English.
            # The note keeps the string that is being replaced; the new
            # English one ends up in the <source> section anyway.
            $lang{$id}{'notes'} .= "### The <source> section for '$id:$tgt' differs from the english!\n";
            $lang{$id}{'notes'} .= "### the previously used one is commented below:\n";
            $lang{$id}{'notes'} .= "### $lp{$tgt}\n";
#           print "#!! '$id:$tgt' source changed ('$lp{$tgt}' vs '$ep{$tgt}')\n";
            $lang{$id}{'source'}{$tgt} = $english{$id}{'source'}{$tgt};
        }
    }
}
# Check dests: validate the translated strings against English.
# Targets unknown to English are removed; missing, 'none' or blank
# translations are replaced by English; translations of a blank English
# string are blanked; untouched copies of English are flagged; and the
# format specifiers of what remains must match those of English.
foreach my $id (@langorder) {
    next if (!defined($english{$id}));

    my %ep = %{$english{$id}{'dest'}};
    my %lp = defined($lang{$id}{'dest'}) ? %{$lang{$id}{'dest'}} : ();

    foreach my $tgt (keys(%lp)) {
        if (!defined($ep{$tgt})) {
            # Delete any targets that have been nuked in master
            delete($lang{$id}{'dest'}{$tgt});
        }
    }

    # Sorted so that notes are written in the same order on every run
    foreach my $tgt (sort(keys(%ep))) {
        if (!defined($lp{$tgt}) || ($lp{$tgt} eq 'none')) {
            # If it doesn't exist in the language, copy it from English
            if ($ep{$tgt} ne 'none' && $ep{$tgt} ne '' ) {
                $lang{$id}{'notes'} .= "### The <dest> section for '$id:$tgt' is missing! Copying from english!\n";
#               print "#!! '$id:$tgt' dest missing\n";
            }
            $lang{$id}{'dest'}{$tgt} = $english{$id}{'dest'}{$tgt};
        } elsif ($lp{$tgt} ne $ep{$tgt}) {
            if ($lp{$tgt} eq '' && $ep{$tgt} ne '') {
                # A blank translation of a non-blank string: use English
                $lang{$id}{'notes'} .= "### The <dest> section for '$id:$tgt' is blank! Copying from english!\n";
#               print "#!! '$id:$tgt' dest is blank ('$lp{$tgt}' vs '$ep{$tgt}')\n";
                $lang{$id}{'dest'}{$tgt} = $english{$id}{'dest'}{$tgt};
            } elsif ($lp{$tgt} ne '' && $ep{$tgt} eq '') {
                # It should be kept blank!  The note keeps the translation
                # that is being dropped (English is blank in this branch).
                $lang{$id}{'notes'} .= "### The <dest> section for '$id:$tgt' is not blank!\n";
                $lang{$id}{'notes'} .= "### the previously used one is commented below:\n";
                $lang{$id}{'notes'} .= "### $lp{$tgt}\n";
#               print "#!! '$id:$tgt' dest not blank ('$lp{$tgt}' vs '$ep{$tgt}')\n";
                $lang{$id}{'dest'}{$tgt} = $english{$id}{'dest'}{$tgt};
            }
        } elsif ($lp{$tgt} ne 'none' && $lp{$tgt} ne '' && not_ignorelist($id) && !$lang{$id}{'new'} && !$ignoredups) {
            $lang{$id}{'notes'} .= "### The <dest> section for '$id:$tgt' is identical to english!\n";
#           print "#!! '$id:$tgt' dest identical ('$lp{$tgt}')\n";
        }

        # Check the format specifiers of the string that will actually be
        # written.  A string already replaced by English above matches
        # trivially, so it gets no second note, and a translation that was
        # missing altogether is never used as an undefined value.
        my $kept = $lang{$id}{'dest'}{$tgt};
        $kept = '' if (!defined($kept));

        if (($ep{$tgt} =~ tr/%//) || ($kept =~ tr/%//)) {
            my $fmt1 = reduceformat($ep{$tgt});
            my $fmt2 = reduceformat($kept);
            if ($fmt1 ne $fmt2) {
                $lang{$id}{'notes'} .= "### The <dest> section for '$id:$tgt' has incorrect format specifiers! Copying from English!\n";
                $lang{$id}{'dest'}{$tgt} = $english{$id}{'dest'}{$tgt};
#               print "#!! '$id:$tgt' dest does not match src format args: '$fmt1' vs '$fmt2'\n";
            }
        }
    }
}
# Check voices: validate the voice strings against English.
# Targets unknown to English are removed; missing or 'none' voices are
# copied from English; blank or untranslated voices are filled from the
# translated <dest> string when there is one; voices for a blank English
# voice are blanked; and voices containing '%' are flagged for review.
foreach my $id (@langorder) {
    next if (!defined($english{$id}));

    my %ep = %{$english{$id}{'voice'}};
    my %lp = defined($lang{$id}{'voice'}) ? %{$lang{$id}{'voice'}} : ();

    foreach my $tgt (keys(%lp)) {
        if (!defined($ep{$tgt})) {
            # Delete any targets that have been nuked in master
            delete($lang{$id}{'voice'}{$tgt});
        }
    }

    # Sorted so that notes are written in the same order on every run
    foreach my $tgt (sort(keys(%ep))) {
        # Is there a non-blank <dest> string for this target that differs
        # from English?  A voice target need not exist in <dest>, hence the
        # defined() guards.
        my $ldest = defined($lang{$id}{'dest'}) ? $lang{$id}{'dest'}{$tgt} : undef;
        my $edest = defined($english{$id}{'dest'}) ? $english{$id}{'dest'}{$tgt} : undef;
        my $translated = (defined($ldest) && $ldest ne '' &&
                          (!defined($edest) || $ldest ne $edest));

        if (!defined($lp{$tgt}) || ($lp{$tgt} eq 'none')) {
            # If it doesn't exist in the language, copy it from English
            if ($ep{$tgt} ne 'none' && $ep{$tgt} ne '' ) {
                $lang{$id}{'notes'} .= "### The <voice> section for '$id:$tgt' is missing! Copying from english!\n";
#               print "#!! '$id:$tgt' voice missing\n";
            }
            $lang{$id}{'voice'}{$tgt} = $english{$id}{'voice'}{$tgt};
        } elsif ($lp{$tgt} ne $ep{$tgt}) {
            if ($lp{$tgt} eq '' && $ep{$tgt} ne '') {
                # If the lang voice string is blank, complain and copy from translation
#               print "#!! '$id:$tgt' voice is blank ('$lp{$tgt}' vs '$ep{$tgt}')\n";
                if ($translated) {
                    $lang{$id}{'notes'} .= "### The <voice> section for '$id:$tgt' is blank! Copying from translated <dest>!\n";
                    $lang{$id}{'voice'}{$tgt} = $ldest;
                    if ($ldest =~ /%/) {
                        # If it has suspicious characters that are not normally voiced..
                        $lang{$id}{'notes'} .= "### The <voice> section for '$id:$tgt' has some suspicious characters, please double-check!\n";
#                       print "#!! '$id:$tgt' suspicious characters\n";
                    }
                } else {
                    $lang{$id}{'notes'} .= "### The <voice> section for '$id:$tgt' is blank! Copying from english!\n";
                    $lang{$id}{'voice'}{$tgt} = $english{$id}{'voice'}{$tgt};
                }
            } elsif ($lp{$tgt} ne '' && $ep{$tgt} eq '') {
                if ($id ne 'VOICE_NUMERIC_TENS_SWAP_SEPARATOR') {
                    # If it's not blank, clear it and complain!  The note
                    # keeps the voice string that is being dropped (English
                    # is blank in this branch).
                    $lang{$id}{'notes'} .= "### The <voice> section for '$id:$tgt' is not blank!\n";
                    $lang{$id}{'notes'} .= "### the previously used one is commented below:\n";
                    $lang{$id}{'notes'} .= "### $lp{$tgt}\n";
#                   print "#!! '$id:$tgt' voice not blank ('$lp{$tgt}' vs '$ep{$tgt}')\n";
                    $lang{$id}{'voice'}{$tgt} = $english{$id}{'voice'}{$tgt};
                }
            }
        } elsif ($lp{$tgt} ne 'none' && $lp{$tgt} ne '' && not_ignorelist($id) && !$lang{$id}{'new'} && !$ignoredups) {
#           print "#!! '$id:$tgt' voice identical ('$lp{$tgt}')\n";
            if ($translated) {
                $lang{$id}{'notes'} .= "### The <voice> section for '$id:$tgt' is identical to english, copying translated <dest>\n";
                $lang{$id}{'voice'}{$tgt} = $ldest;
            } else {
                $lang{$id}{'notes'} .= "### The <voice> section for '$id:$tgt' is identical to english!\n";
            }
        }

        # The voice may have been missing from the language altogether
        if (defined($lp{$tgt}) && $lp{$tgt} =~ /%/) {
            # If it has suspicious characters that are not normally voiced..
            $lang{$id}{'notes'} .= "### The <voice> section for '$id:$tgt' has some suspicious characters, please double-check!\n";
#           print "#!! '$id:$tgt' suspicious characters\n";
        }
    }
}
########## Write new language file

# Print one target section of a phrase.
#   $out     - output file handle
#   $name    - section tag name without angle brackets ('source', ...)
#   %strings - target => string map of the section
# Targets sharing a string are merged by combinetgts() and written in sorted
# key order.  The value 'none' is written bare, anything else in quotes.
sub print_section {
    my ($out, $name, %strings) = @_;
    my %merged = combinetgts(%strings);

    print $out " <$name>\n";
    foreach my $tgt (sort(keys(%merged))) {
        if ($merged{$tgt} eq 'none') {
            print $out " $tgt: $merged{$tgt}\n";
        } else {
            print $out " $tgt: \"$merged{$tgt}\"\n";
        }
    }
    print $out " </$name>\n";
}

# '-' selects standard output.  A real file is opened with the
# three-argument form and a lexical handle so that its name is never taken
# for a mode or a pipe.
my $fh;
if ($ARGV[2] ne '-') {
    open($fh, '>', $ARGV[2]) || die("Can't open $ARGV[2]: $!");
} else {
    $fh = *STDOUT;
}

# The header comments of the language file come first
foreach (@langheader) {
    print $fh $_;
}

my @finalorder = @langorder; # TODO make configurable vs @englishorder

foreach my $id (@finalorder) {
    # Phrases that no longer exist in English are not written
    next if (!defined($english{$id}));

    # phrase
    my %lp = %{$lang{$id}{'phrase'}};

    # Drop all deprecated phrases?
    # next if ($lp{'desc'} eq 'deprecated');

    # Notes collected by the checks go in front of the phrase
    if (length($lang{$id}{'notes'}) && $printnotes) {
        print $fh "$lang{$id}{notes}";
    }

    print $fh "<phrase>\n";
    print $fh " id: $lp{id}\n";
    if ($lp{'desc'} ne '') {
        print $fh " desc: $lp{desc}\n";
    } else {
        print $fh " desc:\n";
    }
    print $fh " user: $lp{user}\n";

    print_section($fh, 'source', %{$lang{$id}{'source'}});
    print_section($fh, 'dest', %{$lang{$id}{'dest'}});
    print_section($fh, 'voice', %{$lang{$id}{'voice'}});

    # FiN
    print $fh "</phrase>\n";
}

# Buffered write errors (e.g. a full disk) only show up at close time, so
# the result of close is checked for a real output file.
if ($ARGV[2] ne '-') {
    close($fh) || die("Can't write $ARGV[2]: $!");
}