1
0
Fork 0
forked from len0rd/rockbox
foxbox/tools/updatelang
Solomon Peachy 3a6ed727d4 lang: Add a special flag to differentiate "intentionally identical to english"
We normally treat "same as English" as a translation errors that needs to be
corrected.  However, many languages effectively use english words as-is, so
we need a way of distinguishing the "intentionally the same" situations with
our tools "automatically copying missing translated strings from English" to
avoid blank or missing UI strings.

The solution is to make sure these "intentionally same as english"
strings are actually different.  This will be accomplished by prepending
'~' to the these strings.  This special character is stripped from the
binary data files used by the player and the voice generation tools.

Change-Id: I90088cbd74de0e5cb9d65f75f26afe04f7e301bf
2024-05-16 20:40:37 -04:00

560 lines
16 KiB
Perl
Executable file

#!/usr/bin/perl -s -w
# __________ __ ___.
# Open \______ \ ____ ____ | | _\_ |__ _______ ___
# Source | _// _ \_/ ___\| |/ /| __ \ / _ \ \/ /
# Jukebox | | ( <_> ) \___| < | \_\ ( <_> > < <
# Firmware |____|_ /\____/ \___ >__|_ \|___ /\____/__/\_ \
# \/ \/ \/ \/ \/
#
# Copyright (C) 2020 Solomon Peachy
#
use utf8;
use File::Basename;
sub trim {
my ($string) = @_;
$string =~ s/^\s+//;
$string =~ s/\s+$//;
return $string;
}
sub parselangfile {
my ($filename) = @_;
my %phrases;
my @order;
my %empty = ( #'phrase' => {},
#'source' => {},
#'dest' => {},
#'voice' => {},
'notes' => "",
'new' => 0
);
my %thisphrase = %empty;
open(FH, "<$filename") || die ("Can't open $filename");
my @lines = <FH>;
close(FH);
my $pos = 'lang';
my $id = '';
my @comments;
foreach my $line (@lines) {
$line = trim($line);
if($line =~ /^ *###/) {
# Filter out warnings from prior runs
next;
} elsif($line =~ /^ *#/) {
push(@comments, "$line\n") if ($pos eq 'lang');
# comments are ignored, but retained!
next;
} elsif ($pos eq 'phrase' && $line =~ /^([^:]+): ?(.*)$/) {
$thisphrase{$pos}->{$1} = $2;
if ($1 eq 'id') {
push(@order, $2);
$id = $2;
}
} elsif ($pos ne 'phrase' && $line =~ /^([^:]+): ?\"?([^\"]*)\"?$/) {
my @targets = split(',', $1);
my $w = $2;
foreach (@targets) {
my $l = trim($_);
# Convert some obsolete keys
if ($l eq "swcodec") {
$l = "*";
} elsif ($l eq "lcd_bitmap") {
$l = "*";
} elsif ($l eq "recording_swcodec") {
$l = "recording";
# } elsif ($id =~ /USB_MODE/ && $l =~ /ibassodx/) {
# $l = "*";
}
$thisphrase{$pos}->{$l} = $w;
}
}
if ($line eq '</voice>' ||
$line eq '</dest>' ||
$line eq '</source>' ||
$line eq '<phrase>') {
$pos = 'phrase';
} elsif ($line eq '</phrase>') {
my %copy = %thisphrase;
$phrases{$id} = \%copy;
%thisphrase = %empty;
$pos = 'lang';
$id = '';
} elsif ($line eq '<source>') {
$pos = 'source';
} elsif ($line eq '<dest>') {
$pos = 'dest';
} elsif ($line eq '<voice>') {
$pos = 'voice';
}
}
$phrases{'HEADER'} = \@comments;
$phrases{'ORDER'} = \@order;
return %phrases;
}
sub combinetgts {
my (%tgtmap) = (@_);
my %strmap;
my %combined;
# Reverse-map things
foreach my $tgt (sort(keys(%tgtmap))) {
next if ($tgt eq '*'); # Do not combine anything with fallback
if (defined($strmap{$tgtmap{$tgt}})) {
$strmap{$tgtmap{$tgt}} .= ",$tgt";
} else {
$strmap{$tgtmap{$tgt}} = "$tgt";
}
}
# Copy over default/fallback as it was skipped
$combined{'*'} = $tgtmap{'*'};
foreach my $str (keys(%strmap)) {
$combined{$strmap{$str}} = $str;
}
return %combined;
}
sub reduceformat($) {
my ($in) = @_;
my $out = "";
my $infmt = 0;
for (my $i = 0; $i < length($in) ; $i++) {
my $c = substr($in, $i, 1);
if (!$infmt && ($c eq '%')) {
# First char in a format string!
$infmt = 1;
next;
}
next if (!$infmt);
if ($c ne '%') {
# Ignore literal %, otherwise dump specifier over
$out .= $c;
}
# Look for a terminating field:
my $count = $c =~ tr/sSdDuUxXzZ%//;
if ($count) {
$infmt = 0;
next;
}
}
return $out;
}
##################
if($#ARGV != 2) {
print "Usage: updatelang <english.lang> <otherlang> <outfile|->\n";
exit;
}
# Parse master file
my %english = parselangfile($ARGV[0]);
my @englishorder = @{$english{'ORDER'}};
# Parse secondary file
my %lang = parselangfile($ARGV[1]);
my @langorder = @{$lang{'ORDER'}};
my @langheader = @{$lang{'HEADER'}};
# Clean up
delete $english{'ORDER'};
delete $english{'HEADER'};
delete $lang{'ORDER'};
delete $lang{'HEADER'};
# Extract language names
my @tmp = split(/\./, basename($ARGV[0]));
my $f1 = $tmp[0];
@tmp = split(/\./, basename($ARGV[1]));
my $f2 = $tmp[0];
undef @tmp;
# Read in ignore list
my $igname = dirname($0) . "/langignorelist.txt";
open (FH, "<$igname") || die ("Can't open $igname!");
my @ignorelist = <FH>;
close (FH);
sub not_ignorelist {
my ($key) = @_;
foreach (@ignorelist) {
chomp;
if ($_ eq $key) {
return 0;
}
}
return 1;
}
undef $igname;
# Do we care about notes?
my $printnotes = 1;
my $ignoredups = 0;
if ($f1 eq $f2) {
# Ignore all notes for master language
$printnotes = 0;
}
if (index($f2, $f1) > -1) {
# Ignore duplicates for sub-languages
$ignoredups = 1;
}
# work out the missing phrases
my %missing;
my @missingorder;
foreach (@englishorder) {
$missing{$_} = 1;
}
foreach (@langorder) {
if (!defined($english{$_})) {
delete($lang{$_});
# print "#!! '$_' no longer needed\n";
next;
}
delete $missing{$_};
}
foreach (@englishorder) {
push(@missingorder, $_) if defined($missing{$_});
}
# And add them to the phrase list.
foreach (@missingorder) {
# print "#!! '$_' missing\n";
push(@langorder, $_);
$lang{$_} = $english{$_};
$lang{$_}{'notes'} .= "### This phrase is missing entirely, copying from english!\n";
$lang{$_}{'new'} = 1;
}
undef @missingorder;
undef %missing;
# Sanity-check a few things
foreach my $id (@langorder) {
if (!defined($english{$id})) {
next;
}
my %ep = %{$english{$id}{'phrase'}};
my %lp = %{$lang{$id}{'phrase'}};
if ($lp{'desc'} ne $ep{'desc'} || $ep{'desc'} eq 'deprecated') {
if ($ep{'desc'} eq 'deprecated') {
# Nuke all deprecated targets; just copy from English
# print "#!! '$id' deprecated, deleting\n";
$lang{$id} = $english{$id};
} else {
$lang{$id}{'notes'} .= "### The 'desc' field for '$id' differs from the english!\n### the previously used desc is commented below:\n### desc: $lp{desc}\n";
$lang{$id}{'phrase'}{'desc'} = $english{$id}{'phrase'}{'desc'};
# print "#!! '$id' changed description\n";
}
}
if (!defined($ep{'user'}) || length($ep{'user'}) == 0) {
$lp{'user'} = 'core';
}
if (!defined($lp{'user'}) || $lp{'user'} ne $ep{'user'}) {
$lang{$id}{'notes'} .= "### The 'user' field for '$id' differs from the english!\n### the previously used desc is commented below:\n### desc: $lp{user}\n";
if (!defined($lp{'user'}) || length($lp{'user'}) == 0) {
$lp{'user'} = $ep{'user'};
}
$lang{$id}{'phrase'}{'user'} = $english{$id}{'phrase'}{'user'};
# print "#!! '$id' changed user\n";
}
}
# Check sources
foreach my $id (@langorder) {
if (!defined($english{$id})) {
next;
}
my %ep = %{$english{$id}{'source'}};
my %lp;
if (defined($lang{$id}{'source'})) {
%lp = %{$lang{$id}{'source'}};
} else {
%lp = ();
}
foreach my $tgt (keys(%lp)) {
if (!defined($ep{$tgt})) {
# Delete any targets that have been nuked in master
delete($lang{$id}{'source'}{$tgt});
}
}
foreach my $tgt (keys(%ep)) {
if (!defined($lp{$tgt})) {
# If it doesn't exist in the language, copy it from English
if ($ep{$tgt} ne 'none' && $ep{$tgt} ne '' ) {
$lang{$id}{'notes'} .= "### The <source> section for '$id:$tgt' is missing! Copying from english!\n";
# print "#!! '$id:$tgt' source missing\n";
}
$lang{$id}{'source'}{$tgt} = $english{$id}{'source'}{$tgt};
} elsif ($lp{$tgt} ne $ep{$tgt}) {
# If the source string differs, complain, and copy from English
$lang{$id}{'notes'} .= "### The <source> section for '$id:$tgt' differs from the english!\n";
$lang{$id}{'notes'} .= "### the previously used one is commented below:\n";
$lang{$id}{'notes'} .= "### $english{$id}{source}{$tgt}\n";
# print "#!! '$id:$tgt' source changed ('$lp{$tgt}' vs '$ep{$tgt}')\n";
$lang{$id}{'source'}{$tgt} = $english{$id}{'source'}{$tgt};
}
}
}
# Check dests
foreach my $id (@langorder) {
if (!defined($english{$id})) {
next;
}
my %ep = %{$english{$id}{'dest'}};
my %lp;
if (defined($lang{$id}{'dest'})) {
%lp = %{$lang{$id}{'dest'}};
} else {
%lp = ();
}
foreach my $tgt (keys(%lp)) {
if (!defined($ep{$tgt})) {
# Delete any targets that have been nuked in master
delete($lang{$id}{'dest'}{$tgt});
}
}
foreach my $tgt (keys(%ep)) {
if (!defined($lp{$tgt}) || ($lp{$tgt} eq 'none')) {
# If it doesn't exist in the language, copy it from English
if ($ep{$tgt} ne 'none' && $ep{$tgt} ne '' ) {
$lang{$id}{'notes'} .= "### The <dest> section for '$id:$tgt' is missing! Copying from english!\n";
# print "#!! '$id:$tgt' dest missing\n";
}
$lang{$id}{'dest'}{$tgt} = $english{$id}{'dest'}{$tgt};
} elsif ($lp{$tgt} ne $ep{$tgt}) {
# If the source string differs, complain, and copy from English
if ($lp{$tgt} eq '' && $ep{$tgt} ne '') {
$lang{$id}{'notes'} .= "### The <dest> section for '$id:$tgt' is blank! Copying from english!\n";
# print "#!! '$id:$tgt' dest is blank ('$lp{$tgt}' vs '$ep{$tgt}')\n";
$lang{$id}{'dest'}{$tgt} = $english{$id}{'dest'}{$tgt};
} elsif ($lp{$tgt} ne '' && $ep{$tgt} eq '') {
# It should be kept blank!
$lang{$id}{'notes'} .= "### The <dest> section for '$id:$tgt' is not blank!\n";
$lang{$id}{'notes'} .= "### the previously used one is commented below:\n";
$lang{$id}{'notes'} .= "### $english{$id}{dest}{$tgt}\n";
# print "#!! '$id:$tgt' dest not blank ('$lp{$tgt}' vs '$ep{$tgt}')\n";
$lang{$id}{'dest'}{$tgt} = $english{$id}{'dest'}{$tgt};
}
} elsif ($lp{$tgt} ne 'none' && $lp{$tgt} ne '' && not_ignorelist($id) && !$lang{$id}{'new'} && !$ignoredups) {
$lang{$id}{'notes'} .= "### The <dest> section for '$id:$tgt' is identical to english! (correct or prefix with ~)\n";
# print "#!! '$id:$tgt' dest identical ('$lp{$tgt}')\n";
}
if ($id eq 'LANG_VOICED_DATE_FORMAT') {
my $sane = $lp{$tgt};
$sane =~ tr/YAmd//d;
if (length($sane) != 0) {
$lang{$id}{'notes'} .= "### The <dest> section for '$id:$tgt' has illegal characters! Restoring from English!\n";
$lang{$id}{'notes'} .= "### the previously used one is commented below:\n";
$lang{$id}{'notes'} .= "### $lang{$id}{dest}{$tgt}\n";
$lang{$id}{'dest'}{$tgt} = $english{$id}{'dest'}{$tgt};
}
}
my $count1 = $ep{$tgt} =~ tr/%//;
my $count2 = 0;
if (defined($lp{$tgt})) {
$count2 = $lp{$tgt} =~ tr/%//;
}
if ($count1 || $count2) {
my $fmt1 = reduceformat($ep{$tgt});
my $fmt2 = "";
if ($count2) {
$fmt2 = reduceformat($lp{$tgt});
}
if ($fmt1 ne $fmt2) {
$lang{$id}{'notes'} .= "### The <dest> section for '$id:$tgt' has incorrect format specifiers! Copying from English!\n";
$lang{$id}{'notes'} .= "### the previously used one is commented below:\n";
$lang{$id}{'notes'} .= "### $lang{$id}{dest}{$tgt}\n";
$lang{$id}{'dest'}{$tgt} = $english{$id}{'dest'}{$tgt};
# print "#!! '$id:$tgt' dest does not match src format args: '$fmt1' vs '$fmt2'\n";
}
}
if (defined($lp{$tgt})) {
$count2 = $lp{$tgt} =~ tr/"//;
if ($count2 > 0) {
# If it has suspicious characters that are not allowed
$lang{$id}{'notes'} .= "### The <dest> section for '$id:$tgt' has some suspicious characters, please double-check!\n";
# print "#!! '$id:$tgt' suspicious characters\n";
}
}
}
}
# Check voices
foreach my $id (@langorder) {
if (!defined($english{$id})) {
next;
}
my %ep = %{$english{$id}{'voice'}};
my %lp;
if (defined($lang{$id}{'voice'})) {
%lp = %{$lang{$id}{'voice'}};
} else {
%lp = ();
}
foreach my $tgt (keys(%lp)) {
if (!defined($ep{$tgt})) {
# Delete any targets that have been nuked in master
delete($lang{$id}{'voice'}{$tgt});
}
}
foreach my $tgt (keys(%ep)) {
if (!defined($lp{$tgt}) || ($lp{$tgt} eq 'none')) {
# If it doesn't exist in the language, copy it from English
if ($ep{$tgt} ne 'none' && $ep{$tgt} ne '' ) {
$lang{$id}{'notes'} .= "### The <voice> section for '$id:$tgt' is missing! Copying from english!\n";
# print "#!! '$id:$tgt' voice missing\n";
}
$lang{$id}{'voice'}{$tgt} = $english{$id}{'voice'}{$tgt};
} elsif ($lp{$tgt} ne $ep{$tgt}) {
if ($lp{$tgt} eq '' && $ep{$tgt} ne '') {
# If the lang voice string is blank, complain and copy from translation
# print "#!! '$id:$tgt' voice is blank ('$lp{$tgt}' vs '$ep{$tgt}')\n";
if ($lang{$id}{'dest'}{$tgt} ne '' &&
$lang{$id}{'dest'}{$tgt} ne $english{$id}{'dest'}{$tgt}) {
$lang{$id}{'notes'} .= "### The <voice> section for '$id:$tgt' is blank! Copying from translated <dest>!\n";
$lang{$id}{'voice'}{$tgt} = $lang{$id}{'dest'}{$tgt};
if ($lang{$id}{'voice'}{$tgt} =~ /%"/) {
# If it has suspicious characters that are not normally voiced..
$lang{$id}{'notes'} .= "### The <voice> section for '$id:$tgt' has some suspicious characters, please double-check!\n";
#print "#!! '$id:$tgt' suspicious characters\n";
}
} else {
$lang{$id}{'notes'} .= "### The <voice> section for '$id:$tgt' is blank! Copying from english!\n";
$lang{$id}{'voice'}{$tgt} = $english{$id}{'voice'}{$tgt};
}
} elsif ($lp{$tgt} ne '' && $ep{$tgt} eq '') {
if ($id ne 'VOICE_NUMERIC_TENS_SWAP_SEPARATOR') {
# If it's not blank, clear it and complain!
$lang{$id}{'notes'} .= "### The <voice> section for '$id:$tgt' is not blank!\n";
$lang{$id}{'notes'} .= "### the previously used one is commented below:\n";
$lang{$id}{'notes'} .= "### $english{$id}{voice}{$tgt}\n";
# print "#!! '$id:$tgt' voice not blank ('$lp{$tgt}' vs '$ep{$tgt}')\n";
$lang{$id}{'voice'}{$tgt} = $english{$id}{'voice'}{$tgt};
}
}
} elsif ($lp{$tgt} ne 'none' && $lp{$tgt} ne '' && not_ignorelist($id) && !$lang{$id}{'new'} && !$ignoredups) {
# print "#!! '$id:$tgt' voice identical ('$lp{$tgt}')\n";
if ($lang{$id}{'dest'}{$tgt} ne '' &&
$lang{$id}{'dest'}{$tgt} ne $english{$id}{'dest'}{$tgt}) {
$lang{$id}{'notes'} .= "### The <voice> section for '$id:$tgt' is identical to english, copying translated <dest>\n";
$lang{$id}{'voice'}{$tgt} = $lang{$id}{'dest'}{$tgt};
} else {
$lang{$id}{'notes'} .= "### The <voice> section for '$id:$tgt' is identical to english! (correct or prefix with ~)\n";
}
}
if (defined($lp{$tgt}) && ($lp{$tgt} =~ /%"/)) {
# If it has suspicious characters that are not normally voiced..
$lang{$id}{'notes'} .= "### The <voice> section for '$id:$tgt' has some suspicious characters, please double-check!\n";
# print "#!! '$id:$tgt' suspicious characters\n";
}
}
}
########## Write new language file
my $fh;
if ($ARGV[2] ne '-') {
open(FH, ">$ARGV[2]") || die ("Can't open $ARGV[2]");
$fh = *FH;
} else {
$fh = *STDOUT;
}
foreach (@langheader) {
print $fh $_;
}
my @finalorder = @langorder; # TODO make configurable vs @englishorder
foreach my $id (@finalorder) {
if (!defined($english{$id})) {
next;
}
my %lp;
# phrase
%lp = %{$lang{$id}{'phrase'}};
# Drop all deprecated phrases?
# next if ($lp{'desc'} eq 'deprecated');
if (length($lang{$id}{'notes'}) && $printnotes) {
print $fh "$lang{$id}{notes}";
}
print $fh "<phrase>\n";
print $fh " id: $lp{id}\n";
if ($lp{'desc'} ne '') {
print $fh " desc: $lp{desc}\n";
} else {
print $fh " desc:\n";
}
print $fh " user: $lp{user}\n";
# source
%lp = combinetgts(%{$lang{$id}{'source'}});
print $fh " <source>\n";
foreach my $tgt (sort(keys(%lp))) {
if ($lp{$tgt} eq 'none') {
print $fh " $tgt: $lp{$tgt}\n";
} else {
print $fh " $tgt: \"$lp{$tgt}\"\n";
}
}
print $fh " </source>\n";
# dest
%lp = combinetgts(%{$lang{$id}{'dest'}});
print $fh " <dest>\n";
foreach my $tgt (sort(keys(%lp))) {
if ($lp{$tgt} eq 'none') {
print $fh " $tgt: $lp{$tgt}\n";
} else {
print $fh " $tgt: \"$lp{$tgt}\"\n";
}
}
print $fh " </dest>\n";
# voice
%lp = combinetgts(%{$lang{$id}{'voice'}});
print $fh " <voice>\n";
foreach my $tgt (sort(keys(%lp))) {
if ($lp{$tgt} eq 'none') {
print $fh " $tgt: $lp{$tgt}\n";
} else {
print $fh " $tgt: \"$lp{$tgt}\"\n";
}
}
print $fh " </voice>\n";
# FiN
print $fh "</phrase>\n";
}
if ($ARGV[2] ne '-') {
close(FH);
}