rockbox/tools/genlang

#!/usr/bin/perl -s
#             __________               __   ___.
#   Open      \______   \ ____   ____ |  | _\_ |__   _______  ___
#   Source     |       _//  _ \_/ ___\|  |/ /| __ \ /  _ \  \/  /
#   Jukebox    |    |   (  <_> )  \___|    < | \_\ (  <_> > <  <
#   Firmware   |____|_  /\____/ \___  >__|_ \|___  /\____/__/\_ \
#                     \/            \/     \/    \/            \/
# $Id$
#
# Copyright (C) 2006 - 2008 by Daniel Stenberg
#
use utf8;
use Unicode::Normalize;
use Encode qw( encode_utf8 );

# Binary file format constants. See apps/language.c (TODO: Use common
# include for both).

# First byte of a binary lang file / voicestring file, and the format
# version stored in the second byte of both.
my ($LANGUAGE_COOKIE, $VOICE_COOKIE) = (0x1a, 0x9a);
my $LANGUAGE_VERSION = 0x06;

# Bits OR'ed together into the options byte of the file header.
my ($LANGUAGE_FLAG_RTL, $LANGUAGE_FLAG_UNITS_FIRST) = (0x01, 0x02);

# Size in bytes of the file header and of each per-user subheader.
my ($HEADER_SIZE, $SUBHEADER_SIZE) = (4, 6);

# Without a language file argument there is nothing to do: print the
# usage text to stdout and stop.
unless ($ARGV[0]) {
    print <<'USAGE';
Usage: genlang [options] <langv2 file>

 -p=<prefix>
    Make the tool create a [prefix].c and [prefix].h file.

 -b=<outfile>
    Make the tool create a binary language (.lng) file named [outfile].
    The use of this option requires that you also use -e, -t and -i.

 -c=<outfile>
    Create binary voicestring file named [outfile]. Works like -b and can be
    used the same time.

 -e=<english lang file>
    Point out the english (original source) file, to use that as master
    language template. Always required.

 -t=<target>
    Specify which target you want the translations/phrases for. Required when
    -b, -o, or -p is used.

    The target can in fact be specified as numerous different strings,
    separated with colons. This will make genlang to use all the specified
    strings when searching for a matching phrase.

 -i=<target id>
    The target id number, needed for -b.

 -o
    Voice mode output. Outputs all id: and voice: lines for the given target!

 -v
    Enables verbose (debug) output.
USAGE
    exit;
}

# Command line switches. The script is started with "perl -s", so every
# -x=value switch arrives in the package variable $x.
my $prefix = $p;     # -p: generate the .c/.h files
my $binary = $b;     # -b: binary language file to create
my $binvoice = $c;   # -c: binary voicestring file to create
my $voiceout = $o;   # -o: voice mode output

my $english = $e;    # -e: english master language file
if (!$english) {
    print STDERR "Please specify the english lang source (with -e)!\n";
    exit 1;
}

my $target_id = $i;  # -i: numeric target id stored in the binary header
if($binary && !$target_id) {
    print STDERR "Please specify a target id number (with -i)!\n";
    exit 1;
}

my $target = $t;     # -t: colon separated list of target strings
if(!$target) {
    print STDERR "Please specify a target (with -t)!\n";
    exit 1;
}

# Exactly one output mode may be selected. -b and -c each count as one
# half so that they can be combined with each other, but not with -p or -o.
my $check = ($binary?.5:0) + ($prefix?1:0) + ($voiceout?1:0) + ($binvoice?.5:0);
if($check > 1) {
    print STDERR "Please use only one of -p, -o, -b, and -c\n";
    exit 1;
}
if(!$check) {
    # The output modes are -p, -o, -b and -c; -e is not one of them.
    print STDERR "Please use at least one of -p, -o, -b, and -c\n";
    exit 1;
}

# Build up a regex which can be applied to target wildcard lists. We only need
# to support prefix matches, so a target parameter of foo:bar is expanded to
# the regex "\*|f\*|fo\*|foo\*|foo|b\*|ba\*|bar\*|bar" and applied to the
# wildcard list (plus end-of-string or commas on either side). The regex
# engine should discard any duplicates generated for us in the process of
# constructing the state machine, so we don't bother to check.
#
# Every piece taken from the command line is passed through quotemeta so that
# a target string containing regex metacharacters is matched literally.
my $target_regex = "(?:^|,) *(?:\\*";
my $model_regex = "";  # This matches the player model only!
foreach my $target_part (split ':', $target) {
    for my $len (1 .. length($target_part)) {
        my $partial = quotemeta(substr($target_part, 0, $len));
        $target_regex .= "|$partial\\*";
    }
    $target_regex .= "|" . quotemeta($target_part);
    # The first target string is the player model: remember the regex as it
    # stands after that first part only.
    $model_regex = $target_regex if (!$model_regex);
}
$target_regex .= ") *(?:,|\$)";
$target_regex = qr/$target_regex/;
$model_regex .= ") *(?:,|\$)";
$model_regex = qr/$model_regex/;

# Directory of the -b output file; the english.list cache lives next to it.
# Default to the current directory so that an output file given without any
# directory part does not make "$binpath/english.list" point at the
# filesystem root.
my $binpath = ".";
if ($binary =~ m|(.*)/[^/]+|) {
    $binpath = $1;
}

my $verbose=$v;

my %id; # string to num hash
my @idnum; # num to string array (one array per user index)

my %allphrases;  # For sorting - id string to full <phrase> text hash
my %source; # id string to source phrase hash
my %dest; # id string to dest phrase hash
my %voice; # id string to voice phrase hash

my %users =
  ('core' => 0);   # user name to user index hash

my $input = $ARGV[0];

my @m;         # stack of the enclosing section names
my $m="blank"; # name of the sub that handles "name: value" lines right now

sub trim {
    # Return a copy of the argument without leading and trailing whitespace.
    my $text = shift;
    for ($text) {
        s/^\s+//;
        s/\s+$//;
    }
    return $text;
}

sub blank {
    # Handler used outside of any section: "name: value" lines are ignored.
    return;
}

my %head;
sub header {
    # Store one "name: value" line of the header section in %head.
    my (undef, $key, $value) = @_;
    return $head{$key} = $value;
}

my %phrase;
sub phrase {
    # Store one "name: value" line of the current phrase in %phrase.
    my (undef, $key, $value) = @_;
    return $phrase{$key} = $value;
}

my %options;
sub options {
    # Store one "name: value" line of an options section in %options.
    my (undef, $key, $value) = @_;
    return $options{$key} = $value;
}

sub parsetarget {
    # Handle one "targets: value" line of a source/dest/voice section.
    #   $debug  - section label passed by the caller (not used here)
    #   $strref - reference to the scalar holding the string chosen so far
    #   $full   - the complete input line (not used here)
    #   $n      - the target list in front of the colon
    #   $v      - the value after the colon
    # Returns $v when the target list matches $target_regex.
    my ($debug, $strref, $full, $n, $v)=@_;
    if ($n =~ $target_regex) {
        # An already chosen string (other than 'none') is only replaced
        # when this line also matches the model regex.
        my $keep_current = $$strref && $$strref ne 'none' && $n !~ $model_regex;
        $$strref = $v unless $keep_current;
        return $v;
    }
}

my $src;
sub source {
    # Handler for lines inside a source section; the result lands in $src.
    return parsetarget("src", \$src, @_);
}

my $dest;
sub dest {
    # Handler for lines inside a dest section; the result lands in $dest.
    return parsetarget("dest", \$dest, @_);
}

my $voice;
sub voice {
    # Handler for lines inside a voice section; the result lands in $voice.
    return parsetarget("voice", \$voice, @_);
}

sub file_is_newer {
    # Return 1 when $file1 was modified more recently than $file2,
    # otherwise 0. Element 9 of the stat list is the modification time.
    my ($file1, $file2) = @_;

    my $mtime1 = (stat $file1)[9];
    my $mtime2 = (stat $file2)[9];

    return $mtime1 > $mtime2 ? 1 : 0;
}

my %idmap;    # NOTE(review): the code below fills the array @idmap, not this hash
my %english;  # id string to english phrase text (or 1 when read from english.list)

# The english file has to be scanned before the translated file.
readenglish() if $english;

sub readenglish {
    # For the cases where the english file needs to be scanned/read, we must
    # do it before we read the translated file.
    #
    # Fills @idmap (user index -> id string -> numeric ID), %english (id
    # string -> phrase text), %sortorder (id string -> position in the
    # english file) and adds every user name found to %users. Takes no
    # arguments and returns nothing useful.

    my @idnum = ((0));       # next regular ID, per user index
    my @vidnum = ((0x8000)); # next voice-only ID, per user index

    # When building a binary file and the cached english.list is newer than
    # the english source, load the ID numbers from the cache instead.
    if ($binary and file_is_newer("$binpath/english.list", $english)) {
        open(my $list, '<', "$binpath/english.list") ||
            die "Error: can't open $binpath/english.list: $!";
        while (my $entry = <$list>) {
            # strip the newline so the stored ID number is a clean number
            chomp $entry;
            my ($user, $id, $value) = split ':', $entry;
            next if !defined $value;   # blank or malformed line
            # NOTE(review): the writer at the end of this script stores the
            # user *name* in the first field, which as an array index always
            # evaluates to 0 - confirm before relying on more than one user.
            $idmap[$user]{$id} = $value;
            $english{$id} = 1;
        }
        close $list;

        return;
    }

    open(my $eng, '<', $english) || die "Error: can't open $english: $!";
    my @phrase;
    my $id;          # id of the current phrase once it is known to be used
    my $maybeid;     # id from the latest "id:" line
    my $user;        # user index from the latest "user:" line
    my $withindest;  # true while inside a <dest> section
    my $deststr;     # dest string selected for our target in this phrase
    my $numphrases = 0;
    my $numusers = 1; # core is already in the users map

    while(<$eng>) {
        # get rid of DOS newlines
        $_ =~ tr/\r//d;

        if($_ =~ /^ *\<phrase\>/) {
            # this is the start of a phrase
        } elsif($_ =~ /\<\/phrase\>/) {
            # if id is something, we count and store this phrase
            if($id) {
                # voice-only entries get a different range
                if($id =~ /^VOICE_/) {
                    # every user starts its voice-only IDs at 0x8000
                    $vidnum[$user] = 0x8000 if !defined $vidnum[$user];
                    # Assign an ID number to this entry
                    $idmap[$user]{$id} = $vidnum[$user]++;
                } else {
                    # every user starts its regular IDs at 0
                    $idnum[$user] = 0 if !defined $idnum[$user];
                    # Assign an ID number to this entry
                    $idmap[$user]{$id} = $idnum[$user]++;
                }

                # this is the end of a phrase, add it to the english hash
                $english{$id}=join("", @phrase);
            }
            undef @phrase;
            $id="";
        } elsif($_ ne "\n") {
            # gather everything related to this phrase
            push @phrase, $_;
            if($_ =~ /^ *\<dest\>/i) {
                $withindest=1;
                $deststr="";
            } elsif($withindest && ($_ =~ /^ *\<\/dest\>/i)) {
                $withindest=0;
                if($deststr && ($deststr !~ /^none\z/i)) {
                    # the phrase has a real dest string for our target, so
                    # its id takes part in the numbering
                    $id = $maybeid;
                }
            } elsif($withindest && ($_ =~ / *([^:]+): *(.*)/)) {
                my ($name, $val)=($1, $2);
                $dest=""; # in case it is left untouched for when the
                # model name isn't "ours"
                dest($_, $name, $val);

                if($dest) {
                    # Store the current dest string. If this target matches
                    # multiple strings, it will get updated several times.
                    $deststr = $dest;
                }
            }
        }

        if($_ =~ /^ *id: ([^ \t\n]+)/i) {
            $maybeid=$1;
            $sortorder{$maybeid}=$numphrases++;
        }
        if($_ =~ /^ *user: ([^ \t\n]+)/i) {
            $user = $users{$1};
            if(!(defined $user)) {
                # $numusers counts the users known so far, which is also
                # the next free index (core is 0)
                $user = $numusers++;
                $users{$1} = $user;
            }
        }
    }
    close($eng);

    # $dest is shared with the scan of the translated file; do not let the
    # last english dest string leak into its first phrase.
    undef $dest;
}

# Per-user counters used while scanning the translated file: one slot for
# every user known so far. Lang IDs count up from 0, voice-only IDs from
# 0x8001.
my @idcount = (0) x scalar(keys %users);
my @voiceid = (0x8001) x scalar(keys %users);

#
# Now start the scanning of the selected language string
#

open(LANG, '<', $input) || die "Error: couldn't read language file named $input\n";
binmode(LANG, ":utf8");

my @phrase;            # all lines of the phrase being read
my $langoptions = 0;   # LANGUAGE_FLAG_* bits found in the comments
my $line = 0;          # current line number, for the warnings

while(<LANG>) {

    $line++;

    # get rid of DOS newlines
    $_ =~ tr/\r//d;

    if($_ =~ /^( *\#|[ \t\n\r]*\z)/) {
        # comment or empty line - it is not stored, but the language
        # option markers are picked up from it
        if ($_ =~ /LANGUAGE_IS_RTL/) {
            $langoptions |= $LANGUAGE_FLAG_RTL;
        }
        if ($_ =~ /LANGUAGE_UNITS_FIRST/) {
            $langoptions |= $LANGUAGE_FLAG_UNITS_FIRST;
        }
        next;
    }

    push @phrase, $_;

    # this is an XML-lookalike tag
    if (/^(<|[^\"<]+<)([^>]*)>/) {
        my $part = $2;

        if($part =~ /^\//) {
            # this was a closing tag

            if($part eq "/phrase") {
                # closing the phrase

                my $idstr = $phrase{'id'};
                my $idnum;

                if($english && !$english{$idstr}) {
                    # the id doesn't exist for english, skip the phrase
                } elsif($dest =~ /^none\z/i || $src =~ /^none\z/i ) {
                    # "none" as dest (without quotes) means that this entire
                    # phrase is to be ignored
                } elsif($sortfile) {
                    $allphrases{$idstr}=join('',@phrase);
                } else {
                    # allow the keyword 'deprecated' to be used on dest and
                    # voice strings to mark that as deprecated. It will then
                    # be replaced with "".

                    $dest =~ s/^deprecate(|d)\z/\"\"/i;
                    $voice =~ s/^deprecate(|d)\z/\"\"/i;

                    # basic syntax error alerts, if there are no quotes we
                    # will assume an empty string was intended
                    if($dest !~ /^\"/) {
                        print STDERR "$input:$line:1: warning: dest before line lacks quotes ($dest)!\n";
                        $dest='""';
                    }
                    if($src !~ /^\"/) {
                        print STDERR "$input:$line:1: warning: source before line lacks quotes ($src)!\n";
                        $src='""';
                    }
                    if($voice !~ /^\"/ and $voice !~ /^none\z/i) {
                        print STDERR "$input:$line:1: warning: voice before line lacks quotes ($voice)!\n";
                        $voice='""';
                    }
                    if($dest eq '""' && $phrase{'desc'} !~ /deprecated/i && $idstr !~ /^VOICE/) {
                        print STDERR "$input:$line:1: warning: empty dest before line in non-deprecated phrase!\n";
                    }

                    my $userstr = trim($phrase{'user'});
                    my $user = $users{$userstr};
                    if ($userstr eq "") {
                        print STDERR "$input:$line:1: warning: missing user!\n";
                        $user = $users{"core"};
                    }
                    elsif(!(defined $user)) {
                        if($english) {
                           print STDERR "$input:$line:1: warning: user was not found in $english!\n";
                           $user = keys %users;  # set to an invalid user so it won't be added
                        } else {
                            # we found a new user, add it to the usermap
                            $user = ++$numusers;
                            $users{$userstr} = $user;
                        }
                    }

                    # Use the ID name to figure out which id number range we
                    # should use for this phrase. Voice-only strings are
                    # separated.

                    if($idstr =~ /^VOICE/) {
                        $idnum = $voiceid[$user]++;
                    } else {
                        $idnum = $idcount[$user]++;
                    }

                    # Strip out the magic "Same as english" flag
                    $dest =~ s/^("?)~+/$1/;
                    $voice =~ s/^("?)~+/$1/;

                    $id{$idstr} = $idnum;
                    $idnum[$user][$idnum]=$idstr;

                    $source{$idstr}=$src;
                    $dest{$idstr}=$dest;
                    $voice{$idstr}=$voice;

                    if($verbose) {
                        print "id: $phrase{id} ($idnum)\n";
                        print "source: $src\n";
                        print "dest: $dest\n";
                        print "voice: $voice\n";
                        print "user: $user\n";
                    }
                }

                # Forget everything about this phrase, whether it was stored
                # or skipped, so that nothing carries over into the next one.
                undef $src;
                undef $dest;
                undef $voice;
                undef %phrase;
                undef @phrase;
            } # end of </phrase>

            # starts with a slash, this _ends_ this section
            $m = pop @m; # get back old value, the previous level's tag
            next;
        } # end of tag close

        # This is an opening (sub) tag

        push @m, $m; # store old value
        $m = $part;
        next;
    }

    if(/^ *([^:]+): *(.*)/) {
        my ($name, $val)=($1, $2);
        # call the sub named after the section we are in
        # (header, phrase, source, dest, voice, options or blank)
        &$m($_, $name, $val);
    }
}
close(LANG);

# Sort mode: print the collected phrases ordered by the position their id
# has in the english file.
if ($sortfile) {
    my @ordered = sort { $sortorder{$a} <=> $sortorder{$b} } keys %allphrases;
    print $allphrases{$_} for @ordered;
}

if($prefix) {
    # We create a .c and .h file: $prefix/lang.h, ${prefix}_enum.h and
    # $prefix/lang_core.c. Only the strings of the "core" user are emitted.

    open(HFILE_CORE, '>', "$prefix/lang.h") ||
        die "Error: couldn't create file $prefix/lang.h\n";
    open(CFILE_CORE, '>', "$prefix/lang_core.c") ||
        die "Error: couldn't create file $prefix/lang_core.c\n";

    # get header file name (the path without its directory part)
    my $headername = "$prefix/lang.h";
    $headername =~ s/(.*\/)*//;

    print HFILE_CORE <<MOO
/* This file was automatically generated using genlang */
/*
 * The str() macro/functions is how to access strings that might be
 * translated. Use it like str(MACRO) and expect a string to be
 * returned!
 */
#define str(x) language_strings[x]

/* this is the array for holding the string pointers.
   It will be initialized at runtime. */
extern unsigned char *language_strings[];
/* this contains the concatenation of all strings, separated by \\0 chars */
extern const unsigned char core_language_builtin[];

#include "${prefix}_enum.h"

MOO
    ;

    # buffered write errors only show up when the handle is closed
    close(HFILE_CORE) ||
        die "Error: couldn't write file $prefix/lang.h: $!\n";

    # the same handle is reused for the enum header
    open(HFILE_CORE, '>', "${prefix}_enum.h") ||
        die "couldn't create file ${prefix}_enum.h\n";

    print HFILE_CORE <<MOO
/* This file was automatically generated using genlang */
#ifndef _LANG_ENUM_H_
#define _LANG_ENUM_H_
/* The enum below contains all available strings */
enum \{
MOO
    ;

    print CFILE_CORE <<MOO
/* This file was automatically generated using genlang, the strings come
   from "$input" */

#include "$headername"

unsigned char *language_strings[LANG_LAST_INDEX_IN_ARRAY];
const unsigned char core_language_builtin[] =
MOO
;

    my $core = $users{"core"};

    # Output the ID names for the enum in the header file
    for my $i (0 .. $idcount[$core]-1) {
        my $name=$idnum[$core][$i]; # get the ID name

        $name =~ tr/\"//d; # cut off the quotes

        printf HFILE_CORE ("    %s, /* %d */\n", $name, $i);
    }

    # Output separation marker for last string ID and the upcoming voice IDs

    print HFILE_CORE <<MOO
    LANG_LAST_INDEX_IN_ARRAY, /* this is not a string, this is a marker */
    /* --- below this follows voice-only strings --- */
    VOICEONLY_DELIMITER = 0x8000,
MOO
    ;

    # Output the voice-only ID names for the enum in the header file
    for my $i (0x8001 .. ($voiceid[$core]-1)) {
        my $name=$idnum[$core][$i]; # get the ID name

        $name =~ tr/\"//d; # cut off the quotes

        printf HFILE_CORE ("    %s, /* 0x%x */\n", $name, $i);
    }

    # Output end of lang_enum.h
    print HFILE_CORE <<MOO
    LANG_LAST_VOICEONLY_INDEX_IN_ARRAY /* this is not a string, this is a marker */
};
/* end of generated enum list */
#endif /* _LANG_ENUM_H_ */
MOO
    ;

    # Output the target phrases for the source file
    for my $i (0 .. $idcount[$core]-1) {
        my $name=$idnum[$core][$i]; # get the ID
        my $dest = $dest{$name}; # get the destination phrase

        $dest =~ s:\"$:\\0\":; # insert a \0 before the second quote

        if(!$dest) {
            # this is just to be on the safe side
            $dest = '"\0"';
        }

        printf CFILE_CORE ("    %s\n", $dest);
    }

    # Output end of string chunk
    print CFILE_CORE <<MOO
;
/* end of generated string list */
MOO
;

    close(HFILE_CORE) ||
        die "Error: couldn't write file ${prefix}_enum.h: $!\n";
    close(CFILE_CORE) ||
        die "Error: couldn't write file $prefix/lang_core.c: $!\n";
} # end of the c/h file generation
elsif($binary || $binvoice) {
    # Creation of a binary lang file and/or voicestring file was requested

    # The ID numbers written to the files come from the english file (via
    # @idmap), as that is what sets the id order for all language files.
    # The english file was scanned before the translated file.

    if($binary) {
        open(OUTF, '>', $binary) or die "Error: Can't create $binary";
        binmode OUTF;
        printf OUTF ("%c%c%c%c", $LANGUAGE_COOKIE, $LANGUAGE_VERSION, $target_id,
            $langoptions); # magic lang file header
    }
    if($binvoice) {
        open(OUTV, '>', $binvoice) or die "Error: Can't create $binvoice";
        binmode OUTV;
        printf OUTV ("%c%c%c%c", $VOICE_COOKIE, $LANGUAGE_VERSION, $target_id,
            $langoptions); # magic lang file header
    }

    # The per-user arrays are indexed by user index (the values of %users),
    # not by user name; walk the indices in ascending order.
    my @userids = sort { $a <=> $b } values %users;

    # output the number of strings, the size and the offset for each user
    my $foffset = $HEADER_SIZE + $SUBHEADER_SIZE * @userids;
    for my $user (@userids) {
        my $count = $idcount[$user] || 0;
        my $size = 0;
        # NOTE(review): this counts the characters of the still quoted dest
        # string, not the bytes of the NFD/UTF-8 encoded string written
        # below - confirm against the reader in apps/language.c
        for my $n (0 .. $count-1) {
            $size += length(trim($dest{$idnum[$user][$n]})) + 1;
        }
        # three 16 bit values, high byte first
        if($binary) {
            printf OUTF ("%c%c%c%c%c%c", ($count >> 8), ($count & 0xff),
                ($size >> 8), ($size & 0xff), ($foffset >> 8), ($foffset & 0xff));
        }
        if($binvoice) {
            printf OUTV ("%c%c%c%c%c%c", ($count >> 8), ($count & 0xff),
                ($size >> 8), ($size & 0xff), ($foffset >> 8), ($foffset & 0xff));
        }
        $foffset += $size;
    }

    for my $user (@userids) {
        # loop over the target phrases
        # This loops over the strings in the translated language file order
        my @ids = (0 .. (($idcount[$user] || 0) - 1));
        push @ids, (0x8001 .. (($voiceid[$user] || 0) - 1));
        for my $n (@ids) {
            my $name=$idnum[$user][$n]; # get the ID
            my $dest = $dest{$name}; # get the destination phrase
            my $voice = $voice{$name}; # get the destination voice string

            if($dest && $n < 0x8000 && $binary) {
                $dest =~ s/^\"(.*)\"\s*$/$1/g; # cut off quotes
                $dest = encode_utf8(NFD($dest)); # Decompose

                # Now, make sure we get the number from the english sort order:
                my $english_id = $idmap[$user]{$name};

                # 16 bit ID (high byte first), the string, a terminating zero
                printf OUTF ("%c%c%s\x00", (($english_id>>8)&0xff), ($english_id&0xff), $dest);
            }
            if($voice && $binvoice) {
                $voice =~ s/^\"(.*)\"\s*$/$1/g; # cut off quotes
                $voice = encode_utf8($voice);
                # Now, make sure we get the number from the english sort order:
                my $english_id = $idmap[$user]{$name};
                printf OUTV ("%c%c%s\x00", (($english_id>>8)&0xff), ($english_id&0xff), $voice);
            }
        }
    }
    # buffered write errors only show up when the handle is closed
    if($binary) {
        close(OUTF) or die "Error: Can't write $binary: $!";
    }
    if($binvoice) {
        close(OUTV) or die "Error: Can't write $binvoice: $!";
    }
}
elsif($voiceout) {
    # voice output requested, display id: and voice: strings in a v1-like
    # fashion

    my @engl;   # output text, indexed by the ID number from the english file

    # The per-user arrays are indexed by user index (the values of %users),
    # not by user name.
    for my $user (sort { $a <=> $b } values %users) {
        # loop over the target phrases
        # This loops over the strings in the translated language file order
        my @ids = (0 .. (($idcount[$user] || 0) - 1));
        push @ids, (0x8001 .. (($voiceid[$user] || 0) - 1));
        for my $n (@ids) {
            my $name=$idnum[$user][$n]; # get the ID
            my $voice = $voice{$name}; # get the destination voice string

            if($voice) {
                $voice =~ s/^\"(.*)\"\s*$/$1/g; # cut off quotes
                # Now, make sure we get the number from the english sort order:
                my $english_id = $idmap[$user]{$name};
                $engl[$english_id] = "#$english_id ($n)\nid: $name\nvoice: \"$voice\"\n";
            }
        }
    }
    # Print the list in the English sort order
    for (@engl) {
        print $_;
    }
}


# Verbose mode: print how many lang IDs were scanned, then the header lines.
if($verbose) {
    my $num_str = 0;

    # @idcount is indexed by user index (the values of %users), not by name
    for my $user (values %users) {
        $num_str += $idcount[$user] || 0;
    }

    printf("%d ID strings scanned\n", $num_str);

    print "* head *\n";
    for my $key (keys %head) {
        # keep the key out of the format string, it comes from the input file
        printf "%s: %s\n", $key, $head{$key};
    }
}

# After a binary build, cache the ID numbers of the english file in
# english.list next to the output file, unless the cache is already newer
# than the english source. One "user:id:number" line per ID, sorted by
# number within each user.
if ($binary and !file_is_newer("$binpath/english.list", $english)) {
    open(ENGLIST, '>', "$binpath/english.list") ||
        die "Failed creating $binpath/english.list";
    for my $user (keys %users) {
        # @idmap is indexed by user index, so translate the name first
        my $map = $idmap[$users{$user}] || {};
        for my $id (sort { $map->{$a} <=> $map->{$b} } keys %$map) {
            print ENGLIST "$user:$id:$map->{$id}\n";
        }
    }
    # buffered write errors only show up when the handle is closed
    close(ENGLIST) ||
        die "Failed writing $binpath/english.list: $!";
}