Replace the voicebuilding with a perl-based approach. Should greatly speed up building on Cygwin. See more in FS#7646.

git-svn-id: svn://svn.rockbox.org/rockbox/trunk@14457 a1c6a512-1295-4272-9138-f99709370657
2025-10-14 02:27:39 -04:00 · 2007-08-25 22:00:13 +00:00 · 2007-08-25 22:00:13 +00:00 · 17e03e75a4
commit 17e03e75a4
parent b8ded7d674
5 changed files with 465 additions and 22 deletions
--- a/docs/MAINTAINERS
+++ b/docs/MAINTAINERS
@ -290,7 +290,7 @@ Build Tools
 :scramble: Linus Nielsen Feltzing
 :descramble: Linus Nielsen Feltzing
 :mkinfo: Daniel Stenberg
-:voice shell scripts: Jonas Häggqvist
+:voice perl script: Jonas Häggqvist
 Install Tools
 -------------
--- a/tools/VOICE_PAUSE.wav
+++ b/tools/VOICE_PAUSE.wav
--- a/tools/configure
+++ b/tools/configure
@ -288,7 +288,8 @@ whichadvanced () {
    # Ask about languages to build
    echo "Select a number for the language to use (default is english)"
-    echo "You may enter a comma-separated list of languages to build"
+    # The multiple-language feature is currently broken
    # echo "You may enter a comma-separated list of languages to build"
    picklang
    voicelanguage=`whichlang`
@ -329,7 +330,7 @@ voiceconfig () {
    if [ -f "`which flite`" ]; then
        FLITE="F(l)ite "
-        FLITE_OPTS="FLITE_OPTS=\"\""
+        FLITE_OPTS=""
        DEFAULT_TTS="flite"
        DEFAULT_TTS_OPTS=$FLITE_OPTS
        DEFAULT_NOISEFLOOR="500"
@ -337,7 +338,7 @@ voiceconfig () {
    fi
    if [ -f "`which espeak`" ]; then
        ESPEAK="(e)Speak "
-        ESPEAK_OPTS="ESPEAK_OPTS=\"\""
+        ESPEAK_OPTS=""
        DEFAULT_TTS="espeak"
        DEFAULT_TTS_OPTS=$ESPEAK_OPTS
        DEFAULT_NOISEFLOOR="500"
@ -345,7 +346,23 @@ voiceconfig () {
    fi
    if [ -f "`which festival`" ]; then
        FESTIVAL="(F)estival "
-        FESTIVAL_OPTS="FESTIVAL_OPTS=\"\""
+        case "$thislang" in
            "italiano")
            FESTIVAL_OPTS="--language italian"
            ;;
            "espanol")
            FESTIVAL_OPTS="--language spanish"
            ;;
            "finnish")
            FESTIVAL_OPTS="--language finnish"
            ;;
            "czech")
            FESTIVAL_OPTS="--language czech"
            ;;
            *)
            FESTIVAL_OPTS=""
            ;;
        esac
        DEFAULT_TTS="festival"
        DEFAULT_TTS_OPTS=$FESTIVAL_OPTS
        DEFAULT_NOISEFLOOR="500"
@ -354,7 +371,7 @@ voiceconfig () {
    # Allow SAPI if Windows is in use
    if [ -f "`which winver`" ]; then
        SAPI5="(S)API5 "
-        SAPI5_OPTS="SAPI5_OPTS=\"\""
+        SAPI5_OPTS=""
        DEFAULT_TTS="sapi5"
        DEFAULT_TTS_OPTS=$SAPI5_OPTS
        DEFAULT_NOISEFLOOR="500"
@ -397,10 +414,10 @@ voiceconfig () {
    echo "Using $TTS_ENGINE for TTS"
    # Allow the user to input manual commandline options
-    printf "Enter $TTS_ENGINE options (enter for defaults `echo $TTS_OPTS |sed 's/.*=//'`): "
+    printf "Enter $TTS_ENGINE options (enter for defaults \"$TTS_OPTS\"): "
    USER_TTS_OPTS=`input`
    if [ -n "$USER_TTS_OPTS" ]; then
-        TTS_OPTS="`echo $TTS_OPTS | sed 's/=.*//'`=\"$USER_TTS_OPTS\""
+        TTS_OPTS="$USER_TTS_OPTS"
    fi
    echo ""
@ -408,7 +425,7 @@ voiceconfig () {
    if [ -f "`which oggenc`" ]; then
        OGGENC="(O)ggenc "
        DEFAULT_ENC="oggenc"
-        VORBIS_OPTS="VORBIS_OPTS=\"-q0 --downmix\""
+        VORBIS_OPTS="-q0 --downmix"
        DEFAULT_ENC_OPTS=$VORBIS_OPTS
        DEFAULT_CHOICE="O"
    fi
@ -422,7 +439,7 @@ voiceconfig () {
    if [ -f "`which lame`" ]; then
        LAME="(L)ame "
        DEFAULT_ENC="lame"
-        LAME_OPTS="LAME_OPTS=\"--resample 12 -t -m m -h -V 9 -S\""
+        LAME_OPTS="--resample 12 -t -m m -h -V 9 -S -B 64 --vbr-new"
        DEFAULT_ENC_OPTS=$LAME_OPTS
        DEFAULT_CHOICE="L"
    fi
@ -456,25 +473,16 @@ voiceconfig () {
    echo "Using $ENCODER for encoding voice clips"
    # Allow the user to input manual commandline options
-    printf "Enter $ENCODER options (enter for defaults `echo $ENC_OPTS |sed 's/.*=//'`): "
+    printf "Enter $ENCODER options (enter for defaults \"$ENC_OPTS\"): "
    USER_ENC_OPTS=`input`
    if [ -n "$USER_ENC_OPTS" ]; then
-        ENC_OPTS="`echo $ENC_OPTS | sed 's/=.*//'`=\"$USER_ENC_OPTS\""
+        ENC_OPTS=$USER_ENC_OPTS
    fi
    TEMPDIR="${pwd}"
    if [ -f "`which cygpath`" ]; then
        TEMPDIR=`cygpath . -a -w`
    fi
    cat > voicesettings-$thislang.sh <<EOF
 TTS_ENGINE="${TTS_ENGINE}"
 ENCODER="${ENCODER}"
 TEMPDIR="$TEMPDIR"
 NOISEFLOOR="${NOISEFLOOR}"
 ${TTS_OPTS}
 ${ENC_OPTS}
 EOF
 }
 picklang() {
@ -1664,6 +1672,10 @@ sed > Makefile \
 -e "${simmagic1}" \
 -e "${simmagic2}" \
 -e "s,@MANUALDEV@,${manualdev},g" \
 -e "s,@ENCODER@,${ENCODER},g" \
 -e "s,@ENC_OPTS@,${ENC_OPTS},g" \
 -e "s,@TTS_ENGINE@,${TTS_ENGINE},g" \
 -e "s,@TTS_OPTS@,${TTS_OPTS},g" \
 <<EOF
 ## Automaticly generated. http://www.rockbox.org/
@ -1732,6 +1744,10 @@ export GCCVER=@GCCVER@
 export GCCNUM=@GCCNUM@
 export UNAME=@UNAME@
 export MANUALDEV=@MANUALDEV@
 export TTS_OPTS=@TTS_OPTS@
 export TTS_ENGINE=@TTS_ENGINE@
 export ENC_OPTS=@ENC_OPTS@
 export ENCODER=@ENCODER@
 # Do not print "Entering directory ..."
 MAKEFLAGS += --no-print-directory
@ -1866,7 +1882,7 @@ if [ "yes" = "$voice" ]; then
 voice: tools features
 	\$(SILENT)for f in \`cat \$(BUILDDIR)/${apps}/features\`; do feat="\$\$feat:\$\$f" ; done ; \\
-	for lang in \`echo \$(VOICELANGUAGE) |sed "s/,/ /g"\`; do \$(TOOLSDIR)/genvoice.sh \$(ROOTDIR) \$\$lang \$(ARCHOS)\$\$feat \$(TARGET_ID) voicesettings-\$\$lang.sh ; done \\
+	for lang in \`echo \$(VOICELANGUAGE) |sed "s/,/ /g"\`; do \$(TOOLSDIR)/voice.pl -V -l=\$\$lang -t=\$(ARCHOS)\$\$feat -i=\$(TARGET_ID) -e=\$(ENCODER) -E="\$(ENC_OPTS)" -s=\$(TTS_ENGINE) -S="\$(TTS_OPTS)"; done \\
 EOF
 fi
--- a/tools/sapi5_voice_new.vbs
+++ b/tools/sapi5_voice_new.vbs
@ -0,0 +1,67 @@
 '***************************************************************************
 '             __________               __   ___.
 '   Open      \______   \ ____   ____ |  | _\_ |__   _______  ___
 '   Source     |       _//  _ \_/ ___\|  |/ /| __ \ /  _ \  \/  /
 '   Jukebox    |    |   (  <_> )  \___|    < | \_\ (  <_> > <  <
 '   Firmware   |____|_  /\____/ \___  >__|_ \|___  /\____/__/\_ \
 '                     \/            \/     \/    \/            \/
 ' $Id: sapi5_voice.vbs$
 '
 ' Copyright (C) 2007 Steve Bavin, Jens Arnold, Mesar Hameed
 '
 ' All files in this archive are subject to the GNU General Public License.
 ' See the file COPYING in the source tree root for full license agreement.
 '
 ' This software is distributed on an "AS IS" basis, WITHOUT WARRANTY OF ANY
 ' KIND, either express or implied.
 '
 '***************************************************************************
 ' Purpose: Make a voice clip file for each text/filename pair given on stdin.
 ' Input protocol: two lines per clip - the text to speak, then the output
 ' file name. An empty file name line (or end of input) ends the session.
 'To be done:
 ' - Allow user to override voice, speed and/or format (currently uses Control Panel defaults for voice/speed)
 ' - Voice specific replacements/corrections for pronunciation (this should be at a higher level really)
 Const SSFMCreateForWrite = 3
 Const SPSF_8kHz16BitMono = 6
 Const SPSF_11kHz16BitMono = 10
 Const SPSF_12kHz16BitMono = 14
 Const SPSF_16kHz16BitMono = 18
 Const SPSF_22kHz16BitMono = 22
 Const SPSF_24kHz16BitMono = 26
 Const SPSF_32kHz16BitMono = 30
 Const SPSF_44kHz16BitMono = 34
 Const SPSF_48kHz16BitMono = 38
 Dim oSpVoice, oSpFS, nAudioFormat, sText, sOutputFile
 nAudioFormat = SPSF_22kHz16BitMono 'Audio format to use, recommended settings:
 '- for AT&T natural voices, use SPSF_32kHz16BitMono
 '- for MS voices, use SPSF_22kHz16BitMono
 ' Without "On Error Resume Next" a failing CreateObject aborts the script
 ' before Err.Number can be inspected, so enable it just for this call.
 On Error Resume Next
 Set oSpVoice = CreateObject("SAPI.SpVoice")
 If Err.Number <> 0 Then
    WScript.Echo "Error - could not get SpVoice object. " & _
    "SAPI 5 not installed?"
    Err.Clear
    WScript.Quit 1
 End If
 On Error GoTo 0
 Do
    ' Stop cleanly if the caller closed the pipe instead of sending the
    ' empty terminator pair.
    If WScript.StdIn.AtEndOfStream Then Exit Do
    sText = WScript.StdIn.ReadLine
    If WScript.StdIn.AtEndOfStream Then Exit Do
    sOutputFile = WScript.StdIn.ReadLine
    If sOutputFile = "" Then Exit Do
    ' WScript.Echo "Saying " + sText + " in " + sOutputFile
    Set oSpFS = CreateObject("SAPI.SpFileStream")
    oSpFS.Format.Type = nAudioFormat
    oSpFS.Open sOutputFile, SSFMCreateForWrite, False
    Set oSpVoice.AudioOutputStream = oSpFS
    oSpVoice.Speak sText
    oSpFS.Close
 Loop
 Set oSpFS = Nothing
 Set oSpVoice = Nothing
 WScript.Quit 0
--- a/tools/voice.pl
+++ b/tools/voice.pl
@ -0,0 +1,360 @@
 #!/usr/bin/perl -s
 #             __________               __   ___.
 #   Open      \______   \ ____   ____ |  | _\_ |__   _______  ___
 #   Source     |       _//  _ \_/ ___\|  |/ /| __ \ /  _ \  \/  /
 #   Jukebox    |    |   (  <_> )  \___|    < | \_\ (  <_> > <  <
 #   Firmware   |____|_  /\____/ \___  >__|_ \|___  /\____/__/\_ \
 #                     \/            \/     \/    \/            \/
 # $Id: 
 #
 # Copyright (C) 2007 Jonas Häggqvist
 #
 # All files in this archive are subject to the GNU General Public License.
 # See the file COPYING in the source tree root for full license agreement.
 #
 # This software is distributed on an "AS IS" basis, WITHOUT WARRANTY OF ANY
 # KIND, either express or implied.
 use strict;
 use warnings;
 use File::Basename;
 use File::Copy;
 use Switch;
 use vars qw($V $C $t $l $e $E $s $S $i $v);
 use IPC::Open3;
 use Digest::MD5 qw(md5_hex);
 # Print the command line help text (all supported -X switches and their
 # meaning) to the currently selected output handle. Takes no arguments.
 sub printusage {
    print <<USAGE
 Usage: voice.pl [options] [path to dir]
 -V
    Create voice file. You must also specify -t and -l.
 -C
    Create .talk clips.
 -t=<target>
    Specify which target you want to build voicefile for. Must include
    any features that target supports.
 -i=<target_id>
    Numeric target id. Needed for voice building.
 -l=<language>
    Specify which language you want to build. Without .lang extension.
 -e=<encoder>
    Which encoder to use for voice strings
 -E=<encoder options>
    Which encoder options to use when compressing voice strings. Enclose
    in double quotes if the options include spaces.
 -s=<TTS engine>
    Which TTS engine to use.
 -S=<TTS engine options>
    Options to pass to the TTS engine. Enclose in double quotes if the
    options include spaces.
 -v
    Be verbose
 USAGE
 ;
 }
 # Initialize TTS engine. May return an object or value which will be passed
 # to voicestring and shutdown_tts.
 #
 # Arguments: engine name, engine command line options, language name.
 # Returns:
 #   festival - the pid returned by the piped open of the server command
 #   sapi5    - a write handle connected to the cscript helper's stdin
 #   others   - undef (nothing needs to be started up front)
 # Dies if the helper process cannot be started.
 sub init_tts {
    our $verbose;
    my ($tts_engine, $tts_engine_opts, $language) = @_;
    my $ret = undef;
    if ($tts_engine eq "festival") {
        print("> festival $tts_engine_opts --server\n") if $verbose;
        # Deliberately a package-level handle: a lexical one would be closed
        # when this block ends, and closing a pipe waits for the child.
        my $pid = open(FESTIVAL_SERVER, "| festival $tts_engine_opts --server > /dev/null 2>&1")
            or die "Unable to start festival server: $!\n";
        $ret = $pid;
        # SIGKILL cannot be trapped, so hook INT and TERM instead.
        my $abort = sub { kill TERM => $pid; panic_cleanup(); };
        $SIG{INT} = $abort;
        $SIG{TERM} = $abort;
    }
    elsif ($tts_engine eq "sapi5") {
        my $toolsdir = dirname($0);
        my $path = `cygpath "$toolsdir" -a -w`;
        chomp($path);
        $path = $path . "\\sapi5_voice_new.vbs $language $tts_engine_opts";
        # Double the backslashes so the shell hands single ones to cscript
        $path =~ s/\\/\\\\/g;
        print("> cscript /B $path\n") if $verbose;
        open(my $sapi, "| cscript /B $path")
            or die "Unable to start cscript: $!\n";
        $ret = $sapi;
        # An empty text/filename pair tells the helper script to quit.
        my $abort = sub { print($sapi "\r\n\r\n"); panic_cleanup(); };
        $SIG{INT} = $abort;
        $SIG{TERM} = $abort;
    }
    return $ret;
 }
 # Shutdown TTS engine if necessary.
 #
 # Arguments: engine name and the value init_tts returned for it.
 # Does nothing when no object was created, so a failed or skipped startup
 # never results in kill() being called with an undefined pid.
 sub shutdown_tts {
    my ($tts_engine, $tts_object) = @_;
    return unless defined($tts_object);
    if ($tts_engine eq "festival") {
        # Send SIGTERM to festival server
        kill TERM => $tts_object;
    }
    elsif ($tts_engine eq "sapi5") {
        # An empty text/filename pair makes the helper script exit
        print($tts_object "\r\n\r\n");
        close($tts_object);
    }
    return;
 }
 # Apply corrections to a voice-string to make it sound better
 #
 # Arguments: the string, the language name and the TTS engine name. The last
 # two may be omitted; only the general corrections are applied then.
 # Returns the corrected string. With $verbose set, changes are printed.
 sub correct_string {
    our $verbose;
    my ($string, $language, $tts_engine) = @_;
    $language = '' unless defined($language);
    my $orig = $string;
    # General for all engines and languages (perhaps - just an example)
    $string =~ s/USB/U S B/g;
    if ($language eq "deutsch") {
        # Applied for every TTS engine; $tts_engine is accepted for future
        # engine specific rules.
        $string =~ s/alphabet/alfabet/;
        $string =~ s/alkaline/alkalein/;
        $string =~ s/ampere/amper/;
        $string =~ s/byte(s?)\b/beit$1/;
        $string =~ s/\bdezibel\b/de-zibell/;
        $string =~ s/energie\b/ener-gie/;
        $string =~ s/\bflash\b/fläsh/g;
        $string =~ s/\bfirmware(s?)\b/firmwer$1/;
        $string =~ s/\bid3 tag\b/id3 täg/g; # can't just use "tag" here
        $string =~ s/\bloudness\b/laudness/;
        $string =~ s/\bnumerisch\b/numehrisch/;
        $string =~ s/\brücklauf\b/rück-lauf/;
        $string =~ s/\bsuchlauf\b/such-lauf/;
    }
    if ($orig ne $string) {
        printf("%s -> %s\n", $orig, $string) if $verbose;
    }
    return $string;
 }
 # Produce a wav file of the text given
 #
 # Arguments: text to speak, output wav file name, TTS engine name, TTS
 # engine options and the object returned by init_tts (only used by sapi5,
 # where it is the write handle of the helper script).
 sub voicestring {
    our $verbose;
    my ($string, $output, $tts_engine, $tts_engine_opts, $tts_object) = @_;
    my $cmd;
    printf("Generate \"%s\" with %s in file %s\n", $string, $tts_engine, $output) if $verbose;
    if ($tts_engine eq "festival") {
        # festival_client lies to us, so we have to do awful soul-eating
        # work with IPC::open3()
        $cmd = "festival_client --server localhost --otype riff --ttw --output \"$output\"";
        print("> $cmd\n") if $verbose;
        # Open command, and filehandles for STDIN, STDOUT, STDERR
        my $pid = open3(*CMD_IN, *CMD_OUT, *CMD_ERR, $cmd);
        # Put the string to speak into STDIN and close it
        print(CMD_IN $string);
        close(CMD_IN);
        # Read all output from festival_client (because it LIES TO US)
        while (<CMD_ERR>) {
        }
        close(CMD_OUT);
        close(CMD_ERR);
        # open3 never reaps its child; without this every clip leaves a
        # zombie process behind.
        waitpid($pid, 0);
    }
    elsif ($tts_engine eq "flite") {
        # The text ends up inside a double quoted shell word, so escape the
        # characters the shell still interprets there.
        (my $quoted = $string) =~ s/(["\\\$`])/\\$1/g;
        $cmd = "flite $tts_engine_opts -t \"$quoted\" \"$output\"";
        print("> $cmd\n") if $verbose;
        `$cmd`;
    }
    elsif ($tts_engine eq "espeak") {
        $cmd = "espeak $tts_engine_opts -w \"$output\"";
        print("> $cmd\n") if $verbose;
        open(ESPEAK, "| $cmd") or die "Unable to run espeak: $!\n";
        print ESPEAK $string . "\n";
        close(ESPEAK);
    }
    elsif ($tts_engine eq "sapi5") {
        # The helper script reads the text and the file name on two lines
        print($tts_object sprintf("%s\r\n%s\r\n", $string, $output));
    }
 }
 # Encode a wav file into the given destination file
 #
 # Arguments: input wav file, output file, encoder name, encoder options.
 # Known encoders are 'lame', 'vorbis' (also accepted as 'oggenc', the name
 # of the binary) and 'speexenc'. The encoder is run exactly once; an
 # unknown encoder name produces a warning and no output.
 sub encodewav {
    our $verbose;
    my ($input, $output, $encoder, $encoder_opts) = @_;
    printf("Encode \"%s\" with %s in file %s\n", $input, $encoder, $output) if $verbose;
    my $cmd;
    if ($encoder eq 'lame') {
        $cmd = "lame $encoder_opts \"$input\" \"$output\"";
    }
    elsif ($encoder eq 'vorbis' or $encoder eq 'oggenc') {
        $cmd = "oggenc $encoder_opts \"$input\" -o \"$output\"";
    }
    elsif ($encoder eq 'speexenc') {
        $cmd = "speexenc $encoder_opts \"$input\" \"$output\"";
    }
    else {
        warn("Unknown encoder \"$encoder\", $input not encoded\n");
        return;
    }
    print("> $cmd\n") if $verbose;
    `$cmd`;
    return;
 }
 # Strip silence from a wav file using the wavtrim tool that lives next to
 # this script.
 #
 # Arguments: the wav file and, optionally, a numeric threshold which is
 # handed to the tool as its second argument. Without a threshold the tool's
 # own default applies.
 # NOTE(review): assumes the wavtrim binary accepts the threshold as an
 # optional second argument - confirm against tools/wavtrim.
 sub wavtrim {
    our $verbose;
    my ($file, $threshold) = @_;
    my $cmd = dirname($0) . "/wavtrim \"$file\"";
    if (defined($threshold) and $threshold =~ /^\d+$/) {
        $cmd .= " $threshold";
    }
    print("> $cmd\n") if $verbose;
    `$cmd`;
 }
 # Run genlang and create voice clips for each string
 #
 # Arguments: language, target (including features), encoder, encoder
 # options, TTS engine, TTS engine options.
 # Writes genlang's output to the file "voicefontids" in the current
 # directory and leaves one <id>.mp3 per voiced string next to it. When the
 # POOL environment variable is set, clips are looked up in and stored to
 # that directory, keyed by the md5 of the corrected string, the language
 # and the TTS engine.
 sub generateclips {
    our $verbose;
    my ($language, $target, $encoder, $encoder_opts, $tts_engine, $tts_engine_opts) = @_;
    my $genlang = dirname($0) . '/genlang';
    my $english = dirname($0) . '/../apps/lang/english.lang';
    my $langfile = dirname($0) . '/../apps/lang/' . $language . '.lang';
    my $id = '';
    my $voice = '';
    my $cmd = "$genlang -o -t=$target -e=$english $langfile 2>/dev/null";
    my $pool_file;
    open(my $ids_fh, '>', 'voicefontids')
        or die "Unable to write voicefontids: $!\n";
    my $count = 0;
    my $tts_object = init_tts($tts_engine, $tts_engine_opts, $language);
    print("Generating voice clips");
    print("\n") if $verbose;
    for my $line (`$cmd`) {
        print($ids_fh $line);
        if ($line =~ /^id: (.*)$/) {
            $id = $1;
        }
        elsif ($line =~ /^voice: "(.*)"$/) {
            $voice = $1;
            if ($id !~ /^NOT_USED_.*$/ && $voice ne "") {
                my $wav = $id . '.wav';
                my $mp3 = $id . '.mp3';
                # Print some progress information
                if (++$count % 10 == 0 and !$verbose) {
                    print(".");
                }
                # Apply corrections to the string. Language and engine must
                # be passed along, otherwise only the general rules match.
                $voice = correct_string($voice, $language, $tts_engine);
                # If we have a pool of snippets, see if the string exists there first
                if (defined($ENV{'POOL'})) {
                    $pool_file = sprintf("%s/%s-%s-%s.mp3", $ENV{'POOL'}, md5_hex($voice), $language, $tts_engine);
                    if (-f $pool_file) {
                        printf("Re-using %s (%s) from pool\n", $id, $voice) if $verbose;
                        copy($pool_file, $mp3);
                    }
                }
                # Don't generate MP3 if it already exists (probably from the POOL)
                if (! -f $mp3) {
                    if ($id eq "VOICE_PAUSE") {
                        print("Use distributed $wav\n") if $verbose;
                        copy(dirname($0)."/VOICE_PAUSE.wav", $wav);
                    }
                    else {
                        voicestring($voice, $wav, $tts_engine, $tts_engine_opts, $tts_object);
                        wavtrim($wav, 500); # 500 seems to be a reasonable default for now
                    }
                    encodewav($wav, $mp3, $encoder, $encoder_opts);
                    if (defined($ENV{'POOL'})) {
                        copy($mp3, $pool_file);
                    }
                    unlink($wav);
                }
                $voice = "";
                $id = "";
            }
        }
    }
    print("\n");
    close($ids_fh);
    shutdown_tts($tts_engine, $tts_object);
 }
 # Assemble the voicefile
 #
 # Arguments: language name and numeric target id.
 # Runs the voicefont tool found next to this script on "voicefontids" and
 # the clips in the current directory. The result is stored as
 # <language>.voice, or <language>-2.voice, <language>-3.voice, ... when
 # the earlier names are already taken.
 sub createvoice {
    our $verbose;
    my ($language, $target_id) = @_;
    my $tool = dirname($0) . '/voicefont';
    # Look for the first file name that is still free
    my $attempt = 1;
    my $outfile = $language . '.voice';
    while (-f $outfile) {
        $attempt++;
        $outfile = $language . '-' . $attempt . '.voice';
    }
    printf("Saving voice file to %s\n", $outfile) if $verbose;
    my $cmd = "$tool 'voicefontids' $target_id ./ $outfile";
    print("> $cmd\n") if $verbose;
    my $tool_output = `$cmd`;
    print($tool_output) if $verbose;
 }
 # Remove every *.mp3 and *.wav file from the current directory.
 sub deletemp3s() {
    foreach my $pattern ('*.mp3', '*.wav') {
        foreach my $leftover (glob($pattern)) {
            unlink($leftover);
        }
    }
 }
 # Emergency exit: remove the generated clips and abort the script.
 # Used as a signal handler (the signal name is then the first argument) and
 # called directly, without arguments, by the TTS specific handlers.
 sub panic_cleanup {
    my ($signal) = @_;
    deletemp3s();
    my $reason = defined($signal) ? " by SIG$signal" : "";
    die "Interrupted$reason - temporary voice clips removed\n";
 }
 # Check parameters
 # The single letter variables are filled in by perl's -s switch parsing
 # (see the shebang line): -V, -C, -t, -l, -i, -e, -E, -s, -S and -v.
 my $printusage = 0;
 unless (defined($V) or defined($C)) { print("Missing either -V or -C\n"); $printusage = 1; }
 if (defined($V)) {
    unless (defined($t)) { print("Missing -t argument\n"); $printusage = 1; }
    unless (defined($l)) { print("Missing -l argument\n"); $printusage = 1; }
    unless (defined($i)) { print("Missing -i argument\n"); $printusage = 1; }
 }
 elsif (defined($C)) {
    unless (defined($ARGV[0])) { print "Missing path argument\n"; $printusage = 1; }
 }
 unless (defined($e)) { print("Missing -e argument\n"); $printusage = 1; }
 unless (defined($E)) { print("Missing -E argument\n"); $printusage = 1; }
 unless (defined($s)) { print("Missing -s argument\n"); $printusage = 1; }
 unless (defined($S)) { print("Missing -S argument\n"); $printusage = 1; }
 if ($printusage == 1) { printusage(); exit 1; }
 # Clean up on interruption. SIGKILL cannot be trapped, so handle the
 # catchable INT and TERM signals.
 $SIG{INT} = \&panic_cleanup;
 $SIG{TERM} = \&panic_cleanup;
 if (defined($v) or defined($ENV{'V'})) {
    our $verbose = 1;
 }
 # Do what we're told
 # String comparison guarded by defined() so that running with only -C does
 # not warn about an uninitialized $V.
 if (defined($V) and $V eq '1') {
    printf("Generating voice\n  Target: %s\n  Language: %s\n  Encoder (options): %s (%s)\n  TTS Engine (options): %s (%s)\n",
        $t, $l, $e, $E, $s, $S);
    generateclips($l, $t, $e, $E, $s, $S);
    createvoice($l, $i);
    deletemp3s();
 }
 elsif ($C) {
    # xxx: Implement .talk clip generation
 }
 else {
    printusage();
    exit 1;
 }