voice: Add support for Google Translate's speech synthesizer

Uses the 'gtts-cli' command line client. Supports a wide variety of
languages, including all "Complete" and "Good" Rockbox translations.

Additional changes:

 * voice synth script can accept pre-encoded mp3 files
 * Move language->synth options mapping into the voice script
 * Additional cleanups

Change-Id: I9523e2bca87cbcee2d8c4111f9892e8e458c7419
2025-11-09 21:22:39 -05:00 · 2020-07-08 19:05:09 -04:00 · 2020-07-08 19:05:09 -04:00 · 2c3399537c
commit 2c3399537c
parent 5e98eba8ab
2 changed files with 93 additions and 51 deletions
--- a/tools/configure
+++ b/tools/configure
@ -1111,23 +1111,7 @@ voiceconfig () {
    fi
    if [ -n "`findtool festival`" ]; then
        FESTIVAL="(F)estival "
-        case "$thislang" in
+        FESTIVAL_OPTS=""
            "italiano")
            FESTIVAL_OPTS="--language italian"
            ;;
            "espanol")
            FESTIVAL_OPTS="--language spanish"
            ;;
            "finnish")
            FESTIVAL_OPTS="--language finnish"
            ;;
            "czech")
            FESTIVAL_OPTS="--language czech"
            ;;
            *)
            FESTIVAL_OPTS=""
            ;;
        esac
        DEFAULT_TTS="festival"
        DEFAULT_TTS_OPTS=$FESTIVAL_OPTS
        DEFAULT_NOISEFLOOR="500"
@ -1149,6 +1133,23 @@ voiceconfig () {
        DEFAULT_NOISEFLOOR="500"
        DEFAULT_CHOICE="w"
    fi
    # Allow SAPI if Windows is in use
    if [ -n "`findtool winver`" ]; then
        SAPI="(S)API "
        SAPI_OPTS=""
        DEFAULT_TTS="sapi"
        DEFAULT_TTS_OPTS=$SAPI_OPTS
        DEFAULT_NOISEFLOOR="500"
        DEFAULT_CHOICE="S"
    fi
    if [ -n "`findtool gtts-cli`" ]; then
        GTTS="(g)tts "
        GTTS_OPTS=""
        DEFAULT_TTS="gtts"
        DEFAULT_TTS_OPTS=$GTTS_OPTS
        DEFAULT_NOISEFLOOR="500"
        DEFAULT_CHOICE="g"
    fi
    if [ -n "`findtool rbspeak`" ]; then
        RBSPEAK="(O)ther "
        RBSPEAK_OPTS=""
@ -1157,17 +1158,8 @@ voiceconfig () {
        DEFAULT_NOISEFLOOR="500"
        DEFAULT_CHOICE="O"
    fi
    # Allow SAPI if Windows is in use
    if [ -n "`findtool winver`" ]; then
        SAPI="(S)API "
        SAPI_OPTS=""
        DEFAULT_TTS="sapi"
        DEFAULT_TTS_OPTS=$SAPI_OPTS
        DEFAULT_NOISEFLOOR="500"
        DEFAULT_CHOICE="s"
    fi
-    if [ "$FESTIVAL" = "$FLITE" ] && [ "$FLITE" = "$ESPEAK" ] && [ "$ESPEAK" = "$SAPI" ] && [ "$SAPI" = "$MIMIC"] && [ "$MIMIC" = "$SWIFT" ] && [ "$SWIFT" = "$RBSPEAK" ]; then
+    if [ "$FESTIVAL" = "$FLITE" ] && [ "$FLITE" = "$ESPEAK" ] && [ "$ESPEAK" = "$SAPI" ] && [ "$SAPI" = "$MIMIC"] && [ "$MIMIC" = "$SWIFT" ] && [ "$SWIFT" = "$RBSPEAK" ] && [ "$RBSPEAK" = "$GTTS" ] ; then
        echo "You need Festival, eSpeak, Mimic, Flite, or rbspeak in your path, or SAPI available to build voice files"
        exit 3
    fi
@ -1175,7 +1167,7 @@ voiceconfig () {
    if [ "$ARG_TTS" ]; then
        option=$ARG_TTS
    else
-        echo "TTS engine to use: ${FLITE}${FESTIVAL}${ESPEAK}${MIMIC}${SAPI}${SWIFT}${RBSPEAK}(${DEFAULT_CHOICE})?"
+        echo "TTS engine to use: ${FLITE}${FESTIVAL}${ESPEAK}${MIMIC}${SAPI}${SWIFT}${GTTS}${RBSPEAK}(${DEFAULT_CHOICE})?"
        option=`input`
        if [ -z "$option" ]; then option=${DEFAULT_CHOICE}; fi
        advopts="$advopts --tts=$option"
@ -1211,6 +1203,11 @@ voiceconfig () {
        NOISEFLOOR="500"
        TTS_OPTS=$SWIFT_OPTS
 	;;
 	# (g)tts: Google Translate speech synthesizer via gtts-cli.
 	# The pattern must be a closed bracket expression; an unterminated
 	# "[Gg" is matched literally and would never select this engine.
 	[Gg])
        TTS_ENGINE="gtts"
        NOISEFLOOR="500"
        TTS_OPTS=$GTTS_OPTS
 	;;
 	[Oo])
        TTS_ENGINE="rbspeak"
        NOISEFLOOR="500"
--- a/tools/voice.pl
+++ b/tools/voice.pl
@ -5,7 +5,7 @@
 #   Jukebox    |    |   (  <_> )  \___|    < | \_\ (  <_> > <  <
 #   Firmware   |____|_  /\____/ \___  >__|_ \|___  /\____/__/\_ \
 #                     \/            \/     \/    \/            \/
-# $Id$ 
+# $Id$
 #
 # Copyright (C) 2007 Jonas Häggqvist
 #
@ -33,46 +33,73 @@ sub printusage {
 Usage: voice.pl [options] [path to dir]
 -V
    Create voice file. You must also specify -t and -l.
- 
+
 -C
    Create .talk clips.
 -t=<target>
    Specify which target you want to build voicefile for. Must include
    any features that target supports.
- 
+
 -i=<target_id>
    Numeric target id. Needed for voice building.
- 
+
 -l=<language>
    Specify which language you want to build. Without .lang extension.
- 
+
 -e=<encoder>
    Which encoder to use for voice strings
 -E=<encoder options>
    Which encoder options to use when compressing voice strings. Enclose
    in double quotes if the options include spaces.
- 
+
 -s=<TTS engine>
    Which TTS engine to use.
- 
+
 -S=<TTS engine options>
    Options to pass to the TTS engine. Enclose in double quotes if the
    options include spaces.
- 
+
 -v
    Be verbose
 USAGE
 ;
 }
 # Maps a Rockbox language name to the language name handed to festival
 # through the "-l" option (looked up by init_tts).  Languages without an
 # entry get no language option at all.
 #
 # NOTE: this must be initialised from a (...) list.  Using {...} would
 # assign a single anonymous-hash reference, leaving the map effectively
 # empty so that every lookup comes back undefined.
 #
 # The commented-out entries are kept from the original; the reason they
 # are disabled is not visible here -- TODO confirm before enabling.
 my %festival_lang_map = (
     'english'    => 'english',
     'english-us' => 'english',
     'espanol'    => 'spanish',
     #'finnish'   => 'finnish',
     #'italiano'  => 'italian',
     #'czech'     => 'czech',
     #'welsh'     => 'welsh',
 );
 # Maps a Rockbox language name to the language code handed to gtts-cli
 # through the "-l" option (looked up by init_tts).  Languages without an
 # entry get no language option at all.
 #
 # NOTE: this must be initialised from a (...) list.  Using {...} would
 # assign a single anonymous-hash reference, leaving the map effectively
 # empty so that every lookup comes back undefined.
 my %gtts_lang_map = (
     'english'    => 'en-gb',  # Always first, it's the golden master
     'deutsch'    => 'de',
     'english-us' => 'en-us',
     'francais'   => 'fr-fr',
     'greek'      => 'el',     # ISO 639-1 code for Greek is "el", not "gr"
     'italiano'   => 'it',
     'norsk'      => 'no',
     'polski'     => 'pl',
     'russian'    => 'ru',
     'slovak'     => 'sk',
     'srpski'     => 'sr',
 );
 # Initialize TTS engine. May return an object or value which will be passed
 # to voicestring and shutdown_tts
 sub init_tts {
    our $verbose;
    my ($tts_engine, $tts_engine_opts, $language) = @_;
    my %ret = ("name" => $tts_engine);
    $ret{"format"} = 'wav';
    $ret{"ttsoptions"} = "";
    # Don't use given/when here - it's not compatible with old perl versions
    if ($tts_engine eq 'festival') {
        print("> festival $tts_engine_opts --server\n") if $verbose;
@ -81,8 +108,10 @@ sub init_tts {
        $SIG{INT} = sub { kill TERM => $pid; print("foo"); panic_cleanup(); };
        $SIG{KILL} = sub { kill TERM => $pid; print("boo"); panic_cleanup(); };
        $ret{"pid"} = $pid;
-    }
+        if (defined($festival_lang_map{$language})) {
-    elsif ($tts_engine eq 'sapi') {
+            $ret{"ttsoptions"} = "-l $festival_lang_map{$language} ";
        }
    } elsif ($tts_engine eq 'sapi') {
        my $toolsdir = dirname($0);
        my $path = `cygpath $toolsdir -a -w`;
        chomp($path);
@ -102,6 +131,11 @@ sub init_tts {
                "stdin" => *CMD_IN,
                "stdout" => *CMD_OUT,
                "vendor" => $vendor);
    } elsif ($tts_engine eq 'gtts') {
        $ret{"format"} = 'mp3';
        if (defined($gtts_lang_map{$language})) {
            $ret{"ttsoptions"} = "-l $gtts_lang_map{$language} ";
        }
    }
    return \%ret;
 }
@ -143,6 +177,9 @@ sub voicestring {
    my ($string, $output, $tts_engine_opts, $tts_object) = @_;
    my $cmd;
    my $name = $$tts_object{'name'};
    $tts_engine_opts .= $$tts_object{"ttsoptions"};
    printf("Generate \"%s\" with %s in file %s\n", $string, $name, $output) if $verbose;
    if ($name eq 'festival') {
        # festival_client lies to us, so we have to do awful soul-eating
@ -167,7 +204,7 @@ sub voicestring {
    elsif ($name eq 'flite') {
        $cmd = "flite $tts_engine_opts -t \"$string\" \"$output\"";
        print("> $cmd\n") if $verbose;
-        `$cmd`;
+        system($cmd);
    }
    elsif ($name eq 'espeak') {
        $cmd = "espeak $tts_engine_opts -w \"$output\"";
@ -193,11 +230,14 @@ sub voicestring {
        close(RBSPEAK);
    }
    elsif ($name eq 'mimic') {
-	$cmd = "mimic $tts_engine_opts -o $output";
+        $cmd = "mimic $tts_engine_opts -o $output -t \"$string\" ";
-	print("> $cmd\n") if $verbose;
+        print("> $cmd\n") if $verbose;
-	open (MIMIC, "| $cmd");
+        system($cmd);
-	print MIMIC $string . "\n";
+    }
-	close(MIMIC);
+    elsif ($name eq 'gtts') {
        $cmd = "gtts-cli $tts_engine_opts -o $output \"$string\"";
        print("> $cmd\n") if $verbose;
        system($cmd);
    }
 }
@ -326,17 +366,22 @@ sub generateclips {
                    if ($id eq "VOICE_PAUSE") {
                        print("Use distributed $wav\n") if $verbose;
                        copy(dirname($0)."/VOICE_PAUSE.wav", $wav);
                    } else {
 			voicestring($voice, $wav, $tts_engine_opts, $tts_object);
 			if ($tts_object->{'format'} eq "wav") {
 			    wavtrim($wav, 500, $tts_object);
 			    # 500 seems to be a reasonable default for now
 			}
                    }
-                    else {
+		    if ($tts_object->{'format'} eq "wav" || $id eq "VOICE_PAUSE") {
-                        voicestring($voice, $wav, $tts_engine_opts, $tts_object);
+			encodewav($wav, $mp3, $encoder, $encoder_opts, $tts_object);
-                        wavtrim($wav, 500, $tts_object);
+		    } else {
-                        # 500 seems to be a reasonable default for now
+			copy($wav, $mp3);
-                    }
+		    }
                    encodewav($wav, $mp3, $encoder, $encoder_opts, $tts_object);
                    synchronize($tts_object);
                    if (defined($ENV{'POOL'})) {
-                        copy($mp3, $pool_file);
+			copy($mp3, $pool_file);
                    }
                    unlink($wav);
                }