voice: Add support for Google Translate's speech synthesizer

Uses the 'gtts-cli' command line client.  Supports a wide variety of
languages, including all "Complete" and "Good" Rockbox translations.

Additional changes:

 * voice synth script can accept pre-encoded mp3 files
 * Move language->synth options mapping into the voice script
 * Additional cleanups

Change-Id: I9523e2bca87cbcee2d8c4111f9892e8e458c7419
This commit is contained in:
Solomon Peachy 2020-07-08 19:05:09 -04:00
parent 5e98eba8ab
commit 2c3399537c
2 changed files with 93 additions and 51 deletions

53
tools/configure vendored
View file

@ -1111,23 +1111,7 @@ voiceconfig () {
fi
if [ -n "`findtool festival`" ]; then
FESTIVAL="(F)estival "
case "$thislang" in
"italiano")
FESTIVAL_OPTS="--language italian"
;;
"espanol")
FESTIVAL_OPTS="--language spanish"
;;
"finnish")
FESTIVAL_OPTS="--language finnish"
;;
"czech")
FESTIVAL_OPTS="--language czech"
;;
*)
FESTIVAL_OPTS=""
;;
esac
FESTIVAL_OPTS=""
DEFAULT_TTS="festival"
DEFAULT_TTS_OPTS=$FESTIVAL_OPTS
DEFAULT_NOISEFLOOR="500"
@ -1149,6 +1133,23 @@ voiceconfig () {
DEFAULT_NOISEFLOOR="500"
DEFAULT_CHOICE="w"
fi
# Allow SAPI if Windows is in use
if [ -n "`findtool winver`" ]; then
SAPI="(S)API "
SAPI_OPTS=""
DEFAULT_TTS="sapi"
DEFAULT_TTS_OPTS=$SAPI_OPTS
DEFAULT_NOISEFLOOR="500"
DEFAULT_CHOICE="S"
fi
if [ -n "`findtool gtts-cli`" ]; then
GTTS="(g)tts "
GTTS_OPTS=""
DEFAULT_TTS="gtts"
DEFAULT_TTS_OPTS=$GTTS_OPTS
DEFAULT_NOISEFLOOR="500"
DEFAULT_CHOICE="g"
fi
if [ -n "`findtool rbspeak`" ]; then
RBSPEAK="(O)ther "
RBSPEAK_OPTS=""
@ -1157,17 +1158,8 @@ voiceconfig () {
DEFAULT_NOISEFLOOR="500"
DEFAULT_CHOICE="O"
fi
# Allow SAPI if Windows is in use
if [ -n "`findtool winver`" ]; then
SAPI="(S)API "
SAPI_OPTS=""
DEFAULT_TTS="sapi"
DEFAULT_TTS_OPTS=$SAPI_OPTS
DEFAULT_NOISEFLOOR="500"
DEFAULT_CHOICE="s"
fi
if [ "$FESTIVAL" = "$FLITE" ] && [ "$FLITE" = "$ESPEAK" ] && [ "$ESPEAK" = "$SAPI" ] && [ "$SAPI" = "$MIMIC"] && [ "$MIMIC" = "$SWIFT" ] && [ "$SWIFT" = "$RBSPEAK" ]; then
# Bail out when no TTS engine was detected.  Each engine variable is only
# assigned above when its tool is found, so if every one of them compares
# equal (presumably all empty) there is nothing to build voice files with.
# Note: every "[" needs a space before its closing "]", otherwise the test
# errors out and the whole && chain silently evaluates to false.
if [ "$FESTIVAL" = "$FLITE" ] && [ "$FLITE" = "$ESPEAK" ] && [ "$ESPEAK" = "$SAPI" ] && [ "$SAPI" = "$MIMIC" ] && [ "$MIMIC" = "$SWIFT" ] && [ "$SWIFT" = "$RBSPEAK" ] && [ "$RBSPEAK" = "$GTTS" ] ; then
echo "You need Festival, eSpeak, Mimic, Flite, gtts-cli, or rbspeak in your path, or SAPI available to build voice files"
exit 3
fi
@ -1175,7 +1167,7 @@ voiceconfig () {
if [ "$ARG_TTS" ]; then
option=$ARG_TTS
else
echo "TTS engine to use: ${FLITE}${FESTIVAL}${ESPEAK}${MIMIC}${SAPI}${SWIFT}${RBSPEAK}(${DEFAULT_CHOICE})?"
echo "TTS engine to use: ${FLITE}${FESTIVAL}${ESPEAK}${MIMIC}${SAPI}${SWIFT}${GTTS}${RBSPEAK}(${DEFAULT_CHOICE})?"
option=`input`
if [ -z "$option" ]; then option=${DEFAULT_CHOICE}; fi
advopts="$advopts --tts=$option"
@ -1211,6 +1203,11 @@ voiceconfig () {
NOISEFLOOR="500"
TTS_OPTS=$SWIFT_OPTS
;;
# Google Translate speech synthesizer (gtts-cli); accepts "g" or "G".
[Gg])
TTS_ENGINE="gtts"
NOISEFLOOR="500"
TTS_OPTS=$GTTS_OPTS
;;
[Oo])
TTS_ENGINE="rbspeak"
NOISEFLOOR="500"

View file

@ -5,7 +5,7 @@
# Jukebox | | ( <_> ) \___| < | \_\ ( <_> > < <
# Firmware |____|_ /\____/ \___ >__|_ \|___ /\____/__/\_ \
# \/ \/ \/ \/ \/
# $Id$
# $Id$
#
# Copyright (C) 2007 Jonas Häggqvist
#
@ -33,46 +33,73 @@ sub printusage {
Usage: voice.pl [options] [path to dir]
-V
Create voice file. You must also specify -t and -l.
-C
Create .talk clips.
-t=<target>
Specify which target you want to build voicefile for. Must include
any features that target supports.
-i=<target_id>
Numeric target id. Needed for voice building.
-l=<language>
Specify which language you want to build. Without .lang extension.
-e=<encoder>
Which encoder to use for voice strings
-E=<encoder options>
Which encoder options to use when compressing voice strings. Enclose
in double quotes if the options include spaces.
-s=<TTS engine>
Which TTS engine to use.
-S=<TTS engine options>
Options to pass to the TTS engine. Enclose in double quotes if the
options include spaces.
-v
Be verbose
USAGE
;
}
# Maps a Rockbox language name to the language name handed to festival
# via its "-l" option.  Languages without an entry get no "-l" option.
# This must be a list in parentheses: assigning an anonymous hash ref
# ({ ... }) to a hash would leave a single stringified-reference key and
# every lookup would come back undefined.
my %festival_lang_map = (
    'english' => 'english',
    'english-us' => 'english',
    'espanol' => 'spanish',
    #'finnish' => 'finnish',
    #'italiano' => 'italian',
    #'czech' => 'czech',
    #'welsh' => 'welsh',
);
# Maps a Rockbox language name to the language code handed to gtts-cli
# via its "-l" option.  Languages without an entry get no "-l" option.
# This must be a list in parentheses: assigning an anonymous hash ref
# ({ ... }) to a hash would leave a single stringified-reference key and
# every lookup would come back undefined.
my %gtts_lang_map = (
    'english' => 'en-gb',  # Always first, it's the golden master
    'deutsch' => 'de',
    'english-us' => 'en-us',
    'francais' => 'fr-fr',
    'greek' => 'el',       # ISO 639-1 code for Greek ('gr' is the country code)
    'italiano' => 'it',
    'norsk' => 'no',
    'polski' => 'pl',
    'russian' => 'ru',
    'slovak' => 'sk',
    'srpski' => 'sr',
);
# Initialize TTS engine. May return an object or value which will be passed
# to voicestring and shutdown_tts
sub init_tts {
our $verbose;
my ($tts_engine, $tts_engine_opts, $language) = @_;
my %ret = ("name" => $tts_engine);
$ret{"format"} = 'wav';
$ret{"ttsoptions"} = "";
# Don't use given/when here - it's not compatible with old perl versions
if ($tts_engine eq 'festival') {
print("> festival $tts_engine_opts --server\n") if $verbose;
@ -81,8 +108,10 @@ sub init_tts {
$SIG{INT} = sub { kill TERM => $pid; print("foo"); panic_cleanup(); };
$SIG{KILL} = sub { kill TERM => $pid; print("boo"); panic_cleanup(); };
$ret{"pid"} = $pid;
}
elsif ($tts_engine eq 'sapi') {
if (defined($festival_lang_map{$language})) {
$ret{"ttsoptions"} = "-l $festival_lang_map{$language} ";
}
} elsif ($tts_engine eq 'sapi') {
my $toolsdir = dirname($0);
my $path = `cygpath $toolsdir -a -w`;
chomp($path);
@ -102,6 +131,11 @@ sub init_tts {
"stdin" => *CMD_IN,
"stdout" => *CMD_OUT,
"vendor" => $vendor);
} elsif ($tts_engine eq 'gtts') {
$ret{"format"} = 'mp3';
if (defined($gtts_lang_map{$language})) {
$ret{"ttsoptions"} = "-l $gtts_lang_map{$language} ";
}
}
return \%ret;
}
@ -143,6 +177,9 @@ sub voicestring {
my ($string, $output, $tts_engine_opts, $tts_object) = @_;
my $cmd;
my $name = $$tts_object{'name'};
$tts_engine_opts .= $$tts_object{"ttsoptions"};
printf("Generate \"%s\" with %s in file %s\n", $string, $name, $output) if $verbose;
if ($name eq 'festival') {
# festival_client lies to us, so we have to do awful soul-eating
@ -167,7 +204,7 @@ sub voicestring {
elsif ($name eq 'flite') {
$cmd = "flite $tts_engine_opts -t \"$string\" \"$output\"";
print("> $cmd\n") if $verbose;
`$cmd`;
system($cmd);
}
elsif ($name eq 'espeak') {
$cmd = "espeak $tts_engine_opts -w \"$output\"";
@ -193,11 +230,14 @@ sub voicestring {
close(RBSPEAK);
}
elsif ($name eq 'mimic') {
$cmd = "mimic $tts_engine_opts -o $output";
print("> $cmd\n") if $verbose;
open (MIMIC, "| $cmd");
print MIMIC $string . "\n";
close(MIMIC);
$cmd = "mimic $tts_engine_opts -o $output -t \"$string\" ";
print("> $cmd\n") if $verbose;
system($cmd);
}
elsif ($name eq 'gtts') {
$cmd = "gtts-cli $tts_engine_opts -o $output \"$string\"";
print("> $cmd\n") if $verbose;
system($cmd);
}
}
@ -326,17 +366,22 @@ sub generateclips {
if ($id eq "VOICE_PAUSE") {
print("Use distributed $wav\n") if $verbose;
copy(dirname($0)."/VOICE_PAUSE.wav", $wav);
} else {
voicestring($voice, $wav, $tts_engine_opts, $tts_object);
if ($tts_object->{'format'} eq "wav") {
wavtrim($wav, 500, $tts_object);
# 500 seems to be a reasonable default for now
}
}
else {
voicestring($voice, $wav, $tts_engine_opts, $tts_object);
wavtrim($wav, 500, $tts_object);
# 500 seems to be a reasonable default for now
}
if ($tts_object->{'format'} eq "wav" || $id eq "VOICE_PAUSE") {
encodewav($wav, $mp3, $encoder, $encoder_opts, $tts_object);
} else {
copy($wav, $mp3);
}
encodewav($wav, $mp3, $encoder, $encoder_opts, $tts_object);
synchronize($tts_object);
if (defined($ENV{'POOL'})) {
copy($mp3, $pool_file);
copy($mp3, $pool_file);
}
unlink($wav);
}