forked from len0rd/rockbox
voice: Add support for the Piper TTS engine
https://github.com/rhasspy/piper — High quality, offline, neural-network-based, with good language coverage. Note that you have to manually download the piper voice models and set PIPER_MODEL_DIR appropriately. The configure script will let you choose from the available models and remember your choices. Change-Id: I8eba9fcf78b51b01b89491539aac3e423cc42f16
This commit is contained in:
parent
418a5acea0
commit
e8a51569ad
2 changed files with 147 additions and 51 deletions
52
tools/configure
vendored
52
tools/configure
vendored
|
|
@ -1159,6 +1159,13 @@ voiceconfig () {
|
||||||
DEFAULT_TTS_OPTS=$GTTS_OPTS
|
DEFAULT_TTS_OPTS=$GTTS_OPTS
|
||||||
DEFAULT_CHOICE="g"
|
DEFAULT_CHOICE="g"
|
||||||
fi
|
fi
|
||||||
|
if [ -n "`findtool piper`" ]; then
|
||||||
|
PIPER="(p)iper "
|
||||||
|
PIPER_OPTS=""
|
||||||
|
DEFAULT_TTS="piper"
|
||||||
|
DEFAULT_TTS_OPTS=$PIPER_OPTS
|
||||||
|
DEFAULT_CHOICE="p"
|
||||||
|
fi
|
||||||
if [ -n "`findtool rbspeak`" ]; then
|
if [ -n "`findtool rbspeak`" ]; then
|
||||||
RBSPEAK="(O)ther "
|
RBSPEAK="(O)ther "
|
||||||
RBSPEAK_OPTS=""
|
RBSPEAK_OPTS=""
|
||||||
|
|
@ -1167,15 +1174,15 @@ voiceconfig () {
|
||||||
DEFAULT_CHOICE="O"
|
DEFAULT_CHOICE="O"
|
||||||
fi
|
fi
|
||||||
|
|
||||||
if [ "$FESTIVAL" = "$FLITE" ] && [ "$FLITE" = "$ESPEAK" ] && [ "$ESPEAK" = "$SAPI" ] && [ "$SAPI" = "$MIMIC" ] && [ "$MIMIC" = "$SWIFT" ] && [ "$SWIFT" = "$GTTS" ] && [ "$GTTS" = "$RBSPEAK" ] ; then
|
if [ "$FESTIVAL" = "$FLITE" ] && [ "$FLITE" = "$ESPEAK" ] && [ "$ESPEAK" = "$SAPI" ] && [ "$SAPI" = "$MIMIC" ] && [ "$MIMIC" = "$SWIFT" ] && [ "$SWIFT" = "$GTTS" ] && [ "$GTTS" = "$PIPER" ] && [ "$PIPER" = "$RBSPEAK" ] ; then
|
||||||
echo "You need Festival, eSpeak, Mimic, Flite, gtts, or rbspeak in your path, or SAPI available to build voice files"
|
echo "You need Festival, eSpeak, Mimic, Flite, piper, gtts, or rbspeak in your path, or SAPI available to build voice files"
|
||||||
exit 3
|
exit 3
|
||||||
fi
|
fi
|
||||||
|
|
||||||
if [ "$ARG_TTS" ]; then
|
if [ "$ARG_TTS" ]; then
|
||||||
option=$ARG_TTS
|
option=$ARG_TTS
|
||||||
else
|
else
|
||||||
echo "TTS engine to use: ${FLITE}${FESTIVAL}${ESPEAK}${MIMIC}${SAPI}${SWIFT}${GTTS}${RBSPEAK}(${DEFAULT_CHOICE})?"
|
echo "TTS engine to use: ${FLITE}${FESTIVAL}${ESPEAK}${MIMIC}${SAPI}${SWIFT}${GTTS}${RBSPEAK}${PIPER}(${DEFAULT_CHOICE})?"
|
||||||
option=`input`
|
option=`input`
|
||||||
if [ -z "$option" ]; then option=${DEFAULT_CHOICE}; fi
|
if [ -z "$option" ]; then option=${DEFAULT_CHOICE}; fi
|
||||||
advopts="$advopts --tts=$option"
|
advopts="$advopts --tts=$option"
|
||||||
|
|
@ -1209,6 +1216,10 @@ voiceconfig () {
|
||||||
TTS_ENGINE="gtts"
|
TTS_ENGINE="gtts"
|
||||||
TTS_OPTS=$GTTS_OPTS
|
TTS_OPTS=$GTTS_OPTS
|
||||||
;;
|
;;
|
||||||
|
[Pp]|piper)
|
||||||
|
TTS_ENGINE="piper"
|
||||||
|
TTS_OPTS=$PIPER_OPTS
|
||||||
|
;;
|
||||||
[Oo]|rbspeak)
|
[Oo]|rbspeak)
|
||||||
TTS_ENGINE="rbspeak"
|
TTS_ENGINE="rbspeak"
|
||||||
TTS_OPTS=$RBSPEAK_OPTS
|
TTS_OPTS=$RBSPEAK_OPTS
|
||||||
|
|
@ -1247,6 +1258,39 @@ voiceconfig () {
|
||||||
advopts="$advopts --voice=$CHOICE"
|
advopts="$advopts --voice=$CHOICE"
|
||||||
echo "Festival voice set to $TTS_FESTIVAL_VOICE"
|
echo "Festival voice set to $TTS_FESTIVAL_VOICE"
|
||||||
echo "(voice_$TTS_FESTIVAL_VOICE)" > festival-prolog.scm
|
echo "(voice_$TTS_FESTIVAL_VOICE)" > festival-prolog.scm
|
||||||
|
elif [ "$TTS_ENGINE" = "piper" ]; then
|
||||||
|
if [ -z "$PIPER_MODEL_DIR" ]; then
|
||||||
|
echo "Please set PIPER_MODEL_DIR!";
|
||||||
|
exit 1
|
||||||
|
fi
|
||||||
|
models=`(cd $PIPER_MODEL_DIR ; ls -1 *onnx)`
|
||||||
|
for model in $models; do
|
||||||
|
PIPER_MODEL="$model" # Default
|
||||||
|
break;
|
||||||
|
done
|
||||||
|
if [ "$ARG_VOICE" ]; then
|
||||||
|
CHOICE=$ARG_VOICE
|
||||||
|
else
|
||||||
|
i=1
|
||||||
|
for model in $models; do
|
||||||
|
printf "%3d. %s\n" "$i" "$model"
|
||||||
|
i=`expr $i + 1`
|
||||||
|
done
|
||||||
|
printf "Please select which piper model to use (default is $PIPER_MODEL): "
|
||||||
|
CHOICE=`input`
|
||||||
|
fi
|
||||||
|
i=1
|
||||||
|
for model in $models; do
|
||||||
|
if [ "$i" = "$CHOICE" -o "$model" = "$CHOICE" ]; then
|
||||||
|
PIPER_MODEL="$model"
|
||||||
|
break;
|
||||||
|
fi
|
||||||
|
i=`expr $i + 1`
|
||||||
|
done
|
||||||
|
|
||||||
|
TTS_OPTS="$TTS_OPTS --model $PIPER_MODEL_DIR/$PIPER_MODEL"
|
||||||
|
advopts="$advopts --voice=$PIPER_MODEL"
|
||||||
|
echo "Piper model set to $PIPER_MODEL"
|
||||||
elif [ "$TTS_ENGINE" = "mimic" ]; then
|
elif [ "$TTS_ENGINE" = "mimic" ]; then
|
||||||
voicelist=`mimic -lv | cut -d':' -f2`
|
voicelist=`mimic -lv | cut -d':' -f2`
|
||||||
for voice in $voicelist; do
|
for voice in $voicelist; do
|
||||||
|
|
@ -1268,6 +1312,7 @@ voiceconfig () {
|
||||||
for voice in $voicelist; do
|
for voice in $voicelist; do
|
||||||
if [ "$i" = "$CHOICE" -o "$voice" = "$CHOICE" ]; then
|
if [ "$i" = "$CHOICE" -o "$voice" = "$CHOICE" ]; then
|
||||||
TTS_MIMIC_VOICE="$voice"
|
TTS_MIMIC_VOICE="$voice"
|
||||||
|
break
|
||||||
fi
|
fi
|
||||||
i=`expr $i + 1`
|
i=`expr $i + 1`
|
||||||
done
|
done
|
||||||
|
|
@ -4756,6 +4801,7 @@ export ANDROID_NDK_PATH=${ANDROID_NDK_PATH}
|
||||||
export ANDROID_SDK_PATH=${ANDROID_SDK_PATH}
|
export ANDROID_SDK_PATH=${ANDROID_SDK_PATH}
|
||||||
export ANDROID_PLATFORM_VERSION=${ANDROID_PLATFORM_VERSION}
|
export ANDROID_PLATFORM_VERSION=${ANDROID_PLATFORM_VERSION}
|
||||||
export TOOLSET=${toolset}
|
export TOOLSET=${toolset}
|
||||||
|
export PIPER_MODEL_DIR=${PIPER_MODEL_DIR}
|
||||||
$CCACHE_ARG
|
$CCACHE_ARG
|
||||||
|
|
||||||
CONFIGURE_OPTIONS=${cmdline}
|
CONFIGURE_OPTIONS=${cmdline}
|
||||||
|
|
|
||||||
146
tools/voice.pl
146
tools/voice.pl
|
|
@ -42,7 +42,8 @@ Usage: voice.pl [options] [path to dir]
|
||||||
Specify which target you want to build voicefile for. Must include
|
Specify which target you want to build voicefile for. Must include
|
||||||
any features that target supports.
|
any features that target supports.
|
||||||
|
|
||||||
-f=<file> Use existing voiceids file
|
-f=<file>
|
||||||
|
Use existing voiceids file
|
||||||
|
|
||||||
-i=<target_id>
|
-i=<target_id>
|
||||||
Numeric target id. Needed for voice building.
|
Numeric target id. Needed for voice building.
|
||||||
|
|
@ -64,7 +65,8 @@ Usage: voice.pl [options] [path to dir]
|
||||||
Options to pass to the TTS engine. Enclose in double quotes if the
|
Options to pass to the TTS engine. Enclose in double quotes if the
|
||||||
options include spaces.
|
options include spaces.
|
||||||
|
|
||||||
-F Force the file to be regenerated even if present
|
-F
|
||||||
|
Force the file to be regenerated even if present
|
||||||
|
|
||||||
-v
|
-v
|
||||||
Be verbose
|
Be verbose
|
||||||
|
|
@ -73,57 +75,78 @@ USAGE
|
||||||
}
|
}
|
||||||
|
|
||||||
my %festival_lang_map = (
|
my %festival_lang_map = (
|
||||||
'english' => 'english',
|
'english' => 'english',
|
||||||
'english-us' => 'english',
|
'english-us' => 'english',
|
||||||
'espanol' => 'spanish',
|
'espanol' => 'spanish',
|
||||||
#'finnish' => 'finnish'
|
#'finnish' => 'finnish'
|
||||||
#'italiano' => 'italian',
|
#'italiano' => 'italian',
|
||||||
#'czech' => 'czech',
|
#'czech' => 'czech',
|
||||||
#'welsh' => 'welsh'
|
#'welsh' => 'welsh'
|
||||||
);
|
);
|
||||||
|
|
||||||
my %gtts_lang_map = (
|
my %gtts_lang_map = (
|
||||||
'english' => '-l en -t co.uk', # Always first, it's the golden master
|
'english' => '-l en -t co.uk', # Always first, it's the golden master
|
||||||
'czech' => '-l cs', # not supported
|
'czech' => '-l cs',
|
||||||
'dansk' => '-l da',
|
'dansk' => '-l da',
|
||||||
'deutsch' => '-l de',
|
'deutsch' => '-l de',
|
||||||
'english-us' => '-l en -t us',
|
'english-us' => '-l en -t us',
|
||||||
'espanol' => '-l es',
|
'espanol' => '-l es',
|
||||||
'francais' => '-l fr',
|
'francais' => '-l fr',
|
||||||
'greek' => '-l el',
|
'greek' => '-l el',
|
||||||
'magyar' => '-l hu',
|
'magyar' => '-l hu',
|
||||||
'italiano' => '-l it',
|
'italiano' => '-l it',
|
||||||
'nederlands' => '-l nl',
|
'nederlands' => '-l nl',
|
||||||
'norsk' => '-l no',
|
'norsk' => '-l no',
|
||||||
'polski' => '-l pl',
|
'polski' => '-l pl',
|
||||||
'russian' => '-l ru',
|
'russian' => '-l ru',
|
||||||
'slovak' => '-l sk',
|
'slovak' => '-l sk',
|
||||||
'srpski' => '-l sr',
|
'srpski' => '-l sr',
|
||||||
'svenska' => '-l sv',
|
'svenska' => '-l sv',
|
||||||
'turkce' => '-l tr',
|
'turkce' => '-l tr',
|
||||||
);
|
);
|
||||||
|
|
||||||
my %espeak_lang_map = (
|
my %espeak_lang_map = (
|
||||||
'english' => 'en-gb', # Always first, it's the golden master
|
'english' => '-ven-gb -k 5', # Always first, it's the golden master
|
||||||
'czech' => 'cs',
|
'czech' => '-vcs',
|
||||||
'dansk' => 'da',
|
'dansk' => '-vda',
|
||||||
'deutsch' => 'de',
|
'deutsch' => '-vde',
|
||||||
'english-us' => 'en-us',
|
'english-us' => '-ven-us -k 5',
|
||||||
'espanol' => 'es',
|
'espanol' => '-ves',
|
||||||
'francais' => 'fr-fr',
|
'francais' => '-vfr-fr',
|
||||||
'greek' => 'el',
|
'greek' => '-vel',
|
||||||
'nederlands' => 'nl',
|
'magyar' => '-vhu',
|
||||||
'magyar' => 'hu',
|
'italiano' => '-vit',
|
||||||
'italiano' => 'it',
|
'japanese' => '-vja',
|
||||||
'japanese' => 'ja',
|
'nederlands' => '-vnl',
|
||||||
'nederlands' => 'nl',
|
'norsk' => '-vno',
|
||||||
'norsk' => 'no',
|
'polski' => '-vpl',
|
||||||
'polski' => 'pl',
|
'russian' => '-vru',
|
||||||
'russian' => 'ru',
|
'slovak' => '-vsk',
|
||||||
'slovak' => 'sk',
|
'srpski' => '-vsr',
|
||||||
'srpski' => 'sr',
|
'svenska' => '-vsv',
|
||||||
'svenska' => 'sv',
|
'turkce' => '-vtr',
|
||||||
'turkce' => 'tr',
|
);
|
||||||
|
|
||||||
|
my %piper_lang_map = (
|
||||||
|
'english' => 'en_GB-cori-high.onnx', # Always first, it's the golden master
|
||||||
|
'czech' => 'cs_CZ-jirka-medium.onnx',
|
||||||
|
'dansk' => 'da_DK-talesyntese-medium.onnx',
|
||||||
|
'deutsch' => 'de_DE-thorsten-high.onnx',
|
||||||
|
'english-us' => 'en_US-libritts-high.onnx',
|
||||||
|
'espanol' => 'es_ES-sharvard-medium.onnx',
|
||||||
|
'francais' => 'fr_FR-siwis-medium.onnx',
|
||||||
|
'greek' => 'el_GR-rapunzelina-low.onnx',
|
||||||
|
# 'magyar' => '-vhu',
|
||||||
|
'italiano' => 'it_IT-riccardo-x_low.onnx',
|
||||||
|
# 'japanese' => '-vja',
|
||||||
|
'nederlands' => 'nl_NL-mls-medium.onnx',
|
||||||
|
'norsk' => 'no_NO-talesyntese-medium.onnx',
|
||||||
|
'polski' => 'pl_PL-gosia-medium.onnx',
|
||||||
|
'russian' => 'ru_RU-irina-medium.onnx',
|
||||||
|
'slovak' => 'sk_SK-lili-medium.onnx',
|
||||||
|
'srpski' => 'sr_RS-serbski_institut-medium.onnx',
|
||||||
|
'svenska' => 'sv_SE-nst-medium.onnx',
|
||||||
|
'turkce' => 'tr_TR-fettah-medium.onnx',
|
||||||
);
|
);
|
||||||
|
|
||||||
my $trim_thresh = 500; # Trim silence if over this, in ms
|
my $trim_thresh = 500; # Trim silence if over this, in ms
|
||||||
|
|
@ -141,6 +164,7 @@ sub init_tts {
|
||||||
# Don't use given/when here - it's not compatible with old perl versions
|
# Don't use given/when here - it's not compatible with old perl versions
|
||||||
if ($tts_engine eq 'festival') {
|
if ($tts_engine eq 'festival') {
|
||||||
print("> festival $tts_engine_opts --server\n") if $verbose;
|
print("> festival $tts_engine_opts --server\n") if $verbose;
|
||||||
|
# Open command, and filehandles for STDIN, STDOUT, STDERR
|
||||||
my $pid = open(FESTIVAL_SERVER, "| festival $tts_engine_opts --server > /dev/null 2>&1");
|
my $pid = open(FESTIVAL_SERVER, "| festival $tts_engine_opts --server > /dev/null 2>&1");
|
||||||
my $dummy = *FESTIVAL_SERVER; #suppress warning
|
my $dummy = *FESTIVAL_SERVER; #suppress warning
|
||||||
$SIG{INT} = sub { kill TERM => $pid; print("foo"); panic_cleanup(); };
|
$SIG{INT} = sub { kill TERM => $pid; print("foo"); panic_cleanup(); };
|
||||||
|
|
@ -149,6 +173,21 @@ sub init_tts {
|
||||||
if (defined($festival_lang_map{$language}) && $tts_engine_opts !~ /--language/) {
|
if (defined($festival_lang_map{$language}) && $tts_engine_opts !~ /--language/) {
|
||||||
$ret{"ttsoptions"} = "--language $festival_lang_map{$language} ";
|
$ret{"ttsoptions"} = "--language $festival_lang_map{$language} ";
|
||||||
}
|
}
|
||||||
|
} elsif ($tts_engine eq 'piper') {
|
||||||
|
my $cmd = "piper $tts_engine_opts --json-input";
|
||||||
|
print("> $cmd\n") if $verbose;
|
||||||
|
|
||||||
|
my $pid = open3(*CMD_IN, *CMD_OUT, *CMD_ERR, $cmd);
|
||||||
|
$SIG{INT} = sub { kill TERM => $pid; print("foo"); panic_cleanup(); };
|
||||||
|
$SIG{KILL} = sub { kill TERM => $pid; print("boo"); panic_cleanup(); };
|
||||||
|
$ret{"pid"} = $pid;
|
||||||
|
binmode(*CMD_IN, ':encoding(utf8)');
|
||||||
|
binmode(*CMD_OUT, ':encoding(utf8)');
|
||||||
|
binmode(*CMD_ERR, ':encoding(utf8)');
|
||||||
|
if (defined($piper_lang_map{$language}) && $tts_engine_opts !~ /--model/) {
|
||||||
|
die("Need PIPER_MODEL_DIR\n") if (!defined($ENV{'PIPER_MODEL_DIR'}));
|
||||||
|
$ret{"ttsoptions"} = "--model $ENV{PIPER_MODEL_DIR}/$piper_lang_map{$language} ";
|
||||||
|
}
|
||||||
} elsif ($tts_engine eq 'sapi') {
|
} elsif ($tts_engine eq 'sapi') {
|
||||||
my $toolsdir = dirname($0);
|
my $toolsdir = dirname($0);
|
||||||
my $path = `cygpath $toolsdir -a -w`;
|
my $path = `cygpath $toolsdir -a -w`;
|
||||||
|
|
@ -176,7 +215,7 @@ sub init_tts {
|
||||||
}
|
}
|
||||||
} elsif ($tts_engine eq 'espeak' || $tts_engine eq 'espeak-ng') {
|
} elsif ($tts_engine eq 'espeak' || $tts_engine eq 'espeak-ng') {
|
||||||
if (defined($espeak_lang_map{$language}) && $tts_engine_opts !~ /-v/) {
|
if (defined($espeak_lang_map{$language}) && $tts_engine_opts !~ /-v/) {
|
||||||
$ret{"ttsoptions"} = "-v$espeak_lang_map{$language} ";
|
$ret{"ttsoptions"} = " $espeak_lang_map{$language} ";
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
@ -190,6 +229,10 @@ sub shutdown_tts {
|
||||||
# Send SIGTERM to festival server
|
# Send SIGTERM to festival server
|
||||||
kill TERM => $$tts_object{"pid"};
|
kill TERM => $$tts_object{"pid"};
|
||||||
}
|
}
|
||||||
|
elsif ($$tts_object{'name'} eq 'piper') {
|
||||||
|
# Send SIGTERM to piper
|
||||||
|
kill TERM => $$tts_object{"pid"};
|
||||||
|
}
|
||||||
elsif ($$tts_object{'name'} eq 'sapi') {
|
elsif ($$tts_object{'name'} eq 'sapi') {
|
||||||
print({$$tts_object{"stdin"}} "QUIT\r\n");
|
print({$$tts_object{"stdin"}} "QUIT\r\n");
|
||||||
close($$tts_object{"stdin"});
|
close($$tts_object{"stdin"});
|
||||||
|
|
@ -244,6 +287,13 @@ sub voicestring {
|
||||||
close(CMD_OUT);
|
close(CMD_OUT);
|
||||||
close(CMD_ERR);
|
close(CMD_ERR);
|
||||||
}
|
}
|
||||||
|
elsif ($name eq 'piper') {
|
||||||
|
$cmd = "{ \"text\": \"$string\", \"output_file\": \"$output\" }";
|
||||||
|
print(">> $cmd\n") if $verbose;
|
||||||
|
print(CMD_IN "$cmd\n");
|
||||||
|
my $res = <CMD_OUT>;
|
||||||
|
$res = <CMD_ERR>;
|
||||||
|
}
|
||||||
elsif ($name eq 'flite') {
|
elsif ($name eq 'flite') {
|
||||||
$cmd = "flite $tts_engine_opts -t \"$string\" \"$output\"";
|
$cmd = "flite $tts_engine_opts -t \"$string\" \"$output\"";
|
||||||
print("> $cmd\n") if $verbose;
|
print("> $cmd\n") if $verbose;
|
||||||
|
|
@ -469,7 +519,6 @@ sub generateclips {
|
||||||
print("\n");
|
print("\n");
|
||||||
|
|
||||||
unlink($updfile) if (-f $updfile);
|
unlink($updfile) if (-f $updfile);
|
||||||
shutdown_tts($tts_object);
|
|
||||||
}
|
}
|
||||||
|
|
||||||
# Assemble the voicefile
|
# Assemble the voicefile
|
||||||
|
|
@ -608,6 +657,7 @@ if ($V == 1) {
|
||||||
defined($t) ? $t : "unknown",
|
defined($t) ? $t : "unknown",
|
||||||
$l, $e, $E, $s, $S);
|
$l, $e, $E, $s, $S);
|
||||||
generateclips($l, $t, $e, $E, $tts_object, $S, $f);
|
generateclips($l, $t, $e, $E, $tts_object, $S, $f);
|
||||||
|
shutdown_tts($tts_object);
|
||||||
createvoice($l, $i, $f);
|
createvoice($l, $i, $f);
|
||||||
deleteencs();
|
deleteencs();
|
||||||
} elsif ($C) {
|
} elsif ($C) {
|
||||||
|
|
|
||||||
Loading…
Add table
Add a link
Reference in a new issue