1
0
Fork 0
forked from len0rd/rockbox

voice: Add support for the Piper TTS engine

https://github.com/rhasspy/piper

Piper is a high-quality, offline, neural-network-based TTS engine with good language coverage.

Note that you have to download the piper voice models (the .onnx files)
manually and set PIPER_MODEL_DIR to the directory that contains them.  The
configure script will then let you choose from the models it finds there and
will remember your choices.

Change-Id: I8eba9fcf78b51b01b89491539aac3e423cc42f16
This commit is contained in:
Solomon Peachy 2024-04-19 21:53:43 -04:00
parent 418a5acea0
commit e8a51569ad
2 changed files with 147 additions and 51 deletions

52
tools/configure vendored
View file

@ -1159,6 +1159,13 @@ voiceconfig () {
DEFAULT_TTS_OPTS=$GTTS_OPTS DEFAULT_TTS_OPTS=$GTTS_OPTS
DEFAULT_CHOICE="g" DEFAULT_CHOICE="g"
fi fi
if [ -n "`findtool piper`" ]; then
PIPER="(p)iper "
PIPER_OPTS=""
DEFAULT_TTS="piper"
DEFAULT_TTS_OPTS=$PIPER_OPTS
DEFAULT_CHOICE="p"
fi
if [ -n "`findtool rbspeak`" ]; then if [ -n "`findtool rbspeak`" ]; then
RBSPEAK="(O)ther " RBSPEAK="(O)ther "
RBSPEAK_OPTS="" RBSPEAK_OPTS=""
@ -1167,15 +1174,15 @@ voiceconfig () {
DEFAULT_CHOICE="O" DEFAULT_CHOICE="O"
fi fi
if [ "$FESTIVAL" = "$FLITE" ] && [ "$FLITE" = "$ESPEAK" ] && [ "$ESPEAK" = "$SAPI" ] && [ "$SAPI" = "$MIMIC" ] && [ "$MIMIC" = "$SWIFT" ] && [ "$SWIFT" = "$GTTS" ] && [ "$GTTS" = "$RBSPEAK" ] ; then if [ "$FESTIVAL" = "$FLITE" ] && [ "$FLITE" = "$ESPEAK" ] && [ "$ESPEAK" = "$SAPI" ] && [ "$SAPI" = "$MIMIC" ] && [ "$MIMIC" = "$SWIFT" ] && [ "$SWIFT" = "$GTTS" ] && [ "$GTTS" = "$PIPER" ] && [ "$PIPER" = "$RBSPEAK" ] ; then
echo "You need Festival, eSpeak, Mimic, Flite, gtts, or rbspeak in your path, or SAPI available to build voice files" echo "You need Festival, eSpeak, Mimic, Flite, piper, gtts, or rbspeak in your path, or SAPI available to build voice files"
exit 3 exit 3
fi fi
if [ "$ARG_TTS" ]; then if [ "$ARG_TTS" ]; then
option=$ARG_TTS option=$ARG_TTS
else else
echo "TTS engine to use: ${FLITE}${FESTIVAL}${ESPEAK}${MIMIC}${SAPI}${SWIFT}${GTTS}${RBSPEAK}(${DEFAULT_CHOICE})?" echo "TTS engine to use: ${FLITE}${FESTIVAL}${ESPEAK}${MIMIC}${SAPI}${SWIFT}${GTTS}${RBSPEAK}${PIPER}(${DEFAULT_CHOICE})?"
option=`input` option=`input`
if [ -z "$option" ]; then option=${DEFAULT_CHOICE}; fi if [ -z "$option" ]; then option=${DEFAULT_CHOICE}; fi
advopts="$advopts --tts=$option" advopts="$advopts --tts=$option"
@ -1209,6 +1216,10 @@ voiceconfig () {
TTS_ENGINE="gtts" TTS_ENGINE="gtts"
TTS_OPTS=$GTTS_OPTS TTS_OPTS=$GTTS_OPTS
;; ;;
[Pp]|piper)
TTS_ENGINE="piper"
TTS_OPTS=$PIPER_OPTS
;;
[Oo]|rbspeak) [Oo]|rbspeak)
TTS_ENGINE="rbspeak" TTS_ENGINE="rbspeak"
TTS_OPTS=$RBSPEAK_OPTS TTS_OPTS=$RBSPEAK_OPTS
@ -1247,6 +1258,39 @@ voiceconfig () {
advopts="$advopts --voice=$CHOICE" advopts="$advopts --voice=$CHOICE"
echo "Festival voice set to $TTS_FESTIVAL_VOICE" echo "Festival voice set to $TTS_FESTIVAL_VOICE"
echo "(voice_$TTS_FESTIVAL_VOICE)" > festival-prolog.scm echo "(voice_$TTS_FESTIVAL_VOICE)" > festival-prolog.scm
elif [ "$TTS_ENGINE" = "piper" ]; then
if [ -z "$PIPER_MODEL_DIR" ]; then
echo "Please set PIPER_MODEL_DIR!";
exit 1
fi
models=`(cd $PIPER_MODEL_DIR ; ls -1 *onnx)`
for model in $models; do
PIPER_MODEL="$model" # Default
break;
done
if [ "$ARG_VOICE" ]; then
CHOICE=$ARG_VOICE
else
i=1
for model in $models; do
printf "%3d. %s\n" "$i" "$model"
i=`expr $i + 1`
done
printf "Please select which piper model to use (default is $PIPER_MODEL): "
CHOICE=`input`
fi
i=1
for model in $models; do
if [ "$i" = "$CHOICE" -o "$model" = "$CHOICE" ]; then
PIPER_MODEL="$model"
break;
fi
i=`expr $i + 1`
done
TTS_OPTS="$TTS_OPTS --model $PIPER_MODEL_DIR/$PIPER_MODEL"
advopts="$advopts --voice=$PIPER_MODEL"
echo "Piper model set to $PIPER_MODEL"
elif [ "$TTS_ENGINE" = "mimic" ]; then elif [ "$TTS_ENGINE" = "mimic" ]; then
voicelist=`mimic -lv | cut -d':' -f2` voicelist=`mimic -lv | cut -d':' -f2`
for voice in $voicelist; do for voice in $voicelist; do
@ -1268,6 +1312,7 @@ voiceconfig () {
for voice in $voicelist; do for voice in $voicelist; do
if [ "$i" = "$CHOICE" -o "$voice" = "$CHOICE" ]; then if [ "$i" = "$CHOICE" -o "$voice" = "$CHOICE" ]; then
TTS_MIMIC_VOICE="$voice" TTS_MIMIC_VOICE="$voice"
break
fi fi
i=`expr $i + 1` i=`expr $i + 1`
done done
@ -4756,6 +4801,7 @@ export ANDROID_NDK_PATH=${ANDROID_NDK_PATH}
export ANDROID_SDK_PATH=${ANDROID_SDK_PATH} export ANDROID_SDK_PATH=${ANDROID_SDK_PATH}
export ANDROID_PLATFORM_VERSION=${ANDROID_PLATFORM_VERSION} export ANDROID_PLATFORM_VERSION=${ANDROID_PLATFORM_VERSION}
export TOOLSET=${toolset} export TOOLSET=${toolset}
export PIPER_MODEL_DIR=${PIPER_MODEL_DIR}
$CCACHE_ARG $CCACHE_ARG
CONFIGURE_OPTIONS=${cmdline} CONFIGURE_OPTIONS=${cmdline}

View file

@ -42,7 +42,8 @@ Usage: voice.pl [options] [path to dir]
Specify which target you want to build voicefile for. Must include Specify which target you want to build voicefile for. Must include
any features that target supports. any features that target supports.
-f=<file> Use existing voiceids file -f=<file>
Use existing voiceids file
-i=<target_id> -i=<target_id>
Numeric target id. Needed for voice building. Numeric target id. Needed for voice building.
@ -64,7 +65,8 @@ Usage: voice.pl [options] [path to dir]
Options to pass to the TTS engine. Enclose in double quotes if the Options to pass to the TTS engine. Enclose in double quotes if the
options include spaces. options include spaces.
-F Force the file to be regenerated even if present -F
Force the file to be regenerated even if present
-v -v
Be verbose Be verbose
@ -73,57 +75,78 @@ USAGE
} }
my %festival_lang_map = ( my %festival_lang_map = (
'english' => 'english', 'english' => 'english',
'english-us' => 'english', 'english-us' => 'english',
'espanol' => 'spanish', 'espanol' => 'spanish',
#'finnish' => 'finnish' #'finnish' => 'finnish'
#'italiano' => 'italian', #'italiano' => 'italian',
#'czech' => 'czech', #'czech' => 'czech',
#'welsh' => 'welsh' #'welsh' => 'welsh'
); );
my %gtts_lang_map = ( my %gtts_lang_map = (
'english' => '-l en -t co.uk', # Always first, it's the golden master 'english' => '-l en -t co.uk', # Always first, it's the golden master
'czech' => '-l cs', # not supported 'czech' => '-l cs',
'dansk' => '-l da', 'dansk' => '-l da',
'deutsch' => '-l de', 'deutsch' => '-l de',
'english-us' => '-l en -t us', 'english-us' => '-l en -t us',
'espanol' => '-l es', 'espanol' => '-l es',
'francais' => '-l fr', 'francais' => '-l fr',
'greek' => '-l el', 'greek' => '-l el',
'magyar' => '-l hu', 'magyar' => '-l hu',
'italiano' => '-l it', 'italiano' => '-l it',
'nederlands' => '-l nl', 'nederlands' => '-l nl',
'norsk' => '-l no', 'norsk' => '-l no',
'polski' => '-l pl', 'polski' => '-l pl',
'russian' => '-l ru', 'russian' => '-l ru',
'slovak' => '-l sk', 'slovak' => '-l sk',
'srpski' => '-l sr', 'srpski' => '-l sr',
'svenska' => '-l sv', 'svenska' => '-l sv',
'turkce' => '-l tr', 'turkce' => '-l tr',
); );
my %espeak_lang_map = ( my %espeak_lang_map = (
'english' => 'en-gb', # Always first, it's the golden master 'english' => '-ven-gb -k 5', # Always first, it's the golden master
'czech' => 'cs', 'czech' => '-vcs',
'dansk' => 'da', 'dansk' => '-vda',
'deutsch' => 'de', 'deutsch' => '-vde',
'english-us' => 'en-us', 'english-us' => '-ven-us -k 5',
'espanol' => 'es', 'espanol' => '-ves',
'francais' => 'fr-fr', 'francais' => '-vfr-fr',
'greek' => 'el', 'greek' => '-vel',
'nederlands' => 'nl', 'magyar' => '-vhu',
'magyar' => 'hu', 'italiano' => '-vit',
'italiano' => 'it', 'japanese' => '-vja',
'japanese' => 'ja', 'nederlands' => '-vnl',
'nederlands' => 'nl', 'norsk' => '-vno',
'norsk' => 'no', 'polski' => '-vpl',
'polski' => 'pl', 'russian' => '-vru',
'russian' => 'ru', 'slovak' => '-vsk',
'slovak' => 'sk', 'srpski' => '-vsr',
'srpski' => 'sr', 'svenska' => '-vsv',
'svenska' => 'sv', 'turkce' => '-vtr',
'turkce' => 'tr', );
# Default Piper voice model for each language, keyed by the same language
# names used by the other per-engine maps in this file.  Each value is the
# file name of an .onnx model, not a full path: init_tts() joins it onto
# $ENV{PIPER_MODEL_DIR} to build a "--model" option, and only does so when
# the supplied TTS options do not already contain "--model".
# The models are not fetched automatically; they have to be downloaded into
# PIPER_MODEL_DIR by hand.
my %piper_lang_map = (
'english' => 'en_GB-cori-high.onnx', # Always first, it's the golden master
'czech' => 'cs_CZ-jirka-medium.onnx',
'dansk' => 'da_DK-talesyntese-medium.onnx',
'deutsch' => 'de_DE-thorsten-high.onnx',
'english-us' => 'en_US-libritts-high.onnx',
'espanol' => 'es_ES-sharvard-medium.onnx',
'francais' => 'fr_FR-siwis-medium.onnx',
'greek' => 'el_GR-rapunzelina-low.onnx',
# 'magyar' => '-vhu',  # NOTE(review): disabled; value is an espeak-style voice flag, not a model file -- presumably a placeholder
'italiano' => 'it_IT-riccardo-x_low.onnx',
# 'japanese' => '-vja',  # NOTE(review): disabled; value is an espeak-style voice flag, not a model file -- presumably a placeholder
'nederlands' => 'nl_NL-mls-medium.onnx',
'norsk' => 'no_NO-talesyntese-medium.onnx',
'polski' => 'pl_PL-gosia-medium.onnx',
'russian' => 'ru_RU-irina-medium.onnx',
'slovak' => 'sk_SK-lili-medium.onnx',
'srpski' => 'sr_RS-serbski_institut-medium.onnx',
'svenska' => 'sv_SE-nst-medium.onnx',
'turkce' => 'tr_TR-fettah-medium.onnx',
); );
my $trim_thresh = 500; # Trim silence if over this, in ms my $trim_thresh = 500; # Trim silence if over this, in ms
@ -141,6 +164,7 @@ sub init_tts {
# Don't use given/when here - it's not compatible with old perl versions # Don't use given/when here - it's not compatible with old perl versions
if ($tts_engine eq 'festival') { if ($tts_engine eq 'festival') {
print("> festival $tts_engine_opts --server\n") if $verbose; print("> festival $tts_engine_opts --server\n") if $verbose;
# Open command, and filehandles for STDIN, STDOUT, STDERR
my $pid = open(FESTIVAL_SERVER, "| festival $tts_engine_opts --server > /dev/null 2>&1"); my $pid = open(FESTIVAL_SERVER, "| festival $tts_engine_opts --server > /dev/null 2>&1");
my $dummy = *FESTIVAL_SERVER; #suppress warning my $dummy = *FESTIVAL_SERVER; #suppress warning
$SIG{INT} = sub { kill TERM => $pid; print("foo"); panic_cleanup(); }; $SIG{INT} = sub { kill TERM => $pid; print("foo"); panic_cleanup(); };
@ -149,6 +173,21 @@ sub init_tts {
if (defined($festival_lang_map{$language}) && $tts_engine_opts !~ /--language/) { if (defined($festival_lang_map{$language}) && $tts_engine_opts !~ /--language/) {
$ret{"ttsoptions"} = "--language $festival_lang_map{$language} "; $ret{"ttsoptions"} = "--language $festival_lang_map{$language} ";
} }
} elsif ($tts_engine eq 'piper') {
my $cmd = "piper $tts_engine_opts --json-input";
print("> $cmd\n") if $verbose;
my $pid = open3(*CMD_IN, *CMD_OUT, *CMD_ERR, $cmd);
$SIG{INT} = sub { kill TERM => $pid; print("foo"); panic_cleanup(); };
$SIG{KILL} = sub { kill TERM => $pid; print("boo"); panic_cleanup(); };
$ret{"pid"} = $pid;
binmode(*CMD_IN, ':encoding(utf8)');
binmode(*CMD_OUT, ':encoding(utf8)');
binmode(*CMD_ERR, ':encoding(utf8)');
if (defined($piper_lang_map{$language}) && $tts_engine_opts !~ /--model/) {
die("Need PIPER_MODEL_DIR\n") if (!defined($ENV{'PIPER_MODEL_DIR'}));
$ret{"ttsoptions"} = "--model $ENV{PIPER_MODEL_DIR}/$piper_lang_map{$language} ";
}
} elsif ($tts_engine eq 'sapi') { } elsif ($tts_engine eq 'sapi') {
my $toolsdir = dirname($0); my $toolsdir = dirname($0);
my $path = `cygpath $toolsdir -a -w`; my $path = `cygpath $toolsdir -a -w`;
@ -176,7 +215,7 @@ sub init_tts {
} }
} elsif ($tts_engine eq 'espeak' || $tts_engine eq 'espeak-ng') { } elsif ($tts_engine eq 'espeak' || $tts_engine eq 'espeak-ng') {
if (defined($espeak_lang_map{$language}) && $tts_engine_opts !~ /-v/) { if (defined($espeak_lang_map{$language}) && $tts_engine_opts !~ /-v/) {
$ret{"ttsoptions"} = "-v$espeak_lang_map{$language} "; $ret{"ttsoptions"} = " $espeak_lang_map{$language} ";
} }
} }
@ -190,6 +229,10 @@ sub shutdown_tts {
# Send SIGTERM to festival server # Send SIGTERM to festival server
kill TERM => $$tts_object{"pid"}; kill TERM => $$tts_object{"pid"};
} }
elsif ($$tts_object{'name'} eq 'piper') {
# Send SIGTERM to piper
kill TERM => $$tts_object{"pid"};
}
elsif ($$tts_object{'name'} eq 'sapi') { elsif ($$tts_object{'name'} eq 'sapi') {
print({$$tts_object{"stdin"}} "QUIT\r\n"); print({$$tts_object{"stdin"}} "QUIT\r\n");
close($$tts_object{"stdin"}); close($$tts_object{"stdin"});
@ -244,6 +287,13 @@ sub voicestring {
close(CMD_OUT); close(CMD_OUT);
close(CMD_ERR); close(CMD_ERR);
} }
elsif ($name eq 'piper') {
$cmd = "{ \"text\": \"$string\", \"output_file\": \"$output\" }";
print(">> $cmd\n") if $verbose;
print(CMD_IN "$cmd\n");
my $res = <CMD_OUT>;
$res = <CMD_ERR>;
}
elsif ($name eq 'flite') { elsif ($name eq 'flite') {
$cmd = "flite $tts_engine_opts -t \"$string\" \"$output\""; $cmd = "flite $tts_engine_opts -t \"$string\" \"$output\"";
print("> $cmd\n") if $verbose; print("> $cmd\n") if $verbose;
@ -469,7 +519,6 @@ sub generateclips {
print("\n"); print("\n");
unlink($updfile) if (-f $updfile); unlink($updfile) if (-f $updfile);
shutdown_tts($tts_object);
} }
# Assemble the voicefile # Assemble the voicefile
@ -608,6 +657,7 @@ if ($V == 1) {
defined($t) ? $t : "unknown", defined($t) ? $t : "unknown",
$l, $e, $E, $s, $S); $l, $e, $E, $s, $S);
generateclips($l, $t, $e, $E, $tts_object, $S, $f); generateclips($l, $t, $e, $E, $tts_object, $S, $f);
shutdown_tts($tts_object);
createvoice($l, $i, $f); createvoice($l, $i, $f);
deleteencs(); deleteencs();
} elsif ($C) { } elsif ($C) {