genlang: Explicitly decompose all display strings

We already have pretty solid support for glyph combining, so this will
allow us to utilize that to fill in gaps in our font coverage.  This is
most notable for Vietnamese, Turkish, and numerous other Latin-based
writing systems that have unique glyphs that are essentially just a
"standard" ASCII letter plus a diacritic mark.

This leaves *voice* strings fully composed/normalized.

It also has no effect on user-supplied strings (e.g. filenames or file
metadata).

Once utf8proc is eventually merged, this can be removed in favor
of always doing the [de]composition in-system.  We will also need
to revisit our diacritic tables to ensure there's nothing missing.

Change-Id: I7012d27010bb33fb0b565ac7dfd57a16bdcad34f
This commit is contained in:
Solomon Peachy 2025-10-21 08:57:14 -04:00
parent cd54b4e046
commit 4bc336b8be

View file

@ -9,6 +9,9 @@
# #
# Copyright (C) 2006 - 2008 by Daniel Stenberg # Copyright (C) 2006 - 2008 by Daniel Stenberg
# #
use utf8;
use Unicode::Normalize;
use Encode qw( encode_utf8 );
# See apps/language.c (TODO: Use common include for both) # See apps/language.c (TODO: Use common include for both)
# Cookie and binary version for the binary lang file # Cookie and binary version for the binary lang file
@ -322,6 +325,8 @@ for (keys %users) {
# #
open(LANG, "<$input") || die "Error: couldn't read language file named $input\n"; open(LANG, "<$input") || die "Error: couldn't read language file named $input\n";
binmode(LANG, ":utf8");
my @phrase; my @phrase;
my $langoptions = 0; my $langoptions = 0;
@ -610,7 +615,7 @@ elsif($binary || $binvoice) {
$langoptions); # magic lang file header $langoptions); # magic lang file header
} }
if($binvoice) { if($binvoice) {
open(OUTV, ">$binvoice") or die "Error: Can't create $binary"; open(OUTV, ">$binvoice") or die "Error: Can't create $binvoice";
binmode OUTV; binmode OUTV;
printf OUTV ("%c%c%c%c", $VOICE_COOKIE, $LANGUAGE_VERSION, $target_id, printf OUTV ("%c%c%c%c", $VOICE_COOKIE, $LANGUAGE_VERSION, $target_id,
$langoptions); # magic lang file header $langoptions); # magic lang file header
@ -646,17 +651,19 @@ elsif($binary || $binvoice) {
if($dest && $n < 0x8000 && $binary) { if($dest && $n < 0x8000 && $binary) {
$dest =~ s/^\"(.*)\"\s*$/$1/g; # cut off quotes $dest =~ s/^\"(.*)\"\s*$/$1/g; # cut off quotes
$dest = encode_utf8(NFD($dest)); # Decompose
# Now, make sure we get the number from the english sort order: # Now, make sure we get the number from the english sort order:
$idnum = $idmap[$_]{$name}; $idnum = $idmap[$_]{$name};
printf OUTF ("%c%c%s\x00", ($idnum>>8), ($idnum&0xff), $dest); printf OUTF ("%c%c%s\x00", (($idnum>>8)&0xff), ($idnum&0xff), $dest);
} }
if($voice && $binvoice) { if($voice && $binvoice) {
$voice =~ s/^\"(.*)\"\s*$/$1/g; # cut off quotes $voice =~ s/^\"(.*)\"\s*$/$1/g; # cut off quotes
$voice = encode_utf8($voice);
# Now, make sure we get the number from the english sort order: # Now, make sure we get the number from the english sort order:
$idnum = $idmap[$_]{$name}; $idnum = $idmap[$_]{$name};
printf OUTV ("%c%c%s\x00", ($idnum>>8), ($idnum&0xff), $voice); printf OUTV ("%c%c%s\x00", (($idnum>>8)&0xff), ($idnum&0xff), $voice);
} }
} }
} }