From 4bc336b8be4d70123b3af1e2e29f649a219e3045 Mon Sep 17 00:00:00 2001 From: Solomon Peachy Date: Tue, 21 Oct 2025 08:57:14 -0400 Subject: [PATCH 1/2] genlang: Explicitly decompose all display strings We already have pretty solid support for glyph combining, so this will allow us to utilize that to fill in gaps of our font coverage. This is most notable for Vietnamese, Turkish, and numerous other Latin-based writing systems that have unique glyphs that are essentially just a "standard" ASCII letter plus a diacritic mark. This leaves *voice* strings fully composed/normalized. It also has no effect on user-supplied strings (e.g. filenames or file metadata). When we eventually get utf8proc merged, this can be removed in favor of always doing the [de]composition in-system. We will also need to revisit our diacritic tables to ensure there's nothing missing. Change-Id: I7012d27010bb33fb0b565ac7dfd57a16bdcad34f --- tools/genlang | 13 ++++++++++--- 1 file changed, 10 insertions(+), 3 deletions(-) diff --git a/tools/genlang b/tools/genlang index 95762a9217..0c3502f559 100755 --- a/tools/genlang +++ b/tools/genlang @@ -9,6 +9,9 @@ # # Copyright (C) 2006 - 2008 by Daniel Stenberg # +use utf8; +use Unicode::Normalize; +use Encode qw( encode_utf8 ); # See apps/language.c (TODO: Use common include for both) # Cookie and binary version for the binary lang file @@ -322,6 +325,8 @@ for (keys %users) { # open(LANG, "<$input") || die "Error: couldn't read language file named $input\n"; +binmode(LANG, ":utf8"); + my @phrase; my $langoptions = 0; @@ -610,7 +615,7 @@ elsif($binary || $binvoice) { $langoptions); # magic lang file header } if($binvoice) { - open(OUTV, ">$binvoice") or die "Error: Can't create $binary"; + open(OUTV, ">$binvoice") or die "Error: Can't create $binvoice"; binmode OUTV; printf OUTV ("%c%c%c%c", $VOICE_COOKIE, $LANGUAGE_VERSION, $target_id, $langoptions); # magic lang file header @@ -646,17 +651,19 @@ elsif($binary || $binvoice) { if($dest && $n < 0x8000 && $binary) { 
$dest =~ s/^\"(.*)\"\s*$/$1/g; # cut off quotes + $dest = encode_utf8(NFD($dest)); # Decompose # Now, make sure we get the number from the english sort order: $idnum = $idmap[$_]{$name}; - printf OUTF ("%c%c%s\x00", ($idnum>>8), ($idnum&0xff), $dest); + printf OUTF ("%c%c%s\x00", (($idnum>>8)&0xff), ($idnum&0xff), $dest); } if($voice && $binvoice) { $voice =~ s/^\"(.*)\"\s*$/$1/g; # cut off quotes + $voice = encode_utf8($voice); # Now, make sure we get the number from the english sort order: $idnum = $idmap[$_]{$name}; - printf OUTV ("%c%c%s\x00", ($idnum>>8), ($idnum&0xff), $voice); + printf OUTV ("%c%c%s\x00", (($idnum>>8)&0xff), ($idnum&0xff), $voice); } } } From af7f45bac0a5ea06247de2c7d4389edd07502cdb Mon Sep 17 00:00:00 2001 From: Solomon Peachy Date: Wed, 22 Oct 2025 09:28:35 -0400 Subject: [PATCH 2/2] Revert "genlang: Explicitly decompose all display strings" This reverts commit 4bc336b8be4d70123b3af1e2e29f649a219e3045. Reason for revert: Font coverage for decomposed diacritics is actually *worse* than using fully composed characters. 
Change-Id: Ide37dadd5e9883c1c764ffa35f7e64a7ba91d705 --- tools/genlang | 13 +++---------- 1 file changed, 3 insertions(+), 10 deletions(-) diff --git a/tools/genlang b/tools/genlang index 0c3502f559..95762a9217 100755 --- a/tools/genlang +++ b/tools/genlang @@ -9,9 +9,6 @@ # # Copyright (C) 2006 - 2008 by Daniel Stenberg # -use utf8; -use Unicode::Normalize; -use Encode qw( encode_utf8 ); # See apps/language.c (TODO: Use common include for both) # Cookie and binary version for the binary lang file @@ -325,8 +322,6 @@ for (keys %users) { # open(LANG, "<$input") || die "Error: couldn't read language file named $input\n"; -binmode(LANG, ":utf8"); - my @phrase; my $langoptions = 0; @@ -615,7 +610,7 @@ elsif($binary || $binvoice) { $langoptions); # magic lang file header } if($binvoice) { - open(OUTV, ">$binvoice") or die "Error: Can't create $binvoice"; + open(OUTV, ">$binvoice") or die "Error: Can't create $binary"; binmode OUTV; printf OUTV ("%c%c%c%c", $VOICE_COOKIE, $LANGUAGE_VERSION, $target_id, $langoptions); # magic lang file header @@ -651,19 +646,17 @@ elsif($binary || $binvoice) { if($dest && $n < 0x8000 && $binary) { $dest =~ s/^\"(.*)\"\s*$/$1/g; # cut off quotes - $dest = encode_utf8(NFD($dest)); # Decompose # Now, make sure we get the number from the english sort order: $idnum = $idmap[$_]{$name}; - printf OUTF ("%c%c%s\x00", (($idnum>>8)&0xff), ($idnum&0xff), $dest); + printf OUTF ("%c%c%s\x00", ($idnum>>8), ($idnum&0xff), $dest); } if($voice && $binvoice) { $voice =~ s/^\"(.*)\"\s*$/$1/g; # cut off quotes - $voice = encode_utf8($voice); # Now, make sure we get the number from the english sort order: $idnum = $idmap[$_]{$name}; - printf OUTV ("%c%c%s\x00", (($idnum>>8)&0xff), ($idnum&0xff), $voice); + printf OUTV ("%c%c%s\x00", ($idnum>>8), ($idnum&0xff), $voice); } } }