internals: Support characters beyond the first unicode plane (WIP)

We used 16-bit variables to store the 'character code' everywhere but
this won't let us represent anything beyond U+FFFF.

This patch changes those variables to a custom type that can be 32 or 16
bits depending on the build, and adjusts numerous internal APIs and
data structures to match.  This includes:

 * utf8decode() and friends
 * on-screen keyboard
 * font manipulation, caching, rendering, and generation
 * VFAT code parses and generates utf16 dirents
 * WIN32 simulator reads and writes utf16 filenames

Note that this patch doesn't _enable_ >16bit unicode support; a followup
patch will turn that on for appropriate targets.

Known bugs:

  * Native players in 32-bit unicode mode generate mangled filename
    entries if they include UTF16 surrogate codepoints.  Root cause
    is unclear, and may reside in core dircache code.

Needs testing on:

 * windows simulator (16bit+32bit)

Change-Id: I193a00fe2a11a4181ddc82df2d71be52bf00b6e6
This commit is contained in:
Solomon Peachy 2024-12-17 08:55:21 -05:00
parent 94712b34d4
commit d05c59f35b
44 changed files with 480 additions and 335 deletions

View file

@ -127,7 +127,7 @@ static int volatile cp_table_ref = 0;
/* non-default codepage table buffer (cannot be bufalloced! playback itself
may be making the load request) */
static unsigned short codepage_table[MAX_CP_TABLE_SIZE+1];
static unsigned short codepage_table[MAX_CP_TABLE_SIZE+1]; // XXX convert to ucschar_t if we ever need > 16bit mappings?
#if defined(APPLICATION) && defined(__linux__)
static const char * const name_codepages_linux[NUM_CODEPAGES+1] =
@ -344,7 +344,7 @@ unsigned char* iso_decode_ex(const unsigned char *iso, unsigned char *utf8, int
cp_lock_leave();
while (count-- && utf8_size > 0) {
unsigned short ucs, tmp;
ucschar_t ucs, tmp;
if (*iso < 128 || cp == UTF_8) /* Already UTF-8 */
{
@ -420,10 +420,6 @@ unsigned char* iso_decode_ex(const unsigned char *iso, unsigned char *utf8, int
unsigned char* utf16decode(const unsigned char *utf16, unsigned char *utf8,
int count, int utf8_size, bool le)
{
// little-endian flag is used as significant byte index
if (le)
le = 1;
unsigned long ucs;
while (count > 0 && utf8_size > 0) {
@ -511,8 +507,25 @@ unsigned long utf8length(const unsigned char *utf8)
return l;
}
/* Take a utf8 string and return the encoded length in UTF-16 code units.
 *
 * Basic Multilingual Plane codepoints (U+0000..U+FFFF) take one UTF-16
 * code unit; supplementary-plane codepoints (U+10000..U+10FFFF) take two
 * (a surrogate pair), so they are counted twice when UNICODE32 support
 * is compiled in.  Without UNICODE32, utf8decode() never yields a
 * codepoint above U+FFFF, so every decoded char counts once. */
unsigned long utf16len_utf8(const unsigned char *utf8)
{
    ucschar_t cp;
    unsigned long length = 0;
    while (*utf8) {
        utf8 = utf8decode(utf8, &cp);
#ifdef UNICODE32
        /* Surrogate pair needed for U+10000 and above: count the extra
         * code unit.  Note '>=' — the previous 'cp > 0x10000' was an
         * off-by-one that missed U+10000 itself. */
        if (cp >= 0x10000)
            length++;
#endif
        length++;
    }
    return length;
}
/* Decode 1 UTF-8 char and return a pointer to the next char. */
const unsigned char* utf8decode(const unsigned char *utf8, unsigned short *ucs)
const unsigned char* utf8decode(const unsigned char *utf8, ucschar_t *ucs)
{
unsigned char c = *utf8++;
unsigned long code;
@ -552,8 +565,13 @@ const unsigned char* utf8decode(const unsigned char *utf8, unsigned short *ucs)
/* Invalid UTF-8 char */
code = 0xfffd;
}
/* currently we don't support chars above U-FFFF */
*ucs = (code < 0x10000) ? code : 0xfffd;
#ifndef UNICODE32
if (code > 0xffff)
code = 0xfffd;
#endif
*ucs = code;
return utf8;
}