unicode: add utf16decode with utf8 buffer size check

Make use of it in id3tags and cuesheet.

Change-Id: I153c23f1f7312e9d5e1de9f03725f2d2ab0abc93
2025-12-06 13:15:25 -05:00 · 2024-12-14 12:54:00 +02:00 · 2024-12-14 12:54:00 +02:00 · 1f548f74e6
commit 1f548f74e6
parent e334a1f95e
4 changed files with 34 additions and 42 deletions
--- a/apps/cuesheet.c
+++ b/apps/cuesheet.c
@ -265,7 +265,7 @@ bool parse_cuesheet(struct cuesheet_file *cue_file, struct cuesheet *cue)
    {
        if (char_enc == CHAR_ENC_UTF_16_LE)
        {
-            s = utf16LEdecode(line, utf16_buf, line_len);
+            s = utf16decode(line, utf16_buf, line_len>>1, sizeof(utf16_buf) - 1, true);
            /* terminate the string at the newline */
            *s = '\0';
            strcpy(line, utf16_buf);
@ -275,7 +275,7 @@ bool parse_cuesheet(struct cuesheet_file *cue_file, struct cuesheet *cue)
        }
        else if (char_enc == CHAR_ENC_UTF_16_BE)
        {
-            s = utf16BEdecode(line, utf16_buf, line_len);
+            s = utf16decode(line, utf16_buf, line_len>>1, sizeof(utf16_buf) - 1, false);
            *s = '\0';
            strcpy(line, utf16_buf);
        }
--- a/firmware/common/unicode.c
+++ b/firmware/common/unicode.c
@ -265,7 +265,7 @@ static unsigned char * utf8encode_internal(unsigned long ucs, unsigned char *utf
    return utf8;
 }

-static unsigned char* utf8encode_ex(unsigned long ucs, unsigned char *utf8, int* utf8_size)
+FORCE_INLINE static unsigned char* utf8encode_ex(unsigned long ucs, unsigned char *utf8, int* utf8_size)
 {
    const int tail = utf8_ucs_get_extra_bytes_count(ucs);
    *utf8_size -= tail + 1;
@ -420,49 +420,46 @@ unsigned char* iso_decode_ex(const unsigned char *iso, unsigned char *utf8, int
    return utf8;
 }

-/* Recode a UTF-16 string with little-endian byte ordering to UTF-8 */
-unsigned char* utf16LEdecode(const unsigned char *utf16, unsigned char *utf8,
-        int count)
+unsigned char* utf16decode(const unsigned char *utf16, unsigned char *utf8,
+        int count, int utf8_size, bool le)
 {
+    if (utf8_size == -1)
+        utf8_size = INT_MAX;
+
+    // little-endian flag is used as significant byte index
+    if (le)
+        le = 1;
+
    unsigned long ucs;

-    while (count > 0) {
+    while (count > 0 && utf8_size > 0) {
        /* Check for a surrogate pair */
-        if (utf16[1] >= 0xD8 && utf16[1] < 0xE0) {
-            ucs = 0x10000 + ((utf16[0] << 10) | ((utf16[1] - 0xD8) << 18)
-                    | utf16[2] | ((utf16[3] - 0xDC) << 8));
+        if (*(utf16 + le) >= 0xD8 && *(utf16 + le) < 0xE0) {
+            ucs = 0x10000 + ((utf16[1 - le] << 10) | ((utf16[le] - 0xD8) << 18)
+                  | utf16[2 + (1 - le)] | ((utf16[2 + le] - 0xDC) << 8));
            utf16 += 4;
            count -= 2;
        } else {
-            ucs = getle16(utf16);
+            ucs = utf16[le] << 8 | utf16[1 - le];
            utf16 += 2;
            count -= 1;
        }
-        utf8 = utf8encode(ucs, utf8);
+        utf8 = utf8encode_ex(ucs, utf8, &utf8_size);
    }
    return utf8;
 }

 /* Recode a UTF-16 string with big-endian byte ordering to UTF-8 */
+unsigned char* utf16LEdecode(const unsigned char *utf16, unsigned char *utf8,
+        int count)
+{
+    return utf16decode(utf16, utf8, count, -1, true);
+}
+
 unsigned char* utf16BEdecode(const unsigned char *utf16, unsigned char *utf8,
        int count)
 {
-    unsigned long ucs;
-
-    while (count > 0) {
-        if (*utf16 >= 0xD8 && *utf16 < 0xE0) { /* Check for a surrogate pair */
-            ucs = 0x10000 + (((utf16[0] - 0xD8) << 18) | (utf16[1] << 10)
-                    | ((utf16[2] - 0xDC) << 8) | utf16[3]);
-            utf16 += 4;
-            count -= 2;
-        } else {
-            ucs = getbe16(utf16);
-            utf16 += 2;
-            count -= 1;
-        }
-        utf8 = utf8encode(ucs, utf8);
-    }
-    return utf8;
+    return utf16decode(utf16, utf8, count, -1, false);
 }

 bool utf16_has_bom(const unsigned char *utf16, bool *le)
--- a/firmware/include/rbunicode.h
+++ b/firmware/include/rbunicode.h
@ -61,6 +61,7 @@ unsigned char* iso_decode_ex(const unsigned char *iso, unsigned char *utf8, int

 unsigned char* utf16LEdecode(const unsigned char *utf16, unsigned char *utf8, int count);
 unsigned char* utf16BEdecode(const unsigned char *utf16, unsigned char *utf8, int count);
+unsigned char* utf16decode(const unsigned char *utf16, unsigned char *utf8, int count, int utf8_size, bool le);
 bool utf16_has_bom(const unsigned char *utf16, bool *le);
 unsigned long utf8length(const unsigned char *utf8);
 const unsigned char* utf8decode(const unsigned char *utf8, unsigned short *ucs);
--- a/lib/rbcodec/metadata/id3tags.c
+++ b/lib/rbcodec/metadata/id3tags.c
@ -574,17 +574,13 @@ static void unicode_munge(unsigned char* string, unsigned char* utf8buf, int *le
    unsigned char *str = string;
    unsigned char* utf8 = utf8buf;

-    int i = 0;
-    int templen = 0;
-
    switch (str[0]) {
        case 0x01: /* Unicode with or without BOM */
        case 0x02:
            (*len)--;
            str++;
            bool le;
-
-
+            int i = 0;
            /* Handle frames with more than one string
               (needed for TXXX frames).*/
            do {
@ -593,24 +589,22 @@ static void unicode_munge(unsigned char* string, unsigned char* utf8buf, int *le
                    str += BOM_UTF_16_SIZE;
                    *len -= BOM_UTF_16_SIZE;
                }
+                string = str;

                while ((i < *len) && (str[0] || str[1])) {
-                    if(le)
-                        utf8 = utf16LEdecode(str, utf8, 1);
-                    else
-                        utf8 = utf16BEdecode(str, utf8, 1);
-
                    str+=2;
                    i += 2;
                }

+                utf8 = utf16decode(string, utf8, (str-string)>>1 /*(str-string)/2*/, utf8buf_size, le);
                *utf8++ = 0; /* Terminate the string */
-                templen += (strlen(&utf8buf[templen]) + 1);
+                utf8buf_size -= utf8 - utf8buf;
                str += 2;
-                i+=2;
-            } while(i < *len);
-            *len = templen - 1;
+                i += 2;
+            } while(i < *len && utf8buf_size > 0);
+            *len = utf8 - utf8buf  - 1;
            break;
+
        /* case 0x03:  UTF-8 encoded string handled by parse_as_utf8 */

        case 0x00: /* Type 0x00 is ordinary ISO 8859-1 */