unicode: Unify check for UTF-16 BOM

Adds utf16_has_bom function

Change-Id: I67ea474c9cf6ca6e6684351c2f54131164b7903c
2025-10-14 02:27:39 -04:00 · 2024-12-11 16:00:55 +02:00 · 2024-12-11 16:00:55 +02:00 · a23002cd5e
commit a23002cd5e
parent 11fbbc7826
4 changed files with 43 additions and 33 deletions
--- a/apps/cuesheet.c
+++ b/apps/cuesheet.c
@ -231,15 +231,14 @@ bool parse_cuesheet(struct cuesheet_file *cue_file, struct cuesheet *cue)
            char_enc = CHAR_ENC_UTF_8;
            bom_read = BOM_UTF_8_SIZE;
        }
-        else if(!memcmp(line, BOM_UTF_16_LE, BOM_UTF_16_SIZE))
+        else
        {
-            char_enc = CHAR_ENC_UTF_16_LE;
-            bom_read = BOM_UTF_16_SIZE;
-        }
-        else if(!memcmp(line, BOM_UTF_16_BE, BOM_UTF_16_SIZE))
-        {
-            char_enc = CHAR_ENC_UTF_16_BE;
-            bom_read = BOM_UTF_16_SIZE;
+            bool le;
+            if (utf16_has_bom(line, &le))
+            {
+                char_enc = le ? CHAR_ENC_UTF_16_LE : CHAR_ENC_UTF_16_BE;
+                bom_read = BOM_UTF_16_SIZE;
+            }
        }
    }

--- a/firmware/common/unicode.c
+++ b/firmware/common/unicode.c
@ -437,6 +437,28 @@ unsigned char* utf16BEdecode(const unsigned char *utf16, unsigned char *utf8,
    return utf8;
 }

+bool utf16_has_bom(const unsigned char *utf16, bool *le)
+{
+    unsigned long ucs = utf16[0] << 8 | utf16[1]; /* first two bytes; utf16[0] is the high byte */
+
+    if (ucs == 0xFEFF) /* BOM stored high byte first: report big endian */
+    {
+        *le  = false;
+        return true;
+    }
+
+    if (ucs == 0xFFFE) /* BOM stored low byte first: report little endian */
+    {
+        *le = true;
+        return true;
+    }
+
+    /* No BOM found, so guess the byte order instead: if the second byte is
+       0x00 it is probably the most significant one, i.e. little endian. */
+    *le = utf16[1] == 0; /* *le is written even though false is returned */
+    return false;
+}
+
 #if 0 /* currently unused */
 /* Recode any UTF-16 string to UTF-8 */
 unsigned char* utf16decode(const unsigned char *utf16, unsigned char *utf8,
--- a/firmware/include/rbunicode.h
+++ b/firmware/include/rbunicode.h
@ -29,6 +29,7 @@
 #define _RBUNICODE_H_
 
 #include "config.h"
+#include <stdbool.h>

 #define MASK   0xC0 /* 11000000 */
 #define COMP   0x80 /* 10x      */
@ -58,6 +59,7 @@ unsigned char* utf8encode(unsigned long ucs, unsigned char *utf8);
 unsigned char* iso_decode(const unsigned char *latin1, unsigned char *utf8, int cp, int count);
 unsigned char* utf16LEdecode(const unsigned char *utf16, unsigned char *utf8, int count);
 unsigned char* utf16BEdecode(const unsigned char *utf16, unsigned char *utf8, int count);
+bool utf16_has_bom(const unsigned char *utf16, bool *le);
 unsigned long utf8length(const unsigned char *utf8);
 const unsigned char* utf8decode(const unsigned char *utf8, unsigned short *ucs);
 void set_codepage(int cp);
--- a/lib/rbcodec/metadata/id3tags.c
+++ b/lib/rbcodec/metadata/id3tags.c
@ -570,8 +570,6 @@ static bool parse_as_utf8(char* string, int *len)
   string.  If it is, we convert it to a UTF-8 string.  If it's not unicode,
   we convert from the default codepage */
 static void unicode_munge(char* string, char* utf8buf, int *len) {
-    long tmp;
-    bool le = false;
    int i = 0;
    unsigned char *str = (unsigned char *)string;
    int templen = 0;
@ -590,28 +588,17 @@ static void unicode_munge(char* string, char* utf8buf, int *len) {
        case 0x02:
            (*len)--;
            str++;
+            bool le;
+

            /* Handle frames with more than one string
               (needed for TXXX frames).*/
            do {
-                tmp = bytes2int(0, 0, str[0], str[1]);
-
-                /* Now check if there is a BOM
-                   (zero-width non-breaking space, 0xfeff)
-                   and if it is in little or big endian format */
-                if(tmp == 0xfffe) { /* Little endian? */
-                    le = true;
-                    str += 2;
-                    (*len)-=2;
-                } else if(tmp == 0xfeff) { /* Big endian? */
-                    str += 2;
-                    (*len)-=2;
-                } else
-                /* If there is no BOM (which is a specification violation),
-                   let's try to guess it. If one of the bytes is 0x00, it is
-                   probably the most significant one. */
-                    if(str[1] == 0)
-                        le = true;
+                if (utf16_has_bom(str, &le))
+                {
+                    str += BOM_UTF_16_SIZE;
+                    *len -= BOM_UTF_16_SIZE;
+                }

                while ((i < *len) && (str[0] || str[1])) {
                    if(le)
@ -734,17 +721,17 @@ static bool is_cuesheet(char *tag, unsigned char *char_enc, unsigned char *cuesh
    switch (*(tag++))
    {
        case 0x01:
-            if (!memcmp(tag, BOM_UTF_16_BE, BOM_UTF_16_SIZE))
-                *char_enc = CHAR_ENC_UTF_16_BE;
-            else if (!memcmp(tag, BOM_UTF_16_LE, BOM_UTF_16_SIZE))
-                *char_enc = CHAR_ENC_UTF_16_LE;
-            else
+        {
+            bool le;
+            if (!utf16_has_bom(tag, &le))
                return false;

+            *char_enc = le ? CHAR_ENC_UTF_16_LE: CHAR_ENC_UTF_16_BE;
            tag+= BOM_UTF_16_SIZE;
            /* \1 + BOM(2) + C0U0E0S0H0E0E0T000 = 21 */
            *cuesheet_offset = 21;
            break;
+        }

        case 0x02:
            *char_enc = CHAR_ENC_UTF_16_BE;