waiting is over: initial unicode commit

git-svn-id: svn://svn.rockbox.org/rockbox/trunk@8169 a1c6a512-1295-4272-9138-f99709370657
2005-12-06 13:27:15 +00:00 · 2005-12-06 13:27:15 +00:00 · b0fee17d6e
commit b0fee17d6e
parent 01917ec980
70 changed files with 253032 additions and 14343 deletions
--- a/firmware/common/unicode.c
+++ b/firmware/common/unicode.c
@ -0,0 +1,307 @@
+/*   Some conversion functions for handling UTF-8
+ *
+ *   copyright Marcoen Hirschberg (2004,2005)
+ *
+ *   I got all the info from:
+ *   http://www.cl.cam.ac.uk/~mgk25/unicode.html#utf-8
+ *   and
+ *   http://en.wikipedia.org/wiki/Unicode
+ */
+
+#include <stdio.h>
+#include "file.h"
+#include "debug.h"
+#include "rbunicode.h"
+
+#ifndef O_BINARY
+#define O_BINARY 0  /* fallback so open() flags compile where O_BINARY is not defined */
+#endif
+
+#define NUM_TABLES 5  /* number of entries in filename[] */
+#define NUM_CODEPAGES 13  /* number of entries in cp_2_table[] */
+
+static int default_codepage = 0;  /* used by iso_decode() when called with cp == -1 */
+static unsigned short codepage_table[MAX_CP_TABLE_SIZE];  /* filled by load_cp_table() */
+static int loaded_cp_table = 0;  /* table number held in codepage_table, 0 = none */
+
+
+static const unsigned char utf8comp[6] = 
+{
+    0x00, 0xC0, 0xE0, 0xF0, 0xF8, 0xFC  /* lead-byte marker, indexed by number of tail bytes */
+};
+
+static const char *filename[NUM_TABLES] =
+{
+    CODEPAGE_DIR"/iso.cp",  /* table 1: shared by codepages 1-7 */
+    CODEPAGE_DIR"/932.cp",  /* SJIS    */
+    CODEPAGE_DIR"/936.cp",  /* GB2312  */
+    CODEPAGE_DIR"/949.cp",  /* KSX1001 */
+    CODEPAGE_DIR"/950.cp"   /* BIG5    */
+};
+
+/* Maps a codepage number (the index) to the table it needs: 0 means no
+ * table at all, n > 0 means the file filename[n-1]. */
+static const char cp_2_table[NUM_CODEPAGES] =
+{
+    0,                      /* 0x00: no table */
+    1, 1, 1, 1, 1, 1, 1,    /* 0x01-0x07: single-byte pages, one shared file */
+    2,                      /* 0x08: SJIS */
+    3,                      /* 0x09: GB2312 */
+    4,                      /* 0x0A: KSX1001 */
+    5,                      /* 0x0B: BIG5 */
+    0                       /* 0x0C: UTF-8, no table */
+};
+
+/* Load the table needed by codepage cp into codepage_table.
+ *
+ * cp is a codepage number, i.e. an index into cp_2_table.
+ *
+ * Returns 1 if the codepage needs no table, if its table is already loaded,
+ * or if the table was loaded successfully. Returns 0 if cp is out of range
+ * or the table file could not be opened, sized or read completely.
+ *
+ * A failed read leaves codepage_table partly overwritten, so in that case
+ * loaded_cp_table is reset to 0 to force a reload next time. */
+int load_cp_table(int cp)
+{
+    int i = 0;
+    int table, file, filesize, tablesize;
+    unsigned char tmp[2];
+
+    /* cp is used as an array index below, so reject anything outside it. */
+    if (cp < 0 || cp >= NUM_CODEPAGES)
+        return 0;
+
+    table = cp_2_table[cp];
+
+    /* Test the table number, not cp: several codepages map to table 0 and
+       none of them may reach filename[table-1]. */
+    if (table == 0 || table == loaded_cp_table)
+        return 1;
+
+    file = open(filename[table-1], O_RDONLY|O_BINARY);
+
+    if (file < 0) {
+        DEBUGF("Can't open codepage file: %s\n", filename[table-1]);
+        return 0;
+    }
+
+    /* The file is a flat array of 16-bit entries. Keep the raw size so that
+       an lseek() failure (-1) is not hidden by the division. */
+    filesize = lseek(file, 0, SEEK_END);
+    tablesize = filesize / 2;
+
+    if (filesize < 0 || tablesize > MAX_CP_TABLE_SIZE ||
+        lseek(file, 0, SEEK_SET) < 0) {
+        DEBUGF("Invalid codepage file: %s\n", filename[table-1]);
+        close(file);
+        return 0;
+    }
+
+    while (i < tablesize) {
+        /* Anything but a full 2-byte read (EOF, short read, error) fails. */
+        if (read(file, tmp, 2) != 2) {
+            DEBUGF("Can't read from codepage file: %s\n", filename[table-1]);
+            loaded_cp_table = 0;
+            close(file);
+            return 0;
+        }
+        /* Entries are stored low byte first. */
+        codepage_table[i++] = (tmp[1] << 8) | tmp[0];
+    }
+
+    loaded_cp_table = table;
+    close(file);
+    return 1;
+}
+
+/* Encode a UCS value as UTF-8 and return a pointer after this UTF-8 char.
+ *
+ * ucs:  code point to encode. Values above 0x7FFFFFFF cannot be expressed
+ *       in UTF-8 and are replaced by 0xffff, the marker this file uses for
+ *       invalid characters.
+ * utf8: output position; up to 6 bytes are written, no terminator.
+ *
+ * Always emits the shortest form: 1 byte up to 0x7F, 2 up to 0x7FF,
+ * 3 up to 0xFFFF, 4 up to 0x1FFFFF, 5 up to 0x3FFFFFF, 6 beyond that. */
+unsigned char* utf8encode(unsigned long ucs, unsigned char *utf8)
+{
+    int tail = 0;
+
+    /* Also guarantees that tail never grows past 5, the last utf8comp slot. */
+    if (ucs > 0x7FFFFFFFUL)
+        ucs = 0xffff;
+
+    /* A lead byte followed by `tail` continuation bytes holds 5*tail + 6
+       payload bits (11, 16, 21, 26, 31), so add tail bytes until the value
+       fits. */
+    if (ucs > 0x7F)
+        while (ucs >> (5*tail + 6))
+            tail++;
+
+    /* Lead byte: length marker plus the topmost payload bits. */
+    *utf8++ = (ucs >> (6*tail)) | utf8comp[tail];
+
+    /* Continuation bytes, six payload bits each, most significant first.
+       MASK and COMP come from outside this file; presumably 0xC0 and 0x80,
+       confirm in rbunicode.h. */
+    while (tail--)
+        *utf8++ = ((ucs >> (6*tail)) & (MASK ^ 0xFF)) | COMP;
+
+    return utf8;
+}
+
+/* Fetch one entry of the loaded codepage table. An index outside the table
+   yields 0, which iso_decode() treats as "unknown character". */
+static unsigned short cp_table_entry(unsigned int index)
+{
+    return (index < MAX_CP_TABLE_SIZE) ? codepage_table[index] : 0;
+}
+
+/* Recode an iso encoded string to UTF-8
+ *
+ * iso:   input bytes in the codepage given by cp.
+ * utf8:  output position. Nothing bounds the writes, so the caller has to
+ *        provide enough room; no terminator is written.
+ * cp:    codepage number, or -1 for the one chosen with set_codepage().
+ *        An out-of-range number, or a table that fails to load, falls back
+ *        to codepage 0 (bytes are taken as code points directly).
+ * count: number of input bytes to convert; values <= 0 convert nothing.
+ *
+ * Returns a pointer just past the last byte written. */
+unsigned char* iso_decode(const unsigned char *iso, unsigned char *utf8,
+                          int cp, int count)
+{
+    unsigned short ucs, tmp;
+
+    if (cp == -1) /* use default codepage */
+        cp = default_codepage;
+
+    if (cp < 0 || cp >= NUM_CODEPAGES || !load_cp_table(cp))
+        cp = 0;
+
+    while (count-- > 0) {
+        /* Bytes below 128 are copied through unchanged. */
+        if (*iso < 128) {
+            *utf8++ = *iso++;
+            continue;
+        }
+
+        /* cp tells us which codepage to convert from */
+        switch (cp) {
+            case 0x01: /* Greek (ISO-8859-7) */
+            case 0x02: /* Hebrew (ISO-8859-8) */
+            case 0x03: /* Russian (CP1251) */
+            case 0x04: /* Thai (ISO-8859-11) */
+            case 0x05: /* Arabic (ISO-8859-6) */
+            case 0x06: /* Turkish (ISO-8859-9) */
+            case 0x07: /* Latin Extended (ISO-8859-2) */
+                /* Each of these pages owns a run of 128 table entries. */
+                tmp = ((cp-1)*128) + (*iso++ - 128);
+                ucs = cp_table_entry(tmp);
+                break;
+
+            case 0x08: /* Japanese (SJIS) */
+                if (*iso > 0xA0 && *iso < 0xE0) {
+                    /* Single-byte character: consume it, and apply the
+                       same 0x8000 offset as the two-byte lookup below. */
+                    tmp = (*iso++ | 0xA100) - 0x8000;
+                    ucs = cp_table_entry(tmp);
+                    break;
+                }
+                /* fall through: two-byte SJIS character */
+
+            case 0x09: /* Simplified Chinese (GB2312) */
+            case 0x0A: /* Korean (KSX1001) */
+            case 0x0B: /* Traditional Chinese (BIG5) */
+                /* No second byte available: pass the lone byte on as is. */
+                if (count < 1 || !iso[1]) {
+                    ucs = *iso++;
+                    break;
+                }
+
+                /* we assume all cjk strings are written
+                   in big endian order */
+                tmp = *iso++ << 8;
+                tmp |= *iso++;
+                tmp -= 0x8000;
+                ucs = cp_table_entry(tmp);
+                count--;
+                break;
+
+            case 0x0C: /* UTF-8, do nothing */
+                /* NOTE(review): the byte is still re-encoded below like any
+                   other code point; presumably callers do not pass UTF-8
+                   text through here - confirm. */
+            default:
+                ucs = *iso++;
+                break;
+        }
+
+        if (ucs == 0) /* unknown char, assume invalid encoding */
+            ucs = 0xffff;
+        utf8 = utf8encode(ucs, utf8);
+    }
+    return utf8;
+}
+
+/* Recode a UTF-16 string with little-endian byte ordering to UTF-8
+ *
+ * utf16: input, two bytes per code unit, low byte first.
+ * utf8:  output position; no terminator is written and nothing bounds the
+ *        writes, so the caller must provide enough room.
+ * count: number of 16-bit code units to convert (a surrogate pair counts
+ *        as two).
+ *
+ * A high surrogate followed by a low surrogate is combined into one code
+ * point. A surrogate without a valid partner, including a high surrogate
+ * that is the last unit of the input, is emitted as 0xffff, the marker this
+ * file uses for invalid characters.
+ *
+ * Returns a pointer just past the last byte written. */
+unsigned char* utf16LEdecode(const unsigned char *utf16, unsigned char *utf8, unsigned int count)
+{
+    unsigned long ucs;
+
+    while (count != 0) {
+        unsigned int unit = utf16[0] | (utf16[1] << 8);
+
+        /* Only look at the second unit when the input still contains it. */
+        if (unit >= 0xD800 && unit < 0xDC00 && count >= 2
+            && utf16[3] >= 0xDC && utf16[3] < 0xE0) {
+            unsigned int low = utf16[2] | (utf16[3] << 8);
+
+            ucs = 0x10000 + (((unsigned long)(unit - 0xD800) << 10)
+                             | (low - 0xDC00));
+            utf16 += 4;
+            count -= 2;
+        } else {
+            /* Plain BMP unit, or an unpaired surrogate. */
+            ucs = (unit >= 0xD800 && unit < 0xE000) ? 0xffff : unit;
+            utf16 += 2;
+            count -= 1;
+        }
+        utf8 = utf8encode(ucs, utf8);
+    }
+    return utf8;
+}
+
+/* Recode a UTF-16 string with big-endian byte ordering to UTF-8
+ *
+ * utf16: input, two bytes per code unit, high byte first.
+ * utf8:  output position; no terminator is written and nothing bounds the
+ *        writes, so the caller must provide enough room.
+ * count: number of 16-bit code units to convert (a surrogate pair counts
+ *        as two).
+ *
+ * A high surrogate followed by a low surrogate is combined into one code
+ * point. A surrogate without a valid partner, including a high surrogate
+ * that is the last unit of the input, is emitted as 0xffff, the marker this
+ * file uses for invalid characters.
+ *
+ * Returns a pointer just past the last byte written. */
+unsigned char* utf16BEdecode(const unsigned char *utf16, unsigned char *utf8, unsigned int count)
+{
+    unsigned long ucs;
+
+    while (count != 0) {
+        unsigned int unit = (utf16[0] << 8) | utf16[1];
+
+        /* Only look at the second unit when the input still contains it. */
+        if (unit >= 0xD800 && unit < 0xDC00 && count >= 2
+            && utf16[2] >= 0xDC && utf16[2] < 0xE0) {
+            unsigned int low = (utf16[2] << 8) | utf16[3];
+
+            ucs = 0x10000 + (((unsigned long)(unit - 0xD800) << 10)
+                             | (low - 0xDC00));
+            utf16 += 4;
+            count -= 2;
+        } else {
+            /* Plain BMP unit, or an unpaired surrogate. */
+            ucs = (unit >= 0xD800 && unit < 0xE000) ? 0xffff : unit;
+            utf16 += 2;
+            count -= 1;
+        }
+        utf8 = utf8encode(ucs, utf8);
+    }
+    return utf8;
+}
+
+/* Recode any UTF-16 string to UTF-8
+ *
+ * utf16: input, two bytes per code unit.
+ * utf8:  output position; no terminator is written.
+ * count: number of 16-bit code units in the input, byte order mark
+ *        included. With count == 0 nothing is read or written.
+ *
+ * The first unit selects the byte order: bytes FE FF mean big-endian,
+ * FF FE mean little-endian. The mark itself is not converted. Without a
+ * mark the whole input is treated as big-endian.
+ *
+ * Returns a pointer just past the last byte written. */
+unsigned char* utf16decode(const unsigned char *utf16, unsigned char *utf8, unsigned int count)
+{
+    unsigned long bom;
+
+    /* Nothing to inspect; also keeps count-1 below from wrapping. */
+    if (count == 0)
+        return utf8;
+
+    bom = ((unsigned long)utf16[0] << 8) | utf16[1];
+
+    if (bom == 0xFEFF) /* Check for BOM */
+        return utf16BEdecode(utf16 + 2, utf8, count - 1);
+    if (bom == 0xFFFE)
+        return utf16LEdecode(utf16 + 2, utf8, count - 1);
+
+    /* ADDME: Should default be LE or BE? */
+    return utf16BEdecode(utf16, utf8, count);
+}
+
+/* Count the characters of a NUL-terminated UTF-8 string.
+ *
+ * Every byte is counted except those for which (byte & MASK) == COMP,
+ * i.e. the bytes that continue a multi-byte sequence. */
+unsigned long utf8length(const unsigned char *utf8)
+{
+    unsigned long chars = 0;
+    const unsigned char *p;
+
+    for (p = utf8; *p != 0; p++) {
+        if ((*p & MASK) == COMP)
+            continue;
+        chars++;
+    }
+
+    return chars;
+}
+
+/* Decode 1 UTF-8 char and return a pointer to the next char.
+ *
+ * utf8: start of the sequence to decode.
+ * ucs:  receives the decoded value, or 0xffff when the sequence is invalid
+ *       or the value lies above U+FFFF (not supported here).
+ *
+ * The first byte is always consumed. Tail bytes are consumed only while
+ * they are valid continuation bytes (10xxxxxx); the first byte that is
+ * not, including the terminating NUL of a truncated string, is left for
+ * the next call, so the returned pointer never moves past the terminator. */
+const unsigned char* utf8decode(const unsigned char *utf8, unsigned short *ucs)
+{
+    unsigned char c = *utf8++;
+    unsigned long code;
+    int tail = 0;
+
+    if ((c <= 0x7f) || (c >= 0xc2)) {
+        /* Start of new character. */
+        if (c < 0x80) {        /* U-00000000 - U-0000007F, 1 byte */
+            code = c;
+        } else if (c < 0xe0) { /* U-00000080 - U-000007FF, 2 bytes */
+            tail = 1;
+            code = c & 0x1f;
+        } else if (c < 0xf0) { /* U-00000800 - U-0000FFFF, 3 bytes */
+            tail = 2;
+            code = c & 0x0f;
+        } else if (c < 0xf5) { /* U-00010000 - U-001FFFFF, 4 bytes */
+            tail = 3;
+            code = c & 0x07;
+        } else {
+            /* Invalid size. */
+            code = 0xffff;
+        }
+
+        while (tail--) {
+            c = *utf8;
+            if ((c & 0xc0) != 0x80) {
+                /* Missing continuation byte (possibly the end of the
+                   string): report an invalid char and do not consume it. */
+                code = 0xffff;
+                break;
+            }
+            /* Valid continuation character. */
+            code = (code << 6) | (c & 0x3f);
+            utf8++;
+        }
+    } else {
+        /* Invalid UTF-8 char */
+        code = 0xffff;
+    }
+    /* currently we don't support chars above U-FFFF */
+    *ucs = (code < 0x10000) ? code : 0xffff;
+    return utf8;
+}
+
+/* Choose the codepage iso_decode() uses when it is called with cp == -1.
+ *
+ * The stored value ends up as an index into cp_2_table, so a number outside
+ * 0 .. NUM_CODEPAGES-1 is replaced by 0 instead of being stored. */
+void set_codepage(int cp)
+{
+    if (cp < 0 || cp >= NUM_CODEPAGES)
+        cp = 0;
+
+    default_codepage = cp;
+}
+
+/* seek to a given char in a utf8 string and
+   return its start position in the string
+
+   utf8:   NUL-terminated string.
+   offset: number of characters to skip; values <= 0 skip nothing.
+
+   Returns the byte index at which character number `offset` starts. If the
+   string holds fewer characters, the index of the terminating NUL is
+   returned rather than reading past the end of the string. */
+int utf8seek(const unsigned char* utf8, int offset)
+{
+    int pos = 0;
+
+    while (offset-- > 0 && utf8[pos] != 0) {
+        /* Step over the first byte of the character, then over every byte
+           marked as a continuation ((byte & MASK) == COMP). */
+        pos++;
+        while ((utf8[pos] & MASK) == COMP)
+            pos++;
+    }
+    return pos;
+}