rockbox/firmware/common/unicode.c

/***************************************************************************
 *             __________               __   ___.
 *   Open      \______   \ ____   ____ |  | _\_ |__   _______  ___
 *   Source     |       _//  _ \_/ ___\|  |/ /| __ \ /  _ \  \/  /
 *   Jukebox    |    |   (  <_> )  \___|    < | \_\ (  <_> > <  <
 *   Firmware   |____|_  /\____/ \___  >__|_ \|___  /\____/__/\_ \
 *                     \/            \/     \/    \/            \/
 * $Id$
 *
 * Copyright (c) 2004,2005 by Marcoen Hirschberg
 *
 * This program is free software; you can redistribute it and/or
 * modify it under the terms of the GNU General Public License
 * as published by the Free Software Foundation; either version 2
 * of the License, or (at your option) any later version.
 *
 * This software is distributed on an "AS IS" basis, WITHOUT WARRANTY OF ANY
 * KIND, either express or implied.
 *
 ****************************************************************************/
/*   Some conversion functions for handling UTF-8
 *
 *   I got all the info from:
 *   http://www.cl.cam.ac.uk/~mgk25/unicode.html#utf-8
 *   and
 *   http://en.wikipedia.org/wiki/Unicode
 */

#include <stdio.h>
#include "config.h"
#include "system.h"
#include "thread.h"
#include "file.h"
#include "debug.h"
#include "rbunicode.h"
#include "rbpaths.h"
#include "pathfuncs.h"
#include "core_alloc.h"

/* Fallbacks for flags that the included headers may not provide: when a flag
   is missing it is defined as 0 so that it can be referenced unconditionally
   (presumably these are open() flags -- confirm against file.h) */
#ifndef O_BINARY
#define O_BINARY 0
#endif
#ifndef O_NOISODECODE
#define O_NOISODECODE 0
#endif

/* Read a 16-bit value from two consecutive bytes at p, least (getle16) or
   most (getbe16) significant byte first. The argument is parenthesized so
   that pointer expressions such as `buf + 2` index correctly; it is still
   evaluated twice, so it must not have side effects. */
#define getle16(p) ((p)[0] | ((p)[1] << 8))
#define getbe16(p) (((p)[0] << 8) | (p)[1])

#if !defined (__PCTOOL__) && (CONFIG_PLATFORM & PLATFORM_NATIVE)
/* Because file scanning uses the default CP table when matching entries,
   on-demand loading is not feasible; we also must use the filesystem lock */
#include "file_internal.h"
#else /* APPLICATION */
#ifdef __PCTOOL__
/* PC tool builds: yield() expands to nothing and the default codepage table
   uses a static buffer (see DEFAULT_CP_STATIC_ALLOC below) */
#define yield()
#define DEFAULT_CP_STATIC_ALLOC
#endif
/* outside native builds the table files are opened with plain open() */
#define open_noiso_internal open
#endif /* !APPLICATION */

/* Locking hooks used around the codepage table bookkeeping. The mutex based
   variant is compiled out; the active definitions take no lock at all:
   cp_lock_init() is an empty statement and cp_lock_enter()/cp_lock_leave()
   are empty asm statements. */
#if 0 /* not needed just now (will probably end up a spinlock) */
#include "mutex.h"
static struct mutex cp_mutex SHAREDBSS_ATTR;
#define cp_lock_init()   mutex_init(&cp_mutex)
#define cp_lock_enter()  mutex_lock(&cp_mutex)
#define cp_lock_leave()  mutex_unlock(&cp_mutex)
#else
#define cp_lock_init()   do {} while (0)
#define cp_lock_enter()  asm volatile ("")
#define cp_lock_leave()  asm volatile ("")
#endif

/* Table id: identifies which conversion table a codepage uses. Several
   codepages may share one table (all CP_TID_ISO codepages share "iso.cp"). */
enum cp_tid
{
    CP_TID_NONE = -1, /* no table needed */
    CP_TID_ISO,       /* shared single-byte table */
    CP_TID_932,       /* SJIS */
    CP_TID_936,       /* GB2312 */
    CP_TID_949,       /* KSX1001 */
    CP_TID_950,       /* BIG5 */
};

/* Static description of one codepage */
struct cp_info
{
    int8_t      tid;        /* enum cp_tid value selecting the table */
    const char  *filename;  /* table file name, NULL if no file is needed */
    const char  *name;      /* name returned by get_codepage_name() */
};

/* Maximum number of 16-bit entries in a codepage table; table files larger
   than twice this many bytes are rejected by the loader */
#define MAX_CP_TABLE_SIZE  32768

/* Codepage table file names, appended to CODEPAGE_DIR by the loader */
#define CPF_ISO "iso.cp"
#define CPF_932 "932.cp"  /* SJIS    */
#define CPF_936 "936.cp"  /* GB2312  */
#define CPF_949 "949.cp"  /* KSX1001 */
#define CPF_950 "950.cp"  /* BIG5    */

/* Per-codepage information, indexed by codepage id. Every slot is first
   filled with the "unknown" entry through the range designator and the
   known codepages then override their own slot. The extra slot at index
   NUM_CODEPAGES therefore stays "unknown" and serves as the fallback entry
   for out-of-range ids. */
static const struct cp_info cp_info[NUM_CODEPAGES+1] =
{
    [0 ... NUM_CODEPAGES] = { CP_TID_NONE, NULL   , "unknown"     },
    [ISO_8859_1]          = { CP_TID_NONE, NULL   , "ISO-8859-1"  },
    [ISO_8859_7]          = { CP_TID_ISO , CPF_ISO, "ISO-8859-7"  },
    [ISO_8859_8]          = { CP_TID_ISO , CPF_ISO, "ISO-8859-8"  },
    [WIN_1251]            = { CP_TID_ISO , CPF_ISO, "CP1251"      },
    [ISO_8859_11]         = { CP_TID_ISO , CPF_ISO, "ISO-8859-11" },
    [WIN_1256]            = { CP_TID_ISO , CPF_ISO, "CP1256"      },
    [ISO_8859_9]          = { CP_TID_ISO , CPF_ISO, "ISO-8859-9"  },
    [ISO_8859_2]          = { CP_TID_ISO , CPF_ISO, "ISO-8859-2"  },
    [WIN_1250]            = { CP_TID_ISO , CPF_ISO, "CP1250"      },
    [WIN_1252]            = { CP_TID_ISO , CPF_ISO, "CP1252"      },
    [SJIS]                = { CP_TID_932 , CPF_932, "SJIS"        },
    [GB_2312]             = { CP_TID_936 , CPF_936, "GB-2312"     },
    [KSX_1001]            = { CP_TID_949 , CPF_949, "KSX-1001"    },
    [BIG_5]               = { CP_TID_950 , CPF_950, "BIG5"        },
    [UTF_8]               = { CP_TID_NONE, NULL   , "UTF-8"       },
};

/* State of the default codepage (changed by set_codepage()) */
static int default_cp = INIT_CODEPAGE;           /* current default codepage id */
static int default_cp_tid = CP_TID_NONE;         /* table id of the default table */
static int default_cp_handle = 0;                /* handle of the default table; only values > 0 refer to a table */
static int volatile default_cp_table_ref = 0;    /* number of decoders currently using the default table */

/* State of the on-demand table held in codepage_table */
static int loaded_cp_tid = CP_TID_NONE;          /* table id currently held in codepage_table */
static int volatile cp_table_ref = 0;            /* users of codepage_table, plus the CP_LOADING flag */
#define CP_LOADING BIT_N(sizeof(int)*8-1) /* guard against multi loaders */

/* non-default codepage table buffer (cannot be bufalloced! playback itself
   may be making the load request) */
static unsigned short codepage_table[MAX_CP_TABLE_SIZE+1]; // XXX convert to ucschar_t if we ever need > 16bit mappings?

#if defined(APPLICATION) && defined(__linux__)
/* Linux-side names of the codepages, indexed by codepage id; the extra slot
   at index NUM_CODEPAGES keeps the "unknown" fallback */
static const char * const name_codepages_linux[NUM_CODEPAGES+1] =
{
    [0 ... NUM_CODEPAGES] = "unknown",
    [ISO_8859_1]          = "iso8859-1",
    [ISO_8859_7]          = "iso8859-7",
    [ISO_8859_8]          = "iso8859-8",
    [WIN_1251]            = "cp1251",
    [ISO_8859_11]         = "iso8859-11",
    [WIN_1256]            = "cp1256",
    [ISO_8859_9]          = "iso8859-9",
    [ISO_8859_2]          = "iso8859-2",
    [WIN_1250]            = "cp1250",
    /* iso8859-15 is closest, Linux doesn't have a codepage named cp1252 */
    [WIN_1252]            = "iso8859-15",
    [SJIS]                = "cp932",
    [GB_2312]             = "cp936",
    [KSX_1001]            = "cp949",
    [BIG_5]               = "cp950",
    [UTF_8]               = "utf8",
};

/* Return the Linux name of the current default codepage, or "unknown" when
   the default codepage id is outside the known range */
const char *get_current_codepage_name_linux(void)
{
    const int current = default_cp;
    const int known = (current >= 0 && current < NUM_CODEPAGES);

    return name_codepages_linux[known ? current : NUM_CODEPAGES];
}
#endif /* defined(APPLICATION) && defined(__linux__) */

/* Storage backend for the default codepage table. With
   DEFAULT_CP_STATIC_ALLOC a single static buffer is used: "allocation"
   always yields handle 1, and free/pin/unpin do nothing. Otherwise the
   operations map onto the core_alloc API. */
#ifdef DEFAULT_CP_STATIC_ALLOC
static unsigned short default_cp_table_buf[MAX_CP_TABLE_SIZE+1];
#define cp_table_get_data(handle) \
    default_cp_table_buf
#define cp_table_free(handle) \
    do {} while (0)
#define cp_table_alloc(size, opsp) \
    ({ (void)(opsp); 1; })
#define cp_table_pin(handle) \
    do { (void)handle; } while(0)
#define cp_table_unpin(handle) \
    do { (void)handle; } while(0)
#else
#define cp_table_alloc(size, opsp) \
    core_alloc_ex((size), (opsp))
#define cp_table_free(handle) \
    core_free(handle)
#define cp_table_get_data(handle) \
    core_get_data(handle)
#define cp_table_pin(handle) \
    core_pin(handle)
#define cp_table_unpin(handle) \
    core_unpin(handle)
#endif

/* Marker bits ORed into the first byte of an encoded character, indexed by
   the number of bytes that follow it (0..5) */
static const unsigned char utf8comp[6] =
{
    0x00, 0xC0, 0xE0, 0xF0, 0xF8, 0xFC
};

/* Convert a freshly loaded table of `count` little-endian 16-bit entries to
   host byte order in place. Only big-endian builds have work to do here;
   elsewhere the function is empty. */
static inline void cptable_tohw16(uint16_t *buf, unsigned int count)
{
#ifdef ROCKBOX_BIG_ENDIAN
    uint16_t *entry = buf;
    uint16_t * const end = buf + count;

    while (entry < end) {
        *entry = letoh16(*entry);
        entry++;
    }
#else
    (void)buf;
    (void)count;
#endif
}

/* Load the table file belonging to codepage `cp`.
 *
 * buf: destination buffer supplied by the caller, or NULL to have a buffer
 *      allocated through cp_table_alloc().
 *
 * Returns 0 if the codepage has no table file or the file was read into the
 * caller's buffer, a handle > 0 if a buffer was allocated and filled, and -1
 * on any failure (path too long, open/read error, unacceptable file size or
 * failed allocation).
 */
static int alloc_and_load_cp_table(int cp, void *buf)
{
    const char *cpfile = cp_info[cp].filename;
    if (cpfile == NULL)
        return 0; /* nothing to load for this codepage */

    char fullpath[MAX_PATH];
    if (path_append(fullpath, CODEPAGE_DIR, cpfile, sizeof (fullpath))
        >= sizeof (fullpath)) {
        return -1;
    }

    /* must be opened without a chance of reentering from FS code */
    int fd = open_noiso_internal(fullpath, O_RDONLY);
    if (fd < 0)
        return -1;

    off_t nbytes = filesize(fd);

    /* the file must be a non-empty whole number of 16-bit entries that
       fits into a table buffer */
    if (nbytes <= 0 || nbytes > MAX_CP_TABLE_SIZE*2 ||
        (nbytes % (off_t)sizeof (uint16_t)) != 0) {
        close(fd);
        return -1;
    }

    /* allocate only when the caller did not bring a buffer */
    int handle = 0;
    if (buf == NULL) {
        handle = cp_table_alloc(nbytes, NULL);
        if (handle > 0) {
            cp_table_pin(handle);
            buf = cp_table_get_data(handle);
        }
    }

    if (buf == NULL || read(fd, buf, nbytes) != nbytes) {
        if (handle > 0) {
            cp_table_free(handle);
        }
        close(fd);
        return -1;
    }

    close(fd);
    cptable_tohw16(buf, nbytes / sizeof (uint16_t));
    if (handle > 0) {
        cp_table_unpin(handle);
    }
    return handle;
}

/* Returns the number of additional bytes required in the encoded string
 * (bytes_count - 1) for the code point `ucs`.
 *
 * The result is limited to 5, the largest index of the lead-byte table used
 * by the encoder. Without the limit, values >= 0x80000000 produced 6 and, on
 * targets with a 32-bit unsigned long, a shift by more than the type width.
 */
static int utf8_ucs_get_extra_bytes_count(unsigned long ucs)
{
    int tail = 0;

    if (ucs > 0x7F)
        while (tail < 5 && (ucs >> (5*tail + 6)))
            tail++;

    return tail;
}

/* Write `ucs` as a sequence of tail + 1 bytes starting at `utf8` and return
   the position just after the last byte written. `tail` must be the number
   of bytes following the first one (0..5). */
static unsigned char * utf8encode_internal(unsigned long ucs, unsigned char *utf8, int tail)
{
    unsigned char *out = utf8;
    int shift = 6 * tail;

    /* first byte: length marker plus the most significant payload bits */
    *out++ = (ucs >> shift) | utf8comp[tail];

    /* remaining bytes carry six payload bits each, most significant first */
    for (int left = tail; left > 0; left--) {
        shift -= 6;
        *out++ = ((ucs >> shift) & (MASK ^ 0xFF)) | COMP;
    }

    return out;
}

/* Size-limited encode: subtract the encoded length of `ucs` from *utf8_size
   and write the character only if the result is not negative. When the
   character does not fit nothing is written, `utf8` is returned unchanged
   and *utf8_size is left negative. */
FORCE_INLINE static unsigned char* utf8encode_ex(unsigned long ucs, unsigned char *utf8, int* utf8_size)
{
    const int extra = utf8_ucs_get_extra_bytes_count(ucs);

    *utf8_size -= extra + 1;
    if (*utf8_size < 0)
        return utf8;

    return utf8encode_internal(ucs, utf8, extra);
}

/* Encode a UCS value as UTF-8 and return a pointer after this UTF-8 char.
   No size limit is applied to the output. */
unsigned char* utf8encode(unsigned long ucs, unsigned char *utf8)
{
    const int extra = utf8_ucs_get_extra_bytes_count(ucs);

    return utf8encode_internal(ucs, utf8, extra);
}

/* Recode `count` bytes of a codepage-encoded string to UTF-8 without an
   output size limit (the limit passed to iso_decode_ex() is INT_MAX).
   Returns the position just after the last byte written. */
unsigned char* iso_decode(const unsigned char *iso, unsigned char *utf8, int cp, int count)
{
    return iso_decode_ex(iso, utf8, cp, count, INT_MAX);
}

/* Recode an iso encoded string to UTF-8
 *
 * iso:       input bytes in codepage `cp`
 * utf8:      output buffer
 * cp:        codepage id; out-of-range values select the default codepage
 * count:     number of input bytes to convert
 * utf8_size: output limit in bytes; conversion stops once it is used up
 *
 * Returns the position just after the last byte written. No terminator is
 * appended.
 */
unsigned char* iso_decode_ex(const unsigned char *iso, unsigned char *utf8, int cp, int count, int utf8_size)
{
    uint16_t *table = NULL;

    cp_lock_enter();

    if (cp < 0 || cp >= NUM_CODEPAGES)
        cp = default_cp;

    int tid = cp_info[cp].tid;

    /* Pick the conversion table for tid and take a reference on it. The loop
       repeats until either the default table or codepage_table holds the
       wanted table (or no table is needed at all). */
    while (1) {
        if (tid == default_cp_tid) {
            /* use default table */
            if (default_cp_handle > 0) {
                table = cp_table_get_data(default_cp_handle);
                default_cp_table_ref++;
            }

            break;
        }

        bool load = false;

        if (tid == loaded_cp_tid) {
            /* use loaded table */
            if (!(cp_table_ref & CP_LOADING)) {
                if (tid != CP_TID_NONE) {
                    table = codepage_table;
                    cp_table_ref++;
                }

                break;
            }
            /* another caller is still loading it: wait below */
        } else if (cp_table_ref == 0) {
            /* codepage_table is unused, so this caller may replace it */
            load = true;
            cp_table_ref |= CP_LOADING;
        }

        /* alloc and load must be done outside the lock */
        cp_lock_leave();

        if (!load) {
            yield();
        } else if (alloc_and_load_cp_table(cp, codepage_table) < 0) {
            /* load failed: fall back to the initial codepage */
            cp = INIT_CODEPAGE; /* table may be clobbered now */
            tid = cp_info[cp].tid;
        }

        cp_lock_enter();

        if (load) {
            /* publish what codepage_table now represents and drop the flag */
            loaded_cp_tid = tid;
            cp_table_ref &= ~CP_LOADING;
        }
    }

    cp_lock_leave();

    /* count is decremented by the loop condition, so inside the body it is
       the number of input bytes remaining after the current one */
    while (count-- && utf8_size > 0) {
        ucschar_t ucs, tmp;

        if (*iso < 128 || cp == UTF_8) /* Already UTF-8 */
        {
            *utf8++ = *iso++;
            --utf8_size;
        }

        else {
            /* tid tells us which table to use and how */
            switch (tid) {
                case CP_TID_ISO: /* Greek */
                                 /* Hebrew */
                                 /* Cyrillic */
                                 /* Thai */
                                 /* Arabic */
                                 /* Turkish */
                                 /* Latin Extended */
                                 /* Central European */
                                 /* Western European */
                    /* the shared table holds 128 entries per codepage,
                       starting with codepage id 1 */
                    tmp = ((cp-1)*128) + (*iso++ - 128);
                    ucs = table[tmp];
                    break;

                case CP_TID_932: /* Japanese */
                    /* bytes 0xA1..0xDF are looked up as single-byte
                       characters */
                    if (*iso > 0xA0 && *iso < 0xE0) {
                        tmp = *iso++ | (0xA100 - 0x8000);
                        ucs = table[tmp];
                        break;
                    }
                    /* fallthrough */
                case CP_TID_936: /* Simplified Chinese */
                case CP_TID_949: /* Korean */
                case CP_TID_950: /* Traditional Chinese */
                    /* no second byte available (end of input or NUL):
                       pass the lone byte through as its own value */
                    if (count < 1 || !iso[1]) {
                        ucs = *iso++;
                        break;
                    }

                    /* we assume all cjk strings are written
                       in big endian order */
                    tmp = *iso++ << 8;
                    tmp |= *iso++;
                    tmp -= 0x8000;
                    ucs = table[tmp];
                    count--; /* the second byte has been consumed as well */
                    break;

                default:
                    /* no table: the byte value is used as the code point */
                    ucs = *iso++;
                    break;
            }

            if (ucs == 0) /* unknown char, use replacement char */
                ucs = 0xfffd;

            utf8 = utf8encode_ex(ucs, utf8, &utf8_size);
        }
    }

    /* release the reference taken on whichever table was used */
    if (table) {
        cp_lock_enter();
        if (table == codepage_table) {
            cp_table_ref--;
        } else {
            default_cp_table_ref--;
        }
        cp_lock_leave();
    }

    return utf8;
}

/* Recode a UTF-16 string to UTF-8.
 *
 * utf16:     input, two bytes per code unit
 * utf8:      output buffer
 * count:     number of 16-bit code units to convert
 * utf8_size: output limit in bytes; conversion stops once it is used up
 * le:        true if the input is little-endian, false for big-endian
 *
 * A high surrogate followed by a low surrogate is combined into a single
 * code point. A surrogate that is not part of such a pair (including a high
 * surrogate in the last code unit) is replaced with U+FFFD, so the input is
 * never read beyond `count` code units.
 *
 * Returns the position just after the last byte written.
 */
unsigned char* utf16decode(const unsigned char *utf16, unsigned char *utf8,
        int count, int utf8_size, bool le)
{
    /* byte positions of the high and low byte inside a code unit */
    const int hi = le ? 1 : 0;
    const int lo = 1 - hi;

    while (count > 0 && utf8_size > 0) {
        unsigned long ucs = ((unsigned long)utf16[hi] << 8) | utf16[lo];
        utf16 += 2;
        count--;

        /* Check for a surrogate pair */
        if (ucs >= 0xD800 && ucs < 0xE000) {
            unsigned long low = 0;

            /* only a high surrogate may start a pair, and only if another
               code unit is left in the input */
            if (ucs < 0xDC00 && count > 0)
                low = ((unsigned long)utf16[hi] << 8) | utf16[lo];

            if (low >= 0xDC00 && low < 0xE000) {
                ucs = 0x10000 + (((ucs - 0xD800) << 10) | (low - 0xDC00));
                utf16 += 2;
                count--;
            } else {
                ucs = 0xFFFD; /* unpaired surrogate */
            }
        }

        utf8 = utf8encode_ex(ucs, utf8, &utf8_size);
    }
    return utf8;
}

/* Recode a UTF-16 string with little-endian byte ordering to UTF-8.
   `count` is the number of 16-bit code units; the output size is not
   limited. */
unsigned char* utf16LEdecode(const unsigned char *utf16, unsigned char *utf8,
        int count)
{
    return utf16decode(utf16, utf8, count, INT_MAX, true);
}

/* Recode a UTF-16 string with big-endian byte ordering to UTF-8.
   `count` is the number of 16-bit code units; the output size is not
   limited. */
unsigned char* utf16BEdecode(const unsigned char *utf16, unsigned char *utf8,
        int count)
{
    return utf16decode(utf16, utf8, count, INT_MAX, false);
}

/* Inspect the first two bytes of a UTF-16 string for a byte order mark.
 *
 * Returns true if a BOM is present and stores the byte order it announces
 * in *le (true = little-endian). Without a BOM the function returns false
 * and stores a guess in *le instead.
 */
bool utf16_has_bom(const unsigned char *utf16, bool *le)
{
    const unsigned long first = utf16[0] << 8 | utf16[1];

    switch (first) {
    case 0xFEFF: /* BOM read in big-endian order */
        *le = false;
        return true;

    case 0xFFFE: /* BOM with its bytes swapped: little-endian */
        *le = true;
        return true;

    default:
        break;
    }

    /* No BOM, so guess: a zero byte is most likely the most significant
       one, hence a zero second byte suggests little-endian. */
    *le = (utf16[1] == 0);
    return false;
}

#if 0 /* currently unused */
/* Recode any UTF-16 string to UTF-8, selecting the byte order from a leading
   BOM and defaulting to big-endian when there is none.
   NOTE(review): this disabled code shares its name with the live
   utf16decode() above but has a different parameter list, so it would have
   to be renamed before it could be enabled. */
unsigned char* utf16decode(const unsigned char *utf16, unsigned char *utf8,
        unsigned int count)
{
    unsigned long ucs;

    ucs = *(utf16++) << 8;
    ucs |= *(utf16++);

    if (ucs == 0xFEFF) /* Check for BOM */
        return utf16BEdecode(utf16, utf8, count-1);
    else if (ucs == 0xFFFE)
        return utf16LEdecode(utf16, utf8, count-1);
    else { /* ADDME: Should default be LE or BE? */
        utf16 -= 2; /* no BOM: the first unit is data, step back onto it */
        return utf16BEdecode(utf16, utf8, count);
    }
}
#endif

/* Return the number of UTF-8 chars in a string, i.e. the number of bytes
   before the terminator that are not continuation bytes */
unsigned long utf8length(const unsigned char *utf8)
{
    unsigned long chars = 0;

    for (const unsigned char *p = utf8; *p != 0; p++) {
        if ((*p & MASK) != COMP)
            chars++;
    }

    return chars;
}

/* Take a utf8 string and return the encoded length in utf16 code units.
   With UNICODE32, characters at or above U+10000 count as two units. */
unsigned long utf16len_utf8(const unsigned char *utf8)
{
    unsigned long units = 0;
    ucschar_t ch;

    for (; *utf8 != 0; units++) {
        utf8 = utf8decode(utf8, &ch);
#ifdef UNICODE32
        if (ch >= 0x10000)
            units++; /* needs a surrogate pair */
#endif
    }

    return units;
}

/* Decode 1 UTF-8 char and return a pointer to the next char.
 *
 * utf8: input positioned at the first byte of a character
 * ucs:  receives the decoded code point, or U+FFFD for invalid input
 *
 * Invalid input yields U+FFFD: a stray continuation byte, the lead bytes
 * 0xC0/0xC1 and 0xF5 and above, a sequence cut short by a non-continuation
 * byte or by the end of the string, and values above the supported range
 * (0x10FFFF with UNICODE32, 0xFFFF otherwise).
 *
 * A byte that ends a sequence early, including the string terminator, is
 * not consumed, so the returned pointer never moves past the terminator of
 * a non-empty string.
 */
const unsigned char* utf8decode(const unsigned char *utf8, ucschar_t *ucs)
{
    unsigned char c = *utf8++;
    unsigned long code;
    int tail = 0;

    if ((c <= 0x7f) || (c >= 0xc2)) {
        /* Start of new character. */
        if (c < 0x80) {        /* U-00000000 - U-0000007F, 1 byte */
            code = c;
        } else if (c < 0xe0) { /* U-00000080 - U-000007FF, 2 bytes */
            tail = 1;
            code = c & 0x1f;
        } else if (c < 0xf0) { /* U-00000800 - U-0000FFFF, 3 bytes */
            tail = 2;
            code = c & 0x0f;
        } else if (c < 0xf5) { /* U-00010000 - U-001FFFFF, 4 bytes */
            tail = 3;
            code = c & 0x07;
        } else {
            /* Invalid size. */
            code = 0xfffd;
        }

        while (tail--) {
            c = *utf8;
            if ((c & 0xc0) != 0x80) {
                /* Terminator or invalid continuation char: leave it in
                   place so that the next call (or the caller's end-of-string
                   check) sees it. */
                code = 0xfffd;
                break;
            }

            /* Valid continuation character. */
            code = (code << 6) | (c & 0x3f);
            utf8++;
        }
    } else {
        /* Invalid UTF-8 char */
        code = 0xfffd;
    }

#ifdef UNICODE32
    if (code > 0x10ffff)
        code = 0xfffd;
#else
    if (code > 0xffff)
        code = 0xfffd;
#endif

    *ucs = code;
    return utf8;
}

/* Make `cp` the default codepage, loading its table first if it uses a
   different table than the current default. Out-of-range values select the
   "unknown" entry at index NUM_CODEPAGES. If the new table cannot be loaded
   the function returns without changing anything. */
void set_codepage(int cp)
{
    if (cp < 0 || cp >= NUM_CODEPAGES)
        cp = NUM_CODEPAGES;

    /* load first then swap if load is successful, else just leave it; if
       handle is 0 then we just free the current one; this won't happen often
       thus we don't worry about reusing it and consequently avoid possible
       clobbering of the existing one */

    int handle = -1; /* -1: nothing loaded yet; 0: no table needed; > 0: new table */
    int tid = cp_info[cp].tid;

    while (1) {
        cp_lock_enter();

        /* the default table already is the wanted one: only the codepage id
           needs updating */
        if (default_cp_tid == tid)
            break;

        /* the new table is ready and nobody uses the old one: swap them;
           afterwards `handle` holds the old table to be freed below */
        if (handle >= 0 && default_cp_table_ref == 0) {
            int hold = default_cp_handle;
            default_cp_handle = handle;
            handle = hold;
            default_cp_tid = tid;
            break;
        }

        /* alloc and load must be done outside the lock */
        cp_lock_leave();

        if (handle < 0 && (handle = alloc_and_load_cp_table(cp, NULL)) < 0)
            return; /* load failed (e.g. OOM); change nothing */

        /* let the current users of the old table finish, then retry */
        yield();
    }

    /* both break paths arrive here with the lock still held */
    default_cp = cp;
    cp_lock_leave();

    if (handle > 0)
        cp_table_free(handle);
}

/* Return the id of the current default codepage */
int get_codepage(void)
{
    return default_cp;
}

/* seek to a given char in a utf8 string and
 * return its start position in the string
 *
 * utf8:   NUL-terminated UTF-8 string
 * offset: number of characters to skip from the start
 *
 * Returns the byte index at which character number `offset` starts. The
 * search stops at the string terminator, so an offset beyond the end yields
 * the index of the terminator; an offset of zero or less yields 0.
 */
int utf8seek(const unsigned char* utf8, int offset)
{
    int pos = 0;

    while (offset-- > 0 && utf8[pos] != 0) {
        /* step over the first byte, then over its continuation bytes */
        pos++;
        while ((utf8[pos] & MASK) == COMP)
            pos++;
    }
    return pos;
}

/* Return the name of codepage `cp`, or the name stored in the fallback entry
   at index NUM_CODEPAGES when `cp` is out of range */
const char * get_codepage_name(int cp)
{
    const int known = (cp >= 0 && cp < NUM_CODEPAGES);

    return cp_info[known ? cp : NUM_CODEPAGES].name;
}

#if 0 /* not needed just now */
/* One-time setup: initializes the codepage lock. Compiled out together with
   the mutex based lock implementation. */
void unicode_init(void)
{
    cp_lock_init();
}
#endif