mirror of
https://github.com/Rockbox/rockbox.git
synced 2025-11-09 13:12:37 -05:00
internals: Support characters beyond the first unicode plane (WIP)
We used 16-bit variables to store the 'character code' everywhere but
this won't let us represent anything beyond U+FFFF.
This patch changes those variables to a custom type that can be 32 or 16
bits depending on the build, and adjusts numerous internal APIs and
data structures to match. This includes:
* utf8decode() and friends
* on-screen keyboard
* font manipulation, caching, rendering, and generation
* VFAT code parses and generates utf16 dirents
* WIN32 simulator reads and writes utf16 filenames
Note that this patch doesn't _enable_ >16-bit Unicode support; a follow-up
patch will turn that on for appropriate targets.
Known bugs:
* Native players in 32-bit Unicode mode generate mangled filename
entries if the filenames include UTF-16 surrogate code points. The root
cause is unclear, and may reside in the core dircache code.
Needs testing on:
* windows simulator (16bit+32bit)
Change-Id: I193a00fe2a11a4181ddc82df2d71be52bf00b6e6
This commit is contained in:
parent
94712b34d4
commit
d05c59f35b
44 changed files with 480 additions and 335 deletions
|
|
@ -28,8 +28,8 @@
|
|||
#include "system.h"
|
||||
|
||||
#define DIAC_NUM_RANGES (ARRAYLEN(diac_ranges))
|
||||
#define DIAC_RTL (1 << 7)
|
||||
#define DIAC_CNT (0xFF ^ DIAC_RTL)
|
||||
#define DIAC_RTL (1 << 15)
|
||||
#define DIAC_CNT (0xFFFF ^ DIAC_RTL)
|
||||
|
||||
/* Each diac_range_ struct defines a Unicode range that begins with
|
||||
* N diacritic characters, and continues with non-diacritic characters up to the
|
||||
|
|
@ -39,8 +39,8 @@
|
|||
|
||||
struct diac_range
|
||||
{
|
||||
uint16_t base;
|
||||
uint8_t info; /* [RTL:1 CNT:7] */
|
||||
uint16_t base; /* Not ucschar_t until we need >16b */
|
||||
uint16_t info; /* [RTL:1 CNT:15] */
|
||||
};
|
||||
|
||||
#define DIAC_RANGE_ENTRY(first_diac, first_non_diac, is_rtl) \
|
||||
|
|
@ -51,7 +51,7 @@ struct diac_range
|
|||
static const struct diac_range diac_ranges[] =
|
||||
{
|
||||
DIAC_RANGE_ENTRY(0x0000, 0x0000, 0),
|
||||
DIAC_RANGE_ENTRY(FIRST_DIACRITIC, 0x0370, 0),
|
||||
DIAC_RANGE_ENTRY(FIRST_DIACRITIC, 0x0370, 0), /* v1 - v4.1 */
|
||||
DIAC_RANGE_ENTRY(0x0483, 0x048a, 0),
|
||||
DIAC_RANGE_ENTRY(0x0591, 0x05be, 1),
|
||||
DIAC_RANGE_ENTRY(0x05bf, 0x05c0, 1),
|
||||
|
|
@ -146,6 +146,7 @@ static const struct diac_range diac_ranges[] =
|
|||
DIAC_RANGE_ENTRY(0x19c8, 0x19ca, 0),
|
||||
DIAC_RANGE_ENTRY(0x1a17, 0x1a1c, 0),
|
||||
DIAC_RANGE_ENTRY(0x1a55, 0x1a80, 0),
|
||||
DIAC_RANGE_ENTRY(0x1ab0, 0x1b00, 0), /* v7.0 */
|
||||
DIAC_RANGE_ENTRY(0x1b00, 0x1b05, 0),
|
||||
DIAC_RANGE_ENTRY(0x1b34, 0x1b45, 0),
|
||||
DIAC_RANGE_ENTRY(0x1b6b, 0x1b74, 0),
|
||||
|
|
@ -156,10 +157,10 @@ static const struct diac_range diac_ranges[] =
|
|||
DIAC_RANGE_ENTRY(0x1cd4, 0x1ce9, 0),
|
||||
DIAC_RANGE_ENTRY(0x1ced, 0x1cee, 0),
|
||||
DIAC_RANGE_ENTRY(0x1cf2, 0x1cf3, 0),
|
||||
DIAC_RANGE_ENTRY(0x1dc0, 0x1e00, 0),
|
||||
DIAC_RANGE_ENTRY(0x20d0, 0x20f1, 0),
|
||||
DIAC_RANGE_ENTRY(0x1dc0, 0x1e00, 0), /* v4.1 - v5.2 */
|
||||
DIAC_RANGE_ENTRY(0x20d0, 0x2100, 0), /* v1.0 - v5.1 */
|
||||
DIAC_RANGE_ENTRY(0x2cef, 0x2cf2, 0),
|
||||
DIAC_RANGE_ENTRY(0x2de0, 0x2e00, 0),
|
||||
DIAC_RANGE_ENTRY(0x2de0, 0x2e00, 0), /* v5.1 */
|
||||
DIAC_RANGE_ENTRY(0x302a, 0x3030, 0),
|
||||
DIAC_RANGE_ENTRY(0x3099, 0x309b, 0),
|
||||
DIAC_RANGE_ENTRY(0xa66f, 0xa673, 0),
|
||||
|
|
@ -188,7 +189,7 @@ static const struct diac_range diac_ranges[] =
|
|||
DIAC_RANGE_ENTRY(0xabe3, 0xabeb, 0),
|
||||
DIAC_RANGE_ENTRY(0xabec, 0xabee, 0),
|
||||
DIAC_RANGE_ENTRY(0xfb1e, 0xfb1f, 0),
|
||||
DIAC_RANGE_ENTRY(0xfe20, 0xfe27, 0),
|
||||
DIAC_RANGE_ENTRY(0xfe20, 0xfe30, 0), /* v1.0 - v8.0 */
|
||||
DIAC_RANGE_ENTRY(0xfe70, 0xfe70, 1),
|
||||
DIAC_RANGE_ENTRY(0xff00, 0xff00, 0),
|
||||
DIAC_RANGE_ENTRY(0xffff, 0xffff, 0),
|
||||
|
|
@ -196,7 +197,7 @@ static const struct diac_range diac_ranges[] =
|
|||
|
||||
#define MRU_MAX_LEN 32
|
||||
|
||||
bool is_diacritic(const unsigned short char_code, bool *is_rtl)
|
||||
bool is_diacritic(const ucschar_t char_code, bool *is_rtl)
|
||||
{
|
||||
static uint8_t mru_len = 0;
|
||||
static uint8_t diacritic_mru[MRU_MAX_LEN];
|
||||
|
|
@ -209,7 +210,6 @@ bool is_diacritic(const unsigned short char_code, bool *is_rtl)
|
|||
/* Search in MRU */
|
||||
for (mru = 0, i = 0; mru < mru_len; mru++)
|
||||
{
|
||||
|
||||
/* Items shifted >> 1 */
|
||||
itmp = i;
|
||||
i = diacritic_mru[mru];
|
||||
|
|
@ -250,10 +250,10 @@ Found:
|
|||
if (is_rtl)
|
||||
*is_rtl = ((DIAC_RTL & info) == DIAC_RTL);
|
||||
|
||||
return (char_code < diac->base + (info & DIAC_CNT));
|
||||
return (char_code < (diac->base + (info & DIAC_CNT)));
|
||||
}
|
||||
#else /*BOOTLOADER*/
|
||||
inline bool is_diacritic(const unsigned short char_code, bool *is_rtl)
|
||||
inline bool is_diacritic(const ucschar_t char_code, bool *is_rtl)
|
||||
{
|
||||
(void)char_code;
|
||||
if (is_rtl)
|
||||
|
|
|
|||
|
|
@ -127,7 +127,7 @@ static int volatile cp_table_ref = 0;
|
|||
|
||||
/* non-default codepage table buffer (cannot be bufalloced! playback itself
|
||||
may be making the load request) */
|
||||
static unsigned short codepage_table[MAX_CP_TABLE_SIZE+1];
|
||||
static unsigned short codepage_table[MAX_CP_TABLE_SIZE+1]; // XXX convert to ucschar_t if we ever need > 16bit mappings?
|
||||
|
||||
#if defined(APPLICATION) && defined(__linux__)
|
||||
static const char * const name_codepages_linux[NUM_CODEPAGES+1] =
|
||||
|
|
@ -344,7 +344,7 @@ unsigned char* iso_decode_ex(const unsigned char *iso, unsigned char *utf8, int
|
|||
cp_lock_leave();
|
||||
|
||||
while (count-- && utf8_size > 0) {
|
||||
unsigned short ucs, tmp;
|
||||
ucschar_t ucs, tmp;
|
||||
|
||||
if (*iso < 128 || cp == UTF_8) /* Already UTF-8 */
|
||||
{
|
||||
|
|
@ -420,10 +420,6 @@ unsigned char* iso_decode_ex(const unsigned char *iso, unsigned char *utf8, int
|
|||
unsigned char* utf16decode(const unsigned char *utf16, unsigned char *utf8,
|
||||
int count, int utf8_size, bool le)
|
||||
{
|
||||
// little-endian flag is used as significant byte index
|
||||
if (le)
|
||||
le = 1;
|
||||
|
||||
unsigned long ucs;
|
||||
|
||||
while (count > 0 && utf8_size > 0) {
|
||||
|
|
@ -511,8 +507,25 @@ unsigned long utf8length(const unsigned char *utf8)
|
|||
return l;
|
||||
}
|
||||
|
||||
/* Take a utf8 string and return the encoded length in utf16 code units */
|
||||
unsigned long utf16len_utf8(const unsigned char *utf8)
|
||||
{
|
||||
ucschar_t cp;
|
||||
unsigned long length = 0;
|
||||
while (*utf8) {
|
||||
utf8 = utf8decode(utf8, &cp);
|
||||
#ifdef UNICODE32
|
||||
if (cp > 0x10000)
|
||||
length++;
|
||||
#endif
|
||||
length++;
|
||||
}
|
||||
|
||||
return length;
|
||||
}
|
||||
|
||||
/* Decode 1 UTF-8 char and return a pointer to the next char. */
|
||||
const unsigned char* utf8decode(const unsigned char *utf8, unsigned short *ucs)
|
||||
const unsigned char* utf8decode(const unsigned char *utf8, ucschar_t *ucs)
|
||||
{
|
||||
unsigned char c = *utf8++;
|
||||
unsigned long code;
|
||||
|
|
@ -552,8 +565,13 @@ const unsigned char* utf8decode(const unsigned char *utf8, unsigned short *ucs)
|
|||
/* Invalid UTF-8 char */
|
||||
code = 0xfffd;
|
||||
}
|
||||
/* currently we don't support chars above U-FFFF */
|
||||
*ucs = (code < 0x10000) ? code : 0xfffd;
|
||||
|
||||
#ifndef UNICODE32
|
||||
if (code > 0xffff)
|
||||
code = 0xfffd;
|
||||
#endif
|
||||
|
||||
*ucs = code;
|
||||
return utf8;
|
||||
}
|
||||
|
||||
|
|
|
|||
Loading…
Add table
Add a link
Reference in a new issue