1
0
Fork 0
forked from len0rd/rockbox

internals: Support characters beyond the first unicode plane (WIP)

We used 16-bit variables to store the 'character code' everywhere, but
that cannot represent anything beyond U+FFFF.

This patch changes those variables to a custom type that can be 32 or 16
bits depending on the build, and adjusts numerous internal APIs and
data structures to match.  This includes:

 * utf8decode() and friends
 * on-screen keyboard
 * font manipulation, caching, rendering, and generation
 * VFAT code parses and generates utf16 dirents
 * WIN32 simulator reads and writes utf16 filenames

Note that this patch doesn't _enable_ >16-bit Unicode support; a follow-up
patch will turn that on for appropriate targets.

Known bugs:

  * Native players in 32-bit Unicode mode generate mangled filename
    entries if the names include UTF-16 surrogate code points.  Root cause
    is unclear, and may reside in core dircache code.

Needs testing on:

 * windows simulator (16bit+32bit)

Change-Id: I193a00fe2a11a4181ddc82df2d71be52bf00b6e6
This commit is contained in:
Solomon Peachy 2024-12-17 08:55:21 -05:00
parent 94712b34d4
commit d05c59f35b
44 changed files with 480 additions and 335 deletions

View file

@ -41,7 +41,7 @@ static unsigned text_type = TV_TEXT_UNKNOWN;
static const unsigned char *end_ptr;
static unsigned short ucsbuf[TV_MAX_BLOCKS][TV_MAX_CHARS_PER_BLOCK];
static ucschar_t ucsbuf[TV_MAX_BLOCKS][TV_MAX_CHARS_PER_BLOCK];
static unsigned char utf8buf[TV_MAX_CHARS_PER_BLOCK * (2 * 3)];
static unsigned char *outbuf;
@ -54,11 +54,11 @@ static bool expand_extra_line = false;
/* when a line is divided, this value sets true. */
static bool is_break_line = false;
static unsigned short break_chars[] =
static unsigned short break_chars[] = // XXX promote to ucschar_t if we get a codepoint > 0xffff
{
0,
/* halfwidth characters */
'\t', '\n', 0x0b, 0x0c, ' ', '!', ',', '-', '.', ':', ';', '?', 0xb7,
'\t', '\n', 0x0b, 0x0c, ' ', '!', ',', '-', '.', ':', ';', '?', 0xb7,
/* fullwidth characters */
0x2010, /* hyphen */
0x3000, /* fullwidth space */
@ -76,7 +76,7 @@ static unsigned short break_chars[] =
};
/* Characters treated as whitespace in addition to those isspace() accepts;
 * tv_isspace() scans this table. 0x3000 is the fullwidth space that also
 * appears in break_chars.
 *
 * NOTE(review): the element type here is still unsigned short, but
 * tv_isspace() derives the element count from
 * sizeof(extra_spaces)/sizeof(ucschar_t). If ucschar_t is wider than
 * unsigned short, that count comes out too small and the 0x3000 entry is
 * never compared. break_chars / tv_is_line_break_char() show the same
 * mismatch. Confirm, and prefer sizeof(array)/sizeof(array[0]). */
static unsigned short extra_spaces[] = { 0, 0x3000 };
static unsigned short extra_spaces[] = { 0, 0x3000 }; // XXX promote to ucschar_t if we get a codepoint > 0xffff
static int tv_glyph_width(int ch)
{
@ -93,7 +93,7 @@ static int tv_glyph_width(int ch)
return rb->font_get_width(rb->font_get(preferences->font_id), ch);
}
static unsigned char *tv_get_ucs(const unsigned char *str, unsigned short *ch)
static unsigned char *tv_get_ucs(const unsigned char *str, ucschar_t *ch)
{
int count = 1;
unsigned char utf8_tmp[3];
@ -148,7 +148,7 @@ static unsigned char *tv_get_ucs(const unsigned char *str, unsigned short *ch)
return (unsigned char *)str + count;
}
static void tv_decode2utf8(const unsigned short *ucs, int count)
static void tv_decode2utf8(const ucschar_t *ucs, int count)
{
int i;
@ -158,7 +158,7 @@ static void tv_decode2utf8(const unsigned short *ucs, int count)
*outbuf = '\0';
}
static bool tv_is_line_break_char(unsigned short ch)
static bool tv_is_line_break_char(ucschar_t ch)
{
size_t i;
@ -166,7 +166,7 @@ static bool tv_is_line_break_char(unsigned short ch)
if (preferences->word_mode == WM_CHOP)
return false;
for (i = 0; i < sizeof(break_chars)/sizeof(unsigned short); i++)
for (i = 0; i < sizeof(break_chars)/sizeof(ucschar_t); i++)
{
if (break_chars[i] == ch)
return true;
@ -174,14 +174,14 @@ static bool tv_is_line_break_char(unsigned short ch)
return false;
}
static bool tv_isspace(unsigned short ch)
static bool tv_isspace(ucschar_t ch)
{
size_t i;
if (ch < 128 && isspace(ch))
return true;
for (i = 0; i < sizeof(extra_spaces)/sizeof(unsigned short); i++)
for (i = 0; i < sizeof(extra_spaces)/sizeof(ucschar_t); i++)
{
if (extra_spaces[i] == ch)
return true;
@ -191,17 +191,17 @@ static bool tv_isspace(unsigned short ch)
/* Decode the first character found at next_str with tv_get_ucs() and return
 * whether tv_isspace() classifies it as whitespace.
 *
 * next_str: bytes to decode; only the first decoded character is examined
 *           and the pointer returned by tv_get_ucs() is discarded.
 *
 * NOTE(review): the two declarations of `ch` below appear to be the
 * removed/added pair from the patch (unsigned short -> ucschar_t), not two
 * live declarations - confirm against the applied file. */
static bool tv_is_break_line_join_mode(const unsigned char *next_str)
{
unsigned short ch;
ucschar_t ch;
tv_get_ucs(next_str, &ch);
return tv_isspace(ch);
}
static int tv_form_reflow_line(unsigned short *ucs, int chars)
static int tv_form_reflow_line(ucschar_t *ucs, int chars)
{
unsigned short new_ucs[TV_MAX_CHARS_PER_BLOCK];
unsigned short *p = new_ucs;
unsigned short ch;
ucschar_t new_ucs[TV_MAX_CHARS_PER_BLOCK];
ucschar_t *p = new_ucs;
ucschar_t ch;
int i;
int k;
int expand_spaces;
@ -262,15 +262,15 @@ static int tv_form_reflow_line(unsigned short *ucs, int chars)
}
}
rb->memcpy(ucs, new_ucs, sizeof(unsigned short) * TV_MAX_CHARS_PER_BLOCK);
rb->memcpy(ucs, new_ucs, sizeof(ucschar_t) * TV_MAX_CHARS_PER_BLOCK);
return indent_chars + nonspace_chars + expand_spaces;
}
static void tv_align_right(int *block_chars)
{
unsigned short *cur_text;
unsigned short *prev_text;
unsigned short ch;
ucschar_t *cur_text;
ucschar_t *prev_text;
ucschar_t ch;
int cur_block = block_count - 1;
int prev_block;
int cur_chars;
@ -335,9 +335,9 @@ static void tv_align_right(int *block_chars)
if (break_pos < prev_chars)
{
rb->memmove(cur_text + prev_chars - break_pos,
cur_text, block_chars[cur_block] * sizeof(unsigned short));
cur_text, block_chars[cur_block] * sizeof(ucschar_t));
rb->memcpy(cur_text, prev_text + break_pos,
(prev_chars - break_pos) * sizeof(unsigned short));
(prev_chars - break_pos) * sizeof(ucschar_t));
block_chars[prev_block] = break_pos;
block_chars[cur_block ] += prev_chars - break_pos;
@ -347,15 +347,15 @@ static void tv_align_right(int *block_chars)
}
}
static int tv_parse_text(const unsigned char *src, unsigned short *ucs,
static int tv_parse_text(const unsigned char *src, ucschar_t *ucs,
int *ucs_chars, bool is_indent)
{
const unsigned char *cur = src;
const unsigned char *next = src;
const unsigned char *line_break_ptr = NULL;
const unsigned char *line_end_ptr = NULL;
unsigned short ch = 0;
unsigned short prev_ch;
ucschar_t ch = 0;
ucschar_t prev_ch;
int chars = 0;
int gw;
int line_break_width = 0;
@ -480,7 +480,7 @@ static int tv_parse_text(const unsigned char *src, unsigned short *ucs,
int tv_create_formed_text(const unsigned char *src, ssize_t bufsize,
int block, bool is_multi, const unsigned char **dst)
{
unsigned short ch;
ucschar_t ch;
int chars[block_count];
int i;
int size = 0;