internals: Support characters beyond the first unicode plane (WIP)

We used 16-bit variables to store the 'character code' everywhere but
this won't let us represent anything beyond U+FFFF.

This patch changes those variables to a custom type that can be 32 or 16
bits depending on the build, and adjusts numerous internal APIs and
data structures to match.  This includes:

 * utf8decode() and friends
 * on-screen keyboard
 * font manipulation, caching, rendering, and generation
 * VFAT code parses and generates utf16 dirents
 * WIN32 simulator reads and writes utf16 filenames

Note that this patch doesn't _enable_ >16-bit Unicode support; a follow-up
patch will turn that on for appropriate targets.

Known bugs:

  * Native players in 32-bit Unicode mode generate mangled filename
    entries when a filename contains characters that require UTF-16
    surrogate pairs.  Root cause is unclear, and may reside in core
    dircache code.

Needs testing on:

 * windows simulator (16bit+32bit)

Change-Id: I193a00fe2a11a4181ddc82df2d71be52bf00b6e6
This commit is contained in:
Solomon Peachy 2024-12-17 08:55:21 -05:00
parent 94712b34d4
commit d05c59f35b
44 changed files with 480 additions and 335 deletions

View file

@ -747,6 +747,8 @@ static bool fatlong_parse_entry(struct fatlong_parse_state *lnparse,
/* so far so good; save entry information */
lnparse->ord = ord;
/* Treat entries as opaque 16-bit values; UTF-16 surrogate decoding
and UTF-8 encoding happen in fatlong_parse_finish() */
uint16_t *ucsp = fatent->ucssegs[ord - 1 + 5];
unsigned int i = longent_char_first();
@ -797,13 +799,23 @@ static bool fatlong_parse_finish(struct fatlong_parse_state *lnparse,
/* ensure the last segment is NULL-terminated if it is filled */
fatent->ucssegs[lnparse->ord_max + 5][0] = 0x0000;
for (uint16_t *ucsp = fatent->ucssegs[5], ucc = *ucsp;
ucc; ucc = *++ucsp)
unsigned long ucc; /* Decoded codepoint */
uint16_t *ucsp, ucs;
for (ucsp = fatent->ucssegs[5], ucs=*ucsp; ucs; ucs = *++ucsp)
{
/* end should be hit before ever seeing padding */
if (ucc == 0xffff)
if (ucs == 0xffff)
return false;
/* Check for a surrogate UTF16 pair */
if (ucs >= 0xd800 && ucs < 0xdc00 &&
*(ucsp+1) >= 0xdc00 && *(ucsp+1) < 0xe000) {
ucc = 0x10000 + ((ucs & 0x3ff) << 10) | (*(ucsp+1) & 0x3ff);
ucsp++;
} else {
ucc = ucs;
}
if ((p = utf8encode(ucc, p)) - name > FAT_DIRENTRY_NAME_MAX)
return false;
}
@ -1608,7 +1620,7 @@ static int write_longname(struct bpb *fat_bpb, struct fat_filestr *parentstr,
/* we need to convert the name first since the entries are written in
reverse order */
unsigned long ucspadlen = ALIGN_UP(ucslen, FATLONG_NAME_CHARS);
uint16_t ucsname[ucspadlen];
ucschar_t ucsname[ucspadlen];
for (unsigned long i = 0; i < ucspadlen; i++)
{
@ -1626,6 +1638,9 @@ static int write_longname(struct bpb *fat_bpb, struct fat_filestr *parentstr,
const unsigned int firstentry = file->e.entry - longentries;
/* longname entries */
#ifdef UNICODE32
long carried_val = -1;
#endif
for (unsigned int i = 0; i < longentries; i++)
{
ent = cache_direntry(fat_bpb, parentstr, firstentry + i);
@ -1651,11 +1666,38 @@ static int write_longname(struct bpb *fat_bpb, struct fat_filestr *parentstr,
ent->ldir_chksum = chksum;
/* set name */
uint16_t *ucsptr = &ucsname[(ord - 1) * FATLONG_NAME_CHARS];
ucschar_t *ucsptr = &ucsname[(ord - 1) * FATLONG_NAME_CHARS];
for (unsigned j = longent_char_first(); j; j = longent_char_next(j))
{
uint16_t ucs = *ucsptr++;
INT162BYTES(ent->data, j, ucs);
#ifdef UNICODE32
if (carried_val >= 0) {
INT162BYTES(ent->data, j, carried_val);
carried_val = -1;
continue;
}
#endif
ucschar_t ucs = *ucsptr++;
#ifdef UNICODE32
if (ucs >= 0x10000) {
ucs-=0x10000;
uint16_t v = 0xdc00 | (ucs & 0x3ff);
unsigned oldj = j;
INT162BYTES(ent->data, j, v);
j = longent_char_next(j);
v = 0xd800 | ((ucs >> 10) & 0x3ff);
if (j) {
INT162BYTES(ent->data, j, v);
} else if ((i + 1) < longentries) {
/* Carry the other end of the surrogate pair to the next block */
carried_val = v;
} else {
/* No more blocks, so re-write the first entry of the pair */
v = 0xfffd;
INT162BYTES(ent->data, oldj, v);
}
} else
#endif
INT162BYTES(ent->data, j, ucs);
}
dc_dirty_buf(ent);
@ -1744,9 +1786,12 @@ static int add_dir_entry(struct bpb *fat_bpb, struct fat_filestr *parentstr,
create_dos_name(basisname, name, &n);
randomize_dos_name(shortname, basisname, &n);
/* one dir entry needed for every 13 characters of filename,
plus one entry for the short name */
ucslen = utf8length(name);
/* one dir entry needed for every 13 UTF-16 "code units"
of filename, plus one entry for the short name.
Keep in mind that a single character of the UTF-8 name
can take 1 or 2 UTF-16 code units.
*/
ucslen = utf16len_utf8(name);
if (ucslen > 255)
FAT_ERROR(-2); /* name is too long */

View file

@ -385,7 +385,7 @@ static void LCDFN(mono_bmp_part_helper)(const unsigned char *src, int src_x,
/* put a string at a given pixel position, skipping first ofs pixel columns */
static void LCDFN(putsxyofs)(int x, int y, int ofs, const unsigned char *str)
{
unsigned short *ucs;
ucschar_t *ucs;
struct viewport *vp = LCDFN(current_viewport);
font_lock(vp->font, true);
struct font* pf = font_get(vp->font);
@ -429,7 +429,7 @@ static void LCDFN(putsxyofs)(int x, int y, int ofs, const unsigned char *str)
bool is_rtl, is_diac;
const unsigned char *bits;
int width, base_width, base_ofs = 0;
const unsigned short next_ch = ucs[1];
const ucschar_t next_ch = ucs[1];
if (x >= vp->width)
break;
@ -447,7 +447,7 @@ static void LCDFN(putsxyofs)(int x, int y, int ofs, const unsigned char *str)
{
if (!rtl_next_non_diac_width)
{
const unsigned short *u;
const ucschar_t *u;
/* Jump to next non-diacritic char, and calc its width */
for (u = &ucs[1]; *u && IS_DIACRITIC(*u); u++);
@ -529,7 +529,7 @@ static void LCDFN(putsxyofs)(int x, int y, int ofs, const unsigned char *str)
/* put a string at a given pixel position, skipping first ofs pixel columns */
static void LCDFN(putsxyofs)(int x, int y, int ofs, const unsigned char *str)
{
unsigned short *ucs;
ucschar_t *ucs;
struct viewport *vp = LCDFN(current_viewport);
struct font* pf = font_get(vp->font);
const unsigned char *bits;
@ -567,7 +567,7 @@ static void LCDFN(putsxyofs)(int x, int y, int ofs, const unsigned char *str)
/* allow utf but no diacritics or rtl lang */
for (ucs = bidi_l2v(str, 1); *ucs; ucs++)
{
const unsigned short next_ch = ucs[1];
const ucschar_t next_ch = ucs[1];
if (x >= vp->width)
break;