forked from len0rd/rockbox
unicode: Unify check for UTF-16 BOM
Adds the utf16_has_bom function. Change-Id: I67ea474c9cf6ca6e6684351c2f54131164b7903c
This commit is contained in:
parent
11fbbc7826
commit
a23002cd5e
4 changed files with 43 additions and 33 deletions
|
@ -231,15 +231,14 @@ bool parse_cuesheet(struct cuesheet_file *cue_file, struct cuesheet *cue)
|
||||||
char_enc = CHAR_ENC_UTF_8;
|
char_enc = CHAR_ENC_UTF_8;
|
||||||
bom_read = BOM_UTF_8_SIZE;
|
bom_read = BOM_UTF_8_SIZE;
|
||||||
}
|
}
|
||||||
else if(!memcmp(line, BOM_UTF_16_LE, BOM_UTF_16_SIZE))
|
else
|
||||||
{
|
{
|
||||||
char_enc = CHAR_ENC_UTF_16_LE;
|
bool le;
|
||||||
bom_read = BOM_UTF_16_SIZE;
|
if (utf16_has_bom(line, &le))
|
||||||
}
|
{
|
||||||
else if(!memcmp(line, BOM_UTF_16_BE, BOM_UTF_16_SIZE))
|
char_enc = le ? CHAR_ENC_UTF_16_LE : CHAR_ENC_UTF_16_BE;
|
||||||
{
|
bom_read = BOM_UTF_16_SIZE;
|
||||||
char_enc = CHAR_ENC_UTF_16_BE;
|
}
|
||||||
bom_read = BOM_UTF_16_SIZE;
|
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
|
@ -437,6 +437,28 @@ unsigned char* utf16BEdecode(const unsigned char *utf16, unsigned char *utf8,
|
||||||
return utf8;
|
return utf8;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/* Look for a UTF-16 byte order mark in the first two bytes of utf16.
 *
 * utf16: buffer to inspect; its first two bytes are always read.
 * le:    always written. With a BOM present it reports the byte order the
 *        BOM announces (true = little endian, false = big endian). Without
 *        a BOM it holds a guess: little endian when the second byte is
 *        0x00, big endian otherwise.
 *
 * Returns true when the buffer starts with a BOM, false when it does not.
 */
bool utf16_has_bom(const unsigned char *utf16, bool *le)
{
    const unsigned char first = utf16[0];
    const unsigned char second = utf16[1];
    bool bom_found = true;

    if (first == 0xFE && second == 0xFF)
    {
        /* U+FEFF stored most significant byte first: big endian */
        *le = false;
    }
    else if (first == 0xFF && second == 0xFE)
    {
        /* U+FEFF stored least significant byte first: little endian */
        *le = true;
    }
    else
    {
        /* No BOM, so guess. If one of the two bytes is 0x00 it is probably
           the most significant one; a zero in second place therefore
           suggests little endian data. */
        *le = (second == 0);
        bom_found = false;
    }

    return bom_found;
}
|
||||||
|
|
||||||
#if 0 /* currently unused */
|
#if 0 /* currently unused */
|
||||||
/* Recode any UTF-16 string to UTF-8 */
|
/* Recode any UTF-16 string to UTF-8 */
|
||||||
unsigned char* utf16decode(const unsigned char *utf16, unsigned char *utf8,
|
unsigned char* utf16decode(const unsigned char *utf16, unsigned char *utf8,
|
||||||
|
|
|
@ -29,6 +29,7 @@
|
||||||
#define _RBUNICODE_H_
|
#define _RBUNICODE_H_
|
||||||
|
|
||||||
#include "config.h"
|
#include "config.h"
|
||||||
|
#include <stdbool.h>
|
||||||
|
|
||||||
#define MASK 0xC0 /* 11000000 */
|
#define MASK 0xC0 /* 11000000 */
|
||||||
#define COMP 0x80 /* 10x */
|
#define COMP 0x80 /* 10x */
|
||||||
|
@ -58,6 +59,7 @@ unsigned char* utf8encode(unsigned long ucs, unsigned char *utf8);
|
||||||
unsigned char* iso_decode(const unsigned char *latin1, unsigned char *utf8, int cp, int count);
|
unsigned char* iso_decode(const unsigned char *latin1, unsigned char *utf8, int cp, int count);
|
||||||
unsigned char* utf16LEdecode(const unsigned char *utf16, unsigned char *utf8, int count);
|
unsigned char* utf16LEdecode(const unsigned char *utf16, unsigned char *utf8, int count);
|
||||||
unsigned char* utf16BEdecode(const unsigned char *utf16, unsigned char *utf8, int count);
|
unsigned char* utf16BEdecode(const unsigned char *utf16, unsigned char *utf8, int count);
|
||||||
|
bool utf16_has_bom(const unsigned char *utf16, bool *le);
|
||||||
unsigned long utf8length(const unsigned char *utf8);
|
unsigned long utf8length(const unsigned char *utf8);
|
||||||
const unsigned char* utf8decode(const unsigned char *utf8, unsigned short *ucs);
|
const unsigned char* utf8decode(const unsigned char *utf8, unsigned short *ucs);
|
||||||
void set_codepage(int cp);
|
void set_codepage(int cp);
|
||||||
|
|
|
@ -570,8 +570,6 @@ static bool parse_as_utf8(char* string, int *len)
|
||||||
string. If it is, we convert it to a UTF-8 string. If it's not unicode,
|
string. If it is, we convert it to a UTF-8 string. If it's not unicode,
|
||||||
we convert from the default codepage */
|
we convert from the default codepage */
|
||||||
static void unicode_munge(char* string, char* utf8buf, int *len) {
|
static void unicode_munge(char* string, char* utf8buf, int *len) {
|
||||||
long tmp;
|
|
||||||
bool le = false;
|
|
||||||
int i = 0;
|
int i = 0;
|
||||||
unsigned char *str = (unsigned char *)string;
|
unsigned char *str = (unsigned char *)string;
|
||||||
int templen = 0;
|
int templen = 0;
|
||||||
|
@ -590,28 +588,17 @@ static void unicode_munge(char* string, char* utf8buf, int *len) {
|
||||||
case 0x02:
|
case 0x02:
|
||||||
(*len)--;
|
(*len)--;
|
||||||
str++;
|
str++;
|
||||||
|
bool le;
|
||||||
|
|
||||||
|
|
||||||
/* Handle frames with more than one string
|
/* Handle frames with more than one string
|
||||||
(needed for TXXX frames).*/
|
(needed for TXXX frames).*/
|
||||||
do {
|
do {
|
||||||
tmp = bytes2int(0, 0, str[0], str[1]);
|
if (utf16_has_bom(str, &le))
|
||||||
|
{
|
||||||
/* Now check if there is a BOM
|
str += BOM_UTF_16_SIZE;
|
||||||
(zero-width non-breaking space, 0xfeff)
|
*len -= BOM_UTF_16_SIZE;
|
||||||
and if it is in little or big endian format */
|
}
|
||||||
if(tmp == 0xfffe) { /* Little endian? */
|
|
||||||
le = true;
|
|
||||||
str += 2;
|
|
||||||
(*len)-=2;
|
|
||||||
} else if(tmp == 0xfeff) { /* Big endian? */
|
|
||||||
str += 2;
|
|
||||||
(*len)-=2;
|
|
||||||
} else
|
|
||||||
/* If there is no BOM (which is a specification violation),
|
|
||||||
let's try to guess it. If one of the bytes is 0x00, it is
|
|
||||||
probably the most significant one. */
|
|
||||||
if(str[1] == 0)
|
|
||||||
le = true;
|
|
||||||
|
|
||||||
while ((i < *len) && (str[0] || str[1])) {
|
while ((i < *len) && (str[0] || str[1])) {
|
||||||
if(le)
|
if(le)
|
||||||
|
@ -734,17 +721,17 @@ static bool is_cuesheet(char *tag, unsigned char *char_enc, unsigned char *cuesh
|
||||||
switch (*(tag++))
|
switch (*(tag++))
|
||||||
{
|
{
|
||||||
case 0x01:
|
case 0x01:
|
||||||
if (!memcmp(tag, BOM_UTF_16_BE, BOM_UTF_16_SIZE))
|
{
|
||||||
*char_enc = CHAR_ENC_UTF_16_BE;
|
bool le;
|
||||||
else if (!memcmp(tag, BOM_UTF_16_LE, BOM_UTF_16_SIZE))
|
if (!utf16_has_bom(tag, &le))
|
||||||
*char_enc = CHAR_ENC_UTF_16_LE;
|
|
||||||
else
|
|
||||||
return false;
|
return false;
|
||||||
|
|
||||||
|
*char_enc = le ? CHAR_ENC_UTF_16_LE: CHAR_ENC_UTF_16_BE;
|
||||||
tag+= BOM_UTF_16_SIZE;
|
tag+= BOM_UTF_16_SIZE;
|
||||||
/* \1 + BOM(2) + C0U0E0S0H0E0E0T000 = 21 */
|
/* \1 + BOM(2) + C0U0E0S0H0E0E0T000 = 21 */
|
||||||
*cuesheet_offset = 21;
|
*cuesheet_offset = 21;
|
||||||
break;
|
break;
|
||||||
|
}
|
||||||
|
|
||||||
case 0x02:
|
case 0x02:
|
||||||
*char_enc = CHAR_ENC_UTF_16_BE;
|
*char_enc = CHAR_ENC_UTF_16_BE;
|
||||||
|
|
Loading…
Add table
Add a link
Reference in a new issue