From 6bbfd480d0d173887305db527b641f832b6c4310 Mon Sep 17 00:00:00 2001 From: Darren Salt Date: Fri, 14 Dec 2007 21:44:43 +0000 Subject: Convert XML to UTF-8 for parsing if a BOM is detected. --- src/xine-utils/xmllexer.c | 62 +++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 62 insertions(+) (limited to 'src') diff --git a/src/xine-utils/xmllexer.c b/src/xine-utils/xmllexer.c index 8879f7d0c..75362e10d 100644 --- a/src/xine-utils/xmllexer.c +++ b/src/xine-utils/xmllexer.c @@ -40,6 +40,8 @@ #include #endif +#include "bswap.h" + /* private constants*/ #define NORMAL 0 /* normal lex mode */ #define DATA 1 /* data lex mode */ @@ -50,10 +52,70 @@ static int lexbuf_size = 0; static int lexbuf_pos = 0; static int lex_mode = NORMAL; static int in_comment = 0; +static char *lex_malloc = NULL; + +enum utf { UTF32BE, UTF32LE, UTF16BE, UTF16LE }; + +static void lex_convert (const char * buf, int size, enum utf utf) +{ + char *utf8 = malloc (size * (utf >= UTF16BE ? 3 : 6) + 1); + char *bp = utf8; + while (size > 0) + { + uint32_t c = 0; + switch (utf) + { + case UTF32BE: c = _X_BE_32 (buf); buf += 4; break; + case UTF32LE: c = _X_LE_32 (buf); buf += 4; break; + case UTF16BE: c = _X_BE_16 (buf); buf += 2; break; + case UTF16LE: c = _X_LE_16 (buf); buf += 2; break; + } + if (!c) + break; /* embed a NUL, get a truncated string */ + if (c < 128) + *bp++ = c; + else + { + int count = (c >= 0x04000000) ? 5 : + (c >= 0x00200000) ? 4 : + (c >= 0x00010000) ? 3 : + (c >= 0x00000800) ? 2 : 1; + *bp = (char)(0x1F80 >> count); + count *= 6; + *bp++ |= c >> count; + while ((count -= 6) >= 0) + *bp++ = 128 | ((c >> count) & 0x3F); + } + } + *bp = 0; + lexbuf_size = bp - utf8; + lexbuf = lex_malloc = realloc (utf8, lexbuf_size + 1); +} void lexer_init(const char * buf, int size) { + static const char boms[] = { 0xFF, 0xFE, 0, 0, 0xFE, 0xFF }, + bom_utf8[] = { 0xEF, 0xBB, 0xBF }; + + free (lex_malloc); + lex_malloc = NULL; + lexbuf = buf; lexbuf_size = size; + + if (size >= 4 && !memcmp (buf, boms + 2, 4)) + lex_convert (buf + 4, size - 4, UTF32BE); + else if (size >= 4 && !memcmp (buf, boms, 4)) + lex_convert (buf + 4, size - 4, UTF32LE); + else if (size >= 3 && !memcmp (buf, bom_utf8, 4)) + { + lexbuf += 3; + lexbuf_size -= 3; + } + else if (size >= 2 && !memcmp (buf, boms + 4, 2)) + lex_convert (buf + 2, size - 2, UTF16BE); + else if (size >= 2 && !memcmp (buf, boms, 2)) + lex_convert (buf + 2, size - 2, UTF16LE); + lexbuf_pos = 0; lex_mode = NORMAL; in_comment = 0; -- cgit v1.2.3