diff options
author | Darren Salt <linux@youmustbejoking.demon.co.uk> | 2007-12-14 21:44:43 +0000 |
---|---|---|
committer | Darren Salt <linux@youmustbejoking.demon.co.uk> | 2007-12-14 21:44:43 +0000 |
commit | 6bbfd480d0d173887305db527b641f832b6c4310 (patch) | |
tree | e92937333b5f522db912da31a6b87a2e9a6284b6 | |
parent | d1d18588d3bc91f72f6e9534d1f90b8aac7c118f (diff) | |
download | xine-lib-6bbfd480d0d173887305db527b641f832b6c4310.tar.gz xine-lib-6bbfd480d0d173887305db527b641f832b6c4310.tar.bz2 |
Convert XML to UTF-8 for parsing if a BOM is detected.
-rw-r--r-- | ChangeLog | 1 | ||||
-rw-r--r-- | src/xine-utils/xmllexer.c | 62 |
2 files changed, 63 insertions, 0 deletions
@@ -12,6 +12,7 @@ xine-lib (1.1.9) (unreleased) * Build fix for when using Linux 2.6.23 headers. [Bug 1820958] * Implemented decoding of XML character entities with codes >= 256. This requires conversion to UTF-8 of entities with codes >= 128. + * Handle initial Unicde BOMs in XML; convert other UTF encodings to UTF-8. * Fixed ATSC support. [Bug 1749508] * Fixed a possible DVB plugin crash when switching channels. * Fixed a crash closing the frontend. [Bug FS#3] diff --git a/src/xine-utils/xmllexer.c b/src/xine-utils/xmllexer.c index 8879f7d0c..75362e10d 100644 --- a/src/xine-utils/xmllexer.c +++ b/src/xine-utils/xmllexer.c @@ -40,6 +40,8 @@ #include <iconv.h> #endif +#include "bswap.h" + /* private constants*/ #define NORMAL 0 /* normal lex mode */ #define DATA 1 /* data lex mode */ @@ -50,10 +52,70 @@ static int lexbuf_size = 0; static int lexbuf_pos = 0; static int lex_mode = NORMAL; static int in_comment = 0; +static char *lex_malloc = NULL; + +enum utf { UTF32BE, UTF32LE, UTF16BE, UTF16LE }; + +static void lex_convert (const char * buf, int size, enum utf utf) +{ + char *utf8 = malloc (size * (utf >= UTF16BE ? 3 : 6) + 1); + char *bp = utf8; + while (size > 0) + { + uint32_t c = 0; + switch (utf) + { + case UTF32BE: c = _X_BE_32 (buf); buf += 4; break; + case UTF32LE: c = _X_LE_32 (buf); buf += 4; break; + case UTF16BE: c = _X_BE_16 (buf); buf += 2; break; + case UTF16LE: c = _X_LE_16 (buf); buf += 2; break; + } + if (!c) + break; /* embed a NUL, get a truncated string */ + if (c < 128) + *bp++ = c; + else + { + int count = (c >= 0x04000000) ? 5 : + (c >= 0x00200000) ? 4 : + (c >= 0x00010000) ? 3 : + (c >= 0x00000800) ? 2 : 1; + *bp = (char)(0x1F80 >> count); + count *= 6; + *bp++ |= c >> count; + while ((count -= 6) >= 0) + *bp++ = 128 | ((c >> count) & 0x3F); + } + } + *bp = 0; + lexbuf_size = bp - utf8; + lexbuf = lex_malloc = realloc (utf8, lexbuf_size + 1); +} void lexer_init(const char * buf, int size) { + static const char boms[] = { 0xFF, 0xFE, 0, 0, 0xFE, 0xFF }, + bom_utf8[] = { 0xEF, 0xBB, 0xBF }; + + free (lex_malloc); + lex_malloc = NULL; + lexbuf = buf; lexbuf_size = size; + + if (size >= 4 && !memcmp (buf, boms + 2, 4)) + lex_convert (buf + 4, size - 4, UTF32BE); + else if (size >= 4 && !memcmp (buf, boms, 4)) + lex_convert (buf + 4, size - 4, UTF32LE); + else if (size >= 3 && !memcmp (buf, bom_utf8, 4)) + { + lexbuf += 3; + lexbuf_size -= 3; + } + else if (size >= 2 && !memcmp (buf, boms + 4, 2)) + lex_convert (buf + 2, size - 2, UTF16BE); + else if (size >= 2 && !memcmp (buf, boms, 2)) + lex_convert (buf + 2, size - 2, UTF16LE); + lexbuf_pos = 0; lex_mode = NORMAL; in_comment = 0; |