summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorDarren Salt <linux@youmustbejoking.demon.co.uk>2007-12-14 21:44:43 +0000
committerDarren Salt <linux@youmustbejoking.demon.co.uk>2007-12-14 21:44:43 +0000
commit6bbfd480d0d173887305db527b641f832b6c4310 (patch)
treee92937333b5f522db912da31a6b87a2e9a6284b6
parentd1d18588d3bc91f72f6e9534d1f90b8aac7c118f (diff)
downloadxine-lib-6bbfd480d0d173887305db527b641f832b6c4310.tar.gz
xine-lib-6bbfd480d0d173887305db527b641f832b6c4310.tar.bz2
Convert XML to UTF-8 for parsing if a BOM is detected.
-rw-r--r--ChangeLog1
-rw-r--r--src/xine-utils/xmllexer.c62
2 files changed, 63 insertions, 0 deletions
diff --git a/ChangeLog b/ChangeLog
index 5212cf99b..2102e3db5 100644
--- a/ChangeLog
+++ b/ChangeLog
@@ -12,6 +12,7 @@ xine-lib (1.1.9) (unreleased)
* Build fix for when using Linux 2.6.23 headers. [Bug 1820958]
* Implemented decoding of XML character entities with codes >= 256.
This requires conversion to UTF-8 of entities with codes >= 128.
+ * Handle initial Unicode BOMs in XML; convert other UTF encodings to UTF-8.
* Fixed ATSC support. [Bug 1749508]
* Fixed a possible DVB plugin crash when switching channels.
* Fixed a crash closing the frontend. [Bug FS#3]
diff --git a/src/xine-utils/xmllexer.c b/src/xine-utils/xmllexer.c
index 8879f7d0c..75362e10d 100644
--- a/src/xine-utils/xmllexer.c
+++ b/src/xine-utils/xmllexer.c
@@ -40,6 +40,8 @@
#include <iconv.h>
#endif
+#include "bswap.h"
+
/* private constants*/
#define NORMAL 0 /* normal lex mode */
#define DATA 1 /* data lex mode */
@@ -50,10 +52,70 @@ static int lexbuf_size = 0;
static int lexbuf_pos = 0;
static int lex_mode = NORMAL;
static int in_comment = 0;
+static char *lex_malloc = NULL; /* heap copy of the input made by lex_convert(); released by lexer_init() */
+
+enum utf { UTF32BE, UTF32LE, UTF16BE, UTF16LE }; /* order matters: lex_convert() tests "utf >= UTF16BE" to select the 16-bit encodings */
+
+/*
+ * Convert UTF-16 or UTF-32 text to UTF-8 and make the result the lexer's
+ * input buffer.
+ *
+ *   buf  - first code unit of the text (the caller has already skipped the BOM)
+ *   size - number of bytes available at buf
+ *   utf  - source encoding and byte order
+ *
+ * On success lexbuf and lex_malloc both point at a freshly allocated,
+ * NUL-terminated UTF-8 string and lexbuf_size holds its length (excluding the
+ * terminator).  Conversion stops at the first zero code unit or when fewer
+ * bytes than one whole code unit remain, whichever comes first.
+ * If memory cannot be allocated, the lexer is given an empty buffer instead.
+ */
+static void lex_convert (const char * buf, int size, enum utf utf)
+{
+  const int is_utf16 = (utf >= UTF16BE);
+  const int unit = is_utf16 ? 2 : 4;    /* bytes per source code unit */
+  char *utf8, *bp, *shrunk;
+
+  if (size < 0)
+    size = 0;
+
+  /* Worst case: 3 output bytes per UTF-16 unit, 6 per UTF-32 unit, so this
+   * allocation can never be overrun by the loop below. */
+  utf8 = malloc ((size_t)size * (is_utf16 ? 3 : 6) + 1);
+  if (!utf8)
+  {
+    /* Out of memory: present an empty document rather than raw UTF-16/32. */
+    lexbuf = "";
+    lexbuf_size = 0;
+    return;
+  }
+  bp = utf8;
+
+  /* Only consume whole code units; a trailing partial unit is ignored. */
+  while (size >= unit)
+  {
+    uint32_t c = 0;
+    switch (utf)
+    {
+    case UTF32BE: c = _X_BE_32 (buf); break;
+    case UTF32LE: c = _X_LE_32 (buf); break;
+    case UTF16BE: c = _X_BE_16 (buf); break;
+    case UTF16LE: c = _X_LE_16 (buf); break;
+    }
+    buf += unit;
+    size -= unit;
+
+    if (!c)
+      break; /* embed a NUL, get a truncated string */
+
+    /* UTF-16: merge a high surrogate with the low surrogate that follows it
+     * into a single code point.  Unpaired surrogates are passed through. */
+    if (is_utf16 && c >= 0xD800 && c <= 0xDBFF && size >= unit)
+    {
+      uint32_t low = (utf == UTF16BE) ? _X_BE_16 (buf) : _X_LE_16 (buf);
+      if (low >= 0xDC00 && low <= 0xDFFF)
+      {
+        c = 0x10000 + ((c - 0xD800) << 10) + (low - 0xDC00);
+        buf += unit;
+        size -= unit;
+      }
+    }
+
+    if (c < 128)
+      *bp++ = (char)c;
+    else
+    {
+      /* count = number of continuation bytes following the lead byte */
+      int count = (c >= 0x04000000) ? 5 :
+                  (c >= 0x00200000) ? 4 :
+                  (c >= 0x00010000) ? 3 :
+                  (c >= 0x00000800) ? 2 : 1;
+      /* lead-byte marker: 0xC0, 0xE0, 0xF0, 0xF8 or 0xFC for count 1..5 */
+      *bp = (char)(0x1F80 >> count);
+      count *= 6;
+      *bp++ |= c >> count;
+      while ((count -= 6) >= 0)
+        *bp++ = 128 | ((c >> count) & 0x3F);
+    }
+  }
+  *bp = 0;
+  lexbuf_size = bp - utf8;
+
+  /* Trim the buffer to fit; if realloc fails the original block is still
+   * valid, so keep using it rather than leaking it. */
+  shrunk = realloc (utf8, lexbuf_size + 1);
+  lexbuf = lex_malloc = shrunk ? shrunk : utf8;
+}
void lexer_init(const char * buf, int size) {
+ static const char boms[] = { 0xFF, 0xFE, 0, 0, 0xFE, 0xFF },
+ bom_utf8[] = { 0xEF, 0xBB, 0xBF };
+
+ free (lex_malloc);
+ lex_malloc = NULL;
+
lexbuf = buf;
lexbuf_size = size;
+
+ if (size >= 4 && !memcmp (buf, boms + 2, 4))
+ lex_convert (buf + 4, size - 4, UTF32BE);
+ else if (size >= 4 && !memcmp (buf, boms, 4))
+ lex_convert (buf + 4, size - 4, UTF32LE);
+ else if (size >= 3 && !memcmp (buf, bom_utf8, 4))
+ {
+ lexbuf += 3;
+ lexbuf_size -= 3;
+ }
+ else if (size >= 2 && !memcmp (buf, boms + 4, 2))
+ lex_convert (buf + 2, size - 2, UTF16BE);
+ else if (size >= 2 && !memcmp (buf, boms, 2))
+ lex_convert (buf + 2, size - 2, UTF16LE);
+
lexbuf_pos = 0;
lex_mode = NORMAL;
in_comment = 0;