summary | refs | log | tree | commit | diff
path: root/src/xine-utils/xmllexer.c
diff options
context:
space:
mode:
Diffstat (limited to 'src/xine-utils/xmllexer.c')
-rw-r--r-- src/xine-utils/xmllexer.c | 290
1 files changed, 230 insertions, 60 deletions
diff --git a/src/xine-utils/xmllexer.c b/src/xine-utils/xmllexer.c
index 575c37611..75a1aafec 100644
--- a/src/xine-utils/xmllexer.c
+++ b/src/xine-utils/xmllexer.c
@@ -15,11 +15,8 @@
*
* You should have received a copy of the GNU Library General Public
* License along with the Gnome Library; see the file COPYING.LIB. If not,
- * write to the Free Software Foundation, Inc., 59 Temple Place - Suite 330,
- * Boston, MA 02111-1307, USA.
- *
- * $Id: xmllexer.c,v 1.13 2007/03/04 16:19:12 hadess Exp $
- *
+ * write to the Free Software Foundation, Inc., 51 Franklin Street, Fifth
+ * Floor, Boston, MA 02110, USA
*/
#define LOG_MODULE "xmllexer"
@@ -29,31 +26,99 @@
*/
#ifdef XINE_COMPILE
-#include "xineutils.h"
+#include <xine/xineutils.h>
#else
#define lprintf(...)
#define xine_xmalloc malloc
#endif
-#include "xmllexer.h"
+#include <xine/xmllexer.h>
#include <stdio.h>
#include <ctype.h>
#include <string.h>
#include <stdlib.h>
+#ifdef HAVE_ICONV
+#include <iconv.h>
+#endif
+
+#include "bswap.h"
/* private constants*/
-#define NORMAL 0 /* normal lex mode */
-#define DATA 1 /* data lex mode */
/* private global variables */
static const char * lexbuf;
static int lexbuf_size = 0;
static int lexbuf_pos = 0;
-static int lex_mode = NORMAL;
static int in_comment = 0;
+static char *lex_malloc = NULL;
+
+/*
+ * Source encodings that lex_convert() can turn into UTF-8.
+ * The order matters: lex_convert() tests "utf >= UTF16BE" to tell the
+ * 16-bit encodings from the 32-bit ones, so both UTF-32 values must
+ * stay in front of both UTF-16 values.
+ */
+enum utf { UTF32BE, UTF32LE, UTF16BE, UTF16LE };
+
+/*
+ * Convert a UTF-16 or UTF-32 buffer to UTF-8 and make the lexer read from
+ * the converted copy.
+ *
+ *   buf  - first code unit to convert (the caller has skipped the BOM)
+ *   size - number of bytes available at buf
+ *   utf  - width and byte order of the code units in buf
+ *
+ * On success lexbuf and lexbuf_size describe a NUL-terminated heap buffer,
+ * which is also stored in lex_malloc. If that buffer cannot be allocated
+ * the lexer is pointed at an empty string, so it sees end of input instead
+ * of dereferencing NULL.
+ *
+ * Conversion stops at the first zero code unit or when fewer bytes than one
+ * whole code unit remain; a trailing partial unit is ignored. UTF-16
+ * surrogate pairs are not combined: every 16-bit unit is encoded on its own.
+ */
+static void lex_convert (const char * buf, int size, enum utf utf)
+{
+  /* bytes per input code unit, and the most UTF-8 bytes one unit can yield */
+  const int unit = (utf >= UTF16BE) ? 2 : 4;
+  const size_t worst = (utf >= UTF16BE) ? 3 : 6;
+  /* whole code units only; computed in size_t so the product cannot wrap int */
+  const size_t units = (size > 0) ? (size_t)size / (size_t)unit : 0;
+  char *utf8 = malloc (units * worst + 1);
+  char *bp = utf8;
+  char *shrunk;
+
+  if (!utf8)
+  {
+    lexbuf = "";
+    lexbuf_size = 0;
+    return;
+  }
+
+  /* only read a code unit when all of its bytes are inside the input */
+  while (size >= unit)
+  {
+    uint32_t c = 0;
+    switch (utf)
+    {
+    case UTF32BE: c = _X_BE_32 (buf); break;
+    case UTF32LE: c = _X_LE_32 (buf); break;
+    case UTF16BE: c = _X_BE_16 (buf); break;
+    case UTF16LE: c = _X_LE_16 (buf); break;
+    }
+    buf += unit;
+    size -= unit;
+    if (!c)
+      break; /* embed a NUL, get a truncated string */
+    if (c < 128)
+      *bp++ = c;
+    else
+    {
+      /* count = number of continuation bytes that follow the lead byte */
+      int count = (c >= 0x04000000) ? 5 :
+                  (c >= 0x00200000) ? 4 :
+                  (c >= 0x00010000) ? 3 :
+                  (c >= 0x00000800) ? 2 : 1;
+      /* lead byte: length marker bits, then the topmost payload bits */
+      *bp = (char)(0x1F80 >> count);
+      count *= 6;
+      *bp++ |= c >> count;
+      /* continuation bytes: 10xxxxxx, six payload bits each */
+      while ((count -= 6) >= 0)
+        *bp++ = 128 | ((c >> count) & 0x3F);
+    }
+  }
+  *bp = 0;
+  lexbuf_size = bp - utf8;
+  /* hand back the unused tail; keep the original block if shrinking fails */
+  shrunk = realloc (utf8, lexbuf_size + 1);
+  lexbuf = lex_malloc = shrunk ? shrunk : utf8;
+}
+
+/*
+ * Current lexing mode; lexer_get_token_d() switches on it for every input
+ * character and lexer_init() resets it to NORMAL.
+ */
+static enum {
+ /* markup is tokenised by the lexer_state_t state machine */
+ NORMAL,
+ /* character data; selected after '>' or "?>" outside a comment and after
+  * the end of a CDATA section; '<' is the delimiter that ends a run */
+ DATA,
+ /* inside a "<![CDATA[" section; "]]>" ends it and selects DATA */
+ CDATA,
+} lex_mode = NORMAL;
void lexer_init(const char * buf, int size) {
+ static const char boms[] = { 0xFF, 0xFE, 0, 0, 0xFE, 0xFF },
+ bom_utf8[] = { 0xEF, 0xBB, 0xBF };
+
+ free (lex_malloc);
+ lex_malloc = NULL;
+
lexbuf = buf;
lexbuf_size = size;
+
+ if (size >= 4 && !memcmp (buf, boms + 2, 4))
+ lex_convert (buf + 4, size - 4, UTF32BE);
+ else if (size >= 4 && !memcmp (buf, boms, 4))
+ lex_convert (buf + 4, size - 4, UTF32LE);
+ else if (size >= 3 && !memcmp (buf, bom_utf8, 3))
+ {
+ lexbuf += 3;
+ lexbuf_size -= 3;
+ }
+ else if (size >= 2 && !memcmp (buf, boms + 4, 2))
+ lex_convert (buf + 2, size - 2, UTF16BE);
+ else if (size >= 2 && !memcmp (buf, boms, 2))
+ lex_convert (buf + 2, size - 2, UTF16LE);
+
lexbuf_pos = 0;
lex_mode = NORMAL;
in_comment = 0;
@@ -61,79 +126,104 @@ void lexer_init(const char * buf, int size) {
lprintf("buffer length %d\n", size);
}
-int lexer_get_token(char * tok, int tok_size) {
+/*
+ * States of the NORMAL-mode tokeniser in lexer_get_token_d(). Each state
+ * records what has been consumed so far for the token being built.
+ */
+typedef enum {
+ /* NOTE(review): not assigned anywhere in the visible code - confirm use */
+ STATE_UNKNOWN = -1,
+ /* start of a token, nothing consumed yet */
+ STATE_IDLE,
+ /* run of '\n' / '\r' characters */
+ STATE_EOL,
+ /* run of ' ' / '\t' characters */
+ STATE_SEPAR,
+ /* '<' seen; the next character selects the kind of tag start */
+ STATE_T_M_START,
+ /* '>' seen */
+ STATE_T_M_STOP_1,
+ /* '/' seen outside a comment; a following '>' completes the token */
+ STATE_T_M_STOP_2,
+ /* '=' seen */
+ STATE_T_EQUAL,
+ /* inside a '...' string, up to the closing single quote */
+ STATE_T_STRING_SINGLE,
+ /* inside a "..." string, up to the closing double quote */
+ STATE_T_STRING_DOUBLE,
+ /* entered from STATE_T_M_START; chooses between comment, DOCTYPE and
+  * CDATA starts */
+ STATE_T_COMMENT,
+ /* '?' seen; a following '>' yields T_TI_STOP */
+ STATE_T_TI_STOP,
+ /* '-' seen; a second '-' moves on to STATE_T_C_STOP */
+ STATE_T_DASHDASH,
+ /* "--" seen; '>' would complete the comment end */
+ STATE_T_C_STOP,
+ STATE_IDENT /* must be last */
+} lexer_state_t;
+
+int lexer_get_token_d(char ** _tok, int * _tok_size, int fixed) {
+ char *tok = *_tok;
+ int tok_size = *_tok_size;
int tok_pos = 0;
- int state = 0;
+ lexer_state_t state = STATE_IDLE;
char c;
if (tok) {
while ((tok_pos < tok_size) && (lexbuf_pos < lexbuf_size)) {
c = lexbuf[lexbuf_pos];
- lprintf("c=%c, state=%d, in_comment=%d\n", c, state, in_comment);
+ lprintf("c=%c, state=%d, lex_mode=%d, in_comment=%d\n", c, state, lex_mode, in_comment);
- if (lex_mode == NORMAL) {
- /* normal mode */
+ switch (lex_mode) {
+ case NORMAL:
switch (state) {
/* init state */
- case 0:
+ case STATE_IDLE:
switch (c) {
case '\n':
case '\r':
- state = 1;
+ state = STATE_EOL;
tok[tok_pos] = c;
tok_pos++;
break;
case ' ':
case '\t':
- state = 2;
+ state = STATE_SEPAR;
tok[tok_pos] = c;
tok_pos++;
break;
case '<':
- state = 3;
+ state = STATE_T_M_START;
tok[tok_pos] = c;
tok_pos++;
break;
case '>':
- state = 4;
+ state = STATE_T_M_STOP_1;
tok[tok_pos] = c;
tok_pos++;
break;
case '/':
if (!in_comment)
- state = 5;
+ state = STATE_T_M_STOP_2;
tok[tok_pos] = c;
tok_pos++;
break;
case '=':
- state = 6;
+ state = STATE_T_EQUAL;
tok[tok_pos] = c;
tok_pos++;
break;
case '\"': /* " */
- state = 7;
+ state = STATE_T_STRING_DOUBLE;
+ break;
+
+ case '\'': /* " */
+ state = STATE_T_STRING_SINGLE;
break;
case '-':
- state = 10;
+ state = STATE_T_DASHDASH;
tok[tok_pos] = c;
tok_pos++;
break;
case '?':
- state = 9;
+ if (!in_comment)
+ state = STATE_T_TI_STOP;
tok[tok_pos] = c;
tok_pos++;
break;
default:
- state = 100;
+ state = STATE_IDENT;
tok[tok_pos] = c;
tok_pos++;
break;
@@ -142,7 +232,7 @@ int lexer_get_token(char * tok, int tok_size) {
break;
/* end of line */
- case 1:
+ case STATE_EOL:
if (c == '\n' || (c == '\r')) {
tok[tok_pos] = c;
lexbuf_pos++;
@@ -154,7 +244,7 @@ int lexer_get_token(char * tok, int tok_size) {
break;
/* T_SEPAR */
- case 2:
+ case STATE_SEPAR:
if (c == ' ' || (c == '\t')) {
tok[tok_pos] = c;
lexbuf_pos++;
@@ -166,7 +256,7 @@ int lexer_get_token(char * tok, int tok_size) {
break;
/* T_M_START < or </ or <! or <? */
- case 3:
+ case STATE_T_M_START:
switch (c) {
case '/':
tok[tok_pos] = c;
@@ -179,7 +269,7 @@ int lexer_get_token(char * tok, int tok_size) {
tok[tok_pos] = c;
lexbuf_pos++;
tok_pos++;
- state = 8;
+ state = STATE_T_COMMENT;
break;
case '?':
tok[tok_pos] = c;
@@ -195,7 +285,7 @@ int lexer_get_token(char * tok, int tok_size) {
break;
/* T_M_STOP_1 */
- case 4:
+ case STATE_T_M_STOP_1:
tok[tok_pos] = '\0';
if (!in_comment)
lex_mode = DATA;
@@ -203,7 +293,7 @@ int lexer_get_token(char * tok, int tok_size) {
break;
/* T_M_STOP_2 */
- case 5:
+ case STATE_T_M_STOP_2:
if (c == '>') {
tok[tok_pos] = c;
lexbuf_pos++;
@@ -219,13 +309,13 @@ int lexer_get_token(char * tok, int tok_size) {
break;
/* T_EQUAL */
- case 6:
+ case STATE_T_EQUAL:
tok[tok_pos] = '\0';
return T_EQUAL;
break;
/* T_STRING */
- case 7:
+ case STATE_T_STRING_DOUBLE:
tok[tok_pos] = c;
lexbuf_pos++;
if (c == '\"') { /* " */
@@ -235,8 +325,8 @@ int lexer_get_token(char * tok, int tok_size) {
tok_pos++;
break;
- /* T_C_START or T_DOCTYPE_START */
- case 8:
+ /* T_C_START or T_DOCTYPE_START or T_CDATA_START */
+ case STATE_T_COMMENT:
switch (c) {
case '-':
lexbuf_pos++;
@@ -260,6 +350,17 @@ int lexer_get_token(char * tok, int tok_size) {
return T_ERROR;
}
break;
+ case '[':
+ lexbuf_pos++;
+ if (strncmp(lexbuf + lexbuf_pos, "CDATA[", 6) == 0) {
+ strncpy (tok + tok_pos, "[CDATA[", 7); /* FIXME */
+ lexbuf_pos += 6;
+ lex_mode = CDATA;
+ return T_CDATA_START;
+ } else{
+ return T_ERROR;
+ }
+ break;
default:
/* error */
return T_ERROR;
@@ -267,12 +368,14 @@ int lexer_get_token(char * tok, int tok_size) {
break;
/* T_TI_STOP */
- case 9:
+ case STATE_T_TI_STOP:
if (c == '>') {
tok[tok_pos] = c;
lexbuf_pos++;
tok_pos++; /* FIXME */
tok[tok_pos] = '\0';
+ if (!in_comment)
+ lex_mode = DATA;
return T_TI_STOP;
} else {
tok[tok_pos] = '\0';
@@ -281,24 +384,24 @@ int lexer_get_token(char * tok, int tok_size) {
break;
/* -- */
- case 10:
+ case STATE_T_DASHDASH:
switch (c) {
case '-':
tok[tok_pos] = c;
tok_pos++;
lexbuf_pos++;
- state = 11;
+ state = STATE_T_C_STOP;
break;
default:
tok[tok_pos] = c;
tok_pos++;
lexbuf_pos++;
- state = 100;
+ state = STATE_IDENT;
}
break;
/* --> */
- case 11:
+ case STATE_T_C_STOP:
switch (c) {
case '>':
tok[tok_pos] = c;
@@ -318,12 +421,23 @@ int lexer_get_token(char * tok, int tok_size) {
tok[tok_pos] = c;
tok_pos++;
lexbuf_pos++;
- state = 100;
+ state = STATE_IDENT;
}
break;
+ /* T_STRING (single quotes) */
+ case STATE_T_STRING_SINGLE:
+ tok[tok_pos] = c;
+ lexbuf_pos++;
+ if (c == '\'') { /* " */
+ tok[tok_pos] = '\0'; /* FIXME */
+ return T_STRING;
+ }
+ tok_pos++;
+ break;
+
/* IDENT */
- case 100:
+ case STATE_IDENT:
switch (c) {
case '<':
case '>':
@@ -340,13 +454,13 @@ int lexer_get_token(char * tok, int tok_size) {
tok[tok_pos] = c;
tok_pos++;
lexbuf_pos++;
- state = 9;
+ state = STATE_T_TI_STOP;
break;
case '-':
tok[tok_pos] = c;
tok_pos++;
lexbuf_pos++;
- state = 10;
+ state = STATE_T_DASHDASH;
break;
default:
tok[tok_pos] = c;
@@ -358,8 +472,9 @@ int lexer_get_token(char * tok, int tok_size) {
lprintf("expected char \'%c\'\n", tok[tok_pos - 1]); /* FIX ME */
return T_ERROR;
}
- } else {
- /* data mode, stop if char equal '<' */
+ break;
+
+ case DATA: /* data mode, stop if char equal '<' */
switch (c)
{
case '<':
@@ -371,6 +486,28 @@ int lexer_get_token(char * tok, int tok_size) {
tok_pos++;
lexbuf_pos++;
}
+ break;
+
+ case CDATA: /* cdata mode, stop if next token is "]]>" */
+ switch (c)
+ {
+ case ']':
+ if (strncmp(lexbuf + lexbuf_pos, "]]>", 3) == 0) {
+ lexbuf_pos += 3;
+ lex_mode = DATA;
+ return T_CDATA_STOP;
+ } else {
+ tok[tok_pos] = c;
+ tok_pos++;
+ lexbuf_pos++;
+ }
+ break;
+ default:
+ tok[tok_pos] = c;
+ tok_pos++;
+ lexbuf_pos++;
+ }
+ break;
}
}
lprintf ("loop done tok_pos = %d, tok_size=%d, lexbuf_pos=%d, lexbuf_size=%d\n",
@@ -378,33 +515,44 @@ int lexer_get_token(char * tok, int tok_size) {
/* pb */
if (tok_pos >= tok_size) {
- lprintf("token buffer is too little\n");
+ if (fixed)
+ return T_ERROR;
+ *_tok_size *= 2;
+ *_tok = realloc (*_tok, *_tok_size);
+ lprintf("token buffer is too small\n");
+ lprintf("increasing buffer size to %d bytes\n", *_tok_size);
+ if (*_tok) {
+ return lexer_get_token_d (_tok, _tok_size, 0);
+ } else {
+ return T_ERROR;
+ }
} else {
if (lexbuf_pos >= lexbuf_size) {
/* Terminate the current token */
tok[tok_pos] = '\0';
switch (state) {
- case 0:
- case 1:
- case 2:
+ case STATE_IDLE:
+ case STATE_EOL:
+ case STATE_SEPAR:
return T_EOF;
break;
- case 3:
+ case STATE_T_M_START:
return T_M_START_1;
break;
- case 4:
+ case STATE_T_M_STOP_1:
return T_M_STOP_1;
break;
- case 5:
+ case STATE_T_M_STOP_2:
return T_ERROR;
break;
- case 6:
+ case STATE_T_EQUAL:
return T_EQUAL;
break;
- case 7:
+ case STATE_T_STRING_SINGLE:
+ case STATE_T_STRING_DOUBLE:
return T_STRING;
break;
- case 100:
+ case STATE_IDENT:
return T_DATA;
break;
default:
@@ -421,6 +569,12 @@ int lexer_get_token(char * tok, int tok_size) {
return T_ERROR;
}
+/*
+ * Fixed-buffer entry point, kept for ABI compatibility.
+ *
+ *   tok      - caller-owned buffer that receives the token text
+ *   tok_size - capacity of tok in bytes
+ *
+ * Returns the token type reported by lexer_get_token_d(). The buffer is
+ * never resized here: an over-long token yields T_ERROR.
+ */
+int lexer_get_token (char *tok, int tok_size)
+{
+  char *buffer = tok;
+  int capacity = tok_size;
+
+  /* fixed = 1: the callee reports T_ERROR rather than growing the buffer */
+  return lexer_get_token_d (&buffer, &capacity, 1);
+}
+
static struct {
char code;
unsigned char namelen;
@@ -448,7 +602,7 @@ char *lexer_decode_entities (const char *tok)
{
/* parse the character entity (on failure, treat it as literal text) */
const char *tp = tok;
- long i;
+ signed long i;
for (i = 0; lexer_entities[i].code; ++i)
if (!strncmp (lexer_entities[i].name, tok, lexer_entities[i].namelen)
@@ -476,7 +630,7 @@ char *lexer_decode_entities (const char *tok)
else
i = strtol (tp, (char **)&tp, 10);
- if (i < 1 || i > 255 || *tp != ';')
+ if (*tp != ';' || i < 1)
{
/* out of range, or format error */
*bp++ = '&';
@@ -484,7 +638,23 @@ char *lexer_decode_entities (const char *tok)
}
tok = tp + 1;
- *bp++ = i;
+
+ if (i < 128)
+ /* ASCII - store as-is */
+ *bp++ = i;
+ else
+ {
+ /* Non-ASCII, so convert to UTF-8 */
+ int count = (i >= 0x04000000) ? 5 :
+ (i >= 0x00200000) ? 4 :
+ (i >= 0x00010000) ? 3 :
+ (i >= 0x00000800) ? 2 : 1;
+ *bp = (char)(0x1F80 >> count);
+ count *= 6;
+ *bp++ |= i >> count;
+ while ((count -= 6) >= 0)
+ *bp++ = 128 | ((i >> count) & 0x3F);
+ }
}
}
*bp = 0;