diff options
Diffstat (limited to 'src/xine-utils/xmllexer.c')
-rw-r--r-- | src/xine-utils/xmllexer.c | 290 |
1 files changed, 230 insertions, 60 deletions
diff --git a/src/xine-utils/xmllexer.c b/src/xine-utils/xmllexer.c index 575c37611..75a1aafec 100644 --- a/src/xine-utils/xmllexer.c +++ b/src/xine-utils/xmllexer.c @@ -15,11 +15,8 @@ * * You should have received a copy of the GNU Library General Public * License along with the Gnome Library; see the file COPYING.LIB. If not, - * write to the Free Software Foundation, Inc., 59 Temple Place - Suite 330, - * Boston, MA 02111-1307, USA. - * - * $Id: xmllexer.c,v 1.13 2007/03/04 16:19:12 hadess Exp $ - * + * write to the Free Software Foundation, Inc., 51 Franklin Street, Fifth + * Floor, Boston, MA 02110, USA */ #define LOG_MODULE "xmllexer" @@ -29,31 +26,99 @@ */ #ifdef XINE_COMPILE -#include "xineutils.h" +#include <xine/xineutils.h> #else #define lprintf(...) #define xine_xmalloc malloc #endif -#include "xmllexer.h" +#include <xine/xmllexer.h> #include <stdio.h> #include <ctype.h> #include <string.h> #include <stdlib.h> +#ifdef HAVE_ICONV +#include <iconv.h> +#endif + +#include "bswap.h" /* private constants*/ -#define NORMAL 0 /* normal lex mode */ -#define DATA 1 /* data lex mode */ /* private global variables */ static const char * lexbuf; static int lexbuf_size = 0; static int lexbuf_pos = 0; -static int lex_mode = NORMAL; static int in_comment = 0; +static char *lex_malloc = NULL; + +enum utf { UTF32BE, UTF32LE, UTF16BE, UTF16LE }; + +static void lex_convert (const char * buf, int size, enum utf utf) +{ + char *utf8 = malloc (size * (utf >= UTF16BE ? 3 : 6) + 1); + char *bp = utf8; + while (size > 0) + { + uint32_t c = 0; + switch (utf) + { + case UTF32BE: c = _X_BE_32 (buf); buf += 4; break; + case UTF32LE: c = _X_LE_32 (buf); buf += 4; break; + case UTF16BE: c = _X_BE_16 (buf); buf += 2; break; + case UTF16LE: c = _X_LE_16 (buf); buf += 2; break; + } + if (!c) + break; /* embed a NUL, get a truncated string */ + if (c < 128) + *bp++ = c; + else + { + int count = (c >= 0x04000000) ? 5 : + (c >= 0x00200000) ? 4 : + (c >= 0x00010000) ? 3 : + (c >= 0x00000800) ? 2 : 1; + *bp = (char)(0x1F80 >> count); + count *= 6; + *bp++ |= c >> count; + while ((count -= 6) >= 0) + *bp++ = 128 | ((c >> count) & 0x3F); + } + } + *bp = 0; + lexbuf_size = bp - utf8; + lexbuf = lex_malloc = realloc (utf8, lexbuf_size + 1); +} + +static enum { + NORMAL, + DATA, + CDATA, +} lex_mode = NORMAL; void lexer_init(const char * buf, int size) { + static const char boms[] = { 0xFF, 0xFE, 0, 0, 0xFE, 0xFF }, + bom_utf8[] = { 0xEF, 0xBB, 0xBF }; + + free (lex_malloc); + lex_malloc = NULL; + lexbuf = buf; lexbuf_size = size; + + if (size >= 4 && !memcmp (buf, boms + 2, 4)) + lex_convert (buf + 4, size - 4, UTF32BE); + else if (size >= 4 && !memcmp (buf, boms, 4)) + lex_convert (buf + 4, size - 4, UTF32LE); + else if (size >= 3 && !memcmp (buf, bom_utf8, 3)) + { + lexbuf += 3; + lexbuf_size -= 3; + } + else if (size >= 2 && !memcmp (buf, boms + 4, 2)) + lex_convert (buf + 2, size - 2, UTF16BE); + else if (size >= 2 && !memcmp (buf, boms, 2)) + lex_convert (buf + 2, size - 2, UTF16LE); + lexbuf_pos = 0; lex_mode = NORMAL; in_comment = 0; @@ -61,79 +126,104 @@ void lexer_init(const char * buf, int size) { lprintf("buffer length %d\n", size); } -int lexer_get_token(char * tok, int tok_size) { +typedef enum { + STATE_UNKNOWN = -1, + STATE_IDLE, + STATE_EOL, + STATE_SEPAR, + STATE_T_M_START, + STATE_T_M_STOP_1, + STATE_T_M_STOP_2, + STATE_T_EQUAL, + STATE_T_STRING_SINGLE, + STATE_T_STRING_DOUBLE, + STATE_T_COMMENT, + STATE_T_TI_STOP, + STATE_T_DASHDASH, + STATE_T_C_STOP, + STATE_IDENT /* must be last */ +} lexer_state_t; + +int lexer_get_token_d(char ** _tok, int * _tok_size, int fixed) { + char *tok = *_tok; + int tok_size = *_tok_size; int tok_pos = 0; - int state = 0; + lexer_state_t state = STATE_IDLE; char c; if (tok) { while ((tok_pos < tok_size) && (lexbuf_pos < lexbuf_size)) { c = lexbuf[lexbuf_pos]; - lprintf("c=%c, state=%d, in_comment=%d\n", c, state, in_comment); + lprintf("c=%c, state=%d, lex_mode=%d, in_comment=%d\n", c, state, lex_mode, in_comment); - if (lex_mode == NORMAL) { - /* normal mode */ + switch (lex_mode) { + case NORMAL: switch (state) { /* init state */ - case 0: + case STATE_IDLE: switch (c) { case '\n': case '\r': - state = 1; + state = STATE_EOL; tok[tok_pos] = c; tok_pos++; break; case ' ': case '\t': - state = 2; + state = STATE_SEPAR; tok[tok_pos] = c; tok_pos++; break; case '<': - state = 3; + state = STATE_T_M_START; tok[tok_pos] = c; tok_pos++; break; case '>': - state = 4; + state = STATE_T_M_STOP_1; tok[tok_pos] = c; tok_pos++; break; case '/': if (!in_comment) - state = 5; + state = STATE_T_M_STOP_2; tok[tok_pos] = c; tok_pos++; break; case '=': - state = 6; + state = STATE_T_EQUAL; tok[tok_pos] = c; tok_pos++; break; case '\"': /* " */ - state = 7; + state = STATE_T_STRING_DOUBLE; + break; + + case '\'': /* " */ + state = STATE_T_STRING_SINGLE; break; case '-': - state = 10; + state = STATE_T_DASHDASH; tok[tok_pos] = c; tok_pos++; break; case '?': - state = 9; + if (!in_comment) + state = STATE_T_TI_STOP; tok[tok_pos] = c; tok_pos++; break; default: - state = 100; + state = STATE_IDENT; tok[tok_pos] = c; tok_pos++; break; @@ -142,7 +232,7 @@ int lexer_get_token(char * tok, int tok_size) { break; /* end of line */ - case 1: + case STATE_EOL: if (c == '\n' || (c == '\r')) { tok[tok_pos] = c; lexbuf_pos++; @@ -154,7 +244,7 @@ int lexer_get_token(char * tok, int tok_size) { break; /* T_SEPAR */ - case 2: + case STATE_SEPAR: if (c == ' ' || (c == '\t')) { tok[tok_pos] = c; lexbuf_pos++; @@ -166,7 +256,7 @@ int lexer_get_token(char * tok, int tok_size) { break; /* T_M_START < or </ or <! or <? */ - case 3: + case STATE_T_M_START: switch (c) { case '/': tok[tok_pos] = c; @@ -179,7 +269,7 @@ int lexer_get_token(char * tok, int tok_size) { tok[tok_pos] = c; lexbuf_pos++; tok_pos++; - state = 8; + state = STATE_T_COMMENT; break; case '?': tok[tok_pos] = c; @@ -195,7 +285,7 @@ int lexer_get_token(char * tok, int tok_size) { break; /* T_M_STOP_1 */ - case 4: + case STATE_T_M_STOP_1: tok[tok_pos] = '\0'; if (!in_comment) lex_mode = DATA; @@ -203,7 +293,7 @@ int lexer_get_token(char * tok, int tok_size) { break; /* T_M_STOP_2 */ - case 5: + case STATE_T_M_STOP_2: if (c == '>') { tok[tok_pos] = c; lexbuf_pos++; @@ -219,13 +309,13 @@ int lexer_get_token(char * tok, int tok_size) { break; /* T_EQUAL */ - case 6: + case STATE_T_EQUAL: tok[tok_pos] = '\0'; return T_EQUAL; break; /* T_STRING */ - case 7: + case STATE_T_STRING_DOUBLE: tok[tok_pos] = c; lexbuf_pos++; if (c == '\"') { /* " */ @@ -235,8 +325,8 @@ int lexer_get_token(char * tok, int tok_size) { tok_pos++; break; - /* T_C_START or T_DOCTYPE_START */ - case 8: + /* T_C_START or T_DOCTYPE_START or T_CDATA_START */ + case STATE_T_COMMENT: switch (c) { case '-': lexbuf_pos++; @@ -260,6 +350,17 @@ int lexer_get_token(char * tok, int tok_size) { return T_ERROR; } break; + case '[': + lexbuf_pos++; + if (strncmp(lexbuf + lexbuf_pos, "CDATA[", 6) == 0) { + strncpy (tok + tok_pos, "[CDATA[", 7); /* FIXME */ + lexbuf_pos += 6; + lex_mode = CDATA; + return T_CDATA_START; + } else{ + return T_ERROR; + } + break; default: /* error */ return T_ERROR; @@ -267,12 +368,14 @@ int lexer_get_token(char * tok, int tok_size) { break; /* T_TI_STOP */ - case 9: + case STATE_T_TI_STOP: if (c == '>') { tok[tok_pos] = c; lexbuf_pos++; tok_pos++; /* FIXME */ tok[tok_pos] = '\0'; + if (!in_comment) + lex_mode = DATA; return T_TI_STOP; } else { tok[tok_pos] = '\0'; @@ -281,24 +384,24 @@ int lexer_get_token(char * tok, int tok_size) { break; /* -- */ - case 10: + case STATE_T_DASHDASH: switch (c) { case '-': tok[tok_pos] = c; tok_pos++; lexbuf_pos++; - state = 11; + state = STATE_T_C_STOP; break; default: tok[tok_pos] = c; tok_pos++; lexbuf_pos++; - state = 100; + state = STATE_IDENT; } break; /* --> */ - case 11: + case STATE_T_C_STOP: switch (c) { case '>': tok[tok_pos] = c; @@ -318,12 +421,23 @@ int lexer_get_token(char * tok, int tok_size) { tok[tok_pos] = c; tok_pos++; lexbuf_pos++; - state = 100; + state = STATE_IDENT; } break; + /* T_STRING (single quotes) */ + case STATE_T_STRING_SINGLE: + tok[tok_pos] = c; + lexbuf_pos++; + if (c == '\'') { /* " */ + tok[tok_pos] = '\0'; /* FIXME */ + return T_STRING; + } + tok_pos++; + break; + /* IDENT */ - case 100: + case STATE_IDENT: switch (c) { case '<': case '>': @@ -340,13 +454,13 @@ int lexer_get_token(char * tok, int tok_size) { tok[tok_pos] = c; tok_pos++; lexbuf_pos++; - state = 9; + state = STATE_T_TI_STOP; break; case '-': tok[tok_pos] = c; tok_pos++; lexbuf_pos++; - state = 10; + state = STATE_T_DASHDASH; break; default: tok[tok_pos] = c; @@ -358,8 +472,9 @@ int lexer_get_token(char * tok, int tok_size) { lprintf("expected char \'%c\'\n", tok[tok_pos - 1]); /* FIX ME */ return T_ERROR; } - } else { - /* data mode, stop if char equal '<' */ + break; + + case DATA: /* data mode, stop if char equal '<' */ switch (c) { case '<': @@ -371,6 +486,28 @@ int lexer_get_token(char * tok, int tok_size) { tok_pos++; lexbuf_pos++; } + break; + + case CDATA: /* cdata mode, stop if next token is "]]>" */ + switch (c) + { + case ']': + if (strncmp(lexbuf + lexbuf_pos, "]]>", 3) == 0) { + lexbuf_pos += 3; + lex_mode = DATA; + return T_CDATA_STOP; + } else { + tok[tok_pos] = c; + tok_pos++; + lexbuf_pos++; + } + break; + default: + tok[tok_pos] = c; + tok_pos++; + lexbuf_pos++; + } + break; } } lprintf ("loop done tok_pos = %d, tok_size=%d, lexbuf_pos=%d, lexbuf_size=%d\n", @@ -378,33 +515,44 @@ int lexer_get_token(char * tok, int tok_size) { /* pb */ if (tok_pos >= tok_size) { - lprintf("token buffer is too little\n"); + if (fixed) + return T_ERROR; + *_tok_size *= 2; + *_tok = realloc (*_tok, *_tok_size); + lprintf("token buffer is too small\n"); + lprintf("increasing buffer size to %d bytes\n", *_tok_size); + if (*_tok) { + return lexer_get_token_d (_tok, _tok_size, 0); + } else { + return T_ERROR; + } } else { if (lexbuf_pos >= lexbuf_size) { /* Terminate the current token */ tok[tok_pos] = '\0'; switch (state) { - case 0: - case 1: - case 2: + case STATE_IDLE: + case STATE_EOL: + case STATE_SEPAR: return T_EOF; break; - case 3: + case STATE_T_M_START: return T_M_START_1; break; - case 4: + case STATE_T_M_STOP_1: return T_M_STOP_1; break; - case 5: + case STATE_T_M_STOP_2: return T_ERROR; break; - case 6: + case STATE_T_EQUAL: return T_EQUAL; break; - case 7: + case STATE_T_STRING_SINGLE: + case STATE_T_STRING_DOUBLE: return T_STRING; break; - case 100: + case STATE_IDENT: return T_DATA; break; default: @@ -421,6 +569,12 @@ int lexer_get_token(char * tok, int tok_size) { return T_ERROR; } +/* for ABI compatibility */ +int lexer_get_token (char *tok, int tok_size) +{ + return lexer_get_token_d (&tok, &tok_size, 1); +} + static struct { char code; unsigned char namelen; @@ -448,7 +602,7 @@ char *lexer_decode_entities (const char *tok) { /* parse the character entity (on failure, treat it as literal text) */ const char *tp = tok; - long i; + signed long i; for (i = 0; lexer_entities[i].code; ++i) if (!strncmp (lexer_entities[i].name, tok, lexer_entities[i].namelen) @@ -476,7 +630,7 @@ char *lexer_decode_entities (const char *tok) else i = strtol (tp, (char **)&tp, 10); - if (i < 1 || i > 255 || *tp != ';') + if (*tp != ';' || i < 1) { /* out of range, or format error */ *bp++ = '&'; @@ -484,7 +638,23 @@ char *lexer_decode_entities (const char *tok) } tok = tp + 1; - *bp++ = i; + + if (i < 128) + /* ASCII - store as-is */ + *bp++ = i; + else + { + /* Non-ASCII, so convert to UTF-8 */ + int count = (i >= 0x04000000) ? 5 : + (i >= 0x00200000) ? 4 : + (i >= 0x00010000) ? 3 : + (i >= 0x00000800) ? 2 : 1; + *bp = (char)(0x1F80 >> count); + count *= 6; + *bp++ |= i >> count; + while ((count -= 6) >= 0) + *bp++ = 128 | ((i >> count) & 0x3F); + } } } *bp = 0; |