diff options
Diffstat (limited to 'src/xine-utils/xmlparser.c')
-rw-r--r-- | src/xine-utils/xmlparser.c | 419 |
1 files changed, 355 insertions, 64 deletions
diff --git a/src/xine-utils/xmlparser.c b/src/xine-utils/xmlparser.c index 1b82074fb..8619b1be8 100644 --- a/src/xine-utils/xmlparser.c +++ b/src/xine-utils/xmlparser.c @@ -22,6 +22,12 @@ #ifdef HAVE_CONFIG_H #include "config.h" #endif + +#ifdef XINE_COMPILE +# include "config.h" +#endif + +#include <unistd.h> #include <stdio.h> #include <stdlib.h> #include <string.h> @@ -40,13 +46,13 @@ */ #ifdef XINE_COMPILE -#include "xineutils.h" +#include <xine/xineutils.h> #else #define lprintf(...) #define XINE_MALLOC #endif -#include "xmllexer.h" -#include "xmlparser.h" +#include <xine/xmllexer.h> +#include <xine/xmlparser.h> #define TOKEN_SIZE 4 * 1024 @@ -80,8 +86,11 @@ static xml_node_t * new_xml_node(void) { return new_node; } +static const char cdata[] = CDATA_MARKER; + static void free_xml_node(xml_node_t * node) { - free (node->name); + if (node->name != cdata) + free (node->name); free (node->data); free(node); } @@ -170,23 +179,86 @@ void xml_parser_free_tree(xml_node_t *current_node) { xml_parser_free_tree_rec(current_node, 1); } -#define STATE_IDLE 0 -#define STATE_NODE 1 -#define STATE_COMMENT 7 +typedef enum { + /*0*/ + STATE_IDLE, + /* <foo ...> */ + STATE_NODE, + STATE_ATTRIBUTE, + STATE_NODE_CLOSE, + STATE_TAG_TERM, + STATE_ATTRIBUTE_EQUALS, + STATE_STRING, + STATE_TAG_TERM_IGNORE, + /* <?foo ...?> */ + STATE_Q_NODE, + STATE_Q_ATTRIBUTE, + STATE_Q_NODE_CLOSE, + STATE_Q_TAG_TERM, + STATE_Q_ATTRIBUTE_EQUALS, + STATE_Q_STRING, + /* Others */ + STATE_COMMENT, + STATE_DOCTYPE, + STATE_CDATA, +} parser_state_t; + +static xml_node_t *xml_parser_append_text (xml_node_t *node, xml_node_t *subnode, const char *text, int flags) +{ + if (!text || !*text) + return subnode; /* empty string -> nothing to do */ + + if ((flags & XML_PARSER_MULTI_TEXT) && subnode) { + /* we have a subtree, so we can't use node->data */ + if (subnode->name == cdata) { + /* most recent node is CDATA - append to it */ + char *newtext; + if (asprintf (&newtext, "%s%s", subnode->data, text) < 0) + newtext = NULL; + free (subnode->data); + subnode->data = newtext; + } else { + /* most recent node is not CDATA - add a sibling */ + subnode->next = new_xml_node (); + subnode->next->name = cdata; + subnode->next->data = strdup (text); + subnode = subnode->next; + } + } else if (node->data) { + /* "no" subtree, but we have existing text - append to it */ + char *newtext; + if (asprintf (&newtext, "%s%s", node->data, text) < 0) + newtext = NULL; + free (node->data); + node->data = newtext; + } else { + /* no text, "no" subtree - duplicate & assign */ + while (isspace (*text)) + ++text; + if (*text) + node->data = strdup (text); + } + + return subnode; +} -static int xml_parser_get_node (xml_parser_t *xml_parser, xml_node_t *current_node, char *root_name, int rec); +#define Q_STATE(CURRENT,NEW) (STATE_##NEW + state - STATE_##CURRENT) -static int _xml_parser_get_node (xml_parser_t *xml_parser, char ** token_buffer, int * token_buffer_size, + +static int xml_parser_get_node_internal (xml_parser_t *xml_parser, + char ** token_buffer, int * token_buffer_size, char ** pname_buffer, int * pname_buffer_size, char ** nname_buffer, int * nname_buffer_size, - xml_node_t *current_node, char *root_name, int rec) { + xml_node_t *current_node, char *root_names[], int rec, int flags) +{ char *tok = *token_buffer; char *property_name = *pname_buffer; char *node_name = *nname_buffer; - int state = STATE_IDLE; + parser_state_t state = STATE_IDLE; int res = 0; int parse_res; int bypass_get_token = 0; + int retval = 0; /* used when state==4; non-0 if there are missing </...> */ xml_node_t *subtree = NULL; xml_node_t *current_subtree = NULL; xml_property_t *current_property = NULL; @@ -209,30 +281,33 @@ static int _xml_parser_get_node (xml_parser_t *xml_parser, char ** token_buffer, /* do nothing */ break; case (T_EOF): - return 0; /* normal end */ + return retval; /* normal end */ break; case (T_M_START_1): state = STATE_NODE; break; case (T_M_START_2): - state = 3; + state = STATE_NODE_CLOSE; break; case (T_C_START): state = STATE_COMMENT; break; case (T_TI_START): - state = 8; + state = STATE_Q_NODE; break; case (T_DOCTYPE_START): - state = 9; + state = STATE_DOCTYPE; + break; + case (T_CDATA_START): + state = STATE_CDATA; break; case (T_DATA): /* current data */ - if (current_node->data) { - /* avoid a memory leak */ - free(current_node->data); + { + char *decoded = lexer_decode_entities (tok); + current_subtree = xml_parser_append_text (current_node, current_subtree, decoded, flags); + free (decoded); } - current_node->data = lexer_decode_entities(tok); lprintf("info: node data : %s\n", current_node->data); break; default: @@ -243,6 +318,7 @@ static int _xml_parser_get_node (xml_parser_t *xml_parser, char ** token_buffer, break; case STATE_NODE: + case STATE_Q_NODE: switch (res) { case (T_IDENT): properties = NULL; @@ -252,14 +328,19 @@ static int _xml_parser_get_node (xml_parser_t *xml_parser, char ** token_buffer, if (xml_parser->mode == XML_PARSER_CASE_INSENSITIVE) { strtoupper(tok); } - /* make sure the buffer for the node name is big enough */ - if (*token_buffer_size > *nname_buffer_size) { - *nname_buffer_size = *token_buffer_size; - *nname_buffer = realloc (*nname_buffer, *nname_buffer_size); - node_name = *nname_buffer; + if (state == STATE_Q_NODE) { + if (asprintf (&node_name, "?%s", tok) < 0) + node_name = NULL; + free (*nname_buffer); + *nname_buffer = node_name; + *nname_buffer_size = strlen (node_name) + 1; + state = STATE_Q_ATTRIBUTE; + } else { + free (*nname_buffer); + *nname_buffer = node_name = strdup (tok); + *nname_buffer_size = strlen (node_name) + 1; + state = STATE_ATTRIBUTE; } - strcpy(node_name, tok); - state = 2; lprintf("info: current node name \"%s\"\n", node_name); break; default: @@ -268,7 +349,8 @@ static int _xml_parser_get_node (xml_parser_t *xml_parser, char ** token_buffer, break; } break; - case 2: + + case STATE_ATTRIBUTE: switch (res) { case (T_EOL): case (T_SEPAR): @@ -284,8 +366,13 @@ static int _xml_parser_get_node (xml_parser_t *xml_parser, char ** token_buffer, /* set node propertys */ subtree->props = properties; lprintf("info: rec %d new subtree %s\n", rec, node_name); - parse_res = xml_parser_get_node(xml_parser, subtree, node_name, rec + 1); - if (parse_res != 0) { + root_names[rec + 1] = strdup (node_name); + parse_res = xml_parser_get_node_internal (xml_parser, token_buffer, token_buffer_size, + pname_buffer, pname_buffer_size, + nname_buffer, nname_buffer_size, + subtree, root_names, rec + 1, flags); + free (root_names[rec + 1]); + if (parse_res == -1 || parse_res > 0) { return parse_res; } if (current_subtree == NULL) { @@ -295,11 +382,16 @@ static int _xml_parser_get_node (xml_parser_t *xml_parser, char ** token_buffer, current_subtree->next = subtree; current_subtree = subtree; } + if (parse_res < -1) { + /* badly-formed XML (missing close tag) */ + return parse_res + 1 + (parse_res == -2); + } state = STATE_IDLE; break; case (T_M_STOP_2): /* new leaf */ /* new subtree */ + new_leaf: subtree = new_xml_node(); /* set node name */ @@ -321,6 +413,7 @@ static int _xml_parser_get_node (xml_parser_t *xml_parser, char ** token_buffer, break; case (T_IDENT): /* save property name */ + new_prop: if (xml_parser->mode == XML_PARSER_CASE_INSENSITIVE) { strtoupper(tok); } @@ -331,7 +424,7 @@ static int _xml_parser_get_node (xml_parser_t *xml_parser, char ** token_buffer, property_name = *pname_buffer; } strcpy(property_name, tok); - state = 5; + state = Q_STATE(ATTRIBUTE, ATTRIBUTE_EQUALS); lprintf("info: current property name \"%s\"\n", property_name); break; default: @@ -341,17 +434,50 @@ static int _xml_parser_get_node (xml_parser_t *xml_parser, char ** token_buffer, } break; - case 3: + case STATE_Q_ATTRIBUTE: + switch (res) { + case (T_EOL): + case (T_SEPAR): + /* nothing */ + break; + case (T_TI_STOP): + goto new_leaf; + case (T_IDENT): + goto new_prop; + default: + lprintf("error: unexpected token \"%s\", state %d\n", tok, state); + return -1; + break; + } + break; + + case STATE_NODE_CLOSE: switch (res) { case (T_IDENT): /* must be equal to root_name */ if (xml_parser->mode == XML_PARSER_CASE_INSENSITIVE) { strtoupper(tok); } - if (strcmp(tok, root_name) == 0) { - state = 4; - } else { - lprintf("error: xml struct, tok=%s, waited_tok=%s\n", tok, root_name); + if (strcmp(tok, root_names[rec]) == 0) { + state = STATE_TAG_TERM; + } else if (flags & XML_PARSER_RELAXED) { + int r = rec; + while (--r >= 0) + if (strcmp(tok, root_names[r]) == 0) { + lprintf("warning: wanted %s, got %s - assuming missing close tags\n", root_names[rec], tok); + retval = r - rec - 1; /* -1 - (no. of implied close tags) */ + state = STATE_TAG_TERM; + break; + } + /* relaxed parsing, ignoring extra close tag (but we don't handle out-of-order) */ + if (r < 0) { + lprintf("warning: extra close tag %s - ignoring\n", tok); + state = STATE_TAG_TERM_IGNORE; + } + } + else + { + lprintf("error: xml struct, tok=%s, waited_tok=%s\n", tok, root_names[rec]); return -1; } break; @@ -363,10 +489,10 @@ static int _xml_parser_get_node (xml_parser_t *xml_parser, char ** token_buffer, break; /* > expected */ - case 4: + case STATE_TAG_TERM: switch (res) { case (T_M_STOP_1): - return 0; + return retval; break; default: lprintf("error: unexpected token \"%s\", state %d\n", tok, state); @@ -376,18 +502,18 @@ static int _xml_parser_get_node (xml_parser_t *xml_parser, char ** token_buffer, break; /* = or > or ident or separator expected */ - case 5: + case STATE_ATTRIBUTE_EQUALS: switch (res) { case (T_EOL): case (T_SEPAR): /* do nothing */ break; case (T_EQUAL): - state = 6; + state = STATE_STRING; break; case (T_IDENT): bypass_get_token = 1; /* jump to state 2 without get a new token */ - state = 2; + state = STATE_ATTRIBUTE; break; case (T_M_STOP_1): /* add a new property without value */ @@ -401,7 +527,42 @@ static int _xml_parser_get_node (xml_parser_t *xml_parser, char ** token_buffer, current_property->name = strdup (property_name); lprintf("info: new property %s\n", current_property->name); bypass_get_token = 1; /* jump to state 2 without get a new token */ - state = 2; + state = STATE_ATTRIBUTE; + break; + default: + lprintf("error: unexpected token \"%s\", state %d\n", tok, state); + return -1; + break; + } + break; + + /* = or ?> or ident or separator expected */ + case STATE_Q_ATTRIBUTE_EQUALS: + switch (res) { + case (T_EOL): + case (T_SEPAR): + /* do nothing */ + break; + case (T_EQUAL): + state = STATE_Q_STRING; + break; + case (T_IDENT): + bypass_get_token = 1; /* jump to state 2 without get a new token */ + state = STATE_Q_ATTRIBUTE; + break; + case (T_TI_STOP): + /* add a new property without value */ + if (current_property == NULL) { + properties = new_xml_property(); + current_property = properties; + } else { + current_property->next = new_xml_property(); + current_property = current_property->next; + } + current_property->name = strdup (property_name); + lprintf("info: new property %s\n", current_property->name); + bypass_get_token = 1; /* jump to state 2 without get a new token */ + state = STATE_Q_ATTRIBUTE; break; default: lprintf("error: unexpected token \"%s\", state %d\n", tok, state); @@ -411,7 +572,8 @@ static int _xml_parser_get_node (xml_parser_t *xml_parser, char ** token_buffer, break; /* string or ident or separator expected */ - case 6: + case STATE_STRING: + case STATE_Q_STRING: switch (res) { case (T_EOL): case (T_SEPAR): @@ -430,7 +592,7 @@ static int _xml_parser_get_node (xml_parser_t *xml_parser, char ** token_buffer, current_property->name = strdup(property_name); current_property->value = lexer_decode_entities(tok); lprintf("info: new property %s=%s\n", current_property->name, current_property->value); - state = 2; + state = Q_STATE(STRING, ATTRIBUTE); break; default: lprintf("error: unexpected token \"%s\", state %d\n", tok, state); @@ -446,31 +608,45 @@ static int _xml_parser_get_node (xml_parser_t *xml_parser, char ** token_buffer, state = STATE_IDLE; break; default: - state = STATE_COMMENT; break; } break; - /* ?> expected */ - case 8: + /* > expected */ + case STATE_DOCTYPE: switch (res) { - case (T_TI_STOP): + case (T_M_STOP_1): state = 0; break; default: - state = 8; break; } break; - /* > expected */ - case 9: + /* ]]> expected */ + case STATE_CDATA: + switch (res) { + case (T_CDATA_STOP): + current_subtree = xml_parser_append_text (current_node, current_subtree, tok, flags); + lprintf("info: node cdata : %s\n", tok); + state = STATE_IDLE; + break; + default: + lprintf("error: unexpected token \"%s\", state %d\n", tok, state); + return -1; + break; + } + break; + + /* > expected (following unmatched "</...") */ + case STATE_TAG_TERM_IGNORE: switch (res) { case (T_M_STOP_1): - state = 0; + state = STATE_IDLE; break; default: - state = 9; + lprintf("error: unexpected token \"%s\", state %d\n", tok, state); + return -1; break; } break; @@ -491,7 +667,7 @@ static int _xml_parser_get_node (xml_parser_t *xml_parser, char ** token_buffer, } } -static int xml_parser_get_node (xml_parser_t *xml_parser, xml_node_t *current_node, char *root_name, int rec) +static int xml_parser_get_node (xml_parser_t *xml_parser, xml_node_t *current_node, int flags) { int res = 0; int token_buffer_size = TOKEN_SIZE; @@ -500,11 +676,14 @@ static int xml_parser_get_node (xml_parser_t *xml_parser, xml_node_t *current_no char *token_buffer = calloc(1, token_buffer_size); char *pname_buffer = calloc(1, pname_buffer_size); char *nname_buffer = calloc(1, nname_buffer_size); + char *root_names[MAX_RECURSION + 1]; + root_names[0] = ""; - res = _xml_parser_get_node(xml_parser, &token_buffer, &token_buffer_size, + res = xml_parser_get_node_internal (xml_parser, + &token_buffer, &token_buffer_size, &pname_buffer, &pname_buffer_size, &nname_buffer, &nname_buffer_size, - current_node, root_name, rec); + current_node, root_names, 0, flags); free (token_buffer); free (pname_buffer); @@ -514,18 +693,48 @@ static int xml_parser_get_node (xml_parser_t *xml_parser, xml_node_t *current_no } /* for ABI compatibility */ -int xml_parser_build_tree(xml_node_t **root_node) { - return xml_parser_build_tree_r(static_xml_parser, root_node); +int xml_parser_build_tree_with_options(xml_node_t **root_node, int flags) { + return xml_parser_build_tree_with_options_r(static_xml_parser, root_node, flags); } -int xml_parser_build_tree_r(xml_parser_t *xml_parser, xml_node_t **root_node) { - xml_node_t *tmp_node; +int xml_parser_build_tree_with_options_r(xml_parser_t *xml_parser, xml_node_t **root_node, int flags) { + xml_node_t *tmp_node, *pri_node, *q_node; int res; tmp_node = new_xml_node(); - res = xml_parser_get_node(xml_parser, tmp_node, "", 0); - if ((tmp_node->child) && (!tmp_node->child->next)) { - *root_node = tmp_node->child; + res = xml_parser_get_node(xml_parser, tmp_node, flags); + + /* delete any top-level [CDATA] nodes */; + pri_node = tmp_node->child; + q_node = NULL; + while (pri_node) { + if (pri_node->name == cdata) { + xml_node_t *old = pri_node; + if (q_node) + q_node->next = pri_node->next; + else + q_node = pri_node; + pri_node = pri_node->next; + free_xml_node (old); + } else { + q_node = pri_node; + pri_node = pri_node->next; + } + } + + /* find first non-<?...?> node */; + for (pri_node = tmp_node->child, q_node = NULL; + pri_node && pri_node->name[0] == '?'; + pri_node = pri_node->next) + q_node = pri_node; /* last <?...?> node (eventually), or NULL */ + + if (pri_node && !pri_node->next) { + /* move the tail to the head (for compatibility reasons) */ + if (q_node) { + pri_node->next = tmp_node->child; + q_node->next = NULL; + } + *root_node = pri_node; free_xml_node(tmp_node); res = 0; } else { @@ -536,6 +745,15 @@ int xml_parser_build_tree_r(xml_parser_t *xml_parser, xml_node_t **root_node) { return res; } +/* for ABI compatibility */ +int xml_parser_build_tree(xml_node_t **root_node) { + return xml_parser_build_tree_with_options_r (static_xml_parser, root_node, 0); +} + +int xml_parser_build_tree_r(xml_parser_t *xml_parser, xml_node_t **root_node) { + return xml_parser_build_tree_with_options_r(xml_parser, root_node, 0); +} + const char *xml_parser_get_property (const xml_node_t *node, const char *name) { xml_property_t *prop; @@ -637,7 +855,7 @@ static void xml_parser_dump_node (const xml_node_t *node, int indent) { free (value); p = p->next; if (p) { - printf ("\n%*s", (int)(indent+2+l), ""); + printf ("\n%*s", indent+2+l, ""); } } printf (">\n"); @@ -654,5 +872,78 @@ static void xml_parser_dump_node (const xml_node_t *node, int indent) { } void xml_parser_dump_tree (const xml_node_t *node) { - xml_parser_dump_node (node, 0); + do { + xml_parser_dump_node (node, 0); + node = node->next; + } while (node); } + +#ifdef XINE_XML_PARSER_TEST +#include <sys/mman.h> +#include <sys/types.h> +#include <sys/stat.h> +#include <fcntl.h> + +void *xine_xmalloc (size_t size) +{ + return malloc (size); +} + +int main (int argc, char **argv) +{ + int i, ret = 0; + for (i = 1; argv[i]; ++i) + { + xml_node_t *tree; + int fd; + void *buf; + struct stat st; + + if (stat (argv[i], &st)) + { + perror (argv[i]); + ret = 1; + continue; + } + if (!S_ISREG (st.st_mode)) + { + printf ("%s: not a file\n", argv[i]); + ret = 1; + continue; + } + fd = open (argv[i], O_RDONLY); + if (!fd) + { + perror (argv[i]); + ret = 1; + continue; + } + buf = mmap (NULL, st.st_size, PROT_READ, MAP_SHARED, fd, 0); + if (!buf) + { + perror (argv[i]); + if (close (fd)) + perror (argv[i]); + ret = 1; + continue; + } + + xml_parser_init (buf, st.st_size, 0); + if (!xml_parser_build_tree (&tree)) + { + puts (argv[i]); + xml_parser_dump_tree (tree); + xml_parser_free_tree (tree); + } + else + printf ("%s: parser failure\n", argv[i]); + + if (close (fd)) + { + perror (argv[i]); + ret = 1; + } + } + return ret; +} +#endif |