From 9c57902b966e6d8b5f5dec9635341b48d2ea006f Mon Sep 17 00:00:00 2001 From: Olof hagsand Date: Sun, 18 Nov 2018 13:22:08 +0100 Subject: [PATCH] * XML parser conformance to W3 spec * Names lexically correct (NCName) * Syntactically Correct handling of 'a * b * + * From https://www.w3.org/TR/2009/REC-xml-names-20091208 + * Definitions: + * - XML namespace: is identified by a URI reference [RFC3986]; element and + * attribute names may be placed in an XML namespace using the mechanisms + * described in this specification. + * - Expanded name: is a pair consisting of a namespace name and a local name. + * - Namespace name: For a name N in a namespace identified by a URI I, the + * "namespace name" is I. + * For a name N that is not in a namespace, the "namespace name" has no value. + * - Local name: In either case the "local name" is N. + * It is this combination of the universally managed URI namespace with the + * vocabulary's local names that is effective in avoiding name clashes. */ struct xml{ char *x_name; /* name of node */ char *x_namespace; /* namespace, if any */ +#ifdef notyet + char *x_namespacename; /* namespace name (or NULL) */ + char *x_localname; /* Local name N as defined above */ +#endif struct xml *x_up; /* parent node in hierarchy if any */ struct xml **x_childvec; /* vector of children nodes */ int x_childvec_len;/* length of vector */ @@ -224,7 +240,7 @@ xmlns_check(cxobj *xn, return NULL; } -/*! Check namespace of xml node by searhing recursively among ancestors +/*! Check namespace of xml node by searching recursively among ancestors * @param[in] xn xml node * @param[in] namespace check validity of namespace * @retval 0 Found / validated or no yang spec @@ -1258,15 +1274,18 @@ xmltree2cbuf(cbuf *cb, * @see xml_parse_file * @see xml_parse_string * @see xml_parse_va + * @note special case is empty XML where the parser is not invoked. */ static int -_xml_parse(const char *str, +_xml_parse(const char *str, yang_spec *yspec, cxobj *xt) { int retval = -1; struct xml_parse_yacc_arg ya = {0,}; + if (strlen(str) == 0) + return 0; /* OK */ if (xt == NULL){ clicon_err(OE_XML, errno, "Unexpected NULL XML"); return -1; diff --git a/lib/src/clixon_xml_parse.l b/lib/src/clixon_xml_parse.l index 4e9b297a..39173634 100644 --- a/lib/src/clixon_xml_parse.l +++ b/lib/src/clixon_xml_parse.l @@ -34,6 +34,8 @@ * XML parser * @see https://www.w3.org/TR/2008/REC-xml-20081126 * https://www.w3.org/TR/2009/REC-xml-names-20091208 + * + */ %{ @@ -72,8 +74,22 @@ int clixon_xml_parsewrap(void) return 1; } +/* + * From https://www.w3.org/TR/2008/REC-xml-20081126: + * [4]* NameStartChar ::= ":" | [A-Z] | "_" | [a-z] | [#xC0-#xD6] | [#xD8-#xF6] ... + * [4a] NameChar ::= NameStartChar | "-" | "." | [0-9] | #xB7| + * [5] Name ::= NameStartChar (NameChar)* + * NOTE: From https://www.w3.org/TR/2009/REC-xml-names-20091208: + * [4] NCName ::= Name - (Char* ':' Char*) An XML Name, minus the ":" + * --> namestart and name below is NCNAME + */ + %} +namestart [A-Z_a-z] +namechar [A-Z_a-z\-\.0-9] +ncname {namestart}{namechar}* + %x START %s STATEA %s AMPERSAND @@ -81,36 +97,42 @@ int clixon_xml_parsewrap(void) %s CMNT %s STR %s TEXTDECL +%s PIDECL +%s PIDECL2 %s STRDQ %s STRSQ %% -[0-9A-Za-z_\-]+ { clixon_xml_parselval.string = strdup(yytext); + +[ \t] ; +\n { _YA->ya_linenum++; } + +{ncname} { clixon_xml_parselval.string = strdup(yytext); return NAME; /* rather be catch-all */ } -[ \t]+ ; \: return *clixon_xml_parsetext; \n { _YA->ya_linenum++;} -"<> { return MY_EOF; } +"""/>" { BEGIN(STATEA); return ESLASH; } "" { BEGIN(START); return ECOMMENT; } -\n _YA->ya_linenum++; . encoding return ENC; version return VER; -"=" return *clixon_xml_parsetext; -"?>" { BEGIN(START);return ETEXT;} +standalone return SD; +"=" { return *clixon_xml_parsetext; } +"?>" { BEGIN(START);return EQMARK;} \" { _YA->ya_lex_state =TEXTDECL;BEGIN(STRDQ); return *clixon_xml_parsetext; } \' { _YA->ya_lex_state =TEXTDECL;BEGIN(STRSQ); return *clixon_xml_parsetext; } +. { clixon_xml_parselval.string = yytext; return CHARDATA; /* optimize? */} -1\.[0-9]+ { clixon_xml_parselval.string = strdup(yytext); return CHARDATA; } -[^\"]+ { clixon_xml_parselval.string = strdup(yytext); return CHARDATA; } +{ncname} { clixon_xml_parselval.string = strdup(yytext); + return NAME; /* rather be catch-all */ + } +[ \t] { BEGIN(PIDECL2);} +. { clixon_xml_parselval.string = yytext; return CHARDATA; /* optimize? */} +"?>" { BEGIN(START);return EQMARK;} +[^{?>}]+ { clixon_xml_parselval.string = strdup(yytext); return STRING; } + +1\.[0-9]+ { clixon_xml_parselval.string = strdup(yytext); return STRING; } +[^\"]+ { clixon_xml_parselval.string = strdup(yytext); return STRING; } \" { BEGIN(_YA->ya_lex_state); return *clixon_xml_parsetext; } -1\.[0-9]+ { clixon_xml_parselval.string = strdup(yytext); return CHARDATA; } -[^\']+ { clixon_xml_parselval.string = strdup(yytext); return CHARDATA; } +1\.[0-9]+ { clixon_xml_parselval.string = strdup(yytext); return STRING; } +[^\']+ { clixon_xml_parselval.string = strdup(yytext); return STRING; } \' { BEGIN(_YA->ya_lex_state); return *clixon_xml_parsetext; } %% diff --git a/lib/src/clixon_xml_parse.y b/lib/src/clixon_xml_parse.y index ef9f78a9..a86c7276 100644 --- a/lib/src/clixon_xml_parse.y +++ b/lib/src/clixon_xml_parse.y @@ -39,12 +39,13 @@ char *string; } -%start topxml +%start document -%token NAME CHARDATA -%token VER ENC +%token NAME CHARDATA STRING +%token MY_EOF +%token VER ENC SD %token BSLASH ESLASH -%token BTEXT ETEXT +%token BXMLDCL BQMARK EQMARK %token BCOMMENT ECOMMENT %type attvalue @@ -120,7 +121,8 @@ xml_parse_version(struct xml_parse_yacc_arg *ya, free(ver); return -1; } - free(ver); + if (ver) + free(ver); return 0; } @@ -299,6 +301,11 @@ xml_parse_bslash2(struct xml_parse_yacc_arg *ya, return retval; } +/*! Parse XML attribute + * Special cases: + * - DefaultAttName: xmlns + * - PrefixedAttName: xmlns:NAME + */ static int xml_parse_attr(struct xml_parse_yacc_arg *ya, char *prefix, @@ -308,6 +315,12 @@ xml_parse_attr(struct xml_parse_yacc_arg *ya, int retval = -1; cxobj *xa; +#ifdef notyet + if (prefix && strcmp(prefix,"xmlns")==0) + fprintf(stderr, "PrefixedAttName NCNAME:%s = %s\n", name, attval); + if (prefix==NULL && strcmp(name,"xmlns")==0) + fprintf(stderr, "DefaultAttName = %s\n", attval); +#endif /* notyet */ if ((xa = xml_new(name, ya->ya_xelement, NULL)) == NULL) goto done; xml_type_set(xa, CX_ATTR); @@ -327,69 +340,100 @@ xml_parse_attr(struct xml_parse_yacc_arg *ya, %} %% - -topxml : list - { clicon_debug(3, "topxml->list ACCEPT"); - YYACCEPT; } - | dcl list - { clicon_debug(3, "topxml->dcl list ACCEPT"); - YYACCEPT; } + /* [1] document ::= prolog element Misc* */ +document : prolog element misclist MY_EOF + { clicon_debug(2, "document->prolog element misc* ACCEPT"); + YYACCEPT; } + | elist MY_EOF + { clicon_debug(2, "document->elist ACCEPT"); /* internal exception*/ + YYACCEPT; } + ; +/* [22] prolog ::= XMLDecl? Misc* (doctypedecl Misc*)? */ +prolog : xmldcl misclist + { clicon_debug(2, "prolog->xmldcl misc*"); } + | misclist + { clicon_debug(2, "prolog->misc*"); } ; -dcl : BTEXT info encode ETEXT { clicon_debug(3, "dcl->info encode"); } +misclist : misclist misc { clicon_debug(2, "misclist->misclist misc"); } + | { clicon_debug(2, "misclist->"); } ; -info : VER '=' '\"' CHARDATA '\"' - { if (xml_parse_version(_YA, $4) <0) YYABORT; } - | VER '=' '\'' CHARDATA '\'' - { if (xml_parse_version(_YA, $4) <0) YYABORT; } +/* [27] Misc ::= Comment | PI | S */ +misc : comment { clicon_debug(2, "misc->comment"); } + | pi { clicon_debug(2, "misc->pi"); } + ; + +xmldcl : BXMLDCL verinfo encodingdecl sddecl EQMARK + { clicon_debug(2, "xmldcl->verinfo encodingdecl? sddecl?"); } + ; + +verinfo : VER '=' '\"' STRING '\"' + { if (xml_parse_version(_YA, $4) <0) YYABORT; + clicon_debug(2, "verinfo->version=\"STRING\"");} + | VER '=' '\'' STRING '\'' + { if (xml_parse_version(_YA, $4) <0) YYABORT; + clicon_debug(2, "verinfo->version='STRING'");} + ; + +encodingdecl : ENC '=' '\"' STRING '\"' {if ($4)free($4);} + | ENC '=' '\'' STRING '\'' {if ($4)free($4);} | ; -encode : ENC '=' '\"' CHARDATA '\"' {free($4);} - | ENC '=' '\'' CHARDATA '\'' {free($4);} +sddecl : SD '=' '\"' STRING '\"' {if ($4)free($4);} + | SD '=' '\'' STRING '\'' {if ($4)free($4);} + | ; - +/* [39] element ::= EmptyElemTag | STag content ETag */ element : '<' qname attrs element1 - { clicon_debug(3, "element -> < qname attrs element1"); } - ; + { clicon_debug(2, "element -> < qname attrs element1"); } + ; qname : NAME { if (xml_parse_unprefixed_name(_YA, $1) < 0) YYABORT; - clicon_debug(3, "qname -> NAME %s", $1);} + clicon_debug(2, "qname -> NAME %s", $1);} | NAME ':' NAME { if (xml_parse_prefixed_name(_YA, $1, $3) < 0) YYABORT; - clicon_debug(3, "qname -> NAME : NAME");} + clicon_debug(2, "qname -> NAME : NAME");} ; element1 : ESLASH {_YA->ya_xelement = NULL; - clicon_debug(3, "element1 -> />");} + clicon_debug(2, "element1 -> />");} | '>' { xml_parse_endslash_pre(_YA); } - list { xml_parse_endslash_mid(_YA); } - etg { xml_parse_endslash_post(_YA); - clicon_debug(3, "element1 -> > list etg");} + elist { xml_parse_endslash_mid(_YA); } + endtag { xml_parse_endslash_post(_YA); + clicon_debug(2, "element1 -> > elist endtag");} ; -etg : BSLASH NAME '>' -{ clicon_debug(3, "etg -> < ", $2); if (xml_parse_bslash1(_YA, $2) < 0) YYABORT; } +endtag : BSLASH NAME '>' + { clicon_debug(2, "endtag -> < "); + if (xml_parse_bslash1(_YA, $2) < 0) YYABORT; } | BSLASH NAME ':' NAME '>' { if (xml_parse_bslash2(_YA, $2, $4) < 0) YYABORT; - clicon_debug(3, "etg -> < "); } + clicon_debug(2, "endtag -> < "); } ; -list : list content { clicon_debug(3, "list -> list content"); } - | content { clicon_debug(3, "list -> content"); } +elist : elist content { clicon_debug(2, "elist -> elist content"); } + | content { clicon_debug(2, "elist -> content"); } ; -content : element { clicon_debug(3, "content -> element"); } - | comment { clicon_debug(3, "content -> comment"); } - | CHARDATA { if (xml_parse_content(_YA, $1) < 0) YYABORT; - clicon_debug(3, "content -> CHARDATA %s", $1); } - | { clicon_debug(3, "content -> "); } +/* Rule 43 */ +content : element { clicon_debug(2, "content -> element"); } + | comment { clicon_debug(2, "content -> comment"); } + | pi { clicon_debug(2, "content -> pi"); } + | CHARDATA { if (xml_parse_content(_YA, $1) < 0) YYABORT; + clicon_debug(2, "content -> CHARDATA %s", $1); } + | { clicon_debug(2, "content -> "); } ; comment : BCOMMENT ECOMMENT ; +pi : BQMARK NAME EQMARK {clicon_debug(2, "pi -> "); free($2); } + | BQMARK NAME STRING EQMARK + {clicon_debug(2, "pi -> "); free($2); free($3);} + ; + attrs : attrs attr | @@ -399,9 +443,9 @@ attr : NAME '=' attvalue { if (xml_parse_attr(_YA, NULL, $1, $3) | NAME ':' NAME '=' attvalue { if (xml_parse_attr(_YA, $1, $3, $5) < 0) YYABORT; } ; -attvalue : '\"' CHARDATA '\"' { $$=$2; /* $2 must be consumed */} +attvalue : '\"' STRING '\"' { $$=$2; /* $2 must be consumed */} | '\"' '\"' { $$=strdup(""); /* $2 must be consumed */} - | '\'' CHARDATA '\'' { $$=$2; /* $2 must be consumed */} + | '\'' STRING '\'' { $$=$2; /* $2 must be consumed */} | '\'' '\'' { $$=strdup(""); /* $2 must be consumed */} ; diff --git a/test/lib.sh b/test/lib.sh index 6a5138ac..52e5b119 100755 --- a/test/lib.sh +++ b/test/lib.sh @@ -78,6 +78,9 @@ expectfn(){ expect2= fi ret=$($cmd) +# echo "cmd:\"$cmd\"" +# echo "retval:\"$retval\"" +# echo "ret:\"$ret\"" if [ $? -ne $retval ]; then echo -e "\e[31m\nError in Test$testnr [$testname]:" echo -e "\e[0m:" @@ -134,11 +137,15 @@ $input EOF ) r=$? - if [ $r -ne $retval ]; then + if [ $r != $retval ]; then echo -e "\e[31m\nError ($r != $retval) in Test$testnr [$testname]:" echo -e "\e[0m:" exit -1 fi + # If error dont match output strings + if [ $r != 0 ]; then + return + fi # Match if both are empty string if [ -z "$ret" -a -z "$expect" ]; then diff --git a/test/test_xml.sh b/test/test_xml.sh index b2937201..2aa931e3 100755 --- a/test/test_xml.sh +++ b/test/test_xml.sh @@ -1,5 +1,7 @@ #!/bin/bash # Test: XML parser tests +# @see https://www.w3.org/TR/2008/REC-xml-20081126 +# https://www.w3.org/TR/2009/REC-xml-names-20091208 #PROG="valgrind --leak-check=full --show-leak-kinds=all ../util/clixon_util_xml" PROG=../util/clixon_util_xml @@ -9,6 +11,18 @@ PROG=../util/clixon_util_xml new "xml parse" expecteof "$PROG" 0 "" "^$" +new "xml parse strange names" +expecteof "$PROG" 0 "<_->" "^<_->$" + +new "xml parse name errors" +expecteof "$PROG" 255 "<-a/>" "" + +new "xml parse name errors" +expecteof "$PROG" 255 "<9/>" "" + +new "xml parse name errors" +expecteof "$PROG" 255 "" "" + XML=$(cat <An example of escaped CENDs @@ -52,5 +66,75 @@ expecteof "$PROG" 0 "" '^$' new "Mixed quotes" expecteof "$PROG" 0 "" '^$' +new "XMLdecl version" +expecteof "$PROG" 0 '' '' + +new "XMLdecl version, single quotes" +expecteof "$PROG" 0 "" '' + +new "XMLdecl version no element" +expecteof "$PROG" 255 '' '' + +new "XMLdecl no version" +expecteof "$PROG" 255 '' '' + +new "XMLdecl misspelled version" +expecteof "$PROG" 255 '' '' + +new "XMLdecl version + encoding" +expecteof "$PROG" 0 '' '' + +new "XMLdecl version + misspelled encoding" +expecteof "$PROG" 255 '' '' + +new "XMLdecl version + standalone" +expecteof "$PROG" 0 '' '' + +new "PI - Processing instruction empty" +expecteof "$PROG" 0 '' '' + +new "PI some content" +expecteof "$PROG" 0 '' '' + +new "prolog element misc*" +expecteof "$PROG" 0 '' '' + +# We allow it as an internal necessity for parsing of xml fragments +#new "double element error" +#expecteof "$PROG" 255 '' '' + +new "namespace: DefaultAttName" +expecteof "$PROG" 0 'hello' '^hello$' + +new "namespace: PrefixedAttName" +expecteof "$PROG" 0 'hello' '^hello$' + +new "First example 6.1 from https://www.w3.org/TR/2009/REC-xml-names-20091208" +XML=$(cat < + + + + Frobnostication + Moved to + here. + +EOF +) +expecteof "$PROG" 0 "$XML" "$XML" + +new "Second example 6.1 from https://www.w3.org/TR/2009/REC-xml-names-20091208" +XML=$(cat < + + + Cheaper by the Dozen + 1568491379 + +EOF +) +expecteof "$PROG" 0 "$XML" "$XML" + rm -rf $dir diff --git a/util/Makefile.in b/util/Makefile.in index 4814e91a..5095b612 100644 --- a/util/Makefile.in +++ b/util/Makefile.in @@ -78,7 +78,7 @@ all: $(APPS) @echo "You may want to make clixon_util_stream separately (curl dependency)" clean: - rm -f $(APPS) *.core + rm -f $(APPS) clixon_util_stream *.core # APPS clixon_util_xml: clixon_util_xml.c $(MYLIB) diff --git a/util/clixon_util_xml.c b/util/clixon_util_xml.c index 72730000..29734f40 100644 --- a/util/clixon_util_xml.c +++ b/util/clixon_util_xml.c @@ -48,6 +48,7 @@ #include #include #include +#include #include /* cligen */ @@ -68,21 +69,40 @@ static int usage(char *argv0) { - fprintf(stderr, "usage:%s.\n\tInput on stdin\n", argv0); + fprintf(stderr, "usage:%s [options]\n" + "where options are\n" + "\t-h \t\tHelp\n" + "\t-D \tDebug\n", + argv0); exit(0); } int -main(int argc, char **argv) +main(int argc, + char **argv) { cxobj *xt = NULL; cxobj *xc; cbuf *cb = cbuf_new(); + int retval = -1; + char c; - if (argc != 1){ - usage(argv[0]); - return 0; - } + clicon_log_init("xpath", LOG_DEBUG, CLICON_LOG_STDERR); + optind = 1; + opterr = 0; + while ((c = getopt(argc, argv, "hD:")) != -1) + switch (c) { + case 'h': + usage(argv[0]); + break; + case 'D': + if (sscanf(optarg, "%d", &debug) != 1) + usage(argv[0]); + break; + default: + usage(argv[0]); + break; + } if (xml_parse_file(0, "", NULL, &xt) < 0){ fprintf(stderr, "xml parse error %s\n", clicon_err_reason); goto done; @@ -90,18 +110,20 @@ main(int argc, char **argv) xc = NULL; while ((xc = xml_child_each(xt, xc, -1)) != NULL) clicon_xml2cbuf(cb, xc, 0, 0); /* print xml */ - fprintf(stdout, "%s\n", cbuf_get(cb)); + fprintf(stdout, "%s", cbuf_get(cb)); + fflush(stdout); #if 0 cbuf_reset(cb); xmltree2cbuf(cb, xt, 0); /* dump data structures */ fprintf(stderr, "%s\n", cbuf_get(cb)); #endif + retval = 0; done: if (xt) xml_free(xt); if (cb) cbuf_free(cb); - return 0; + return retval; } diff --git a/util/clixon_util_xpath.c b/util/clixon_util_xpath.c index 2ee8d2e8..dcbee236 100644 --- a/util/clixon_util_xpath.c +++ b/util/clixon_util_xpath.c @@ -133,13 +133,14 @@ main(int argc, char **argv) clicon_log_init("xpath", LOG_DEBUG, CLICON_LOG_STDERR); optind = 1; opterr = 0; - while ((c = getopt(argc, argv, "hDf:p:i:")) != -1) + while ((c = getopt(argc, argv, "hD:f:p:i:")) != -1) switch (c) { case 'h': usage(argv0); break; case 'D': - debug++; + if (sscanf(optarg, "%d", &debug) != 1) + usage(argv0); break; case 'f': /* XML file */ filename = optarg;