* XML parser conformance to W3 spec

* Names lexically correct (NCName)
  * Syntactically correct handling of '<?' (processing instructions) and '<?xml' (XML declaration)
  * XML prolog syntax for 'well-formed' XML
  * <!DOCTYPE (i.e. DTD) is not supported.
This commit is contained in:
Olof hagsand 2018-11-18 13:22:08 +01:00
parent 9bd0dc42c6
commit 9c57902b96
9 changed files with 280 additions and 69 deletions

View file

@ -99,10 +99,26 @@
* <x>a</<x>
* <x>b</<x>
* </c>
* From https://www.w3.org/TR/2009/REC-xml-names-20091208
* Definitions:
* - XML namespace: is identified by a URI reference [RFC3986]; element and
* attribute names may be placed in an XML namespace using the mechanisms
* described in this specification.
* - Expanded name: is a pair consisting of a namespace name and a local name.
* - Namespace name: For a name N in a namespace identified by a URI I, the
* "namespace name" is I.
* For a name N that is not in a namespace, the "namespace name" has no value.
* - Local name: In either case the "local name" is N.
* It is this combination of the universally managed URI namespace with the
* vocabulary's local names that is effective in avoiding name clashes.
*/
struct xml{
char *x_name; /* name of node */
char *x_namespace; /* namespace, if any */
#ifdef notyet
char *x_namespacename; /* namespace name (or NULL) */
char *x_localname; /* Local name N as defined above */
#endif
struct xml *x_up; /* parent node in hierarchy if any */
struct xml **x_childvec; /* vector of children nodes */
int x_childvec_len;/* length of vector */
@ -224,7 +240,7 @@ xmlns_check(cxobj *xn,
return NULL;
}
/*! Check namespace of xml node by searhing recursively among ancestors
/*! Check namespace of xml node by searching recursively among ancestors
* @param[in] xn xml node
* @param[in] namespace check validity of namespace
* @retval 0 Found / validated or no yang spec
@ -1258,15 +1274,18 @@ xmltree2cbuf(cbuf *cb,
* @see xml_parse_file
* @see xml_parse_string
* @see xml_parse_va
* @note special case is empty XML where the parser is not invoked.
*/
static int
_xml_parse(const char *str,
_xml_parse(const char *str,
yang_spec *yspec,
cxobj *xt)
{
int retval = -1;
struct xml_parse_yacc_arg ya = {0,};
if (strlen(str) == 0)
return 0; /* OK */
if (xt == NULL){
clicon_err(OE_XML, errno, "Unexpected NULL XML");
return -1;

View file

@ -34,6 +34,8 @@
* XML parser
* @see https://www.w3.org/TR/2008/REC-xml-20081126
* https://www.w3.org/TR/2009/REC-xml-names-20091208
*
*/
%{
@ -72,8 +74,22 @@ int clixon_xml_parsewrap(void)
return 1;
}
/*
* From https://www.w3.org/TR/2008/REC-xml-20081126:
* [4]  NameStartChar ::= ":" | [A-Z] | "_" | [a-z] | [#xC0-#xD6] | [#xD8-#xF6] | ...
* [4a] NameChar ::= NameStartChar | "-" | "." | [0-9] | #xB7 | ...
* [5] Name ::= NameStartChar (NameChar)*
* NOTE: From https://www.w3.org/TR/2009/REC-xml-names-20091208:
* [4] NCName ::= Name - (Char* ':' Char*) An XML Name, minus the ":"
* --> namestart, namechar and ncname below implement NCName (ASCII subset only)
*/
%}
namestart [A-Z_a-z]
namechar [A-Z_a-z\-\.0-9]
ncname {namestart}{namechar}*
%x START
%s STATEA
%s AMPERSAND
@ -81,36 +97,42 @@ int clixon_xml_parsewrap(void)
%s CMNT
%s STR
%s TEXTDECL
%s PIDECL
%s PIDECL2
%s STRDQ
%s STRSQ
%%
<START>[0-9A-Za-z_\-]+ { clixon_xml_parselval.string = strdup(yytext);
<START,TEXTDECL>[ \t] ;
<START,STATEA,CMNT,TEXTDECL>\n { _YA->ya_linenum++; }
<START>{ncname} { clixon_xml_parselval.string = strdup(yytext);
return NAME; /* rather be catch-all */
}
<START>[ \t]+ ;
<START>\: return *clixon_xml_parsetext;
<START>\n { _YA->ya_linenum++;}
<START>"<?xml" { BEGIN(TEXTDECL); return BTEXT;}
<START><<EOF>> { return MY_EOF; }
<START>"<?xml" { BEGIN(TEXTDECL); return BXMLDCL;}
<START>"<?" { BEGIN(PIDECL); return BQMARK;}
<START>"/>" { BEGIN(STATEA); return ESLASH; }
<START>"<!--" { BEGIN(CMNT); return BCOMMENT; }
<START>"</" return BSLASH;
<START>[/=] return *clixon_xml_parsetext;
<START>\< return *clixon_xml_parsetext;
<START>\> { BEGIN(STATEA); return *clixon_xml_parsetext; }
<START>\" { _YA->ya_lex_state=START;BEGIN(STRDQ); return *clixon_xml_parsetext; }
<START>\' { _YA->ya_lex_state=START;BEGIN(STRSQ); return *clixon_xml_parsetext; }
<START>. { clixon_xml_parselval.string = yytext; return CHARDATA; /*XXX:optimize*/ }
<START>. { clixon_xml_parselval.string = yytext; return CHARDATA; /* optimize? */}
<STATEA>"</" { BEGIN(START); return BSLASH; }
<STATEA><<EOF>> { return MY_EOF; }
<STATEA>"<!--" { BEGIN(CMNT); return BCOMMENT; }
<STATEA>"<![CDATA[" { BEGIN(CDATA); _YA->ya_lex_state = STATEA; clixon_xml_parselval.string = yytext; return CHARDATA;}
<STATEA>"<?" { BEGIN(PIDECL); return BQMARK; }
<STATEA>\< { BEGIN(START); return *clixon_xml_parsetext; }
<STATEA>& { _YA->ya_lex_state =STATEA;BEGIN(AMPERSAND);}
<STATEA>\n { clixon_xml_parselval.string = yytext;_YA->ya_linenum++; return (CHARDATA);}
<STATEA>. { clixon_xml_parselval.string = yytext; return CHARDATA; /*XXX:optimize*/}
<STATEA>. { clixon_xml_parselval.string = yytext; return CHARDATA; }
/* @see xml_chardata_encode */
<AMPERSAND>"amp;" { BEGIN(_YA->ya_lex_state); clixon_xml_parselval.string = "&"; return CHARDATA;}
@ -124,21 +146,30 @@ int clixon_xml_parsewrap(void)
<CDATA>"]]>" { BEGIN(_YA->ya_lex_state); clixon_xml_parselval.string = yytext; return CHARDATA;}
<CMNT>"-->" { BEGIN(START); return ECOMMENT; }
<CMNT>\n _YA->ya_linenum++;
<CMNT>.
<TEXTDECL>encoding return ENC;
<TEXTDECL>version return VER;
<TEXTDECL>"=" return *clixon_xml_parsetext;
<TEXTDECL>"?>" { BEGIN(START);return ETEXT;}
<TEXTDECL>standalone return SD;
<TEXTDECL>"=" { return *clixon_xml_parsetext; }
<TEXTDECL>"?>" { BEGIN(START);return EQMARK;}
<TEXTDECL>\" { _YA->ya_lex_state =TEXTDECL;BEGIN(STRDQ); return *clixon_xml_parsetext; }
<TEXTDECL>\' { _YA->ya_lex_state =TEXTDECL;BEGIN(STRSQ); return *clixon_xml_parsetext; }
<TEXTDECL>. { clixon_xml_parselval.string = yytext; return CHARDATA; /* optimize? */}
<STRDQ>1\.[0-9]+ { clixon_xml_parselval.string = strdup(yytext); return CHARDATA; }
<STRDQ>[^\"]+ { clixon_xml_parselval.string = strdup(yytext); return CHARDATA; }
<PIDECL>{ncname}        { /* PI target name */
                          clixon_xml_parselval.string = strdup(yytext);
                          return NAME; /* rather be catch-all */
                        }
<PIDECL>"?>"            { /* PI without data and without whitespace, eg <?target?> */
                          BEGIN(START); return EQMARK; }
<PIDECL>[ \t]           { BEGIN(PIDECL2);}
<PIDECL>\n              { /* newline is also a valid separator after the target */
                          _YA->ya_linenum++; BEGIN(PIDECL2); }
<PIDECL>.               { clixon_xml_parselval.string = yytext; return CHARDATA; /* optimize? */}
<PIDECL2>"?>"           { BEGIN(START);return EQMARK;}
<PIDECL2>([^?]|\?+[^?>])+ { /* PI data: everything up to, but not including, the
                             * terminating ?> sequence. The former character class
                             * listed curly braces, ? and > as literal members, so
                             * data containing any of them was cut short and the
                             * leftover character fell through to the default rule. */
                          clixon_xml_parselval.string = strdup(yytext); return STRING; }
<STRDQ>1\.[0-9]+ { clixon_xml_parselval.string = strdup(yytext); return STRING; }
<STRDQ>[^\"]+ { clixon_xml_parselval.string = strdup(yytext); return STRING; }
<STRDQ>\" { BEGIN(_YA->ya_lex_state); return *clixon_xml_parsetext; }
<STRSQ>1\.[0-9]+ { clixon_xml_parselval.string = strdup(yytext); return CHARDATA; }
<STRSQ>[^\']+ { clixon_xml_parselval.string = strdup(yytext); return CHARDATA; }
<STRSQ>1\.[0-9]+ { clixon_xml_parselval.string = strdup(yytext); return STRING; }
<STRSQ>[^\']+ { clixon_xml_parselval.string = strdup(yytext); return STRING; }
<STRSQ>\' { BEGIN(_YA->ya_lex_state); return *clixon_xml_parsetext; }
%%

View file

@ -39,12 +39,13 @@
char *string;
}
%start topxml
%start document
%token <string> NAME CHARDATA
%token VER ENC
%token <string> NAME CHARDATA STRING
%token MY_EOF
%token VER ENC SD
%token BSLASH ESLASH
%token BTEXT ETEXT
%token BXMLDCL BQMARK EQMARK
%token BCOMMENT ECOMMENT
%type <string> attvalue
@ -120,7 +121,8 @@ xml_parse_version(struct xml_parse_yacc_arg *ya,
free(ver);
return -1;
}
free(ver);
if (ver)
free(ver);
return 0;
}
@ -299,6 +301,11 @@ xml_parse_bslash2(struct xml_parse_yacc_arg *ya,
return retval;
}
/*! Parse XML attribute
* Special cases:
* - DefaultAttName: xmlns
* - PrefixedAttName: xmlns:NAME
*/
static int
xml_parse_attr(struct xml_parse_yacc_arg *ya,
char *prefix,
@ -308,6 +315,12 @@ xml_parse_attr(struct xml_parse_yacc_arg *ya,
int retval = -1;
cxobj *xa;
#ifdef notyet
if (prefix && strcmp(prefix,"xmlns")==0)
fprintf(stderr, "PrefixedAttName NCNAME:%s = %s\n", name, attval);
if (prefix==NULL && strcmp(name,"xmlns")==0)
fprintf(stderr, "DefaultAttName = %s\n", attval);
#endif /* notyet */
if ((xa = xml_new(name, ya->ya_xelement, NULL)) == NULL)
goto done;
xml_type_set(xa, CX_ATTR);
@ -327,69 +340,100 @@ xml_parse_attr(struct xml_parse_yacc_arg *ya,
%}
%%
topxml : list
{ clicon_debug(3, "topxml->list ACCEPT");
YYACCEPT; }
| dcl list
{ clicon_debug(3, "topxml->dcl list ACCEPT");
YYACCEPT; }
/* [1] document ::= prolog element Misc* */
document : prolog element misclist MY_EOF
{ clicon_debug(2, "document->prolog element misc* ACCEPT");
YYACCEPT; }
| elist MY_EOF
{ clicon_debug(2, "document->elist ACCEPT"); /* internal exception*/
YYACCEPT; }
;
/* [22] prolog ::= XMLDecl? Misc* (doctypedecl Misc*)? */
prolog : xmldcl misclist
{ clicon_debug(2, "prolog->xmldcl misc*"); }
| misclist
{ clicon_debug(2, "prolog->misc*"); }
;
dcl : BTEXT info encode ETEXT { clicon_debug(3, "dcl->info encode"); }
misclist : misclist misc { clicon_debug(2, "misclist->misclist misc"); }
| { clicon_debug(2, "misclist->"); }
;
info : VER '=' '\"' CHARDATA '\"'
{ if (xml_parse_version(_YA, $4) <0) YYABORT; }
| VER '=' '\'' CHARDATA '\''
{ if (xml_parse_version(_YA, $4) <0) YYABORT; }
/* [27] Misc ::= Comment | PI | S */
misc : comment { clicon_debug(2, "misc->comment"); }
| pi { clicon_debug(2, "misc->pi"); }
;
xmldcl : BXMLDCL verinfo encodingdecl sddecl EQMARK
{ clicon_debug(2, "xmldcl->verinfo encodingdecl? sddecl?"); }
;
verinfo : VER '=' '\"' STRING '\"'
{ if (xml_parse_version(_YA, $4) <0) YYABORT;
clicon_debug(2, "verinfo->version=\"STRING\"");}
| VER '=' '\'' STRING '\''
{ if (xml_parse_version(_YA, $4) <0) YYABORT;
clicon_debug(2, "verinfo->version='STRING'");}
;
encodingdecl : ENC '=' '\"' STRING '\"' {if ($4)free($4);}
| ENC '=' '\'' STRING '\'' {if ($4)free($4);}
|
;
encode : ENC '=' '\"' CHARDATA '\"' {free($4);}
| ENC '=' '\'' CHARDATA '\'' {free($4);}
sddecl : SD '=' '\"' STRING '\"' {if ($4)free($4);}
| SD '=' '\'' STRING '\'' {if ($4)free($4);}
|
;
/* [39] element ::= EmptyElemTag | STag content ETag */
element : '<' qname attrs element1
{ clicon_debug(3, "element -> < qname attrs element1"); }
;
{ clicon_debug(2, "element -> < qname attrs element1"); }
;
qname : NAME { if (xml_parse_unprefixed_name(_YA, $1) < 0) YYABORT;
clicon_debug(3, "qname -> NAME %s", $1);}
clicon_debug(2, "qname -> NAME %s", $1);}
| NAME ':' NAME { if (xml_parse_prefixed_name(_YA, $1, $3) < 0) YYABORT;
clicon_debug(3, "qname -> NAME : NAME");}
clicon_debug(2, "qname -> NAME : NAME");}
;
element1 : ESLASH {_YA->ya_xelement = NULL;
clicon_debug(3, "element1 -> />");}
clicon_debug(2, "element1 -> />");}
| '>' { xml_parse_endslash_pre(_YA); }
list { xml_parse_endslash_mid(_YA); }
etg { xml_parse_endslash_post(_YA);
clicon_debug(3, "element1 -> > list etg");}
elist { xml_parse_endslash_mid(_YA); }
endtag { xml_parse_endslash_post(_YA);
clicon_debug(2, "element1 -> > elist endtag");}
;
etg : BSLASH NAME '>'
{ clicon_debug(3, "etg -> < </ NAME %s>", $2); if (xml_parse_bslash1(_YA, $2) < 0) YYABORT; }
endtag : BSLASH NAME '>'
{ clicon_debug(2, "endtag -> < </ NAME>");
if (xml_parse_bslash1(_YA, $2) < 0) YYABORT; }
| BSLASH NAME ':' NAME '>'
{ if (xml_parse_bslash2(_YA, $2, $4) < 0) YYABORT;
clicon_debug(3, "etg -> < </ NAME:NAME >"); }
clicon_debug(2, "endtag -> < </ NAME:NAME >"); }
;
list : list content { clicon_debug(3, "list -> list content"); }
| content { clicon_debug(3, "list -> content"); }
elist : elist content { clicon_debug(2, "elist -> elist content"); }
| content { clicon_debug(2, "elist -> content"); }
;
content : element { clicon_debug(3, "content -> element"); }
| comment { clicon_debug(3, "content -> comment"); }
| CHARDATA { if (xml_parse_content(_YA, $1) < 0) YYABORT;
clicon_debug(3, "content -> CHARDATA %s", $1); }
| { clicon_debug(3, "content -> "); }
/* Rule 43 */
content : element { clicon_debug(2, "content -> element"); }
| comment { clicon_debug(2, "content -> comment"); }
| pi { clicon_debug(2, "content -> pi"); }
| CHARDATA { if (xml_parse_content(_YA, $1) < 0) YYABORT;
clicon_debug(2, "content -> CHARDATA %s", $1); }
| { clicon_debug(2, "content -> "); }
;
comment : BCOMMENT ECOMMENT
;
/* [16] PI ::= '<?' PITarget (S (Char* - (Char* '?>' Char*)))? '?>'
 * Processing instruction: NAME is the PI target, the optional STRING is the
 * PI data. The actions here only log and free the semantic values.
 */
pi : BQMARK NAME EQMARK {clicon_debug(2, "pi -> <? NAME ?>"); free($2); }
| BQMARK NAME STRING EQMARK
{clicon_debug(2, "pi -> <? NAME STRING ?>"); free($2); free($3);}
;
attrs : attrs attr
|
@ -399,9 +443,9 @@ attr : NAME '=' attvalue { if (xml_parse_attr(_YA, NULL, $1, $3)
| NAME ':' NAME '=' attvalue { if (xml_parse_attr(_YA, $1, $3, $5) < 0) YYABORT; }
;
attvalue : '\"' CHARDATA '\"' { $$=$2; /* $2 must be consumed */}
attvalue : '\"' STRING '\"' { $$=$2; /* $2 must be consumed */}
| '\"' '\"' { $$=strdup(""); /* $2 must be consumed */}
| '\'' CHARDATA '\'' { $$=$2; /* $2 must be consumed */}
| '\'' STRING '\'' { $$=$2; /* $2 must be consumed */}
| '\'' '\'' { $$=strdup(""); /* $2 must be consumed */}
;