/*-------------------------- POSIX translation -------------------------*/

/*! Parse exactly four hexadecimal digits into an unsigned value
 *
 * The result is built in a local and stored only on success, so the outcome
 * does not depend on the initial content of *h and *h is left untouched when
 * parsing fails.
 * @param[in]  input  Pointer to the first of four digits. Parsing stops at
 *                    the first non-hex byte (including a terminating NUL), so
 *                    a shorter NUL-terminated string is rejected, not overrun.
 * @param[out] h      Parsed value, 0..0xFFFF
 * @retval     0      OK
 * @retval    -1      NULL argument or a non-hexadecimal character
 */
static int
parse_hex4(const unsigned char *const input, unsigned int *h)
{
    unsigned int value = 0;
    size_t       i;

    if (input == NULL || h == NULL) {
        return -1;
    }
    for (i = 0; i < 4; i++) {
        unsigned int nibble;

        if ((input[i] >= '0') && (input[i] <= '9')) {
            nibble = (unsigned int)(input[i] - '0');
        } else if ((input[i] >= 'A') && (input[i] <= 'F')) {
            nibble = 10u + (unsigned int)(input[i] - 'A');
        } else if ((input[i] >= 'a') && (input[i] <= 'f')) {
            nibble = 10u + (unsigned int)(input[i] - 'a');
        } else { /* invalid */
            return -1;
        }
        /* make room for the next nibble and append it */
        value = (value << 4) | nibble;
    }
    *h = value;
    return 0;
}

/*! Convert a UTF-16 escape literal to UTF-8
 *
 * A literal is one sequence of the form \uXXXX, or two such sequences
 * (\uXXXX\uXXXX) when the first one is a high surrogate (D800..DBFF) and the
 * second one a low surrogate (DC00..DFFF).
 * @param[in]     input   Points at the backslash of the first \uXXXX sequence
 * @param[in]     len     Number of readable bytes starting at input
 * @param[in,out] output  *output is the write position; it must have room for
 *                        up to 4 bytes. On success the UTF-8 bytes are written
 *                        there and *output is advanced past them. On failure
 *                        neither *output nor the buffer is modified.
 * @retval        6       One \uXXXX sequence consumed
 * @retval        12      A surrogate pair \uXXXX\uXXXX consumed
 * @retval        0       Error: NULL argument, truncated input, missing "\u"
 *                        prefix, bad hex digit, or invalid surrogate usage
 * @note \u0000 is accepted and encoded as a single zero byte.
 */
static unsigned char
utf16_literal_to_utf8(const unsigned char *const input, int len,
                      unsigned char **output)
{
    enum { ESCAPE_LEN = 6 };            /* length of one "\uXXXX" sequence */
    unsigned long  codepoint = 0;
    unsigned int   first_code = 0;
    unsigned char  utf8_length = 0;
    unsigned char  utf8_position = 0;
    unsigned char  sequence_length = 0;
    unsigned char  first_byte_mark = 0;

    if (input == NULL || output == NULL || *output == NULL) {
        goto fail;
    }
    if (len < ESCAPE_LEN) {
        /* input ends unexpectedly */
        goto fail;
    }
    /* the first sequence must really be an escape, same as the second one */
    if ((input[0] != '\\') || (input[1] != 'u')) {
        goto fail;
    }
    /* get the first utf16 sequence */
    if (parse_hex4(input + 2, &first_code) != 0) {
        goto fail;
    }
    /* a low surrogate must never come first */
    if ((first_code >= 0xDC00) && (first_code <= 0xDFFF)) {
        goto fail;
    }

    if ((first_code >= 0xD800) && (first_code <= 0xDBFF)) {
        /* UTF16 surrogate pair: a second \uXXXX sequence is mandatory */
        const unsigned char *second_sequence = input + ESCAPE_LEN;
        unsigned int         second_code = 0;

        if (len < 2 * ESCAPE_LEN) {
            /* input ends unexpectedly */
            goto fail;
        }
        if ((second_sequence[0] != '\\') || (second_sequence[1] != 'u')) {
            /* missing second half of the surrogate pair */
            goto fail;
        }
        /* get the second utf16 sequence */
        if (parse_hex4(second_sequence + 2, &second_code) != 0) {
            goto fail;
        }
        if ((second_code < 0xDC00) || (second_code > 0xDFFF)) {
            /* invalid second half of the surrogate pair */
            goto fail;
        }
        sequence_length = 2 * ESCAPE_LEN; /* \uXXXX\uXXXX */
        /* calculate the unicode codepoint from the surrogate pair */
        codepoint = 0x10000UL +
            ((((unsigned long)first_code & 0x3FF) << 10) |
             ((unsigned long)second_code & 0x3FF));
    } else {
        sequence_length = ESCAPE_LEN;     /* \uXXXX */
        codepoint = first_code;
    }

    /* encode as UTF-8
     * takes at maximum 4 bytes to encode:
     * 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx */
    if (codepoint < 0x80) {
        /* normal ascii, encoding 0xxxxxxx */
        utf8_length = 1;
    } else if (codepoint < 0x800) {
        /* two bytes, encoding 110xxxxx 10xxxxxx */
        utf8_length = 2;
        first_byte_mark = 0xC0; /* 11000000 */
    } else if (codepoint < 0x10000) {
        /* three bytes, encoding 1110xxxx 10xxxxxx 10xxxxxx */
        utf8_length = 3;
        first_byte_mark = 0xE0; /* 11100000 */
    } else if (codepoint <= 0x10FFFF) {
        /* four bytes, encoding 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx */
        utf8_length = 4;
        first_byte_mark = 0xF0; /* 11110000 */
    } else {
        /* invalid unicode codepoint */
        goto fail;
    }

    /* continuation bytes are written back to front, six payload bits each */
    for (utf8_position = (unsigned char)(utf8_length - 1);
         utf8_position > 0;
         utf8_position--) {
        /* 10xxxxxx */
        (*output)[utf8_position] = (unsigned char)((codepoint | 0x80) & 0xBF);
        codepoint >>= 6;
    }
    /* first byte carries the length marker plus the remaining high bits */
    if (utf8_length > 1) {
        (*output)[0] = (unsigned char)((codepoint | first_byte_mark) & 0xFF);
    } else {
        (*output)[0] = (unsigned char)(codepoint & 0x7F);
    }

    *output += utf8_length;

    return sequence_length;

fail:
    return 0;
}
Transform from XSD regex to posix ERE * The usecase is that Yang (RFC7950) supports XSD regular expressions but * CLIgen supports POSIX ERE @@ -186,6 +323,20 @@ regexp_xsd2posix(char *xsd, case 'W': /* inverse of \w */ cprintf(cb, "[^[[:alnum:]|_]]"); break; + case 'u': { + int n; + char utf8[4]; + char *ptr = utf8; + + n = utf16_literal_to_utf8((void*)(xsd + i - 1), + strlen(xsd) - i + 1, (void*)&ptr); + if (n == 0) { + goto done; + } + cbuf_append_buf(cb, utf8, ptr - utf8); + i += n - 2; + } + break; default: cprintf(cb, "\\%c", x); break; diff --git a/test/lib.sh b/test/lib.sh index 983fff55..bc4b03ed 100755 --- a/test/lib.sh +++ b/test/lib.sh @@ -492,8 +492,8 @@ function chunked_equal() function chunked_framing() { str=$1 - length=${#str} - + length=$(echo -n "$str"|wc -c) + printf "\n#%s\n%s\n##\n" ${length} "${str}" } diff --git a/test/test_pattern.sh b/test/test_pattern.sh index 084eeb87..b2c522f4 100755 --- a/test/test_pattern.sh +++ b/test/test_pattern.sh @@ -376,6 +376,18 @@ module pattern{ } } } + leaf p48 { + description "Chinese characters in unicode format"; + type string { + pattern '[\u4E00-\u9FA5]+'; + } + } + leaf p49 { + description "Arabic characters in unicode format"; + type string { + pattern '[\u0600-\u06FF]+'; + } + } } } EOF @@ -751,9 +763,18 @@ testrun "p$pnr" false '248:197.7.89/8' let pnr=47 # '.*[\n].* testrun "p$pnr" true 'Ensure all nights are cold' -testrun "p$pnr" false 'kalle foo' +testrun "p$pnr" false 'kalle +foo' testrun "p$pnr" false '01234567890123456789012345678901234567890123456789012345678901234567890123456789zzz' +let pnr=48 +testrun "p$pnr" true '你好' +testrun "p$pnr" false 'hello' + +let pnr=49 +testrun "p$pnr" true 'مرحبا' +testrun "p$pnr" false 'hello' + # CLI tests new "CLI tests for RFC7950 Sec 9.4.7 ex 2 AB" expectpart "$($clixon_cli -1f $cfg -l o set c rfc2 AB)" 0 '^$'