add regex unicode match support

2022-11-22 12:15:44 +08:00 · 2022-11-22 12:15:44 +08:00 · 4839a633e3
commit 4839a633e3
parent a3bb271432
3 changed files with 175 additions and 3 deletions
--- a/lib/src/clixon_regex.c
+++ b/lib/src/clixon_regex.c
@ -65,6 +65,143 @@
 /*-------------------------- POSIX translation -------------------------*/
 /* parse 4 digit hexadecimal number */
 static unsigned
 parse_hex4(const unsigned char *const input, unsigned int *h)
 {
    size_t i = 0;
    for (i = 0; i < 4; i++) {
        /* parse digit */
        if ((input[i] >= '0') && (input[i] <= '9')) {
            *h += (unsigned int) input[i] - '0';
        } else if ((input[i] >= 'A') && (input[i] <= 'F')) {
            *h += (unsigned int) 10 + input[i] - 'A';
        } else if ((input[i] >= 'a') && (input[i] <= 'f')) {
            *h += (unsigned int) 10 + input[i] - 'a';
        } else { /* invalid */
            return -1;
        }
        if (i < 3) {
            /* shift left to make place for the next nibble */
            *h = *h << 4;
        }
    }
    return 0;
 }
 /* converts a UTF-16 literal to UTF-8
 * A literal can be one or two sequences of the form \uXXXX */
 static unsigned char
 utf16_literal_to_utf8(const unsigned char *const input, int len,
    unsigned char **output)
 {
    long unsigned int codepoint = 0;
    unsigned int first_code = 0;
    const unsigned char *first_sequence = input;
    unsigned char utf8_length = 0;
    unsigned char utf8_position = 0;
    unsigned char sequence_length = 0;
    unsigned char first_byte_mark = 0;
    int retval = -1;
    if (len < 6) {
        /* input ends unexpectedly */
        goto fail;
    }
    /* get the first utf16 sequence */
    retval = parse_hex4(first_sequence + 2, &first_code);
    if (retval != 0) {
        goto fail;
    }
    /* check that the code is valid */
    if (((first_code >= 0xDC00) && (first_code <= 0xDFFF))) {
        goto fail;
    }
    /* UTF16 surrogate pair */
    if ((first_code >= 0xD800) && (first_code <= 0xDBFF)) {
        const unsigned char *second_sequence = first_sequence + 6;
        unsigned int second_code = 0;
        sequence_length = 12; /* \uXXXX\uXXXX */
        if (len < 12) {
            /* input ends unexpectedly */
            goto fail;
        }
        if ((second_sequence[0] != '\\') || (second_sequence[1] != 'u')) {
            /* missing second half of the surrogate pair */
            goto fail;
        }
        /* get the second utf16 sequence */
        retval = parse_hex4(second_sequence + 2, &second_code);
        if (retval != 0) {
            goto fail;
        }
        /* check that the code is valid */
        if ((second_code < 0xDC00) || (second_code > 0xDFFF)) {
            /* invalid second half of the surrogate pair */
            goto fail;
        }
        /* calculate the unicode codepoint from the surrogate pair */
        codepoint = 0x10000 + (((first_code & 0x3FF) << 10) | (second_code & 0x3FF));
    } else {
        sequence_length = 6; /* \uXXXX */
        codepoint = first_code;
    }
    /* encode as UTF-8
     * takes at maximum 4 bytes to encode:
     * 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx */
    if (codepoint < 0x80) {
        /* normal ascii, encoding 0xxxxxxx */
        utf8_length = 1;
    } else if (codepoint < 0x800) {
        /* two bytes, encoding 110xxxxx 10xxxxxx */
        utf8_length = 2;
        first_byte_mark = 0xC0; /* 11000000 */
    } else if (codepoint < 0x10000) {
        /* three bytes, encoding 1110xxxx 10xxxxxx 10xxxxxx */
        utf8_length = 3;
        first_byte_mark = 0xE0; /* 11100000 */
    } else if (codepoint <= 0x10FFFF) {
        /* four bytes, encoding 1110xxxx 10xxxxxx 10xxxxxx 10xxxxxx */
        utf8_length = 4;
        first_byte_mark = 0xF0; /* 11110000 */
    } else {
        /* invalid unicode codepoint */
        goto fail;
    }
    /* encode as utf8 */
    for (utf8_position = (unsigned char)(utf8_length - 1); utf8_position > 0; utf8_position--) {
        /* 10xxxxxx */
        (*output)[utf8_position] = (unsigned char)((codepoint | 0x80) & 0xBF);
        codepoint >>= 6;
    }
    /* encode first byte */
    if (utf8_length > 1) {
        (*output)[0] = (unsigned char)((codepoint | first_byte_mark) & 0xFF);
    } else {
        (*output)[0] = (unsigned char)(codepoint & 0x7F);
    }
    *output += utf8_length;
    return sequence_length;
 fail:
    return 0;
 }
 /*! Transform from XSD regex to posix ERE
 * The usecase is that Yang (RFC7950) supports XSD regular expressions but
 * CLIgen supports POSIX ERE
@ -186,6 +323,20 @@ regexp_xsd2posix(char  *xsd,
            case 'W': /* inverse of \w */
                cprintf(cb, "[^[[:alnum:]|_]]"); 
                break;
            case 'u': {
                int   n;
                char utf8[4];
                char *ptr = utf8;
                n = utf16_literal_to_utf8((void*)(xsd + i - 1),
                    strlen(xsd) - i + 1, (void*)&ptr);
                if (n == 0) {
                    goto done;
                }
                cbuf_append_buf(cb, utf8, ptr - utf8);
                i += n - 2;
            }
            break;
            default:
                cprintf(cb, "\\%c", x);
                break;
--- a/test/lib.sh
+++ b/test/lib.sh
@ -492,7 +492,7 @@ function chunked_equal()
 function chunked_framing()
 {
    str=$1
-    length=${#str}
+    length=$(echo -n "$str"|wc -c)
    printf "\n#%s\n%s\n##\n" ${length} "${str}"
 }
--- a/test/test_pattern.sh
+++ b/test/test_pattern.sh
@ -376,6 +376,18 @@ module pattern{
             }
         }
      }
      leaf p48 {
         description "Chinese characters in unicode format";
         type string {
             pattern '[\u4E00-\u9FA5]+';
         }
      }
      leaf p49 {
         description "Arabic characters in unicode format";
         type string {
             pattern '[\u0600-\u06FF]+';
         }
      }
   }
 }
 EOF
@ -751,9 +763,18 @@ testrun "p$pnr" false '248:197.7.89/8'
 let pnr=47 # '.*[\n].*
 testrun "p$pnr" true 'Ensure all nights are cold'
-testrun "p$pnr" false 'kalle
foo'
+testrun "p$pnr" false 'kalle
 foo'
 testrun "p$pnr" false '01234567890123456789012345678901234567890123456789012345678901234567890123456789zzz'
 let pnr=48
 testrun "p$pnr" true '你好'
 testrun "p$pnr" false 'hello'
 let pnr=49
 testrun "p$pnr" true 'مرحبا'
 testrun "p$pnr" false 'hello'
 # CLI tests
 new "CLI tests for RFC7950 Sec 9.4.7 ex 2 AB"
 expectpart "$($clixon_cli -1f $cfg -l o set c rfc2 AB)" 0 '^$'