add regex unicode match support

2022-11-22 12:15:44 +08:00 · 2022-11-22 12:15:44 +08:00 · 4839a633e3
commit 4839a633e3
parent a3bb271432
3 changed files with 175 additions and 3 deletions
--- a/lib/src/clixon_regex.c
+++ b/lib/src/clixon_regex.c
@ -65,6 +65,143 @@

 /*-------------------------- POSIX translation -------------------------*/

+/*! Parse exactly four hexadecimal digits into a number
+ *
+ * @param[in]  input  Buffer of at least four characters; only input[0..3] are read
+ * @param[out] h      Parsed value (0x0000-0xFFFF). Written only on success, and
+ *                    overwritten regardless of its previous content
+ * @retval     0      OK
+ * @retval    -1      One of the four characters is not a hexadecimal digit
+ */
+static int
+parse_hex4(const unsigned char *const input, unsigned int *h)
+{
+    unsigned int value = 0;
+    size_t       i;
+
+    for (i = 0; i < 4; i++) {
+        unsigned int nibble;
+
+        /* parse digit */
+        if ((input[i] >= '0') && (input[i] <= '9')) {
+            nibble = (unsigned int)(input[i] - '0');
+        } else if ((input[i] >= 'A') && (input[i] <= 'F')) {
+            nibble = 10u + (unsigned int)(input[i] - 'A');
+        } else if ((input[i] >= 'a') && (input[i] <= 'f')) {
+            nibble = 10u + (unsigned int)(input[i] - 'a');
+        } else { /* invalid */
+            return -1;
+        }
+        /* shift left to make place for this nibble, then append it */
+        value = (value << 4) | nibble;
+    }
+    /* Accumulate locally and store once, so that a failed parse leaves *h
+     * untouched and a stale value in *h cannot leak into the result. */
+    *h = value;
+
+    return 0;
+}
+
+/*! Convert a UTF-16 escape literal to UTF-8
+ *
+ * A literal is a single \uXXXX sequence or, when the first sequence encodes a high
+ * surrogate (0xD800-0xDBFF), a pair \uXXXX\uXXXX whose second half must be a low
+ * surrogate (0xDC00-0xDFFF).
+ * @param[in]     input   Start of the literal, i.e. the backslash of the first \uXXXX
+ * @param[in]     len     Number of bytes that may be read from input
+ * @param[in,out] output  In: address of the write position, with room for up to 4 bytes.
+ *                        Out: advanced past the bytes written. Untouched on failure.
+ * @retval        6       One \uXXXX sequence was converted
+ * @retval        12      A surrogate pair \uXXXX\uXXXX was converted
+ * @retval        0       Failure: truncated input, missing \u prefix, non-hex digit,
+ *                        or an unpaired or misordered surrogate
+ */
+static unsigned char
+utf16_literal_to_utf8(const unsigned char *const input, int len,
+    unsigned char **output)
+{
+    unsigned long codepoint = 0;
+    unsigned int  first_code = 0;
+    unsigned char utf8_length = 0;
+    unsigned char utf8_position = 0;
+    unsigned char sequence_length = 0;
+    unsigned char first_byte_mark = 0;
+
+    if (len < 6) {
+        /* input ends unexpectedly */
+        goto fail;
+    }
+
+    /* The first sequence must carry the \u prefix, exactly as is required of the
+     * second one below; do not rely on the caller having checked it. */
+    if ((input[0] != '\\') || (input[1] != 'u')) {
+        goto fail;
+    }
+
+    /* get the first utf16 sequence */
+    if (parse_hex4(input + 2, &first_code) != 0) {
+        goto fail;
+    }
+
+    /* a low surrogate may only appear as the second half of a pair */
+    if ((first_code >= 0xDC00) && (first_code <= 0xDFFF)) {
+        goto fail;
+    }
+
+    /* UTF16 surrogate pair */
+    if ((first_code >= 0xD800) && (first_code <= 0xDBFF)) {
+        const unsigned char *second_sequence = input + 6;
+        unsigned int second_code = 0;
+        sequence_length = 12; /* \uXXXX\uXXXX */
+
+        if (len < 12) {
+            /* input ends unexpectedly */
+            goto fail;
+        }
+
+        if ((second_sequence[0] != '\\') || (second_sequence[1] != 'u')) {
+            /* missing second half of the surrogate pair */
+            goto fail;
+        }
+
+        /* get the second utf16 sequence */
+        if (parse_hex4(second_sequence + 2, &second_code) != 0) {
+            goto fail;
+        }
+        /* check that the code is valid */
+        if ((second_code < 0xDC00) || (second_code > 0xDFFF)) {
+            /* invalid second half of the surrogate pair */
+            goto fail;
+        }
+
+        /* calculate the unicode codepoint from the surrogate pair:
+         * ten payload bits from each half, offset by 0x10000 */
+        codepoint = 0x10000 + (((first_code & 0x3FF) << 10) | (second_code & 0x3FF));
+    } else {
+        sequence_length = 6; /* \uXXXX */
+        codepoint = first_code;
+    }
+
+    /* encode as UTF-8
+     * takes at maximum 4 bytes to encode:
+     * 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx */
+    if (codepoint < 0x80) {
+        /* normal ascii, encoding 0xxxxxxx */
+        utf8_length = 1;
+    } else if (codepoint < 0x800) {
+        /* two bytes, encoding 110xxxxx 10xxxxxx */
+        utf8_length = 2;
+        first_byte_mark = 0xC0; /* 11000000 */
+    } else if (codepoint < 0x10000) {
+        /* three bytes, encoding 1110xxxx 10xxxxxx 10xxxxxx */
+        utf8_length = 3;
+        first_byte_mark = 0xE0; /* 11100000 */
+    } else if (codepoint <= 0x10FFFF) {
+        /* four bytes, encoding 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx */
+        utf8_length = 4;
+        first_byte_mark = 0xF0; /* 11110000 */
+    } else {
+        /* invalid unicode codepoint */
+        goto fail;
+    }
+
+    /* fill the continuation bytes (10xxxxxx) from the last one backwards,
+     * consuming six bits of the codepoint for each */
+    for (utf8_position = (unsigned char)(utf8_length - 1); utf8_position > 0; utf8_position--) {
+        (*output)[utf8_position] = (unsigned char)((codepoint | 0x80) & 0xBF);
+        codepoint >>= 6;
+    }
+    /* encode first byte: the remaining high bits plus the length marker */
+    if (utf8_length > 1) {
+        (*output)[0] = (unsigned char)((codepoint | first_byte_mark) & 0xFF);
+    } else {
+        (*output)[0] = (unsigned char)(codepoint & 0x7F);
+    }
+
+    *output += utf8_length;
+
+    return sequence_length;
+
+fail:
+    return 0;
+}
+
 /*! Transform from XSD regex to posix ERE
 * The usecase is that Yang (RFC7950) supports XSD regular expressions but
 * CLIgen supports POSIX ERE
@ -186,6 +323,20 @@ regexp_xsd2posix(char  *xsd,
            case 'W': /* inverse of \w */
                cprintf(cb, "[^[[:alnum:]|_]]"); 
                break;
+            case 'u': {
+                int   n;
+                char utf8[4];
+                char *ptr = utf8;
+
+                n = utf16_literal_to_utf8((void*)(xsd + i - 1),
+                    strlen(xsd) - i + 1, (void*)&ptr);
+                if (n == 0) {
+                    goto done;
+                }
+                cbuf_append_buf(cb, utf8, ptr - utf8);
+                i += n - 2;
+            }
+            break;
            default:
                cprintf(cb, "\\%c", x);
                break;
--- a/test/lib.sh
+++ b/test/lib.sh
@ -492,7 +492,7 @@ function chunked_equal()
 function chunked_framing()
 {
    str=$1
-    length=${#str}
+    # The chunk size printed below is a byte count, not a character count, so
+    # multi-byte UTF-8 payloads need wc -c rather than ${#str}.
+    # printf '%s' emits the payload verbatim; echo -n would misread a payload
+    # starting with -n/-e and may interpret backslashes.
+    length=$(printf '%s' "$str" | wc -c)

    printf "\n#%s\n%s\n##\n" ${length} "${str}"
 }
--- a/test/test_pattern.sh
+++ b/test/test_pattern.sh
@ -376,6 +376,18 @@ module pattern{
             }
         }
      }
+      leaf p48 {
+         description "Chinese characters in unicode format";
+         type string {
+             pattern '[\u4E00-\u9FA5]+';
+         }
+      }
+      leaf p49 {
+         description "Arabic characters in unicode format";
+         type string {
+             pattern '[\u0600-\u06FF]+';
+         }
+      }
   }
 }
 EOF
@ -751,9 +763,18 @@ testrun "p$pnr" false '248:197.7.89/8'

 let pnr=47 # '.*[\n].*
 testrun "p$pnr" true 'Ensure all nights are cold'
-testrun "p$pnr" false 'kalle
foo'
+testrun "p$pnr" false 'kalle
+foo'
 testrun "p$pnr" false '01234567890123456789012345678901234567890123456789012345678901234567890123456789zzz'

+let pnr=48
+testrun "p$pnr" true '你好'
+testrun "p$pnr" false 'hello'
+
+let pnr=49
+testrun "p$pnr" true 'مرحبا'
+testrun "p$pnr" false 'hello'
+
 # CLI tests
 new "CLI tests for RFC7950 Sec 9.4.7 ex 2 AB"
 expectpart "$($clixon_cli -1f $cfg -l o set c rfc2 AB)" 0 '^$'