added: `\uXXXX` unicode code points in strings
author Brendan Hansen <brendan.f.hansen@gmail.com>
Tue, 16 May 2023 21:29:39 +0000 (16:29 -0500)
committer Brendan Hansen <brendan.f.hansen@gmail.com>
Tue, 16 May 2023 21:29:39 +0000 (16:29 -0500)
CHANGELOG
compiler/src/utils.c

index 56b6cbd7095135889d1e479d1a0826e1363fbbd4..065676e41ff1a19b6efe16a1b3f228147e50b9a6 100644 (file)
--- a/CHANGELOG
+++ b/CHANGELOG
@@ -1,3 +1,22 @@
+Release v0.1.2
+-----------
+Unreleased
+
+Additions:
+* String literals can have unicode code points.
+    - '\uXXXX' for small code points (up to U+FFFF)
+    - '\UXXXXXX' for large code points
+    - Does not support UTF-16 surrogate pairs
+
+Removals:
+
+Changes:
+
+Bugfixes:
+
+
+
+
 Release v0.1.1a
 -----------
 15th May 2023
index cf6d35fe7923655ae325a5a41867256bee1e7b9d..3bee7b734d62bc4d19b7795e1ca6ec169c2fd82d 100644 (file)
@@ -1196,6 +1196,43 @@ u32 char_to_base16_value(char x) {
     return 0xffffffff;
 }
 
+static i32 encode_utf8_char(char ** d, u32 r) { // Encode code point r as UTF-8 at *d, advance *d past the bytes written, return the byte count.
+    char *dest = *d; // local write cursor; bytes are stored with no capacity check, so the caller must leave room for up to 4
+    int len = 0; // bytes written; stays 0 when no branch below writes anything (surrogate or r > 0x10FFFF)
+
+    if (r <= 0x7F) { // 1 byte: 0xxxxxxx
+        *dest++ = r;
+        len = 1;
+    }
+
+    else if (r <= 0x7FF) { // 2 bytes: 110xxxxx 10xxxxxx
+        *dest++ = (0xC0 | ((r >> 6) & 0x1F));
+        *dest++ = (0x80 | (r & 0x3F));
+        len = 2;
+    }
+
+    else if (r >= 0xD800 && r <= 0xDFFF) { // UTF-16 surrogate range: intentionally writes nothing, len stays 0
+    }
+
+    else if (r <= 0xFFFF) { // 3 bytes: 1110xxxx 10xxxxxx 10xxxxxx
+        *dest++ = (0xE0 | ((r >> 12) & 0x0F));
+        *dest++ = (0x80 | ((r >> 6) & 0x3F));
+        *dest++ = (0x80 | (r & 0x3F));
+        len = 3;
+    }
+
+    else if (r <= 0x10FFFF) { // 4 bytes: 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
+        *dest++ = (0xF0 | ((r >> 18) & 0x07));
+        *dest++ = (0x80 | ((r >> 12) & 0x3F));
+        *dest++ = (0x80 | ((r >> 6) & 0x3F));
+        *dest++ = (0x80 | (r & 0x3F));
+        len = 4;
+    }
+
+    *d = dest; // hand the advanced write position back to the caller (unchanged when nothing was written)
+    return len; // 0 signals that r was not encoded
+}
+
 i32 string_process_escape_seqs(char* dest, char* src, i32 len) {
     i32 total_len = 0;
     for (i32 i = 0; i < len; i++) {
@@ -1221,6 +1258,30 @@ i32 string_process_escape_seqs(char* dest, char* src, i32 len) {
                 i += 2;
                 break;
             }
+            case 'u': {
+                if (len - i < 5) break;
+                u32 c =
+                      (char_to_base16_value(src[i + 1]) << 12)
+                    | (char_to_base16_value(src[i + 2]) << 8)
+                    | (char_to_base16_value(src[i + 3]) << 4)
+                    | (char_to_base16_value(src[i + 4]));
+                total_len += encode_utf8_char(&dest, c);
+                i += 5;
+                break;
+            }
+            case 'U': {
+                if (len - i < 7) break;
+                u32 c =
+                      (char_to_base16_value(src[i + 1]) << 20)
+                    | (char_to_base16_value(src[i + 2]) << 16)
+                    | (char_to_base16_value(src[i + 3]) << 12)
+                    | (char_to_base16_value(src[i + 4]) << 8)
+                    | (char_to_base16_value(src[i + 5]) << 4)
+                    | (char_to_base16_value(src[i + 6]));
+                total_len += encode_utf8_char(&dest, c);
+                i += 7;
+                break;
+            }
             default:  *dest++ = '\\';
                       *dest++ = src[i];
                       total_len += 2;
@@ -1237,6 +1298,7 @@ i32 string_process_escape_seqs(char* dest, char* src, i32 len) {
     return total_len;
 }
 
+
 static Scope **get_scope_from_node_helper(AstNode *node) {
     b32 used_pointer = 0;