From 02b9a51b66443a59fdec19cc8c1942add31d9bf6 Mon Sep 17 00:00:00 2001 From: Brendan Hansen Date: Tue, 16 May 2023 16:29:39 -0500 Subject: [PATCH] added: `\uXXXX` unicode code points in strings --- CHANGELOG | 19 ++++++++++++++ compiler/src/utils.c | 62 ++++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 81 insertions(+) diff --git a/CHANGELOG b/CHANGELOG index 56b6cbd7..065676e4 100644 --- a/CHANGELOG +++ b/CHANGELOG @@ -1,3 +1,22 @@ +Release v0.1.2 +----------- +Unreleased + +Additions: +* String literals can have unicode code points. + - '\uXXXX' for small code points (U+FFFF and below) + - '\UXXXXXX' for large code points + - Does not support UTF-16 surrogate pairs + +Removals: + +Changes: + +Bugfixes: + + + + Release v0.1.1a ----------- 15th May 2023 diff --git a/compiler/src/utils.c b/compiler/src/utils.c index cf6d35fe..3bee7b73 100644 --- a/compiler/src/utils.c +++ b/compiler/src/utils.c @@ -1196,6 +1196,43 @@ u32 char_to_base16_value(char x) { return 0xffffffff; } +static i32 encode_utf8_char(char ** d, u32 r) { + char *dest = *d; + int len = 0; + + if (r <= 0x7F) { + *dest++ = r; + len = 1; + } + + else if (r <= 0x7FF) { + *dest++ = (0xC0 | ((r >> 6) & 0x1F)); + *dest++ = (0x80 | (r & 0x3F)); + len = 2; + } + + else if (r >= 0xD800 && r <= 0xDFFF) { + } + + else if (r <= 0xFFFF) { + *dest++ = (0xE0 | ((r >> 12) & 0x0F)); + *dest++ = (0x80 | ((r >> 6) & 0x3F)); + *dest++ = (0x80 | (r & 0x3F)); + len = 3; + } + + else if (r <= 0x10FFFF) { + *dest++ = (0xF0 | ((r >> 18) & 0x07)); + *dest++ = (0x80 | ((r >> 12) & 0x3F)); + *dest++ = (0x80 | ((r >> 6) & 0x3F)); + *dest++ = (0x80 | (r & 0x3F)); + len = 4; + } + + *d = dest; + return len; +} + i32 string_process_escape_seqs(char* dest, char* src, i32 len) { i32 total_len = 0; for (i32 i = 0; i < len; i++) { @@ -1221,6 +1258,30 @@ i32 string_process_escape_seqs(char* dest, char* src, i32 len) { i += 2; break; } + case 'u': { + if (len - i < 5) break; + u32 c = + (char_to_base16_value(src[i + 1]) << 12) 
+ | (char_to_base16_value(src[i + 2]) << 8) + | (char_to_base16_value(src[i + 3]) << 4) + | (char_to_base16_value(src[i + 4])); + total_len += encode_utf8_char(&dest, c); + i += 5; + break; + } + case 'U': { + if (len - i < 7) break; + u32 c = + (char_to_base16_value(src[i + 1]) << 20) + | (char_to_base16_value(src[i + 2]) << 16) + | (char_to_base16_value(src[i + 3]) << 12) + | (char_to_base16_value(src[i + 4]) << 8) + | (char_to_base16_value(src[i + 5]) << 4) + | (char_to_base16_value(src[i + 6])); + total_len += encode_utf8_char(&dest, c); + i += 7; + break; + } default: *dest++ = '\\'; *dest++ = src[i]; total_len += 2; @@ -1237,6 +1298,7 @@ i32 string_process_escape_seqs(char* dest, char* src, i32 len) { return total_len; } + static Scope **get_scope_from_node_helper(AstNode *node) { b32 used_pointer = 0; -- 2.25.1