From: Brendan Hansen Date: Wed, 8 Mar 2023 20:38:31 +0000 (-0600) Subject: added: `core.encoding.utf8` library X-Git-Url: https://git.brendanfh.com/?a=commitdiff_plain;h=830f0c06513589bfd5cad4df6e267d94904f12fe;p=onyx.git added: `core.encoding.utf8` library --- diff --git a/core/encoding/utf8.onyx b/core/encoding/utf8.onyx new file mode 100644 index 00000000..2bf037ff --- /dev/null +++ b/core/encoding/utf8.onyx @@ -0,0 +1,163 @@ +package core.encoding.utf8 + +use core {string, array, iter} + +rune :: i32 + +Max_Chars :: 4 + +append_rune :: (s: &dyn_str, r: rune) -> str { + buffer: [Max_Chars] u8; + string.append(s, encode_rune(buffer, r)); + return *s; +} + +advance_rune :: #match #local {} + +#overload +advance_rune :: (s: str, n := 1) -> str { + t := s; + for n { + t = string.advance(t, rune_length_from_first_byte(t[0])); + } + return t; +} + +#overload +advance_rune :: (s: &str, n := 1) { + for n { + string.advance(s, rune_length_from_first_byte(s.data[0])); + } +} + +next_rune :: (s: str) -> str { + return s.data[0 .. rune_length_from_first_byte(s[0])]; +} + +decode_rune :: (s: str) -> (r: rune, size: i32) { + len := rune_length_from_first_byte(s[0]); + if len == 1 do return (cast(rune) s[0]), 1; + + mask := 0xFF >> (1 + len); + r: rune = ((cast(rune) s[0]) & mask); + + for len - 1 { + r = r << 6; + r = r | ~~(s[it + 1] & 0x3F); + } + + return r, len; +} + +decode_last_rune :: (s: str) -> (r: rune, size: i32) { + i := s.length - 1; + while i >= 0 && (s[i] & 0xC0 == 0x80) { + i -= 1; + } + + return decode_rune(s[i..s.length]); +} + +encode_rune :: (buffer: [] u8, r: rune) -> str { + if r <= 0x7F { + buffer[0] = ~~ r; + return buffer[0 .. 1]; + } + + if r <= 0x7FF { + buffer[0] = ~~(0xC0 | ((r >> 6) & 0x1F)); + buffer[1] = ~~(0x80 | (r & 0x3F)); + return buffer[0 .. 2]; + } + + if r >= 0xD800 && r <= 0xDFFF { + return buffer[0 .. 0]; + } + + if r <= 0xFFFF { + buffer[0] = ~~(0xE0 | ((r >> 12) & 0x0F)); + buffer[1] = ~~(0x80 | ((r >> 6) & 0x3F)); + buffer[2] = ~~(0x80 | (r & 0x3F)); + return buffer[0 .. 3]; + } + + if r <= 0x10FFFF { + buffer[0] = ~~(0xF0 | ((r >> 18) & 0x07)); + buffer[1] = ~~(0x80 | ((r >> 12) & 0x3F)); + buffer[2] = ~~(0x80 | ((r >> 6) & 0x3F)); + buffer[3] = ~~(0x80 | (r & 0x3F)); + return buffer[0 .. 4]; + } + + return buffer[0 .. 0]; +} + +full_rune :: (buffer: [] u8) -> bool { + len := rune_length_from_first_byte(buffer[0]); + + // If there are not enough bytes, then this cannot + // be a valid rune. + if buffer.length < len do return false; + + return array.every(buffer[1 .. buffer.length], #(it & 0xC0 == 0x80)); +} + +rune_count :: (s: str) -> i32 { + w := s; + count := 0; + while !string.empty(w) { + advance_rune(&w); + count += 1; + } + + return count; +} + +rune_length :: (r: rune) -> i32 { + if r <= 0x7f do return 1; + if r <= 0x7ff do return 2; + if r <= 0xffff do return 3; + if r <= 0x10ffff do return 4; + + return -1; +} + +rune_length_from_first_byte :: (b: u8) -> i32 { + if b & 0x80 == 0x00 do return 1; + if b & 0xE0 == 0xC0 do return 2; + if b & 0xF0 == 0xE0 do return 3; + if b & 0xF8 == 0xF0 do return 4; + return -1; +} + +rune_is_start :: (b: u8) -> bool { + return rune_length_from_first_byte(b) > 1; +} + +runes :: (s: str) -> Iterator(rune) { + return iter.generator( + &.{ s = s }, + ctx => { + if string.empty(ctx.s) do return 0, false; + + r, len := decode_rune(ctx.s); + string.advance(&ctx.s, len); + + return r, true; + } + ); +} + +slice :: (s: str, low, high: i32) -> str { + advanced := advance_rune(s, low); + tmp := advanced; + + bytes := 0; + for high - low { + _, b := decode_rune(tmp); + bytes += b; + string.advance(&tmp, b); + } + + return advanced.data[0 .. bytes]; +} diff --git a/core/std.onyx b/core/std.onyx index fc938665..24ab4075 100644 --- a/core/std.onyx +++ b/core/std.onyx @@ -51,6 +51,7 @@ package core #load "./time/date" #load "./encoding/base64" +#load "./encoding/utf8" #load "./runtime/common" @@ -119,4 +120,4 @@ package core #if runtime.runtime == .Js { #load "./runtime/platform/js/platform" -} \ No newline at end of file +} diff --git a/tests/utf8_test b/tests/utf8_test new file mode 100644 index 00000000..7e8e565c --- /dev/null +++ b/tests/utf8_test @@ -0,0 +1,3 @@ +🂦 🂧 🂨 🂩 🂪 🂫 🂬 🂭 🂮 🂯 🂰 🂱 🂲 🂳 🂴 🂵 🂶 🂷 🂸 🂹 🂺 🂻 🂼 🂽 🂾 🂿 🃀 🃁 🃂 🃃 🃄 🃅 🃆 🃇 🃈 🃉 🃊 🃋 🃌 🃍 🃎 🃏 🃐 🃑 🃒 🃓 🃔 🃕 🃖 🃗 🃘 🃙 🃚 🃛 🃜 🃝 🃞 🃟 +🂣🂤🂥🂦🂧🂨🂩 +🂦 diff --git a/tests/utf8_test.onyx b/tests/utf8_test.onyx new file mode 100644 index 00000000..1449b686 --- /dev/null +++ b/tests/utf8_test.onyx @@ -0,0 +1,22 @@ +use core +use core.encoding {utf8} + +main :: () { + output: dyn_str; + for i: 0x1F0A0 .. 0x1F0E0 { + utf8.append_rune(&output, i); + } + + tmp: str = output; + utf8.advance_rune(&tmp, 6); + + for utf8.runes(tmp) { + buf: [4] u8; + printf("{} ", utf8.encode_rune(buf, it)); + } + print("\n"); + + println(utf8.slice(output, 3, 10)); + + println(utf8.next_rune(tmp)); +}