From: Brendan Hansen Date: Wed, 6 May 2020 19:58:54 +0000 (-0500) Subject: Working on the tokenizer X-Git-Url: https://git.brendanfh.com/?a=commitdiff_plain;h=5a26feff68cfc3febda691584b951e97e7adf2cf;p=onyx.git Working on the tokenizer --- diff --git a/bh.h b/bh.h index 16221650..7eb4fd09 100644 --- a/bh.h +++ b/bh.h @@ -21,6 +21,17 @@ typedef signed int i32; typedef signed long i64; typedef signed long long i128; typedef unsigned long isize; +typedef i32 b32; + +//------------------------------------------------------------------------------------- +// Better character functions +//------------------------------------------------------------------------------------- +inline b32 char_is_alpha(const char a); +inline b32 char_is_num(const char a); +inline b32 char_is_alphanum(const char a); +inline b32 char_is_whitespace(const char a); +inline b32 char_in_range(const char lo, const char hi, const char a); +char charset_contains(const char* charset, char ch); //------------------------------------------------------------------------------------- // Better strings @@ -56,8 +67,8 @@ typedef struct bh_string { bh_string bh_string_new_cap(unsigned long cap); bh_string bh_string_new_str(const char* cstr); -i32 bh_string_delete(bh_string* str); -i32 bh_string_ensure_capacity(bh_string* str, u64 cap); +b32 bh_string_delete(bh_string* str); +b32 bh_string_ensure_capacity(bh_string* str, u64 cap); void bh_string_append_bh_string(bh_string* str1, bh_string* str2); void bh_string_append_cstr(bh_string* str1, const char* str2); void bh_string_replace_at_bh_string(bh_string* dest, bh_string* src, u64 offset); @@ -74,7 +85,6 @@ void bh_string_print(bh_string* str); //------------------------------------------------------------------------------------- // Better files //------------------------------------------------------------------------------------- - typedef enum bh_file_error { BH_FILE_ERROR_NONE, BH_FILE_ERROR_INVALID @@ -120,10 +130,9 @@ bh_file_error 
bh_file_create(bh_file* file, char const* filename); bh_file_error bh_file_open(bh_file* file, char const* filename); bh_file_error bh_file_open_mode(bh_file* file, bh_file_mode mode, const char* filename); bh_file_error bh_file_new(bh_file* file, bh_file_descriptor fd, const char* filename); -i32 bh_file_read_at(bh_file* file, i64 offset, void* buffer, isize buff_size, isize* bytes_read); -i32 bh_file_write_at(bh_file* file, i64 offset, void const* buffer, isize buff_size, isize* bytes_wrote); -static i32 bh__file_seek_wrapper(i32 fd, i64 offset, bh_file_whence whence, i64* new_offset); -i32 bh_file_seek(bh_file* file, i64 offset); +b32 bh_file_read_at(bh_file* file, i64 offset, void* buffer, isize buff_size, isize* bytes_read); +b32 bh_file_write_at(bh_file* file, i64 offset, void const* buffer, isize buff_size, isize* bytes_wrote); +static b32 bh__file_seek_wrapper(i32 fd, i64 offset, bh_file_whence whence, i64* new_offset); i64 bh_file_seek_to_end(bh_file* file); i64 bh_file_skip(bh_file* file, i64 bytes); i64 bh_file_tell(bh_file* file); @@ -156,6 +165,44 @@ i32 bh_file_contents_delete(bh_file_contents* contents); // IMPLEMENTATIONS //------------------------------------------------------------------------------------- +//------------------------------------------------------------------------------------- +// CHAR FUNCTIONS +//------------------------------------------------------------------------------------- +inline b32 char_is_alpha(const char a) { + return ('a' <= a && a <= 'z') || ('A' <= a && a <= 'Z'); +} + +inline b32 char_is_num(const char a) { + return ('0' <= a && a <= '9'); +} + +inline b32 char_is_alphanum(const char a) { + return char_is_alpha(a) || char_is_num(a); +} + +inline b32 char_is_whitespace(const char a) { + return charset_contains(" \t\r\n", a); +} + +inline b32 char_in_range(const char lo, const char hi, const char a) { + return lo <= a && a <= hi; /* inclusive on both ends; C does not chain comparisons, so each bound is tested separately */ +} + +char charset_contains(const char* charset, char ch) { + while (*charset) { + if
(*charset == ch) return ch; + charset++; + } + + return 0; +} + +i64 chars_match(char* ptr1, char* ptr2) { + i64 len = 0; + while (*ptr2 != '\0' && *ptr1 == *ptr2) ptr1++, ptr2++, len++; /* stop at the end of ptr2 so two strings that end together are not scanned past their terminators */ + return *ptr2 == '\0' ? len : 0; +} + //------------------------------------------------------------------------------------- // STRING IMPLEMENTATION //------------------------------------------------------------------------------------- @@ -182,14 +229,14 @@ bh_string bh_string_new_str(const char* cstr) { return str; } -i32 bh_string_delete(bh_string* str) { +b32 bh_string_delete(bh_string* str) { free(str->data); str->length = 0; str->capacity = 0; return 1; } -i32 bh_string_ensure_capacity(bh_string* str, u64 cap) { +b32 bh_string_ensure_capacity(bh_string* str, u64 cap) { if (str->capacity >= cap) return 1; //TODO: This could fail @@ -217,7 +264,7 @@ void bh_string_append_cstr(bh_string* str1, const char* str2) { } void bh_string_replace_at_bh_string(bh_string* dest, bh_string* src, u64 offset) { - if (offset >= dest->length) return; + if (offset > dest->length) return; if (!bh_string_ensure_capacity(dest, offset + src->length)) return; memcpy(dest->data + offset, src->data, src->length); @@ -226,7 +273,7 @@ void bh_string_replace_at_bh_string(bh_string* dest, bh_string* src, u64 offset) } void bh_string_replace_at_cstr(bh_string* dest, const char* src, u64 offset) { - if (offset >= dest->length) return; + if (offset > dest->length) return; const int srclen = strlen(src); if (!bh_string_ensure_capacity(dest, offset + srclen)) return; @@ -253,14 +300,6 @@ void bh_string_insert_at_cstr(bh_string* dest, const char* src, u64 offset) { dest->length += srclen; } -static inline u8 charset_contains(const char* charset, char ch) { - while (*charset) { - if (*charset == ch) return *charset; - charset++; - } - - return 0; -} void bh_string_trim_end(bh_string* str, const char* charset) { while (charset_contains(charset, str->data[str->length - 1])) @@ -362,14 +401,14 @@ bh_file_error bh_file_new(bh_file*
file, bh_file_descriptor fd, const char* file return BH_FILE_ERROR_NONE; } -i32 bh_file_read_at(bh_file* file, i64 offset, void* buffer, isize buff_size, isize* bytes_read) { +b32 bh_file_read_at(bh_file* file, i64 offset, void* buffer, isize buff_size, isize* bytes_read) { isize res = pread(file->fd, buffer, buff_size, offset); if (res < 0) return 0; if (bytes_read) *bytes_read = res; return 1; } -i32 bh_file_write_at(bh_file* file, i64 offset, void const* buffer, isize buff_size, isize* bytes_wrote) { +b32 bh_file_write_at(bh_file* file, i64 offset, void const* buffer, isize buff_size, isize* bytes_wrote) { isize res; i64 current_offset = 0; bh__file_seek_wrapper(file->fd, offset, BH_FILE_WHENCE_CURRENT, &current_offset); @@ -385,7 +424,7 @@ i32 bh_file_write_at(bh_file* file, i64 offset, void const* buffer, isize buff_s return 1; } -static i32 bh__file_seek_wrapper(i32 fd, i64 offset, bh_file_whence whence, i64* new_offset) { +static b32 bh__file_seek_wrapper(i32 fd, i64 offset, bh_file_whence whence, i64* new_offset) { i64 res = lseek(fd, offset, whence); if (res < 0) return 0; if (new_offset) *new_offset = res; @@ -426,11 +465,11 @@ bh_file_error bh_file_close(bh_file* file) { return err; } -i32 bh_file_read(bh_file* file, void* buffer, isize buff_size) { +b32 bh_file_read(bh_file* file, void* buffer, isize buff_size) { return bh_file_read_at(file, bh_file_tell(file), buffer, buff_size, NULL); } -i32 bh_file_write(bh_file* file, void* buffer, isize buff_size) { +b32 bh_file_write(bh_file* file, void* buffer, isize buff_size) { return bh_file_write_at(file, bh_file_tell(file), buffer, buff_size, NULL); } @@ -465,7 +504,7 @@ bh_file_contents bh_file_read_contents_direct(const char* filename) { return fc; } -i32 bh_file_contents_delete(bh_file_contents* contents) { +b32 bh_file_contents_delete(bh_file_contents* contents) { free(contents->data); contents->length = 0; return 1; diff --git a/docs/plan b/docs/plan new file mode 100644 index 00000000..70cd353e --- /dev/null
+++ b/docs/plan @@ -0,0 +1,56 @@ +The ONYX Programming Language +----------------------------- + +WHAT: + ONYX is a low-ish level programming language designed for use with + Web-Assembly 32-bit (WASM). It features some advanced features such + as comptime code execution and JS literals for external functions. + +WHY: + ONYX was made to help me learn about compiler design. + +FEATURES: + - Strong type system + - Smart package loading + - Structs and enums + - functions (no anonymous functions) + - Control structures + if, for, switch + - pointers + - inferred typing + - defer + ? polymorphic functions + +EXAMPLE CODE: + +// This is a comment +// This is also the only way to do comments + +use "core"; // Looks for "core.onyx" in the current directory + +Foo :: struct { x: i32, y: i32 }; + +export add :: (a: i32, b: i32) -> i32 { + return a + b; +}; + +foo :: (a: i32) -> Foo { + return Foo { x = a, y = 0 }; +} + +MVP CODE: + +// Comments need to be parsed + +export add :: (a: i32, b: i32) -> i32 { + return a + b; +} + +export max :: (a: i32, b: i32) -> i32 { + // Curly braces are required + if a > b { + return a; + } else { + return b; + } +} \ No newline at end of file diff --git a/onyx b/onyx index 272ceb6a..2d5a2d93 100755 Binary files a/onyx and b/onyx differ diff --git a/onyx.c b/onyx.c index 60d8ea96..970c85c8 100644 --- a/onyx.c +++ b/onyx.c @@ -2,41 +2,162 @@ #include // TODO: Replace with custom lib #include "bh.h" +typedef struct Tokenizer { + char *start, *curr, *end; + u64 line_number; +} Tokenizer; + +typedef enum TokenType { + TOKEN_TYPE_UNKNOWN, + TOKEN_TYPE_END_STREAM, + + TOKEN_TYPE_KEYWORD_STRUCT, + TOKEN_TYPE_KEYWORD_USE, + TOKEN_TYPE_KEYWORD_EXPORT, + TOKEN_TYPE_KEYWORD_IF, + TOKEN_TYPE_KEYWORD_ELSE, + TOKEN_TYPE_KEYWORD_FOR, + TOKEN_TYPE_KEYWORD_RETURN, + + TOKEN_TYPE_RIGHT_ARROW, + TOKEN_TYPE_OPEN_PAREN, + TOKEN_TYPE_CLOSE_PAREN, + TOKEN_TYPE_OPEN_BRACE, + TOKEN_TYPE_CLOSE_BRACE, + TOKEN_TYPE_OPEN_BRACKET, + TOKEN_TYPE_CLOSE_BRACKET, + + 
TOKEN_TYPE_OP_ADD, + TOKEN_TYPE_OP_SUB, + TOKEN_TYPE_OP_MUL, + TOKEN_TYPE_OP_DIV, + TOKEN_TYPE_OP_MOD, + + TOKEN_TYPE_COUNT +} TokenType; + +static const char* TokenTypeNames[] = { + "TOKEN_TYPE_UNKNOWN", + "TOKEN_TYPE_END_STREAM", + + "TOKEN_TYPE_KEYWORD_STRUCT", + "TOKEN_TYPE_KEYWORD_USE", + "TOKEN_TYPE_KEYWORD_EXPORT", + "TOKEN_TYPE_KEYWORD_IF", + "TOKEN_TYPE_KEYWORD_ELSE", + "TOKEN_TYPE_KEYWORD_FOR", + "TOKEN_TYPE_KEYWORD_RETURN", + + "TOKEN_TYPE_RIGHT_ARROW", + "TOKEN_TYPE_OPEN_PAREN", + "TOKEN_TYPE_CLOSE_PAREN", + "TOKEN_TYPE_OPEN_BRACE", + "TOKEN_TYPE_CLOSE_BRACE", + "TOKEN_TYPE_OPEN_BRACKET", + "TOKEN_TYPE_CLOSE_BRACKET", + + "TOKEN_TYPE_OP_ADD", + "TOKEN_TYPE_OP_SUB", + "TOKEN_TYPE_OP_MUL", + "TOKEN_TYPE_OP_DIV", + "TOKEN_TYPE_OP_MOD", + + "TOKEN_TYPE_COUNT" +}; + +typedef struct Token { + TokenType type; + char* token; + isize length; + u64 line_number, line_column; +} Token; + +b32 token_lit(Tokenizer* tokenizer, Token* tk, char* lit, TokenType type) { + i64 len = chars_match(tokenizer->curr, lit); + if (len > 0) { + tk->type = type; + tk->token = tokenizer->curr; + tk->length = len; + tokenizer->curr += len; + return 1; + } + return 0; +} + +Token get_token(Tokenizer* tokenizer) { + #ifndef LITERAL_TOKEN + #define LITERAL_TOKEN(token, token_type) \ + if (token_lit(tokenizer, &tk, token, token_type)) goto token_parsed; + #endif + + Token tk; + + tk.type = TOKEN_TYPE_UNKNOWN; + tk.token = tokenizer->curr; + tk.length = 1; + tk.line_number = 0; + tk.line_column = 0; + + if (tokenizer->curr == tokenizer->end) { + tk.type = TOKEN_TYPE_END_STREAM; tk.length = 0; /* nothing is left to read, so the end-of-stream token spans zero characters */ + goto token_parsed; + } + + LITERAL_TOKEN("struct", TOKEN_TYPE_KEYWORD_STRUCT); + LITERAL_TOKEN("export", TOKEN_TYPE_KEYWORD_EXPORT); + LITERAL_TOKEN("use", TOKEN_TYPE_KEYWORD_USE); + LITERAL_TOKEN("if", TOKEN_TYPE_KEYWORD_IF); + LITERAL_TOKEN("else", TOKEN_TYPE_KEYWORD_ELSE); /* "else" has its own token type, distinct from "if" */ + LITERAL_TOKEN("for", TOKEN_TYPE_KEYWORD_FOR); + LITERAL_TOKEN("return", TOKEN_TYPE_KEYWORD_RETURN); + LITERAL_TOKEN("->",
TOKEN_TYPE_RIGHT_ARROW); + LITERAL_TOKEN("(", TOKEN_TYPE_OPEN_PAREN); + LITERAL_TOKEN(")", TOKEN_TYPE_CLOSE_PAREN); + LITERAL_TOKEN("{", TOKEN_TYPE_OPEN_BRACE); + LITERAL_TOKEN("}", TOKEN_TYPE_CLOSE_BRACE); + LITERAL_TOKEN("[", TOKEN_TYPE_OPEN_BRACKET); + LITERAL_TOKEN("]", TOKEN_TYPE_CLOSE_BRACKET); + LITERAL_TOKEN("+", TOKEN_TYPE_OP_ADD); + LITERAL_TOKEN("-", TOKEN_TYPE_OP_SUB); + LITERAL_TOKEN("*", TOKEN_TYPE_OP_MUL); + LITERAL_TOKEN("/", TOKEN_TYPE_OP_DIV); + LITERAL_TOKEN("%", TOKEN_TYPE_OP_MOD); + + tokenizer->curr++; // Ignore token + +token_parsed: + return tk; +} + int main(int argc, char *argv[]) { - bh_file demofile; - bh_file_error err = bh_file_open(&demofile, argv[1]); + bh_file source_file; + bh_file_error err = bh_file_open(&source_file, argv[1]); if (err != BH_FILE_ERROR_NONE) { fprintf(stderr, "Failed to open file %s\n", argv[1]); return EXIT_FAILURE; } - bh_file_contents fc = bh_file_read_contents(&demofile); - printf("%ld: %s\n", fc.length, fc.data); + bh_file_contents fc = bh_file_read_contents(&source_file); + bh_file_close(&source_file); - bh_file_contents_delete(&fc); - bh_file_close(&demofile); + Tokenizer tknizer = { + .start = fc.data, + .curr = fc.data, + .end = fc.data + fc.length, + .line_number = 1, + }; - // bh_string test_str = bh_string_new(256); - // bh_string world_str = bh_string_new("World FOO Bar test\n"); + Token tk; + do { + tk = get_token(&tknizer); + char c = *(tk.token + tk.length); + *(tk.token + tk.length) = '\0'; + printf("%s: %s\n", TokenTypeNames[tk.type], tk.token); + *(tk.token + tk.length) = c; + } while (tk.type != TOKEN_TYPE_END_STREAM); - // bh_string_append(&test_str, "Hello Frank!\n"); - // bh_string_replace_at(&test_str, &world_str, 6); - // bh_string_replace_at(&test_str, "Hola ", 0); - // bh_string_insert_at(&test_str, "World", 3); - // bh_string_print(&test_str); - // bh_string trim_str = bh_string_new("abcdeTesting words herezzzz\n \t"); - // bh_string_print(&trim_str); - // 
bh_string_trim_begin(&trim_str, "abcde"); - // bh_string_print(&trim_str); - // bh_string_trim_end_space(&trim_str); - // bh_string_print(&trim_str); - - // bh_string_delete(&test_str); - // bh_string_delete(&world_str); - // bh_string_delete(&trim_str); - - // bh_string file_contents = bh_file_read_contents("path"); + bh_file_contents_delete(&fc); return 0; } diff --git a/progs/demo.onyx b/progs/demo.onyx index ee02d76b..32a37db1 100644 --- a/progs/demo.onyx +++ b/progs/demo.onyx @@ -1,9 +1,11 @@ -use "core"; +/* This is a comment +This is also the only way to do comments +*/ -Foo :: struct { - x, y i32; -} +use "core"; /* Looks for "core.onyx" in the current directory */ -main :: (argc i32, argv []*u8) int { - print("Hello World!"); -} +Foo :: struct { x i32, y i32 }; + +add :: (a i32, b i32) -> i32 { + return a + b; +}; \ No newline at end of file diff --git a/progs/mvp.onyx b/progs/mvp.onyx new file mode 100644 index 00000000..10f24015 --- /dev/null +++ b/progs/mvp.onyx @@ -0,0 +1,14 @@ +/* Comments need to be parsed */ + +export add :: (a: i32, b: i32) -> i32 { + return a + b; +} + +export max :: (a: i32, b: i32) -> i32 { + /* Curly braces are required */ + if a > b { + return a; + } else { + return b; + } +} \ No newline at end of file