Working on the tokenizer

author Brendan Hansen <brendan.f.hansen@gmail.com>

Wed, 6 May 2020 19:58:54 +0000 (14:58 -0500)

committer Brendan Hansen <brendan.f.hansen@gmail.com>

Wed, 6 May 2020 19:58:54 +0000 (14:58 -0500)
author Brendan Hansen <brendan.f.hansen@gmail.com>
Wed, 6 May 2020 19:58:54 +0000 (14:58 -0500)
committer Brendan Hansen <brendan.f.hansen@gmail.com>
Wed, 6 May 2020 19:58:54 +0000 (14:58 -0500)
diff --git a/bh.h b/bh.h

index 16221650efa60ad97d8a7964bfa6f1d13ee225b7..7eb4fd09ea664d26f0ed585d98bf65d37b925b6a 100644 (file)
--- a/bh.h
+++ b/bh.h
@@ -21,6 +21,17 @@ typedef signed int i32;
  typedef signed long i64;
  typedef signed long long i128;
  typedef unsigned long isize;
+typedef i32 b32;
+
+//-------------------------------------------------------------------------------------
+// Better character functions
+//-------------------------------------------------------------------------------------
+inline b32 char_is_alpha(const char a);
+inline b32 char_is_num(const char a);
+inline b32 char_is_alphanum(const char a);
+inline b32 char_is_whitespace(const char a);
+inline b32 char_in_range(const char lo, const char hi, const char a);
+char charset_contains(const char* charset, char ch);
  
  //-------------------------------------------------------------------------------------
  // Better strings
@@ -56,8 +67,8 @@ typedef struct bh_string {
  
  bh_string bh_string_new_cap(unsigned long cap);
  bh_string bh_string_new_str(const char* cstr);
-i32 bh_string_delete(bh_string* str);
-i32 bh_string_ensure_capacity(bh_string* str, u64 cap);
+b32 bh_string_delete(bh_string* str);
+b32 bh_string_ensure_capacity(bh_string* str, u64 cap);
  void bh_string_append_bh_string(bh_string* str1, bh_string* str2);
  void bh_string_append_cstr(bh_string* str1, const char* str2);
  void bh_string_replace_at_bh_string(bh_string* dest, bh_string* src, u64 offset);
@@ -74,7 +85,6 @@ void bh_string_print(bh_string* str);
  //-------------------------------------------------------------------------------------
  // Better files
  //-------------------------------------------------------------------------------------
-
  typedef enum bh_file_error {
         BH_FILE_ERROR_NONE,
         BH_FILE_ERROR_INVALID
@@ -120,10 +130,9 @@ bh_file_error bh_file_create(bh_file* file, char const* filename);
  bh_file_error bh_file_open(bh_file* file, char const* filename);
  bh_file_error bh_file_open_mode(bh_file* file, bh_file_mode mode, const char* filename);
  bh_file_error bh_file_new(bh_file* file, bh_file_descriptor fd, const char* filename);
-i32 bh_file_read_at(bh_file* file, i64 offset, void* buffer, isize buff_size, isize* bytes_read);
-i32 bh_file_write_at(bh_file* file, i64 offset, void const* buffer, isize buff_size, isize* bytes_wrote);
-static i32 bh__file_seek_wrapper(i32 fd, i64 offset, bh_file_whence whence, i64* new_offset);
-i32 bh_file_seek(bh_file* file, i64 offset);
+b32 bh_file_read_at(bh_file* file, i64 offset, void* buffer, isize buff_size, isize* bytes_read);
+b32 bh_file_write_at(bh_file* file, i64 offset, void const* buffer, isize buff_size, isize* bytes_wrote);
+static b32 bh__file_seek_wrapper(i32 fd, i64 offset, bh_file_whence whence, i64* new_offset);
  i64 bh_file_seek_to_end(bh_file* file);
  i64 bh_file_skip(bh_file* file, i64 bytes);
  i64 bh_file_tell(bh_file* file);
@@ -156,6 +165,44 @@ i32 bh_file_contents_delete(bh_file_contents* contents);
  // IMPLEMENTATIONS
  //-------------------------------------------------------------------------------------
  
+//-------------------------------------------------------------------------------------
+// CHAR FUNCTIONS
+//-------------------------------------------------------------------------------------
+inline b32 char_is_alpha(const char a) {
+       return ('a' <= a && a <= 'z') || ('A' <= a && a <= 'Z');
+}
+
+inline b32 char_is_num(const char a) {
+       return ('0' <= a && a <= '9');
+}
+
+inline b32 char_is_alphanum(const char a) {
+       return char_is_alpha(a) || char_is_num(a);
+}
+
+inline b32 char_is_whitespace(const char a) {
+       return charset_contains(" \t\r\n", a);
+}
+
+inline b32 char_in_range(const char lo, const char hi, const char a) { // 1 if lo <= a and a <= hi, else 0
+       return lo <= a && a <= hi; // C does not chain comparisons: `lo <= a <= hi` means `(lo <= a) <= hi`
+}
+
+char charset_contains(const char* charset, char ch) {
+       while (*charset) {
+               if (*charset == ch) return ch;
+               charset++;
+       }
+
+       return 0;
+}
+
+i64 chars_match(char* ptr1, char* ptr2) { // Length of ptr2 if ptr1 begins with ptr2, otherwise 0
+       i64 len = 0;
+       while (*ptr2 != '\0' && *ptr1 == *ptr2) ptr1++, ptr2++, len++; // stop at ptr2's terminator so equal strings are not read past their ends
+       return *ptr2 == '\0' ? len : 0;
+}
+
  //-------------------------------------------------------------------------------------
  // STRING IMPLEMENTATION
  //-------------------------------------------------------------------------------------
@@ -182,14 +229,14 @@ bh_string bh_string_new_str(const char* cstr) {
         return str;
  }
  
-i32 bh_string_delete(bh_string* str) {
+b32 bh_string_delete(bh_string* str) {
         free(str->data);
         str->length = 0;
         str->capacity = 0;
         return 1;
  }
  
-i32 bh_string_ensure_capacity(bh_string* str, u64 cap) {
+b32 bh_string_ensure_capacity(bh_string* str, u64 cap) {
         if (str->capacity >= cap) return 1;
  
         //TODO: This could fail
@@ -217,7 +264,7 @@ void bh_string_append_cstr(bh_string* str1, const char* str2) {
  }
  
  void bh_string_replace_at_bh_string(bh_string* dest, bh_string* src, u64 offset) {
-       if (offset >= dest->length) return;
+       if (offset > dest->length) return;
         if (!bh_string_ensure_capacity(dest, offset + src->length)) return;
  
         memcpy(dest->data + offset, src->data, src->length);
@@ -226,7 +273,7 @@ void bh_string_replace_at_bh_string(bh_string* dest, bh_string* src, u64 offset)
  }
  
  void bh_string_replace_at_cstr(bh_string* dest, const char* src, u64 offset) {
-       if (offset >= dest->length) return;
+       if (offset > dest->length) return;
         const int srclen = strlen(src);
         if (!bh_string_ensure_capacity(dest, offset + srclen)) return;
  
@@ -253,14 +300,6 @@ void bh_string_insert_at_cstr(bh_string* dest, const char* src, u64 offset) {
         dest->length += srclen;
  }
  
-static inline u8 charset_contains(const char* charset, char ch) {
-       while (*charset) {
-               if (*charset == ch) return *charset;
-               charset++;
-       }
-
-       return 0;
-}
  
  void bh_string_trim_end(bh_string* str, const char* charset) {
         while (charset_contains(charset, str->data[str->length - 1]))
@@ -362,14 +401,14 @@ bh_file_error bh_file_new(bh_file* file, bh_file_descriptor fd, const char* file
         return BH_FILE_ERROR_NONE;
  }
  
-i32 bh_file_read_at(bh_file* file, i64 offset, void* buffer, isize buff_size, isize* bytes_read) {
+b32 bh_file_read_at(bh_file* file, i64 offset, void* buffer, isize buff_size, isize* bytes_read) {
         isize res = pread(file->fd, buffer, buff_size, offset);
         if (res < 0) return 0;
         if (bytes_read) *bytes_read = res;
         return 1;
  }
  
-i32 bh_file_write_at(bh_file* file, i64 offset, void const* buffer, isize buff_size, isize* bytes_wrote) {
+b32 bh_file_write_at(bh_file* file, i64 offset, void const* buffer, isize buff_size, isize* bytes_wrote) {
         isize res;
         i64 current_offset = 0;
         bh__file_seek_wrapper(file->fd, offset, BH_FILE_WHENCE_CURRENT, &current_offset);
@@ -385,7 +424,7 @@ i32 bh_file_write_at(bh_file* file, i64 offset, void const* buffer, isize buff_s
         return 1;
  }
  
-static i32 bh__file_seek_wrapper(i32 fd, i64 offset, bh_file_whence whence, i64* new_offset) {
+static b32 bh__file_seek_wrapper(i32 fd, i64 offset, bh_file_whence whence, i64* new_offset) {
         i64 res = lseek(fd, offset, whence);
         if (res < 0) return 0;
         if (new_offset) *new_offset = res;
@@ -426,11 +465,11 @@ bh_file_error bh_file_close(bh_file* file) {
         return err;
  }
  
-i32 bh_file_read(bh_file* file, void* buffer, isize buff_size) {
+b32 bh_file_read(bh_file* file, void* buffer, isize buff_size) {
         return bh_file_read_at(file, bh_file_tell(file), buffer, buff_size, NULL);
  }
  
-i32 bh_file_write(bh_file* file, void* buffer, isize buff_size) {
+b32 bh_file_write(bh_file* file, void* buffer, isize buff_size) {
         return bh_file_write_at(file, bh_file_tell(file), buffer, buff_size, NULL);
  }
  
@@ -465,7 +504,7 @@ bh_file_contents bh_file_read_contents_direct(const char* filename) {
         return fc;
  }
  
-i32 bh_file_contents_delete(bh_file_contents* contents) {
+b32 bh_file_contents_delete(bh_file_contents* contents) {
         free(contents->data);
         contents->length = 0;
         return 1;
diff --git a/docs/plan b/docs/plan

new file mode 100644 (file)

index 0000000..70cd353
--- /dev/null
+++ b/docs/plan
@@ -0,0 +1,56 @@
+The ONYX Programming Language
+-----------------------------
+
+WHAT:
+       ONYX is a low-ish level programming language designed for use with
+       Web-Assembly 32-bit (WASM). It offers some advanced features such
+       as comptime code execution and JS literals for external functions.
+
+WHY:
+       ONYX was made to help me learn about compiler design.
+
+FEATURES:
+       - Strong type system
+       - Smart package loading
+       - Structs and enums
+       - functions (no anonymous functions)
+       - Control structures
+               if, for, switch
+       - pointers
+       - inferred typing
+       - defer
+       ? polymorphic functions
+
+EXAMPLE CODE:
+
+// This is a comment
+// This is also the only way to do comments
+
+use "core"; // Looks for "core.onyx" in the current directory
+
+Foo :: struct { x: i32, y: i32 };
+
+export add :: (a: i32, b: i32) -> i32 {
+       return a + b;   
+};
+
+foo :: (a: i32) -> Foo {
+       return Foo { x = a, y = 0 };
+}
+
+MVP CODE:
+
+// Comments need to be parsed
+
+export add :: (a: i32, b: i32) -> i32 {
+       return a + b;
+}
+
+export max :: (a: i32, b: i32) -> i32 {
+       // Curly braces are required
+       if a > b {
+               return a;
+       } else {
+               return b;
+       }
+}
+\ No newline at end of file
diff --git a/onyx b/onyx

index 272ceb6aab5bb2c5260ac4a5b1ecf709eec5ac1a..2d5a2d93f7c2ed09c2d0dd54a2cc0bc6f90e9a32 100755 (executable)

Binary files a/onyx and b/onyx differ
diff --git a/onyx.c b/onyx.c

index 60d8ea9687545d5d1adae89bc25dde5ded3db660..970c85c843aad13a2711576da41e4702411dc3c3 100644 (file)
--- a/onyx.c
+++ b/onyx.c
@@ -2,41 +2,162 @@
  #include <stdlib.h> // TODO: Replace with custom lib
  #include "bh.h"
  
+typedef struct Tokenizer {
+       char *start, *curr, *end;
+       u64 line_number;
+} Tokenizer;
+
+typedef enum TokenType {
+       TOKEN_TYPE_UNKNOWN,
+       TOKEN_TYPE_END_STREAM,
+
+       TOKEN_TYPE_KEYWORD_STRUCT,
+       TOKEN_TYPE_KEYWORD_USE,
+       TOKEN_TYPE_KEYWORD_EXPORT,
+       TOKEN_TYPE_KEYWORD_IF,
+       TOKEN_TYPE_KEYWORD_ELSE,
+       TOKEN_TYPE_KEYWORD_FOR,
+       TOKEN_TYPE_KEYWORD_RETURN,
+
+       TOKEN_TYPE_RIGHT_ARROW,
+       TOKEN_TYPE_OPEN_PAREN,
+       TOKEN_TYPE_CLOSE_PAREN,
+       TOKEN_TYPE_OPEN_BRACE,
+       TOKEN_TYPE_CLOSE_BRACE,
+       TOKEN_TYPE_OPEN_BRACKET,
+       TOKEN_TYPE_CLOSE_BRACKET,
+
+       TOKEN_TYPE_OP_ADD,
+       TOKEN_TYPE_OP_SUB,
+       TOKEN_TYPE_OP_MUL,
+       TOKEN_TYPE_OP_DIV,
+       TOKEN_TYPE_OP_MOD,
+
+       TOKEN_TYPE_COUNT
+} TokenType;
+
+static const char* TokenTypeNames[] = {
+       "TOKEN_TYPE_UNKNOWN",
+       "TOKEN_TYPE_END_STREAM",
+
+       "TOKEN_TYPE_KEYWORD_STRUCT",
+       "TOKEN_TYPE_KEYWORD_USE",
+       "TOKEN_TYPE_KEYWORD_EXPORT",
+       "TOKEN_TYPE_KEYWORD_IF",
+       "TOKEN_TYPE_KEYWORD_ELSE",
+       "TOKEN_TYPE_KEYWORD_FOR",
+       "TOKEN_TYPE_KEYWORD_RETURN",
+
+       "TOKEN_TYPE_RIGHT_ARROW",
+       "TOKEN_TYPE_OPEN_PAREN",
+       "TOKEN_TYPE_CLOSE_PAREN",
+       "TOKEN_TYPE_OPEN_BRACE",
+       "TOKEN_TYPE_CLOSE_BRACE",
+       "TOKEN_TYPE_OPEN_BRACKET",
+       "TOKEN_TYPE_CLOSE_BRACKET",
+
+       "TOKEN_TYPE_OP_ADD",
+       "TOKEN_TYPE_OP_SUB",
+       "TOKEN_TYPE_OP_MUL",
+       "TOKEN_TYPE_OP_DIV",
+       "TOKEN_TYPE_OP_MOD",
+
+       "TOKEN_TYPE_COUNT"
+};
+
+typedef struct Token {
+       TokenType type;
+       char* token;
+       isize length;
+       u64 line_number, line_column;
+} Token;
+
+b32 token_lit(Tokenizer* tokenizer, Token* tk, char* lit, TokenType type) {
+       i64 len = chars_match(tokenizer->curr, lit);
+       if (len > 0) {
+               tk->type = type;
+               tk->token = tokenizer->curr;
+               tk->length = len;
+               tokenizer->curr += len;
+               return 1;
+       }
+       return 0;
+}
+
+Token get_token(Tokenizer* tokenizer) { // Returns the token at tokenizer->curr; curr is advanced unless the stream has ended
+       #ifndef LITERAL_TOKEN
+       #define LITERAL_TOKEN(token, token_type) \
+               if (token_lit(tokenizer, &tk, token, token_type)) goto token_parsed;
+       #endif
+
+       Token tk;
+
+       tk.type = TOKEN_TYPE_UNKNOWN; // default when no literal below matches
+       tk.token = tokenizer->curr;
+       tk.length = 1;
+       tk.line_number = 0; // line and column are not tracked by this function yet
+       tk.line_column = 0;
+
+       if (tokenizer->curr == tokenizer->end) {
+               tk.type = TOKEN_TYPE_END_STREAM;
+               goto token_parsed;
+       }
+
+       LITERAL_TOKEN("struct", TOKEN_TYPE_KEYWORD_STRUCT);
+       LITERAL_TOKEN("export", TOKEN_TYPE_KEYWORD_EXPORT);
+       LITERAL_TOKEN("use", TOKEN_TYPE_KEYWORD_USE);
+       LITERAL_TOKEN("if", TOKEN_TYPE_KEYWORD_IF);
+       LITERAL_TOKEN("else", TOKEN_TYPE_KEYWORD_ELSE);
+       LITERAL_TOKEN("for", TOKEN_TYPE_KEYWORD_FOR);
+       LITERAL_TOKEN("return", TOKEN_TYPE_KEYWORD_RETURN);
+       LITERAL_TOKEN("->", TOKEN_TYPE_RIGHT_ARROW); // tried before "-" so the arrow is not split into OP_SUB
+       LITERAL_TOKEN("(", TOKEN_TYPE_OPEN_PAREN);
+       LITERAL_TOKEN(")", TOKEN_TYPE_CLOSE_PAREN);
+       LITERAL_TOKEN("{", TOKEN_TYPE_OPEN_BRACE);
+       LITERAL_TOKEN("}", TOKEN_TYPE_CLOSE_BRACE);
+       LITERAL_TOKEN("[", TOKEN_TYPE_OPEN_BRACKET);
+       LITERAL_TOKEN("]", TOKEN_TYPE_CLOSE_BRACKET);
+       LITERAL_TOKEN("+", TOKEN_TYPE_OP_ADD);
+       LITERAL_TOKEN("-", TOKEN_TYPE_OP_SUB);
+       LITERAL_TOKEN("*", TOKEN_TYPE_OP_MUL);
+       LITERAL_TOKEN("/", TOKEN_TYPE_OP_DIV);
+       LITERAL_TOKEN("%", TOKEN_TYPE_OP_MOD);
+
+       tokenizer->curr++; // Ignore token
+
+token_parsed:
+       return tk;
+}
+
  int main(int argc, char *argv[]) {
-       bh_file demofile;
-       bh_file_error err = bh_file_open(&demofile, argv[1]);
+       bh_file source_file;
+       bh_file_error err = bh_file_open(&source_file, argv[1]);
         if (err != BH_FILE_ERROR_NONE) {
                 fprintf(stderr, "Failed to open file %s\n", argv[1]);
                 return EXIT_FAILURE;
         }
  
-       bh_file_contents fc = bh_file_read_contents(&demofile);
-       printf("%ld: %s\n", fc.length, fc.data);
+       bh_file_contents fc = bh_file_read_contents(&source_file);
+       bh_file_close(&source_file);
  
-       bh_file_contents_delete(&fc);
-       bh_file_close(&demofile);
+       Tokenizer tknizer = {
+               .start = fc.data,
+               .curr = fc.data,
+               .end = fc.data + fc.length,
+               .line_number = 1,
+       };
  
-       // bh_string test_str = bh_string_new(256);
-       // bh_string world_str = bh_string_new("World FOO Bar test\n");
+       Token tk;
+       do {
+               tk = get_token(&tknizer);
+               char c = *(tk.token + tk.length);
+               *(tk.token + tk.length) = '\0';
+               printf("%s: %s\n", TokenTypeNames[tk.type], tk.token);
+               *(tk.token + tk.length) = c;
+       } while (tk.type != TOKEN_TYPE_END_STREAM);
  
-       // bh_string_append(&test_str, "Hello Frank!\n");
-       // bh_string_replace_at(&test_str, &world_str, 6);
-       // bh_string_replace_at(&test_str, "Hola ", 0);
-       // bh_string_insert_at(&test_str, "World", 3);
-       // bh_string_print(&test_str);
  
-       // bh_string trim_str = bh_string_new("abcdeTesting words herezzzz\n   \t");
-       // bh_string_print(&trim_str);
-       // bh_string_trim_begin(&trim_str, "abcde");
-       // bh_string_print(&trim_str);
-       // bh_string_trim_end_space(&trim_str);
-       // bh_string_print(&trim_str);
-
-       // bh_string_delete(&test_str);
-       // bh_string_delete(&world_str);
-       // bh_string_delete(&trim_str);
-
-       // bh_string file_contents = bh_file_read_contents("path");
+       bh_file_contents_delete(&fc);
  
         return 0;
  }
diff --git a/progs/demo.onyx b/progs/demo.onyx

index ee02d76bce162d6cb1874fe136206ce8b14590f9..32a37db1518370113d66ebf747aff353ca6193fa 100644 (file)
--- a/progs/demo.onyx
+++ b/progs/demo.onyx
@@ -1,9 +1,11 @@
-use "core";
+/* This is a comment
+This is also the only way to do comments
+*/
  
-Foo :: struct {
-       x, y i32;
-}
+use "core"; /* Looks for "core.onyx" in the current directory */
  
-main :: (argc i32, argv []*u8) int {
-       print("Hello World!");
-}
+Foo :: struct { x i32, y i32 };
+
+add :: (a i32, b i32) -> i32 {
+       return a + b;   
+};
+\ No newline at end of file
diff --git a/progs/mvp.onyx b/progs/mvp.onyx

new file mode 100644 (file)

index 0000000..10f2401
--- /dev/null
+++ b/progs/mvp.onyx
@@ -0,0 +1,14 @@
+/* Comments need to be parsed */
+
+export add :: (a: i32, b: i32) -> i32 {
+       return a + b;
+}
+
+export max :: (a: i32, b: i32) -> i32 {
+       /* Curly braces are required */
+       if a > b {
+               return a;
+       } else {
+               return b;
+       }
+}
+\ No newline at end of file
author	Brendan Hansen <brendan.f.hansen@gmail.com>
	Wed, 6 May 2020 19:58:54 +0000 (14:58 -0500)
committer	Brendan Hansen <brendan.f.hansen@gmail.com>
	Wed, 6 May 2020 19:58:54 +0000 (14:58 -0500)
bh.h		patch \| blob \| history
docs/plan	[new file with mode: 0644]	patch \| blob
onyx		patch \| blob \| history
onyx.c		patch \| blob \| history
progs/demo.onyx		patch \| blob \| history
progs/mvp.onyx	[new file with mode: 0644]	patch \| blob