From: Brendan Hansen <brendan.f.hansen@gmail.com>
Date: Wed, 6 May 2020 19:58:54 +0000 (-0500)
Subject: Working on the tokenizer
X-Git-Url: https://git.brendanfh.com/?a=commitdiff_plain;h=5a26feff68cfc3febda691584b951e97e7adf2cf;p=onyx.git

Working on the tokenizer
---

diff --git a/bh.h b/bh.h
index 16221650..7eb4fd09 100644
--- a/bh.h
+++ b/bh.h
@@ -21,6 +21,17 @@ typedef signed int i32;
 typedef signed long i64;
 typedef signed long long i128;
 typedef unsigned long isize;
+typedef i32 b32;
+
+//-------------------------------------------------------------------------------------
+// Better character functions
+//-------------------------------------------------------------------------------------
+static inline b32 char_is_alpha(const char a); // `static`: a plain C99 `inline` definition emits no external symbol, so a call that is not inlined would fail to link
+static inline b32 char_is_num(const char a);
+static inline b32 char_is_alphanum(const char a);
+static inline b32 char_is_whitespace(const char a);
+static inline b32 char_in_range(const char lo, const char hi, const char a);
+char charset_contains(const char* charset, char ch);
 
 //-------------------------------------------------------------------------------------
 // Better strings
@@ -56,8 +67,8 @@ typedef struct bh_string {
 
 bh_string bh_string_new_cap(unsigned long cap);
 bh_string bh_string_new_str(const char* cstr);
-i32 bh_string_delete(bh_string* str);
-i32 bh_string_ensure_capacity(bh_string* str, u64 cap);
+b32 bh_string_delete(bh_string* str);
+b32 bh_string_ensure_capacity(bh_string* str, u64 cap);
 void bh_string_append_bh_string(bh_string* str1, bh_string* str2);
 void bh_string_append_cstr(bh_string* str1, const char* str2);
 void bh_string_replace_at_bh_string(bh_string* dest, bh_string* src, u64 offset);
@@ -74,7 +85,6 @@ void bh_string_print(bh_string* str);
 //-------------------------------------------------------------------------------------
 // Better files
 //-------------------------------------------------------------------------------------
-
 typedef enum bh_file_error {
 	BH_FILE_ERROR_NONE,
 	BH_FILE_ERROR_INVALID
@@ -120,10 +130,9 @@ bh_file_error bh_file_create(bh_file* file, char const* filename);
 bh_file_error bh_file_open(bh_file* file, char const* filename);
 bh_file_error bh_file_open_mode(bh_file* file, bh_file_mode mode, const char* filename);
 bh_file_error bh_file_new(bh_file* file, bh_file_descriptor fd, const char* filename);
-i32 bh_file_read_at(bh_file* file, i64 offset, void* buffer, isize buff_size, isize* bytes_read);
-i32 bh_file_write_at(bh_file* file, i64 offset, void const* buffer, isize buff_size, isize* bytes_wrote);
-static i32 bh__file_seek_wrapper(i32 fd, i64 offset, bh_file_whence whence, i64* new_offset);
-i32 bh_file_seek(bh_file* file, i64 offset);
+b32 bh_file_read_at(bh_file* file, i64 offset, void* buffer, isize buff_size, isize* bytes_read);
+b32 bh_file_write_at(bh_file* file, i64 offset, void const* buffer, isize buff_size, isize* bytes_wrote);
+static b32 bh__file_seek_wrapper(i32 fd, i64 offset, bh_file_whence whence, i64* new_offset);
 i64 bh_file_seek_to_end(bh_file* file);
 i64 bh_file_skip(bh_file* file, i64 bytes);
 i64 bh_file_tell(bh_file* file);
@@ -156,6 +165,44 @@ i32 bh_file_contents_delete(bh_file_contents* contents);
 // IMPLEMENTATIONS
 //-------------------------------------------------------------------------------------
 
+//-------------------------------------------------------------------------------------
+// CHAR FUNCTIONS
+//-------------------------------------------------------------------------------------
+inline b32 char_is_alpha(const char a) {
+	return (a >= 'a' && a <= 'z') || (a >= 'A' && a <= 'Z'); // true for 'a'..'z' and 'A'..'Z'
+}
+
+inline b32 char_is_num(const char a) {
+	return a >= '0' && a <= '9'; // true for the decimal digits '0'..'9'
+}
+
+inline b32 char_is_alphanum(const char a) {
+	return char_is_num(a) || char_is_alpha(a); // a digit or a letter; both checks are pure, so their order is irrelevant
+}
+
+inline b32 char_is_whitespace(const char a) {
+	return charset_contains(" \t\r\n", a); // nonzero result is the matched character itself (not 1); '\0' yields 0
+}
+
+inline b32 char_in_range(const char lo, const char hi, const char a) {
+	return lo <= a && a <= hi; // C does not chain comparisons: `lo <= a <= hi` parses as `(lo <= a) <= hi`
+}
+
+char charset_contains(const char* charset, char ch) {
+	// Scan up to (not including) the NUL terminator; a hit yields `ch` itself.
+	for (const char* it = charset; *it != '\0'; it++) {
+		if (*it == ch) return ch;
+	}
+
+	return 0; // no match (also the result when ch is '\0')
+}
+
+i64 chars_match(char* ptr1, char* ptr2) {
+	i64 len = 0; // result: the length of ptr2 when ptr2 is a prefix of ptr1, otherwise 0
+	while (*ptr2 != '\0' && *ptr1 == *ptr2) ptr1++, ptr2++, len++; // stop at ptr2's NUL so equal strings are not compared past their terminators
+	return *ptr2 == '\0' ? len : 0;
+}
+
 //-------------------------------------------------------------------------------------
 // STRING IMPLEMENTATION
 //-------------------------------------------------------------------------------------
@@ -182,14 +229,14 @@ bh_string bh_string_new_str(const char* cstr) {
 	return str;
 }
 
-i32 bh_string_delete(bh_string* str) {
+b32 bh_string_delete(bh_string* str) {
 	free(str->data);
 	str->length = 0;
 	str->capacity = 0;
 	return 1;
 }
 
-i32 bh_string_ensure_capacity(bh_string* str, u64 cap) {
+b32 bh_string_ensure_capacity(bh_string* str, u64 cap) {
 	if (str->capacity >= cap) return 1;
 
 	//TODO: This could fail
@@ -217,7 +264,7 @@ void bh_string_append_cstr(bh_string* str1, const char* str2) {
 }
 
 void bh_string_replace_at_bh_string(bh_string* dest, bh_string* src, u64 offset) {
-	if (offset >= dest->length) return;
+	if (offset > dest->length) return;
 	if (!bh_string_ensure_capacity(dest, offset + src->length)) return;
 
 	memcpy(dest->data + offset, src->data, src->length);
@@ -226,7 +273,7 @@ void bh_string_replace_at_bh_string(bh_string* dest, bh_string* src, u64 offset)
 }
 
 void bh_string_replace_at_cstr(bh_string* dest, const char* src, u64 offset) {
-	if (offset >= dest->length) return;
+	if (offset > dest->length) return;
 	const int srclen = strlen(src);
 	if (!bh_string_ensure_capacity(dest, offset + srclen)) return;
 
@@ -253,14 +300,6 @@ void bh_string_insert_at_cstr(bh_string* dest, const char* src, u64 offset) {
 	dest->length += srclen;
 }
 
-static inline u8 charset_contains(const char* charset, char ch) {
-	while (*charset) {
-		if (*charset == ch) return *charset;
-		charset++;
-	}
-
-	return 0;
-}
 
 void bh_string_trim_end(bh_string* str, const char* charset) {
 	while (charset_contains(charset, str->data[str->length - 1]))
@@ -362,14 +401,14 @@ bh_file_error bh_file_new(bh_file* file, bh_file_descriptor fd, const char* file
 	return BH_FILE_ERROR_NONE;
 }
 
-i32 bh_file_read_at(bh_file* file, i64 offset, void* buffer, isize buff_size, isize* bytes_read) {
+b32 bh_file_read_at(bh_file* file, i64 offset, void* buffer, isize buff_size, isize* bytes_read) {
 	isize res = pread(file->fd, buffer, buff_size, offset);
 	if (res < 0) return 0;
 	if (bytes_read) *bytes_read = res;
 	return 1;
 }
 
-i32 bh_file_write_at(bh_file* file, i64 offset, void const* buffer, isize buff_size, isize* bytes_wrote) {
+b32 bh_file_write_at(bh_file* file, i64 offset, void const* buffer, isize buff_size, isize* bytes_wrote) {
 	isize res;
 	i64 current_offset = 0;
 	bh__file_seek_wrapper(file->fd, offset, BH_FILE_WHENCE_CURRENT, &current_offset);
@@ -385,7 +424,7 @@ i32 bh_file_write_at(bh_file* file, i64 offset, void const* buffer, isize buff_s
 	return 1;
 }
 
-static i32 bh__file_seek_wrapper(i32 fd, i64 offset, bh_file_whence whence, i64* new_offset) {
+static b32 bh__file_seek_wrapper(i32 fd, i64 offset, bh_file_whence whence, i64* new_offset) {
 	i64 res = lseek(fd, offset, whence);
 	if (res < 0) return 0;
 	if (new_offset) *new_offset = res;
@@ -426,11 +465,11 @@ bh_file_error bh_file_close(bh_file* file) {
 	return err;
 }
 
-i32 bh_file_read(bh_file* file, void* buffer, isize buff_size) {
+b32 bh_file_read(bh_file* file, void* buffer, isize buff_size) {
 	return bh_file_read_at(file, bh_file_tell(file), buffer, buff_size, NULL);
 }
 
-i32 bh_file_write(bh_file* file, void* buffer, isize buff_size) {
+b32 bh_file_write(bh_file* file, void* buffer, isize buff_size) {
 	return bh_file_write_at(file, bh_file_tell(file), buffer, buff_size, NULL);
 }
 
@@ -465,7 +504,7 @@ bh_file_contents bh_file_read_contents_direct(const char* filename) {
 	return fc;
 }
 
-i32 bh_file_contents_delete(bh_file_contents* contents) {
+b32 bh_file_contents_delete(bh_file_contents* contents) {
 	free(contents->data);
 	contents->length = 0;
 	return 1;
diff --git a/docs/plan b/docs/plan
new file mode 100644
index 00000000..70cd353e
--- /dev/null
+++ b/docs/plan
@@ -0,0 +1,56 @@
+The ONYX Programming Language
+-----------------------------
+
+WHAT:
+	ONYX is a low-ish level programming language designed for use with
+	32-bit WebAssembly (WASM). It offers some advanced features, such
+	as comptime code execution and JS literals for external functions.
+
+WHY:
+	ONYX was made to help me learn about compiler design.
+
+FEATURES:
+	- Strong type system
+	- Smart package loading
+	- Structs and enums
+	- functions (no anonymous functions)
+	- Control structures
+		if, for, switch
+	- pointers
+	- inferred typing
+	- defer
+	? polymorphic functions
+
+EXAMPLE CODE:
+
+// This is a comment
+// This is also the only way to do comments
+
+use "core"; // Looks for "core.onyx" in the current directory
+
+Foo :: struct { x: i32, y: i32 };
+
+export add :: (a: i32, b: i32) -> i32 {
+	return a + b;	
+};
+
+foo :: (a: i32) -> Foo {
+	return Foo { x = a, y = 0 };
+}
+
+MVP CODE:
+
+// Comments need to be parsed
+
+export add :: (a: i32, b: i32) -> i32 {
+	return a + b;
+}
+
+export max :: (a: i32, b: i32) -> i32 {
+	// Curly braces are required
+	if a > b {
+		return a;
+	} else {
+		return b;
+	}
+}
\ No newline at end of file
diff --git a/onyx b/onyx
index 272ceb6a..2d5a2d93 100755
Binary files a/onyx and b/onyx differ
diff --git a/onyx.c b/onyx.c
index 60d8ea96..970c85c8 100644
--- a/onyx.c
+++ b/onyx.c
@@ -2,41 +2,162 @@
 #include <stdlib.h> // TODO: Replace with custom lib
 #include "bh.h"
 
+typedef struct Tokenizer { // Cursor state for scanning a source buffer; main points start/curr at the file data and end at data + length
+	char *start, *curr, *end; // curr is the scan position advanced by get_token/token_lit; get_token reports END_STREAM when curr == end
+	u64 line_number; // initialised to 1 by main; get_token does not update it
+} Tokenizer;
+
+typedef enum TokenType { // Token kinds produced by get_token; values index TokenTypeNames, so both lists must keep the same order
+	TOKEN_TYPE_UNKNOWN, // default for a character that matches no literal
+	TOKEN_TYPE_END_STREAM, // cursor reached tokenizer->end
+
+	TOKEN_TYPE_KEYWORD_STRUCT,
+	TOKEN_TYPE_KEYWORD_USE,
+	TOKEN_TYPE_KEYWORD_EXPORT,
+	TOKEN_TYPE_KEYWORD_IF,
+	TOKEN_TYPE_KEYWORD_ELSE,
+	TOKEN_TYPE_KEYWORD_FOR,
+	TOKEN_TYPE_KEYWORD_RETURN,
+
+	TOKEN_TYPE_RIGHT_ARROW, // "->"
+	TOKEN_TYPE_OPEN_PAREN, // "("
+	TOKEN_TYPE_CLOSE_PAREN, // ")"
+	TOKEN_TYPE_OPEN_BRACE, // "{"
+	TOKEN_TYPE_CLOSE_BRACE, // "}"
+	TOKEN_TYPE_OPEN_BRACKET, // "["
+	TOKEN_TYPE_CLOSE_BRACKET, // "]"
+
+	TOKEN_TYPE_OP_ADD, // "+"
+	TOKEN_TYPE_OP_SUB, // "-"
+	TOKEN_TYPE_OP_MUL, // "*"
+	TOKEN_TYPE_OP_DIV, // "/"
+	TOKEN_TYPE_OP_MOD, // "%"
+
+	TOKEN_TYPE_COUNT // number of token types above; keep as the last enumerator
+} TokenType;
+
+static const char* TokenTypeNames[] = { // Printable name for each TokenType, indexed by the enum value (used by the printf in main)
+	"TOKEN_TYPE_UNKNOWN",
+	"TOKEN_TYPE_END_STREAM",
+
+	"TOKEN_TYPE_KEYWORD_STRUCT",
+	"TOKEN_TYPE_KEYWORD_USE",
+	"TOKEN_TYPE_KEYWORD_EXPORT",
+	"TOKEN_TYPE_KEYWORD_IF",
+	"TOKEN_TYPE_KEYWORD_ELSE",
+	"TOKEN_TYPE_KEYWORD_FOR",
+	"TOKEN_TYPE_KEYWORD_RETURN",
+
+	"TOKEN_TYPE_RIGHT_ARROW",
+	"TOKEN_TYPE_OPEN_PAREN",
+	"TOKEN_TYPE_CLOSE_PAREN",
+	"TOKEN_TYPE_OPEN_BRACE",
+	"TOKEN_TYPE_CLOSE_BRACE",
+	"TOKEN_TYPE_OPEN_BRACKET",
+	"TOKEN_TYPE_CLOSE_BRACKET",
+
+	"TOKEN_TYPE_OP_ADD",
+	"TOKEN_TYPE_OP_SUB",
+	"TOKEN_TYPE_OP_MUL",
+	"TOKEN_TYPE_OP_DIV",
+	"TOKEN_TYPE_OP_MOD",
+
+	"TOKEN_TYPE_COUNT" // entry for the sentinel, so the table has TOKEN_TYPE_COUNT + 1 strings
+};
+
+typedef struct Token { // One token as returned by get_token
+	TokenType type;
+	char* token; // points into the tokenizer's buffer at the token's first character (not a copy, not NUL-terminated)
+	isize length; // number of characters the token spans
+	u64 line_number, line_column; // get_token currently always writes 0 to both
+} Token;
+
+b32 token_lit(Tokenizer* tokenizer, Token* tk, char* lit, TokenType type) {
+	// chars_match yields 0 unless `lit` sits at the cursor, so bail out early on a miss (leaving *tk untouched).
+	const i64 matched = chars_match(tokenizer->curr, lit);
+	if (matched <= 0) return 0;
+	// Hit: describe the token in *tk, then move the cursor past the literal.
+	tk->token = tokenizer->curr;
+	tk->type = type;
+	tk->length = matched;
+	tokenizer->curr += matched;
+	return 1;
+}
+
+// Scans one token at tokenizer->curr and advances past it; a character matching no literal is consumed as a 1-char TOKEN_TYPE_UNKNOWN token.
+Token get_token(Tokenizer* tokenizer) {
+	#ifndef LITERAL_TOKEN
+	#define LITERAL_TOKEN(token, token_type) \
+		if (token_lit(tokenizer, &tk, token, token_type)) goto token_parsed;
+	#endif
+	Token tk;
+	tk.type = TOKEN_TYPE_UNKNOWN;
+	tk.token = tokenizer->curr;
+	tk.length = 1;
+	tk.line_number = 0; // line/column are not tracked by this function; both are always 0
+	tk.line_column = 0;
+
+	if (tokenizer->curr == tokenizer->end) {
+		tk.type = TOKEN_TYPE_END_STREAM;
+		tk.length = 0; // covers no characters, so token + length stays at `end` instead of one past it
+		goto token_parsed;
+	}
+
+	LITERAL_TOKEN("struct", TOKEN_TYPE_KEYWORD_STRUCT);
+	LITERAL_TOKEN("export", TOKEN_TYPE_KEYWORD_EXPORT);
+	LITERAL_TOKEN("use", TOKEN_TYPE_KEYWORD_USE);
+	LITERAL_TOKEN("if", TOKEN_TYPE_KEYWORD_IF);
+	LITERAL_TOKEN("else", TOKEN_TYPE_KEYWORD_ELSE); // was tagged KEYWORD_IF, making `else` indistinguishable from `if`
+	LITERAL_TOKEN("for", TOKEN_TYPE_KEYWORD_FOR);
+	LITERAL_TOKEN("return", TOKEN_TYPE_KEYWORD_RETURN);
+	LITERAL_TOKEN("->", TOKEN_TYPE_RIGHT_ARROW); // literals are tried in order, so "->" must come before "-"
+	LITERAL_TOKEN("(", TOKEN_TYPE_OPEN_PAREN);
+	LITERAL_TOKEN(")", TOKEN_TYPE_CLOSE_PAREN);
+	LITERAL_TOKEN("{", TOKEN_TYPE_OPEN_BRACE);
+	LITERAL_TOKEN("}", TOKEN_TYPE_CLOSE_BRACE);
+	LITERAL_TOKEN("[", TOKEN_TYPE_OPEN_BRACKET);
+	LITERAL_TOKEN("]", TOKEN_TYPE_CLOSE_BRACKET);
+	LITERAL_TOKEN("+", TOKEN_TYPE_OP_ADD);
+	LITERAL_TOKEN("-", TOKEN_TYPE_OP_SUB);
+	LITERAL_TOKEN("*", TOKEN_TYPE_OP_MUL);
+	LITERAL_TOKEN("/", TOKEN_TYPE_OP_DIV);
+	LITERAL_TOKEN("%", TOKEN_TYPE_OP_MOD);
+
+	tokenizer->curr++; // Ignore token
+
+token_parsed:
+	return tk;
+}
+
 int main(int argc, char *argv[]) {
-	bh_file demofile;
-	bh_file_error err = bh_file_open(&demofile, argv[1]);
+	bh_file source_file;
+	bh_file_error err = bh_file_open(&source_file, argv[1]);
 	if (err != BH_FILE_ERROR_NONE) {
 		fprintf(stderr, "Failed to open file %s\n", argv[1]);
 		return EXIT_FAILURE;
 	}
 
-	bh_file_contents fc = bh_file_read_contents(&demofile);
-	printf("%ld: %s\n", fc.length, fc.data);
+	bh_file_contents fc = bh_file_read_contents(&source_file);
+	bh_file_close(&source_file);
 
-	bh_file_contents_delete(&fc);
-	bh_file_close(&demofile);
+	Tokenizer tknizer = {
+		.start = fc.data,
+		.curr = fc.data,
+		.end = fc.data + fc.length,
+		.line_number = 1,
+	};
 
-	// bh_string test_str = bh_string_new(256);
-	// bh_string world_str = bh_string_new("World FOO Bar test\n");
+	Token tk; // print every token, one per line, until (and including) END_STREAM
+	do {
+		tk = get_token(&tknizer); // NOTE(review): for END_STREAM, tk.token == tknizer.end, so the byte touched below lies outside [start, end) - confirm the buffer has room
+		char c = *(tk.token + tk.length); // save the byte just past the token...
+		*(tk.token + tk.length) = '\0'; // ...and NUL it so "%s" prints only this token
+		printf("%s: %s\n", TokenTypeNames[tk.type], tk.token);
+		*(tk.token + tk.length) = c; // restore the source buffer
+	} while (tk.type != TOKEN_TYPE_END_STREAM);
 
-	// bh_string_append(&test_str, "Hello Frank!\n");
-	// bh_string_replace_at(&test_str, &world_str, 6);
-	// bh_string_replace_at(&test_str, "Hola ", 0);
-	// bh_string_insert_at(&test_str, "World", 3);
-	// bh_string_print(&test_str);
 
-	// bh_string trim_str = bh_string_new("abcdeTesting words herezzzz\n   \t");
-	// bh_string_print(&trim_str);
-	// bh_string_trim_begin(&trim_str, "abcde");
-	// bh_string_print(&trim_str);
-	// bh_string_trim_end_space(&trim_str);
-	// bh_string_print(&trim_str);
-
-	// bh_string_delete(&test_str);
-	// bh_string_delete(&world_str);
-	// bh_string_delete(&trim_str);
-
-	// bh_string file_contents = bh_file_read_contents("path");
+	bh_file_contents_delete(&fc);
 
 	return 0;
 }
diff --git a/progs/demo.onyx b/progs/demo.onyx
index ee02d76b..32a37db1 100644
--- a/progs/demo.onyx
+++ b/progs/demo.onyx
@@ -1,9 +1,11 @@
-use "core";
+/* This is a comment
+This is also the only way to do comments
+*/
 
-Foo :: struct {
-	x, y i32;
-}
+use "core"; /* Looks for "core.onyx" in the current directory */
 
-main :: (argc i32, argv []*u8) int {
-	print("Hello World!");
-}
+Foo :: struct { x i32, y i32 };
+
+add :: (a i32, b i32) -> i32 {
+	return a + b;	
+};
\ No newline at end of file
diff --git a/progs/mvp.onyx b/progs/mvp.onyx
new file mode 100644
index 00000000..10f24015
--- /dev/null
+++ b/progs/mvp.onyx
@@ -0,0 +1,14 @@
+/* Comments need to be parsed */
+
+export add :: (a: i32, b: i32) -> i32 {
+	return a + b;
+}
+
+export max :: (a: i32, b: i32) -> i32 {
+	/* Curly braces are required */
+	if a > b {
+		return a;
+	} else {
+		return b;
+	}
+}
\ No newline at end of file