//-------------------------------------------------------------------------------------
// Better character functions
//-------------------------------------------------------------------------------------
// Character classification helpers; definitions follow below.
b32 char_is_alpha(const char a);
b32 char_is_num(const char a);
b32 char_is_alphanum(const char a);
b32 char_is_whitespace(const char a);
b32 char_in_range(const char lo, const char hi, const char a);
// Defined elsewhere. NOTE(review): presumably returns nonzero when `ch`
// occurs in the NUL-terminated `charset` — confirm against its definition.
char charset_contains(const char* charset, char ch);
//-------------------------------------------------------------------------------------
//-------------------------------------------------------------------------------------
// CHAR FUNCTIONS
//-------------------------------------------------------------------------------------
-inline b32 char_is_alpha(const char a) {
+b32 char_is_alpha(const char a) {
return ('a' <= a && a <= 'z') || ('A' <= a && a <= 'Z');
}
-inline b32 char_is_num(const char a) {
+b32 char_is_num(const char a) {
return ('0' <= a && a <= '9');
}
-inline b32 char_is_alphanum(const char a) {
+b32 char_is_alphanum(const char a) {
return char_is_alpha(a) || char_is_num(a);
}
-inline b32 char_is_whitespace(const char a) {
+b32 char_is_whitespace(const char a) {
return charset_contains(" \t\r\n", a);
}
-inline b32 char_in_range(const char lo, const char hi, const char a) {
+b32 char_in_range(const char lo, const char hi, const char a) {
return lo <= a <= hi;
}
#include <stdio.h> // TODO: Replace with custom lib
-#include <stdlib.h> // TODO: Replace with custom lib
#include "bh.h"
// Cursor over an in-memory source buffer.
typedef struct Tokenizer {
    char *start, *curr, *end;  // buffer start, current cursor, and end.
                               // NOTE(review): `end` is initialized to the
                               // LAST byte (data + length - 1), not one past
                               // it — confirm all consumers agree.

    // TODO: Fix the line number and column count
    u64 line_number;  // 1-based line of `curr`
    u64 line_column;  // 1-based column of `curr`
} Tokenizer;
// Token categories produced by get_token().
typedef enum TokenType {
    TOKEN_TYPE_UNKNOWN,
    TOKEN_TYPE_END_STREAM,
    TOKEN_TYPE_COMMENT,

    TOKEN_TYPE_KEYWORD_STRUCT,
    TOKEN_TYPE_KEYWORD_USE,
    TOKEN_TYPE_KEYWORD_EXPORT,
    TOKEN_TYPE_KEYWORD_RETURN,

    TOKEN_TYPE_RIGHT_ARROW,
    TOKEN_TYPE_LEFT_ARROW,

    TOKEN_TYPE_OPEN_PAREN,
    TOKEN_TYPE_CLOSE_PAREN,
    TOKEN_TYPE_OPEN_BRACE,
    TOKEN_TYPE_CLOSE_BRACE,
    TOKEN_TYPE_OPEN_BRACKET,
    TOKEN_TYPE_CLOSE_BRACKET,
    TOKEN_TYPE_OPEN_ANGLE,
    TOKEN_TYPE_CLOSE_ANGLE,

    TOKEN_TYPE_SYM_PLUS,
    TOKEN_TYPE_SYM_MINUS,
    TOKEN_TYPE_SYM_STAR,
    TOKEN_TYPE_SYM_PERCENT,
    TOKEN_TYPE_SYM_FSLASH,
    TOKEN_TYPE_SYM_BSLASH,
    TOKEN_TYPE_SYM_COLON,
    TOKEN_TYPE_SYM_SEMICOLON,
    TOKEN_TYPE_SYM_COMMA,
    TOKEN_TYPE_SYM_EQUALS,
    TOKEN_TYPE_SYM_GRAVE,
    TOKEN_TYPE_SYM_TILDE,
    TOKEN_TYPE_SYM_BANG,

    TOKEN_TYPE_SYMBOL,
    TOKEN_TYPE_LITERAL_STRING,
    TOKEN_TYPE_LITERAL_NUMERIC,

    // BUG FIX: these are referenced by get_token() ("if"/"else"/"for")
    // but were missing from the enum, which cannot have compiled.
    // Appended before COUNT so existing enumerator values are unchanged.
    TOKEN_TYPE_KEYWORD_IF,
    TOKEN_TYPE_KEYWORD_ELSE,
    TOKEN_TYPE_KEYWORD_FOR,

    TOKEN_TYPE_COUNT
} TokenType;
"TOKEN_TYPE_UNKNOWN",
"TOKEN_TYPE_END_STREAM",
"TOKEN_TYPE_COMMENT",

"TOKEN_TYPE_KEYWORD_STRUCT",
"TOKEN_TYPE_KEYWORD_USE",
"TOKEN_TYPE_KEYWORD_EXPORT",
"TOKEN_TYPE_KEYWORD_RETURN",

"TOKEN_TYPE_RIGHT_ARROW",
"TOKEN_TYPE_LEFT_ARROW",

"TOKEN_TYPE_OPEN_PAREN",
"TOKEN_TYPE_CLOSE_PAREN",
"TOKEN_TYPE_OPEN_BRACE",
"TOKEN_TYPE_CLOSE_BRACE",
"TOKEN_TYPE_OPEN_BRACKET",
"TOKEN_TYPE_CLOSE_BRACKET",
"TOKEN_TYPE_OPEN_ANGLE",
"TOKEN_TYPE_CLOSE_ANGLE",

"TOKEN_TYPE_SYM_PLUS",
"TOKEN_TYPE_SYM_MINUS",
"TOKEN_TYPE_SYM_STAR",
"TOKEN_TYPE_SYM_PERCENT",
"TOKEN_TYPE_SYM_FSLASH",
"TOKEN_TYPE_SYM_BSLASH",
"TOKEN_TYPE_SYM_COLON",
"TOKEN_TYPE_SYM_SEMICOLON",
"TOKEN_TYPE_SYM_COMMA",
"TOKEN_TYPE_SYM_EQUALS",
"TOKEN_TYPE_SYM_GRAVE",
"TOKEN_TYPE_SYM_TILDE",
"TOKEN_TYPE_SYM_BANG",

"TOKEN_TYPE_SYMBOL",
"TOKEN_TYPE_LITERAL_STRING",
"TOKEN_TYPE_LITERAL_NUMERIC",

// Added to stay index-aligned with TokenType (IF/ELSE/FOR keywords
// appended before COUNT).
"TOKEN_TYPE_KEYWORD_IF",
"TOKEN_TYPE_KEYWORD_ELSE",
"TOKEN_TYPE_KEYWORD_FOR",

"TOKEN_TYPE_COUNT"
};
u64 line_number, line_column;
} Token;
#ifndef LITERAL_TOKEN
// Try to consume the fixed literal `token` at the cursor; on a match,
// jump to get_token()'s shared `token_parsed` exit label. Only usable
// inside get_token() (relies on the locals `tokenizer` and `tk` and on
// the label being in scope).
#define LITERAL_TOKEN(token, token_type) \
    if (token_lit(tokenizer, &tk, token, token_type)) goto token_parsed;
#endif

#ifndef INCREMENT_CURR_TOKEN
// Advance the cursor by one character while maintaining line/column
// bookkeeping.
// NOTE(review): the '\n' test runs AFTER the increment, so line_number
// bumps when the cursor LANDS on a newline and the '\n' itself is
// reported as column 1 of the next line — likely what the struct's TODO
// about line/column counts refers to; confirm before changing.
// NOTE(review): this is a bare brace block, not do { } while (0); at
// least one call site invokes it without a trailing ';', so converting
// it would break that caller.
#define INCREMENT_CURR_TOKEN(tkn) { \
    tkn->curr++; \
    tkn->line_column++; \
    if (*tkn->curr == '\n') { \
        tkn->line_number++; \
        tkn->line_column = 1; \
    } \
}
#endif

// Try to consume the literal `lit` at the cursor. On a match, fill in `tk`
// with `type` and the matched span, advance the tokenizer past it, and
// return 1. On a miss, return 0 and leave all state untouched.
b32 token_lit(Tokenizer* tokenizer, Token* tk, char* lit, TokenType type) {
    i64 matched = chars_match(tokenizer->curr, lit);
    if (matched <= 0) return 0;

    tk->type   = type;
    tk->token  = tokenizer->curr;
    tk->length = matched;

    tokenizer->curr        += matched;
    tokenizer->line_column += matched;

    return 1;
}
// Scan and return the next token at the tokenizer's cursor, advancing the
// cursor past it. Returns TOKEN_TYPE_END_STREAM at end of input and
// TOKEN_TYPE_UNKNOWN (consuming one character) for unrecognized input.
Token get_token(Tokenizer* tokenizer) {
    Token tk;

    // Skip whitespace
    while (char_is_whitespace(*tokenizer->curr) && tokenizer->curr != tokenizer->end)
        INCREMENT_CURR_TOKEN(tokenizer)

    tk.type = TOKEN_TYPE_UNKNOWN;
    tk.token = tokenizer->curr;
    tk.length = 1;
    tk.line_number = tokenizer->line_number;
    tk.line_column = tokenizer->line_column;

    if (tokenizer->curr == tokenizer->end) {
        tk.type = TOKEN_TYPE_END_STREAM;
        goto token_parsed;
    }

    // Block comments, nestable; `layers` tracks nesting depth. The token
    // text excludes the /* and */ delimiters.
    if (*tokenizer->curr == '/' && *(tokenizer->curr + 1) == '*') {
        tokenizer->curr += 2;
        tk.type = TOKEN_TYPE_COMMENT;
        tk.token = tokenizer->curr;
        u16 layers = 1;

        while (layers >= 1) {
            INCREMENT_CURR_TOKEN(tokenizer);

            if (tokenizer->curr == tokenizer->end) {
                tk.type = TOKEN_TYPE_END_STREAM;
                break;
            }

            if (*tokenizer->curr == '/' && *(tokenizer->curr + 1) == '*') {
                layers++;
                INCREMENT_CURR_TOKEN(tokenizer);
            }

            if (*tokenizer->curr == '*' && *(tokenizer->curr + 1) == '/') {
                layers--;
                INCREMENT_CURR_TOKEN(tokenizer);
            }
        }

        INCREMENT_CURR_TOKEN(tokenizer);

        tk.length = tokenizer->curr - tk.token - 2; // drop trailing "*/"
        goto token_parsed;
    }

    // Keywords and fixed literals. Literals sharing a prefix must be
    // ordered longest-first ("->" before "-", "<-" before "<").
    // NOTE(review): whether keywords respect word boundaries ("if" vs
    // "ifx") depends on chars_match() — confirm.
    LITERAL_TOKEN("struct", TOKEN_TYPE_KEYWORD_STRUCT);
    LITERAL_TOKEN("export", TOKEN_TYPE_KEYWORD_EXPORT);
    LITERAL_TOKEN("use", TOKEN_TYPE_KEYWORD_USE);
    LITERAL_TOKEN("if", TOKEN_TYPE_KEYWORD_IF);
    LITERAL_TOKEN("else", TOKEN_TYPE_KEYWORD_ELSE);
    LITERAL_TOKEN("for", TOKEN_TYPE_KEYWORD_FOR);
    LITERAL_TOKEN("return", TOKEN_TYPE_KEYWORD_RETURN);
    LITERAL_TOKEN("->", TOKEN_TYPE_RIGHT_ARROW);
    // BUG FIX: "<-" previously produced TOKEN_TYPE_RIGHT_ARROW.
    LITERAL_TOKEN("<-", TOKEN_TYPE_LEFT_ARROW);
    LITERAL_TOKEN("(", TOKEN_TYPE_OPEN_PAREN);
    LITERAL_TOKEN(")", TOKEN_TYPE_CLOSE_PAREN);
    LITERAL_TOKEN("{", TOKEN_TYPE_OPEN_BRACE);
    LITERAL_TOKEN("}", TOKEN_TYPE_CLOSE_BRACE);
    LITERAL_TOKEN("[", TOKEN_TYPE_OPEN_BRACKET);
    LITERAL_TOKEN("]", TOKEN_TYPE_CLOSE_BRACKET);
    LITERAL_TOKEN("<", TOKEN_TYPE_OPEN_ANGLE);
    LITERAL_TOKEN(">", TOKEN_TYPE_CLOSE_ANGLE);
    LITERAL_TOKEN("+", TOKEN_TYPE_SYM_PLUS);
    LITERAL_TOKEN("-", TOKEN_TYPE_SYM_MINUS);
    LITERAL_TOKEN("*", TOKEN_TYPE_SYM_STAR);
    LITERAL_TOKEN("/", TOKEN_TYPE_SYM_FSLASH);
    LITERAL_TOKEN("%", TOKEN_TYPE_SYM_PERCENT);
    LITERAL_TOKEN("\\", TOKEN_TYPE_SYM_BSLASH);
    LITERAL_TOKEN(":", TOKEN_TYPE_SYM_COLON);
    LITERAL_TOKEN(";", TOKEN_TYPE_SYM_SEMICOLON);
    LITERAL_TOKEN(",", TOKEN_TYPE_SYM_COMMA);
    LITERAL_TOKEN("=", TOKEN_TYPE_SYM_EQUALS);
    LITERAL_TOKEN("`", TOKEN_TYPE_SYM_GRAVE);
    LITERAL_TOKEN("~", TOKEN_TYPE_SYM_TILDE);
    LITERAL_TOKEN("!", TOKEN_TYPE_SYM_BANG);

    // Symbols (identifiers): leading letter, then letters/digits/'_'/'$'.
    // NOTE(review): no end-of-buffer check — a symbol running to the last
    // byte can scan past `end`; same pattern as the numeric loop below.
    if (char_is_alpha(*tk.token)) {
        u64 len = 0;
        while (char_is_alphanum(*tokenizer->curr) || charset_contains("_$", *tokenizer->curr)) {
            len++;
            INCREMENT_CURR_TOKEN(tokenizer);
        }

        tk.length = len;
        tk.type = TOKEN_TYPE_SYMBOL;
        goto token_parsed;
    }

    // String literal; the token text excludes the surrounding quotes.
    // `slash_count` tracks backslash parity so \" does not terminate.
    if (*tk.token == '"') {
        u64 len = 0;
        u64 slash_count = 0;

        INCREMENT_CURR_TOKEN(tokenizer);

        while (!(*tokenizer->curr == '"' && slash_count == 0)) {
            // BUG FIX: an unterminated string used to scan past the end
            // of the buffer (undefined behavior). Stop at end-of-stream.
            if (tokenizer->curr == tokenizer->end) break;

            len++;

            if (*tokenizer->curr == '\\') {
                slash_count += 1;
                slash_count %= 2;
            } else {
                slash_count = 0;
            }

            INCREMENT_CURR_TOKEN(tokenizer);
        }

        // Step past the closing quote (skipped if we stopped at EOF).
        if (tokenizer->curr != tokenizer->end)
            INCREMENT_CURR_TOKEN(tokenizer);

        tk.token++;
        tk.type = TOKEN_TYPE_LITERAL_STRING;
        tk.length = len;
        goto token_parsed;
    }

    // Number literal: digits and '.'; multiple dots are not validated.
    if (char_is_num(*tokenizer->curr)) {
        u64 len = 0;
        while (char_is_num(*tokenizer->curr) || *tokenizer->curr == '.') {
            len++;
            INCREMENT_CURR_TOKEN(tokenizer);
        }

        tk.type = TOKEN_TYPE_LITERAL_NUMERIC;
        tk.length = len;
        // BUG FIX: without this goto, control fell through to the
        // catch-all increment below, silently skipping the character
        // that follows every numeric literal.
        goto token_parsed;
    }

    // Unknown character: consume it and return TOKEN_TYPE_UNKNOWN.
    INCREMENT_CURR_TOKEN(tokenizer);

token_parsed:
    return tk;
Tokenizer tknizer = {
    .start = fc.data,
    .curr = fc.data,
    // NOTE(review): `end` points AT the last byte, not one past it;
    // get_token()'s `curr == end` checks depend on this. Confirm the
    // behavior for empty input — fc.length == 0 makes this underflow.
    .end = fc.data + fc.length - 1,
    .line_number = 1,
    .line_column = 1,
};
Token tk;
tk = get_token(&tknizer);
char c = *(tk.token + tk.length);
*(tk.token + tk.length) = '\0';
- printf("%s: %s\n", TokenTypeNames[tk.type], tk.token);
+ printf("Line %ld, Column %ld: \n%s: %s\n", tk.line_number, tk.line_column, TokenTypeNames[tk.type], tk.token);
*(tk.token + tk.length) = c;
} while (tk.type != TOKEN_TYPE_END_STREAM);