From: Brendan Hansen <brendan.f.hansen@gmail.com>
Date: Tue, 9 Jun 2020 15:02:24 +0000 (-0500)
Subject: Changed implementation for bh_hash
X-Git-Url: https://git.brendanfh.com/?a=commitdiff_plain;h=9002dd7becd255a0630935c66457fbe23ea2e45a;p=onyx.git

Changed implementation for bh_hash
---

diff --git a/docs/new_hash_plan b/docs/new_hash_plan
new file mode 100644
index 00000000..d12fc1d1
--- /dev/null
+++ b/docs/new_hash_plan
@@ -0,0 +1,50 @@
+The state of the hash implementation right now:
+
+				(allocator + 1021 ptrs = 8192 bytes (HUGE))
+			+---------------------------------------------------------
+table ---->	| allocator | ptr | ptr | ptr | ptr | ptr | ptr | ptr ...
+			+-------------||------------------------------------------
+					      \/
+	  		+--------------+------------------------------------------------------
+	  		| Array header | key (64-bytes) | value | key (64-bytes) | value | ...
+	  		+--------------+------------------------------------------------------
+
+There are a couple of issues with this implementation:
+  * The table of pointers is absolutely huge.
+	It takes up about 2 pages of memory and we are randomly accessing it
+	so it will not be cache efficient.
+  * The keys are always the same size.
+	They are normally way too large, but also they would cut off if you
+	needed a large key.
+
+
+
+THIS WORKED VERY WELL!
+Attempt 1 to fix these issues:
+
+				(user defined number of ptrs)
+			+-----------------------------------------------------------
+table ---->	| allocator | hash size | ptr | ptr | ptr | ptr | ptr | ...
+			+-------------------------||--------------------------------
+									  \/
+	  					+--------------+----------------------------------------------------------------------------
+	  					| Array header | value | key_length | key (null terminated) | v | kl | k | v | kl | k | ...
+				        +--------------+----------------------------------------------------------------------------
+
+GOOD:
+  * This implementation would allow for any size of key.
+	Initial thoughts:
+	  - Alignment is going to be very important.
+	  - Alignment will need to be by hand.
+	  - Aligning to 8 bytes should be sufficient.
+	  - The array would just be considered as a u8 array, since each element
+		wouldn't be the same size.
+	  - Random access into the array would not be allowed for the same reason.
+	  - Random access will not be needed however.
+  * This implementation still allows for easy iterator traversal, which is
+	important for the immediate use case.
+
+BAD:
+  * The fact that the number of pointers is user defined, the hashing algorithm could
+	be drastically slowed / crippled if they choose a bad number of pointers.
+  * This implementation still takes a very large number of allocations.
diff --git a/include/bh.h b/include/bh.h
index eb459326..a3854e78 100644
--- a/include/bh.h
+++ b/include/bh.h
@@ -508,66 +508,59 @@ void bh__arr_deleten(void **arr, i32 elemsize, i32 index, i32 numelems);
 //-------------------------------------------------------------------------------------
 #ifndef BH_NO_HASHTABLE
 
-#define BH__HASH_STORED_KEY_SIZE 64
-typedef struct bh__hash_entry {
-	char key[BH__HASH_STORED_KEY_SIZE];
-	i32 value; // NOTE: Not actually an i32, just used as a placeholder for offset
-} bh__hash_entry;
-
-#define BH__HASH_MODULUS 1021
-#define BH__HASH_KEYSIZE 64
 #ifdef BH_DEFINE
-u64 bh__hash_function(const char* str, i32 len) {
+u64 bh__hash_function(const char* str, i32 len, i32 mod) {
 	u64 hash = 5381;
 	i32 c, l = 0;
-	if (len == 0) len = BH__HASH_KEYSIZE;
+	if (len == 0) len = ((u32) 1 << 31) - 1; // TODO: Verify this is right
 
 	while ((c = *str++) && l++ < len) {
 		hash = (hash << 5) + hash + c;
 	}
 
-	return hash % BH__HASH_MODULUS;
+	return hash % mod;
 }
 #endif
 
 typedef struct bh_hash_iterator {
 	ptr *tab, *endtab;
 	i32 elemsize, arrlen;
-	bh__hash_entry* entry;
+	ptr entry;
 } bh_hash_iterator;
 
 typedef struct bh__hash {
 	bh_allocator allocator;
-	ptr arrs[BH__HASH_MODULUS];
+	u64 hash_size; // NOTE: u64 since padding will make it 8-bytes no matter what
+	ptr arrs[];
 } bh__hash;
 
 #define bh_hash(T)		T*
 
 #ifdef BH_HASH_SIZE_SAFE
-	#define bh_hash_init(allocator_, tab)	bh__hash_init(allocator_, (bh__hash **)&(tab))
-	#define bh_hash_free(tab)				bh__hash_free((bh__hash **)&(tab))
-	#define bh_hash_put(T, tab, key, value) (assert(sizeof(T) == sizeof(*(tab))), (*((T *) bh__hash_put((bh__hash *) tab, sizeof(T), key)) = (T) value))
-	#define bh_hash_has(T, tab, key)		(assert(sizeof(T) == sizeof(*(tab))), (bh__hash_has((bh__hash *) tab, sizeof(T), key)))
-	#define bh_hash_get(T, tab, key)		(assert(sizeof(T) == sizeof(*(tab))), (*((T *) bh__hash_get((bh__hash *) tab, sizeof(T), key))))
-	#define bh_hash_delete(T, tab, key)		(assert(sizeof(T) == sizeof(*(tab))), bh__hash_delete((bh__hash *) tab, sizeof(T), key))
+	#define bh_hash_init(allocator_, tab, hs)	bh__hash_init(allocator_, (bh__hash **)&(tab), hs)
+	#define bh_hash_free(tab)					bh__hash_free((bh__hash **)&(tab))
+	#define bh_hash_put(T, tab, key, value) 	(assert(sizeof(T) == sizeof(*(tab))), (*((T *) bh__hash_put((bh__hash *) tab, sizeof(T), key)) = (T) value))
+	#define bh_hash_has(T, tab, key)			(assert(sizeof(T) == sizeof(*(tab))), (bh__hash_has((bh__hash *) tab, sizeof(T), key)))
+	#define bh_hash_get(T, tab, key)			(assert(sizeof(T) == sizeof(*(tab))), (*((T *) bh__hash_get((bh__hash *) tab, sizeof(T), key))))
+	#define bh_hash_delete(T, tab, key)			(assert(sizeof(T) == sizeof(*(tab))), bh__hash_delete((bh__hash *) tab, sizeof(T), key))
 
 	#define bh_hash_iter_setup(T, tab)			(assert(sizeof(T) == sizeof(*(tab))), bh__hash_iter_setup((bh__hash *) tab, sizeof(T)))
-	#define bh_hash_iter_key(it)				(it.entry->key)
-	#define bh_hash_iter_value(T, it)			(assert(sizeof(T) == it.elemsize), *(T *)&(it.entry->value))
+	#define bh_hash_iter_key(it)				((char *)(bh_pointer_add(it.entry, it.elemsize + sizeof(u16))))
+	#define bh_hash_iter_value(T, it)			(assert(sizeof(T) == it.elemsize), *(T *)it.entry)
 #else
-	#define bh_hash_init(allocator_, tab)	bh__hash_init(allocator_, (bh__hash **)&(tab))
-	#define bh_hash_free(tab)				bh__hash_free((bh__hash **)&(tab))
-	#define bh_hash_put(T, tab, key, value) (*((T *) bh__hash_put((bh__hash *) tab, sizeof(T), key)) = value)
-	#define bh_hash_has(T, tab, key)		(bh__hash_has((bh__hash *) tab, sizeof(T), key))
-	#define bh_hash_get(T, tab, key)		(*((T *) bh__hash_get((bh__hash *) tab, sizeof(T), key)))
-	#define bh_hash_delete(T, tab, key)		(bh__hash_delete((bh__hash *) tab, sizeof(T), key))
+	#define bh_hash_init(allocator_, tab, hs)	bh__hash_init(allocator_, (bh__hash **)&(tab), hs)
+	#define bh_hash_free(tab)					bh__hash_free((bh__hash **)&(tab))
+	#define bh_hash_put(T, tab, key, value) 	(*((T *) bh__hash_put((bh__hash *) tab, sizeof(T), key)) = value)
+	#define bh_hash_has(T, tab, key)			(bh__hash_has((bh__hash *) tab, sizeof(T), key))
+	#define bh_hash_get(T, tab, key)			(*((T *) bh__hash_get((bh__hash *) tab, sizeof(T), key)))
+	#define bh_hash_delete(T, tab, key)			(bh__hash_delete((bh__hash *) tab, sizeof(T), key))
 
 	#define bh_hash_iter_setup(T, tab)			(bh__hash_iter_setup((bh__hash *) tab, sizeof(T)))
-	#define bh_hash_iter_key(it)				(it.entry->key)
-	#define bh_hash_iter_value(T, it)			(*(T *)&(it.entry->value))
+	#define bh_hash_iter_key(it)				((char *)(bh_pointer_add(it.entry, it.elemsize + sizeof(u16))))
+	#define bh_hash_iter_value(T, it)			(*(T *)it.entry)
 #endif
 
-b32 bh__hash_init(bh_allocator allocator, bh__hash **table);
+b32 bh__hash_init(bh_allocator allocator, bh__hash **table, i32 hash_size);
 b32 bh__hash_free(bh__hash **table);
 ptr bh__hash_put(bh__hash *table, i32 elemsize, char *key);
 b32 bh__hash_has(bh__hash *table, i32 elemsize, char *key);
@@ -1594,13 +1587,14 @@ void bh__arr_insertn(void **arr, i32 elemsize, i32 index, i32 numelems) {
 //-------------------------------------------------------------------------------------
 #ifndef BH_NO_HASHTABLE
 
-b32 bh__hash_init(bh_allocator allocator, bh__hash **table) {
-	*table = bh_alloc(allocator, sizeof(bh__hash));
+b32 bh__hash_init(bh_allocator allocator, bh__hash **table, i32 hash_size) {
+	*table = bh_alloc(allocator, sizeof(bh__hash) + sizeof(ptr) * hash_size);
 	if (*table == NULL) return 0;
 
 	(*table)->allocator = allocator;
+	(*table)->hash_size = hash_size;
 
-	for (i32 i = 0; i < BH__HASH_MODULUS; i++) {
+	for (i32 i = 0; i < hash_size; i++) {
 		(*table)->arrs[i] = NULL;
 	}
 
@@ -1608,7 +1602,7 @@ b32 bh__hash_init(bh_allocator allocator, bh__hash **table) {
 }
 
 b32 bh__hash_free(bh__hash **table) {
-	for (i32 i = 0; i < BH__HASH_MODULUS; i++) {
+	for (i32 i = 0; i < (*table)->hash_size; i++) {
 		if ((*table)->arrs[i] != NULL) {
 			bh_arr_free((*table)->arrs[i]);
 		}
@@ -1620,95 +1614,158 @@ b32 bh__hash_free(bh__hash **table) {
 
 // Assumes NULL terminated string for key
 ptr bh__hash_put(bh__hash *table, i32 elemsize, char *key) {
-	u64 index = bh__hash_function(key, 0);
-
-	elemsize += BH__HASH_STORED_KEY_SIZE;
+	u64 index = bh__hash_function(key, 0, table->hash_size);
 
 	ptr arrptr = table->arrs[index];
-	i32 len = bh_arr_length(arrptr);
+	if (arrptr == NULL) goto add_new_element;
+	u64 len = *(u64 *) arrptr;
+	arrptr = bh_pointer_add(arrptr, sizeof(u64));
 
+	u16 key_length = 0;
 	while (len--) {
-		if (strncmp(key, (char *) arrptr, BH__HASH_STORED_KEY_SIZE) == 0) goto found_matching;
 		arrptr = bh_pointer_add(arrptr, elemsize);
+		key_length = *(u16 *) arrptr;
+		arrptr = bh_pointer_add(arrptr, sizeof(u16));
+		if (strncmp(key, (char *) arrptr, key_length) == 0) goto found_matching;
+		arrptr = bh_pointer_add(arrptr, key_length);
 	}
 
-	// Didn't find it in the array, make a new one
+add_new_element:
 	arrptr = table->arrs[index];
-	len = bh_arr_length(arrptr);
-	bh__arr_grow(table->allocator, &arrptr, elemsize, len + 1);
-	bh__arrhead(arrptr)->length++;
+	i32 byte_len = bh_arr_length(arrptr);
+	if (byte_len == 0) byte_len = sizeof(u64);
+	key_length = strlen(key) + 1;
+	bh__arr_grow(table->allocator, &arrptr, 1, byte_len + elemsize + sizeof(u16) + key_length);
+	bh__arrhead(arrptr)->length = byte_len + elemsize + sizeof(u16) + key_length;
 	table->arrs[index] = arrptr;
 
-	arrptr = bh_pointer_add(arrptr, elemsize * len);
-	strncpy(arrptr, key, BH__HASH_STORED_KEY_SIZE);
+	(*(u64 *) arrptr)++;
+
+	arrptr = bh_pointer_add(arrptr, byte_len + elemsize);
+	*(u16 *) arrptr = key_length;
+	arrptr = bh_pointer_add(arrptr, sizeof(u16));
+	strncpy(arrptr, key, key_length);
 
 found_matching:
-	return bh_pointer_add(arrptr, BH__HASH_STORED_KEY_SIZE);
+	return bh_pointer_add(arrptr, -(sizeof(u16) + elemsize));
+
+// OLD:
+//	elemsize += BH__HASH_STORED_KEY_SIZE;
+//
+//	ptr arrptr = table->arrs[index];
+//	i32 len = bh_arr_length(arrptr);
+//
+//	while (len--) {
+//		if (strncmp(key, (char *) arrptr, BH__HASH_STORED_KEY_SIZE) == 0) goto found_matching;
+//		arrptr = bh_pointer_add(arrptr, elemsize);
+//	}
+//
+//	// Didn't find it in the array, make a new one
+//	arrptr = table->arrs[index];
+//	len = bh_arr_length(arrptr);
+//	bh__arr_grow(table->allocator, &arrptr, elemsize, len + 1);
+//	bh__arrhead(arrptr)->length++;
+//	table->arrs[index] = arrptr;
+//
+//	arrptr = bh_pointer_add(arrptr, elemsize * len);
+//	strncpy(arrptr, key, BH__HASH_STORED_KEY_SIZE);
+//
+//found_matching:
+//	return bh_pointer_add(arrptr, BH__HASH_STORED_KEY_SIZE);
 }
 
 b32 bh__hash_has(bh__hash *table, i32 elemsize, char *key) {
-	u64 index = bh__hash_function(key, 0);
+	u64 index = bh__hash_function(key, 0, table->hash_size);
 
 	ptr arrptr = table->arrs[index];
 	if (arrptr == NULL) return 0;
 
-	i32 len = bh_arr_length(arrptr);
-	i32 stride = elemsize + BH__HASH_STORED_KEY_SIZE;
+	u64 len = *(u64 *) arrptr;
+	arrptr = bh_pointer_add(arrptr, sizeof(u64));
 
+	u16 key_length = 0;
 	while (len--) {
-		if (strncmp(key, (char *) arrptr, BH__HASH_STORED_KEY_SIZE) == 0) return 1;
-		arrptr = bh_pointer_add(arrptr, stride);
+		arrptr = bh_pointer_add(arrptr, elemsize);
+		key_length = *(u16 *) arrptr;
+		arrptr = bh_pointer_add(arrptr, sizeof(u16));
+		if (strncmp(key, (char *) arrptr, key_length) == 0) return 1;
+		arrptr = bh_pointer_add(arrptr, key_length);
 	}
 
 	return 0;
 }
 
 ptr bh__hash_get(bh__hash *table, i32 elemsize, char *key) {
-	u64 index = bh__hash_function(key, 0);
+	u64 index = bh__hash_function(key, 0, table->hash_size);
 
 	ptr arrptr = table->arrs[index];
-	if (arrptr == NULL) return NULL;
+	if (arrptr == NULL) return 0;
 
-	i32 stride = elemsize + BH__HASH_STORED_KEY_SIZE;
+	u64 len = *(u64 *) arrptr;
+	arrptr = bh_pointer_add(arrptr, sizeof(u64));
 
-	i32 len = bh_arr_length(arrptr);
+	u16 key_length = 0;
 	while (len--) {
-		if (strncmp(key, (char *) arrptr, BH__HASH_STORED_KEY_SIZE) == 0) {
-			return bh_pointer_add(arrptr, BH__HASH_STORED_KEY_SIZE);
+		arrptr = bh_pointer_add(arrptr, elemsize);
+		key_length = *(u16 *) arrptr;
+		arrptr = bh_pointer_add(arrptr, sizeof(u16));
+		if (strncmp(key, (char *) arrptr, key_length) == 0) {
+			return bh_pointer_add(arrptr, -(sizeof(u16) + elemsize));
 		}
-
-		arrptr = bh_pointer_add(arrptr, stride);
+		arrptr = bh_pointer_add(arrptr, key_length);
 	}
 
 	return NULL;
 }
 
 void bh__hash_delete(bh__hash *table, i32 elemsize, char *key) {
-	u64 index = bh__hash_function(key, 0);
+	u64 index = bh__hash_function(key, 0, table->hash_size);
 
 	ptr arrptr = table->arrs[index], walker;
 	if (arrptr == NULL) return; // Didn't exist
 	walker = arrptr;
 
-	i32 stride = elemsize + BH__HASH_STORED_KEY_SIZE;
-	i32 i = 0;
+	i32 byte_offset = 8;
+	i32 delete_len = 0;
 
-	i32 len = bh_arr_length(arrptr);
-	while (len && strncmp(key, (char *) walker, BH__HASH_STORED_KEY_SIZE) != 0) {
-		walker = bh_pointer_add(walker, stride);
-		i++, len--;
+	u64 len = *(u64 *) walker;
+	walker = bh_pointer_add(walker, sizeof(u64));
+
+	u16 key_length = 0;
+	while (len--) {
+		walker = bh_pointer_add(walker, elemsize);
+		key_length = *(u16 *) walker;
+		walker = bh_pointer_add(walker, sizeof(u16));
+		if (strncmp(key, (char *) walker, key_length) == 0) {
+			delete_len = elemsize + sizeof(u16) + key_length;
+			goto found_matching;
+		}
+		byte_offset += elemsize + sizeof(u16) + key_length;
 	}
 
-	if (len == 0) return; // Didn't exist
+	// NOTE: Already didn't exist
+	return;
 
-	bh__arr_deleten((void **) &arrptr, stride, i, 1);
+found_matching:
+	bh__arr_deleten((void **) &arrptr, 1, byte_offset, delete_len);
 	table->arrs[index] = arrptr;
+
+// OLD:
+//	while (len && strncmp(key, (char *) walker, BH__HASH_STORED_KEY_SIZE) != 0) {
+//		walker = bh_pointer_add(walker, stride);
+//		i++, len--;
+//	}
+//
+//	if (len == 0) return; // Didn't exist
+//
+//	bh__arr_deleten((void **) &arrptr, stride, i, 1);
+//	table->arrs[index] = arrptr;
 }
 
 bh_hash_iterator bh__hash_iter_setup(bh__hash *table, i32 elemsize) {
 	bh_hash_iterator it = {
 		.tab = table->arrs,
-		.endtab = table->arrs + BH__HASH_MODULUS,
+		.endtab = table->arrs + table->hash_size,
 		.elemsize = elemsize,
 		.entry = NULL
 	};
@@ -1725,7 +1782,8 @@ b32 bh_hash_iter_next(bh_hash_iterator* it) {
 			goto step_to_next;
 		}
 
-		it->entry = (bh__hash_entry *)bh_pointer_add(it->entry, BH__HASH_STORED_KEY_SIZE + it->elemsize);
+		it->entry = bh_pointer_add(it->entry, it->elemsize);
+		it->entry = bh_pointer_add(it->entry, sizeof(u16) + (*(u16 *) it->entry));
 		return 1;
 	}
 
@@ -1738,7 +1796,8 @@ step_to_next:
 	if (it->tab == it->endtab) return 0;
 
 	it->entry = *it->tab;
-	it->arrlen = bh_arr_length(it->entry);
+	it->arrlen = *(u64 *) it->entry;
+	it->entry = bh_pointer_add(it->entry, sizeof(u64));
 	if (it->arrlen <= 0) {
 		it->tab++;
 		goto step_to_next;
diff --git a/onyx b/onyx
index 4aeca702..74885350 100755
Binary files a/onyx and b/onyx differ
diff --git a/src/onyxparser.c b/src/onyxparser.c
index 646e6e06..0fbfc267 100644
--- a/src/onyxparser.c
+++ b/src/onyxparser.c
@@ -689,7 +689,7 @@ OnyxAstNode* onyx_ast_node_new(bh_allocator alloc, OnyxAstNodeKind kind) {\
 OnyxParser onyx_parser_create(bh_allocator alloc, OnyxTokenizer *tokenizer, OnyxMessages* msgs) {
 	OnyxParser parser;
 
-	bh_hash_init(bh_heap_allocator(), parser.identifiers);
+	bh_hash_init(bh_heap_allocator(), parser.identifiers, 61);
 
 	OnyxTypeInfo* it = &builtin_types[0];
 	while (it->kind != 0xffffffff) {
diff --git a/src/onyxwasm.c b/src/onyxwasm.c
index c0b91ad2..11226d43 100644
--- a/src/onyxwasm.c
+++ b/src/onyxwasm.c
@@ -104,8 +104,8 @@ OnyxWasmModule onyx_wasm_generate_module(bh_allocator alloc, OnyxAstNode* progra
 	bh_arr_new(alloc, module.functypes, 4);
 	bh_arr_new(alloc, module.funcs, 4);
 
-	bh_hash_init(bh_heap_allocator(), module.type_map);
-	bh_hash_init(bh_heap_allocator(), module.exports);
+	bh_hash_init(bh_heap_allocator(), module.type_map, 61);
+	bh_hash_init(bh_heap_allocator(), module.exports, 61);
 
 	OnyxAstNode* walker = program;
 	while (walker) {