From: Brendan Hansen Date: Sun, 20 Jun 2021 12:37:43 +0000 (-0500) Subject: optimized the number of instructions it takes to copy an array X-Git-Url: https://git.brendanfh.com/?a=commitdiff_plain;h=467734c1782339e95199f2bee587aff2d40f6247;p=onyx.git optimized the number of instructions it takes to copy an array --- diff --git a/bin/onyx b/bin/onyx index c129f2fa..125c1bc7 100755 Binary files a/bin/onyx and b/bin/onyx differ diff --git a/include/onyxwasm.h b/include/onyxwasm.h index 7598a204..9f0cfffb 100644 --- a/include/onyxwasm.h +++ b/include/onyxwasm.h @@ -233,6 +233,8 @@ typedef enum WasmInstructionType { WI_PTR_SUB = WI_I32_SUB, WI_PTR_MUL = WI_I32_MUL, WI_PTR_GE = WI_I32_GE_U, + WI_PTR_GT = WI_I32_GT_U, + WI_PTR_EQ = WI_I32_EQ, WI_V128_LOAD = SIMD_INSTR_MASK | 0, WI_V128_STORE = SIMD_INSTR_MASK | 11, diff --git a/src/onyxwasm.c b/src/onyxwasm.c index ac9f6ab4..e2cc009b 100644 --- a/src/onyxwasm.c +++ b/src/onyxwasm.c @@ -2086,20 +2086,70 @@ EMIT_FUNC(array_store, Type* type, u32 offset) { WIL(WI_LOCAL_SET, rptr_local); WIL(WI_LOCAL_SET, lptr_local); - // NOTE: Currently, we inline the copying of the array; But if the array has - // many elements, this could result in a LOT of instructions. Maybe for lengths - // greater than like 16 we output a loop that copies them? - // - brendanfh 2020/12/16 - fori (i, 0, elem_count) { - WIL(WI_LOCAL_GET, lptr_local); + if (elem_count <= 2) { + // Inline copying for a small number of elements. It still may be faster to do this in a tight loop. + + fori (i, 0, elem_count) { + if (bh_arr_last(code).type == WI_LOCAL_SET && (u64) bh_arr_last(code).data.l == lptr_local) + bh_arr_last(code).type = WI_LOCAL_TEE; + else + WIL(WI_LOCAL_GET, lptr_local); + + WIL(WI_LOCAL_GET, rptr_local); + emit_load_instruction(mod, &code, elem_type, i * elem_size); + + emit_store_instruction(mod, &code, elem_type, i * elem_size + offset); + } - if (bh_arr_last(code).type == WI_LOCAL_SET && (u64) bh_arr_last(code).data.l == rptr_local) + } else if (context.options->use_post_mvp_features) { + // Use a simple memory copy if it is available. This may be what happens in the case below too at a later time. + + if (bh_arr_last(code).type == WI_LOCAL_SET && (u64) bh_arr_last(code).data.l == lptr_local) bh_arr_last(code).type = WI_LOCAL_TEE; else + WIL(WI_LOCAL_GET, lptr_local); + WIL(WI_PTR_CONST, offset); + WI(WI_PTR_ADD); + + WIL(WI_LOCAL_GET, rptr_local); + WIL(WI_I32_CONST, elem_count * elem_size); + WI(WI_MEMORY_COPY); + + } else { + // Emit a loop that copies the memory. This could be switched to a tight loop that just copies word per word. + + u64 offset_local = local_raw_allocate(mod->local_alloc, WASM_TYPE_PTR); + WIL(WI_PTR_CONST, 0); + WIL(WI_LOCAL_SET, offset_local); + + WID(WI_BLOCK_START, 0x40); + WID(WI_LOOP_START, 0x40); + WIL(WI_LOCAL_GET, offset_local); + WIL(WI_LOCAL_GET, lptr_local); + WI(WI_PTR_ADD); + + WIL(WI_LOCAL_GET, offset_local); WIL(WI_LOCAL_GET, rptr_local); - emit_load_instruction(mod, &code, elem_type, i * elem_size); + WI(WI_PTR_ADD); + + emit_load_instruction(mod, &code, elem_type, 0); + emit_store_instruction(mod, &code, elem_type, offset); - emit_store_instruction(mod, &code, elem_type, i * elem_size + offset); + WIL(WI_LOCAL_GET, offset_local); + WIL(WI_PTR_CONST, elem_size); + WI(WI_PTR_ADD); + WIL(WI_LOCAL_TEE, offset_local); + + WIL(WI_PTR_CONST, elem_count * elem_size); + WI(WI_PTR_GE); + WID(WI_COND_JUMP, 0x01); + + WID(WI_JUMP, 0x00); + + WI(WI_LOOP_END); + WI(WI_BLOCK_END); + + local_raw_free(mod->local_alloc, WASM_TYPE_PTR); } local_raw_free(mod->local_alloc, WASM_TYPE_PTR);