diff --git a/src/builtins_extra.asm b/src/builtins_extra.asm index e178ef1..1fa7948 100644 --- a/src/builtins_extra.asm +++ b/src/builtins_extra.asm @@ -2134,7 +2134,17 @@ DEF_FUNC builtin_sum mov rsi, r14 ; item payload mov edx, [rsp] ; accum tag (left_tag) mov ecx, r15d ; item tag (right_tag) + ; Use float_add if either operand is float, else int_add + cmp edx, TAG_FLOAT + je .sum_float_add + cmp ecx, TAG_FLOAT + je .sum_float_add call int_add + jmp .sum_have_result +.sum_float_add: + extern float_add + call float_add +.sum_have_result: ; rax = new accum payload, edx = new accum tag ; Save new accum before DECREFs @@ -2184,124 +2194,151 @@ DEF_FUNC builtin_sum END_FUNC builtin_sum ; ============================================================================ -; 15. builtin_min(args, nargs) - min(a, b, ...) +; 15-16. builtin_min / builtin_max ; ============================================================================ -MIN_TAG equ 0 ; [rsp + MIN_TAG] = current min tag -MIN_CMP_RES equ 8 ; [rsp + MIN_CMP_RES] = richcompare result ptr -DEF_FUNC builtin_min +; Shared implementation: minmax_impl(args, nargs, cmp_op) +; rdi = args, rsi = nargs, edx = cmp_op (PY_LT=0 for min, PY_GT=4 for max) +; Returns (rax=payload, rdx=tag) +; +; Stack layout: +; [rsp + MM_TAG] = current best tag (64-bit) +; [rsp + MM_CMP_RES] = richcompare result ptr +; [rsp + MM_ITER] = iterator ptr (iter path only) +; [rsp + MM_ITERNX] = tp_iternext fn ptr (iter path only) +; [rsp + MM_CMP_OP] = comparison op (PY_LT or PY_GT) +MM_TAG equ 8 +MM_CMP_RES equ 16 +MM_ITER equ 24 +MM_ITERNX equ 32 +MM_CMP_OP equ 40 +MM_FRAME equ 48 + +DEF_FUNC_BARE builtin_min + xor edx, edx ; PY_LT = 0 + jmp minmax_impl +END_FUNC builtin_min + +DEF_FUNC_BARE builtin_max + mov edx, PY_GT ; PY_GT = 4 + jmp minmax_impl +END_FUNC builtin_max + +DEF_FUNC_LOCAL minmax_impl, MM_FRAME push rbx push r12 push r13 push r14 push r15 - sub rsp, 16 + + mov [rbp - MM_CMP_OP], edx ; save comparison op cmp rsi, 1 - jb .min_error + jb 
.mm_error - mov rbx, rdi - mov r12, rsi + ; nargs == 1 → iterate the single argument + cmp rsi, 1 + je .mm_iter_path - mov r13, 1 + ; --- Multi-arg path: min/max(a, b, ...) --- + mov rbx, rdi ; args array + mov r12, rsi ; nargs + mov r13, 1 ; index = 1 - mov r14, [rbx] ; args[0] payload = current min + mov r14, [rbx] ; args[0] payload = current best mov rax, [rbx + 8] ; args[0] tag (64-bit) - mov [rsp + MIN_TAG], rax ; min_tag - cmp rax, TAG_PTR - jne .min_loop ; non-refcounted (SmallInt/Float/Bool/None/SmallStr): no INCREF - inc qword [r14 + PyObject.ob_refcnt] + mov [rbp - MM_TAG], rax + INCREF_VAL r14, rax -.min_loop: +.mm_loop: cmp r13, r12 - jge .min_done + jge .mm_done mov rax, r13 - shl rax, 4 ; rax = index * 16 (16-byte stride) + shl rax, 4 mov r15, [rbx + rax] ; candidate payload - mov rcx, [rbx + rax + 8] ; candidate tag (64-bit) + mov rcx, [rbx + rax + 8] ; candidate tag ; SmallInt fast path: both SmallInt? - cmp qword [rsp + MIN_TAG], TAG_SMALLINT - jne .min_slow_compare + cmp qword [rbp - MM_TAG], TAG_SMALLINT + jne .mm_slow cmp rcx, TAG_SMALLINT - jne .min_slow_compare - - ; Both SmallInts: compare directly + jne .mm_slow + ; For min (PY_LT=0): update if candidate < best + ; For max (PY_GT=4): update if candidate > best + cmp dword [rbp - MM_CMP_OP], 0 + jne .mm_si_max cmp r15, r14 - jge .min_no_update - mov r14, r15 ; update min (both SmallInt, no refcount) - jmp .min_no_update + jge .mm_no_update + mov r14, r15 + jmp .mm_no_update +.mm_si_max: + cmp r15, r14 + jle .mm_no_update + mov r14, r15 + jmp .mm_no_update -.min_slow_compare: - ; Use tp_richcompare(args[i], current_min, PY_LT) - ; SmallStr? 
(bit 63 set) +.mm_slow: + ; Resolve candidate type for richcompare + mov r8, rcx ; save candidate tag test rcx, rcx - js .min_cand_smallstr - + js .mm_cand_ss cmp rcx, TAG_PTR - jne .min_try_float + jne .mm_try_float mov rdi, r15 mov rax, [rdi + PyObject.ob_type] - jmp .min_have_type -.min_cand_smallstr: + jmp .mm_have_type +.mm_cand_ss: lea rax, [rel str_type] - jmp .min_have_type -.min_try_float: + jmp .mm_have_type +.mm_try_float: cmp rcx, TAG_FLOAT - jne .min_no_update ; Bool/None: skip - mov rdi, r15 + jne .mm_no_update lea rax, [rel float_type] -.min_have_type: - mov r8, rcx ; save candidate tag +.mm_have_type: mov rcx, [rax + PyTypeObject.tp_richcompare] test rcx, rcx - jz .min_no_update + jz .mm_no_update - mov rdi, r15 ; left = candidate - mov rsi, r14 ; right = current min - xor edx, edx ; PY_LT = 0 - mov rax, rcx ; fn ptr -> rax + ; tp_richcompare(candidate, best, cmp_op, cand_tag, best_tag) + mov rdi, r15 + mov rsi, r14 + mov edx, [rbp - MM_CMP_OP] + mov rax, rcx ; fn ptr mov rcx, r8 ; left_tag = candidate tag - mov r8, [rsp + MIN_TAG] ; right_tag = min tag + mov r8, [rbp - MM_TAG] ; right_tag = best tag call rax - ; Compare result against bool_true BEFORE DECREF lea rcx, [rel bool_true] cmp rax, rcx - mov [rsp + MIN_CMP_RES], rax - jne .min_slow_no_update + mov [rbp - MM_CMP_RES], rax + jne .mm_slow_no_upd - ; Update min: DECREF old min + ; Update best: DECREF old, set new = candidate mov rdi, r14 - mov rsi, [rsp + MIN_TAG] + mov rsi, [rbp - MM_TAG] DECREF_VAL rdi, rsi - ; Set new min = candidate mov r14, r15 mov rax, r13 shl rax, 4 - mov rax, [rbx + rax + 8] ; reload candidate tag (64-bit) - mov [rsp + MIN_TAG], rax ; update min_tag - cmp rax, TAG_PTR - jne .min_slow_update_done ; non-refcounted (SmallStr/SmallInt/Float): no INCREF - inc qword [r14 + PyObject.ob_refcnt] -.min_slow_update_done: - ; DECREF richcompare result - mov rdi, [rsp + MIN_CMP_RES] - call obj_decref ; richcompare result is always a heap ptr (bool) - jmp .min_no_update - 
-.min_slow_no_update: - mov rdi, [rsp + MIN_CMP_RES] - call obj_decref ; richcompare result is always a heap ptr (bool) - -.min_no_update: + mov rax, [rbx + rax + 8] + mov [rbp - MM_TAG], rax + INCREF_VAL r14, rax + + mov rdi, [rbp - MM_CMP_RES] + call obj_decref + jmp .mm_no_update + +.mm_slow_no_upd: + mov rdi, [rbp - MM_CMP_RES] + call obj_decref + +.mm_no_update: inc r13 - jmp .min_loop + jmp .mm_loop -.min_done: +.mm_done: mov rax, r14 - mov rdx, [rsp + MIN_TAG] ; min_tag (64-bit) - add rsp, 16 + mov rdx, [rbp - MM_TAG] pop r15 pop r14 pop r13 @@ -2310,178 +2347,122 @@ DEF_FUNC builtin_min leave ret -.min_error: - lea rdi, [rel exc_TypeError_type] - CSTRING rsi, "min expected at least 1 argument" - call raise_exception -END_FUNC builtin_min - -; ============================================================================ -; 16. builtin_max(args, nargs) - max(a, b, ...) -; ============================================================================ -MAX_TAG equ 0 ; [rsp + MAX_TAG] = current max tag -MAX_CMP_RES equ 8 ; [rsp + MAX_CMP_RES] = richcompare result ptr -DEF_FUNC builtin_max - push rbx - push r12 - push r13 - push r14 - push r15 - sub rsp, 16 - - cmp rsi, 1 - jb .max_error - - mov rbx, rdi - mov r12, rsi - - mov r13, 1 - - mov r14, [rbx] ; args[0] payload = current max - mov rax, [rbx + 8] ; args[0] tag (64-bit) - mov [rsp + MAX_TAG], rax ; max_tag - cmp rax, TAG_PTR - jne .max_loop ; non-refcounted (SmallInt/Float/Bool/None/SmallStr): no INCREF - inc qword [r14 + PyObject.ob_refcnt] - -.max_loop: - cmp r13, r12 - jge .max_done - - mov rax, r13 - shl rax, 4 ; rax = index * 16 (16-byte stride) - mov r15, [rbx + rax] ; candidate payload - mov rcx, [rbx + rax + 8] ; candidate tag (64-bit) - - ; SmallInt fast path: both SmallInt? 
- cmp qword [rsp + MAX_TAG], TAG_SMALLINT - jne .max_slow_compare - cmp rcx, TAG_SMALLINT - jne .max_slow_compare - - ; Both SmallInts: compare directly - cmp r15, r14 - jle .max_no_update - mov r14, r15 ; update max (both SmallInt, no refcount) - jmp .max_no_update - -.max_slow_compare: - ; Use tp_richcompare(current_max, args[i], PY_LT) - cmp qword [rsp + MAX_TAG], TAG_SMALLINT - je .max_try_rhs ; max is SmallInt, try candidate's richcompare - - ; SmallStr? (bit 63 set) - bt qword [rsp + MAX_TAG], 63 - jc .max_lhs_smallstr - - ; Must be TAG_PTR to dereference ob_type - cmp qword [rsp + MAX_TAG], TAG_PTR - jne .max_try_lhs_float - mov rdi, r14 + ; --- Iterator path: min/max(iterable) --- +.mm_iter_path: + ; Get iterator from args[0] + cmp qword [rdi + 8], TAG_PTR + jne .mm_iter_type_error + mov rdi, [rdi] ; iterable mov rax, [rdi + PyObject.ob_type] - jmp .max_have_type -.max_lhs_smallstr: - extern str_type - lea rax, [rel str_type] - jmp .max_have_type -.max_try_lhs_float: - cmp qword [rsp + MAX_TAG], TAG_FLOAT - jne .max_try_rhs ; Bool/None: try RHS - mov rdi, r14 - lea rax, [rel float_type] -.max_have_type: - mov rax, [rax + PyTypeObject.tp_richcompare] + mov rcx, [rax + PyTypeObject.tp_iter] + test rcx, rcx + jz .mm_iter_type_error + call rcx test rax, rax - jz .max_no_update + jz .mm_iter_type_error + mov [rbp - MM_ITER], rax + mov rbx, [rax + PyObject.ob_type] + mov rbx, [rbx + PyTypeObject.tp_iternext] + mov [rbp - MM_ITERNX], rbx + + ; Get first element → initial best + mov rdi, [rbp - MM_ITER] + call rbx + test edx, edx + jz .mm_iter_empty - ; Call tp_richcompare(left, right, PY_LT, left_tag, right_tag) - mov rdi, r14 - mov rsi, r15 - xor edx, edx ; PY_LT = 0 - mov rcx, [rsp + MAX_TAG] ; left_tag - push rax ; save fn ptr - mov rax, r13 - shl rax, 4 - mov r8, [rbx + rax + 8] ; right_tag - pop rax ; restore fn ptr - call rax - jmp .max_check_result + mov r14, rax ; best payload + mov [rbp - MM_TAG], rdx ; best tag + INCREF_VAL r14, rdx + DECREF_VAL rax, rdx 
; DECREF iternext result -.max_try_rhs: - mov rax, r13 - shl rax, 4 - mov rcx, [rbx + rax + 8] ; reload candidate tag (64-bit) - cmp rcx, TAG_SMALLINT - je .max_no_update ; both SmallInt would have been caught above +.mm_iter_loop: + mov rdi, [rbp - MM_ITER] + call qword [rbp - MM_ITERNX] + test edx, edx + jz .mm_iter_done + + mov r15, rax ; candidate payload + mov r12, rdx ; candidate tag + + ; SmallInt fast path + cmp qword [rbp - MM_TAG], TAG_SMALLINT + jne .mm_iter_slow + cmp r12, TAG_SMALLINT + jne .mm_iter_slow + cmp dword [rbp - MM_CMP_OP], 0 + jne .mm_iter_si_max + cmp r15, r14 + jge .mm_iter_no_update + mov r14, r15 + jmp .mm_iter_no_update +.mm_iter_si_max: + cmp r15, r14 + jle .mm_iter_no_update + mov r14, r15 + jmp .mm_iter_no_update - ; SmallStr? (bit 63 set) +.mm_iter_slow: + ; Resolve candidate type for richcompare + mov rcx, r12 test rcx, rcx - js .max_rhs_smallstr - + js .mm_iter_cand_ss cmp rcx, TAG_PTR - jne .max_try_rhs_float + jne .mm_iter_try_float mov rdi, r15 mov rax, [rdi + PyObject.ob_type] - jmp .max_rhs_have_type -.max_rhs_smallstr: + jmp .mm_iter_have_type +.mm_iter_cand_ss: lea rax, [rel str_type] - jmp .max_rhs_have_type -.max_try_rhs_float: + jmp .mm_iter_have_type +.mm_iter_try_float: cmp rcx, TAG_FLOAT - jne .max_no_update ; Bool/None: skip - mov rdi, r15 + jne .mm_iter_no_update lea rax, [rel float_type] -.max_rhs_have_type: - mov r8, rcx ; save candidate tag - mov rcx, [rax + PyTypeObject.tp_richcompare] - test rcx, rcx - jz .max_no_update +.mm_iter_have_type: + mov rax, [rax + PyTypeObject.tp_richcompare] + test rax, rax + jz .mm_iter_no_update - mov rdi, r15 ; left = candidate - mov rsi, r14 ; right = current max - mov edx, PY_GT - mov rax, rcx ; fn ptr -> rax - mov rcx, r8 ; left_tag = candidate tag - mov r8, [rsp + MAX_TAG] ; right_tag = max tag + ; tp_richcompare(candidate, best, cmp_op, cand_tag, best_tag) + mov rdi, r15 + mov rsi, r14 + mov edx, [rbp - MM_CMP_OP] + mov rcx, r12 + mov r8, [rbp - MM_TAG] call rax 
-.max_check_result: lea rcx, [rel bool_true] cmp rax, rcx - mov [rsp + MAX_CMP_RES], rax - jne .max_slow_no_update + mov [rbp - MM_CMP_RES], rax + jne .mm_iter_slow_no_upd - ; Update max: DECREF old max + ; Update best mov rdi, r14 - mov rsi, [rsp + MAX_TAG] + mov rsi, [rbp - MM_TAG] DECREF_VAL rdi, rsi - ; Set new max = candidate mov r14, r15 - mov rax, r13 - shl rax, 4 - mov rax, [rbx + rax + 8] ; reload candidate tag (64-bit) - mov [rsp + MAX_TAG], rax ; update max_tag - cmp rax, TAG_PTR - jne .max_slow_update_done ; non-refcounted (SmallStr/SmallInt/Float): no INCREF - inc qword [r14 + PyObject.ob_refcnt] -.max_slow_update_done: - ; DECREF richcompare result (always a heap ptr) - mov rdi, [rsp + MAX_CMP_RES] + mov [rbp - MM_TAG], r12 + INCREF_VAL r14, r12 + + mov rdi, [rbp - MM_CMP_RES] call obj_decref - jmp .max_no_update + jmp .mm_iter_no_update -.max_slow_no_update: - mov rdi, [rsp + MAX_CMP_RES] +.mm_iter_slow_no_upd: + mov rdi, [rbp - MM_CMP_RES] call obj_decref -.max_no_update: - inc r13 - jmp .max_loop +.mm_iter_no_update: + ; DECREF candidate + DECREF_VAL r15, r12 + jmp .mm_iter_loop -.max_done: +.mm_iter_done: + mov rdi, [rbp - MM_ITER] + call obj_decref mov rax, r14 - mov rdx, [rsp + MAX_TAG] ; max_tag (64-bit) - add rsp, 16 + mov rdx, [rbp - MM_TAG] pop r15 pop r14 pop r13 @@ -2490,11 +2471,23 @@ DEF_FUNC builtin_max leave ret -.max_error: +.mm_iter_empty: + mov rdi, [rbp - MM_ITER] + call obj_decref + lea rdi, [rel exc_ValueError_type] + CSTRING rsi, "min()/max() arg is an empty sequence" + call raise_exception + +.mm_iter_type_error: lea rdi, [rel exc_TypeError_type] - CSTRING rsi, "max expected at least 1 argument" + CSTRING rsi, "argument is not iterable" call raise_exception -END_FUNC builtin_max + +.mm_error: + lea rdi, [rel exc_TypeError_type] + CSTRING rsi, "min()/max() expected at least 1 argument" + call raise_exception +END_FUNC minmax_impl ; ============================================================================ ; 17. 
builtin_getattr(args, nargs) - getattr(obj, name[, default]) diff --git a/src/eval.asm b/src/eval.asm index 87198de..8addc6d 100644 --- a/src/eval.asm +++ b/src/eval.asm @@ -108,6 +108,14 @@ extern op_import_from extern op_binary_op_add_int extern op_binary_op_sub_int extern op_compare_op_int +extern op_compare_op_int_jump_false +extern op_compare_op_int_jump_true +extern op_binary_op_add_float +extern op_binary_op_sub_float +extern op_binary_op_mul_float +extern op_binary_op_truediv_float +extern op_binary_op_mul_int +extern op_binary_op_floordiv_int extern op_for_iter_list extern op_for_iter_range @@ -1414,14 +1422,14 @@ opcode_table: dq op_binary_op_sub_int ; 212 = BINARY_OP_SUBTRACT_INT (specialized) dq op_for_iter_list ; 213 = FOR_ITER_LIST (specialized) dq op_for_iter_range ; 214 = FOR_ITER_RANGE (specialized) - dq op_unimplemented ; 215 - dq op_unimplemented ; 216 - dq op_unimplemented ; 217 - dq op_unimplemented ; 218 - dq op_unimplemented ; 219 - dq op_unimplemented ; 220 - dq op_unimplemented ; 221 - dq op_unimplemented ; 222 + dq op_compare_op_int_jump_false ; 215 = COMPARE_OP_INT_JUMP_FALSE (superinstruction) + dq op_compare_op_int_jump_true ; 216 = COMPARE_OP_INT_JUMP_TRUE (superinstruction) + dq op_binary_op_add_float ; 217 = BINARY_OP_ADD_FLOAT (specialized) + dq op_binary_op_sub_float ; 218 = BINARY_OP_SUB_FLOAT (specialized) + dq op_binary_op_mul_float ; 219 = BINARY_OP_MUL_FLOAT (specialized) + dq op_binary_op_truediv_float ; 220 = BINARY_OP_TRUEDIV_FLOAT (specialized) + dq op_binary_op_mul_int ; 221 = BINARY_OP_MULTIPLY_INT (specialized) + dq op_binary_op_floordiv_int ; 222 = BINARY_OP_FLOORDIV_INT (specialized) dq op_unimplemented ; 223 dq op_unimplemented ; 224 dq op_unimplemented ; 225 diff --git a/src/frame.asm b/src/frame.asm index fd69df9..9c89aed 100644 --- a/src/frame.asm +++ b/src/frame.asm @@ -178,12 +178,32 @@ DEF_FUNC frame_new test ecx, ecx jz .done push rax ; save frame pointer - mov rdi, rax ; rdi = frame (for localsplus base calc) 
- lea rdi, [rdi + PyFrame.localsplus] + lea rdi, [rax + PyFrame.localsplus] + cmp ecx, 4 + ja .zero_large + ; Small: unrolled stores for 1-4 slots (16 bytes each) + xor eax, eax + mov [rdi], rax + mov [rdi + 8], rax + cmp ecx, 1 + je .zero_done + mov [rdi + 16], rax + mov [rdi + 24], rax + cmp ecx, 2 + je .zero_done + mov [rdi + 32], rax + mov [rdi + 40], rax + cmp ecx, 3 + je .zero_done + mov [rdi + 48], rax + mov [rdi + 56], rax + jmp .zero_done +.zero_large: xor eax, eax - mov ecx, ecx ; zero-extend ecx (already done but be explicit) + mov ecx, ecx ; zero-extend ecx shl ecx, 1 ; 2 qwords per 16-byte slot rep stosq ; store ecx qwords of 0 at [rdi] +.zero_done: pop rax ; restore frame pointer .done: diff --git a/src/itertools.asm b/src/itertools.asm index 58c5fb8..daea435 100644 --- a/src/itertools.asm +++ b/src/itertools.asm @@ -472,14 +472,10 @@ DEF_FUNC_LOCAL enumerate_iternext mov r12, rax ; r12 = value payload from iternext push rdx ; save value tag from iternext - ; Create SmallInt for current count - mov rdi, [rbx + IT_FIELD2] ; it_count (raw i64) - call int_from_i64 - mov r13, rax ; r13 = count payload - push rdx ; save count tag from int_from_i64 - - ; Increment it_count - inc qword [rbx + IT_FIELD2] + ; Inline SmallInt for current count (int_from_i64 always returns SmallInt) + mov r13, [rbx + IT_FIELD2] ; r13 = count (raw i64 = SmallInt payload) + inc qword [rbx + IT_FIELD2] ; increment for next time + push qword TAG_SMALLINT ; count tag (always SmallInt) ; Create 2-tuple mov rdi, 2 diff --git a/src/lib/memops.asm b/src/lib/memops.asm index 8952d4b..db77f4f 100644 --- a/src/lib/memops.asm +++ b/src/lib/memops.asm @@ -23,6 +23,35 @@ DEF_FUNC_BARE ap_memset ret END_FUNC ap_memset +; ap_memmove(void *dst, const void *src, size_t n) -> void *dst +; Handles overlapping regions. n must be a multiple of 8. +; Forward: rep movsq (fast). Backward: manual qword loop (avoids std penalty). 
+DEF_FUNC_BARE ap_memmove + mov rax, rdi ; save dst for return + mov rcx, rdx + shr rcx, 3 ; qword count = n / 8 + jz .memmove_done + cmp rdi, rsi + je .memmove_done ; dst == src, nop + jb .memmove_fwd ; dst < src: forward safe +.memmove_bk: + ; dst > src: copy backward to avoid overlap corruption + lea rsi, [rsi + rdx - 8] + lea rdi, [rdi + rdx - 8] +.memmove_bk_loop: + mov r8, [rsi] + mov [rdi], r8 + sub rsi, 8 + sub rdi, 8 + dec rcx + jnz .memmove_bk_loop + ret +.memmove_fwd: + rep movsq +.memmove_done: + ret +END_FUNC ap_memmove + ; ap_memcmp(const void *s1, const void *s2, size_t n) -> int ; Returns 0 if equal, <0 if s10 if s1>s2 DEF_FUNC_BARE ap_memcmp diff --git a/src/methods.asm b/src/methods.asm index 9af8482..6e9dbd3 100644 --- a/src/methods.asm +++ b/src/methods.asm @@ -15,6 +15,7 @@ extern ap_free extern ap_realloc extern ap_memcpy extern ap_memset +extern ap_memmove extern ap_strcmp extern ap_strlen extern ap_strstr @@ -4129,22 +4130,18 @@ DEF_FUNC list_method_pop push qword [rax + rcx + 8] ; save item tag on stack ; Don't DECREF since we're transferring ownership to caller - ; Shift items down: for i = index .. 
size-2, items[i] = items[i+1] - mov rcx, r13 ; i = index - mov rdx, [rbx + PyListObject.ob_size] - dec rdx ; size - 1 -.pop_shift: - cmp rcx, rdx - jge .pop_shrink + ; Shift items down: memmove(&items[idx], &items[idx+1], (size-1-idx)*16) mov rax, [rbx + PyListObject.ob_item] - mov r8, rcx - shl r8, 4 ; i * 16 - mov r9, [rax + r8 + 16] ; items[i+1] payload - mov r10, [rax + r8 + 24] ; items[i+1] tag - mov [rax + r8], r9 ; items[i] payload - mov [rax + r8 + 8], r10 ; items[i] tag - inc rcx - jmp .pop_shift + mov rcx, r13 + shl rcx, 4 ; idx * 16 + lea rdi, [rax + rcx] ; dst = &items[idx] + lea rsi, [rdi + 16] ; src = &items[idx+1] + mov rdx, [rbx + PyListObject.ob_size] + sub rdx, r13 + dec rdx ; count = size - idx - 1 + shl rdx, 4 ; bytes = count * 16 + jz .pop_shrink ; nothing to shift if popping last + call ap_memmove .pop_shrink: dec qword [rbx + PyListObject.ob_size] @@ -4222,21 +4219,17 @@ DEF_FUNC list_method_insert mov [rbx + PyListObject.ob_item], rax .ins_no_grow: - ; Shift items up: for i = size-1 down to index, items[i+1] = items[i] - mov rcx, [rbx + PyListObject.ob_size] - dec rcx ; i = size - 1 -.ins_shift: - cmp rcx, r12 - jl .ins_place + ; Shift items up: memmove(&items[idx+1], &items[idx], (size-idx)*16) mov rax, [rbx + PyListObject.ob_item] - mov r8, rcx - shl r8, 4 ; i * 16 - mov r9, [rax + r8] ; payload - mov r10, [rax + r8 + 8] ; tag - mov [rax + r8 + 16], r9 ; items[i+1] payload - mov [rax + r8 + 24], r10 ; items[i+1] tag - dec rcx - jmp .ins_shift + mov rcx, r12 + shl rcx, 4 ; idx * 16 + lea rsi, [rax + rcx] ; src = &items[idx] + lea rdi, [rsi + 16] ; dst = &items[idx+1] + mov rdx, [rbx + PyListObject.ob_size] + sub rdx, r12 ; count = size - idx + shl rdx, 4 ; bytes = count * 16 + jz .ins_place ; nothing to shift if inserting at end + call ap_memmove .ins_place: ; Place item at index (16-byte fat slot) @@ -6417,22 +6410,18 @@ DEF_FUNC list_method_remove mov r12, [rax + rcx] ; item payload (save for DECREF) mov r13, [rax + rcx + 8] ; item tag 
(save for DECREF) - ; Shift remaining items left (16-byte fat elements) + ; Shift remaining items left: memmove(&items[idx], &items[idx+1], (size-1-idx)*16) + mov rax, [rbx + PyListObject.ob_item] mov rcx, r14 + shl rcx, 4 ; idx * 16 + lea rdi, [rax + rcx] ; dst = &items[idx] + lea rsi, [rdi + 16] ; src = &items[idx+1] mov rdx, [rbx + PyListObject.ob_size] - dec rdx ; size - 1 -.lremove_shift: - cmp rcx, rdx - jge .lremove_shrink - mov rax, [rbx + PyListObject.ob_item] - mov r8, rcx - shl r8, 4 ; i * 16 - mov r9, [rax + r8 + 16] ; items[i+1] payload - mov r10, [rax + r8 + 24] ; items[i+1] tag - mov [rax + r8], r9 ; items[i] payload - mov [rax + r8 + 8], r10 ; items[i] tag - inc rcx - jmp .lremove_shift + sub rdx, r14 + dec rdx ; count = size - idx - 1 + shl rdx, 4 ; bytes = count * 16 + jz .lremove_shrink ; nothing to shift if removing last + call ap_memmove .lremove_shrink: dec qword [rbx + PyListObject.ob_size] diff --git a/src/opcodes_misc.asm b/src/opcodes_misc.asm index c6c96b1..6b6f8c7 100644 --- a/src/opcodes_misc.asm +++ b/src/opcodes_misc.asm @@ -42,6 +42,7 @@ extern raise_exception extern exc_RuntimeError_type extern exc_StopIteration_type extern exc_TypeError_type +extern exc_ZeroDivisionError_type extern current_exception extern eval_exception_unwind extern obj_incref @@ -145,6 +146,24 @@ DEF_FUNC_BARE op_binary_op cmp ecx, 23 ; NB_INPLACE_SUBTRACT je .binop_try_smallint_sub + ; Fast path: SmallInt multiply (NB_MULTIPLY=5, NB_INPLACE_MULTIPLY=18) + cmp ecx, 5 ; NB_MULTIPLY + je .binop_try_smallint_mul + cmp ecx, 18 ; NB_INPLACE_MULTIPLY + je .binop_try_smallint_mul + + ; Fast path: float truediv (NB_TRUE_DIVIDE=11, NB_INPLACE_TRUE_DIVIDE=24) + cmp ecx, 11 ; NB_TRUE_DIVIDE + je .binop_try_float_truediv + cmp ecx, 24 ; NB_INPLACE_TRUE_DIVIDE + je .binop_try_float_truediv + + ; Fast path: SmallInt floor divide (NB_FLOOR_DIVIDE=2, NB_INPLACE_FLOOR_DIVIDE=15) + cmp ecx, 2 ; NB_FLOOR_DIVIDE + je .binop_try_smallint_fdiv + cmp ecx, 15 ; NB_INPLACE_FLOOR_DIVIDE 
+ je .binop_try_smallint_fdiv + .binop_generic: ; Save operands + tags for DECREF after call (push on machine stack) ; Stack layout: [rsp+BO_RIGHT], [rsp+BO_RTAG], [rsp+BO_LEFT], [rsp+BO_LTAG] @@ -263,7 +282,18 @@ DEF_FUNC_BARE op_binary_op ; If inplace slot was NULL, fall back to non-inplace slot cmp r9d, 13 jl .binop_try_dunder ; not inplace, no fallback - ; Reload type's tp_as_number (may have been clobbered) + ; Map inplace op to non-inplace offset + mov ecx, r9d + sub ecx, 13 ; inplace → base op + lea rdx, [rel binary_op_offsets] + mov rdx, [rdx + rcx*8] ; non-inplace offset + ; Float coercion: if either operand is float, use float_number_methods + ; (mirrors the initial float coercion at .use_float_methods) + cmp qword [rsp + BO_LTAG], TAG_FLOAT + je .binop_fallback_float + cmp qword [rsp + BO_RTAG], TAG_FLOAT + je .binop_fallback_float + ; Reload type's tp_as_number cmp qword [rsp + BO_LTAG], TAG_SMALLINT je .binop_fallback_int cmp qword [rsp + BO_LTAG], TAG_BOOL @@ -274,6 +304,9 @@ DEF_FUNC_BARE op_binary_op jz .binop_try_dunder mov rax, [rdi + PyObject.ob_type] jmp .binop_fallback_have_type +.binop_fallback_float: + lea rax, [rel float_number_methods] + jmp .binop_fallback_have_methods .binop_fallback_int: lea rax, [rel int_type] jmp .binop_fallback_have_type @@ -281,13 +314,9 @@ DEF_FUNC_BARE op_binary_op lea rax, [rel str_type] .binop_fallback_have_type: mov rax, [rax + PyTypeObject.tp_as_number] +.binop_fallback_have_methods: test rax, rax jz .binop_try_dunder - ; Map inplace op to non-inplace offset - mov ecx, r9d - sub ecx, 13 ; inplace → base op - lea rdx, [rel binary_op_offsets] - mov rdx, [rdx + rcx*8] ; non-inplace offset mov rax, [rax + rdx] test rax, rax jz .binop_try_dunder @@ -506,7 +535,7 @@ DEF_FUNC_BARE op_binary_op .binop_try_smallint_add: ; Check both TAG_SMALLINT cmp r9d, TAG_SMALLINT - jne .binop_generic + jne .binop_try_float_add cmp r8d, TAG_SMALLINT jne .binop_generic @@ -515,18 +544,31 @@ DEF_FUNC_BARE op_binary_op mov rdx, rsi add 
rax, rdx jo .binop_generic ; overflow → fall back to generic - ; Encode as SmallInt ; Specialize: rewrite opcode to BINARY_OP_ADD_INT (211) mov byte [rbx - 2], 211 - ; No DECREF needed (SmallInt are not refcounted) VPUSH_INT rax add rbx, 2 DISPATCH +.binop_try_float_add: + cmp r9d, TAG_FLOAT + jne .binop_generic + cmp r8d, TAG_FLOAT + jne .binop_generic + ; Both float: inline add + mov byte [rbx - 2], 217 ; BINARY_OP_ADD_FLOAT + movq xmm0, rdi + movq xmm1, rsi + addsd xmm0, xmm1 + movq rax, xmm0 + VPUSH_FLOAT rax + add rbx, 2 + DISPATCH + .binop_try_smallint_sub: ; Check both TAG_SMALLINT cmp r9d, TAG_SMALLINT - jne .binop_generic + jne .binop_try_float_sub cmp r8d, TAG_SMALLINT jne .binop_generic @@ -535,10 +577,98 @@ DEF_FUNC_BARE op_binary_op mov rdx, rsi sub rax, rdx jo .binop_generic ; overflow → fall back to generic - ; Encode as SmallInt ; Specialize: rewrite opcode to BINARY_OP_SUBTRACT_INT (212) mov byte [rbx - 2], 212 - ; No DECREF needed (SmallInt are not refcounted) + VPUSH_INT rax + add rbx, 2 + DISPATCH + +.binop_try_float_sub: + cmp r9d, TAG_FLOAT + jne .binop_generic + cmp r8d, TAG_FLOAT + jne .binop_generic + ; Both float: inline sub + mov byte [rbx - 2], 218 ; BINARY_OP_SUB_FLOAT + movq xmm0, rdi + movq xmm1, rsi + subsd xmm0, xmm1 + movq rax, xmm0 + VPUSH_FLOAT rax + add rbx, 2 + DISPATCH + +.binop_try_smallint_mul: + ; Check both TAG_SMALLINT + cmp r9d, TAG_SMALLINT + jne .binop_try_float_mul + cmp r8d, TAG_SMALLINT + jne .binop_generic + + ; Both SmallInt: multiply, check overflow + mov rax, rdi + imul rsi + jo .binop_generic ; overflow → fall back to generic + ; Specialize: rewrite opcode to BINARY_OP_MULTIPLY_INT (221) + mov byte [rbx - 2], 221 + VPUSH_INT rax + add rbx, 2 + DISPATCH + +.binop_try_float_mul: + cmp r9d, TAG_FLOAT + jne .binop_generic + cmp r8d, TAG_FLOAT + jne .binop_generic + ; Both float: inline mul + mov byte [rbx - 2], 219 ; BINARY_OP_MUL_FLOAT + movq xmm0, rdi + movq xmm1, rsi + mulsd xmm0, xmm1 + movq rax, xmm0 + 
+    VPUSH_FLOAT rax
+    add rbx, 2
+    DISPATCH
+
+.binop_try_float_truediv:
+    cmp r9d, TAG_FLOAT
+    jne .binop_generic
+    cmp r8d, TAG_FLOAT
+    jne .binop_generic
+    ; Both float: check for division by zero
+    movq xmm1, rsi
+    xorpd xmm2, xmm2
+    ucomisd xmm1, xmm2
+    je .binop_generic ; zero divisor → generic path raises ZeroDivisionError
+    ; Inline truediv
+    mov byte [rbx - 2], 220 ; BINARY_OP_TRUEDIV_FLOAT
+    movq xmm0, rdi
+    divsd xmm0, xmm1
+    movq rax, xmm0
+    VPUSH_FLOAT rax
+    add rbx, 2
+    DISPATCH
+
+.binop_try_smallint_fdiv:
+    cmp r9d, TAG_SMALLINT
+    jne .binop_generic
+    cmp r8d, TAG_SMALLINT
+    jne .binop_generic
+    lea rcx, [rsi + 1]
+    cmp rcx, 1
+    jbe .binop_generic ; divisor 0 (raises) or -1 (INT64_MIN // -1 quotient overflows idiv → #DE)
+    mov rax, rdi
+    cqo
+    idiv rsi ; rax=quotient, rdx=remainder
+    ; Floor: if remainder != 0 and signs differ, subtract 1
+    test rdx, rdx
+    jz .fdiv_exact
+    mov rcx, rdi
+    xor rcx, rsi
+    jns .fdiv_exact ; same sign → truncation == floor
+    dec rax
+.fdiv_exact:
+    mov byte [rbx - 2], 222 ; specialize to BINARY_OP_FLOORDIV_INT
     VPUSH_INT rax
     add rbx, 2
     DISPATCH
@@ -567,9 +697,22 @@ DEF_FUNC_BARE op_compare_op
     cmp r8d, TAG_SMALLINT
     jne .cmp_slow_path
-    ; Both SmallInt: specialize to COMPARE_OP_INT (209)
-    mov byte [rbx - 2], 209
-
+    ; Both SmallInt: specialize — check if next opcode is POP_JUMP_IF_FALSE/TRUE
+    ; rbx points past 2-byte instruction; CACHE at [rbx], next opcode at [rbx+2]
+    cmp byte [rbx + 2], 114 ; POP_JUMP_IF_FALSE
+    je .cmp_specialize_jump_false
+    cmp byte [rbx + 2], 115 ; POP_JUMP_IF_TRUE
+    je .cmp_specialize_jump_true
+    mov byte [rbx - 2], 209 ; plain COMPARE_OP_INT
+    jmp .cmp_do_compare
+.cmp_specialize_jump_false:
+    mov byte [rbx - 2], 215 ; COMPARE_OP_INT_JUMP_FALSE
+    jmp .cmp_do_compare
+.cmp_specialize_jump_true:
+    mov byte [rbx - 2], 216 ; COMPARE_OP_INT_JUMP_TRUE
+    ; fall through
+
+.cmp_do_compare:
     ; Both SmallInt: decode and compare
     mov rax, rdi
     mov rdx, rsi
@@ -598,12 +741,7 @@ DEF_FUNC_BARE op_compare_op
 .cmp_push_bool:
     movzx eax, al ; eax =
0 or 1 - lea rdx, [rel bool_false] - lea r8, [rel bool_true] - test eax, eax - cmovnz rdx, r8 ; rdx = bool_true if true, else bool_false - inc qword [rdx + PyObject.ob_refcnt] - VPUSH_PTR rdx + VPUSH_BOOL rax ; (0/1, TAG_BOOL) — no INCREF needed add rbx, 2 DISPATCH @@ -1053,76 +1191,76 @@ END_FUNC op_unary_not ;; (2-byte units from start of co_code). ;; ============================================================================ DEF_FUNC_BARE op_pop_jump_if_false - ; Save arg (target offset) before call - push rcx ; save target offset on machine stack - VPOP rdi ; rdi = value to test mov r8, [r13 + 8] ; r8 = value tag - ; Save value + tag for DECREF - push r8 - push rdi + ; Fast path: TAG_BOOL — payload is 0/1, no DECREF needed + cmp r8d, TAG_BOOL + je .pjif_bool_fast - ; Call obj_is_true(value, tag) -> 0 (false) or 1 (true) + ; Slow path: call obj_is_true + DECREF + push rcx ; save target offset + push r8 ; save tag for DECREF + push rdi ; save value for DECREF mov rsi, r8 ; 64-bit for SmallStr call obj_is_true - push rax ; save truthiness on machine stack - - ; DECREF the popped value (tag-aware) + push rax ; save truthiness mov rdi, [rsp + 8] ; reload value mov rsi, [rsp + 16] ; tag DECREF_VAL rdi, rsi pop rax ; restore truthiness add rsp, 16 ; discard saved value + tag pop rcx ; restore target offset - - ; If false (result == 0), jump to target test eax, eax jnz .no_jump - - ; Jump: relative from current rbx (delta in instruction words) lea rbx, [rbx + rcx*2] - .no_jump: DISPATCH + +.pjif_bool_fast: + test edi, edi + jnz .pjif_no_jump ; truthy → don't jump + lea rbx, [rbx + rcx*2] ; jump +.pjif_no_jump: + DISPATCH END_FUNC op_pop_jump_if_false ;; ============================================================================ ;; op_pop_jump_if_true - Pop TOS, jump if truthy ;; ============================================================================ DEF_FUNC_BARE op_pop_jump_if_true - ; Save arg (delta in instruction words) - push rcx ; save target offset on 
machine stack - VPOP rdi mov r8, [r13 + 8] ; r8 = value tag - ; Save value + tag for DECREF - push r8 - push rdi + ; Fast path: TAG_BOOL — payload is 0/1, no DECREF needed + cmp r8d, TAG_BOOL + je .pjit_bool_fast - ; Call obj_is_true(value, tag) + ; Slow path: call obj_is_true + DECREF + push rcx ; save target offset + push r8 ; save tag for DECREF + push rdi ; save value for DECREF mov rsi, r8 ; 64-bit for SmallStr call obj_is_true - push rax ; save truthiness on machine stack - - ; DECREF the popped value (tag-aware) + push rax ; save truthiness mov rdi, [rsp + 8] ; reload value mov rsi, [rsp + 16] ; tag DECREF_VAL rdi, rsi pop rax ; restore truthiness add rsp, 16 ; discard saved value + tag pop rcx ; restore target offset - - ; If true (result != 0), jump to target test eax, eax jz .no_jump - - ; Jump: relative from current rbx lea rbx, [rbx + rcx*2] - .no_jump: DISPATCH + +.pjit_bool_fast: + test edi, edi + jz .pjit_no_jump ; falsy → don't jump + lea rbx, [rbx + rcx*2] ; jump +.pjit_no_jump: + DISPATCH END_FUNC op_pop_jump_if_true ;; ============================================================================ @@ -2996,6 +3134,200 @@ DEF_FUNC_BARE op_binary_op_sub_int DISPATCH END_FUNC op_binary_op_sub_int +;; ============================================================================ +;; op_binary_op_add_float - Specialized float add (opcode 217) +;; +;; Guard: both TOS and TOS1 must be TAG_FLOAT. +;; On guard failure: deopt back to BINARY_OP (122). +;; Followed by 1 CACHE entry (2 bytes). 
+;; ============================================================================ +DEF_FUNC_BARE op_binary_op_add_float + VPOP rsi ; right + mov r8, [r13 + 8] ; right tag + VPOP rdi ; left + mov r9, [r13 + 8] ; left tag + cmp r9d, TAG_FLOAT + jne .add_float_deopt_repush + cmp r8d, TAG_FLOAT + jne .add_float_deopt_repush + movq xmm0, rdi + movq xmm1, rsi + addsd xmm0, xmm1 + movq rax, xmm0 + VPUSH_FLOAT rax + add rbx, 2 ; skip CACHE + DISPATCH +.add_float_deopt_repush: + VUNDROP 2 +.add_float_deopt: + mov byte [rbx - 2], 122 + sub rbx, 2 + DISPATCH +END_FUNC op_binary_op_add_float + +;; ============================================================================ +;; op_binary_op_sub_float - Specialized float subtract (opcode 218) +;; ============================================================================ +DEF_FUNC_BARE op_binary_op_sub_float + VPOP rsi ; right + mov r8, [r13 + 8] ; right tag + VPOP rdi ; left + mov r9, [r13 + 8] ; left tag + cmp r9d, TAG_FLOAT + jne .sub_float_deopt_repush + cmp r8d, TAG_FLOAT + jne .sub_float_deopt_repush + movq xmm0, rdi + movq xmm1, rsi + subsd xmm0, xmm1 + movq rax, xmm0 + VPUSH_FLOAT rax + add rbx, 2 ; skip CACHE + DISPATCH +.sub_float_deopt_repush: + VUNDROP 2 +.sub_float_deopt: + mov byte [rbx - 2], 122 + sub rbx, 2 + DISPATCH +END_FUNC op_binary_op_sub_float + +;; ============================================================================ +;; op_binary_op_mul_float - Specialized float multiply (opcode 219) +;; ============================================================================ +DEF_FUNC_BARE op_binary_op_mul_float + VPOP rsi ; right + mov r8, [r13 + 8] ; right tag + VPOP rdi ; left + mov r9, [r13 + 8] ; left tag + cmp r9d, TAG_FLOAT + jne .mul_float_deopt_repush + cmp r8d, TAG_FLOAT + jne .mul_float_deopt_repush + movq xmm0, rdi + movq xmm1, rsi + mulsd xmm0, xmm1 + movq rax, xmm0 + VPUSH_FLOAT rax + add rbx, 2 ; skip CACHE + DISPATCH +.mul_float_deopt_repush: + VUNDROP 2 +.mul_float_deopt: + mov byte [rbx - 
2], 122 + sub rbx, 2 + DISPATCH +END_FUNC op_binary_op_mul_float + +;; ============================================================================ +;; op_binary_op_truediv_float - Specialized float truediv (opcode 220) +;; ============================================================================ +DEF_FUNC_BARE op_binary_op_truediv_float + VPOP rsi ; right + mov r8, [r13 + 8] ; right tag + VPOP rdi ; left + mov r9, [r13 + 8] ; left tag + cmp r9d, TAG_FLOAT + jne .truediv_float_deopt_repush + cmp r8d, TAG_FLOAT + jne .truediv_float_deopt_repush + ; Check for division by zero + movq xmm1, rsi + xorpd xmm2, xmm2 + ucomisd xmm1, xmm2 + je .truediv_float_deopt_repush ; zero divisor → deopt to generic (raises ZeroDivisionError) + movq xmm0, rdi + divsd xmm0, xmm1 + movq rax, xmm0 + VPUSH_FLOAT rax + add rbx, 2 ; skip CACHE + DISPATCH +.truediv_float_deopt_repush: + VUNDROP 2 +.truediv_float_deopt: + mov byte [rbx - 2], 122 + sub rbx, 2 + DISPATCH +END_FUNC op_binary_op_truediv_float + +;; ============================================================================ +;; op_binary_op_mul_int - Specialized SmallInt multiply (opcode 221) +;; +;; Guard: both TOS and TOS1 must be SmallInt. +;; On guard failure: deopt back to BINARY_OP (122). +;; Followed by 1 CACHE entry (2 bytes). 
+;; ============================================================================ +DEF_FUNC_BARE op_binary_op_mul_int + VPOP rsi ; right + mov r8, [r13 + 8] ; right tag + VPOP rdi ; left + mov r9, [r13 + 8] ; left tag + cmp r9d, TAG_SMALLINT + jne .mul_int_deopt_repush + cmp r8d, TAG_SMALLINT + jne .mul_int_deopt_repush + mov rax, rdi + imul rsi + jo .mul_int_deopt_repush_vals + VPUSH_INT rax + add rbx, 2 ; skip CACHE + DISPATCH +.mul_int_deopt_repush_vals: + ; imul clobbered rax/rdx, use saved values + VPUSH_INT rdi + VPUSH_INT rsi + jmp .mul_int_deopt +.mul_int_deopt_repush: + VUNDROP 2 +.mul_int_deopt: + mov byte [rbx - 2], 122 + sub rbx, 2 + DISPATCH +END_FUNC op_binary_op_mul_int + +;; ============================================================================ +;; op_binary_op_floordiv_int - Specialized SmallInt floor divide (opcode 222) +;; +;; Guard: both TOS and TOS1 must be SmallInt, right != 0. +;; On guard failure: deopt back to BINARY_OP (122). +;; Followed by 1 CACHE entry (2 bytes). 
+;; ============================================================================ +DEF_FUNC_BARE op_binary_op_floordiv_int + VPOP rsi ; right + mov r8, [r13 + 8] ; right tag + VPOP rdi ; left + mov r9, [r13 + 8] ; left tag + ; Guard: both SmallInt + cmp r9d, TAG_SMALLINT + jne .fdiv_int_deopt_repush + cmp r8d, TAG_SMALLINT + jne .fdiv_int_deopt_repush + ; Guard: right != 0 + test rsi, rsi + jz .fdiv_int_deopt_repush + ; Floor divide + mov rax, rdi + cqo + idiv rsi ; rax=quotient, rdx=remainder + ; Floor: if remainder != 0 and signs differ, subtract 1 + test rdx, rdx + jz .fdiv_int_exact + mov rcx, rdi + xor rcx, rsi + jns .fdiv_int_exact ; same sign → truncation == floor + dec rax +.fdiv_int_exact: + VPUSH_INT rax + add rbx, 2 ; skip CACHE + DISPATCH +.fdiv_int_deopt_repush: + VUNDROP 2 +.fdiv_int_deopt: + mov byte [rbx - 2], 122 + sub rbx, 2 + DISPATCH +END_FUNC op_binary_op_floordiv_int + ;; ============================================================================ ;; op_compare_op_int - Specialized SmallInt comparison (opcode 209) ;; @@ -3041,12 +3373,7 @@ DEF_FUNC_BARE op_compare_op_int .ci_push_bool: movzx eax, al ; eax = 0 or 1 - lea rdx, [rel bool_false] - lea r8, [rel bool_true] - test eax, eax - cmovnz rdx, r8 ; rdx = bool_true if true, else bool_false - inc qword [rdx + PyObject.ob_refcnt] - VPUSH_PTR rdx + VPUSH_BOOL rax ; (0/1, TAG_BOOL) — no INCREF needed add rbx, 2 ; skip CACHE DISPATCH @@ -3069,3 +3396,142 @@ section .text sub rbx, 2 DISPATCH END_FUNC op_compare_op_int + +;; ============================================================================ +;; op_compare_op_int_jump_false - Fused COMPARE_OP_INT + POP_JUMP_IF_FALSE (215) +;; +;; Guard: both TOS and TOS1 must be SmallInt. +;; On guard failure: deopt back to COMPARE_OP (107). +;; ecx = arg (comparison op = arg >> 4). +;; Followed by 1 CACHE entry (2 bytes), then POP_JUMP_IF_FALSE (2 bytes). 
+;; ============================================================================ +DEF_FUNC_BARE op_compare_op_int_jump_false + shr ecx, 4 ; ecx = comparison op (0-5) + VPOP rsi ; right + mov r8, [r13 + 8] ; right tag + VPOP rdi ; left + mov r9, [r13 + 8] ; left tag + ; Guard: both SmallInt + cmp r9d, TAG_SMALLINT + jne .cijf_deopt_repush + cmp r8d, TAG_SMALLINT + jne .cijf_deopt_repush + ; Read jump target from POP_JUMP_IF_FALSE arg (at rbx+3) + movzx r8d, byte [rbx + 3] + ; Compare + cmp rdi, rsi + lea r9, [rel .cijf_setcc_table] + jmp [r9 + rcx*8] + +.cijf_lt: + setl al + jmp .cijf_branch +.cijf_le: + setle al + jmp .cijf_branch +.cijf_eq: + sete al + jmp .cijf_branch +.cijf_ne: + setne al + jmp .cijf_branch +.cijf_gt: + setg al + jmp .cijf_branch +.cijf_ge: + setge al + ; fall through +.cijf_branch: + ; Skip CACHE (2) + POP_JUMP_IF_FALSE (2) = 4 bytes + add rbx, 4 + test al, al + jnz .cijf_no_jump ; truthy → don't jump (POP_JUMP_IF_FALSE) + lea rbx, [rbx + r8*2] ; jump (r8 = target offset) +.cijf_no_jump: + DISPATCH + +section .data +align 8 +.cijf_setcc_table: + dq .cijf_lt ; PY_LT = 0 + dq .cijf_le ; PY_LE = 1 + dq .cijf_eq ; PY_EQ = 2 + dq .cijf_ne ; PY_NE = 3 + dq .cijf_gt ; PY_GT = 4 + dq .cijf_ge ; PY_GE = 5 +section .text + +.cijf_deopt_repush: + VUNDROP 2 + mov byte [rbx - 2], 107 ; deopt to COMPARE_OP + sub rbx, 2 + DISPATCH +END_FUNC op_compare_op_int_jump_false + +;; ============================================================================ +;; op_compare_op_int_jump_true - Fused COMPARE_OP_INT + POP_JUMP_IF_TRUE (216) +;; +;; Same as above but jumps when comparison is TRUE. 
+;; ============================================================================ +DEF_FUNC_BARE op_compare_op_int_jump_true + shr ecx, 4 ; ecx = comparison op (0-5) + VPOP rsi ; right + mov r8, [r13 + 8] ; right tag + VPOP rdi ; left + mov r9, [r13 + 8] ; left tag + ; Guard: both SmallInt + cmp r9d, TAG_SMALLINT + jne .cijt_deopt_repush + cmp r8d, TAG_SMALLINT + jne .cijt_deopt_repush + ; Read jump target from POP_JUMP_IF_TRUE arg (at rbx+3) + movzx r8d, byte [rbx + 3] + ; Compare + cmp rdi, rsi + lea r9, [rel .cijt_setcc_table] + jmp [r9 + rcx*8] + +.cijt_lt: + setl al + jmp .cijt_branch +.cijt_le: + setle al + jmp .cijt_branch +.cijt_eq: + sete al + jmp .cijt_branch +.cijt_ne: + setne al + jmp .cijt_branch +.cijt_gt: + setg al + jmp .cijt_branch +.cijt_ge: + setge al + ; fall through +.cijt_branch: + ; Skip CACHE (2) + POP_JUMP_IF_TRUE (2) = 4 bytes + add rbx, 4 + test al, al + jz .cijt_no_jump ; falsy → don't jump (POP_JUMP_IF_TRUE) + lea rbx, [rbx + r8*2] ; jump (r8 = target offset) +.cijt_no_jump: + DISPATCH + +section .data +align 8 +.cijt_setcc_table: + dq .cijt_lt ; PY_LT = 0 + dq .cijt_le ; PY_LE = 1 + dq .cijt_eq ; PY_EQ = 2 + dq .cijt_ne ; PY_NE = 3 + dq .cijt_gt ; PY_GT = 4 + dq .cijt_ge ; PY_GE = 5 +section .text + +.cijt_deopt_repush: + VUNDROP 2 + mov byte [rbx - 2], 107 ; deopt to COMPARE_OP + sub rbx, 2 + DISPATCH +END_FUNC op_compare_op_int_jump_true diff --git a/src/pyo/float.asm b/src/pyo/float.asm index 3a34a57..b6cb69a 100644 --- a/src/pyo/float.asm +++ b/src/pyo/float.asm @@ -528,6 +528,31 @@ DEF_FUNC float_pow, 32 movsd xmm0, [rbp-8] ; base movsd xmm1, [rbp-16] ; exp + ; Fast path: exp == 0.5 → sqrtsd (~12 cycles vs ~100+ for general) + movsd xmm2, [rel const_half_f] + ucomisd xmm1, xmm2 + jne .not_sqrt + jp .not_sqrt + ; base >= 0 check (negative base → general path for complex/error) + xorpd xmm3, xmm3 + ucomisd xmm0, xmm3 + jb .fpow_general + sqrtsd xmm0, xmm0 + call float_from_f64 + leave + ret +.not_sqrt: + ; Fast path: exp == 2.0 → 
mulsd + movsd xmm2, [rel const_two_f] + ucomisd xmm1, xmm2 + jne .check_int_exp + jp .check_int_exp + mulsd xmm0, xmm0 + call float_from_f64 + leave + ret + +.check_int_exp: ; Check if exponent is an integer cvtsd2si rcx, xmm1 cvtsi2sd xmm2, rcx @@ -791,7 +816,9 @@ align 8 sign_mask: dq 0x8000000000000000 pos_inf: dq 0x7FF0000000000000 neg_inf: dq 0xFFF0000000000000 -const_one_f: dq 0x3FF0000000000000 ; 1.0 in IEEE 754 +const_one_f: dq 0x3FF0000000000000 ; 1.0 in IEEE 754 +const_half_f: dq 0x3FE0000000000000 ; 0.5 +const_two_f: dq 0x4000000000000000 ; 2.0 align 8 global float_number_methods diff --git a/src/pyo/list.asm b/src/pyo/list.asm index 8c36c22..281ba3f 100644 --- a/src/pyo/list.asm +++ b/src/pyo/list.asm @@ -11,6 +11,8 @@ extern gc_track extern gc_dealloc extern ap_free extern ap_realloc +extern ap_memmove +extern ap_memcpy extern obj_decref extern obj_dealloc extern str_from_cstr @@ -41,6 +43,8 @@ extern list_sorting_error ;; list_new(int64_t capacity) -> PyListObject* ;; Allocate a new empty list with given initial capacity ;; ============================================================================ +LIST_POOL_MAX equ 16 + DEF_FUNC list_new push rbx push r12 @@ -51,12 +55,26 @@ DEF_FUNC list_new mov r12, 4 ; minimum capacity .has_cap: + ; Try list header pool first + mov rax, [rel list_pool_head] + test rax, rax + jz .alloc_fresh + ; Pop from pool: reuse ob_refcnt slot as next-link + mov rcx, [rax + PyObject.ob_refcnt] + mov [rel list_pool_head], rcx + dec dword [rel list_pool_count] + mov qword [rax + PyObject.ob_refcnt], 1 ; reinit refcount + mov rbx, rax + jmp .init_fields + +.alloc_fresh: ; Allocate PyListObject header (GC-tracked) mov edi, PyListObject_size lea rsi, [rel list_type] call gc_alloc mov rbx, rax ; rbx = list (ob_refcnt=1, ob_type set) +.init_fields: mov qword [rbx + PyListObject.ob_size], 0 mov [rbx + PyListObject.allocated], r12 @@ -549,38 +567,11 @@ DEF_FUNC list_ass_subscript, LAS_FRAME mov rax, r14 shl rax, 4 add rsi, rax - ; 
count in 16-byte slots - ; Use forward or backward copy depending on direction + ; rcx = tail_count (16-byte elements), rdi = dst, rsi = src push rcx - cmp rdi, rsi - jb .las_copy_fwd - ; Backward copy (dst > src, overlap) - dec rcx -.las_copy_bwd_loop: - cmp rcx, 0 - jl .las_copy_done - mov rax, rcx - shl rax, 4 ; offset = i * 16 - mov r10, [rsi + rax] ; payload - mov r11, [rsi + rax + 8] ; tag - mov [rdi + rax], r10 - mov [rdi + rax + 8], r11 - dec rcx - jmp .las_copy_bwd_loop -.las_copy_fwd: - xor edx, edx -.las_copy_fwd_loop: - cmp rdx, rcx - jge .las_copy_done - mov rax, rdx - shl rax, 4 ; offset = i * 16 - mov r10, [rsi + rax] - mov r11, [rsi + rax + 8] - mov [rdi + rax], r10 - mov [rdi + rax + 8], r11 - inc rdx - jmp .las_copy_fwd_loop -.las_copy_done: + shl rcx, 4 ; bytes = tail_count * 16 + mov rdx, rcx ; rdx = byte count for ap_memmove + call ap_memmove pop rcx .las_shift_done: @@ -993,7 +984,7 @@ END_FUNC list_contains ;; ============================================================================ ;; list_dealloc(PyObject *self) -;; DECREF all items, free items array, free list +;; DECREF all items, free items array, free or pool list header ;; ============================================================================ DEF_FUNC list_dealloc push rbx @@ -1020,6 +1011,25 @@ DEF_FUNC list_dealloc mov rdi, [rbx + PyListObject.ob_item] call ap_free + ; Try to pool list header + cmp dword [rel list_pool_count], LIST_POOL_MAX + jge .free_header + ; Untrack from GC before pooling + mov rdi, rbx + extern gc_untrack + call gc_untrack + ; Push to pool: reuse ob_refcnt as next-pointer + mov rcx, [rel list_pool_head] + mov [rbx + PyObject.ob_refcnt], rcx + mov [rel list_pool_head], rbx + inc dword [rel list_pool_count] + pop r13 + pop r12 + pop rbx + leave + ret + +.free_header: mov rdi, rbx call gc_dealloc @@ -1111,6 +1121,40 @@ DEF_FUNC list_getslice mov rdi, [rsp] ; new list mov [rdi + PyListObject.ob_size], rcx + ; Fast path: step == 1 → contiguous memcpy + bulk 
INCREF + cmp r15, 1 + jne .lgs_loop_start + ; src = source->ob_item + start * 16 + mov rsi, [rbx + PyListObject.ob_item] + mov rax, r13 + shl rax, 4 + add rsi, rax + ; dst = new_list->ob_item + mov rdi, [rsp] ; new list + mov rdi, [rdi + PyListObject.ob_item] + ; count = slicelength * 16 + mov rdx, [rsp + 8] ; slicelength + shl rdx, 4 + call ap_memcpy + ; Bulk INCREF all copied elements + mov rcx, [rsp + 8] ; slicelength + test rcx, rcx + jz .lgs_done + mov rdi, [rsp] ; new list + mov rdi, [rdi + PyListObject.ob_item] + xor edx, edx +.lgs_incref_loop: + cmp rdx, rcx + jge .lgs_done + mov rax, rdx + shl rax, 4 + mov r8, [rdi + rax] ; payload + mov r9, [rdi + rax + 8] ; tag + INCREF_VAL r8, r9 + inc rdx + jmp .lgs_incref_loop + +.lgs_loop_start: xor ecx, ecx ; i = 0 .lgs_loop: cmp rcx, [rsp + 8] ; slicelength @@ -1672,6 +1716,12 @@ END_FUNC list_type_call ;; ============================================================================ section .data +; List header pool (freelist, singly-linked via ob_refcnt) +align 8 +list_pool_head: dq 0 ; freelist head +list_pool_count: dd 0 ; current count + dd 0 ; padding + list_name_str: db "list", 0 ; list_repr_str removed - repr now in src/repr.asm diff --git a/src/pyo/set.asm b/src/pyo/set.asm index e9edde5..1754f3a 100644 --- a/src/pyo/set.asm +++ b/src/pyo/set.asm @@ -312,9 +312,10 @@ DEF_FUNC_LOCAL set_find_slot mov rsi, r12 ; b = lookup key mov rcx, [rbp - SFS_KEY_TAG] ; b_tag (lookup key tag, 64-bit) call set_keys_equal + mov edi, eax ; save equality result (survives pops) pop rax ; entry ptr pop rcx ; slot - test eax, eax + test edi, edi jnz .found_existing .find_next: diff --git a/src/pyo/slice.asm b/src/pyo/slice.asm index 09ca15c..5937f23 100644 --- a/src/pyo/slice.asm +++ b/src/pyo/slice.asm @@ -11,10 +11,12 @@ %include "macros.inc" %include "object.inc" %include "types.inc" +%include "gc.inc" extern ap_malloc extern gc_alloc extern gc_track +extern gc_untrack extern gc_dealloc extern ap_free extern obj_incref @@ 
-50,12 +52,25 @@ DEF_FUNC slice_new mov r15d, r8d ; stop_tag push r9 ; save step_tag across malloc + ; Check slice pool first + mov rax, [rel slice_pool_head] + test rax, rax + jz .alloc_fresh + ; Pop from pool: reuse ob_refcnt slot as next-link + mov rcx, [rax + PyObject.ob_refcnt] + mov [rel slice_pool_head], rcx + dec dword [rel slice_pool_count] + mov qword [rax + PyObject.ob_refcnt], 1 ; reinit refcount + jmp .fill_fields + +.alloc_fresh: mov edi, PySliceObject_size lea rsi, [rel slice_type] call gc_alloc +.fill_fields: pop r9 ; step_tag - ; ob_refcnt=1, ob_type set by gc_alloc + ; ob_refcnt=1, ob_type set by gc_alloc (or still set from pool) mov [rax + PySliceObject.start], rbx mov [rax + PySliceObject.start_tag], r14 mov [rax + PySliceObject.stop], r12 @@ -86,6 +101,8 @@ END_FUNC slice_new ;; ============================================================================ ;; slice_dealloc(PySliceObject *self) ;; ============================================================================ +SLICE_POOL_MAX equ 16 + DEF_FUNC slice_dealloc push rbx mov rbx, rdi @@ -99,9 +116,26 @@ DEF_FUNC slice_dealloc mov rdi, [rbx + PySliceObject.step] mov rsi, [rbx + PySliceObject.step_tag] DECREF_VAL rdi, rsi + + ; Untrack from GC mov rdi, rbx - call gc_dealloc + call gc_untrack + + ; Try to push to pool + cmp dword [rel slice_pool_count], SLICE_POOL_MAX + jge .free_it + ; Push to pool: reuse ob_refcnt as next-pointer + mov rcx, [rel slice_pool_head] + mov [rbx + PyObject.ob_refcnt], rcx + mov [rel slice_pool_head], rbx + inc dword [rel slice_pool_count] + pop rbx + leave + ret +.free_it: + lea rdi, [rbx - GC_HEAD_SIZE] + call ap_free pop rbx leave ret @@ -390,6 +424,12 @@ section .data slice_name_str: db "slice", 0 slice_repr_str: db "slice(...)", 0 +; Slice object pool (freelist) +align 8 +slice_pool_head: dq 0 ; freelist head (singly-linked via ob_refcnt slot) +slice_pool_count: dd 0 ; current count + dd 0 ; padding + align 8 global slice_type slice_type: diff --git 
a/src/pyo/tuple.asm b/src/pyo/tuple.asm index c8d07f9..1b606d7 100644 --- a/src/pyo/tuple.asm +++ b/src/pyo/tuple.asm @@ -22,6 +22,7 @@ extern obj_incref extern slice_type extern slice_indices extern type_type +extern gc_untrack extern tuple_traverse extern tuple_clear extern obj_is_true @@ -39,6 +40,36 @@ DEF_FUNC tuple_new mov r12, rdi ; r12 = size (item count) + ; Try pool for small tuples (size 1-3) + cmp r12, 1 + je .try_pool_1 + cmp r12, 2 + je .try_pool_2 + cmp r12, 3 + je .try_pool_3 + jmp .alloc_fresh +.try_pool_1: + lea rcx, [rel tuple_pool_1_head] + jmp .try_pool +.try_pool_2: + lea rcx, [rel tuple_pool_2_head] + jmp .try_pool +.try_pool_3: + lea rcx, [rel tuple_pool_3_head] +.try_pool: + mov rax, [rcx] ; head + test rax, rax + jz .alloc_fresh + mov rdx, [rax + PyObject.ob_refcnt] ; next link + mov [rcx], rdx + dec dword [rcx + 8] ; count-- + mov qword [rax + PyObject.ob_refcnt], 1 + mov rbx, rax + mov [rbx + PyTupleObject.ob_size], r12 + mov qword [rbx + PyTupleObject.ob_hash], -1 + jmp .zero_fill ; zero items, skip gc_alloc+gc_track + +.alloc_fresh: ; Allocate: header (32) + size * 16 (GC-tracked) mov rdi, r12 shl rdi, 4 ; size * 16 @@ -49,9 +80,10 @@ DEF_FUNC tuple_new mov [rbx + PyTupleObject.ob_size], r12 mov qword [rbx + PyTupleObject.ob_hash], -1 ; not computed +.zero_fill: ; Zero-fill the ob_item array (16 bytes per slot) test r12, r12 - jz .done + jz .done_pool lea rdi, [rbx + PyTupleObject.ob_item] xor eax, eax mov rcx, r12 @@ -62,7 +94,11 @@ DEF_FUNC tuple_new dec rcx jnz .zero_loop -.done: +.done_pool: + ; Only gc_track if freshly allocated (pooled tuples are already tracked) + ; Check: if tuple came from pool, ob_type is already set from previous use + ; For fresh alloc, gc_alloc sets ob_type. We can skip gc_track for pooled. + ; Pooled tuples were gc_untracked in dealloc, so we must gc_track them again. 
mov rdi, rbx call gc_track @@ -139,7 +175,9 @@ DEF_FUNC_BARE tuple_len END_FUNC tuple_len ; tuple_dealloc(PyObject *self) -; DECREF_VAL each fat item, then free self +; DECREF_VAL each fat item, then free self or return to pool +TUPLE_POOL_MAX equ 16 + DEF_FUNC tuple_dealloc push rbx push r12 @@ -151,7 +189,7 @@ DEF_FUNC tuple_dealloc .decref_loop: cmp r13, r12 - jge .free_self + jge .try_pool mov rax, r13 shl rax, 4 ; index * 16 mov rdi, [rbx + PyTupleObject.ob_item + rax] @@ -160,6 +198,42 @@ DEF_FUNC tuple_dealloc inc r13 jmp .decref_loop +.try_pool: + ; Try to pool small tuples (size 1-3) + cmp r12, 1 + je .pool_1 + cmp r12, 2 + je .pool_2 + cmp r12, 3 + je .pool_3 + jmp .free_self +.pool_1: + lea rcx, [rel tuple_pool_1_head] + jmp .try_push +.pool_2: + lea rcx, [rel tuple_pool_2_head] + jmp .try_push +.pool_3: + lea rcx, [rel tuple_pool_3_head] +.try_push: + cmp dword [rcx + 8], TUPLE_POOL_MAX + jge .free_self + ; Untrack from GC before pooling + push rcx ; save pool head ptr (caller-saved, clobbered by gc_untrack) + mov rdi, rbx + call gc_untrack + pop rcx ; restore pool head ptr + ; Push to pool: reuse ob_refcnt as next-pointer + mov rdx, [rcx] + mov [rbx + PyObject.ob_refcnt], rdx + mov [rcx], rbx + inc dword [rcx + 8] ; count++ + pop r13 + pop r12 + pop rbx + leave + ret + .free_self: mov rdi, rbx call gc_dealloc @@ -1023,6 +1097,18 @@ END_FUNC tuple_type_call section .data +; Tuple object pools (freelist per size class, singly-linked via ob_refcnt) +align 8 +tuple_pool_1_head: dq 0 ; 1-tuple freelist head +tuple_pool_1_count: dd 0 ; current count + dd 0 ; padding +tuple_pool_2_head: dq 0 ; 2-tuple freelist head +tuple_pool_2_count: dd 0 + dd 0 +tuple_pool_3_head: dq 0 ; 3-tuple freelist head +tuple_pool_3_count: dd 0 + dd 0 + tuple_name_str: db "tuple", 0 ; tuple_repr_str removed - repr now in src/repr.asm diff --git a/tests/test_list_insert_pop.py b/tests/test_list_insert_pop.py new file mode 100644 index 0000000..a8ca199 --- /dev/null +++ 
b/tests/test_list_insert_pop.py @@ -0,0 +1,43 @@ +# Test list insert, pop, and remove shift operations + +# insert at various positions +lst = [1, 2, 3, 4, 5] +lst.insert(0, 0) # insert at front (backward shift of all elements) +lst.insert(3, 99) # insert in middle +lst.insert(len(lst), 100) # insert at end (no shift) +lst.insert(-1, 88) # negative index +print(lst) + +# pop from various positions +lst = [10, 20, 30, 40, 50] +print(lst.pop(0)) # pop front (forward shift of all elements) +print(lst.pop(1)) # pop middle +print(lst.pop()) # pop last (no shift) +print(lst) + +# combined insert+pop pattern (fannkuch's hot path) +lst = list(range(10)) +for i in range(5): + lst.insert(i, lst.pop(0)) +print(lst) + +# remove (also uses shift) +lst = [1, 2, 3, 4, 5] +lst.remove(1) # remove first (shift all) +lst.remove(5) # remove last (no shift) +lst.remove(3) # remove middle +print(lst) + +# edge cases +lst = [42] +print(lst.pop(0)) # single-element pop +print(lst) +lst.insert(0, 99) # insert into empty list +print(lst) + +# larger list shifts +lst = list(range(20)) +lst.insert(0, lst.pop(0)) # pop front, insert front (identity) +print(lst) +lst.insert(10, lst.pop(0)) # pop front, insert middle +print(lst)