From de70368e4540ae5f8e7cc31c9aa9124c49885cd0 Mon Sep 17 00:00:00 2001 From: andrej Date: Fri, 5 Dec 2025 13:19:39 -0700 Subject: [PATCH 1/7] fix regular MHA/softmax --- .../llama_3.2_1b/configs/llama32_1b.json | 4 ++-- applications/llama_3.2_1b/src/block/gqa.py | 4 ++-- operators/softmax/op.py | 22 +++++++++---------- operators/softmax/test.py | 3 +-- 4 files changed, 15 insertions(+), 18 deletions(-) diff --git a/applications/llama_3.2_1b/configs/llama32_1b.json b/applications/llama_3.2_1b/configs/llama32_1b.json index 2d66ff56..6321dcab 100644 --- a/applications/llama_3.2_1b/configs/llama32_1b.json +++ b/applications/llama_3.2_1b/configs/llama32_1b.json @@ -21,8 +21,8 @@ "use_aie_norm1": true, "use_aie_norm2": true, "use_aie_residual": true, - "use_aie_regular_mha": false, - "use_aie_fused_mha": true, + "use_aie_regular_mha": true, + "use_aie_fused_mha": false, "use_aie_final_gemm": false, "rope_freq": { "factor": 32.0, diff --git a/applications/llama_3.2_1b/src/block/gqa.py b/applications/llama_3.2_1b/src/block/gqa.py index 9b9e4ebc..aa4c1982 100644 --- a/applications/llama_3.2_1b/src/block/gqa.py +++ b/applications/llama_3.2_1b/src/block/gqa.py @@ -84,8 +84,8 @@ def __init__( self.aie_softmax = AIESoftmax( num_aie_columns=1, num_channels=1, - size=prompt_length * prompt_length, - last_dim=prompt_length, + rows=prompt_length, + cols=prompt_length, ) M_for_gemm = prompt_length + num_tokens self.aie_mha_gemm_qk = AIEGEMM( diff --git a/operators/softmax/op.py b/operators/softmax/op.py index d1764e3f..f031c4ce 100644 --- a/operators/softmax/op.py +++ b/operators/softmax/op.py @@ -21,16 +21,14 @@ class AIESoftmax(AIEOperatorBase): def __init__( - self, - rows: int, - cols: int, - num_aie_columns=1, - num_channels=1, - tile_size=None, - context=None, + self, + rows: int, + cols: int, + num_aie_columns=1, + num_channels=1, + context=None ): self.size = rows * cols - self.tile_size = tile_size if tile_size is not None else cols self.rows = rows self.cols = cols @@ -46,7 +44,7 @@ def __init__( def set_up_artifacts(self): # Compilation artifacts operator_dir = Path(__file__).parent - file_name_base = f"softmax_{self.num_columns}c_{self.num_channels}ch_{self.size}_{self.tile_size}t" + file_name_base = f"softmax_{self.num_columns}c_{self.num_channels}ch_{self.size}_{self.cols}t" mlir_artifact = PythonGeneratedMLIRArtifact.new( f"{file_name_base}.mlir", @@ -54,11 +52,11 @@ def set_up_artifacts(self): callback_fn="softmax", callback_args=[ self.context.device_manager.device_type, - self.size, + self.rows * self.cols, self.num_columns, self.num_channels, 0, - self.tile_size, + self.cols, ], ) @@ -105,7 +103,7 @@ def set_up_runtime(self): def forward(self, x): applicable = ( x.shape[-1] * x.shape[-2] == self.size - and x.shape[-1] == self.tile_size + and x.shape[-1] == self.cols and x.shape[-1] % 16 == 0 and x.shape[-2] % 16 == 0 ) diff --git a/operators/softmax/test.py b/operators/softmax/test.py index 515b13c0..1b63c3c5 100755 --- a/operators/softmax/test.py +++ b/operators/softmax/test.py @@ -84,8 +84,7 @@ def test_softmax(input_length, num_aie_columns, num_channels, tile_size, aie_con cols=cols, num_aie_columns=num_aie_columns, num_channels=num_channels, - tile_size=tile_size, - context=aie_context, + context=aie_context ) input_buffers = {"in": golden_ref["input"]} From 11b40aa7699abd0cdcad2136439f415789c9aa77 Mon Sep 17 00:00:00 2001 From: andrej Date: Fri, 5 Dec 2025 15:40:15 -0700 Subject: [PATCH 2/7] fix padding in runlist-based SwiGLU --- .../llama_3.2_1b/configs/llama32_1b.json | 4 +- 
operators/swiglu_decode/op.py | 26 ++++---- operators/swiglu_prefill/op.py | 62 +++++++++++++++---- 3 files changed, 66 insertions(+), 26 deletions(-) diff --git a/applications/llama_3.2_1b/configs/llama32_1b.json b/applications/llama_3.2_1b/configs/llama32_1b.json index 6321dcab..2d66ff56 100644 --- a/applications/llama_3.2_1b/configs/llama32_1b.json +++ b/applications/llama_3.2_1b/configs/llama32_1b.json @@ -21,8 +21,8 @@ "use_aie_norm1": true, "use_aie_norm2": true, "use_aie_residual": true, - "use_aie_regular_mha": true, - "use_aie_fused_mha": false, + "use_aie_regular_mha": false, + "use_aie_fused_mha": true, "use_aie_final_gemm": false, "rope_freq": { "factor": 32.0, diff --git a/operators/swiglu_decode/op.py b/operators/swiglu_decode/op.py index 38cb33c0..98975840 100644 --- a/operators/swiglu_decode/op.py +++ b/operators/swiglu_decode/op.py @@ -46,8 +46,6 @@ def __init__(self, embedding_dim, hidden_dim, prio_accuracy=False, context=None) super().__init__(context=context) def set_up_artifacts(self): - # Artifact setup - # --- artifacts = [] device_str = self.context.device_manager.device_str() @@ -57,6 +55,7 @@ def set_up_artifacts(self): num_aie_columns=8, tile_size=1, ) + self.gemv_1 = gemv_1 gemv_1_xclbin, gemv_1_insts = gemv_1.get_artifacts( prefix="swiglu_decode_gemv_1_" ) @@ -75,6 +74,8 @@ def set_up_artifacts(self): num_channels=2, tile_size=self.hidden_dim // 16, ) + self.silu = silu + self.hidden_dim_padded = silu.size silu_xclbin, silu_insts = silu.get_artifacts(prefix="swiglu_decode_silu_") silu_xclbin.xclbin_input = gemv_1_xclbin silu_xclbin.extra_flags += [ @@ -91,6 +92,8 @@ def set_up_artifacts(self): num_channels=2, tile_size=self.hidden_dim // 8, ) + self.eltwise_mul = eltwise_mul + assert self.hidden_dim <= eltwise_mul.size <= self.hidden_dim_padded eltwise_mul_xclbin, eltwise_mul_insts = eltwise_mul.get_artifacts( prefix="swiglu_decode_eltwise_mul_" ) @@ -109,6 +112,7 @@ def set_up_artifacts(self): num_aie_columns=8, tile_size=1, ) + self.gemv_2 = gemv_2 gemv_2_xclbin, gemv_2_insts = gemv_2.get_artifacts( prefix="swiglu_decode_gemv_2_" ) @@ -135,28 +139,26 @@ def set_up_artifacts(self): self.add_artifacts(artifacts) def set_up_runtime(self): - # Runtime setup - # --- self.add_buffer("input", self.embedding_dim) self.add_buffer( "weights_1", - self.embedding_dim * self.hidden_dim, + self.embedding_dim * self.hidden_dim_padded, static_data=torch_to_numpy(self.weights_1), ) self.add_buffer( "weights_2", - self.embedding_dim * self.hidden_dim, + self.embedding_dim * self.hidden_dim_padded, static_data=torch_to_numpy(self.weights_2), ) self.add_buffer( "weights_3", - self.hidden_dim * self.embedding_dim, + self.hidden_dim_padded * self.embedding_dim, static_data=torch_to_numpy(self.weights_3), ) - self.add_buffer("left", self.hidden_dim) - self.add_buffer("left_swished", self.hidden_dim) - self.add_buffer("right", self.hidden_dim) - self.add_buffer("intermediate", self.hidden_dim) + self.add_buffer("left", self.hidden_dim_padded) + self.add_buffer("left_swished", self.hidden_dim_padded) + self.add_buffer("right", self.hidden_dim_padded) + self.add_buffer("intermediate", self.hidden_dim_padded) self.add_buffer("output", self.embedding_dim) self.add_kernel( "swiglu_gemv_1", @@ -191,9 +193,7 @@ def set_up_runtime(self): self.add_to_runlist("swiglu_gemv_2", "weights_3", "intermediate", "output") def forward(self, x): - # Turn into a numpy vector and drop the batch and other higher dimensions, if any; will error if batch or other higher dimensions > 1 x_flat = 
x.reshape(x.shape[-1]) - assert x_flat.shape[0] == self.embedding_dim self.write_buffer("input", x_flat) diff --git a/operators/swiglu_prefill/op.py b/operators/swiglu_prefill/op.py index 32443458..ace713bf 100644 --- a/operators/swiglu_prefill/op.py +++ b/operators/swiglu_prefill/op.py @@ -51,6 +51,9 @@ def __init__( def set_up_artifacts(self): # Artifact setup # --- + # Note: All operators (GEMM, SiLU, ElementwiseMul) apply their own padding + # to meet hardware alignment requirements. We store the padded dimensions + # from GEMM and verify that all operators use consistent padded sizes. artifacts = [] device_str = self.context.device_manager.device_str() @@ -65,6 +68,10 @@ def set_up_artifacts(self): gemm_1 = AIEGEMM( M=self.seq_len, K=self.embedding_dim, N=self.hidden_dim, **accuracy_flags ) + self.gemm_1 = gemm_1 + self.seq_len_padded = gemm_1.M + self.embedding_dim_padded = gemm_1.K + self.hidden_dim_padded = gemm_1.N gemm_1_xclbin, gemm_1_insts = gemm_1.get_artifacts(prefix="swiglu_gemm_1_") gemm_1_xclbin.extra_flags += [ "--xclbin-instance-name=swiglu_gemm_1", @@ -81,6 +88,9 @@ def set_up_artifacts(self): num_channels=2, tile_size=self.hidden_dim // 8, ) + self.silu = silu + assert self.seq_len * self.hidden_dim <= silu.size <= self.seq_len_padded * self.hidden_dim_padded + silu_xclbin, silu_insts = silu.get_artifacts(prefix="swiglu_silu_") silu_xclbin.xclbin_input = gemm_1_xclbin silu_xclbin.extra_flags += [ @@ -97,6 +107,9 @@ def set_up_artifacts(self): num_channels=2, tile_size=self.hidden_dim // 8, ) + self.eltwise_mul = eltwise_mul + assert self.seq_len * self.hidden_dim <= eltwise_mul.size <= self.seq_len_padded * self.hidden_dim_padded + eltwise_mul_xclbin, eltwise_mul_insts = eltwise_mul.get_artifacts( prefix="swiglu_eltwise_mul_" ) @@ -112,6 +125,11 @@ def set_up_artifacts(self): gemm_2 = AIEGEMM( M=self.seq_len, K=self.hidden_dim, N=self.embedding_dim, **accuracy_flags ) + self.gemm_2 = gemm_2 + assert gemm_2.M == self.seq_len_padded + assert gemm_2.K == self.hidden_dim_padded + assert gemm_2.N == self.embedding_dim_padded + gemm_2_xclbin, gemm_2_insts = gemm_2.get_artifacts(prefix="swiglu_gemm_2_") gemm_2_xclbin.xclbin_input = eltwise_mul_xclbin gemm_2_xclbin.extra_flags += [ @@ -138,27 +156,27 @@ def set_up_artifacts(self): def set_up_runtime(self): # Runtime setup # --- - self.add_buffer("input", self.seq_len * self.embedding_dim) + self.add_buffer("input", self.seq_len_padded * self.embedding_dim_padded) self.add_buffer( "weights_1", - self.embedding_dim * self.hidden_dim, + self.embedding_dim_padded * self.hidden_dim_padded, static_data=torch_to_numpy(self.weights_1.T), ) self.add_buffer( "weights_2", - self.embedding_dim * self.hidden_dim, + self.embedding_dim_padded * self.hidden_dim_padded, static_data=torch_to_numpy(self.weights_2.T), ) self.add_buffer( "weights_3", - self.hidden_dim * self.embedding_dim, + self.hidden_dim_padded * self.embedding_dim_padded, static_data=torch_to_numpy(self.weights_3.T), ) - self.add_buffer("left", self.seq_len * self.hidden_dim) - self.add_buffer("left_swished", self.seq_len * self.hidden_dim) - self.add_buffer("right", self.seq_len * self.hidden_dim) - self.add_buffer("intermediate", self.seq_len * self.hidden_dim) - self.add_buffer("output", self.seq_len * self.embedding_dim) + self.add_buffer("left", self.seq_len_padded * self.hidden_dim_padded) + self.add_buffer("left_swished", self.seq_len_padded * self.hidden_dim_padded) + self.add_buffer("right", self.seq_len_padded * self.hidden_dim_padded) + 
self.add_buffer("intermediate", self.seq_len_padded * self.hidden_dim_padded) + self.add_buffer("output", self.seq_len_padded * self.embedding_dim_padded) self.add_kernel( "swiglu_gemm_1", self.combined_xclbin, @@ -212,10 +230,32 @@ def _execute_aie_operation(self, x): # Flatten inputs for AIE processing x_flat = x.view(-1) + + # Verify input size matches expected dimensions + expected_size = batch * self.seq_len * self.embedding_dim + assert x_flat.shape[0] == expected_size + + # Pad input if necessary to match GEMM requirements + if self.seq_len_padded * self.embedding_dim_padded > x_flat.shape[0]: + x_padded = torch.zeros( + self.seq_len_padded * self.embedding_dim_padded, + dtype=x_flat.dtype, + device=x_flat.device + ) + x_padded[:x_flat.shape[0]] = x_flat + x_flat = x_padded self.write_buffer("input", x_flat) - test_pattern = np.zeros(len(x_flat), dtype=bfloat16) self.run_runlist() - result = self.read_buffer_as_torch("output", shape=x_flat.shape, dtype=bfloat16) + + # Read padded output buffer + result_padded = self.read_buffer_as_torch( + "output", + shape=(self.seq_len_padded * self.embedding_dim_padded,), + dtype=bfloat16 + ) + + # Extract only the unpadded portion + result = result_padded[:expected_size].view(batch, -1) return result From 4c50752d98e3fb47447b90499bf425d7ff6e7eac Mon Sep 17 00:00:00 2001 From: andrej Date: Fri, 5 Dec 2025 15:44:16 -0700 Subject: [PATCH 3/7] format --- operators/swiglu_prefill/op.py | 36 +++++++++++++++++++++------------- 1 file changed, 22 insertions(+), 14 deletions(-) diff --git a/operators/swiglu_prefill/op.py b/operators/swiglu_prefill/op.py index ace713bf..6afdfe82 100644 --- a/operators/swiglu_prefill/op.py +++ b/operators/swiglu_prefill/op.py @@ -89,8 +89,12 @@ def set_up_artifacts(self): tile_size=self.hidden_dim // 8, ) self.silu = silu - assert self.seq_len * self.hidden_dim <= silu.size <= self.seq_len_padded * self.hidden_dim_padded - + assert ( + self.seq_len * self.hidden_dim + <= silu.size + <= self.seq_len_padded * self.hidden_dim_padded + ) + silu_xclbin, silu_insts = silu.get_artifacts(prefix="swiglu_silu_") silu_xclbin.xclbin_input = gemm_1_xclbin silu_xclbin.extra_flags += [ @@ -108,8 +112,12 @@ def set_up_artifacts(self): tile_size=self.hidden_dim // 8, ) self.eltwise_mul = eltwise_mul - assert self.seq_len * self.hidden_dim <= eltwise_mul.size <= self.seq_len_padded * self.hidden_dim_padded - + assert ( + self.seq_len * self.hidden_dim + <= eltwise_mul.size + <= self.seq_len_padded * self.hidden_dim_padded + ) + eltwise_mul_xclbin, eltwise_mul_insts = eltwise_mul.get_artifacts( prefix="swiglu_eltwise_mul_" ) @@ -129,7 +137,7 @@ def set_up_artifacts(self): assert gemm_2.M == self.seq_len_padded assert gemm_2.K == self.hidden_dim_padded assert gemm_2.N == self.embedding_dim_padded - + gemm_2_xclbin, gemm_2_insts = gemm_2.get_artifacts(prefix="swiglu_gemm_2_") gemm_2_xclbin.xclbin_input = eltwise_mul_xclbin gemm_2_xclbin.extra_flags += [ @@ -230,31 +238,31 @@ def _execute_aie_operation(self, x): # Flatten inputs for AIE processing x_flat = x.view(-1) - + # Verify input size matches expected dimensions expected_size = batch * self.seq_len * self.embedding_dim assert x_flat.shape[0] == expected_size - + # Pad input if necessary to match GEMM requirements if self.seq_len_padded * self.embedding_dim_padded > x_flat.shape[0]: x_padded = torch.zeros( self.seq_len_padded * self.embedding_dim_padded, dtype=x_flat.dtype, - device=x_flat.device + device=x_flat.device, ) - x_padded[:x_flat.shape[0]] = x_flat + x_padded[: x_flat.shape[0]] 
= x_flat x_flat = x_padded self.write_buffer("input", x_flat) self.run_runlist() - + # Read padded output buffer result_padded = self.read_buffer_as_torch( - "output", - shape=(self.seq_len_padded * self.embedding_dim_padded,), - dtype=bfloat16 + "output", + shape=(self.seq_len_padded * self.embedding_dim_padded,), + dtype=bfloat16, ) - + # Extract only the unpadded portion result = result_padded[:expected_size].view(batch, -1) From 9d5c72a0ec22caff19faed7261cffd204668380d Mon Sep 17 00:00:00 2001 From: andrej Date: Thu, 11 Dec 2025 10:52:06 -0700 Subject: [PATCH 4/7] format --- operators/softmax/op.py | 7 +------ operators/softmax/test.py | 2 +- 2 files changed, 2 insertions(+), 7 deletions(-) diff --git a/operators/softmax/op.py b/operators/softmax/op.py index f031c4ce..7c4cef71 100644 --- a/operators/softmax/op.py +++ b/operators/softmax/op.py @@ -21,12 +21,7 @@ class AIESoftmax(AIEOperatorBase): def __init__( - self, - rows: int, - cols: int, - num_aie_columns=1, - num_channels=1, - context=None + self, rows: int, cols: int, num_aie_columns=1, num_channels=1, context=None ): self.size = rows * cols self.rows = rows diff --git a/operators/softmax/test.py b/operators/softmax/test.py index 1b63c3c5..dd2c297e 100755 --- a/operators/softmax/test.py +++ b/operators/softmax/test.py @@ -84,7 +84,7 @@ def test_softmax(input_length, num_aie_columns, num_channels, tile_size, aie_con cols=cols, num_aie_columns=num_aie_columns, num_channels=num_channels, - context=aie_context + context=aie_context, ) input_buffers = {"in": golden_ref["input"]} From 19c0347211ea4c7a8207cf80ff395382ae687c69 Mon Sep 17 00:00:00 2001 From: andrej Date: Thu, 11 Dec 2025 16:58:51 -0700 Subject: [PATCH 5/7] fix swiglu padding sizes + add test cases for llama --- applications/llama_3.2_1b/test.py | 4 ++-- operators/swiglu_prefill/op.py | 20 ++++++-------------- 2 files changed, 8 insertions(+), 16 deletions(-) diff --git a/applications/llama_3.2_1b/test.py b/applications/llama_3.2_1b/test.py index 914a49c1..b0760efc 100644 --- a/applications/llama_3.2_1b/test.py +++ b/applications/llama_3.2_1b/test.py @@ -11,8 +11,8 @@ def generate_test_params(): - prompt_lengths = [2048] - num_tokens_list = [40] + prompt_lengths = [2048, 13, 3] + num_tokens_list = [40, 1] params = [] names = [] diff --git a/operators/swiglu_prefill/op.py b/operators/swiglu_prefill/op.py index 6afdfe82..6a0d0c2b 100644 --- a/operators/swiglu_prefill/op.py +++ b/operators/swiglu_prefill/op.py @@ -83,17 +83,13 @@ def set_up_artifacts(self): ) # xclbin artifact will be pulled in as a dependency of last xclbin silu = AIESiLU( - size=self.seq_len * self.hidden_dim, + size=self.seq_len_padded * self.hidden_dim_padded, num_aie_columns=8, num_channels=2, - tile_size=self.hidden_dim // 8, + tile_size=self.hidden_dim_padded // 8, ) self.silu = silu - assert ( - self.seq_len * self.hidden_dim - <= silu.size - <= self.seq_len_padded * self.hidden_dim_padded - ) + assert silu.size == self.seq_len_padded * self.hidden_dim_padded silu_xclbin, silu_insts = silu.get_artifacts(prefix="swiglu_silu_") silu_xclbin.xclbin_input = gemm_1_xclbin @@ -106,17 +102,13 @@ def set_up_artifacts(self): artifacts.append(silu_insts) eltwise_mul = AIEElementwiseMul( - size=self.seq_len * self.hidden_dim, + size=self.seq_len_padded * self.hidden_dim_padded, num_aie_columns=8, num_channels=2, - tile_size=self.hidden_dim // 8, + tile_size=self.hidden_dim_padded // 8, ) self.eltwise_mul = eltwise_mul - assert ( - self.seq_len * self.hidden_dim - <= eltwise_mul.size - <= self.seq_len_padded * 
self.hidden_dim_padded - ) + assert eltwise_mul.size == self.seq_len_padded * self.hidden_dim_padded eltwise_mul_xclbin, eltwise_mul_insts = eltwise_mul.get_artifacts( prefix="swiglu_eltwise_mul_" From cf7a176651119c3fa89ca97a169cc33eb2ad7ff3 Mon Sep 17 00:00:00 2001 From: andrej Date: Thu, 11 Dec 2025 16:59:16 -0700 Subject: [PATCH 6/7] make non-runlist execution an option for operators for easier debugging --- operators/common/aie_base.py | 57 +++++++++++++++++++++++---------- operators/common/aie_context.py | 35 +++++++++++--------- 2 files changed, 60 insertions(+), 32 deletions(-) diff --git a/operators/common/aie_base.py b/operators/common/aie_base.py index 79902150..9d6739b2 100644 --- a/operators/common/aie_base.py +++ b/operators/common/aie_base.py @@ -167,23 +167,46 @@ def _move_artifact_paths(self): todo.extend(artifact.depends) def run_runlist(self): - bos = set( - self.buffer_bos[buffer_arg] - for _, *buffer_args in self.runlist - for buffer_arg in buffer_args - ) - insts_bos = set( - self.xrt_kernels[kernel_name][2] for (kernel_name, *_) in self.runlist - ) - for bo in bos | insts_bos: - bo.sync(pyxrt.xclBOSyncDirection.XCL_BO_SYNC_BO_TO_DEVICE) - start = time.perf_counter() - self.xrt_runlist.execute() - self.xrt_runlist.wait() - stop = time.perf_counter() - for bo in bos: - bo.sync(pyxrt.xclBOSyncDirection.XCL_BO_SYNC_BO_FROM_DEVICE) - return stop - start + elapsed = 0.0 + if self.xrt_runlist is None: + # Execute as separate xclbin kernel invocations + for i, (kernel_name, *buffer_args) in enumerate(self.runlist): + context, xrt_kernel, insts_bo, insts_len = self.xrt_kernels[kernel_name] + insts_bo.sync(pyxrt.xclBOSyncDirection.XCL_BO_SYNC_BO_TO_DEVICE) + bos = [self.buffer_bos[buffer_arg] for buffer_arg in buffer_args] + for bo in bos: + bo.sync(pyxrt.xclBOSyncDirection.XCL_BO_SYNC_BO_TO_DEVICE) + opcode = 3 + start = time.perf_counter() + run = xrt_kernel(opcode, insts_bo, insts_len, *bos) + result = run.wait() + stop = time.perf_counter() + elapsed += stop - start + if result != pyxrt.ert_cmd_state.ERT_CMD_STATE_COMPLETED: + raise RuntimeError( + f"Kernel {kernel_name} did not complete correctly: {result}" + ) + for bo in bos: + bo.sync(pyxrt.xclBOSyncDirection.XCL_BO_SYNC_BO_FROM_DEVICE) + else: + bos = set( + self.buffer_bos[buffer_arg] + for _, *buffer_args in self.runlist + for buffer_arg in buffer_args + ) + insts_bos = set( + self.xrt_kernels[kernel_name][2] for (kernel_name, *_) in self.runlist + ) + for bo in bos | insts_bos: + bo.sync(pyxrt.xclBOSyncDirection.XCL_BO_SYNC_BO_TO_DEVICE) + start = time.perf_counter() + self.xrt_runlist.execute() + self.xrt_runlist.wait() + stop = time.perf_counter() + for bo in bos: + bo.sync(pyxrt.xclBOSyncDirection.XCL_BO_SYNC_BO_FROM_DEVICE) + elapsed = stop - start + return elapsed class AIEOperatorConstraintError(RuntimeError): diff --git a/operators/common/aie_context.py b/operators/common/aie_context.py index 7184278e..30de8924 100644 --- a/operators/common/aie_context.py +++ b/operators/common/aie_context.py @@ -14,7 +14,7 @@ class AIEContext: """Context for managing AIE operator compilation and runtime state""" - def __init__(self): + def __init__(self, use_runlist=True): self.operators = [] self.static_data_pool = {} self.device_manager = AIEDeviceManager() @@ -22,6 +22,8 @@ def __init__(self): self.build_dir = Path(os.getcwd()) / "build" self.mlir_aie_dir = Path(aie.utils.config.root_path()) self.peano_dir = Path(aie.utils.config.peano_install_dir()) + # Disable the XRT runlist sacrifices performance by executing kernels 
individually as separate xclbin invocations for easier debugging (can tell which part of runlist execution failed) + self.use_runlist = use_runlist self._runtime_prepared = False def register_operator(self, operator): @@ -146,20 +148,23 @@ def prepare_runtime(self): context, _ = self.device_manager.get_context_and_kernel( str(first_xclbin.path), first_xclbin_kernel_name ) - op.xrt_runlist = pyxrt.runlist(context) - for i, (kernel_name, *buffer_args) in enumerate(op.runlist): - this_context, xrt_kernel, insts_bo, insts_len = op.xrt_kernels[ - kernel_name - ] - assert this_context == context - opcode = 3 - run = pyxrt.run(xrt_kernel) - run.set_arg(0, opcode) - run.set_arg(1, insts_bo) - run.set_arg(2, insts_len) - for j, buffer_arg in enumerate(buffer_args): - run.set_arg(j + 3, op.buffer_bos[buffer_arg]) - op.xrt_runlist.add(run) + if self.use_runlist: + op.xrt_runlist = pyxrt.runlist(context) + for i, (kernel_name, *buffer_args) in enumerate(op.runlist): + this_context, xrt_kernel, insts_bo, insts_len = op.xrt_kernels[ + kernel_name + ] + assert this_context == context + opcode = 3 + run = pyxrt.run(xrt_kernel) + run.set_arg(0, opcode) + run.set_arg(1, insts_bo) + run.set_arg(2, insts_len) + for j, buffer_arg in enumerate(buffer_args): + run.set_arg(j + 3, op.buffer_bos[buffer_arg]) + op.xrt_runlist.add(run) + else: + op.xrt_runlist = None # Log allocation info bo_count = sum(len(pool) for pool in bo_pools.values()) From 3b1ec7d3a4beb74736e8cbab505820164cbab35c Mon Sep 17 00:00:00 2001 From: andrej Date: Thu, 11 Dec 2025 17:06:43 -0700 Subject: [PATCH 7/7] update test cases --- applications/llama_3.2_1b/test.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/applications/llama_3.2_1b/test.py b/applications/llama_3.2_1b/test.py index b0760efc..933b7d5e 100644 --- a/applications/llama_3.2_1b/test.py +++ b/applications/llama_3.2_1b/test.py @@ -11,7 +11,7 @@ def generate_test_params(): - prompt_lengths = [2048, 13, 3] + prompt_lengths = [2048, 13] num_tokens_list = [40, 1] params = []
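
Patch 6 makes non-runlist execution selectable so that a failing step can be isolated: with a runlist, the whole batch of kernel invocations is submitted and waited on as one unit, whereas the fallback path executes each entry separately and reports which kernel did not complete. Below is a minimal sketch of how the new `use_runlist` flag might be exercised from a small debugging script. It is a sketch under assumptions only: the import paths, the operator registering itself through `context=`, and the call sequence around `prepare_runtime()` and `forward()` are inferred from the code visible in these patches, not a confirmed API.

    # Minimal debugging sketch (assumed usage; import paths and call order are
    # inferred from the patches above, not verified against the repository).
    import torch
    from operators.common.aie_context import AIEContext   # path assumed
    from operators.softmax.op import AIESoftmax           # path assumed

    # use_runlist=False trades performance for per-kernel error reporting:
    # each runlist entry runs as its own xclbin invocation, and a failing
    # kernel raises RuntimeError naming that kernel (see run_runlist above).
    ctx = AIEContext(use_runlist=False)
    softmax = AIESoftmax(rows=64, cols=64, num_aie_columns=1, num_channels=1, context=ctx)
    ctx.prepare_runtime()  # assumed to build/load artifacts and allocate buffer objects

    x = torch.rand(64, 64, dtype=torch.bfloat16)  # 64 x 64 satisfies the %16 checks in forward()
    y = softmax.forward(x)  # assumed to return the softmax result computed on the AIE

The design trade-off is that the runlist amortizes submission overhead across all kernels in an operator, while the per-kernel fallback gives a completion status (and hence a clear failure point) for every invocation, which is what the "easier debugging" comment in the `AIEContext` constructor refers to.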