1- // bitnet_kernel_optimal.wgsl
2- // Optimized BitNet B1.58 Ternary Kernel for WGPU (Optimal, but fails on DX12)
1+ // bitnet_kernel_pre_naga.wgsl
2+ // Optimized BitNet B1.58 Ternary Kernel for WGPU (Original, before any DX12/Naga workaround)
33// Supports {-1, 0, +1} ternary weights with efficient packing and vectorization
44
55struct BitnetMetadata {
@@ -17,11 +17,11 @@ struct BitnetMetadata {
1717@group (0 ) @binding (5 ) var <storage , read_write > output : array <f32 >;
1818
1919// Optimized tiling parameters for modern GPUs
20- const TILE_DIM_M : u32 = 64u ; // Reduced for better occupancy
20+ const TILE_DIM_M : u32 = 64u ;
2121const TILE_DIM_N : u32 = 64u ;
22- const TILE_DIM_K : u32 = 32u ; // Increased K tile for better data reuse
22+ const TILE_DIM_K : u32 = 32u ;
2323
24- const THREAD_TILE_M : u32 = 4u ; // Smaller thread tiles for better vectorization
24+ const THREAD_TILE_M : u32 = 4u ;
2525const THREAD_TILE_N : u32 = 4u ;
2626
2727const WORKGROUP_SIZE_X : u32 = 16u ; // TILE_DIM_N / THREAD_TILE_N
@@ -31,12 +31,10 @@ const WORKGROUP_SIZE_Y: u32 = 16u; // TILE_DIM_M / THREAD_TILE_M
3131const TILE_A_SIZE : u32 = (TILE_DIM_M * TILE_DIM_K ) / 4u ; // for vec4<i32>
3232const TILE_B_SIZE : u32 = TILE_DIM_K * TILE_DIM_N ; // for i32
3333
34- // Shared memory with better alignment
3534var <workgroup > tile_a : array <vec4 <i32 >, TILE_A_SIZE >;
3635var <workgroup > tile_b : array <i32 , TILE_B_SIZE >;
3736
38- // Use direct decode function for ternary weights, matching the logic
39- // from the previously passing tests.
37+ // Use direct decode function for ternary weights
4038fn decode_2bit (val : u32 ) -> i32 {
4139 switch (val ) {
4240 case 1u : { return 1 ; } // 01
@@ -45,8 +43,6 @@ fn decode_2bit(val: u32) -> i32 {
4543 }
4644}
4745
48- // Unroll the decoding loop for maximum compatibility, while preserving
49- // the LSB-to-MSB decoding order from the previously passing version.
5046fn decode_16x2bit_ternary (packed_val : u32 ) -> array <i32 , 16 > {
5147 var decoded : array <i32 , 16 >;
5248 decoded [0 ] = decode_2bit ((packed_val >> 0u ) & 0x3u );
@@ -68,7 +64,6 @@ fn decode_16x2bit_ternary(packed_val: u32) -> array<i32, 16> {
6864 return decoded ;
6965}
7066
71- // Vectorized dot product for better throughput
// Four-lane signed-integer dot product.
// Thin named wrapper over the WGSL built-in `dot` so the matmul inner loop
// reads as an explicit 4-wide multiply-accumulate primitive.
fn dot_product_4x4(lhs: vec4<i32>, rhs: vec4<i32>) -> i32 {
    return dot(lhs, rhs);
}
@@ -84,21 +79,17 @@ fn main(
8479 let tile_start_m = workgroup_id . y * TILE_DIM_M ;
8580 let tile_start_n = workgroup_id . x * TILE_DIM_N ;
8681
87- // FIX 1: Use a flattened array of i32 for the accumulator to avoid the
88- // `array<vec4<i32>>` indexing bug on the Dx12 backend.
89- var accumulators : array <i32 , 16 >;
90- for (var i = 0u ; i < 16u ; i = i + 1u ) {
91- accumulators [i ] = 0 ;
82+ // Original accumulator: vectorized style
83+ var accumulators : array <vec4 <i32 >, THREAD_TILE_M >;
84+ for (var i = 0u ; i < THREAD_TILE_M ; i = i + 1u ) {
85+ accumulators [i ] = vec4 <i32 >(0 );
9286 }
9387
94- // Main tiling loop with optimizations
9588 let num_k_tiles = (metadata . K + TILE_DIM_K - 1u ) / TILE_DIM_K ;
9689
9790 var k_tile_idx = 0u ;
9891 while (k_tile_idx < num_k_tiles ) {
9992 let k_tile_start = k_tile_idx * TILE_DIM_K ;
100- // === Cooperative Loading with Coalescing ===
101- // Load activations with vectorization
10293 let total_a_elements = TILE_DIM_M * TILE_DIM_K / 4u ;
10394 let loads_per_thread_a = (total_a_elements + 255u ) / 256u ; // Ceiling division
10495 for (var i = 0u ; i < loads_per_thread_a ; i = i + 1u ) {
@@ -111,7 +102,6 @@ fn main(
111102 let global_m = tile_start_m + m ;
112103 let global_k = k_tile_start + k ;
113104 if (global_m < metadata . M && global_k + 3u < metadata . K ) {
114- // Load 4 activations at once
115105 let base_addr = global_m * metadata . K + global_k ;
116106 tile_a [vec_idx ] = vec4 <i32 >(
117107 activations [base_addr ],
@@ -124,7 +114,6 @@ fn main(
124114 }
125115 }
126116 }
127- // Load and decode weights
128117 let total_b_elements = TILE_DIM_N * TILE_DIM_K ;
129118 let loads_per_thread_b = (total_b_elements + 255u ) / 256u ;
130119 for (var i = 0u ; i < loads_per_thread_b ; i = i + 1u ) {
@@ -137,26 +126,7 @@ fn main(
137126 if (global_n < metadata . N && global_k_packed_idx < metadata . K_packed ) {
138127 let weight_idx = global_n * metadata . K_packed + global_k_packed_idx ;
139128 let packed_w = packed_weights [weight_idx ];
140- // FIX 2: Inline the decoding logic. The Dx12 backend fails pipeline
141- // creation when a function returns an array, as seen in the V4.2.2 test.
142- var decoded : array <i32 , 16 >;
143- decoded [0 ] = decode_2bit ((packed_w >> 0u ) & 0x3u );
144- decoded [1 ] = decode_2bit ((packed_w >> 2u ) & 0x3u );
145- decoded [2 ] = decode_2bit ((packed_w >> 4u ) & 0x3u );
146- decoded [3 ] = decode_2bit ((packed_w >> 6u ) & 0x3u );
147- decoded [4 ] = decode_2bit ((packed_w >> 8u ) & 0x3u );
148- decoded [5 ] = decode_2bit ((packed_w >> 10u ) & 0x3u );
149- decoded [6 ] = decode_2bit ((packed_w >> 12u ) & 0x3u );
150- decoded [7 ] = decode_2bit ((packed_w >> 14u ) & 0x3u );
151- decoded [8 ] = decode_2bit ((packed_w >> 16u ) & 0x3u );
152- decoded [9 ] = decode_2bit ((packed_w >> 18u ) & 0x3u );
153- decoded [10 ] = decode_2bit ((packed_w >> 20u ) & 0x3u );
154- decoded [11 ] = decode_2bit ((packed_w >> 22u ) & 0x3u );
155- decoded [12 ] = decode_2bit ((packed_w >> 24u ) & 0x3u );
156- decoded [13 ] = decode_2bit ((packed_w >> 26u ) & 0x3u );
157- decoded [14 ] = decode_2bit ((packed_w >> 28u ) & 0x3u );
158- decoded [15 ] = decode_2bit ((packed_w >> 30u ) & 0x3u );
159- // Store decoded weights (unrolled for WGSL compliance)
129+ let decoded = decode_16x2bit_ternary (packed_w );
160130 tile_b [n * TILE_DIM_K + k + 0u ] = decoded [0u ];
161131 tile_b [n * TILE_DIM_K + k + 1u ] = decoded [1u ];
162132 tile_b [n * TILE_DIM_K + k + 2u ] = decoded [2u ];
@@ -174,25 +144,21 @@ fn main(
174144 tile_b [n * TILE_DIM_K + k + 14u ] = decoded [14u ];
175145 tile_b [n * TILE_DIM_K + k + 15u ] = decoded [15u ];
176146 } else {
177- // Pad with zeros
178147 for (var j = 0u ; j < 16u ; j = j + 1u ) {
179148 tile_b [n * TILE_DIM_K + k + j ] = 0 ;
180149 }
181150 }
182151 }
183152 }
184153 workgroupBarrier ();
185- // === Vectorized Computation ===
186154 for (var k_inner = 0u ; k_inner < TILE_DIM_K ; k_inner = k_inner + 4u ) {
187- // Load vectorized activations
188155 var a_vecs : array <vec4 <i32 >, THREAD_TILE_M >;
189156 for (var m = 0u ; m < THREAD_TILE_M ; m = m + 1u ) {
190157 let base_m = thread_idx_m * THREAD_TILE_M + m ;
191158 let vec_idx = (base_m * TILE_DIM_K + k_inner ) / 4u ;
192159 let a_i32 = tile_a [vec_idx ];
193160 a_vecs [m ] = a_i32 ;
194161 }
195- // Load vectorized weights and compute
196162 for (var n = 0u ; n < THREAD_TILE_N ; n = n + 1u ) {
197163 let base_n = thread_idx_n * THREAD_TILE_N + n ;
198164 let b_vec = vec4 <i32 >(
@@ -201,32 +167,25 @@ fn main(
201167 tile_b [base_n * TILE_DIM_K + k_inner + 2u ],
202168 tile_b [base_n * TILE_DIM_K + k_inner + 3u ]
203169 );
204- // Vectorized multiply-accumulate
205170 for (var m = 0u ; m < THREAD_TILE_M ; m = m + 1u ) {
206171 let dot_result = dot_product_4x4 (a_vecs [m ], b_vec );
207- // Manually calculate the 1D index for the flattened accumulator.
208- let acc_idx = m * THREAD_TILE_N + n ;
209- accumulators [acc_idx ] += dot_result ;
172+ accumulators [m ][n ] += dot_result ;
210173 }
211174 }
212175 }
213176 workgroupBarrier ();
214177 k_tile_idx = k_tile_idx + 1u ;
215178 }
216- // === Write Results with Proper Scaling ===
217179 for (var m = 0u ; m < THREAD_TILE_M ; m = m + 1u ) {
218180 for (var n = 0u ; n < THREAD_TILE_N ; n = n + 1u ) {
219181 let global_m = tile_start_m + thread_idx_m * THREAD_TILE_M + m ;
220182 let global_n = tile_start_n + thread_idx_n * THREAD_TILE_N + n ;
221183 if (global_m < metadata . M && global_n < metadata . N ) {
222- // BitNet B1.58 scaling: result = activation_scale * weight_scale * dot_product
223184 let activation_scale = activation_scales [global_m ];
224185 let weight_scale = weight_scales [global_n ];
225- // Use the manually calculated 1D index again.
226- let acc_idx = m * THREAD_TILE_N + n ;
227- let final_result = f32 (accumulators [acc_idx ]) * activation_scale * weight_scale ;
186+ let final_result = f32 (accumulators [m ][n ]) * activation_scale * weight_scale ;
228187 output [global_m * metadata . N + global_n ] = final_result ;
229188 }
230189 }
231190 }
232- }
191+ }
0 commit comments