From a1a2c43a0869c7399a821aaf8625d4bb67c6a102 Mon Sep 17 00:00:00 2001
From: zhangnju <zhangnju@126.com>
Date: Wed, 17 Dec 2025 06:55:41 +0800
Subject: [PATCH 1/3] fix the issue of GEMM validation failure

---
 HIP-Basic/matrix_multiplication/main.hip | 65 ++++++++++++++++++++----
 1 file changed, 54 insertions(+), 11 deletions(-)
diff --git a/HIP-Basic/matrix_multiplication/main.hip b/HIP-Basic/matrix_multiplication/main.hip
index 8c0a36aec..944711b97 100644
--- a/HIP-Basic/matrix_multiplication/main.hip
+++ b/HIP-Basic/matrix_multiplication/main.hip
@@ -28,9 +28,11 @@
 #include <algorithm>
 #include <iostream>
 #include <vector>
-
+#include <vector>
+#include <cstdlib>
 #include <cassert>
 #include <cstddef>
+#include <memory>
 
 /// \brief Multiplies matrices \p A and \p B and stores the result to \p C.
 /// - The number of rows of the result matrix is equal to the number of rows of matrix A
@@ -108,6 +110,53 @@ __global__ void matrix_multiplication_kernel(const float*       A,
     // Every thread stores the final result to global memory.
     C[block_offset + b_cols * ty + tx] = thread_result;
 }
+
+/// \brief Checks \p C against a host-computed reference of the product of \p A and \p B.
+/// Matrices are row-major: \p A is a_rows x a_cols, \p B is a_cols x b_cols, \p C is a_rows x b_cols.
+/// \return false (after reporting the first mismatch on std::cerr) if any element is off by more than 0.001 or is NaN.
+bool checkValidity(const float* A, const float* B, const float* C, size_t a_rows, size_t a_cols, size_t b_cols)
+{
+    constexpr float    EPSILON    = 0.001F;
+    constexpr size_t   BLOCK_SIZE = 64;
+    std::vector<float> golden_c(b_cols * a_rows); // value-initialized to zero, used as accumulator
+    // Accumulate the reference one BLOCK_SIZE tile at a time; all indices stay size_t (no narrowing to int).
+    for(size_t i_block = 0; i_block < a_rows; i_block += BLOCK_SIZE)
+    {
+        const size_t i_end = std::min(i_block + BLOCK_SIZE, a_rows);
+        for(size_t j_block = 0; j_block < b_cols; j_block += BLOCK_SIZE)
+        {
+            const size_t j_end = std::min(j_block + BLOCK_SIZE, b_cols);
+            for(size_t t_block = 0; t_block < a_cols; t_block += BLOCK_SIZE)
+            {
+                const size_t t_end = std::min(t_block + BLOCK_SIZE, a_cols);
+                for(size_t i = i_block; i < i_end; ++i)
+                {
+                    for(size_t t = t_block; t < t_end; ++t)
+                    {
+                        const float a_val = A[i * a_cols + t];
+                        for(size_t j = j_block; j < j_end; ++j)
+                            golden_c[i * b_cols + j] += a_val * B[t * b_cols + j];
+                    }
+                }
+            }
+        }
+    }
+
+    for(size_t idx = 0; idx < golden_c.size(); ++idx)
+    {
+        const float absdiff = std::abs(C[idx] - golden_c[idx]); // std:: overload; plain abs() may truncate to int
+        if(!(absdiff <= EPSILON)) // negated comparison so that a NaN difference is reported as a failure
+        {
+            const size_t i = idx / b_cols, j = idx % b_cols;
+            std::cerr << "\nVALIDATION FAILED!!!\n    reference[" << i << ", " << j << "] = " << golden_c[idx]
+                      << ",\n    calculated[" << i << ", " << j << "] = " << C[idx] << ",\n    absolute difference["
+                      << i << ", " << j << "] = " << absdiff << "\nFurther validation was stopped\n\n";
+            return false;
+        }
+    }
+    return true;
+}
+
 template<unsigned int BlockSize>
 void configure_parser(cli::Parser& parser)
 {
@@ -165,11 +214,9 @@ int main(int argc, const char* argv[])
     std::vector<float> B(b_cols * b_rows);
     std::vector<float> C(c_cols * c_rows);
 
-    // Set matrix elements to a constant on the host.
-    std::fill(A.begin(), A.end(), 1.F);
-
-    constexpr float b_value = 0.02F;
-    std::fill(B.begin(), B.end(), b_value);
+    // Set matrix elements to random value on the host.
+    for (size_t i = 0; i < A.size(); ++i) A[i] = static_cast<float>(rand() / RAND_MAX );
+    for (size_t i = 0; i < B.size(); ++i) B[i] = static_cast<float>(rand() / RAND_MAX );
 
     const size_t a_bytes = sizeof(float) * A.size();
     const size_t b_bytes = sizeof(float) * B.size();
@@ -203,11 +250,7 @@ int main(int argc, const char* argv[])
     HIP_CHECK(hipFree(d_C));
 
     // Check if the resulting elements match the expectation.
-    constexpr float tolerance         = 0.001F;
-    const bool      validation_passed = std::all_of(
-        C.begin(),
-        C.end(),
-        [=](const float value) { return tolerance > std::abs(value - a_cols * b_value); });
+    bool      validation_passed = checkValidity(A.data(),B.data(),C.data(),a_rows,a_cols,b_cols);
     if(validation_passed)
     {
         std::cout << "Validation passed." << std::endl;

From 44dd3ca5ba45dd5434635c7f7781c0863986e474 Mon Sep 17 00:00:00 2001
From: zhangnju <zhangnju@126.com>
Date: Wed, 17 Dec 2025 07:10:51 +0800
Subject: [PATCH 2/3] remove redundant <vector> include

---
 HIP-Basic/matrix_multiplication/main.hip | 1 -
 1 file changed, 1 deletion(-)

diff --git a/HIP-Basic/matrix_multiplication/main.hip b/HIP-Basic/matrix_multiplication/main.hip
index 944711b97..699c69031 100644
--- a/HIP-Basic/matrix_multiplication/main.hip
+++ b/HIP-Basic/matrix_multiplication/main.hip
@@ -28,7 +28,6 @@
 #include <algorithm>
 #include <iostream>
 #include <vector>
-#include <vector>
 #include <cstdlib>
 #include <cassert>
 #include <cstddef>

From d77279b205f7da14f3d9a9aae132929baaadc35e Mon Sep 17 00:00:00 2001
From: zhangnju <zhangnju@126.com>
Date: Fri, 19 Dec 2025 21:51:34 +0800
Subject: [PATCH 3/3] fix integer division in random matrix init; drop <cstdlib> and <memory> includes

---
 HIP-Basic/matrix_multiplication/main.hip | 9 ++++-----
 1 file changed, 4 insertions(+), 5 deletions(-)

diff --git a/HIP-Basic/matrix_multiplication/main.hip b/HIP-Basic/matrix_multiplication/main.hip
index 699c69031..97aafa5d0 100644
--- a/HIP-Basic/matrix_multiplication/main.hip
+++ b/HIP-Basic/matrix_multiplication/main.hip
@@ -28,10 +28,9 @@
 #include <algorithm>
 #include <iostream>
 #include <vector>
-#include <cstdlib>
+
 #include <cassert>
 #include <cstddef>
-#include <memory>
 
 /// \brief Multiplies matrices \p A and \p B and stores the result to \p C.
 /// - The number of rows of the result matrix is equal to the number of rows of matrix A
@@ -214,9 +213,9 @@ int main(int argc, const char* argv[])
     std::vector<float> C(c_cols * c_rows);
 
     // Set matrix elements to random value on the host.
-    for (size_t i = 0; i < A.size(); ++i) A[i] = static_cast<float>(rand() / RAND_MAX );
-    for (size_t i = 0; i < B.size(); ++i) B[i] = static_cast<float>(rand() / RAND_MAX );
-
+    for (size_t i = 0; i < A.size(); ++i) A[i] = static_cast<float>(rand() / (RAND_MAX + 1.0f) );
+    for (size_t i = 0; i < B.size(); ++i) B[i] = static_cast<float>(rand() / (RAND_MAX + 1.0f) );
+    
     const size_t a_bytes = sizeof(float) * A.size();
     const size_t b_bytes = sizeof(float) * B.size();
     const size_t c_bytes = sizeof(float) * C.size();