Matmul verify

I'm testing matmul and find a strange result.
The result looks like shifted left side.
I append the verify code and result.


```CUDA
// Standard Library includes
#include "LSTM.h"
#include "misc.h"

#include <iostream>
#include <sstream>
#include <vector>
#include <cmath>
#include <cstdlib>
#include <sys/time.h>
#include <math.h>
#include <cassert>
#include <limits>
#include <cstring>
#include <iostream>
#include <cooperative_groups.h>
#include <math.h>
#define MM_BLOCK_SIZE 16
#define MM_REG_TILE 4
#define MM_TILE_SIZE 64


/// Naive reference GEMM computation.
__global__ void ReferenceGemm_kernel(
  int M,
  int N,
  int K,
  float alpha,
  float const *A,
  int lda,
  float const *B,
  int ldb,
  float beta,
  float *C,
  int ldc) {

  int i = threadIdx.x + blockIdx.x * blockDim.x;
  int j = threadIdx.y + blockIdx.y * blockDim.y;

  if (i < M && j < N) {
    float accumulator = 0;

    for (int k = 0; k < K; ++k) {
      accumulator += A[i + k * lda] * B[k + j * ldb];
    }

    C[i + j * ldc] = accumulator; //alpha * accumulator + beta * C[i + j * ldc];
  }
}

/// Reference GEMM computation.
cudaError_t ReferenceGemm(
  int M,
  int N,
  int K,
  float alpha,
  float const *A,
  int lda,
  float const *B,
  int ldb,
  float beta,
  float *C,
  int ldc) {

  dim3 block(16, 16);
  dim3 grid(
    (M + block.x - 1) / block.x,
    (N + block.y - 1) / block.y
  );

    ReferenceGemm_kernel<<< grid, block >>>(M, N, K, alpha, A, lda, B, ldb, beta, C, ldc);

  return cudaGetLastError();
}

__global__ void grnn_matmul(float * A, float * B, float * C,
                       uint32_t M, uint32_t K, uint32_t N) {

  extern __shared__ float base[];
  float* bufferA = base;
  float* bufferB = &bufferA[MM_TILE_SIZE * MM_TILE_SIZE];

  float regA[MM_REG_TILE]; //4
  float regB[MM_REG_TILE]; // 4
  float regC[MM_REG_TILE][MM_REG_TILE]; // 16

  uint32_t tidx = threadIdx.x;
  uint32_t tidy = threadIdx.y;
  uint32_t id = threadIdx.y * blockDim.x + threadIdx.x;
  uint32_t bidx = blockIdx.x;
  uint32_t bidy = blockIdx.y;

  // Number of rows that are traversed in a single fully coalesced load sequence
  constexpr uint32_t LOAD_STEPS = MM_TILE_SIZE * MM_TILE_SIZE / (MM_BLOCK_SIZE * MM_BLOCK_SIZE);
  constexpr uint32_t NUM_THREADS = MM_BLOCK_SIZE * MM_BLOCK_SIZE;

  // Zero the intermediate output
  for (uint32_t y = 0; y < MM_REG_TILE; y++) {
    for (uint32_t x = 0; x < MM_REG_TILE; x++) {
      regC[y][x] = 0.0f;
    }
  }

  for (uint32_t i = 0; i < K; i += MM_TILE_SIZE) {

    // Load lhs tile from global memory to shared memory (fully coalesced)
    #pragma unroll
    for (uint32_t j = 0; j < LOAD_STEPS; j++) {
      uint32_t index = j * NUM_THREADS + id;
      if (((bidy * MM_TILE_SIZE + index / MM_TILE_SIZE) < M) && ((i + index % MM_TILE_SIZE) < K)) {
        bufferA[index] = A[ (bidy * MM_TILE_SIZE + (index / MM_TILE_SIZE)) * K + i + index % MM_TILE_SIZE];
      } else {
        bufferA[index] = 0.0f;
      }
    }

    // Not necessary for correctness, but improves performance by avoiding thrashing shared memory
    __syncthreads();

    // Load rhs tile from global memory to shared memory (fully coalesced)
    #pragma unroll
    for (uint32_t j = 0; j < LOAD_STEPS; j++) {
      uint32_t index = j * NUM_THREADS + id;
      if (((i + index / MM_TILE_SIZE) < K) && ((bidx * MM_TILE_SIZE + index % MM_TILE_SIZE) < N)) {
        bufferB[index] = B[ ((index / MM_TILE_SIZE) + i) * N + bidx * MM_TILE_SIZE + index % MM_TILE_SIZE];
      } else {
        bufferB[index] = 0.0f;
      }
    }

    // Ensures all data is written from global memory to shared memory before it is streamed
    // into register arrays.
    __syncthreads();



    // Loop through full tile
    for (uint32_t j  = 0; j < MM_TILE_SIZE; j++) {

      // Load vector from lhs and rhs
      #pragma unroll
      for (uint32_t l = 0; l < MM_REG_TILE; l++) {
        regA[l] = bufferA[(tidy * MM_REG_TILE + l) * MM_TILE_SIZE + j];
        regB[l] = bufferB[j * MM_TILE_SIZE + tidx * MM_REG_TILE + l];
      }

      #pragma unroll
      // Perform a narrow matmul
      for (uint32_t y = 0; y < MM_REG_TILE; y++) {
        for (uint32_t x = 0; x < MM_REG_TILE; x++) {
          regC[y][x] += regA[y] * regB[x];
        }
      }
    }

    __syncthreads();
  }

  // Write register intermediates to shared memory (possibly unnecessary)
  for (uint32_t y = 0; y < MM_REG_TILE; y++) {
    for (uint32_t x = 0; x < MM_REG_TILE; x++) {
      bufferA[(tidy * MM_REG_TILE + y) * MM_TILE_SIZE + tidx * MM_REG_TILE + x] = regC[y][x];
    }
  }

  __syncthreads();


  for (uint32_t j = 0; j < LOAD_STEPS; j++) {
    uint32_t index = j * NUM_THREADS + id;
    if (((bidy * MM_TILE_SIZE + (index / MM_TILE_SIZE)) < M) && ((bidx * MM_TILE_SIZE + (index % MM_TILE_SIZE)) < N)) {
      C[ (bidy * MM_TILE_SIZE + (index / MM_TILE_SIZE)) * N + bidx * MM_TILE_SIZE +  (index % MM_TILE_SIZE)] = bufferA[index];
    }
  }
}
/// Kernel to initialize a matrix with small integers.
__global__ void InitializeMatrix_kernel(
  float *matrix,
  int ldm,
  int rows,
  int columns,
  int seed = 0) {

  int i = threadIdx.x + blockIdx.x * blockDim.x;
  int j = threadIdx.y + blockIdx.y * blockDim.y;

  if (i < rows && j < columns) {
    int offset = i + j * ldm;

    // Generate arbitrary elements.
    int const k = 16807;
    int const m = 16;
    float value = float(((offset + seed) * k % m) - m / 2);

    matrix[offset] = value;
  }
}

/// Simple function to initialize a matrix to arbitrary small integers.
cudaError_t InitializeMatrix(float *matrix, int ldm, int rows, int columns, int seed = 0) {

  dim3 block(16, 16);
  dim3 grid(
    (rows + block.x - 1) / block.x,
    (columns + block.y - 1) / block.y
  );

  InitializeMatrix_kernel<<< grid, block >>>(matrix, ldm, rows, columns, seed);

  return cudaGetLastError();
}

/// Allocates device memory for a matrix then fills with arbitrary small integers.
cudaError_t AllocateMatrix(float **matrix, int ldm, int rows, int columns, int seed = 0) {
  cudaError_t result;

  size_t sizeof_matrix = sizeof(float) * ldm * columns;

  // Allocate device memory.
  result = cudaMalloc(reinterpret_cast<void **>(matrix), sizeof_matrix);

  if (result != cudaSuccess) {
    std::cerr << "Failed to allocate matrix: "
      << cudaGetErrorString(result) << std::endl;
    return result;
  }

  // Clear the allocation.
  result = cudaMemset(*matrix, 0, sizeof_matrix);

  if (result != cudaSuccess) {
    std::cerr << "Failed to clear matrix device memory: "
      << cudaGetErrorString(result) << std::endl;
    return result;
  }

  // Initialize matrix elements to arbitrary small integers.
  result = InitializeMatrix(*matrix, ldm, rows, columns, seed);

  if (result != cudaSuccess) {
    std::cerr << "Failed to initialize matrix: "
      << cudaGetErrorString(result) << std::endl;
    return result;
  }

  return result;
}


int main(int argc, const char *arg[]) {
	float alpha = 1;
	float beta = 0;
	int M = 32;
	int K = 32;
	int N = 32;

	float *A;
	float *B;
	float *C_grnn;
	float *C_reference;

	AllocateMatrix(&A, M, M, K, 1);
	AllocateMatrix(&B, K, K, N, 2);
	AllocateMatrix(&C_grnn, M, M, N, 0);
	AllocateMatrix(&C_reference, M, M, N, 0);

	// Reference GEMM
	ReferenceGemm(M, N, K, alpha, A, M, B, K, beta, C_reference, M);

	// grnn
	dim3 mm_grid = dim3((N + MM_TILE_SIZE - 1) / MM_TILE_SIZE, (M + MM_TILE_SIZE - 1) / MM_TILE_SIZE);
	dim3 mm_block = dim3(MM_BLOCK_SIZE, MM_BLOCK_SIZE);

	void* paramsMM[6];
	paramsMM[0] = (void*) &A;
	paramsMM[1] = (void*) &B;
	paramsMM[2] = (void*) &C_grnn;
	paramsMM[3] = (void*) &M;
	paramsMM[4] = (void*) &K;
	paramsMM[5] = (void*) &N;

	size_t mm_sm_requirement = MM_TILE_SIZE * MM_TILE_SIZE * 2 * sizeof(float);

	cudaLaunchKernel((void *)grnn_matmul, mm_grid, mm_block, paramsMM, mm_sm_requirement);

	// verification of both results
	std::vector<float> reference_result(M * N, 0);
	std::vector<float> grnn_result(N * N, 0);
	size_t sizeof_C = sizeof(float) * M * N;
	cudaMemcpy(reference_result.data(), C_reference, sizeof_C, cudaMemcpyDeviceToHost);
	cudaMemcpy(grnn_result.data(), C_grnn, sizeof_C, cudaMemcpyDeviceToHost);

	std::cout << "Reference" << std::endl;
	for(int i=0; i<M*N; i++){
		std::cout << reference_result[i] << ' ';
		if ((i+1)%N == 0){
			std::cout << std::endl;
		}
	}

	std::cout << "GRNN" << std::endl;
	for(int i=0; i<M*N; i++){
		std::cout << grnn_result[i] << ' ';
		if ((i+1)%N == 0){
			std::cout << std::endl;
		}
	}
}

```


```
Reference
16 -96 48 -64 80 -32 112 0 -112 32 -80 64 -48 96 -16 128 16 -96 48 -64 80 -32 112 0 -112 32 -80 64 -48 96 -16 128 
16 -96 48 -64 80 -32 112 0 -112 32 -80 64 -48 96 -16 128 16 -96 48 -64 80 -32 112 0 -112 32 -80 64 -48 96 -16 128 
16 -96 48 -64 80 -32 112 0 -112 32 -80 64 -48 96 -16 128 16 -96 48 -64 80 -32 112 0 -112 32 -80 64 -48 96 -16 128 
16 -96 48 -64 80 -32 112 0 -112 32 -80 64 -48 96 -16 128 16 -96 48 -64 80 -32 112 0 -112 32 -80 64 -48 96 -16 128 
16 -96 48 -64 80 -32 112 0 -112 32 -80 64 -48 96 -16 128 16 -96 48 -64 80 -32 112 0 -112 32 -80 64 -48 96 -16 128 
16 -96 48 -64 80 -32 112 0 -112 32 -80 64 -48 96 -16 128 16 -96 48 -64 80 -32 112 0 -112 32 -80 64 -48 96 -16 128 
16 -96 48 -64 80 -32 112 0 -112 32 -80 64 -48 96 -16 128 16 -96 48 -64 80 -32 112 0 -112 32 -80 64 -48 96 -16 128 
16 -96 48 -64 80 -32 112 0 -112 32 -80 64 -48 96 -16 128 16 -96 48 -64 80 -32 112 0 -112 32 -80 64 -48 96 -16 128 
16 -96 48 -64 80 -32 112 0 -112 32 -80 64 -48 96 -16 128 16 -96 48 -64 80 -32 112 0 -112 32 -80 64 -48 96 -16 128 
16 -96 48 -64 80 -32 112 0 -112 32 -80 64 -48 96 -16 128 16 -96 48 -64 80 -32 112 0 -112 32 -80 64 -48 96 -16 128 
16 -96 48 -64 80 -32 112 0 -112 32 -80 64 -48 96 -16 128 16 -96 48 -64 80 -32 112 0 -112 32 -80 64 -48 96 -16 128 
16 -96 48 -64 80 -32 112 0 -112 32 -80 64 -48 96 -16 128 16 -96 48 -64 80 -32 112 0 -112 32 -80 64 -48 96 -16 128 
16 -96 48 -64 80 -32 112 0 -112 32 -80 64 -48 96 -16 128 16 -96 48 -64 80 -32 112 0 -112 32 -80 64 -48 96 -16 128 
16 -96 48 -64 80 -32 112 0 -112 32 -80 64 -48 96 -16 128 16 -96 48 -64 80 -32 112 0 -112 32 -80 64 -48 96 -16 128 
16 -96 48 -64 80 -32 112 0 -112 32 -80 64 -48 96 -16 128 16 -96 48 -64 80 -32 112 0 -112 32 -80 64 -48 96 -16 128 
16 -96 48 -64 80 -32 112 0 -112 32 -80 64 -48 96 -16 128 16 -96 48 -64 80 -32 112 0 -112 32 -80 64 -48 96 -16 128 
16 -96 48 -64 80 -32 112 0 -112 32 -80 64 -48 96 -16 128 16 -96 48 -64 80 -32 112 0 -112 32 -80 64 -48 96 -16 128 
16 -96 48 -64 80 -32 112 0 -112 32 -80 64 -48 96 -16 128 16 -96 48 -64 80 -32 112 0 -112 32 -80 64 -48 96 -16 128 
16 -96 48 -64 80 -32 112 0 -112 32 -80 64 -48 96 -16 128 16 -96 48 -64 80 -32 112 0 -112 32 -80 64 -48 96 -16 128 
16 -96 48 -64 80 -32 112 0 -112 32 -80 64 -48 96 -16 128 16 -96 48 -64 80 -32 112 0 -112 32 -80 64 -48 96 -16 128 
16 -96 48 -64 80 -32 112 0 -112 32 -80 64 -48 96 -16 128 16 -96 48 -64 80 -32 112 0 -112 32 -80 64 -48 96 -16 128 
16 -96 48 -64 80 -32 112 0 -112 32 -80 64 -48 96 -16 128 16 -96 48 -64 80 -32 112 0 -112 32 -80 64 -48 96 -16 128 
16 -96 48 -64 80 -32 112 0 -112 32 -80 64 -48 96 -16 128 16 -96 48 -64 80 -32 112 0 -112 32 -80 64 -48 96 -16 128 
16 -96 48 -64 80 -32 112 0 -112 32 -80 64 -48 96 -16 128 16 -96 48 -64 80 -32 112 0 -112 32 -80 64 -48 96 -16 128 
16 -96 48 -64 80 -32 112 0 -112 32 -80 64 -48 96 -16 128 16 -96 48 -64 80 -32 112 0 -112 32 -80 64 -48 96 -16 128 
16 -96 48 -64 80 -32 112 0 -112 32 -80 64 -48 96 -16 128 16 -96 48 -64 80 -32 112 0 -112 32 -80 64 -48 96 -16 128 
16 -96 48 -64 80 -32 112 0 -112 32 -80 64 -48 96 -16 128 16 -96 48 -64 80 -32 112 0 -112 32 -80 64 -48 96 -16 128 
16 -96 48 -64 80 -32 112 0 -112 32 -80 64 -48 96 -16 128 16 -96 48 -64 80 -32 112 0 -112 32 -80 64 -48 96 -16 128 
16 -96 48 -64 80 -32 112 0 -112 32 -80 64 -48 96 -16 128 16 -96 48 -64 80 -32 112 0 -112 32 -80 64 -48 96 -16 128 
16 -96 48 -64 80 -32 112 0 -112 32 -80 64 -48 96 -16 128 16 -96 48 -64 80 -32 112 0 -112 32 -80 64 -48 96 -16 128 
16 -96 48 -64 80 -32 112 0 -112 32 -80 64 -48 96 -16 128 16 -96 48 -64 80 -32 112 0 -112 32 -80 64 -48 96 -16 128 
16 -96 48 -64 80 -32 112 0 -112 32 -80 64 -48 96 -16 128 16 -96 48 -64 80 -32 112 0 -112 32 -80 64 -48 96 -16 128 
GRNN
-96 48 -64 80 -32 112 0 -112 32 -80 64 -48 96 -16 128 16 -96 48 -64 80 -32 112 0 -112 32 -80 64 -48 96 -16 128 16 
-96 48 -64 80 -32 112 0 -112 32 -80 64 -48 96 -16 128 16 -96 48 -64 80 -32 112 0 -112 32 -80 64 -48 96 -16 128 16 
-96 48 -64 80 -32 112 0 -112 32 -80 64 -48 96 -16 128 16 -96 48 -64 80 -32 112 0 -112 32 -80 64 -48 96 -16 128 16 
-96 48 -64 80 -32 112 0 -112 32 -80 64 -48 96 -16 128 16 -96 48 -64 80 -32 112 0 -112 32 -80 64 -48 96 -16 128 16 
-96 48 -64 80 -32 112 0 -112 32 -80 64 -48 96 -16 128 16 -96 48 -64 80 -32 112 0 -112 32 -80 64 -48 96 -16 128 16 
-96 48 -64 80 -32 112 0 -112 32 -80 64 -48 96 -16 128 16 -96 48 -64 80 -32 112 0 -112 32 -80 64 -48 96 -16 128 16 
-96 48 -64 80 -32 112 0 -112 32 -80 64 -48 96 -16 128 16 -96 48 -64 80 -32 112 0 -112 32 -80 64 -48 96 -16 128 16 
-96 48 -64 80 -32 112 0 -112 32 -80 64 -48 96 -16 128 16 -96 48 -64 80 -32 112 0 -112 32 -80 64 -48 96 -16 128 16 
-96 48 -64 80 -32 112 0 -112 32 -80 64 -48 96 -16 128 16 -96 48 -64 80 -32 112 0 -112 32 -80 64 -48 96 -16 128 16 
-96 48 -64 80 -32 112 0 -112 32 -80 64 -48 96 -16 128 16 -96 48 -64 80 -32 112 0 -112 32 -80 64 -48 96 -16 128 16 
-96 48 -64 80 -32 112 0 -112 32 -80 64 -48 96 -16 128 16 -96 48 -64 80 -32 112 0 -112 32 -80 64 -48 96 -16 128 16 
-96 48 -64 80 -32 112 0 -112 32 -80 64 -48 96 -16 128 16 -96 48 -64 80 -32 112 0 -112 32 -80 64 -48 96 -16 128 16 
-96 48 -64 80 -32 112 0 -112 32 -80 64 -48 96 -16 128 16 -96 48 -64 80 -32 112 0 -112 32 -80 64 -48 96 -16 128 16 
-96 48 -64 80 -32 112 0 -112 32 -80 64 -48 96 -16 128 16 -96 48 -64 80 -32 112 0 -112 32 -80 64 -48 96 -16 128 16 
-96 48 -64 80 -32 112 0 -112 32 -80 64 -48 96 -16 128 16 -96 48 -64 80 -32 112 0 -112 32 -80 64 -48 96 -16 128 16 
-96 48 -64 80 -32 112 0 -112 32 -80 64 -48 96 -16 128 16 -96 48 -64 80 -32 112 0 -112 32 -80 64 -48 96 -16 128 16 
-96 48 -64 80 -32 112 0 -112 32 -80 64 -48 96 -16 128 16 -96 48 -64 80 -32 112 0 -112 32 -80 64 -48 96 -16 128 16 
-96 48 -64 80 -32 112 0 -112 32 -80 64 -48 96 -16 128 16 -96 48 -64 80 -32 112 0 -112 32 -80 64 -48 96 -16 128 16 
-96 48 -64 80 -32 112 0 -112 32 -80 64 -48 96 -16 128 16 -96 48 -64 80 -32 112 0 -112 32 -80 64 -48 96 -16 128 16 
-96 48 -64 80 -32 112 0 -112 32 -80 64 -48 96 -16 128 16 -96 48 -64 80 -32 112 0 -112 32 -80 64 -48 96 -16 128 16 
-96 48 -64 80 -32 112 0 -112 32 -80 64 -48 96 -16 128 16 -96 48 -64 80 -32 112 0 -112 32 -80 64 -48 96 -16 128 16 
-96 48 -64 80 -32 112 0 -112 32 -80 64 -48 96 -16 128 16 -96 48 -64 80 -32 112 0 -112 32 -80 64 -48 96 -16 128 16 
-96 48 -64 80 -32 112 0 -112 32 -80 64 -48 96 -16 128 16 -96 48 -64 80 -32 112 0 -112 32 -80 64 -48 96 -16 128 16 
-96 48 -64 80 -32 112 0 -112 32 -80 64 -48 96 -16 128 16 -96 48 -64 80 -32 112 0 -112 32 -80 64 -48 96 -16 128 16 
-96 48 -64 80 -32 112 0 -112 32 -80 64 -48 96 -16 128 16 -96 48 -64 80 -32 112 0 -112 32 -80 64 -48 96 -16 128 16 
-96 48 -64 80 -32 112 0 -112 32 -80 64 -48 96 -16 128 16 -96 48 -64 80 -32 112 0 -112 32 -80 64 -48 96 -16 128 16 
-96 48 -64 80 -32 112 0 -112 32 -80 64 -48 96 -16 128 16 -96 48 -64 80 -32 112 0 -112 32 -80 64 -48 96 -16 128 16 
-96 48 -64 80 -32 112 0 -112 32 -80 64 -48 96 -16 128 16 -96 48 -64 80 -32 112 0 -112 32 -80 64 -48 96 -16 128 16 
-96 48 -64 80 -32 112 0 -112 32 -80 64 -48 96 -16 128 16 -96 48 -64 80 -32 112 0 -112 32 -80 64 -48 96 -16 128 16 
-96 48 -64 80 -32 112 0 -112 32 -80 64 -48 96 -16 128 16 -96 48 -64 80 -32 112 0 -112 32 -80 64 -48 96 -16 128 16 
-96 48 -64 80 -32 112 0 -112 32 -80 64 -48 96 -16 128 16 -96 48 -64 80 -32 112 0 -112 32 -80 64 -48 96 -16 128 16 
-96 48 -64 80 -32 112 0 -112 32 -80 64 -48 96 -16 128 16 -96 48 -64 80 -32 112 0 -112 32 -80 64 -48 96 -16 128 16 
```

Provide feedback

Saved searches

Use saved searches to filter your results more quickly

Matmul verify #1

Metadata

Assignees

Labels

Projects

Milestone

Relationships

Development

Matmul verify #1

Description

Metadata

Metadata

Assignees

Labels

Projects

Milestone

Relationships

Development

Issue actions