-
Notifications
You must be signed in to change notification settings - Fork 8
Open
Description
I'm testing matmul and find a strange result.
The result looks like shifted left side.
I append the verify code and result.
// Standard Library includes
#include "LSTM.h"
#include "misc.h"
#include <iostream>
#include <sstream>
#include <vector>
#include <cmath>
#include <cstdlib>
#include <sys/time.h>
#include <math.h>
#include <cassert>
#include <limits>
#include <cstring>
#include <iostream>
#include <cooperative_groups.h>
#include <math.h>
#define MM_BLOCK_SIZE 16
#define MM_REG_TILE 4
#define MM_TILE_SIZE 64
/// Naive reference GEMM computation.
__global__ void ReferenceGemm_kernel(
int M,
int N,
int K,
float alpha,
float const *A,
int lda,
float const *B,
int ldb,
float beta,
float *C,
int ldc) {
int i = threadIdx.x + blockIdx.x * blockDim.x;
int j = threadIdx.y + blockIdx.y * blockDim.y;
if (i < M && j < N) {
float accumulator = 0;
for (int k = 0; k < K; ++k) {
accumulator += A[i + k * lda] * B[k + j * ldb];
}
C[i + j * ldc] = accumulator; //alpha * accumulator + beta * C[i + j * ldc];
}
}
/// Reference GEMM computation.
cudaError_t ReferenceGemm(
int M,
int N,
int K,
float alpha,
float const *A,
int lda,
float const *B,
int ldb,
float beta,
float *C,
int ldc) {
dim3 block(16, 16);
dim3 grid(
(M + block.x - 1) / block.x,
(N + block.y - 1) / block.y
);
ReferenceGemm_kernel<<< grid, block >>>(M, N, K, alpha, A, lda, B, ldb, beta, C, ldc);
return cudaGetLastError();
}
__global__ void grnn_matmul(float * A, float * B, float * C,
uint32_t M, uint32_t K, uint32_t N) {
extern __shared__ float base[];
float* bufferA = base;
float* bufferB = &bufferA[MM_TILE_SIZE * MM_TILE_SIZE];
float regA[MM_REG_TILE]; //4
float regB[MM_REG_TILE]; // 4
float regC[MM_REG_TILE][MM_REG_TILE]; // 16
uint32_t tidx = threadIdx.x;
uint32_t tidy = threadIdx.y;
uint32_t id = threadIdx.y * blockDim.x + threadIdx.x;
uint32_t bidx = blockIdx.x;
uint32_t bidy = blockIdx.y;
// Number of rows that are traversed in a single fully coalesced load sequence
constexpr uint32_t LOAD_STEPS = MM_TILE_SIZE * MM_TILE_SIZE / (MM_BLOCK_SIZE * MM_BLOCK_SIZE);
constexpr uint32_t NUM_THREADS = MM_BLOCK_SIZE * MM_BLOCK_SIZE;
// Zero the intermediate output
for (uint32_t y = 0; y < MM_REG_TILE; y++) {
for (uint32_t x = 0; x < MM_REG_TILE; x++) {
regC[y][x] = 0.0f;
}
}
for (uint32_t i = 0; i < K; i += MM_TILE_SIZE) {
// Load lhs tile from global memory to shared memory (fully coalesced)
#pragma unroll
for (uint32_t j = 0; j < LOAD_STEPS; j++) {
uint32_t index = j * NUM_THREADS + id;
if (((bidy * MM_TILE_SIZE + index / MM_TILE_SIZE) < M) && ((i + index % MM_TILE_SIZE) < K)) {
bufferA[index] = A[ (bidy * MM_TILE_SIZE + (index / MM_TILE_SIZE)) * K + i + index % MM_TILE_SIZE];
} else {
bufferA[index] = 0.0f;
}
}
// Not necessary for correctness, but improves performance by avoiding thrashing shared memory
__syncthreads();
// Load rhs tile from global memory to shared memory (fully coalesced)
#pragma unroll
for (uint32_t j = 0; j < LOAD_STEPS; j++) {
uint32_t index = j * NUM_THREADS + id;
if (((i + index / MM_TILE_SIZE) < K) && ((bidx * MM_TILE_SIZE + index % MM_TILE_SIZE) < N)) {
bufferB[index] = B[ ((index / MM_TILE_SIZE) + i) * N + bidx * MM_TILE_SIZE + index % MM_TILE_SIZE];
} else {
bufferB[index] = 0.0f;
}
}
// Ensures all data is written from global memory to shared memory before it is streamed
// into register arrays.
__syncthreads();
// Loop through full tile
for (uint32_t j = 0; j < MM_TILE_SIZE; j++) {
// Load vector from lhs and rhs
#pragma unroll
for (uint32_t l = 0; l < MM_REG_TILE; l++) {
regA[l] = bufferA[(tidy * MM_REG_TILE + l) * MM_TILE_SIZE + j];
regB[l] = bufferB[j * MM_TILE_SIZE + tidx * MM_REG_TILE + l];
}
#pragma unroll
// Perform a narrow matmul
for (uint32_t y = 0; y < MM_REG_TILE; y++) {
for (uint32_t x = 0; x < MM_REG_TILE; x++) {
regC[y][x] += regA[y] * regB[x];
}
}
}
__syncthreads();
}
// Write register intermediates to shared memory (possibly unnecessary)
for (uint32_t y = 0; y < MM_REG_TILE; y++) {
for (uint32_t x = 0; x < MM_REG_TILE; x++) {
bufferA[(tidy * MM_REG_TILE + y) * MM_TILE_SIZE + tidx * MM_REG_TILE + x] = regC[y][x];
}
}
__syncthreads();
for (uint32_t j = 0; j < LOAD_STEPS; j++) {
uint32_t index = j * NUM_THREADS + id;
if (((bidy * MM_TILE_SIZE + (index / MM_TILE_SIZE)) < M) && ((bidx * MM_TILE_SIZE + (index % MM_TILE_SIZE)) < N)) {
C[ (bidy * MM_TILE_SIZE + (index / MM_TILE_SIZE)) * N + bidx * MM_TILE_SIZE + (index % MM_TILE_SIZE)] = bufferA[index];
}
}
}
/// Kernel to initialize a matrix with small integers.
__global__ void InitializeMatrix_kernel(
float *matrix,
int ldm,
int rows,
int columns,
int seed = 0) {
int i = threadIdx.x + blockIdx.x * blockDim.x;
int j = threadIdx.y + blockIdx.y * blockDim.y;
if (i < rows && j < columns) {
int offset = i + j * ldm;
// Generate arbitrary elements.
int const k = 16807;
int const m = 16;
float value = float(((offset + seed) * k % m) - m / 2);
matrix[offset] = value;
}
}
/// Simple function to initialize a matrix to arbitrary small integers.
cudaError_t InitializeMatrix(float *matrix, int ldm, int rows, int columns, int seed = 0) {
dim3 block(16, 16);
dim3 grid(
(rows + block.x - 1) / block.x,
(columns + block.y - 1) / block.y
);
InitializeMatrix_kernel<<< grid, block >>>(matrix, ldm, rows, columns, seed);
return cudaGetLastError();
}
/// Allocates device memory for a matrix then fills with arbitrary small integers.
cudaError_t AllocateMatrix(float **matrix, int ldm, int rows, int columns, int seed = 0) {
cudaError_t result;
size_t sizeof_matrix = sizeof(float) * ldm * columns;
// Allocate device memory.
result = cudaMalloc(reinterpret_cast<void **>(matrix), sizeof_matrix);
if (result != cudaSuccess) {
std::cerr << "Failed to allocate matrix: "
<< cudaGetErrorString(result) << std::endl;
return result;
}
// Clear the allocation.
result = cudaMemset(*matrix, 0, sizeof_matrix);
if (result != cudaSuccess) {
std::cerr << "Failed to clear matrix device memory: "
<< cudaGetErrorString(result) << std::endl;
return result;
}
// Initialize matrix elements to arbitrary small integers.
result = InitializeMatrix(*matrix, ldm, rows, columns, seed);
if (result != cudaSuccess) {
std::cerr << "Failed to initialize matrix: "
<< cudaGetErrorString(result) << std::endl;
return result;
}
return result;
}
int main(int argc, const char *arg[]) {
float alpha = 1;
float beta = 0;
int M = 32;
int K = 32;
int N = 32;
float *A;
float *B;
float *C_grnn;
float *C_reference;
AllocateMatrix(&A, M, M, K, 1);
AllocateMatrix(&B, K, K, N, 2);
AllocateMatrix(&C_grnn, M, M, N, 0);
AllocateMatrix(&C_reference, M, M, N, 0);
// Reference GEMM
ReferenceGemm(M, N, K, alpha, A, M, B, K, beta, C_reference, M);
// grnn
dim3 mm_grid = dim3((N + MM_TILE_SIZE - 1) / MM_TILE_SIZE, (M + MM_TILE_SIZE - 1) / MM_TILE_SIZE);
dim3 mm_block = dim3(MM_BLOCK_SIZE, MM_BLOCK_SIZE);
void* paramsMM[6];
paramsMM[0] = (void*) &A;
paramsMM[1] = (void*) &B;
paramsMM[2] = (void*) &C_grnn;
paramsMM[3] = (void*) &M;
paramsMM[4] = (void*) &K;
paramsMM[5] = (void*) &N;
size_t mm_sm_requirement = MM_TILE_SIZE * MM_TILE_SIZE * 2 * sizeof(float);
cudaLaunchKernel((void *)grnn_matmul, mm_grid, mm_block, paramsMM, mm_sm_requirement);
// verification of both results
std::vector<float> reference_result(M * N, 0);
std::vector<float> grnn_result(N * N, 0);
size_t sizeof_C = sizeof(float) * M * N;
cudaMemcpy(reference_result.data(), C_reference, sizeof_C, cudaMemcpyDeviceToHost);
cudaMemcpy(grnn_result.data(), C_grnn, sizeof_C, cudaMemcpyDeviceToHost);
std::cout << "Reference" << std::endl;
for(int i=0; i<M*N; i++){
std::cout << reference_result[i] << ' ';
if ((i+1)%N == 0){
std::cout << std::endl;
}
}
std::cout << "GRNN" << std::endl;
for(int i=0; i<M*N; i++){
std::cout << grnn_result[i] << ' ';
if ((i+1)%N == 0){
std::cout << std::endl;
}
}
}
Reference
16 -96 48 -64 80 -32 112 0 -112 32 -80 64 -48 96 -16 128 16 -96 48 -64 80 -32 112 0 -112 32 -80 64 -48 96 -16 128
16 -96 48 -64 80 -32 112 0 -112 32 -80 64 -48 96 -16 128 16 -96 48 -64 80 -32 112 0 -112 32 -80 64 -48 96 -16 128
16 -96 48 -64 80 -32 112 0 -112 32 -80 64 -48 96 -16 128 16 -96 48 -64 80 -32 112 0 -112 32 -80 64 -48 96 -16 128
16 -96 48 -64 80 -32 112 0 -112 32 -80 64 -48 96 -16 128 16 -96 48 -64 80 -32 112 0 -112 32 -80 64 -48 96 -16 128
16 -96 48 -64 80 -32 112 0 -112 32 -80 64 -48 96 -16 128 16 -96 48 -64 80 -32 112 0 -112 32 -80 64 -48 96 -16 128
16 -96 48 -64 80 -32 112 0 -112 32 -80 64 -48 96 -16 128 16 -96 48 -64 80 -32 112 0 -112 32 -80 64 -48 96 -16 128
16 -96 48 -64 80 -32 112 0 -112 32 -80 64 -48 96 -16 128 16 -96 48 -64 80 -32 112 0 -112 32 -80 64 -48 96 -16 128
16 -96 48 -64 80 -32 112 0 -112 32 -80 64 -48 96 -16 128 16 -96 48 -64 80 -32 112 0 -112 32 -80 64 -48 96 -16 128
16 -96 48 -64 80 -32 112 0 -112 32 -80 64 -48 96 -16 128 16 -96 48 -64 80 -32 112 0 -112 32 -80 64 -48 96 -16 128
16 -96 48 -64 80 -32 112 0 -112 32 -80 64 -48 96 -16 128 16 -96 48 -64 80 -32 112 0 -112 32 -80 64 -48 96 -16 128
16 -96 48 -64 80 -32 112 0 -112 32 -80 64 -48 96 -16 128 16 -96 48 -64 80 -32 112 0 -112 32 -80 64 -48 96 -16 128
16 -96 48 -64 80 -32 112 0 -112 32 -80 64 -48 96 -16 128 16 -96 48 -64 80 -32 112 0 -112 32 -80 64 -48 96 -16 128
16 -96 48 -64 80 -32 112 0 -112 32 -80 64 -48 96 -16 128 16 -96 48 -64 80 -32 112 0 -112 32 -80 64 -48 96 -16 128
16 -96 48 -64 80 -32 112 0 -112 32 -80 64 -48 96 -16 128 16 -96 48 -64 80 -32 112 0 -112 32 -80 64 -48 96 -16 128
16 -96 48 -64 80 -32 112 0 -112 32 -80 64 -48 96 -16 128 16 -96 48 -64 80 -32 112 0 -112 32 -80 64 -48 96 -16 128
16 -96 48 -64 80 -32 112 0 -112 32 -80 64 -48 96 -16 128 16 -96 48 -64 80 -32 112 0 -112 32 -80 64 -48 96 -16 128
16 -96 48 -64 80 -32 112 0 -112 32 -80 64 -48 96 -16 128 16 -96 48 -64 80 -32 112 0 -112 32 -80 64 -48 96 -16 128
16 -96 48 -64 80 -32 112 0 -112 32 -80 64 -48 96 -16 128 16 -96 48 -64 80 -32 112 0 -112 32 -80 64 -48 96 -16 128
16 -96 48 -64 80 -32 112 0 -112 32 -80 64 -48 96 -16 128 16 -96 48 -64 80 -32 112 0 -112 32 -80 64 -48 96 -16 128
16 -96 48 -64 80 -32 112 0 -112 32 -80 64 -48 96 -16 128 16 -96 48 -64 80 -32 112 0 -112 32 -80 64 -48 96 -16 128
16 -96 48 -64 80 -32 112 0 -112 32 -80 64 -48 96 -16 128 16 -96 48 -64 80 -32 112 0 -112 32 -80 64 -48 96 -16 128
16 -96 48 -64 80 -32 112 0 -112 32 -80 64 -48 96 -16 128 16 -96 48 -64 80 -32 112 0 -112 32 -80 64 -48 96 -16 128
16 -96 48 -64 80 -32 112 0 -112 32 -80 64 -48 96 -16 128 16 -96 48 -64 80 -32 112 0 -112 32 -80 64 -48 96 -16 128
16 -96 48 -64 80 -32 112 0 -112 32 -80 64 -48 96 -16 128 16 -96 48 -64 80 -32 112 0 -112 32 -80 64 -48 96 -16 128
16 -96 48 -64 80 -32 112 0 -112 32 -80 64 -48 96 -16 128 16 -96 48 -64 80 -32 112 0 -112 32 -80 64 -48 96 -16 128
16 -96 48 -64 80 -32 112 0 -112 32 -80 64 -48 96 -16 128 16 -96 48 -64 80 -32 112 0 -112 32 -80 64 -48 96 -16 128
16 -96 48 -64 80 -32 112 0 -112 32 -80 64 -48 96 -16 128 16 -96 48 -64 80 -32 112 0 -112 32 -80 64 -48 96 -16 128
16 -96 48 -64 80 -32 112 0 -112 32 -80 64 -48 96 -16 128 16 -96 48 -64 80 -32 112 0 -112 32 -80 64 -48 96 -16 128
16 -96 48 -64 80 -32 112 0 -112 32 -80 64 -48 96 -16 128 16 -96 48 -64 80 -32 112 0 -112 32 -80 64 -48 96 -16 128
16 -96 48 -64 80 -32 112 0 -112 32 -80 64 -48 96 -16 128 16 -96 48 -64 80 -32 112 0 -112 32 -80 64 -48 96 -16 128
16 -96 48 -64 80 -32 112 0 -112 32 -80 64 -48 96 -16 128 16 -96 48 -64 80 -32 112 0 -112 32 -80 64 -48 96 -16 128
16 -96 48 -64 80 -32 112 0 -112 32 -80 64 -48 96 -16 128 16 -96 48 -64 80 -32 112 0 -112 32 -80 64 -48 96 -16 128
GRNN
-96 48 -64 80 -32 112 0 -112 32 -80 64 -48 96 -16 128 16 -96 48 -64 80 -32 112 0 -112 32 -80 64 -48 96 -16 128 16
-96 48 -64 80 -32 112 0 -112 32 -80 64 -48 96 -16 128 16 -96 48 -64 80 -32 112 0 -112 32 -80 64 -48 96 -16 128 16
-96 48 -64 80 -32 112 0 -112 32 -80 64 -48 96 -16 128 16 -96 48 -64 80 -32 112 0 -112 32 -80 64 -48 96 -16 128 16
-96 48 -64 80 -32 112 0 -112 32 -80 64 -48 96 -16 128 16 -96 48 -64 80 -32 112 0 -112 32 -80 64 -48 96 -16 128 16
-96 48 -64 80 -32 112 0 -112 32 -80 64 -48 96 -16 128 16 -96 48 -64 80 -32 112 0 -112 32 -80 64 -48 96 -16 128 16
-96 48 -64 80 -32 112 0 -112 32 -80 64 -48 96 -16 128 16 -96 48 -64 80 -32 112 0 -112 32 -80 64 -48 96 -16 128 16
-96 48 -64 80 -32 112 0 -112 32 -80 64 -48 96 -16 128 16 -96 48 -64 80 -32 112 0 -112 32 -80 64 -48 96 -16 128 16
-96 48 -64 80 -32 112 0 -112 32 -80 64 -48 96 -16 128 16 -96 48 -64 80 -32 112 0 -112 32 -80 64 -48 96 -16 128 16
-96 48 -64 80 -32 112 0 -112 32 -80 64 -48 96 -16 128 16 -96 48 -64 80 -32 112 0 -112 32 -80 64 -48 96 -16 128 16
-96 48 -64 80 -32 112 0 -112 32 -80 64 -48 96 -16 128 16 -96 48 -64 80 -32 112 0 -112 32 -80 64 -48 96 -16 128 16
-96 48 -64 80 -32 112 0 -112 32 -80 64 -48 96 -16 128 16 -96 48 -64 80 -32 112 0 -112 32 -80 64 -48 96 -16 128 16
-96 48 -64 80 -32 112 0 -112 32 -80 64 -48 96 -16 128 16 -96 48 -64 80 -32 112 0 -112 32 -80 64 -48 96 -16 128 16
-96 48 -64 80 -32 112 0 -112 32 -80 64 -48 96 -16 128 16 -96 48 -64 80 -32 112 0 -112 32 -80 64 -48 96 -16 128 16
-96 48 -64 80 -32 112 0 -112 32 -80 64 -48 96 -16 128 16 -96 48 -64 80 -32 112 0 -112 32 -80 64 -48 96 -16 128 16
-96 48 -64 80 -32 112 0 -112 32 -80 64 -48 96 -16 128 16 -96 48 -64 80 -32 112 0 -112 32 -80 64 -48 96 -16 128 16
-96 48 -64 80 -32 112 0 -112 32 -80 64 -48 96 -16 128 16 -96 48 -64 80 -32 112 0 -112 32 -80 64 -48 96 -16 128 16
-96 48 -64 80 -32 112 0 -112 32 -80 64 -48 96 -16 128 16 -96 48 -64 80 -32 112 0 -112 32 -80 64 -48 96 -16 128 16
-96 48 -64 80 -32 112 0 -112 32 -80 64 -48 96 -16 128 16 -96 48 -64 80 -32 112 0 -112 32 -80 64 -48 96 -16 128 16
-96 48 -64 80 -32 112 0 -112 32 -80 64 -48 96 -16 128 16 -96 48 -64 80 -32 112 0 -112 32 -80 64 -48 96 -16 128 16
-96 48 -64 80 -32 112 0 -112 32 -80 64 -48 96 -16 128 16 -96 48 -64 80 -32 112 0 -112 32 -80 64 -48 96 -16 128 16
-96 48 -64 80 -32 112 0 -112 32 -80 64 -48 96 -16 128 16 -96 48 -64 80 -32 112 0 -112 32 -80 64 -48 96 -16 128 16
-96 48 -64 80 -32 112 0 -112 32 -80 64 -48 96 -16 128 16 -96 48 -64 80 -32 112 0 -112 32 -80 64 -48 96 -16 128 16
-96 48 -64 80 -32 112 0 -112 32 -80 64 -48 96 -16 128 16 -96 48 -64 80 -32 112 0 -112 32 -80 64 -48 96 -16 128 16
-96 48 -64 80 -32 112 0 -112 32 -80 64 -48 96 -16 128 16 -96 48 -64 80 -32 112 0 -112 32 -80 64 -48 96 -16 128 16
-96 48 -64 80 -32 112 0 -112 32 -80 64 -48 96 -16 128 16 -96 48 -64 80 -32 112 0 -112 32 -80 64 -48 96 -16 128 16
-96 48 -64 80 -32 112 0 -112 32 -80 64 -48 96 -16 128 16 -96 48 -64 80 -32 112 0 -112 32 -80 64 -48 96 -16 128 16
-96 48 -64 80 -32 112 0 -112 32 -80 64 -48 96 -16 128 16 -96 48 -64 80 -32 112 0 -112 32 -80 64 -48 96 -16 128 16
-96 48 -64 80 -32 112 0 -112 32 -80 64 -48 96 -16 128 16 -96 48 -64 80 -32 112 0 -112 32 -80 64 -48 96 -16 128 16
-96 48 -64 80 -32 112 0 -112 32 -80 64 -48 96 -16 128 16 -96 48 -64 80 -32 112 0 -112 32 -80 64 -48 96 -16 128 16
-96 48 -64 80 -32 112 0 -112 32 -80 64 -48 96 -16 128 16 -96 48 -64 80 -32 112 0 -112 32 -80 64 -48 96 -16 128 16
-96 48 -64 80 -32 112 0 -112 32 -80 64 -48 96 -16 128 16 -96 48 -64 80 -32 112 0 -112 32 -80 64 -48 96 -16 128 16
-96 48 -64 80 -32 112 0 -112 32 -80 64 -48 96 -16 128 16 -96 48 -64 80 -32 112 0 -112 32 -80 64 -48 96 -16 128 16
Reactions are currently unavailable
Metadata
Metadata
Assignees
Labels
No labels