Skip to content
This repository was archived by the owner on Dec 16, 2024. It is now read-only.
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
91 changes: 59 additions & 32 deletions gpu-burn/BurnKernel.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,9 @@
#include "common.h"
#include "BurnKernel.h"

#include "rocblas.h"

#define EPSILOND 0.0000001f
// ---------------------------------------------------------------------------
namespace gpuburn {

Expand All @@ -26,48 +29,55 @@ BurnKernel::BurnKernel(int hipDevice)
: mHipDevice(hipDevice), mRunKernel(false),
mDeviceAdata(NULL), mDeviceBdata(NULL), mDeviceCdata(NULL)
{

err_num = 0;

}

BurnKernel::~BurnKernel()
{
if (mBurnThread)
if (mBurnThread){
mBurnThread->join();
}

if (mDeviceAdata)
if (mDeviceAdata){
hipFree(mDeviceAdata);
}

if (mDeviceBdata)
if (mDeviceBdata){
hipFree(mDeviceBdata);
}

if (mDeviceCdata)
if (mDeviceCdata){
hipFree(mDeviceCdata);
}
}

// ---------------------------------------------------------------------------

extern "C" __global__ void hip_sgemm_kernel(hipLaunchParm lp, const int M,
const int N, const int K,
const float alpha,
float *A, const int lda, float *B,
const int ldb, const float beta,
float *C, const int ldc)


extern "C" __global__ void hip_compare_kernel(double *C, int *faultyElems, size_t iters)
{
//column major NN
size_t idx_x = hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x;
size_t idx_y = hipBlockIdx_y * hipBlockDim_y + hipThreadIdx_y;
size_t dim_x = hipGridDim_x * hipBlockDim_x;
size_t idx_x = blockIdx.x * blockDim.x + threadIdx.x;
size_t idx_y = blockIdx.y * blockDim.y + threadIdx.y;
size_t dim_x = gridDim.x * blockDim.x;

size_t myIdx = idx_y * dim_x + idx_x;

float local_c = beta * C[myIdx];

for(int k = 0; k < K; k++) {
local_c += alpha * A[ idx_y + k * K] * B[ idx_x * K + k];
}
size_t iterStep = hipBlockDim_x*hipBlockDim_y*hipGridDim_x*hipGridDim_y;

C[myIdx] = local_c;
int myFaulty = 0;
for (size_t i = 1; i < iters; ++i){
if(fabs(C[myIdx] - C[myIdx + iterStep]) > EPSILOND){
myFaulty++;
}
}
atomicAdd(faultyElems, myFaulty);
}

// ---------------------------------------------------------------------------

int BurnKernel::Init()
{
Expand All @@ -82,14 +92,19 @@ int BurnKernel::Init()
mHostBdata[i] = (rand() % 1000000)/100000.0;
}


size_t freeMem = getAvailableMemory() * cUseMem;
size_t matrixSizeBytes = sizeof(float)*cMatrixSize;
//size_t matrixSizeBytes = sizeof(float)*cMatrixSize;
size_t matrixSizeBytes = sizeof(double)*cMatrixSize;
mNumIterations = (freeMem - (matrixSizeBytes*2))/matrixSizeBytes;

checkError(hipMalloc((void**)&mDeviceAdata, matrixSizeBytes), "Alloc A");
checkError(hipMalloc((void**)&mDeviceBdata, matrixSizeBytes), "Alloc B");
checkError(hipMalloc((void**)&mDeviceCdata, matrixSizeBytes*mNumIterations), "Alloc C");

//rocky added for acc check:
checkError(hipMalloc(&d_faultyElemData, sizeof(int)), "faulty data");

checkError(hipMemcpy(mDeviceAdata, mHostAdata, matrixSizeBytes, hipMemcpyHostToDevice), "A -> device");
checkError(hipMemcpy(mDeviceBdata, mHostBdata, matrixSizeBytes, hipMemcpyHostToDevice), "B -> device");
checkError(hipMemset(mDeviceCdata, 0, matrixSizeBytes*mNumIterations), "C memset");
Expand Down Expand Up @@ -142,25 +157,37 @@ int BurnKernel::runComputeKernel()
{
int err = 0;


for (int i = 0; mRunKernel && i < mNumIterations; ++i) {
hipLaunchKernel(
/* Launch params */
HIP_KERNEL_NAME(hip_sgemm_kernel),
dim3(cRowSize/cBlockSize, cRowSize/cBlockSize, 1),
dim3(cBlockSize,cBlockSize,1), 0, 0,
/* Kernel params */
cRowSize, cRowSize, cRowSize, cAlpha,
mDeviceAdata, cRowSize,
mDeviceBdata, cRowSize,
cBeta,
mDeviceCdata + i*cMatrixSize,
cRowSize);

double alpha = 1.1;
double beta = 0.0;

rocblas_handle handle;
rocblas_create_handle(&handle);
rocblas_dgemm(handle, rocblas_operation_none, rocblas_operation_transpose, cRowSize, cRowSize, cRowSize, &alpha, mDeviceAdata, cRowSize, mDeviceBdata,cRowSize , &beta, mDeviceCdata + i*cMatrixSize, cRowSize);
}

checkError(hipDeviceSynchronize(), "Sync"); // rocky added to fix seg fault

hipLaunchKernelGGL(HIP_KERNEL_NAME(hip_compare_kernel),dim3(cRowSize/cBlockSize, cRowSize/cBlockSize, 1),dim3(cBlockSize,cBlockSize,1), 0, 0, mDeviceCdata, d_faultyElemData, mNumIterations);


int *d_faultyElemsHost;
checkError(hipMemcpy(d_faultyElemsHost, d_faultyElemData, sizeof(int), hipMemcpyDeviceToHost), "Read faultyelemdata");

err_num += *d_faultyElemsHost;

checkError(hipDeviceSynchronize(), "Sync");

return err;
}

int BurnKernel::get_err_num(){
return err_num;
}


size_t BurnKernel::getAvailableMemory()
{
size_t freeMem, totalMem;
Expand Down
27 changes: 21 additions & 6 deletions gpu-burn/BurnKernel.h
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,7 @@
#define GPUBURN_BURNKERNEL_H_

#include <thread>
#include <map>

// ---------------------------------------------------------------------------
namespace gpuburn {
Expand All @@ -34,21 +35,35 @@ class BurnKernel {
*/
int stopBurn();

// rocky:
int get_err_num();

private:
static constexpr int cRandSeed = 10;
static constexpr float cUseMem = 0.80;
static constexpr uint32_t cRowSize = 512;
//static constexpr uint32_t cRowSize = 512;
static constexpr uint32_t cRowSize = 8640; // rocky, 20190809
static constexpr uint32_t cMatrixSize = cRowSize * cRowSize;
static constexpr uint32_t cBlockSize = 16;
static constexpr float cAlpha = 1.0f;
static constexpr float cBeta = 0.0f;

float mHostAdata[cMatrixSize];
float mHostBdata[cMatrixSize];
//float mHostAdata[cMatrixSize];
//float mHostBdata[cMatrixSize];

double mHostAdata[cMatrixSize];
double mHostBdata[cMatrixSize];
//float* mDeviceAdata;
//float* mDeviceBdata;
//float* mDeviceCdata;

// rocky:
double* mDeviceAdata;
double* mDeviceBdata;
double* mDeviceCdata;

float* mDeviceAdata;
float* mDeviceBdata;
float* mDeviceCdata;
int* d_faultyElemData;
int err_num;

bool mRunKernel;
int mNumIterations;
Expand Down
6 changes: 3 additions & 3 deletions gpu-burn/Makefile
Original file line number Diff line number Diff line change
@@ -1,12 +1,12 @@
HIP_PATH ?= /opt/rocm/hip
HCC_PATH ?= /opt/rocm/hcc
HIP_PLATFORM = $(shell $(HIP_PATH)/bin/hipconfig --platform)
HIP_INCLUDE = -I${HIP_PATH}/include -I${HCC_PATH}/include
HIP_INCLUDE = -I${HIP_PATH}/include -I${HCC_PATH}/include -I/opt/rocm/include
BUILD_DIR ?= build

HIPCC = ${HIP_PATH}/bin/hipcc
CPPFLAGS = -O3
LDFLAGS = -lm -lpthread
CPPFLAGS = -lrocblas -L/opt/rocm/lib -I/opt/rocm/include
LDFLAGS = -lm -lrocblas

ifeq (${HIP_PLATFORM}, nvcc)
CPPFLAGS += -arch=compute_20
Expand Down
42 changes: 39 additions & 3 deletions gpu-burn/gpuburn.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -26,11 +26,11 @@ std::vector<std::unique_ptr<BurnKernel>> genBurnKernels()

try {
checkError(hipGetDeviceCount(&deviceCount));
std::cout<<"Total no. of GPUs found: "<<deviceCount<<std::endl;
} catch (std::string e) {
std::cerr << "Error: couldn't find any HIP devices\n";
}


for (int i =0; i < deviceCount; ++i) {
try {
std::unique_ptr<BurnKernel> kernel(new BurnKernel(i));
Expand Down Expand Up @@ -89,17 +89,54 @@ int doBurn(int burnSec) {
std::ostringstream msg;
msg << "Temps: ";
for (auto& monitor : gpuMonitors) {
msg << "[GPU" << monitor->getId() << ": " << monitor->getTemperature() << " C] ";
msg << "[GPU" << monitor->getId() << ":" << monitor->getTemperature() << "C] ";
}

int cnt = 0;
msg << " Accuracy: ";
int current_err = 0;
for(auto& kernel : burnKernels){
current_err += kernel->get_err_num();
msg << "[GPU " << kernel->mHipDevice << " err: " << kernel->get_err_num() << "] " ;
cnt += 1;
}


msg << burnSec << "s\n";
std::cout << msg.str();

sleep(1);
}

for (auto& kernel : burnKernels) {
kernel->stopBurn();
}


// final report, rockyli:
// Tested 2 GPUs:
// GPU 0: FAULTY
// GPU 1: OK

std::ostringstream rpt;
int gpu_count = 0;
for (auto& kernel : burnKernels) {
gpu_count +=1;
}
rpt << "Tested " << gpu_count << " GPUs:\n";
for (auto& kernel : burnKernels) {
std::string r;
if(kernel->get_err_num() > 0){
r = "FAULTY";
}
else{
r = "OK";
}
rpt << "\tGPU " << kernel->mHipDevice << ": " << r << "\n";
}
std::cout << rpt.str();


return 0;
}

Expand All @@ -118,7 +155,6 @@ int main(int argc, char **argv) {
std::cerr << "Usage: " << argv[0] << " [-t sec]\n";
return -EINVAL;
}

return doBurn(burnSec);
}

Expand Down