From 9ebb62365dd8a94e4043bf004e78955c4e8e7499 Mon Sep 17 00:00:00 2001 From: Abhijeet Date: Wed, 7 Feb 2024 19:47:46 +0530 Subject: [PATCH 1/3] Implement Dual-Key GPU Hash Table for Enhanced Key-Value Storage --- src/DoubleKeyValHashTable.cu | 241 +++++++++++++++++++++++++++++++++++ src/cuda_hashing.h | 33 +++++ 2 files changed, 274 insertions(+) create mode 100644 src/DoubleKeyValHashTable.cu create mode 100644 src/cuda_hashing.h diff --git a/src/DoubleKeyValHashTable.cu b/src/DoubleKeyValHashTable.cu new file mode 100644 index 0000000..7827e65 --- /dev/null +++ b/src/DoubleKeyValHashTable.cu @@ -0,0 +1,241 @@ +// Added support for two keys and two values, + +//Create a hash table. For linear probing, this is just an array of keyValues +keyValues* create_hashtable() { + keyValues* hashtable; + CUDA_CHECK(cudaMalloc(&hashtable, sizeof(keyValues) * kHashTableCapacity), + "Unable to allocate hashtable"); + + //Initialize hash table to empty + static_assert(kEmpty == 0xffffffffffffffff, "memset expected kEmpty=0xffffffffffffffff"); + CUDA_CHECK(cudaMemset(hashtable, 0xff, sizeof(keyValues) * kHashTableCapacity), + "Unable memset the hashtable"); + + return hashtable; +} + +void destroy_hashtable(keyValues* pHashTable) { + CUDA_CHECK(cudaFree(pHashTable),"Unable to destroy the hashtable"); +} + +//64 bit Murmur2 hash +__device__ __forceinline__ +uint64_t hash(const uint64_t key) { + const uint32_t seed = 0x9747b28c; + const uint64_t m = 0xc6a4a7935bd1e995LLU; // A large prime number + const int r = 47; + + uint64_t h = seed ^ (8 * m); + + uint64_t k = key; + + k *= m; + k ^= k >> r; + k *= m; + + h ^= k; + h *= m; + + // Finalization + h ^= h >> r; + h *= m; + h ^= h >> r; + + return h & (kHashTableCapacity - 1); //mask to ensure it falls within table +} + +//Combining two keys +__device__ __forceinline__ +uint64_t combine_keys(uint32_t key1, uint32_t key2) { + uint64_t combined_key = key1; + combined_key = (combined_key << 32) | key2; + return combined_key; +} + +//Lookup keys in the hashtable, and return the values +__global__ +void gpu_hashtable_lookup( + keyValues* hashtable, + uint32_t key_1, // key_1 to lookup + uint32_t key_2, // key_2 to lookup + uint32_t value_1, // value_1 (result) + uint32_t value_2, // value_2 (result) + int size) { // num_items + + int tid = blockIdx.x * blockDim.x + threadIdx.x; + if(tid < size) { + + unsigned long long new_key = combine_keys(key_1, key_2); + uint64_t slot = hash(new_key); + + while(1) { + if(hashtable[slot].key == new_key) { + value_1[tid] = hashtable[slot].val1; + value_2[tid] = hashtable[slot].val2; + return; + } + if(hashtable[slot].key == kEmpty) { + d_val1[tid] = kEmpty; + d_val2[tid] = kEmpty; + return; + } + + slot = (slot + 1) & (kHashTableCapacity - 1); + } + } +} + +// insert into hashTable +__global__ +void gpu_hashtable_insert(keyValues* hashtable, uint32_t *key1, uint32_t *key2, uint32_t *value1, uint32_t *value2, int size) { + int tid = blockIdx.x * blockDim.x + threadIdx.x; + + if(tid < size) { + //insert into hashtable here only + unsigned long long key = combine_keys(key1[tid], key2[tid]); + + uint64_t slot = hash(key); + + while(1) { + unsigned long long prev = atomicCAS(&hashtable[slot].key, kEmpty, key); + if(prev == kEmpty || prev == key) { + hashtable[slot].val1 = value1[tid]; + hashtable[slot].val2 = value2[tid]; + return; + } + + slot = (slot + 1) & (kHashTableCapacity - 1); + } + + } +} + +void insert_hashtable( + KeyValue* pHashTable, + const uint32_t* key_1, const uint32_t* key_2, + uint32_t* value_1, const uint32_t* value_2, + uint32_t num_items) { + + uint32_t *d_key_1; + uint32_t *d_key_2; + uint32_t *d_value_1; + uint32_t *d_value_2; + + // Allocate memory on gpu + CUDA_CHECK(cudaMalloc(&d_key_1, sizeof(uint32_t) * num_items), "Failed to allocate d_key_1"); + CUDA_CHECK(cudaMalloc(&d_key_2, sizeof(uint32_t) * num_items), "Failed to allocate d_key_2"); + CUDA_CHECK(cudaMalloc(&d_value_1, sizeof(uint32_t) * num_items), "Failed to allocate d_value_1"); + CUDA_CHECK(cudaMalloc(&d_value_2, sizeof(uint32_t) * num_items), "Failed to allocate d_value_2"); + + // Copy the keyvalues to the GPU + CUDA_CHECK(cudaMemcpy(d_key_1, key_1, sizeof(uint32_t) * num_items, cudaMemcpyHostToDevice), + "Failed to copy key_1 to gpu"); + CUDA_CHECK(cudaMemcpy(d_key_2, key_2, sizeof(uint32_t) * num_items, cudaMemcpyHostToDevice), + "Failed to copy key_2 to gpu"); + CUDA_CHECK(cudaMemcpy(d_value_1, value_1, sizeof(uint32_t) * num_items, cudaMemcpyHostToDevice), + "Failed to copy d_value_1 to gpu"); + CUDA_CHECK(cudaMemcpy(d_value_2, value_2, sizeof(uint32_t) * num_items, cudaMemcpyHostToDevice), + "Failed to copy d_value_2 to gpu"); + + // Have CUDA calculate the thread block size + int mingridsize; + int threadblocksize; + cudaOccupancyMaxPotentialBlockSize(&mingridsize, &threadblocksize, gpu_hashtable_insert, 0, 0); + + // Create events for GPU timing + cudaEvent_t start, stop; + cudaEventCreate(&start); + cudaEventCreate(&stop); + + cudaEventRecord(start); + + // Insert all the keys into the hash table + int gridsize = (num_items + threadblocksize - 1) / threadblocksize; + gpu_hashtable_insert<<>>( + pHashTable, + d_key_1, + d_key_2, + d_value_1, + d_value_2, + num_items); + + CUDA_CHECK(cudaDeviceSynchronize(), "Failed to synchronize after gpu_hashtable_insert"); + cudaEventRecord(stop); + cudaEventSynchronize(stop); + + float milliseconds = 0; + cudaEventElapsedTime(&milliseconds, start, stop); + float seconds = milliseconds / 1000.0f; + printf(" GPU inserted %d items in %f ms (%f million keys/second)\n", + num_items, milliseconds, num_items / (double)seconds / 1000000.0f); + + CUDA_CHECK(cudaFree(d_key_1), "Failed to free d_key_1"); + CUDA_CHECK(cudaFree(d_key_2), "Failed to free d_key_2"); + CUDA_CHECK(cudaFree(d_value_1), "Failed to free d_value_1"); + CUDA_CHECK(cudaFree(d_value_2), "Failed to free d_value_2"); +} + +void lookup_hashtable(KeyValue* pHashTable, + const uint32_t* key_1, // cpu key_1 + const uint32_t* key_2, // cpu key_2 + const uint32_t* value_1, // cpu value_1 (result) + const uint32_t* value_2, // cpu value_2 (result) + uint32_t num_items) { + + uint32_t *d_key_1; + uint32_t *d_key_2; + uint32_t *d_value_1; + uint32_t *d_value_2; + + // Allocate memory on gpu + CUDA_CHECK(cudaMalloc(&d_key_1, sizeof(uint32_t) * num_items), "Failed to allocate d_key_1"); + CUDA_CHECK(cudaMalloc(&d_key_2, sizeof(uint32_t) * num_items), "Failed to allocate d_key_2"); + CUDA_CHECK(cudaMalloc(&d_value_1, sizeof(uint32_t) * num_items), "Failed to allocate d_value_1"); + CUDA_CHECK(cudaMalloc(&d_value_2, sizeof(uint32_t) * num_items), "Failed to allocate d_value_2"); + + // Copy the keyvalues to the GPU + CUDA_CHECK(cudaMemcpy(d_key_1, key_1, sizeof(uint32_t) * num_items, cudaMemcpyHostToDevice), + "Failed to copy key_1 to gpu"); + CUDA_CHECK(cudaMemcpy(d_key_2, key_2, sizeof(uint32_t) * num_items, cudaMemcpyHostToDevice), + "Failed to copy key_2 to gpu"); + + // Have CUDA calculate the thread block size + int mingridsize; + int threadblocksize; + cudaOccupancyMaxPotentialBlockSize(&mingridsize, &threadblocksize, gpu_hashtable_insert, 0, 0); + + // Create events for GPU timing + cudaEvent_t start, stop; + cudaEventCreate(&start); + cudaEventCreate(&stop); + + gpu_hashtable_lookup<<>>( + pHashTable, + d_key_1, + d_key_2, + d_value_1, + d_value_2, + h_size); + + CUDA_CHECK(cudaDeviceSynchronize(), "Failed to synchronize after gpu_hashtable_insert"); + + cudaEventRecord(stop); + cudaEventSynchronize(stop); + + float milliseconds = 0; + cudaEventElapsedTime(&milliseconds, start, stop); + float seconds = milliseconds / 1000.0f; + printf(" GPU looked_up %d items in %f ms (%f million keys/second)\n", + num_items, milliseconds, num_items / (double)seconds / 1000000.0f); + + // copy back results to cpu + CUDA_CHECK(cudaMemcpy(value_1, d_value_1, sizeof(uint32_t) * num_items, cudaMemcpyHostToDevice), + "Failed to copy back d_value_1 back to CPU"); + CUDA_CHECK(cudaMemcpy(value_2, d_value_2, sizeof(uint32_t) * num_items, cudaMemcpyHostToDevice), + "Failed to copy back d_value_2 back to CPU"); + + CUDA_CHECK(cudaFree(d_key_1), "Failed to free d_key_1"); + CUDA_CHECK(cudaFree(d_key_2), "Failed to free d_key_2"); + CUDA_CHECK(cudaFree(d_value_1), "Failed to free d_value_1"); + CUDA_CHECK(cudaFree(d_value_2), "Failed to free d_value_2"); +} \ No newline at end of file diff --git a/src/cuda_hashing.h b/src/cuda_hashing.h new file mode 100644 index 0000000..1f402b8 --- /dev/null +++ b/src/cuda_hashing.h @@ -0,0 +1,33 @@ +#ifndef CUDA_HASHING_H_ +#define CUDA_HASHING_H_ + +#include +#include +#include +#include +#include +#include +#include +#include + +struct keyValues +{ + unsigned long long key; + uint32_t val1; + uint32_t val2; +}; + +const uint64_t kHashTableCapacity = 128 * 1024 * 1024; + +const uint64_t kEmpty = 0xffffffffffffffff; + +keyValues* create_hashtable(); +void insert_hashtable(keyValues* pHashTable, const std::vector& v1, const std::vector& v2, const std::vector& v3, const std::vector& v4); + +void lookup_hashtable(keyValues* hashtable, const std::vector& key1, const std::vector& key2, std::vector& value1, std::vector& value2); + +void destroy_hashtable(keyValues* pHashTable); + +#endif // CUDA_HASHING_H_ + + From 1544b1ec2f284ccf27c7a034dc1484370772bdc7 Mon Sep 17 00:00:00 2001 From: Abhijeet Date: Wed, 7 Feb 2024 20:09:09 +0530 Subject: [PATCH 2/3] README update --- README.md | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/README.md b/README.md index ce2a8c1..bc5c12f 100644 --- a/README.md +++ b/README.md @@ -1,5 +1,12 @@ ![](screenshot.png) +# Enhancements: + +- Dual-Key Support: Expanded the hash table's ability to store pairs with two keys, facilitating complex & sophisticated data-structures like graph edges. + +- 64-bit MurmurHash2 Algorithm: Adapted the MurmurHash2 hashing algorithm to efficiently manage and resolve dual-key entries. + +- CUDA_CHECKER Utility: Implements checks for CUDA operations like memory allocation and data transfer, enhancing error handling and debugging. # About This project shows how to implement a simple GPU hash table. Thanks to the high bandwidth and massive parallelism of From 4bd80202b3ff959d1f91d98d937058d29149a40c Mon Sep 17 00:00:00 2001 From: Abhijeet Date: Wed, 7 Feb 2024 20:10:08 +0530 Subject: [PATCH 3/3] Update gitignore --- .gitignore | 2 ++ 1 file changed, 2 insertions(+) diff --git a/.gitignore b/.gitignore index 567609b..7a79cfd 100644 --- a/.gitignore +++ b/.gitignore @@ -1 +1,3 @@ build/ +.DS_Store +src/.DS_Store