diff --git a/gimmik/cuda.py b/gimmik/cuda.py index b18c509..293d7c3 100644 --- a/gimmik/cuda.py +++ b/gimmik/cuda.py @@ -10,20 +10,20 @@ class CUDAMatMul(MatMul): def _kernel_generators(self, dtype, dsize, *, compute_capability=None): # B loading, C streaming kernel - yield ('cstream', {}, {}) + yield ('cstream', {'blocksz': self.basemeta['block'][0]}, {}) # B streaming, C accumulation kernel - yield ('bstream', {}, {}) + yield ('bstream', {'blocksz': self.basemeta['block'][0]}, {}) # Four-way m-split B streaming, C accumulation kernel ms, bsz, blkx = 4, 24, 32 - args = {'msplit': ms, 'bsz': bsz, 'blockx': blkx} + args = {'msplit': ms, 'bsz': bsz, 'blockx': blkx, 'blocksz': blkx*ms} meta = {'block': (blkx, ms, 1), 'shared': 2*bsz*blkx*dsize} yield ('bstream-msplit', args, meta) # Two-way k-split B loading, C streaming kernel ks, csz, blkx = 2, 24, 32 - args = {'ksplit': ks, 'csz': csz, 'blockx': blkx} + args = {'ksplit': ks, 'csz': csz, 'blockx': blkx, 'blocksz': blkx*ks} meta = {'block': (blkx, ks, 1), 'shared': (ks - 1)*csz*blkx*dsize} yield ('cstream-ksplit', args, meta) @@ -31,14 +31,15 @@ def _kernel_generators(self, dtype, dsize, *, compute_capability=None): if (dtype == 'float' and self.aligne is not None and self.aligne % 2 == 0): # Vector B loading, C streaming kernel - args = {'dtype': 'float2', 'width': 2} + args = {'dtype': 'float2', 'width': 2, + 'blocksz': self.basemeta['block'][0]} meta = {'width': 2} yield ('cstream', args, meta) # Vector four-way m-split B streaming, C accumulation kernel ms, bsz, blkx = 4, 16, 32 - args = {'dtype': 'float2', 'width': 2, 'msplit': ms, - 'bsz': bsz, 'blockx': blkx} + args = {'dtype': 'float2', 'width': 2, 'msplit': ms, 'bsz': bsz, + 'blockx': blkx, 'blocksz': blkx*ms} meta = {'block': (blkx, ms, 1), 'width': 2, 'shared': 2*blkx*bsz*2*dsize} yield ('bstream-msplit', args, meta) @@ -46,7 +47,7 @@ def _kernel_generators(self, dtype, dsize, *, compute_capability=None): # Vector two-way k-split B loading, C streaming kernel ks, csz, blkx = 2, 24, 32 args = {'dtype': 'float2', 'width': 2, 'ksplit': ks, - 'csz': csz, 'blockx': blkx} + 'csz': csz, 'blockx': blkx, 'blocksz': blkx*ks} meta = {'block': (blkx, ks, 1), 'width': 2, 'shared': 2*(ks - 1)*csz*blkx*dsize} yield ('cstream-ksplit', args, meta) diff --git a/gimmik/kernels/cuda/bstream-msplit.mako b/gimmik/kernels/cuda/bstream-msplit.mako index 5d173d1..458491b 100644 --- a/gimmik/kernels/cuda/bstream-msplit.mako +++ b/gimmik/kernels/cuda/bstream-msplit.mako @@ -5,7 +5,7 @@ mx = partition(A, into=msplit, by='rows') bchunks = chunk(bix, bsz) %> -__global__ void +__global__ __launch_bounds__(${blocksz}) void % if n is None: ${kname}(int n, const ${dtype}* __restrict__ b, int ldb, diff --git a/gimmik/kernels/cuda/bstream.mako b/gimmik/kernels/cuda/bstream.mako index 6b6faec..500c2eb 100644 --- a/gimmik/kernels/cuda/bstream.mako +++ b/gimmik/kernels/cuda/bstream.mako @@ -1,11 +1,15 @@ <%inherit file='base'/> -__global__ void +__global__ __launch_bounds__(${blocksz}) void % if n is None: ${kname}(int n, const ${dtype}* __restrict__ b, int ldb, ${dtype}* __restrict__ c, int ldc) { +#if ( __CUDACC_VER_MAJOR__ >= 13 ) + asm volatile (".pragma \"enable_smem_spilling\";"); +#endif + % if width > 1: n = ((n + ${width} - 1) / ${width}) * ${width}; ldb /= ${width}; @@ -14,6 +18,9 @@ ${kname}(int n, % else: ${kname}(const ${dtype}* __restrict__ b, ${dtype}* __restrict__ c) { +#if ( __CUDACC_VER_MAJOR__ >= 13 ) + asm volatile (".pragma \"enable_smem_spilling\";"); +#endif const int n = ${-(-n // width)}; const ${'long long' if k*ldb >= width*2**31 else 'int'} ldb = ${ldb // width}; const ${'long long' if m*ldc >= width*2**31 else 'int'} ldc = ${ldc // width}; diff --git a/gimmik/kernels/cuda/cstream-ksplit.mako b/gimmik/kernels/cuda/cstream-ksplit.mako index 2826b06..4ab35b1 100644 --- a/gimmik/kernels/cuda/cstream-ksplit.mako +++ b/gimmik/kernels/cuda/cstream-ksplit.mako @@ -6,7 +6,7 @@ cchunks = chunk(range(m), csz) loaded = set() %> -__global__ void +__global__ __launch_bounds__(${blocksz}) void % if n is None: ${kname}(int n, const ${dtype}* __restrict__ b, int ldb, diff --git a/gimmik/kernels/cuda/cstream.mako b/gimmik/kernels/cuda/cstream.mako index f8c7f03..203e429 100644 --- a/gimmik/kernels/cuda/cstream.mako +++ b/gimmik/kernels/cuda/cstream.mako @@ -2,12 +2,15 @@ <% ksplit = 2 if m < 36 else 1 %> -__global__ void +__global__ __launch_bounds__(${blocksz}) void % if n is None: ${kname}(int n, const ${dtype}* __restrict__ b, int ldb, ${dtype}* __restrict__ c, int ldc) { +#if ( __CUDACC_VER_MAJOR__ >= 13 ) + asm volatile (".pragma \"enable_smem_spilling\";"); +#endif % if width > 1: n = ((n + ${width} - 1) / ${width}) * ${width}; ldb /= ${width}; @@ -16,6 +19,9 @@ ${kname}(int n, % else: ${kname}(const ${dtype}* __restrict__ b, ${dtype}* __restrict__ c) { +#if ( __CUDACC_VER_MAJOR__ >= 13 ) + asm volatile (".pragma \"enable_smem_spilling\";"); +#endif const int n = ${-(-n // width)}; const ${'long long' if k*ldb >= width*2**31 else 'int'} ldb = ${ldb // width}; const ${'long long' if m*ldc >= width*2**31 else 'int'} ldc = ${ldc // width};