From 03c9a54ba7addd7f2a885a3332a0bede5333adf6 Mon Sep 17 00:00:00 2001 From: Fan Jiang Date: Wed, 17 Oct 2018 14:30:08 +0800 Subject: [PATCH 1/2] Fix CUDA deprecation warning --- examples/image_warping/src/WarpingSolverUtil.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/examples/image_warping/src/WarpingSolverUtil.h b/examples/image_warping/src/WarpingSolverUtil.h index 31eab308b..0cc9bf68f 100644 --- a/examples/image_warping/src/WarpingSolverUtil.h +++ b/examples/image_warping/src/WarpingSolverUtil.h @@ -12,7 +12,7 @@ __inline__ __device__ float warpReduce(float val) { int offset = 32 >> 1; while (offset > 0) { - val = val + __shfl_down(val, offset, 32); + val = val + __shfl_down_sync(0xffffffff, val, offset, 32); offset = offset >> 1; } return val; From 0bd857a13f0159c54ad4048f1c21951d99fa982f Mon Sep 17 00:00:00 2001 From: Fan Jiang Date: Wed, 17 Oct 2018 15:28:27 +0800 Subject: [PATCH 2/2] CUDA 9.0 deprecated non-sync shuffle --- API/src/util.t | 20 +++++++++++++++++++ .../src/WarpingSolverUtil.h | 2 +- .../src/PatchSolverWarpingUtil.h | 2 +- .../src/WarpingSolverUtil.h | 2 +- .../shape_from_shading/src/SFSSolverUtil.h | 2 +- .../src/WarpingSolverUtil.h | 2 +- 6 files changed, 25 insertions(+), 5 deletions(-) diff --git a/API/src/util.t b/API/src/util.t index 0085b51b7..8efab26dd 100644 --- a/API/src/util.t +++ b/API/src/util.t @@ -522,6 +522,7 @@ __syncthreads = cudalib.nvvm_barrier0 local __shfl_down +local __shfl_down_sync if opt_float == float then @@ -537,6 +538,14 @@ if opt_float == float then ret = terralib.asm(float,"shfl.down.b32 $0, $1, $2, $3;","=f,f,r,r", true, v, delta, c) return ret; end + + terra __shfl_down_sync(mask : uint, v : float, delta : uint, width : int) + var ret : float; + var c : int; + c = ((warpSize-width) << 8) or 0x1F; + ret = terralib.asm(float,"shfl.sync.down.b32 $0, $1, $2, $3, $4;","=f,f,r,r,r", true, v, delta, c, mask) + return ret; + end else struct ULLDouble { union { @@ -606,6 +615,17 @@ else ret.u2.y = terralib.asm(uint32,"shfl.down.b32 $0, $1, $2, $3;","=f,f,r,r", true, init.u2.y, delta, c) return ret.d; end + + terra __shfl_down_sync(mask: uint, v : double, delta : uint, width : int) + var ret : uint2Double; + var init : uint2Double; + init.d = v + var c : int; + c = ((warpSize-width) << 8) or 0x1F; + ret.u2.x = terralib.asm(uint32,"shfl.sync.down.b32 $0, $1, $2, $3, $4;","=f,f,r,r,r", true, init.u2.x, delta, c, mask) + ret.u2.y = terralib.asm(uint32,"shfl.sync.down.b32 $0, $1, $2, $3, $4;","=f,f,r,r,r", true, init.u2.y, delta, c, mask) + return ret.d; + end end -- Using the "Kepler Shuffle", see http://devblogs.nvidia.com/parallelforall/faster-parallel-reductions-kepler/ diff --git a/examples/arap_mesh_deformation/src/WarpingSolverUtil.h b/examples/arap_mesh_deformation/src/WarpingSolverUtil.h index 57276b663..9ed0abb8d 100644 --- a/examples/arap_mesh_deformation/src/WarpingSolverUtil.h +++ b/examples/arap_mesh_deformation/src/WarpingSolverUtil.h @@ -12,7 +12,7 @@ __inline__ __device__ float warpReduce(float val) { int offset = 32 >> 1; while (offset > 0) { - val = val + __shfl_down(val, offset, 32); + val = val + __shfl_down_sync(0xffffffff, val, offset, 32); offset = offset >> 1; } return val; diff --git a/examples/poisson_image_editing/src/PatchSolverWarpingUtil.h b/examples/poisson_image_editing/src/PatchSolverWarpingUtil.h index 088070160..4f66d2818 100644 --- a/examples/poisson_image_editing/src/PatchSolverWarpingUtil.h +++ b/examples/poisson_image_editing/src/PatchSolverWarpingUtil.h @@ -105,7 +105,7 @@ __inline__ __device__ void loadPatchToCache(volatile float* cache, float* data, __inline__ __device__ float warpReduce(float val) { int offset = 32 >> 1; while (offset > 0) { - val = val + __shfl_down(val, offset, 32); + val = val + __shfl_down_sync(0xffffffff, val, offset, 32); offset = offset >> 1; } return val; diff --git a/examples/poisson_image_editing/src/WarpingSolverUtil.h b/examples/poisson_image_editing/src/WarpingSolverUtil.h index 31eab308b..0cc9bf68f 100644 --- a/examples/poisson_image_editing/src/WarpingSolverUtil.h +++ b/examples/poisson_image_editing/src/WarpingSolverUtil.h @@ -12,7 +12,7 @@ __inline__ __device__ float warpReduce(float val) { int offset = 32 >> 1; while (offset > 0) { - val = val + __shfl_down(val, offset, 32); + val = val + __shfl_down_sync(0xffffffff, val, offset, 32); offset = offset >> 1; } return val; diff --git a/examples/shape_from_shading/src/SFSSolverUtil.h b/examples/shape_from_shading/src/SFSSolverUtil.h index 75cfc7be2..63bf62a59 100644 --- a/examples/shape_from_shading/src/SFSSolverUtil.h +++ b/examples/shape_from_shading/src/SFSSolverUtil.h @@ -18,7 +18,7 @@ __inline__ __device__ bool IsValidPoint(float d) __inline__ __device__ float warpReduce(float val) { int offset = 32 >> 1; while (offset > 0) { - val = val + __shfl_down(val, offset, 32); + val = val + __shfl_down_sync(0xffffffff, val, offset, 32); offset = offset >> 1; } return val; diff --git a/examples/volumetric_mesh_deformation/src/WarpingSolverUtil.h b/examples/volumetric_mesh_deformation/src/WarpingSolverUtil.h index 18bc580db..13c4c7da1 100644 --- a/examples/volumetric_mesh_deformation/src/WarpingSolverUtil.h +++ b/examples/volumetric_mesh_deformation/src/WarpingSolverUtil.h @@ -12,7 +12,7 @@ __inline__ __device__ float warpReduce(float val) { int offset = 32 >> 1; while (offset > 0) { - val = val + __shfl_down(val, offset, 32); + val = val + __shfl_down_sync(0xffffffff, val, offset, 32); offset = offset >> 1; } return val;