Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
20 changes: 20 additions & 0 deletions API/src/util.t
Original file line number Diff line number Diff line change
Expand Up @@ -522,6 +522,7 @@ __syncthreads = cudalib.nvvm_barrier0


local __shfl_down
local __shfl_down_sync

if opt_float == float then

Expand All @@ -537,6 +538,14 @@ if opt_float == float then
ret = terralib.asm(float,"shfl.down.b32 $0, $1, $2, $3;","=f,f,r,r", true, v, delta, c)
return ret;
end

terra __shfl_down_sync(mask : uint, v : float, delta : uint, width : int)
var ret : float;
var c : int;
c = ((warpSize-width) << 8) or 0x1F;
ret = terralib.asm(float,"shfl.sync.down.b32 $0, $1, $2, $3, $4;","=f,f,r,r,r", true, v, delta, c, mask)
return ret;
end
else
struct ULLDouble {
union {
Expand Down Expand Up @@ -606,6 +615,17 @@ else
ret.u2.y = terralib.asm(uint32,"shfl.down.b32 $0, $1, $2, $3;","=f,f,r,r", true, init.u2.y, delta, c)
return ret.d;
end

terra __shfl_down_sync(mask: uint, v : double, delta : uint, width : int)
var ret : uint2Double;
var init : uint2Double;
init.d = v
var c : int;
c = ((warpSize-width) << 8) or 0x1F;
ret.u2.x = terralib.asm(uint32,"shfl.sync.down.b32 $0, $1, $2, $3, $4;","=f,f,r,r,r", true, init.u2.x, delta, c, mask)
ret.u2.y = terralib.asm(uint32,"shfl.sync.down.b32 $0, $1, $2, $3, $4;","=f,f,r,r,r", true, init.u2.y, delta, c, mask)
return ret.d;
end
end

-- Using the "Kepler Shuffle", see http://devblogs.nvidia.com/parallelforall/faster-parallel-reductions-kepler/
Expand Down
2 changes: 1 addition & 1 deletion examples/arap_mesh_deformation/src/WarpingSolverUtil.h
Original file line number Diff line number Diff line change
Expand Up @@ -12,7 +12,7 @@
__inline__ __device__ float warpReduce(float val) {
int offset = 32 >> 1;
while (offset > 0) {
val = val + __shfl_down(val, offset, 32);
val = val + __shfl_down_sync(0xffffffff, val, offset, 32);
offset = offset >> 1;
}
return val;
Expand Down
2 changes: 1 addition & 1 deletion examples/image_warping/src/WarpingSolverUtil.h
Original file line number Diff line number Diff line change
Expand Up @@ -12,7 +12,7 @@
__inline__ __device__ float warpReduce(float val) {
int offset = 32 >> 1;
while (offset > 0) {
val = val + __shfl_down(val, offset, 32);
val = val + __shfl_down_sync(0xffffffff, val, offset, 32);
offset = offset >> 1;
}
return val;
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -105,7 +105,7 @@ __inline__ __device__ void loadPatchToCache(volatile float* cache, float* data,
__inline__ __device__ float warpReduce(float val) {
int offset = 32 >> 1;
while (offset > 0) {
val = val + __shfl_down(val, offset, 32);
val = val + __shfl_down_sync(0xffffffff, val, offset, 32);
offset = offset >> 1;
}
return val;
Expand Down
2 changes: 1 addition & 1 deletion examples/poisson_image_editing/src/WarpingSolverUtil.h
Original file line number Diff line number Diff line change
Expand Up @@ -12,7 +12,7 @@
__inline__ __device__ float warpReduce(float val) {
int offset = 32 >> 1;
while (offset > 0) {
val = val + __shfl_down(val, offset, 32);
val = val + __shfl_down_sync(0xffffffff, val, offset, 32);
offset = offset >> 1;
}
return val;
Expand Down
2 changes: 1 addition & 1 deletion examples/shape_from_shading/src/SFSSolverUtil.h
Original file line number Diff line number Diff line change
Expand Up @@ -18,7 +18,7 @@ __inline__ __device__ bool IsValidPoint(float d)
__inline__ __device__ float warpReduce(float val) {
int offset = 32 >> 1;
while (offset > 0) {
val = val + __shfl_down(val, offset, 32);
val = val + __shfl_down_sync(0xffffffff, val, offset, 32);
offset = offset >> 1;
}
return val;
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -12,7 +12,7 @@
__inline__ __device__ float warpReduce(float val) {
int offset = 32 >> 1;
while (offset > 0) {
val = val + __shfl_down(val, offset, 32);
val = val + __shfl_down_sync(0xffffffff, val, offset, 32);
offset = offset >> 1;
}
return val;
Expand Down