diff --git a/fbgemm_gpu/include/fbgemm_gpu/utils/cuda_prelude.cuh b/fbgemm_gpu/include/fbgemm_gpu/utils/cuda_prelude.cuh index d5ec2648a8..68ca1cc9b2 100755 --- a/fbgemm_gpu/include/fbgemm_gpu/utils/cuda_prelude.cuh +++ b/fbgemm_gpu/include/fbgemm_gpu/utils/cuda_prelude.cuh @@ -36,7 +36,7 @@ inline int get_device_sm_cnt_() { namespace fbgemm_gpu { -#if !defined(USE_ROCM) && defined(CUDA_VERSION) && CUDA_VERSION >= 9000 +#if !defined(USE_ROCM) && defined(CUDA_VERSION) #define FBGEMM_USE_SUBWARP_SHUFFLE #endif @@ -88,7 +88,7 @@ DEVICE_INLINE T shfl_xor( int laneMask, int width = kWarpSize, unsigned shfl_sync_mask = static_cast<unsigned>(kFullWarpMask)) { -#if defined(USE_ROCM) || CUDA_VERSION < 9000 +#if defined(USE_ROCM) return __shfl_xor(val, laneMask, width); #else return __shfl_xor_sync(shfl_sync_mask, val, laneMask, width); @@ -101,7 +101,7 @@ DEVICE_INLINE T shfl_sync( int srcLane = 0, int width = kWarpSize, unsigned shfl_sync_mask = static_cast<unsigned>(kFullWarpMask)) { -#if defined(USE_ROCM) || CUDA_VERSION < 9000 +#if defined(USE_ROCM) return __shfl(val, srcLane, width); #else return __shfl_sync(shfl_sync_mask, val, srcLane, width); @@ -114,21 +114,21 @@ DEVICE_INLINE T shfl_down_sync( unsigned delta, int width = kWarpSize, unsigned shfl_sync_mask = static_cast<unsigned>(kFullWarpMask)) { -#if defined(USE_ROCM) || CUDA_VERSION < 9000 +#if defined(USE_ROCM) return __shfl_down(val, delta, width); #else return __shfl_down_sync(shfl_sync_mask, val, delta, width); #endif } -#if defined(USE_ROCM) || CUDA_VERSION < 9000 +#if defined(USE_ROCM) DEVICE_INLINE uint64_t ballot_sync( #else DEVICE_INLINE uint32_t ballot_sync( #endif int predicate, unsigned shfl_sync_mask = static_cast<unsigned>(kFullWarpMask)) { -#if defined(USE_ROCM) || CUDA_VERSION < 9000 +#if defined(USE_ROCM) return __ballot(predicate); #else return __ballot_sync(shfl_sync_mask, predicate); diff --git a/fbgemm_gpu/include/fbgemm_gpu/utils/float.cuh b/fbgemm_gpu/include/fbgemm_gpu/utils/float.cuh index dd3fec0280..8c8a0a5117 100644 --- 
a/fbgemm_gpu/include/fbgemm_gpu/utils/float.cuh +++ b/fbgemm_gpu/include/fbgemm_gpu/utils/float.cuh @@ -54,7 +54,7 @@ struct Half4 { *reinterpret_cast<half2*>(p) = *reinterpret_cast<half2*>(&a); *reinterpret_cast<half2*>(p + 2) = *reinterpret_cast<half2*>(&b); -#elif CUDA_VERSION >= 9000 +#else #ifndef __HALF2_TO_UI // cuda_fp16.hpp doesn't export this @@ -64,8 +64,6 @@ struct Half4 { asm("st.v2.u32 [%0], {%1, %2};" : : "l"(p), "r"(__HALF2_TO_UI(a)), "r"(__HALF2_TO_UI(b))); -#else - asm("st.v2.u32 [%0], {%1, %2};" : : "l"(p), "r"(a.x), "r"(b.x)); #endif } }; diff --git a/fbgemm_gpu/include/fbgemm_gpu/utils/vec4.cuh b/fbgemm_gpu/include/fbgemm_gpu/utils/vec4.cuh index a05f91c572..e53a33c3d4 100644 --- a/fbgemm_gpu/include/fbgemm_gpu/utils/vec4.cuh +++ b/fbgemm_gpu/include/fbgemm_gpu/utils/vec4.cuh @@ -97,15 +97,9 @@ struct Vec4T : public Vec4BaseT { acc.w = b.y; #else Half4 out; -#if CUDA_VERSION >= 9000 asm("ld.global.v2.u32 {%0, %1}, [%2];" : "=r"(__HALF2_TO_UI(out.a)), "=r"(__HALF2_TO_UI(out.b)) : "l"(p)); -#else - asm("ld.global.v2.u32 {%0, %1}, [%2];" - : "=r"(out.a.x), "=r"(out.b.x) - : "l"(p)); -#endif float2 a = __half22float2(out.a); float2 b = __half22float2(out.b); @@ -287,15 +281,9 @@ struct Vec4T : public Vec4BaseT { acc.w = b.y; #else Half4 out; -#if CUDA_VERSION >= 9000 asm("ld.global.v2.u32 {%0, %1}, [%2];" : "=r"(__HALF2_TO_UI(out.a)), "=r"(__HALF2_TO_UI(out.b)) : "l"(p)); -#else - asm("ld.global.v2.u32 {%0, %1}, [%2];" - : "=r"(out.a.x), "=r"(out.b.x) - : "l"(p)); -#endif float2 a = __half22float2(out.a); float2 b = __half22float2(out.b); @@ -360,22 +348,12 @@ struct Vec4T : public Vec4BaseT { dst[3] = src[3]; #else Half4 out; -#if CUDA_VERSION >= 9000 asm("ld.global.v2.u32 {%0, %1}, [%2];" : "=r"(__HALF2_TO_UI(out.a)), "=r"(__HALF2_TO_UI(out.b)) : "l"(src)); -#else - asm("ld.global.v2.u32 {%0, %1}, [%2];" - : "=r"(out.a.x), "=r"(out.b.x) - : "l"(src)); -#endif -#if CUDA_VERSION >= 9000 asm("st.v2.u32 [%0], {%1, %2};" : : "l"(dst), "r"(__HALF2_TO_UI(out.a)), 
"r"(__HALF2_TO_UI(out.b))); -#else - asm("st.v2.u32 [%0], {%1, %2};" : : "l"(dst), "r"(out.a.x), "r"(out.b.x)); -#endif #endif } @@ -488,15 +466,9 @@ struct Vec4T : public Vec4BaseT { acc.w = b.y; #else Half4 out; -#if CUDA_VERSION >= 9000 asm("ld.global.v2.u32 {%0, %1}, [%2];" : "=r"(__HALF2_TO_UI(out.a)), "=r"(__HALF2_TO_UI(out.b)) : "l"(p)); -#else - asm("ld.global.v2.u32 {%0, %1}, [%2];" - : "=r"(out.a.x), "=r"(out.b.x) - : "l"(p)); -#endif float2 a = __half22float2(out.a); float2 b = __half22float2(out.b); diff --git a/fbgemm_gpu/include/fbgemm_gpu/utils/vec_quant.cuh b/fbgemm_gpu/include/fbgemm_gpu/utils/vec_quant.cuh index fc5347e459..7dee2a71d3 100644 --- a/fbgemm_gpu/include/fbgemm_gpu/utils/vec_quant.cuh +++ b/fbgemm_gpu/include/fbgemm_gpu/utils/vec_quant.cuh @@ -235,7 +235,7 @@ DEVICE_INLINE T shfl_xor( const T val, int laneMask, int width = kThreadsPerWarp) { -#if defined(__HIP_PLATFORM_AMD__) || CUDA_VERSION < 9000 +#if defined(__HIP_PLATFORM_AMD__) return __shfl_xor(val, laneMask, width); #else return __shfl_xor_sync(shfl_sync_mask, val, laneMask, width);