diff --git a/paddle/phi/kernels/gpudnn/conv_cudnn_v7.h b/paddle/phi/kernels/gpudnn/conv_cudnn_v7.h index 5915c55025d12c..085845dfb30399 100644 --- a/paddle/phi/kernels/gpudnn/conv_cudnn_v7.h +++ b/paddle/phi/kernels/gpudnn/conv_cudnn_v7.h @@ -135,7 +135,6 @@ struct SearchAlgorithmBase { size_t workspace_size_limit = CalcWorkspaceLimitInBytes(UseFixedWorkspace()); -#if CUDNN_VERSION >= 7001 int actual_perf_count; int best_algo_idx = 0; std::vector perf_results(kNUM_CUDNN_FWD_ALGS); @@ -153,7 +152,6 @@ struct SearchAlgorithmBase { result.workspace_size = perf_results[best_algo_idx].memory; if (result.workspace_size > workspace_size_limit) { -#if CUDNN_VERSION >= 8000 VLOG(4) << GetPerfResultString("[Heuristic] FwdAlgo Perf result", perf_results, actual_perf_count, @@ -161,35 +159,8 @@ struct SearchAlgorithmBase { // cudnnGetConvolutionForwardAlgorithm is removed in CUDNN-8 ChooseAlgoByWorkspace( perf_results, workspace_size_limit, &result); -#else - VLOG(3) << "Fallback to non-v7 method to find conv algorithm " - "because the workspace size request(" - << result.workspace_size << ") exceeds the limit(" - << workspace_size_limit << ")"; - PADDLE_ENFORCE_GPU_SUCCESS( - phi::dynload::cudnnGetConvolutionForwardAlgorithm( - args.handle, - args.idesc.desc(), - args.wdesc.desc(), - args.cdesc.desc(), - args.odesc.desc(), - CUDNN_CONVOLUTION_FWD_SPECIFY_WORKSPACE_LIMIT, - workspace_size_limit, - &(result.algo))); -#endif } -#else - PADDLE_ENFORCE_GPU_SUCCESS( - phi::dynload::cudnnGetConvolutionForwardAlgorithm( - args.handle, - args.idesc.desc(), - args.wdesc.desc(), - args.cdesc.desc(), - args.odesc.desc(), - CUDNN_CONVOLUTION_FWD_SPECIFY_WORKSPACE_LIMIT, - workspace_size_limit, - &(result.algo))); -#endif + result.workspace_size = GetWorkspaceSize(args, result.algo); return result; } @@ -311,7 +282,6 @@ struct SearchAlgorithmBase { size_t workspace_size_limit = CalcWorkspaceLimitInBytes(UseFixedWorkspace()); -#if CUDNN_VERSION >= 7001 int actual_perf_count; int best_algo_idx = 0; std::vector perf_results(kNUM_CUDNN_BWD_DATA_ALGS); @@ -327,53 +297,12 @@ struct SearchAlgorithmBase { perf_results.data())); result.algo = perf_results[best_algo_idx].algo; -#if CUDNN_VERSION < 7500 - int stride_dim = args.x->dims().size() - 2; - bool blacklist = std::any_of(args.s.begin(), - args.s.begin() + stride_dim, - [=](int n) { return n != 1; }); - if (blacklist && (perf_results[best_algo_idx].algo == - CUDNN_CONVOLUTION_BWD_DATA_ALGO_FFT_TILING || - perf_results[best_algo_idx].algo == - CUDNN_CONVOLUTION_BWD_DATA_ALGO_FFT)) { - result.algo = CUDNN_CONVOLUTION_BWD_DATA_ALGO_1; - } -#endif result.workspace_size = GetWorkspaceSize(args, result.algo); if (result.workspace_size > workspace_size_limit) { -#if CUDNN_VERSION >= 8000 // cudnnGetConvolutionBackwardDataAlgorithm is removed in CUDNN-8 ChooseAlgoByWorkspace( perf_results, workspace_size_limit, &result); -#else - VLOG(1) << "Fallback to non-v7 method to find conv algorithm because " - "the workspace size request(" - << result.workspace_size << ") exceeds the limit(" - << workspace_size_limit << ")"; - PADDLE_ENFORCE_GPU_SUCCESS( - phi::dynload::cudnnGetConvolutionBackwardDataAlgorithm( - args.handle, - args.wdesc.desc(), - args.odesc.desc(), - args.cdesc.desc(), - args.idesc.desc(), - CUDNN_CONVOLUTION_BWD_DATA_SPECIFY_WORKSPACE_LIMIT, - workspace_size_limit, - &(result.algo))); -#endif } -#else - PADDLE_ENFORCE_GPU_SUCCESS( - phi::dynload::cudnnGetConvolutionBackwardDataAlgorithm( - args.handle, - args.wdesc.desc(), - args.odesc.desc(), - args.cdesc.desc(), - args.idesc.desc(), - CUDNN_CONVOLUTION_BWD_DATA_SPECIFY_WORKSPACE_LIMIT, - workspace_size_limit, - &(result.algo))); -#endif result.workspace_size = GetWorkspaceSize(args, result.algo); return result; } @@ -495,7 +424,6 @@ struct SearchAlgorithmBase { size_t workspace_size_limit = CalcWorkspaceLimitInBytes(UseFixedWorkspace()); -#if CUDNN_VERSION >= 7001 int actual_perf_count; int best_algo_idx = 0; std::vector perf_results(kNUM_CUDNN_BWD_FILTER_ALGS); @@ -513,39 +441,10 @@ struct SearchAlgorithmBase { result.workspace_size = perf_results[best_algo_idx].memory; if (result.workspace_size > workspace_size_limit) { -#if CUDNN_VERSION >= 8000 // cudnnGetConvolutionBackwardFilterAlgorithm is removed in CUDNN-8 ChooseAlgoByWorkspace( perf_results, workspace_size_limit, &result); -#else - VLOG(1) << "Fallback to non-v7 method to find conv algorithm because " - "the workspace size request(" - << result.workspace_size << ") exceeds the limit(" - << workspace_size_limit << ")"; - PADDLE_ENFORCE_GPU_SUCCESS( - phi::dynload::cudnnGetConvolutionBackwardFilterAlgorithm( - args.handle, - args.idesc.desc(), - args.odesc.desc(), - args.cdesc.desc(), - args.wdesc.desc(), - CUDNN_CONVOLUTION_BWD_FILTER_SPECIFY_WORKSPACE_LIMIT, - workspace_size_limit, - &(result.algo))); -#endif } -#else - PADDLE_ENFORCE_GPU_SUCCESS( - phi::dynload::cudnnGetConvolutionBackwardFilterAlgorithm( - args.handle, - args.idesc.desc(), - args.odesc.desc(), - args.cdesc.desc(), - args.wdesc.desc(), - CUDNN_CONVOLUTION_BWD_FILTER_SPECIFY_WORKSPACE_LIMIT, - workspace_size_limit, - &(result.algo))); -#endif result.workspace_size = GetWorkspaceSize(args, result.algo); return result; @@ -622,7 +521,6 @@ struct SearchAlgorithmBase { } static int GetAlgorithmMaxCount(cudnnHandle_t handle) { -#if CUDNN_VERSION_MIN(7, 0, 1) int max_algos = 0; auto status = phi::dynload::cudnnGetConvolutionBackwardFilterAlgorithmMaxCount( @@ -632,7 +530,6 @@ struct SearchAlgorithmBase { << kNUM_CUDNN_BWD_FILTER_ALGS << ", actual=" << max_algos; return max_algos; } -#endif return kNUM_CUDNN_BWD_FILTER_ALGS; } @@ -736,12 +633,10 @@ struct SearchAlgorithm : public SearchAlgorithmBase { const phi::GPUContext& dev_ctx, cudnnDataType_t dtype, const phi::backends::gpu::ConvolutionDescriptor& cdesc) { -#if CUDA_VERSION >= 9000 && CUDNN_VERSION_MIN(7, 0, 1) if (dev_ctx.GetComputeCapability() >= 70 && dtype == CUDNN_DATA_HALF) { PADDLE_ENFORCE_GPU_SUCCESS(phi::dynload::cudnnSetConvolutionMathType( cdesc.desc(), CUDNN_TENSOR_OP_MATH)); VLOG(5) << "Enable Tensor Core for FLOAT16"; -#if CUDA_VERSION >= 11000 #if CUDNN_VERSION_MIN(8, 1, 0) } else if (dev_ctx.GetComputeCapability() >= 80 && dtype == CUDNN_DATA_BFLOAT16) { @@ -753,12 +648,10 @@ struct SearchAlgorithm : public SearchAlgorithmBase { VLOG(5) << "Disable TensorFloat (Tensor Core) for FLOAT"; PADDLE_ENFORCE_GPU_SUCCESS(phi::dynload::cudnnSetConvolutionMathType( cdesc.desc(), CUDNN_FMA_MATH)); -#endif // CUDA_VERSION >= 11000 } else { PADDLE_ENFORCE_GPU_SUCCESS(phi::dynload::cudnnSetConvolutionMathType( cdesc.desc(), CUDNN_DEFAULT_MATH)); } -#endif } };