paddle/phi/kernels/gpudnn/conv_cudnn_v7.h (109 changes: 1 addition & 108 deletions)
@@ -135,7 +135,6 @@ struct SearchAlgorithmBase<ConvKind::kForward> {
size_t workspace_size_limit =
CalcWorkspaceLimitInBytes(UseFixedWorkspace());

#if CUDNN_VERSION >= 7001
int actual_perf_count;
int best_algo_idx = 0;
std::vector<PerfT> perf_results(kNUM_CUDNN_FWD_ALGS);
@@ -153,43 +152,15 @@ struct SearchAlgorithmBase<ConvKind::kForward> {
result.workspace_size = perf_results[best_algo_idx].memory;

if (result.workspace_size > workspace_size_limit) {
#if CUDNN_VERSION >= 8000
VLOG(4) << GetPerfResultString<PerfT>("[Heuristic] FwdAlgo Perf result",
perf_results,
actual_perf_count,
workspace_size_limit);
// cudnnGetConvolutionForwardAlgorithm is removed in CUDNN-8
ChooseAlgoByWorkspace<PerfT, AlgoT>(
perf_results, workspace_size_limit, &result);
#else
VLOG(3) << "Fallback to non-v7 method to find conv algorithm "
"because the workspace size request("
<< result.workspace_size << ") exceeds the limit("
<< workspace_size_limit << ")";
PADDLE_ENFORCE_GPU_SUCCESS(
phi::dynload::cudnnGetConvolutionForwardAlgorithm(
args.handle,
args.idesc.desc(),
args.wdesc.desc(),
args.cdesc.desc(),
args.odesc.desc(),
CUDNN_CONVOLUTION_FWD_SPECIFY_WORKSPACE_LIMIT,
workspace_size_limit,
&(result.algo)));
#endif
}
#else
PADDLE_ENFORCE_GPU_SUCCESS(
phi::dynload::cudnnGetConvolutionForwardAlgorithm(
args.handle,
args.idesc.desc(),
args.wdesc.desc(),
args.cdesc.desc(),
args.odesc.desc(),
CUDNN_CONVOLUTION_FWD_SPECIFY_WORKSPACE_LIMIT,
workspace_size_limit,
&(result.algo)));
#endif

result.workspace_size = GetWorkspaceSize(args, result.algo);
return result;
}
@@ -311,7 +282,6 @@ struct SearchAlgorithmBase<ConvKind::kBackwardData> {
size_t workspace_size_limit =
CalcWorkspaceLimitInBytes(UseFixedWorkspace());

#if CUDNN_VERSION >= 7001
int actual_perf_count;
int best_algo_idx = 0;
std::vector<PerfT> perf_results(kNUM_CUDNN_BWD_DATA_ALGS);
@@ -327,53 +297,12 @@ struct SearchAlgorithmBase<ConvKind::kBackwardData> {
perf_results.data()));
result.algo = perf_results[best_algo_idx].algo;

#if CUDNN_VERSION < 7500
int stride_dim = args.x->dims().size() - 2;
bool blacklist = std::any_of(args.s.begin(),
args.s.begin() + stride_dim,
[=](int n) { return n != 1; });
if (blacklist && (perf_results[best_algo_idx].algo ==
CUDNN_CONVOLUTION_BWD_DATA_ALGO_FFT_TILING ||
perf_results[best_algo_idx].algo ==
CUDNN_CONVOLUTION_BWD_DATA_ALGO_FFT)) {
result.algo = CUDNN_CONVOLUTION_BWD_DATA_ALGO_1;
}
#endif
result.workspace_size = GetWorkspaceSize(args, result.algo);
if (result.workspace_size > workspace_size_limit) {
#if CUDNN_VERSION >= 8000
// cudnnGetConvolutionBackwardDataAlgorithm is removed in CUDNN-8
ChooseAlgoByWorkspace<PerfT, AlgoT>(
perf_results, workspace_size_limit, &result);
#else
VLOG(1) << "Fallback to non-v7 method to find conv algorithm because "
"the workspace size request("
<< result.workspace_size << ") exceeds the limit("
<< workspace_size_limit << ")";
PADDLE_ENFORCE_GPU_SUCCESS(
phi::dynload::cudnnGetConvolutionBackwardDataAlgorithm(
args.handle,
args.wdesc.desc(),
args.odesc.desc(),
args.cdesc.desc(),
args.idesc.desc(),
CUDNN_CONVOLUTION_BWD_DATA_SPECIFY_WORKSPACE_LIMIT,
workspace_size_limit,
&(result.algo)));
#endif
}
#else
PADDLE_ENFORCE_GPU_SUCCESS(
phi::dynload::cudnnGetConvolutionBackwardDataAlgorithm(
args.handle,
args.wdesc.desc(),
args.odesc.desc(),
args.cdesc.desc(),
args.idesc.desc(),
CUDNN_CONVOLUTION_BWD_DATA_SPECIFY_WORKSPACE_LIMIT,
workspace_size_limit,
&(result.algo)));
#endif
result.workspace_size = GetWorkspaceSize(args, result.algo);
return result;
}
@@ -495,7 +424,6 @@ struct SearchAlgorithmBase<ConvKind::kBackwardFilter> {
size_t workspace_size_limit =
CalcWorkspaceLimitInBytes(UseFixedWorkspace());

#if CUDNN_VERSION >= 7001
int actual_perf_count;
int best_algo_idx = 0;
std::vector<PerfT> perf_results(kNUM_CUDNN_BWD_FILTER_ALGS);
@@ -513,39 +441,10 @@ struct SearchAlgorithmBase<ConvKind::kBackwardFilter> {
result.workspace_size = perf_results[best_algo_idx].memory;

if (result.workspace_size > workspace_size_limit) {
#if CUDNN_VERSION >= 8000
// cudnnGetConvolutionBackwardFilterAlgorithm is removed in CUDNN-8
ChooseAlgoByWorkspace<PerfT, AlgoT>(
perf_results, workspace_size_limit, &result);
#else
VLOG(1) << "Fallback to non-v7 method to find conv algorithm because "
"the workspace size request("
<< result.workspace_size << ") exceeds the limit("
<< workspace_size_limit << ")";
PADDLE_ENFORCE_GPU_SUCCESS(
phi::dynload::cudnnGetConvolutionBackwardFilterAlgorithm(
args.handle,
args.idesc.desc(),
args.odesc.desc(),
args.cdesc.desc(),
args.wdesc.desc(),
CUDNN_CONVOLUTION_BWD_FILTER_SPECIFY_WORKSPACE_LIMIT,
workspace_size_limit,
&(result.algo)));
#endif
}
#else
PADDLE_ENFORCE_GPU_SUCCESS(
phi::dynload::cudnnGetConvolutionBackwardFilterAlgorithm(
args.handle,
args.idesc.desc(),
args.odesc.desc(),
args.cdesc.desc(),
args.wdesc.desc(),
CUDNN_CONVOLUTION_BWD_FILTER_SPECIFY_WORKSPACE_LIMIT,
workspace_size_limit,
&(result.algo)));
#endif

result.workspace_size = GetWorkspaceSize(args, result.algo);
return result;
@@ -622,7 +521,6 @@ struct SearchAlgorithmBase<ConvKind::kBackwardFilter> {
}

static int GetAlgorithmMaxCount(cudnnHandle_t handle) {
#if CUDNN_VERSION_MIN(7, 0, 1)
int max_algos = 0;
auto status =
phi::dynload::cudnnGetConvolutionBackwardFilterAlgorithmMaxCount(
@@ -632,7 +530,6 @@ struct SearchAlgorithmBase<ConvKind::kBackwardFilter> {
<< kNUM_CUDNN_BWD_FILTER_ALGS << ", actual=" << max_algos;
return max_algos;
}
#endif
return kNUM_CUDNN_BWD_FILTER_ALGS;
}

@@ -736,12 +633,10 @@ struct SearchAlgorithm : public SearchAlgorithmBase<CK> {
const phi::GPUContext& dev_ctx,
cudnnDataType_t dtype,
const phi::backends::gpu::ConvolutionDescriptor& cdesc) {
#if CUDA_VERSION >= 9000 && CUDNN_VERSION_MIN(7, 0, 1)
if (dev_ctx.GetComputeCapability() >= 70 && dtype == CUDNN_DATA_HALF) {
PADDLE_ENFORCE_GPU_SUCCESS(phi::dynload::cudnnSetConvolutionMathType(
cdesc.desc(), CUDNN_TENSOR_OP_MATH));
VLOG(5) << "Enable Tensor Core for FLOAT16";
#if CUDA_VERSION >= 11000
#if CUDNN_VERSION_MIN(8, 1, 0)
} else if (dev_ctx.GetComputeCapability() >= 80 &&
dtype == CUDNN_DATA_BFLOAT16) {
Expand All @@ -753,12 +648,10 @@ struct SearchAlgorithm : public SearchAlgorithmBase<CK> {
VLOG(5) << "Disable TensorFloat (Tensor Core) for FLOAT";
PADDLE_ENFORCE_GPU_SUCCESS(phi::dynload::cudnnSetConvolutionMathType(
cdesc.desc(), CUDNN_FMA_MATH));
#endif // CUDA_VERSION >= 11000
} else {
PADDLE_ENFORCE_GPU_SUCCESS(phi::dynload::cudnnSetConvolutionMathType(
cdesc.desc(), CUDNN_DEFAULT_MATH));
}
#endif
}
};

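For context on the branch this diff keeps: with the pre-7.0.1 `#if CUDNN_VERSION` fallbacks deleted, algorithm selection always goes through the cuDNN v7 heuristic query (`cudnnGetConvolution*Algorithm_v7`) and only drops to a workspace-bounded pick when the top suggestion's memory request exceeds the limit. Below is a minimal standalone sketch of that pattern for the forward case, assuming cuDNN >= 7.0.1; error handling is omitted, and `PickFwdAlgo` is an illustrative name, not Paddle's actual helper (it plays the role of `GetAlgorithm` plus `ChooseAlgoByWorkspace` in this file).

```cpp
#include <cudnn.h>
#include <vector>

// Sketch: pick a forward convolution algorithm via the cuDNN v7 heuristic,
// falling back to the best-ranked algorithm whose workspace fits the limit.
cudnnConvolutionFwdAlgo_t PickFwdAlgo(cudnnHandle_t handle,
                                      cudnnTensorDescriptor_t x_desc,
                                      cudnnFilterDescriptor_t w_desc,
                                      cudnnConvolutionDescriptor_t conv_desc,
                                      cudnnTensorDescriptor_t y_desc,
                                      size_t workspace_limit) {
  int returned = 0;
  std::vector<cudnnConvolutionFwdAlgoPerf_t> perf(CUDNN_CONVOLUTION_FWD_ALGO_COUNT);
  // Heuristic query: results come back sorted from best to worst.
  cudnnGetConvolutionForwardAlgorithm_v7(handle, x_desc, w_desc, conv_desc, y_desc,
                                         static_cast<int>(perf.size()),
                                         &returned, perf.data());
  // Prefer the highest-ranked algorithm that fits within the workspace limit.
  for (int i = 0; i < returned; ++i) {
    if (perf[i].status == CUDNN_STATUS_SUCCESS &&
        perf[i].memory <= workspace_limit) {
      return perf[i].algo;
    }
  }
  // Nothing fits: return the top suggestion; the caller must then allocate
  // perf[0].memory bytes of workspace (or raise the limit).
  return perf[0].algo;
}
```

The backward-data and backward-filter paths in this header follow the same shape, only swapping in `cudnnGetConvolutionBackwardDataAlgorithm_v7` / `cudnnGetConvolutionBackwardFilterAlgorithm_v7` and their descriptor orderings.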