From 44b1c98e129bf3062b8f4331772a7b6959e50f25 Mon Sep 17 00:00:00 2001 From: Sergey Nikiforov Date: Wed, 24 Feb 2021 00:06:27 +0300 Subject: [PATCH] "Large" and "sleep" versions of "CL N-pipe" It is inefficient to poll the GPU for results, wasting CPU time and (in the case of dGPUs) PCIe bandwidth, especially if the CPU is powerful while the (i)GPU is not. The original "CL N-pipe" cores are not touched, the OpenCL kernels are not touched, but the scheduling code is modified to permit 100 times larger work units ("CL 1-pipe large" etc.) and also to flush the assignment to the GPU and put the CPU to sleep ("CL 1-pipe sleep" etc.). "Large" cores are marginally faster than the original ones. "Sleep" cores are slightly slower than "large" ones because the GPU may sometimes finish processing a work unit while the CPU is still sleeping. These cores, however, consume zero CPU (all other cores consume 1 logical CPU unless the sleep is transparently performed by the GPU driver - Intel does this for gen8 but not for newer GPUs; this helps, but only if the work unit is large enough for the CPU to sleep for several milliseconds). This results in higher power efficiency and, if we are not limited by TDP, a significant performance improvement. The effect is more pronounced when the CPU does not support MT. Note that with "sleep" cores there is no need to manually limit the number of threads for the CPU cruncher. Performance/efficiency can be further improved by growing the work unit size faster. Wider testing and benchmarking (especially on high-end GPUs) are welcome. The benchmarks below were performed with the CPU loaded with 2.9116.525-amd64 core #4 (YK AVX2). The CUDA client is 2.9110.519b, core #10 (CUDA 1-pipe 64-thd sleep 100us). "521" refers to 2.9112.521 dnetc-win32-x86-opencl.zip / dnetc-linux-amd64-opencl.tar.gz. Power consumption is "measured" with "Core Temp" / "s-tui". 
Core i5-8265U (15W, 4C8T, 14 nm, 1.6-3.9 GHz, Intel UHD Graphics 620 [gen9] 1100 MHz), Ubuntu 20.04 CL 2-pipe/large/sleep Mode CPU iGPU Summary Power Efficiency 521, 7 threads 124 150 274 15 18.27 521, 8 threads 127 150 277 15 18.47 521, iGPU only 0 184 184 15 12.27 CPU only 181 0 181 15 12.07 Sleep, 8 threads 135 148 283 15 18.87 iGPU only, sleep 0 186 186 15 12.40 iGPU only, large 0 186 187 15 12.47 [1.022 efficiency improvement, "sleep" is optimal] Core i7-9700K (95W, 8C8T, 14 nm, 3.6-4.9 GHz, Intel UHD Graphics 630 [gen9] 1200 MHz), Windows 10 20H2 CL 2-pipe/large/sleep Mode CPU iGPU Summary Power Efficiency 521, 8 threads 480 92 572 95 6.02 521, 7 threads 406 187 593 95 6.24 Sleep, 8 threads 457 178 635 95 6.68 Sleep, 7 threads 403 188 591 95 6.22 CPU only 473 0 473 95 4.98 iGPU only, sleep 0 188 188 22 8.55 iGPU only, large 0 190 190 44 4.32 [1.071 efficiency improvement, "sleep" is optimal] Core i5-5200U (15W, 2C4T, 14 nm, 2.2-2.7 GHz, Intel HD Graphics 5500 [gen8] 900 MHz) NVidia GeForce 820M 2048 MB, ForceWare 382.05 Windows 10 20H2 CL 4-pipe/large/sleep Mode CPU iGPU dGPU Summary Power* Efficiency* 521, 4 threads 66 59 0 125 15 8.33 521, 3 threads 63 167 0 230 21.4 10.75 521, 3 threads, CUDA 29 161 89 279 15* * CPU only 67 0 0 67 10.2 6.57 Sleep, 4 threads 71 168 0 239 21.4 11.17 Large, 4 threads 68 172 0 240 21.4 11.21 iGPU only, sleep 0 173 0 173 13.5 12.81 iGPU only, large 0 175 0 175 13.5 12.96 dGPU only, sleep 0 0 123 123 1.3* * dGPU only, large 0 0 134 134 8.3* * Sleep, 4 threads, dG 42 153 119 314 15* * Custom**, 4 threads 41 155 120 316 15* * *dGPU is not included in power measurements **Custom - "large" for iGPU (gen8 driver idles CPU itself), "sleep" for dGPU [1.043 efficiency improvement, "large" is optimal for iGPU] [CPU+iGPU+dGPU: 1.133 performance improvement, "sleep" is optimal for dGPU] "-bench" Intel UHD Graphics 620 [gen9] 1100 MHz (Core i5-8265U) RC5-72: using core #0 (CL ANSI 1-pipe). 
RC5-72: Benchmark for core #0 (CL ANSI 1-pipe) 0.00:00:16.14 [113,990,283 keys/sec] RC5-72: using core #1 (CL 1-pipe). RC5-72: Benchmark for core #1 (CL 1-pipe) 0.00:00:16.32 [187,106,455 keys/sec] RC5-72: using core #2 (CL 2-pipe). RC5-72: Benchmark for core #2 (CL 2-pipe) 0.00:00:16.92 [184,015,486 keys/sec] RC5-72: using core #3 (CL 4-pipe). RC5-72: Benchmark for core #3 (CL 4-pipe) 0.00:00:16.80 [166,416,580 keys/sec] RC5-72: using core #4 (CL 1-pipe large). RC5-72: Benchmark for core #4 (CL 1-pipe large) 0.00:00:16.80 [184,818,394 keys/sec] RC5-72: using core #5 (CL 2-pipe large). RC5-72: Benchmark for core #5 (CL 2-pipe large) 0.00:00:16.81 [188,636,921 keys/sec] RC5-72: using core #6 (CL 4-pipe large). RC5-72: Benchmark for core #6 (CL 4-pipe large) 0.00:00:16.61 [170,029,327 keys/sec] RC5-72: using core #7 (CL 1-pipe sleep). RC5-72: Benchmark for core #7 (CL 1-pipe sleep) 0.00:00:16.05 [189,540,521 keys/sec] RC5-72: using core #8 (CL 2-pipe sleep). RC5-72: Benchmark for core #8 (CL 2-pipe sleep) 0.00:00:17.02 [192,711,899 keys/sec] RC5-72: using core #9 (CL 4-pipe sleep). RC5-72: Benchmark for core #9 (CL 4-pipe sleep) 0.00:00:16.93 [174,570,008 keys/sec] RC5-72 benchmark summary : Default core : #-1 (undefined) 0 keys/sec Fastest core : #8 (CL 2-pipe sleep) 192,711,899 keys/sec "-bench" Intel UHD Graphics 630 [gen9] 1200 MHz (Core i7-9700K) RC5-72: using core #0 (CL ANSI 1-pipe). RC5-72: Benchmark for core #0 (CL ANSI 1-pipe) 0.00:00:16.96 [124,370,534 keys/sec] RC5-72: using core #1 (CL 1-pipe). RC5-72: Benchmark for core #1 (CL 1-pipe) 0.00:00:16.84 [186,580,220 keys/sec] RC5-72: using core #2 (CL 2-pipe). RC5-72: Benchmark for core #2 (CL 2-pipe) 0.00:00:16.76 [189,445,953 keys/sec] RC5-72: using core #3 (CL 4-pipe). RC5-72: Benchmark for core #3 (CL 4-pipe) 0.00:00:16.53 [172,042,275 keys/sec] RC5-72: using core #4 (CL 1-pipe large). 
RC5-72: Benchmark for core #4 (CL 1-pipe large) 0.00:00:16.10 [191,761,686 keys/sec] RC5-72: using core #5 (CL 2-pipe large). RC5-72: Benchmark for core #5 (CL 2-pipe large) 0.00:00:16.84 [192,842,719 keys/sec] RC5-72: using core #6 (CL 4-pipe large). RC5-72: Benchmark for core #6 (CL 4-pipe large) 0.00:00:16.59 [176,169,744 keys/sec] RC5-72: using core #7 (CL 1-pipe sleep). RC5-72: Benchmark for core #7 (CL 1-pipe sleep) 0.00:00:16.59 [183,669,420 keys/sec] RC5-72: using core #8 (CL 2-pipe sleep). RC5-72: Benchmark for core #8 (CL 2-pipe sleep) 0.00:00:16.57 [186,548,997 keys/sec] RC5-72: using core #9 (CL 4-pipe sleep). RC5-72: Benchmark for core #9 (CL 4-pipe sleep) 0.00:00:16.35 [169,087,725 keys/sec] RC5-72 benchmark summary : Default core : #-1 (undefined) 0 keys/sec Fastest core : #5 (CL 2-pipe large) 192,842,719 keys/sec "-bench" Intel HD Graphics 5500 [gen8] 900 MHz (Core i5-5200U) RC5-72: using core #0 (CL ANSI 1-pipe). RC5-72: Benchmark for core #0 (CL ANSI 1-pipe) 0.00:00:16.15 [9,209,485 keys/sec] RC5-72: using core #1 (CL 1-pipe). RC5-72: Benchmark for core #1 (CL 1-pipe) 0.00:00:16.06 [168,667,029 keys/sec] RC5-72: using core #2 (CL 2-pipe). RC5-72: Benchmark for core #2 (CL 2-pipe) 0.00:00:16.81 [168,043,318 keys/sec] RC5-72: using core #3 (CL 4-pipe). RC5-72: Benchmark for core #3 (CL 4-pipe) 0.00:00:17.03 [171,313,110 keys/sec] RC5-72: using core #4 (CL 1-pipe large). RC5-72: Benchmark for core #4 (CL 1-pipe large) 0.00:00:16.86 [173,663,198 keys/sec] RC5-72: using core #5 (CL 2-pipe large). RC5-72: Benchmark for core #5 (CL 2-pipe large) 0.00:00:17.06 [177,573,667 keys/sec] RC5-72: using core #6 (CL 4-pipe large). RC5-72: Benchmark for core #6 (CL 4-pipe large) 0.00:00:16.70 [176,852,285 keys/sec] RC5-72: using core #7 (CL 1-pipe sleep). RC5-72: Benchmark for core #7 (CL 1-pipe sleep) 0.00:00:16.51 [166,997,768 keys/sec] RC5-72: using core #8 (CL 2-pipe sleep). 
RC5-72: Benchmark for core #8 (CL 2-pipe sleep) 0.00:00:16.59 [168,755,292 keys/sec] RC5-72: using core #9 (CL 4-pipe sleep). RC5-72: Benchmark for core #9 (CL 4-pipe sleep) 0.00:00:16.64 [170,413,224 keys/sec] RC5-72 benchmark summary : Default core : #-1 (undefined) 0 keys/sec Fastest core : #5 (CL 2-pipe large) 177,573,667 keys/sec "-bench" NVidia GeForce 820M 2048 MB, ForceWare 382.05 RC5-72: using core #0 (CL ANSI 1-pipe). RC5-72: Benchmark for core #0 (CL ANSI 1-pipe) 0.00:00:16.20 [102,620,050 keys/sec] RC5-72: using core #1 (CL 1-pipe). RC5-72: Benchmark for core #1 (CL 1-pipe) 0.00:00:16.98 [129,678,653 keys/sec] RC5-72: using core #2 (CL 2-pipe). RC5-72: Benchmark for core #2 (CL 2-pipe) 0.00:00:16.95 [123,092,851 keys/sec] RC5-72: using core #3 (CL 4-pipe). RC5-72: Benchmark for core #3 (CL 4-pipe) 0.00:00:16.98 [78,567,847 keys/sec] RC5-72: using core #4 (CL 1-pipe large). RC5-72: Benchmark for core #4 (CL 1-pipe large) 0.00:00:17.03 [135,449,921 keys/sec] RC5-72: using core #5 (CL 2-pipe large). RC5-72: Benchmark for core #5 (CL 2-pipe large) 0.00:00:16.89 [128,422,603 keys/sec] RC5-72: using core #6 (CL 4-pipe large). RC5-72: Benchmark for core #6 (CL 4-pipe large) 0.00:00:16.43 [78,558,193 keys/sec] RC5-72: using core #7 (CL 1-pipe sleep). RC5-72: Benchmark for core #7 (CL 1-pipe sleep) 0.00:00:16.65 [127,347,752 keys/sec] RC5-72: using core #8 (CL 2-pipe sleep). RC5-72: Benchmark for core #8 (CL 2-pipe sleep) 0.00:00:16.10 [117,091,782 keys/sec] RC5-72: using core #9 (CL 4-pipe sleep). 
RC5-72: Benchmark for core #9 (CL 4-pipe sleep) 0.00:00:16.14 [71,550,849 keys/sec] RC5-72 benchmark summary : Default core : #-1 (undefined) 0 keys/sec Fastest core : #4 (CL 1-pipe large) 135,449,921 keys/sec --- common/core_r72.cpp | 36 +++++++++++ plat/opencl/ocl_context.h | 1 + rc5-72/opencl/ocl_1pipe.cpp | 119 +++++++++++++++++++++++++++++++++++- 3 files changed, 153 insertions(+), 3 deletions(-) diff --git a/common/core_r72.cpp b/common/core_r72.cpp index b85b89edc..c3ad17ac2 100644 --- a/common/core_r72.cpp +++ b/common/core_r72.cpp @@ -114,6 +114,12 @@ extern "C" s32 rc5_72_unit_func_ocl_ref (RC5_72UnitWork *rc5_72unitwork, u32 *it extern "C" s32 rc5_72_unit_func_ocl_1pipe (RC5_72UnitWork *rc5_72unitwork, u32 *iterations, void *); extern "C" s32 rc5_72_unit_func_ocl_2pipe (RC5_72UnitWork *rc5_72unitwork, u32 *iterations, void *); extern "C" s32 rc5_72_unit_func_ocl_4pipe (RC5_72UnitWork *rc5_72unitwork, u32 *iterations, void *); +extern "C" s32 rc5_72_unit_func_ocl_1pipe_large (RC5_72UnitWork *rc5_72unitwork, u32 *iterations, void *); +extern "C" s32 rc5_72_unit_func_ocl_2pipe_large (RC5_72UnitWork *rc5_72unitwork, u32 *iterations, void *); +extern "C" s32 rc5_72_unit_func_ocl_4pipe_large (RC5_72UnitWork *rc5_72unitwork, u32 *iterations, void *); +extern "C" s32 rc5_72_unit_func_ocl_1pipe_sleep (RC5_72UnitWork *rc5_72unitwork, u32 *iterations, void *); +extern "C" s32 rc5_72_unit_func_ocl_2pipe_sleep (RC5_72UnitWork *rc5_72unitwork, u32 *iterations, void *); +extern "C" s32 rc5_72_unit_func_ocl_4pipe_sleep (RC5_72UnitWork *rc5_72unitwork, u32 *iterations, void *); #elif (CLIENT_CPU == CPU_ARM64) extern "C" s32 rc5_72_unit_func_scalarfusion(RC5_72UnitWork *rc5_72unitwork, u32 *iterations, void *); #endif @@ -240,6 +246,12 @@ const char **corenames_for_contest_rc572() "CL 1-pipe", "CL 2-pipe", "CL 4-pipe", + "CL 1-pipe large", + "CL 2-pipe large", + "CL 4-pipe large", + "CL 1-pipe sleep", + "CL 2-pipe sleep", + "CL 4-pipe sleep", #else "ANSI 4-pipe", "ANSI 
2-pipe", @@ -1026,6 +1038,30 @@ int selcoreSelectCore_rc572(Client *client, unsigned int threadindex, unit_func.gen_72 = rc5_72_unit_func_ocl_4pipe; pipeline_count = 4; break; + case 4: + unit_func.gen_72 = rc5_72_unit_func_ocl_1pipe_large; + pipeline_count = 1; + break; + case 5: + unit_func.gen_72 = rc5_72_unit_func_ocl_2pipe_large; + pipeline_count = 2; + break; + case 6: + unit_func.gen_72 = rc5_72_unit_func_ocl_4pipe_large; + pipeline_count = 4; + break; + case 7: + unit_func.gen_72 = rc5_72_unit_func_ocl_1pipe_sleep; + pipeline_count = 1; + break; + case 8: + unit_func.gen_72 = rc5_72_unit_func_ocl_2pipe_sleep; + pipeline_count = 2; + break; + case 9: + unit_func.gen_72 = rc5_72_unit_func_ocl_4pipe_sleep; + pipeline_count = 4; + break; // ----------- #else /* the ansi cores */ diff --git a/plat/opencl/ocl_context.h b/plat/opencl/ocl_context.h index c5f7236e7..b6d7b9cd4 100644 --- a/plat/opencl/ocl_context.h +++ b/plat/opencl/ocl_context.h @@ -34,6 +34,7 @@ typedef struct { u32 runSize; u32 runSizeMultiplier; u32 maxWorkSize; + unsigned long long estimatedPerf; //keys per second per pipe } ocl_context_t; ocl_context_t *ocl_get_context(int device); diff --git a/rc5-72/opencl/ocl_1pipe.cpp b/rc5-72/opencl/ocl_1pipe.cpp index 377e13ac9..ac7f17fc3 100644 --- a/rc5-72/opencl/ocl_1pipe.cpp +++ b/rc5-72/opencl/ocl_1pipe.cpp @@ -6,11 +6,16 @@ * $Id: */ +#ifndef ANOTHER_PASS + #include "ocl_common.h" #include #include #include +#define __SLEEP_FOR_POLLING__ +#include "sleepdef.h" + #include "rc5-1pipe.cpp" #include "rc5-2pipe.cpp" #include "rc5-4pipe.cpp" @@ -273,6 +278,12 @@ static bool selftest(ocl_context_t *cont) extern "C" s32 rc5_72_unit_func_ocl_1pipe (RC5_72UnitWork *rc5_72unitwork, u32 *iterations, void *); extern "C" s32 rc5_72_unit_func_ocl_2pipe (RC5_72UnitWork *rc5_72unitwork, u32 *iterations, void *); extern "C" s32 rc5_72_unit_func_ocl_4pipe (RC5_72UnitWork *rc5_72unitwork, u32 *iterations, void *); +extern "C" s32 rc5_72_unit_func_ocl_1pipe_large 
(RC5_72UnitWork *rc5_72unitwork, u32 *iterations, void *); +extern "C" s32 rc5_72_unit_func_ocl_2pipe_large (RC5_72UnitWork *rc5_72unitwork, u32 *iterations, void *); +extern "C" s32 rc5_72_unit_func_ocl_4pipe_large (RC5_72UnitWork *rc5_72unitwork, u32 *iterations, void *); +extern "C" s32 rc5_72_unit_func_ocl_1pipe_sleep (RC5_72UnitWork *rc5_72unitwork, u32 *iterations, void *); +extern "C" s32 rc5_72_unit_func_ocl_2pipe_sleep (RC5_72UnitWork *rc5_72unitwork, u32 *iterations, void *); +extern "C" s32 rc5_72_unit_func_ocl_4pipe_sleep (RC5_72UnitWork *rc5_72unitwork, u32 *iterations, void *); #endif /* some static flags which are set on per-core basis */ @@ -282,7 +293,29 @@ struct core_static_flags bool profilingErr; }; -static s32 rc5_72_unit_func_ocl_npipe(RC5_72UnitWork *rc5_72unitwork, u32 *iterations, +#define FLOAT_INTERNAL_HELPER(x) x##. +#define FLOAT_HELPER(x) FLOAT_INTERNAL_HELPER(x) +#define MIN_DESIRED_UNIT_MS 8. +#define MAX_DESIRED_UNIT_MS_INT 12 +#define MAX_DESIRED_UNIT_MS FLOAT_HELPER(MAX_DESIRED_UNIT_MS_INT) +#define MAX_SLEEP_MKS (2 * (1000 * MAX_DESIRED_UNIT_MS_INT * SCALING_RATIO)) +#define MIN_SLEEP_MKS 1000 + +#endif /* ANOTHER_PASS */ + +#ifndef SCALING_RATIO +#define SCALING_RATIO 1 +#endif /* SCALING_RATIO */ + +static s32 +#if defined(USE_SLEEP) +rc5_72_unit_func_ocl_npipe_sleep +#elif SCALING_RATIO > 1 +rc5_72_unit_func_ocl_npipe_large +#else +rc5_72_unit_func_ocl_npipe +#endif +(RC5_72UnitWork *rc5_72unitwork, u32 *iterations, unsigned core_ID, unsigned pipes_count, const char *core_program, const char *program_entry, struct core_static_flags *static_flags) @@ -368,6 +401,28 @@ static s32 rc5_72_unit_func_ocl_npipe(RC5_72UnitWork *rc5_72unitwork, u32 *itera return -1; //err } +#ifdef USE_SLEEP + /* Sleep is useless w/o flush. 
*/ + status = clFlush(cont->cmdQueue); + if (ocl_diagnose(status, "flushing work", cont) != CL_SUCCESS) + { + RaiseExitRequestTrigger(); + return -1; //err + } + + unsigned sleepTimeMks; + if (cont->estimatedPerf) { + sleepTimeMks = (unsigned)(1000000ULL * rest0 / cont->estimatedPerf); + if (sleepTimeMks > MAX_SLEEP_MKS) + sleepTimeMks = MAX_SLEEP_MKS; + else if (sleepTimeMks < MIN_SLEEP_MKS) + sleepTimeMks = MIN_SLEEP_MKS; + } else + sleepTimeMks = MIN_SLEEP_MKS; + + usleep(sleepTimeMks); +#endif /* USE_SLEEP */ + // wait for the kernel call to finish execution status = clWaitForEvents(1, &ndrEvt); if (ocl_diagnose(status, "waiting for event", cont) != CL_SUCCESS) @@ -404,7 +459,7 @@ static s32 rc5_72_unit_func_ocl_npipe(RC5_72UnitWork *rc5_72unitwork, u32 *itera d = 10; } - if (d > 12.) + if (d > MAX_DESIRED_UNIT_MS * SCALING_RATIO) { //Decrease worksize by 5% u32 diffm = cont->runSize / 20 / cont->runSizeMultiplier; @@ -414,7 +469,7 @@ static s32 rc5_72_unit_func_ocl_npipe(RC5_72UnitWork *rc5_72unitwork, u32 *itera cont->runSize -= diffm*cont->runSizeMultiplier; //Log("Down:Time: %f, runsize=%u\n", float(d), cont->runSize); } else - if ((d < 8.) 
&& (rest0 == cont->runSize)) + if ((d < MIN_DESIRED_UNIT_MS * SCALING_RATIO) && (rest0 == cont->runSize)) { u32 diffm = cont->runSize / 20 / cont->runSizeMultiplier; if (diffm == 0) @@ -424,6 +479,8 @@ static s32 rc5_72_unit_func_ocl_npipe(RC5_72UnitWork *rc5_72unitwork, u32 *itera //Log("Up:Time: %f, runsize=%u, diff=%u\n", float(d), cont->runSize, diffm*cont->runSizeMultiplier); } + cont->estimatedPerf = (unsigned long long)(rest0 / (d / 1000)); + key_incr(&tmp_unit.L0.hi, &tmp_unit.L0.mid, &tmp_unit.L0.lo, rest0 * pipes_count); iter_offset += rest0 * pipes_count; } @@ -488,6 +545,19 @@ static s32 rc5_72_unit_func_ocl_npipe(RC5_72UnitWork *rc5_72unitwork, u32 *itera return RESULT_NOTHING; } +#ifndef ANOTHER_PASS + +#define ANOTHER_PASS + +#undef SCALING_RATIO +#define SCALING_RATIO 100 /* Higher performance, especially with USE_SLEEP. */ + +#include "ocl_1pipe.cpp" + +#define USE_SLEEP /* Required for low CPU usage on some hardware/drivers. */ + +#include "ocl_1pipe.cpp" + s32 rc5_72_unit_func_ocl_1pipe(RC5_72UnitWork *rc5_72unitwork, u32 *iterations, void *) { static struct core_static_flags flags; @@ -508,3 +578,46 @@ s32 rc5_72_unit_func_ocl_4pipe(RC5_72UnitWork *rc5_72unitwork, u32 *iterations, return rc5_72_unit_func_ocl_npipe(rc5_72unitwork, iterations, CORE_4PIPE, 4, ocl_rc572_4pipe_src, "ocl_rc572_4pipe", &flags); } + +s32 rc5_72_unit_func_ocl_1pipe_large(RC5_72UnitWork *rc5_72unitwork, u32 *iterations, void *) +{ + static struct core_static_flags flags; + + return rc5_72_unit_func_ocl_npipe_large(rc5_72unitwork, iterations, CORE_1PIPE, 1, ocl_rc572_1pipe_src, "ocl_rc572_1pipe", &flags); +} + +s32 rc5_72_unit_func_ocl_2pipe_large(RC5_72UnitWork *rc5_72unitwork, u32 *iterations, void *) +{ + static struct core_static_flags flags; + + return rc5_72_unit_func_ocl_npipe_large(rc5_72unitwork, iterations, CORE_2PIPE, 2, ocl_rc572_2pipe_src, "ocl_rc572_2pipe", &flags); +} + +s32 rc5_72_unit_func_ocl_4pipe_large(RC5_72UnitWork *rc5_72unitwork, u32 *iterations, 
void *) +{ + static struct core_static_flags flags; + + return rc5_72_unit_func_ocl_npipe_large(rc5_72unitwork, iterations, CORE_4PIPE, 4, ocl_rc572_4pipe_src, "ocl_rc572_4pipe", &flags); +} + +s32 rc5_72_unit_func_ocl_1pipe_sleep(RC5_72UnitWork *rc5_72unitwork, u32 *iterations, void *) +{ + static struct core_static_flags flags; + + return rc5_72_unit_func_ocl_npipe_sleep(rc5_72unitwork, iterations, CORE_1PIPE, 1, ocl_rc572_1pipe_src, "ocl_rc572_1pipe", &flags); +} + +s32 rc5_72_unit_func_ocl_2pipe_sleep(RC5_72UnitWork *rc5_72unitwork, u32 *iterations, void *) +{ + static struct core_static_flags flags; + + return rc5_72_unit_func_ocl_npipe_sleep(rc5_72unitwork, iterations, CORE_2PIPE, 2, ocl_rc572_2pipe_src, "ocl_rc572_2pipe", &flags); +} + +s32 rc5_72_unit_func_ocl_4pipe_sleep(RC5_72UnitWork *rc5_72unitwork, u32 *iterations, void *) +{ + static struct core_static_flags flags; + + return rc5_72_unit_func_ocl_npipe_sleep(rc5_72unitwork, iterations, CORE_4PIPE, 4, ocl_rc572_4pipe_src, "ocl_rc572_4pipe", &flags); +} +#endif /* ANOTHER_PASS */