From 44b1c98e129bf3062b8f4331772a7b6959e50f25 Mon Sep 17 00:00:00 2001
From: Sergey Nikiforov <void234@gmail.com>
Date: Wed, 24 Feb 2021 00:06:27 +0300
Subject: [PATCH] "Large" and "sleep" versions of "CL N-pipe"

Polling the GPU for results is inefficient: it wastes CPU time and
(in the case of dGPUs) PCIe bandwidth, especially when the CPU is
powerful while the (i)GPU is not. The original "CL N-pipe" cores and
the OpenCL kernels are not touched; only the scheduling code is
modified, adding cores that permit 100 times larger work units
("CL 1-pipe large" etc.) and cores that additionally flush the
assignment to the GPU and put the CPU to sleep
("CL 1-pipe sleep" etc.).

"Large" cores are marginally faster than original ones.
"Sleep" cores are slightly slower than "large" ones because GPU may
sometimes finish processing a work unit while the CPU is still asleep.
The "sleep" cores, however, consume zero CPU time. All other cores
occupy one logical CPU unless the GPU driver transparently sleeps
while waiting; Intel's driver does this for gen8 but not for newer
GPUs, and it only helps if the work unit is large enough for the CPU
to sleep for several milliseconds. Sleeping results in higher power
efficiency and, if we are not limited by TDP, a significant
performance improvement. The effect is more pronounced when the CPU
does not support multithreading (MT).

Note that with "sleep" cores there is no need to manually limit
the number of threads for the CPU cruncher.

Performance/efficiency can be further improved by growing the work
unit size faster. Wider testing and benchmarking (especially on
high-end GPUs) are welcome.

The benchmarks below were performed with the CPU loaded by the
2.9116.525-amd64 client, core #4 (YK AVX2).
The CUDA client is 2.9110.519b, core #10 (CUDA 1-pipe 64-thd sleep 100us).
"521" refers to 2.9112.521 dnetc-win32-x86-opencl.zip/
  dnetc-linux-amd64-opencl.tar.gz
Power consumption was "measured" with "Core Temp" / "s-tui".

Core i5-8265U (15W, 4C8T, 14 nm, 1.6-3.9 GHz,
  Intel UHD Graphics 620 [gen9] 1100 MHz), Ubuntu 20.04
CL 2-pipe/large/sleep

Mode               CPU     iGPU   Summary Power Efficiency
521, 7 threads     124      150       274    15   18.27
521, 8 threads     127      150       277    15   18.47
521, iGPU only       0      184       184    15   12.27
CPU only           181        0       181    15   12.07
Sleep, 8 threads   135      148       283    15   18.87
iGPU only, sleep     0      186       186    15   12.40
iGPU only, large     0      186       187    15   12.47
[1.022 efficiency improvement, "sleep" is optimal]

Core i7-9700K (95W, 8C8T, 14 nm, 3.6-4.9 GHz,
  Intel UHD Graphics 630 [gen9] 1200 MHz), Windows 10 20H2
CL 2-pipe/large/sleep

Mode               CPU     iGPU   Summary Power Efficiency
521, 8 threads     480       92       572    95    6.02
521, 7 threads     406      187       593    95    6.24
Sleep, 8 threads   457      178       635    95    6.68
Sleep, 7 threads   403      188       591    95    6.22
CPU only           473        0       473    95    4.98
iGPU only, sleep     0      188       188    22    8.55
iGPU only, large     0      190       190    44    4.32
<Note terrible power efficiency of polling - "large" vs "sleep">
[1.071 efficiency improvement, "sleep" is optimal]

Core i5-5200U (15W, 2C4T, 14 nm, 2.2-2.7 GHz,
  Intel HD Graphics 5500 [gen8] 900 MHz)
NVidia GeForce 820M 2048 MB, ForceWare 382.05
Windows 10 20H2
CL 4-pipe/large/sleep

Mode                CPU     iGPU   dGPU Summary Power* Efficiency*
521, 4 threads       66       59      0     125    15    8.33
521, 3 threads       63      167      0     230  21.4   10.75
521, 3 threads, CUDA 29      161     89     279   15*       *
CPU only             67        0      0      67  10.2    6.57
Sleep, 4 threads     71      168      0     239  21.4   11.17
Large, 4 threads     68      172      0     240  21.4   11.21
iGPU only, sleep      0      173      0     173  13.5   12.81
iGPU only, large      0      175      0     175  13.5   12.96
dGPU only, sleep      0        0    123     123  1.3*       *
dGPU only, large      0        0    134     134  8.3*       *
Sleep, 4 threads, dG 42      153    119     314   15*       *
Custom**, 4 threads  41      155    120     316   15*       *

*dGPU is not included in power measurements
**Custom - "large" for iGPU (the gen8 driver idles the CPU by itself),
  "sleep" for dGPU
[1.043 efficiency improvement, "large" is optimal for iGPU]
[CPU+iGPU+dGPU: 1.133 performance improvement, "sleep" is optimal for dGPU]

"-bench" Intel UHD Graphics 620 [gen9] 1100 MHz (Core i5-8265U)

RC5-72: using core #0 (CL ANSI 1-pipe).
RC5-72: Benchmark for core #0 (CL ANSI 1-pipe)
0.00:00:16.14 [113,990,283 keys/sec]
RC5-72: using core #1 (CL 1-pipe).
RC5-72: Benchmark for core #1 (CL 1-pipe)
0.00:00:16.32 [187,106,455 keys/sec]
RC5-72: using core #2 (CL 2-pipe).
RC5-72: Benchmark for core #2 (CL 2-pipe)
0.00:00:16.92 [184,015,486 keys/sec]
RC5-72: using core #3 (CL 4-pipe).
RC5-72: Benchmark for core #3 (CL 4-pipe)
0.00:00:16.80 [166,416,580 keys/sec]
RC5-72: using core #4 (CL 1-pipe large).
RC5-72: Benchmark for core #4 (CL 1-pipe large)
0.00:00:16.80 [184,818,394 keys/sec]
RC5-72: using core #5 (CL 2-pipe large).
RC5-72: Benchmark for core #5 (CL 2-pipe large)
0.00:00:16.81 [188,636,921 keys/sec]
RC5-72: using core #6 (CL 4-pipe large).
RC5-72: Benchmark for core #6 (CL 4-pipe large)
0.00:00:16.61 [170,029,327 keys/sec]
RC5-72: using core #7 (CL 1-pipe sleep).
RC5-72: Benchmark for core #7 (CL 1-pipe sleep)
0.00:00:16.05 [189,540,521 keys/sec]
RC5-72: using core #8 (CL 2-pipe sleep).
RC5-72: Benchmark for core #8 (CL 2-pipe sleep)
0.00:00:17.02 [192,711,899 keys/sec]
RC5-72: using core #9 (CL 4-pipe sleep).
RC5-72: Benchmark for core #9 (CL 4-pipe sleep)
0.00:00:16.93 [174,570,008 keys/sec]
RC5-72 benchmark summary :
Default core : #-1 (undefined) 0 keys/sec
Fastest core : #8 (CL 2-pipe sleep) 192,711,899 keys/sec

"-bench" Intel UHD Graphics 630 [gen9] 1200 MHz (Core i7-9700K)

RC5-72: using core #0 (CL ANSI 1-pipe).
RC5-72: Benchmark for core #0 (CL ANSI 1-pipe)
0.00:00:16.96 [124,370,534 keys/sec]
RC5-72: using core #1 (CL 1-pipe).
RC5-72: Benchmark for core #1 (CL 1-pipe)
0.00:00:16.84 [186,580,220 keys/sec]
RC5-72: using core #2 (CL 2-pipe).
RC5-72: Benchmark for core #2 (CL 2-pipe)
0.00:00:16.76 [189,445,953 keys/sec]
RC5-72: using core #3 (CL 4-pipe).
RC5-72: Benchmark for core #3 (CL 4-pipe)
0.00:00:16.53 [172,042,275 keys/sec]
RC5-72: using core #4 (CL 1-pipe large).
RC5-72: Benchmark for core #4 (CL 1-pipe large)
0.00:00:16.10 [191,761,686 keys/sec]
RC5-72: using core #5 (CL 2-pipe large).
RC5-72: Benchmark for core #5 (CL 2-pipe large)
0.00:00:16.84 [192,842,719 keys/sec]
RC5-72: using core #6 (CL 4-pipe large).
RC5-72: Benchmark for core #6 (CL 4-pipe large)
0.00:00:16.59 [176,169,744 keys/sec]
RC5-72: using core #7 (CL 1-pipe sleep).
RC5-72: Benchmark for core #7 (CL 1-pipe sleep)
0.00:00:16.59 [183,669,420 keys/sec]
RC5-72: using core #8 (CL 2-pipe sleep).
RC5-72: Benchmark for core #8 (CL 2-pipe sleep)
0.00:00:16.57 [186,548,997 keys/sec]
RC5-72: using core #9 (CL 4-pipe sleep).
RC5-72: Benchmark for core #9 (CL 4-pipe sleep)
0.00:00:16.35 [169,087,725 keys/sec]
RC5-72 benchmark summary :
Default core : #-1 (undefined) 0 keys/sec
Fastest core : #5 (CL 2-pipe large) 192,842,719 keys/sec

"-bench" Intel HD Graphics 5500 [gen8] 900 MHz (Core i5-5200U)

RC5-72: using core #0 (CL ANSI 1-pipe).
RC5-72: Benchmark for core #0 (CL ANSI 1-pipe)
0.00:00:16.15 [9,209,485 keys/sec]
RC5-72: using core #1 (CL 1-pipe).
RC5-72: Benchmark for core #1 (CL 1-pipe)
0.00:00:16.06 [168,667,029 keys/sec]
RC5-72: using core #2 (CL 2-pipe).
RC5-72: Benchmark for core #2 (CL 2-pipe)
0.00:00:16.81 [168,043,318 keys/sec]
RC5-72: using core #3 (CL 4-pipe).
RC5-72: Benchmark for core #3 (CL 4-pipe)
0.00:00:17.03 [171,313,110 keys/sec]
RC5-72: using core #4 (CL 1-pipe large).
RC5-72: Benchmark for core #4 (CL 1-pipe large)
0.00:00:16.86 [173,663,198 keys/sec]
RC5-72: using core #5 (CL 2-pipe large).
RC5-72: Benchmark for core #5 (CL 2-pipe large)
0.00:00:17.06 [177,573,667 keys/sec]
RC5-72: using core #6 (CL 4-pipe large).
RC5-72: Benchmark for core #6 (CL 4-pipe large)
0.00:00:16.70 [176,852,285 keys/sec]
RC5-72: using core #7 (CL 1-pipe sleep).
RC5-72: Benchmark for core #7 (CL 1-pipe sleep)
0.00:00:16.51 [166,997,768 keys/sec]
RC5-72: using core #8 (CL 2-pipe sleep).
RC5-72: Benchmark for core #8 (CL 2-pipe sleep)
0.00:00:16.59 [168,755,292 keys/sec]
RC5-72: using core #9 (CL 4-pipe sleep).
RC5-72: Benchmark for core #9 (CL 4-pipe sleep)
0.00:00:16.64 [170,413,224 keys/sec]
RC5-72 benchmark summary :
Default core : #-1 (undefined) 0 keys/sec
Fastest core : #5 (CL 2-pipe large) 177,573,667 keys/sec

"-bench" NVidia GeForce 820M 2048 MB, ForceWare 382.05

RC5-72: using core #0 (CL ANSI 1-pipe).
RC5-72: Benchmark for core #0 (CL ANSI 1-pipe)
0.00:00:16.20 [102,620,050 keys/sec]
RC5-72: using core #1 (CL 1-pipe).
RC5-72: Benchmark for core #1 (CL 1-pipe)
0.00:00:16.98 [129,678,653 keys/sec]
RC5-72: using core #2 (CL 2-pipe).
RC5-72: Benchmark for core #2 (CL 2-pipe)
0.00:00:16.95 [123,092,851 keys/sec]
RC5-72: using core #3 (CL 4-pipe).
RC5-72: Benchmark for core #3 (CL 4-pipe)
0.00:00:16.98 [78,567,847 keys/sec]
RC5-72: using core #4 (CL 1-pipe large).
RC5-72: Benchmark for core #4 (CL 1-pipe large)
0.00:00:17.03 [135,449,921 keys/sec]
RC5-72: using core #5 (CL 2-pipe large).
RC5-72: Benchmark for core #5 (CL 2-pipe large)
0.00:00:16.89 [128,422,603 keys/sec]
RC5-72: using core #6 (CL 4-pipe large).
RC5-72: Benchmark for core #6 (CL 4-pipe large)
0.00:00:16.43 [78,558,193 keys/sec]
RC5-72: using core #7 (CL 1-pipe sleep).
RC5-72: Benchmark for core #7 (CL 1-pipe sleep)
0.00:00:16.65 [127,347,752 keys/sec]
RC5-72: using core #8 (CL 2-pipe sleep).
RC5-72: Benchmark for core #8 (CL 2-pipe sleep)
0.00:00:16.10 [117,091,782 keys/sec]
RC5-72: using core #9 (CL 4-pipe sleep).
RC5-72: Benchmark for core #9 (CL 4-pipe sleep)
0.00:00:16.14 [71,550,849 keys/sec]
RC5-72 benchmark summary :
Default core : #-1 (undefined) 0 keys/sec
Fastest core : #4 (CL 1-pipe large) 135,449,921 keys/sec
---
 common/core_r72.cpp         |  36 +++++++++++
 plat/opencl/ocl_context.h   |   1 +
 rc5-72/opencl/ocl_1pipe.cpp | 119 +++++++++++++++++++++++++++++++++++-
 3 files changed, 153 insertions(+), 3 deletions(-)
diff --git a/common/core_r72.cpp b/common/core_r72.cpp
index b85b89edc..c3ad17ac2 100644
--- a/common/core_r72.cpp
+++ b/common/core_r72.cpp
@@ -114,6 +114,12 @@ extern "C" s32 rc5_72_unit_func_ocl_ref (RC5_72UnitWork *rc5_72unitwork, u32 *it
 extern "C" s32 rc5_72_unit_func_ocl_1pipe (RC5_72UnitWork *rc5_72unitwork, u32 *iterations, void *);
 extern "C" s32 rc5_72_unit_func_ocl_2pipe (RC5_72UnitWork *rc5_72unitwork, u32 *iterations, void *);
 extern "C" s32 rc5_72_unit_func_ocl_4pipe (RC5_72UnitWork *rc5_72unitwork, u32 *iterations, void *);
+extern "C" s32 rc5_72_unit_func_ocl_1pipe_large (RC5_72UnitWork *rc5_72unitwork, u32 *iterations, void *);
+extern "C" s32 rc5_72_unit_func_ocl_2pipe_large (RC5_72UnitWork *rc5_72unitwork, u32 *iterations, void *);
+extern "C" s32 rc5_72_unit_func_ocl_4pipe_large (RC5_72UnitWork *rc5_72unitwork, u32 *iterations, void *);
+extern "C" s32 rc5_72_unit_func_ocl_1pipe_sleep (RC5_72UnitWork *rc5_72unitwork, u32 *iterations, void *);
+extern "C" s32 rc5_72_unit_func_ocl_2pipe_sleep (RC5_72UnitWork *rc5_72unitwork, u32 *iterations, void *);
+extern "C" s32 rc5_72_unit_func_ocl_4pipe_sleep (RC5_72UnitWork *rc5_72unitwork, u32 *iterations, void *);
 #elif (CLIENT_CPU == CPU_ARM64)
 extern "C" s32 rc5_72_unit_func_scalarfusion(RC5_72UnitWork *rc5_72unitwork, u32 *iterations, void *);
 #endif
@@ -240,6 +246,12 @@ const char **corenames_for_contest_rc572()
       "CL 1-pipe",
       "CL 2-pipe",
       "CL 4-pipe",
+      "CL 1-pipe large",
+      "CL 2-pipe large",
+      "CL 4-pipe large",
+      "CL 1-pipe sleep",
+      "CL 2-pipe sleep",
+      "CL 4-pipe sleep",
   #else
       "ANSI 4-pipe",
       "ANSI 2-pipe",
@@ -1026,6 +1038,30 @@ int selcoreSelectCore_rc572(Client *client, unsigned int threadindex,
         unit_func.gen_72 = rc5_72_unit_func_ocl_4pipe;
         pipeline_count = 4;
         break;
+      case 4:
+        unit_func.gen_72 = rc5_72_unit_func_ocl_1pipe_large;
+        pipeline_count = 1;
+        break;
+      case 5:
+        unit_func.gen_72 = rc5_72_unit_func_ocl_2pipe_large;
+        pipeline_count = 2;
+        break;
+      case 6:
+        unit_func.gen_72 = rc5_72_unit_func_ocl_4pipe_large;
+        pipeline_count = 4;
+        break;
+      case 7:
+        unit_func.gen_72 = rc5_72_unit_func_ocl_1pipe_sleep;
+        pipeline_count = 1;
+        break;
+      case 8:
+        unit_func.gen_72 = rc5_72_unit_func_ocl_2pipe_sleep;
+        pipeline_count = 2;
+        break;
+      case 9:
+        unit_func.gen_72 = rc5_72_unit_func_ocl_4pipe_sleep;
+        pipeline_count = 4;
+        break;
 
     // -----------
      #else /* the ansi cores */
diff --git a/plat/opencl/ocl_context.h b/plat/opencl/ocl_context.h
index c5f7236e7..b6d7b9cd4 100644
--- a/plat/opencl/ocl_context.h
+++ b/plat/opencl/ocl_context.h
@@ -34,6 +34,7 @@ typedef struct {
   u32               runSize;
   u32               runSizeMultiplier;
   u32               maxWorkSize;
+  unsigned long long estimatedPerf; //keys per second per pipe
 } ocl_context_t;
 
 ocl_context_t *ocl_get_context(int device);
diff --git a/rc5-72/opencl/ocl_1pipe.cpp b/rc5-72/opencl/ocl_1pipe.cpp
index 377e13ac9..ac7f17fc3 100644
--- a/rc5-72/opencl/ocl_1pipe.cpp
+++ b/rc5-72/opencl/ocl_1pipe.cpp
@@ -6,11 +6,16 @@
 * $Id:
 */
 
+#ifndef ANOTHER_PASS
+
 #include "ocl_common.h"
 #include <stdio.h>
 #include <stdlib.h>
 #include <string.h>
 
+#define __SLEEP_FOR_POLLING__
+#include "sleepdef.h"
+
 #include "rc5-1pipe.cpp"
 #include "rc5-2pipe.cpp"
 #include "rc5-4pipe.cpp"
@@ -273,6 +278,12 @@ static bool selftest(ocl_context_t *cont)
 extern "C" s32 rc5_72_unit_func_ocl_1pipe (RC5_72UnitWork *rc5_72unitwork, u32 *iterations, void *);
 extern "C" s32 rc5_72_unit_func_ocl_2pipe (RC5_72UnitWork *rc5_72unitwork, u32 *iterations, void *);
 extern "C" s32 rc5_72_unit_func_ocl_4pipe (RC5_72UnitWork *rc5_72unitwork, u32 *iterations, void *);
+extern "C" s32 rc5_72_unit_func_ocl_1pipe_large (RC5_72UnitWork *rc5_72unitwork, u32 *iterations, void *);
+extern "C" s32 rc5_72_unit_func_ocl_2pipe_large (RC5_72UnitWork *rc5_72unitwork, u32 *iterations, void *);
+extern "C" s32 rc5_72_unit_func_ocl_4pipe_large (RC5_72UnitWork *rc5_72unitwork, u32 *iterations, void *);
+extern "C" s32 rc5_72_unit_func_ocl_1pipe_sleep (RC5_72UnitWork *rc5_72unitwork, u32 *iterations, void *);
+extern "C" s32 rc5_72_unit_func_ocl_2pipe_sleep (RC5_72UnitWork *rc5_72unitwork, u32 *iterations, void *);
+extern "C" s32 rc5_72_unit_func_ocl_4pipe_sleep (RC5_72UnitWork *rc5_72unitwork, u32 *iterations, void *);
 #endif
 
 /* some static flags which are set on per-core basis */
@@ -282,7 +293,29 @@ struct core_static_flags
   bool profilingErr;
 };
 
-static s32 rc5_72_unit_func_ocl_npipe(RC5_72UnitWork *rc5_72unitwork, u32 *iterations,
+#define FLOAT_INTERNAL_HELPER(x) x##.
+#define FLOAT_HELPER(x) FLOAT_INTERNAL_HELPER(x)
+#define MIN_DESIRED_UNIT_MS 8.
+#define MAX_DESIRED_UNIT_MS_INT 12
+#define MAX_DESIRED_UNIT_MS FLOAT_HELPER(MAX_DESIRED_UNIT_MS_INT)
+#define MAX_SLEEP_MKS (2 * (1000 * MAX_DESIRED_UNIT_MS_INT * SCALING_RATIO))
+#define MIN_SLEEP_MKS 1000
+
+#endif /* ANOTHER_PASS */
+
+#ifndef SCALING_RATIO
+#define SCALING_RATIO 1
+#endif /* SCALING_RATIO */
+
+static s32
+#if defined(USE_SLEEP)
+rc5_72_unit_func_ocl_npipe_sleep
+#elif SCALING_RATIO > 1
+rc5_72_unit_func_ocl_npipe_large
+#else
+rc5_72_unit_func_ocl_npipe
+#endif
+(RC5_72UnitWork *rc5_72unitwork, u32 *iterations,
                                       unsigned core_ID, unsigned pipes_count,
                                       const char *core_program, const char *program_entry,
                                       struct core_static_flags *static_flags)
@@ -368,6 +401,28 @@ static s32 rc5_72_unit_func_ocl_npipe(RC5_72UnitWork *rc5_72unitwork, u32 *itera
       return -1;          //err
     }
 
+#ifdef USE_SLEEP
+    /* Sleep is useless w/o flush. */
+    status = clFlush(cont->cmdQueue);
+    if (ocl_diagnose(status, "flushing work", cont) != CL_SUCCESS)
+    {
+      RaiseExitRequestTrigger();
+      return -1;          //err
+    }
+
+    unsigned sleepTimeMks;
+    if (cont->estimatedPerf) {
+      sleepTimeMks = (unsigned)(1000000ULL * rest0 / cont->estimatedPerf);
+      if (sleepTimeMks > MAX_SLEEP_MKS)
+        sleepTimeMks = MAX_SLEEP_MKS;
+      else if (sleepTimeMks < MIN_SLEEP_MKS)
+        sleepTimeMks = MIN_SLEEP_MKS;
+    } else
+      sleepTimeMks = MIN_SLEEP_MKS;
+
+    usleep(sleepTimeMks);
+#endif /* USE_SLEEP */
+
     // wait for the kernel call to finish execution
     status = clWaitForEvents(1, &ndrEvt);
     if (ocl_diagnose(status, "waiting for event", cont) != CL_SUCCESS)
@@ -404,7 +459,7 @@ static s32 rc5_72_unit_func_ocl_npipe(RC5_72UnitWork *rc5_72unitwork, u32 *itera
       d = 10;
     }
 
-    if (d > 12.)
+    if (d > MAX_DESIRED_UNIT_MS * SCALING_RATIO)
     {
       //Decrease worksize by 5%
       u32 diffm = cont->runSize / 20 / cont->runSizeMultiplier;
@@ -414,7 +469,7 @@ static s32 rc5_72_unit_func_ocl_npipe(RC5_72UnitWork *rc5_72unitwork, u32 *itera
         cont->runSize -= diffm*cont->runSizeMultiplier;
       //Log("Down:Time: %f, runsize=%u\n", float(d), cont->runSize);
     } else
-    if ((d < 8.) && (rest0 == cont->runSize))
+    if ((d < MIN_DESIRED_UNIT_MS * SCALING_RATIO) && (rest0 == cont->runSize))
     {
       u32 diffm = cont->runSize / 20 / cont->runSizeMultiplier;
       if (diffm == 0)
@@ -424,6 +479,8 @@ static s32 rc5_72_unit_func_ocl_npipe(RC5_72UnitWork *rc5_72unitwork, u32 *itera
       //Log("Up:Time: %f, runsize=%u, diff=%u\n", float(d), cont->runSize, diffm*cont->runSizeMultiplier);
     }
 
+    cont->estimatedPerf = (unsigned long long)(rest0 / (d / 1000));
+
     key_incr(&tmp_unit.L0.hi, &tmp_unit.L0.mid, &tmp_unit.L0.lo, rest0 * pipes_count);
     iter_offset += rest0 * pipes_count;
   }
@@ -488,6 +545,19 @@ static s32 rc5_72_unit_func_ocl_npipe(RC5_72UnitWork *rc5_72unitwork, u32 *itera
   return RESULT_NOTHING;
 }
 
+#ifndef ANOTHER_PASS
+
+#define ANOTHER_PASS
+
+#undef SCALING_RATIO
+#define SCALING_RATIO 100 /* Higher performance, especially with USE_SLEEP. */
+
+#include "ocl_1pipe.cpp"
+
+#define USE_SLEEP /* Required for low CPU usage on some hardware/drivers. */
+
+#include "ocl_1pipe.cpp"
+
 s32 rc5_72_unit_func_ocl_1pipe(RC5_72UnitWork *rc5_72unitwork, u32 *iterations, void *)
 {
   static struct core_static_flags flags;
@@ -508,3 +578,46 @@ s32 rc5_72_unit_func_ocl_4pipe(RC5_72UnitWork *rc5_72unitwork, u32 *iterations,
 
   return rc5_72_unit_func_ocl_npipe(rc5_72unitwork, iterations, CORE_4PIPE, 4, ocl_rc572_4pipe_src, "ocl_rc572_4pipe", &flags);
 }
+
+s32 rc5_72_unit_func_ocl_1pipe_large(RC5_72UnitWork *rc5_72unitwork, u32 *iterations, void *)
+{
+  static struct core_static_flags flags;
+
+  return rc5_72_unit_func_ocl_npipe_large(rc5_72unitwork, iterations, CORE_1PIPE, 1, ocl_rc572_1pipe_src, "ocl_rc572_1pipe", &flags);
+}
+
+s32 rc5_72_unit_func_ocl_2pipe_large(RC5_72UnitWork *rc5_72unitwork, u32 *iterations, void *)
+{
+  static struct core_static_flags flags;
+
+  return rc5_72_unit_func_ocl_npipe_large(rc5_72unitwork, iterations, CORE_2PIPE, 2, ocl_rc572_2pipe_src, "ocl_rc572_2pipe", &flags);
+}
+
+s32 rc5_72_unit_func_ocl_4pipe_large(RC5_72UnitWork *rc5_72unitwork, u32 *iterations, void *)
+{
+  static struct core_static_flags flags;
+
+  return rc5_72_unit_func_ocl_npipe_large(rc5_72unitwork, iterations, CORE_4PIPE, 4, ocl_rc572_4pipe_src, "ocl_rc572_4pipe", &flags);
+}
+
+s32 rc5_72_unit_func_ocl_1pipe_sleep(RC5_72UnitWork *rc5_72unitwork, u32 *iterations, void *)
+{
+  static struct core_static_flags flags;
+
+  return rc5_72_unit_func_ocl_npipe_sleep(rc5_72unitwork, iterations, CORE_1PIPE, 1, ocl_rc572_1pipe_src, "ocl_rc572_1pipe", &flags);
+}
+
+s32 rc5_72_unit_func_ocl_2pipe_sleep(RC5_72UnitWork *rc5_72unitwork, u32 *iterations, void *)
+{
+  static struct core_static_flags flags;
+
+  return rc5_72_unit_func_ocl_npipe_sleep(rc5_72unitwork, iterations, CORE_2PIPE, 2, ocl_rc572_2pipe_src, "ocl_rc572_2pipe", &flags);
+}
+
+s32 rc5_72_unit_func_ocl_4pipe_sleep(RC5_72UnitWork *rc5_72unitwork, u32 *iterations, void *)
+{
+  static struct core_static_flags flags;
+
+  return rc5_72_unit_func_ocl_npipe_sleep(rc5_72unitwork, iterations, CORE_4PIPE, 4, ocl_rc572_4pipe_src, "ocl_rc572_4pipe", &flags);
+}
+#endif /* ANOTHER_PASS */