Skip to content
This repository was archived by the owner on Dec 25, 2023. It is now read-only.

Commit 4db9260

Browse files
authored
Feature/optimization for agilex (#22)
* add optimization for agilex Signed-off-by: wangyon1 <yong4.wang@intel.com> * pre-commits code format update Signed-off-by: wangyon1 <yong4.wang@intel.com> * change keyswitch input/output to use mem_channel_k1 Signed-off-by: wangyon1 <yong4.wang@intel.com> * add copy fromm buffer to device elper function Signed-off-by: wangyon1 <yong4.wang@intel.com> * rewrite copy helper function Signed-off-by: wangyon1 <yong4.wang@intel.com> * change KeySwitch maximum batch size to 1024 Signed-off-by: wangyon1 <yong4.wang@intel.com> * move keyswitch store kernel queue wait to right place Signed-off-by: wangyon1 <yong4.wang@intel.com> * pre-commit fix Signed-off-by: wangyon1 <yong4.wang@intel.com> * comment out keyswitch performance measurement utility * pre-commit fix Signed-off-by: wangyon1 <yong4.wang@intel.com>
1 parent 27a4695 commit 4db9260

File tree

7 files changed

+41
-15
lines changed

7 files changed

+41
-15
lines changed

device/keyswitch/dyadmult.hpp

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -26,7 +26,7 @@ void broadcast_keys(sycl::queue& q,
2626
const unsigned int KEYS_LEN = tp_MAX_RNS_MODULUS_SIZE * 2;
2727
auto kernelLambda = [=]()
2828
[[intel::kernel_args_restrict]] [[intel::max_global_work_dim(0)]] {
29-
for (size_t i = 0; i < batch_size; i++) {
29+
for (int i = 0; i < batch_size; i++) {
3030
unsigned params_size = tt_ch_keyswitch_params::read();
3131
for (int i = 0; i < params_size; i++) {
3232
uint256_t keys1 = k_switch_keys1[i];

device/keyswitch/load.hpp

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -110,8 +110,8 @@ sycl::event load(sycl::queue& q, sycl::event* inDepsEv,
110110
temp_pipe::write(cur_moduli);
111111
});
112112
STEP(decomp_index, decomp_modulus_size);
113-
114-
for (int n = 0; n < coeff_count; n++) {
113+
uint coeff_count_tmp = coeff_count;
114+
for (uint n = 0; n < coeff_count_tmp; n++) {
115115
Unroller<0, NUM_CORES>::Step([&](auto COREID) {
116116
using temp_pipe =
117117
typename ch_intt_elements_in::template PipeAt<

device/keyswitch/params.hpp

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -42,3 +42,4 @@
4242

4343
#define STEP(n, max) n = n == (max - 1) ? 0 : n + 1
4444
#define STEP2(n, max) n = n == ((max)-1) ? -1 : n + 1
45+
#define STEP3(n, max) n = n == (max) ? -1 : n + 1

device/keyswitch/twiddle_generator.hpp

Lines changed: 4 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -240,7 +240,8 @@ void dispatch_twiddle_factors(sycl::queue& q,
240240
typename tt_ch_twiddle_factor_rep::template PipeAt<
241241
NTT_ENGINES - 2>;
242242
twPipe8::write(tf, success);
243-
if (success) STEP2(ntt2_index, ntt2_decomp_size / VEC);
243+
short max_tmp = ntt2_decomp_size / VEC - 1;
244+
if (success) STEP3(ntt2_index, max_tmp);
244245
}
245246
// write intt1
246247
TwiddleFactor_t intt1_tf;
@@ -253,7 +254,8 @@ void dispatch_twiddle_factors(sycl::queue& q,
253254
}
254255
if (intt1_index >= 0) {
255256
tt_ch_intt1_twiddle_factor_rep::write(intt1_tf, success);
256-
if (success) STEP2(intt1_index, intt1_decomp_size / VEC);
257+
short max_tmp = intt1_decomp_size / VEC - 1;
258+
if (success) STEP3(intt1_index, max_tmp);
257259
}
258260

259261
// write intt2

host/inc/fpga.h

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -685,7 +685,7 @@ class Device {
685685

686686
// KeySwitch section
687687
sycl::queue keyswitch_queues_[KEYSWITCH_NUM_KERNELS];
688-
sycl::event KeySwitch_events_write_[2][128];
688+
sycl::event KeySwitch_events_write_[2][1024];
689689
sycl::event KeySwitch_events_enqueue_[2][2];
690690
std::unordered_map<uint64_t**, KeySwitchMemKeys<uint256_t>*> keys_map_;
691691
static int device_id_;

host/src/fpga.cpp

Lines changed: 30 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -14,6 +14,19 @@
1414
namespace intel {
1515
namespace hexl {
1616
namespace fpga {
17+
18+
// helper function to explicitly copy host data to device.
19+
static sycl::event copy_buffer_to_device(sycl::queue& q,
20+
sycl::buffer<uint64_t>& buf) {
21+
sycl::host_accessor host_acc(buf);
22+
uint64_t* host_ptr = host_acc.get_pointer();
23+
sycl::event e = q.submit([&](sycl::handler& h) {
24+
auto acc_dev = buf.get_access<sycl::access::mode::discard_write>(h);
25+
h.copy(host_ptr, acc_dev);
26+
});
27+
return e;
28+
}
29+
1730
// utility function for copying input data batch for KeySwitch
1831

1932
const char* keyswitch_kernel_name[] = {"load", "store"};
@@ -257,10 +270,10 @@ FPGAObject_KeySwitch::FPGAObject_KeySwitch(sycl::queue& p_q,
257270
aligned_alloc(HOST_MEM_ALIGNMENT, size_out * sizeof(uint64_t)));
258271
mem_t_target_iter_ptr_ = new sycl::buffer<uint64_t>(
259272
sycl::range(size_in),
260-
{sycl::property::buffer::mem_channel{MEM_CHANNEL_K2}});
273+
{sycl::property::buffer::mem_channel{MEM_CHANNEL_K1}});
261274
mem_KeySwitch_results_ = new sycl::buffer<sycl::ulong2>(
262275
sycl::range(size_out / 2),
263-
{sycl::property::buffer::mem_channel{MEM_CHANNEL_K2}});
276+
{sycl::property::buffer::mem_channel{MEM_CHANNEL_K1}});
264277
mem_t_target_iter_ptr_->set_write_back(false);
265278
mem_KeySwitch_results_->set_write_back(false);
266279
}
@@ -1261,18 +1274,28 @@ void Device::enqueue_input_data_KeySwitch(FPGAObject_KeySwitch* fpga_obj) {
12611274
keyswitch_queues_[KEYSWITCH_LOAD], *(keys->k_switch_keys_1_),
12621275
*(keys->k_switch_keys_2_), *(keys->k_switch_keys_3_),
12631276
fpga_obj->in_objs_.size());
1264-
const auto& start_ocl = std::chrono::high_resolution_clock::now();
1277+
12651278
int obj_id = KeySwitch_id_ % 2;
12661279
copyKeySwitchBatch(fpga_obj, obj_id);
1280+
1281+
// copy_buffer_to_device() and wait() is a utility to force blocked write,
1282+
// and to facilitate performance measure on FPGA.
1283+
// The release is to support streaming, and blocking write will slow things
1284+
// down.
1285+
// KeySwitch_events_write_[obj_id][0] = copy_buffer_to_device(
1286+
// keyswitch_queues_[KEYSWITCH_LOAD],
1287+
// *(fpga_obj->mem_t_target_iter_ptr_));
1288+
// KeySwitch_events_write_[obj_id][0].wait();
1289+
12671290
// =============== Launch keyswitch kernel ==============================
12681291
unsigned rmem = 0;
12691292
if (RWMEM_FLAG) {
12701293
rmem = 1;
12711294
}
1295+
const auto& start_ocl = std::chrono::high_resolution_clock::now();
12721296
KeySwitch_events_enqueue_[obj_id][0] =
12731297
(*(KeySwitch_kernel_container_->load))(
1274-
keyswitch_queues_[KEYSWITCH_LOAD],
1275-
nullptr /* KeySwitch_events_write_[obj_id] */,
1298+
keyswitch_queues_[KEYSWITCH_LOAD], nullptr,
12761299
*(fpga_obj->mem_t_target_iter_ptr_), modulus_meta_, fpga_obj->n_,
12771300
fpga_obj->decomp_modulus_size_, fpga_obj->n_batch_,
12781301
(*(invn_t*)(void*)&invn_), rmem);
@@ -1540,9 +1563,9 @@ bool Device::process_output_KeySwitch() {
15401563
*(fpga_obj->mem_KeySwitch_results_), fpga_obj->n_batch_,
15411564
fpga_obj->n_, fpga_obj->decomp_modulus_size_, modulus_meta_, rmem,
15421565
wmem);
1543-
1544-
const auto& end_ocl = std::chrono::high_resolution_clock::now();
15451566
keyswitch_queues_[KEYSWITCH_STORE].wait();
1567+
const auto& end_ocl = std::chrono::high_resolution_clock::now();
1568+
15461569
const auto& start_io = std::chrono::high_resolution_clock::now();
15471570
if (KeySwitch_id_ > 0) {
15481571
KeySwitch_read_output();

host/src/fpga_int.cpp

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -109,9 +109,9 @@ static uint64_t g_batch_size_intt = get_batch_size_intt();
109109
static uint64_t get_batch_size_KeySwitch() {
110110
char* env = getenv("BATCH_SIZE_KEYSWITCH");
111111
uint64_t size = env ? strtoul(env, NULL, 10) : 1;
112-
if (size > 128) {
112+
if (size > 1024) {
113113
std::cerr << "Error: BATCH_SIZE_KEYSWITCH is " << size << std::endl;
114-
std::cerr << " Maxiaml supported BATCH_SIZE_KEYSWITCH is 128."
114+
std::cerr << " Maxiaml supported BATCH_SIZE_KEYSWITCH is 1024."
115115
<< std::endl;
116116
exit(1);
117117
}

0 commit comments

Comments
 (0)