From b5c3c4eabf6caac35f5971ad323ba8b5cc0e45a2 Mon Sep 17 00:00:00 2001 From: Marius Hillenbrand Date: Tue, 19 Jul 2016 17:57:04 +0200 Subject: [PATCH] fix pointer-chasing to touch full working set xmem::build_random_pointer_permutation weaves a random path through a pointer array, thereby generating a linked list suitable for latency-sensitive pointer-chasing in memory micro-benchmarks. The resulting permutation should touch the whole pointer array to maximize the working set of the benchmark thread. In CS terms, the pointers should form a Hamiltonian cycle in the graph formed by the pointer array. However, build_random_pointer_permutation often fails to construct a Hamiltonian cycle. As a result, a benchmark thread's working set might become much smaller than desired, thereby distorting cache miss rates and observed average latencies. Change xmem::build_random_pointer_permutation to always produce Hamiltonian cycles in a cheap way: * Represent the traversal order of pointers in an int array of indices * Shuffle the indices in a way that maintains a Hamiltonian cycle starting and ending at 0. * Impose the traversal order on the pointer array (specific per chunk size). The changes slow build_random_pointer_permutation down by only ~4% (measured with 64-bit chunks in a 32 MiB array on an i7-3530M with a 4 MiB LLC) but ensure that the working set always covers the full array. --- src/benchmark_kernels.cpp | 66 ++++++++++++++++++++++++++------------- 1 file changed, 45 insertions(+), 21 deletions(-) diff --git a/src/benchmark_kernels.cpp b/src/benchmark_kernels.cpp index 9b94166..466dd58 100644 --- a/src/benchmark_kernels.cpp +++ b/src/benchmark_kernels.cpp @@ -866,9 +866,28 @@ bool xmem::build_random_pointer_permutation(void* start_address, void* end_addre std::mt19937_64 gen(time(NULL)); //Mersenne Twister random number generator, seeded at current time - //Do a random shuffle of memory pointers. 
- //I had originally used a random Hamiltonian Cycle generator, but this was much slower and aside from - //rare instances, did not make any difference in random-access performance measurement. + //Do a random shuffle of memory pointers. Make sure that the result + //is a hamiltonian cycle so that pointer chasing will touch the full + //working set. + + //Represent traversal order of pointers in an array external to the + //pointers + int * traversal_order = new int[num_pointers + 1]; + + //start with sequential order 0, 1, 2, ...., num_pointers-1 + for(size_t i = 0; i < num_pointers; i++) + traversal_order[i] = i; + //...and back to 0 + traversal_order[num_pointers] = 0; + + //shuffle the elements 1,2,...,num-pointers-1, + //thereby preserving the property of a hamiltonian cycle starting + //and ending at 0 + std::shuffle(traversal_order + 1, + traversal_order + (num_pointers - 1), gen); + + //finally, impose the traversal order within the pointer array + #ifdef HAS_WORD_64 Word64_t* mem_region_base = reinterpret_cast(start_address); #else //special case for 32-bit architectures @@ -877,45 +896,48 @@ bool xmem::build_random_pointer_permutation(void* start_address, void* end_addre switch (chunk_size) { #ifdef HAS_WORD_64 case CHUNK_64b: - for (size_t i = 0; i < num_pointers; i++) { //Initialize pointers to point at themselves (identity mapping) - mem_region_base[i] = reinterpret_cast(mem_region_base+i); + for (size_t i = 0; i < num_pointers; i++) { + mem_region_base[traversal_order[i]] = reinterpret_cast( + mem_region_base+traversal_order[i+1]); } - std::shuffle(mem_region_base, mem_region_base + num_pointers, gen); break; #else //special case for 32-bit architectures case CHUNK_32b: - for (size_t i = 0; i < num_pointers; i++) { //Initialize pointers to point at themselves (identity mapping) - mem_region_base[i] = reinterpret_cast(mem_region_base+i); + for (size_t i = 0; i < num_pointers; i++) { + mem_region_base[traversal_order[i]] = reinterpret_cast( + 
mem_region_base+traversal_order[i+1]); } - std::shuffle(mem_region_base, mem_region_base + num_pointers, gen); break; #endif #ifdef HAS_WORD_128 case CHUNK_128b: - for (size_t i = 0; i < num_pointers; i++) { //Initialize pointers to point at themselves (identity mapping) + for (size_t i = 0; i < num_pointers; i++) { #ifdef HAS_WORD_64 - mem_region_base[i*2] = reinterpret_cast(mem_region_base+(i*2)); + mem_region_base[traversal_order[i]*2] = reinterpret_cast( + mem_region_base+(traversal_order[i+1]*2)); mem_region_base[(i*2)+1] = 0xFFFFFFFFFFFFFFFF; //1-fill upper 64 bits #else //special case for 32-bit architectures - mem_region_base[i*4] = reinterpret_cast(mem_region_base+(i*4)); + mem_region_base[traversal_order[i]*4] = reinterpret_cast( + mem_region_base+(traversal_order[i+1]*4)); mem_region_base[(i*4)+1] = 0xFFFFFFFF; //1-fill upper 96 bits mem_region_base[(i*4)+2] = 0xFFFFFFFF; mem_region_base[(i*4)+3] = 0xFFFFFFFF; #endif } - std::shuffle(reinterpret_cast(mem_region_base), reinterpret_cast(mem_region_base) + num_pointers, gen); break; #endif #ifdef HAS_WORD_256 case CHUNK_256b: - for (size_t i = 0; i < num_pointers; i++) { //Initialize pointers to point at themselves (identity mapping) + for (size_t i = 0; i < num_pointers; i++) { #ifdef HAS_WORD_64 - mem_region_base[i*4] = reinterpret_cast(mem_region_base+(i*4)); + mem_region_base[traversal_order[i]*4] = reinterpret_cast( + mem_region_base+(traversal_order[i+1]*4)); mem_region_base[(i*4)+1] = 0xFFFFFFFFFFFFFFFF; //1-fill upper 192 bits mem_region_base[(i*4)+2] = 0xFFFFFFFFFFFFFFFF; mem_region_base[(i*4)+3] = 0xFFFFFFFFFFFFFFFF; #else //special case for 32-bit architectures - mem_region_base[i*8] = reinterpret_cast(mem_region_base+(i*8)); + mem_region_base[traversal_order[i]*8] = reinterpret_cast( + mem_region_base+(traversal_order[i+1]*8)); mem_region_base[(i*8)+1] = 0xFFFFFFFF; //1-fill upper 224 bits mem_region_base[(i*8)+2] = 0xFFFFFFFF; mem_region_base[(i*8)+3] = 0xFFFFFFFF; @@ -925,14 +947,14 @@ bool 
xmem::build_random_pointer_permutation(void* start_address, void* end_addre mem_region_base[(i*8)+7] = 0xFFFFFFFF; #endif } - std::shuffle(reinterpret_cast(mem_region_base), reinterpret_cast(mem_region_base) + num_pointers, gen); break; #endif #ifdef HAS_WORD_512 case CHUNK_512b: - for (size_t i = 0; i < num_pointers; i++) { //Initialize pointers to point at themselves (identity mapping) + for (size_t i = 0; i < num_pointers; i++) { #ifdef HAS_WORD_64 - mem_region_base[i*4] = reinterpret_cast(mem_region_base+(i*4)); + mem_region_base[traversal_order[i]*4] = reinterpret_cast( + mem_region_base+(traversal_order[i+1]*4)); mem_region_base[(i*4)+1] = 0xFFFFFFFFFFFFFFFF; //1-fill upper 448 bits mem_region_base[(i*4)+2] = 0xFFFFFFFFFFFFFFFF; mem_region_base[(i*4)+3] = 0xFFFFFFFFFFFFFFFF; @@ -941,7 +963,8 @@ bool xmem::build_random_pointer_permutation(void* start_address, void* end_addre mem_region_base[(i*4)+6] = 0xFFFFFFFFFFFFFFFF; mem_region_base[(i*4)+7] = 0xFFFFFFFFFFFFFFFF; #else //special case for 32-bit architectures - mem_region_base[i*8] = reinterpret_cast(mem_region_base+(i*8)); + mem_region_base[traversal_order[i]*8] = reinterpret_cast( + mem_region_base+(traversal_order[i+1]*8)); mem_region_base[(i*8)+1] = 0xFFFFFFFF; //1-fill upper 480 bits mem_region_base[(i*8)+2] = 0xFFFFFFFF; mem_region_base[(i*8)+3] = 0xFFFFFFFF; @@ -959,7 +982,6 @@ bool xmem::build_random_pointer_permutation(void* start_address, void* end_addre mem_region_base[(i*8)+15] = 0xFFFFFFFF; #endif } - std::shuffle(reinterpret_cast(mem_region_base), reinterpret_cast(mem_region_base) + num_pointers, gen); break; #endif default: @@ -967,6 +989,8 @@ bool xmem::build_random_pointer_permutation(void* start_address, void* end_addre return false; } + delete traversal_order; + if (g_verbose) { std::cout << "done" << std::endl; std::cout << std::endl;