Hi,
I am playing with some programs to measure the memory "bandwidth" of my system, which is a dual-socket Xeon Gold 6140 (Skylake, 2 sockets of 18 cores) with 12 DIMMs (6 per socket) of RAM at 2666 MHz, for a total of 96 GB. I wrote my own "stream" benchmark, and I am surprised by some of the results. On this platform, Intel Advisor (the roofline) claims 207 GB/s of memory bandwidth. The Intel Memory Latency Checker gives exactly the same result for the bandwidth. Here are the results given by my program:
Bandwidth, sum += a[i] * b[i] : 182.698 Gb/s
Bandwidth, a[i] = 0.0 : 103.311 Gb/s
Bandwidth, a[i] = 2 * a[i] : 128.075 Gb/s
Bandwidth, a[i] = b[i] : 136.004 Gb/s
Bandwidth, a[i] = 2 * b[i] : 102.294 Gb/s
Bandwidth, a[i] += 2 * b[i] : 101.337 Gb/s
Bandwidth, a[i] = 2 * b[i] + 3 * c[i]: 114.601 Gb/s
Bandwidth, a[i] = b[i] + 3 * c[i] : 114.525 Gb/s
I have a few questions:
1/ Is there a way to reach the peak performance of 207 GB/s with the reduction (sum += a[i] * b[i]) ? Can we tune prefetching to do so?
2/ Why is the bandwidth for setting a to 0.0 so low? Can we make it faster?
Best regards
PS: The following code has been compiled with
icpc -g -std=c++11 -O3 -xCORE-AVX512 -qopenmp -DNDEBUG main.cpp -o main
and launched with thread pinning with 1 thread per core.
export OMP_PLACES=cores
export OMP_PROC_BIND=spread
export OMP_NUM_THREADS=36
./main
Here is the full listing
#include <chrono>
#include <cstddef>
#include <iostream>
#include <memory>

namespace {

/// Compiler-level barrier (emits no instruction): tells the optimizer that
/// the memory reachable through `p` may have been read or written, so a timed
/// kernel cannot be elided or merged with the next repetition.
inline void clobber(double *p) { asm volatile("" : : "g"(p) : "memory"); }

/// Runs `kernel` `nb_times` times back to back, times the whole batch with a
/// monotonic clock and prints the resulting bandwidth.
///
/// @param label          Text printed in front of the figure.
/// @param arrays_touched Number of `n`-element double arrays the kernel
///                       explicitly reads or writes per repetition.
/// @param n              Number of elements per array.
/// @param nb_times       Number of repetitions of the kernel.
/// @param kernel         Callable with no arguments performing one repetition.
///
/// The figure is reported in GB/s with 1 GB = 1e9 bytes so that it can be
/// compared directly with figures quoted in GB/s by other tools. Dividing by
/// 1024^3 would yield GiB/s, which is about 7% smaller for the same traffic.
///
/// NOTE(review): only the traffic visible in the source is counted. For
/// kernels that store to an array with regular (non-streaming) stores, the
/// hardware typically also reads each destination cache line before writing
/// it (write-allocate), so the real DRAM traffic of store kernels is
/// presumably higher than what is reported here -- confirm with hardware
/// counters or by forcing non-temporal stores.
template <typename Kernel>
void measure(const char *label, std::size_t arrays_touched, std::ptrdiff_t n,
             std::ptrdiff_t nb_times, Kernel kernel) {
  // steady_clock is guaranteed monotonic; high_resolution_clock may alias the
  // adjustable system clock.
  const auto begin = std::chrono::steady_clock::now();
  for (std::ptrdiff_t k = 0; k < nb_times; ++k) {
    kernel();
  }
  const auto end = std::chrono::steady_clock::now();

  const double seconds = std::chrono::duration<double>(end - begin).count();
  const double bytes = static_cast<double>(arrays_touched) *
                       static_cast<double>(n) *
                       static_cast<double>(sizeof(double)) *
                       static_cast<double>(nb_times);
  // std::endl on purpose: each result is flushed as soon as it is available,
  // which matters for a run that lasts many seconds.
  std::cout << label << bytes / (seconds * 1.0e9) << " GB/s" << std::endl;
}

}  // namespace

/// Stream-like memory bandwidth benchmark: times eight simple kernels over
/// three arrays of 2^30 doubles (8 GiB each) and prints one bandwidth figure
/// per kernel.
int main() {
  const std::ptrdiff_t n = 1024 * 1024 * 1024;

  // `new double[n]` leaves the elements uninitialized, so the pages are first
  // written by the parallel loop below (a std::vector would zero them from
  // the main thread instead). unique_ptr releases the arrays on every exit
  // path.
  std::unique_ptr<double[]> a_storage(new double[n]);
  std::unique_ptr<double[]> b_storage(new double[n]);
  std::unique_ptr<double[]> c_storage(new double[n]);
  double *const a = a_storage.get();
  double *const b = b_storage.get();
  double *const c = c_storage.get();

  // Parallel initialization with the same loop shape as the timed kernels.
  // NOTE(review): presumably intended for first-touch NUMA placement; this
  // relies on the OS policy and on the same iteration-to-thread mapping being
  // used by the later loops -- confirm.
#pragma omp parallel for
  for (std::ptrdiff_t i = 0; i < n; ++i) {
    a[i] = 0.0;
    b[i] = 0.0;
    c[i] = 0.0;
  }

  const std::ptrdiff_t nb_times = 20;
  double sum = 0.0;

  measure("Bandwidth, sum += a[i] * b[i] : ", 2, n, nb_times,
          [&sum, a, b, n]() {
            // Reduce into a local and fold it into `sum` afterwards so the
            // reduction variable is a plain local of the enclosing scope.
            double partial = 0.0;
#pragma omp parallel for reduction(+ : partial)
            for (std::ptrdiff_t i = 0; i < n; ++i) {
              partial += a[i] * b[i];
            }
            sum += partial;
            clobber(a);
            clobber(b);
          });

  measure("Bandwidth, a[i] = 0.0 : ", 1, n, nb_times, [a, n]() {
#pragma omp parallel for
    for (std::ptrdiff_t i = 0; i < n; ++i) {
      a[i] = 0.0;
    }
    clobber(a);
  });

  measure("Bandwidth, a[i] = 2 * a[i] : ", 2, n, nb_times, [a, n]() {
#pragma omp parallel for
    for (std::ptrdiff_t i = 0; i < n; ++i) {
      a[i] = 2 * a[i];
    }
    clobber(a);
  });

  measure("Bandwidth, a[i] = b[i] : ", 2, n, nb_times, [a, b, n]() {
#pragma omp parallel for
    for (std::ptrdiff_t i = 0; i < n; ++i) {
      a[i] = b[i];
    }
    clobber(a);
    clobber(b);
  });

  measure("Bandwidth, a[i] = 2 * b[i] : ", 2, n, nb_times, [a, b, n]() {
#pragma omp parallel for
    for (std::ptrdiff_t i = 0; i < n; ++i) {
      a[i] = 2 * b[i];
    }
    clobber(a);
    clobber(b);
  });

  // Byte count kept as in the original listing (two arrays), although `a` is
  // both read and written here.
  measure("Bandwidth, a[i] += 2 * b[i] : ", 2, n, nb_times, [a, b, n]() {
#pragma omp parallel for
    for (std::ptrdiff_t i = 0; i < n; ++i) {
      a[i] += 2 * b[i];
    }
    clobber(a);
    clobber(b);
  });

  measure("Bandwidth, a[i] = 2 * b[i] + 3 * c[i]: ", 3, n, nb_times,
          [a, b, c, n]() {
#pragma omp parallel for
            for (std::ptrdiff_t i = 0; i < n; ++i) {
              a[i] = 2 * b[i] + 3 * c[i];
            }
            clobber(a);
            clobber(b);
            clobber(c);
          });

  measure("Bandwidth, a[i] = b[i] + 3 * c[i] : ", 3, n, nb_times,
          [a, b, c, n]() {
#pragma omp parallel for
            for (std::ptrdiff_t i = 0; i < n; ++i) {
              a[i] = b[i] + 3 * c[i];
            }
            clobber(a);
            clobber(b);
            clobber(c);
          });

  // Printing the reduction result keeps the first kernel observable.
  std::cout << "Check: " << sum << std::endl;
  return 0;
}