I’ve implemented a quaternion-vector multiplication function using SIMD instructions, with conditional compilation for AVX512, AVX2, and SSE. While I expected to see significant performance improvements with newer instruction sets (AVX512 > AVX2 > SSE), the actual performance difference is surprisingly small.
The result of
for f in "-mno-avx -mfma sse" "-mavx -mavx2 -mfma avx" "-mavx512f -mfma mavx512"; do
g++ -O3 -w quat_vec_mult.cpp -I/usr/include/eigen3 -std=c++20 ${f% *}
echo "${f##* }"
./a.out
done
is
sse
Time: 104786 us
Sum: -3309.12
avx
Time: 100155 us
Sum: -3309.12
mavx512
Time: 98113 us
Sum: -3309.12
My questions are twofold:
- Why might the performance difference between AVX512, AVX2, and SSE be so small in this case?
- Could there be any bottlenecks in my implementation that are limiting the potential speedup from more advanced SIMD instructions?
Note that my CPU specs are listed at https://gist.github.com/HiroIshida/7230aab2d40a6cae7b18c552a363f31f
Here is the code:
#include <iostream>
#include <chrono>
#include <vector>
#include <Eigen/Dense>
#include <Eigen/Geometry>
#include <random>
#include <immintrin.h>
#include <fstream>
#ifdef __AVX512F__
const size_t bytes_alligned = 64;
const size_t n_batch = 8;
#define SIMD_LOAD _mm512_load_pd
#define SIMD_STORE _mm512_store_pd
#define SIMD_ADD _mm512_add_pd
#define SIMD_SUB _mm512_sub_pd
#define SIMD_MUL _mm512_mul_pd
#define SIMD_SET1 _mm512_set1_pd
#elif defined(__AVX2__)
const size_t bytes_alligned = 32;
const size_t n_batch = 4;
#define SIMD_LOAD _mm256_load_pd
#define SIMD_STORE _mm256_store_pd
#define SIMD_ADD _mm256_add_pd
#define SIMD_SUB _mm256_sub_pd
#define SIMD_MUL _mm256_mul_pd
#define SIMD_SET1 _mm256_set1_pd
#else
const size_t bytes_alligned = 16;
const size_t n_batch = 2;
#define SIMD_LOAD _mm_load_pd
#define SIMD_STORE _mm_store_pd
#define SIMD_ADD _mm_add_pd
#define SIMD_SUB _mm_sub_pd
#define SIMD_MUL _mm_mul_pd
#define SIMD_SET1 _mm_set1_pd
#endif
void quat_mult_vec(
    double* qx, double* qy, double* qz, double* qw,
    double* x, double* y, double* z,
    double* output_x, double* output_y, double* output_z){
  // Load one batch of quaternion and vector components
  auto qx_vec = SIMD_LOAD(qx);
  auto qy_vec = SIMD_LOAD(qy);
  auto qz_vec = SIMD_LOAD(qz);
  auto qw_vec = SIMD_LOAD(qw);
  auto x_vec = SIMD_LOAD(x);
  auto y_vec = SIMD_LOAD(y);
  auto z_vec = SIMD_LOAD(z);
  auto ones = SIMD_SET1(1);
  auto twos = SIMD_SET1(2);
  // Pairwise products of quaternion components, each scaled by 2
  auto qxqx_2 = SIMD_MUL(SIMD_MUL(qx_vec, qx_vec), twos);
  auto qyqy_2 = SIMD_MUL(SIMD_MUL(qy_vec, qy_vec), twos);
  auto qzqz_2 = SIMD_MUL(SIMD_MUL(qz_vec, qz_vec), twos);
  auto qwqw_2 = SIMD_MUL(SIMD_MUL(qw_vec, qw_vec), twos);
  auto qxqy_2 = SIMD_MUL(SIMD_MUL(qx_vec, qy_vec), twos);
  auto qxqz_2 = SIMD_MUL(SIMD_MUL(qx_vec, qz_vec), twos);
  auto qxqw_2 = SIMD_MUL(SIMD_MUL(qx_vec, qw_vec), twos);
  auto qyqz_2 = SIMD_MUL(SIMD_MUL(qy_vec, qz_vec), twos);
  auto qyqw_2 = SIMD_MUL(SIMD_MUL(qy_vec, qw_vec), twos);
  auto qzqw_2 = SIMD_MUL(SIMD_MUL(qz_vec, qw_vec), twos);
  // Row 0 of the rotation matrix, then output_x = M00*x + M01*y + M02*z
  auto M00 = SIMD_SUB(SIMD_ADD(qwqw_2, qxqx_2), ones);
  auto M01 = SIMD_SUB(qxqy_2, qzqw_2);
  auto M02 = SIMD_ADD(qxqz_2, qyqw_2);
  auto output_x_vec = SIMD_ADD(SIMD_ADD(SIMD_MUL(M00, x_vec), SIMD_MUL(M01, y_vec)), SIMD_MUL(M02, z_vec));
  SIMD_STORE(output_x, output_x_vec);
  // Row 1
  auto M10 = SIMD_ADD(qxqy_2, qzqw_2);
  auto M11 = SIMD_SUB(SIMD_ADD(qwqw_2, qyqy_2), ones);
  auto M12 = SIMD_SUB(qyqz_2, qxqw_2);
  auto output_y_vec = SIMD_ADD(SIMD_ADD(SIMD_MUL(M10, x_vec), SIMD_MUL(M11, y_vec)), SIMD_MUL(M12, z_vec));
  SIMD_STORE(output_y, output_y_vec);
  // Row 2
  auto M20 = SIMD_SUB(qxqz_2, qyqw_2);
  auto M21 = SIMD_ADD(qyqz_2, qxqw_2);
  auto M22 = SIMD_SUB(SIMD_ADD(qwqw_2, qzqz_2), ones);
  auto output_z_vec = SIMD_ADD(SIMD_ADD(SIMD_MUL(M20, x_vec), SIMD_MUL(M21, y_vec)), SIMD_MUL(M22, z_vec));
  SIMD_STORE(output_z, output_z_vec);
}
int main(){
  size_t N = 10000000;
  size_t aligned_N = N + (n_batch - N % n_batch);
  double* qx = static_cast<double*>(aligned_alloc(bytes_alligned, aligned_N * sizeof(double)));
  double* qy = static_cast<double*>(aligned_alloc(bytes_alligned, aligned_N * sizeof(double)));
  double* qz = static_cast<double*>(aligned_alloc(bytes_alligned, aligned_N * sizeof(double)));
  double* qw = static_cast<double*>(aligned_alloc(bytes_alligned, aligned_N * sizeof(double)));
  double* x = static_cast<double*>(aligned_alloc(bytes_alligned, aligned_N * sizeof(double)));
  double* y = static_cast<double*>(aligned_alloc(bytes_alligned, aligned_N * sizeof(double)));
  double* z = static_cast<double*>(aligned_alloc(bytes_alligned, aligned_N * sizeof(double)));
  double* output_x = static_cast<double*>(aligned_alloc(bytes_alligned, aligned_N * sizeof(double)));
  double* output_y = static_cast<double*>(aligned_alloc(bytes_alligned, aligned_N * sizeof(double)));
  double* output_z = static_cast<double*>(aligned_alloc(bytes_alligned, aligned_N * sizeof(double)));
  for(size_t i = 0; i < N; ++i) {
    Eigen::Vector4d q = Eigen::Vector4d::Random();
    Eigen::Vector3d v = Eigen::Vector3d::Random();
    qx[i] = q[0]; qy[i] = q[1]; qz[i] = q[2]; qw[i] = q[3];
    x[i] = v[0]; y[i] = v[1]; z[i] = v[2];
  }
  auto start = std::chrono::high_resolution_clock::now();
  for(size_t head = 0; head < N; head += n_batch){
    quat_mult_vec(qx + head, qy + head, qz + head, qw + head,
                  x + head, y + head, z + head,
                  output_x + head, output_y + head, output_z + head);
  }
  auto end = std::chrono::high_resolution_clock::now();
  std::cout << "Time: " << std::chrono::duration_cast<std::chrono::microseconds>(end - start).count() << " us" << std::endl;
  double sum = 0;
  for(size_t i = 0; i < N; ++i) {
    sum += output_x[i] + output_y[i] + output_z[i];
  }
  std::cout << "Sum: " << sum << std::endl;
}
Also note that the implementation basically computes the elements of the rotation matrix shown below in batch, then multiplies that matrix by the vector, also in batch:
https://i.sstatic.net/0500x.jpg
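For reference, here is the matrix the M00..M22 expressions in the code build (transcribed from the code, in case the linked image is unavailable):

R(q) =
\begin{pmatrix}
2(q_w^2 + q_x^2) - 1 & 2(q_x q_y - q_z q_w) & 2(q_x q_z + q_y q_w) \\
2(q_x q_y + q_z q_w) & 2(q_w^2 + q_y^2) - 1 & 2(q_y q_z - q_x q_w) \\
2(q_x q_z - q_y q_w) & 2(q_y q_z + q_x q_w) & 2(q_w^2 + q_z^2) - 1
\end{pmatrix},
\qquad v_{out} = R(q)\, v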
I think your long test with 10M inputs bottlenecks on memory bandwidth: 10M quaternions + 10M 3D vectors take 560 MB of RAM in FP64 precision (and the three output arrays add another 240 MB), while your CPU only has 16 MB of L3 cache.
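One way to check the bandwidth hypothesis (a sketch, not part of the original benchmark) is to shrink the working set until it fits in L3 and loop over it repeatedly; if the gap between the instruction sets then widens, the long run was indeed bandwidth-bound. The values of small_N and repeats below are arbitrary choices; with small_N = 100000 the ten double arrays occupy about 8 MB, which fits in the 16 MB L3. The snippet is meant to replace the timing loop in the question's main() and reuses its arrays and the quat_mult_vec kernel:

  // Hypothetical in-cache variant of the timing loop.
  // 100000 elements * 10 double arrays * 8 bytes ≈ 8 MB, which fits in the 16 MB L3 cache.
  const size_t small_N = 100000;
  const size_t repeats = 100;   // repeat so the total amount of work stays comparable
  auto start = std::chrono::high_resolution_clock::now();
  for(size_t r = 0; r < repeats; ++r){
    for(size_t head = 0; head < small_N; head += n_batch){
      quat_mult_vec(qx + head, qy + head, qz + head, qw + head,
                    x + head, y + head, z + head,
                    output_x + head, output_y + head, output_z + head);
    }
  }
  auto end = std::chrono::high_resolution_clock::now();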
The reason why AVX512 stays so close to AVX2 even in a short test is probably the CPU microarchitecture. According to uops.info, on Zen 4 CPUs the throughput of vector addition and multiplication with 32-byte AVX vectors is twice that of 64-byte AVX512 vectors: a 512-bit instruction processes twice the data at half the instruction rate, so the net data throughput per cycle is about the same.
A couple of possible micro-optimizations in your code:
You can compute v * 2.0 as v + v; this gets rid of the 2.0 constant vector.
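A minimal sketch of that change in the question's macro scheme, shown for one of the products (the other qXqY_2 terms would change the same way):

  // Instead of SIMD_MUL(SIMD_MUL(qx_vec, qy_vec), twos):
  auto qxqy   = SIMD_MUL(qx_vec, qy_vec);
  auto qxqy_2 = SIMD_ADD(qxqy, qxqy);   // same value as multiplying by 2.0, no 'twos' constant needed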
For AVX and AVX512 targets, you can compute the expression ( a * b ) + ( c * d ) + ( e * f ) as fma( e, f, fma( c, d, a * b ) ); this reduces the instruction count for that expression from 5 to 3, i.e. by 40%.
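As a sketch, assuming a SIMD_FMADD macro is added next to the existing ones (FMA computes a*b + c in one instruction, and -mfma is already passed for the AVX and AVX512 builds):

  #ifdef __AVX512F__
    #define SIMD_FMADD _mm512_fmadd_pd   // a*b + c
  #elif defined(__AVX2__)
    #define SIMD_FMADD _mm256_fmadd_pd   // a*b + c, requires -mfma
  #endif

  // output_x = M00*x + M01*y + M02*z with one multiply and two FMAs instead of 3 MUL + 2 ADD
  auto output_x_vec = SIMD_FMADD(M02, z_vec, SIMD_FMADD(M01, y_vec, SIMD_MUL(M00, x_vec)));

The output_y and output_z rows can be rewritten the same way.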