#include <Eigen/Dense>
#include <iostream>
#include <random>
#include <chrono>
int main()
{
const int num_points = 300000000;
Eigen::Matrix<float, Eigen::Dynamic, Eigen::Dynamic> init_points(num_points, 3);
Eigen::Matrix<float, Eigen::Dynamic, Eigen::Dynamic> transformed_points(num_points, 3);
std::mt19937 rng(42);
std::uniform_real_distribution<float> dist(-100.0f, 100.0f);
for (int i = 0; i < num_points; ++i)
{
init_points(i, 0) = dist(rng); // x
init_points(i, 1) = dist(rng); // y
init_points(i, 2) = dist(rng); // z
}
float theta = 3.14159265358 / 4; // pi/4
Eigen::Matrix3f rotation;
rotation = Eigen::AngleAxisf(theta, Eigen::Vector3f::UnitZ());
Eigen::Vector3f translation(10.0f, 20.0f, 30.0f);
auto start_time = std::chrono::high_resolution_clock::now();
//transformed_points = init_points * rotation; //uncomment this line to use the Matrix multiply version
//transformed_points += translation. Transpose();//uncomment this line to use the Matrix multiply version
for (int i = 0; i < num_points; ++i) //comment this for loop to use the Matrix multiply version
{
Eigen::Vector3f v = init_points.row(i).transpose();
v = rotation * v;
v += translation;
transformed_points.row(i) = v.transpose();
}
auto end_time = std::chrono::high_resolution_clock::now();
std::chrono::duration<double, std::milli> duration_ms = end_time - start_time;
std::cout << "total consume: " << duration_ms.count() << "ms" << std::endl;
std::cout << "first 5 points:(x,y,z)" << std::endl;
for (int i = 0; i < 5; ++i)
{
std::cout << "("<< transformed_points(i, 0) << ","<< transformed_points(i, 1) << ", " << transformed_points(i, 2) << ")" << std::endl;
}
return 0;
}
I am currently performing a coordinate transformation on an object obtained through point cloud sampling. I need to rotate a set of points first, and then translate them to obtain a transformed point cloud.
The point cloud contains 300,000,000 points in total, and I store it in Eigen's dynamically sized matrices (one point per row, with columns x, y, z).
The code above is a test example where I initialize 300,000,000 points with random numbers and then perform a rotation and translation operation. This code includes both a for-loop traversal version and a matrix multiplication version. To use matrix multiplication, you just need to uncomment the two lines I’ve commented out and comment out the for-loop part.
Here is a comparison of the time consumed by the for-loop traversal and matrix multiplication versions:
1. Matrix version: I multiply all 300,000,000 points by the rotation matrix in a single expression and then add the translation vector. This takes about 2044 ms on my i7-13700K CPU.
2. Loop version: I iterate over the points with a for-loop and apply the rotation and translation to each point individually. The total time is only about 600 ms.
I know that Eigen uses many CPU instruction-set extensions to optimize matrix multiplication, and I am compiling with the Intel ICC compiler with SIMD and AVX optimizations enabled. Why is the for-loop traversal about 3 times faster than the matrix multiplication here?
Could someone help analyze this? I would greatly appreciate it.
1