I am working on an MPI implementation of the Cooley-Tukey FFT algorithm in C++. The implementation is supposed to distribute the computation of the FFT across multiple processes. It works correctly with a single rank but fails to produce the correct results when executed with two or more ranks. I believe the issue might be related to how data dependencies are handled between the FFT stages when data is split among different processes.
void cooley_tukey_fft(vector<complex<double>>& x, bool inverse) {
int N = x.size();
for (int i = 1, j = N / 2; i < N - 1; i++) {
if (i < j) {
swap(x[i], x[j]);
}
int k = N / 2;
while (k <= j) {
j -= k;
k /= 2;
}
j += k;
}
double sign = (inverse) ? 1.0 : -1.0;
for (int s = 1; s <= log2(N); s++) {
int m = 1 << s;
complex<double> omega_m = exp(complex<double>(0, sign * 2.0 * PI / m));
for (int k = 0; k < N; k += m) {
complex<double> omega = 1.0;
for (int j = 0; j < m / 2; j++) {
complex<double> t = omega * x[k + j + m / 2];
complex<double> u = x[k + j];
x[k + j] = u + t;
x[k + j + m / 2] = u - t;
omega *= omega_m;
}
}
}
}
int main(int argc, char* argv[]) {
MPI_Init(&argc, &argv);
int rank, size;
MPI_Comm_rank(MPI_COMM_WORLD, &rank);
MPI_Comm_size(MPI_COMM_WORLD, &size);
// Hardcoded data for all processes (replicated)
vector<complex<double>> data = {
{88.1033, 45.955},
{12.194, 72.0208},
{97.1567, 18.006},
{51.3203, 99.5343},
{98.0407, 57.5992},
{70.6577, 20.4711},
{44.7407, 84.487},
{20.2791, 39.3583}
};
int count = data.size();
// Calculate the local size for each process
int local_n = count / size;
vector<complex<double>> local_data(local_n);
// Scatter the data to all processes
MPI_Scatter(data.data(), local_n * sizeof(complex<double>), MPI_BYTE,
local_data.data(), local_n * sizeof(complex<double>), MPI_BYTE, 0, MPI_COMM_WORLD);
// Local FFT computation
cooley_tukey_fft(local_data, false);
// Gather the results back to the root process
vector<complex<double>> result;
if (rank == 0) {
result.resize(count);
}
MPI_Gather(local_data.data(), local_n * sizeof(complex<double>), MPI_BYTE,
result.data(), local_n * sizeof(complex<double>), MPI_BYTE, 0, MPI_COMM_WORLD);
// Output the results from the root process
if (rank == 0) {
cout << "FFT Result:" << endl;
for (const auto& c : result) {
cout << c << endl;
}
}
MPI_Finalize();
return 0;
}