I’m trying to use the Global Arrays (GA) library with MPI in C++, since it allows a large variable to be defined once globally and then accessed by all MPI processes. So I wrote a small program that does the following math:
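In index notation, it contracts the last index of a rank-3 tensor with a matrix:

result(i, j, k) = sum over l of tensor(i, j, l) * matrix(l, k)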
The following function produces the correct result using 1 process:
Tensor<double, 3> tensorxmatrix(const Tensor<double, 3> &tensor, const MatrixXd &matrix)
{
    int pid;
    MPI_Comm_rank(MPI_COMM_WORLD, &pid);
    Tensor<double, 3> result(tensor.dimension(0), tensor.dimension(1), tensor.dimension(2));
    result.setZero();
    for (int i = 0; i < tensor.dimension(0); i++)
    {
        for (int j = 0; j < tensor.dimension(1); j++)
        {
            for (int k = 0; k < tensor.dimension(2); k++)
            {
                for (int l = 0; l < matrix.rows(); l++)
                {
                    result(i, j, k) += tensor(i, j, l) * matrix(l, k);
                }
            }
        }
    }
    return result;
}
Here’s the GA-MPI version I wrote, which is supposed to produce the same result as above:
Tensor<double, 3> vectorisedtensorxmatrix_gampi(const Tensor<double, 3> &tensor, const MatrixXd &matrix)
{
    const auto &dim0 = tensor.dimension(0);
    const auto &dim1 = tensor.dimension(1);
    const auto &dim2 = tensor.dimension(2);
    int me = GA_Nodeid();
    int nproc = GA_Nnodes();
    vector<double> vec_tensor = ConvertTensorToVector(tensor);
    const int num_total_tasks = dim0 * dim1;
    const int num_tasks_per_proc = num_total_tasks / nproc;
    const int remainder = num_total_tasks % nproc;
    int tensor_size = dim0 * dim1 * dim2;
    int g_vector = NGA_Create(C_DBL, 1, &tensor_size, "g_vector", NULL);
    int g_vec_result_tensor = NGA_Create(C_DBL, 1, &tensor_size, "g_vec_result_tensor", NULL);
    if (me == 0)
    {
        int lo = 0;
        int hi = dim0 * dim1 * dim2 - 1;
        NGA_Put(g_vector, &lo, &hi, vec_tensor.data(), NULL);
    }
    GA_Zero(g_vec_result_tensor);
    GA_Sync();
    const int start_task = me * num_tasks_per_proc;
    const int end_task = (me == nproc - 1) ? (start_task + num_tasks_per_proc + remainder - 1) : (start_task + num_tasks_per_proc - 1);
    for (int taskID = start_task; taskID <= end_task; ++taskID)
    {
        // int i = taskID / dim1;
        // int j = taskID % dim1;
        const double *vec;
        int lo = taskID * dim2;
        int hi = (taskID + 1) * dim2 - 1;
        if (me != 0)
        {
            cout << "Process " << me << " access " << lo << ":" << hi << endl;
        }
        NGA_Access(g_vector, &lo, &hi, &vec, NULL);
        if (me != 0)
        {
            cout << " Process " << me << " GNA_Access check" << endl;
        }
        Map<const VectorXd> vec_map(vec, dim2);
        VectorXd vec_result = vec_map.transpose() * matrix;
        NGA_Put(g_vec_result_tensor, &lo, &hi, vec_result.data(), NULL);
        // cout << "Process " << me << " GNA_Put check" << endl;
        NGA_Release(g_vector, &lo, &hi);
    }
    GA_Sync();
    MPI_Barrier(MPI_COMM_WORLD);
    vector<double> vec_result_tensor(tensor_size);
    int lo = 0;
    int hi = tensor_size - 1;
    NGA_Get(g_vec_result_tensor, &lo, &hi, vec_result_tensor.data(), NULL);
    Tensor<double, 3> global_results = ConvertVectorToTensor(vec_result_tensor, dim0, dim1, dim2);
    return global_results;
}
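For reference, everything is driven from a main along these lines (a simplified sketch, not the exact code; the real driver also times the calls and prints the memory estimate that appears in the output below):

#include <iostream>
#include <mpi.h>
#include <Eigen/Dense>
#include <unsupported/Eigen/CXX11/Tensor>
#include "ga.h"
#include "macdecls.h"

using namespace std;
using namespace Eigen;

int main(int argc, char **argv)
{
    MPI_Init(&argc, &argv);
    GA_Initialize();
    // (depending on the GA build, MA_init(...) may also be needed here)

    if (GA_Nodeid() == 0)
    {
        cout << "Global Arrays initialized with " << GA_Nnodes() << " processes." << endl;
    }

    Tensor<double, 3> tensor(10, 10, 10);
    MatrixXd matrix(10, 10);
    tensor.setRandom();
    matrix.setRandom();

    // Serial reference result, then the GA-MPI version that should match it.
    Tensor<double, 3> reference = tensorxmatrix(tensor, matrix);
    Tensor<double, 3> parallel = vectorisedtensorxmatrix_gampi(tensor, matrix);

    GA_Terminate();
    MPI_Finalize();
    return 0;
}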
The code generally works; however, it fails when the program is run with 3 MPI processes and a Tensor<double, 3>(10, 10, 10). Below is the program’s output:
Global Arrays initialized with 3 processes.
Minimum memory required: 4.91738e-05 GB
Time taken by tensorxmatrix: 0.00303842 seconds
Process 1 access 330:339
Process 2 access 660:669
1:check subscript failed:331 not in (335:667) dim=0:Received an Error in Communication
2:check subscript failed:661 not in (668:1000) dim=0:Received an Error in Communication
--------------------------------------------------------------------------
MPI_ABORT was invoked on rank 1 in communicator MPI COMMUNICATOR 3 DUP FROM 0
with errorcode -1.
NOTE: invoking MPI_ABORT causes Open MPI to kill all MPI processes.
You may or may not see output from other processes, depending on
exactly when Open MPI kills them.
--------------------------------------------------------------------------
[Dell-Precision-7920-Tower:3425473] 1 more process has sent help message help-mpi-api.txt / mpi-abort
[Dell-Precision-7920-Tower:3425473] Set MCA parameter "orte_base_help_aggregate" to 0 to see all help / error messages
Note that lo and hi for processes 1 and 2 are [330:339] and [660:669] on their first loop iterations (taskID = 33 and 66 respectively), as shown in the printout. Yet the line NGA_Access(g_vector, &lo, &hi, &vec, NULL); still appears to be accessing the wrong part of g_vector somehow?
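In case it helps with diagnosing this, I was thinking of printing the block of g_vector that each process actually owns, right before the NGA_Access call (a minimal sketch, assuming NGA_Distribution reports the range held by a given process, which is my understanding of the API):

// Diagnostic sketch (would go just before NGA_Access inside the task loop):
// print the block of g_vector owned by this process next to the requested range.
int owned_lo, owned_hi;
NGA_Distribution(g_vector, me, &owned_lo, &owned_hi);
cout << "Process " << me << " owns g_vector[" << owned_lo << ":" << owned_hi
     << "], requested " << lo << ":" << hi << endl;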
I’d appreciate any insight, and any suggestions on how to improve the code’s efficiency are also welcome!
Also, in case it is useful, here are the conversion helpers:
Tensor<double, 3> ConvertVectorToTensor(const vector<double> &vector, const int &dim0, const int &dim1, const int &dim2)
{
    Tensor<double, 3> result_tensor(dim0, dim1, dim2);
    for (int i = 0; i < dim0; ++i)
    {
        for (int j = 0; j < dim1; ++j)
        {
            for (int k = 0; k < dim2; ++k)
            {
                result_tensor(i, j, k) = vector[i * dim1 * dim2 + j * dim2 + k];
            }
        }
    }
    return result_tensor;
}
vector<double> ConvertTensorToVector(const Tensor<double, 3> &tensor)
{
    const auto &dim0 = tensor.dimension(0);
    const auto &dim1 = tensor.dimension(1);
    const auto &dim2 = tensor.dimension(2);
    vector<double> vec(tensor.size());
    for (int i = 0; i < dim0; ++i)
    {
        for (int j = 0; j < dim1; ++j)
        {
            for (int k = 0; k < dim2; ++k)
            {
                vec[i * dim1 * dim2 + j * dim2 + k] = tensor(i, j, k);
            }
        }
    }
    return vec;
}
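The two helpers are meant to be exact inverses of each other, with element (i, j, k) stored at flat index i*dim1*dim2 + j*dim2 + k. For reference, a round-trip check along these lines (a hypothetical test snippet, not part of the program) shows the property they are meant to satisfy:

// Hypothetical sanity check: flatten a tensor, rebuild it, and compare element-wise.
Tensor<double, 3> t(4, 5, 6);
t.setRandom();
vector<double> flat = ConvertTensorToVector(t);
Tensor<double, 3> back = ConvertVectorToTensor(flat, 4, 5, 6);
bool ok = true;
for (int i = 0; i < 4; ++i)
    for (int j = 0; j < 5; ++j)
        for (int k = 0; k < 6; ++k)
            ok = ok && (back(i, j, k) == t(i, j, k));
cout << (ok ? "round-trip OK" : "round-trip mismatch") << endl;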