I’m trying to use the Global Arrays (GA) library with MPI in C++, since it allows a large variable to be defined once globally and then accessed by all MPI processes. So I wrote a small program that does the following math:
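In index notation, it contracts the last index of a rank-3 tensor with a matrix:

result(i, j, k) = sum over l of tensor(i, j, l) * matrix(l, k)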
The following function produces the correct result using 1 process:
Tensor<double, 3> tensorxmatrix(const Tensor<double, 3> &tensor, const MatrixXd &matrix)
{
    int pid;
    MPI_Comm_rank(MPI_COMM_WORLD, &pid);
    Tensor<double, 3> result(tensor.dimension(0), tensor.dimension(1), tensor.dimension(2));
    result.setZero();
    for (int i = 0; i < tensor.dimension(0); i++)
    {
        for (int j = 0; j < tensor.dimension(1); j++)
        {
            for (int k = 0; k < tensor.dimension(2); k++)
            {
                for (int l = 0; l < matrix.rows(); l++)
                {
                    result(i, j, k) += tensor(i, j, l) * matrix(l, k);
                }
            }
        }
    }
    return result;
}
Here’s the GA-MPI version I wrote, which is supposed to produce the same result as above:
Tensor<double, 3> vectorisedtensorxmatrix_gampi(const Tensor<double, 3> &tensor, const MatrixXd &matrix)
{
    const auto &dim0 = tensor.dimension(0);
    const auto &dim1 = tensor.dimension(1);
    const auto &dim2 = tensor.dimension(2);
    int me = GA_Nodeid();
    int nproc = GA_Nnodes();
    vector<double> vec_tensor = ConvertTensorToVector(tensor);
    const int num_total_tasks = dim0 * dim1;
    const int num_tasks_per_proc = num_total_tasks / nproc;
    const int remainder = num_total_tasks % nproc;
    int tensor_size = dim0 * dim1 * dim2;
    int g_vector = NGA_Create(C_DBL, 1, &tensor_size, "g_vector", NULL);
    int g_vec_result_tensor = NGA_Create(C_DBL, 1, &tensor_size, "g_vec_result_tensor", NULL);
    if (me == 0)
    {
        int lo = 0;
        int hi = dim0 * dim1 * dim2 - 1;
        NGA_Put(g_vector, &lo, &hi, vec_tensor.data(), NULL);
    }
    GA_Zero(g_vec_result_tensor);
    GA_Sync();
    const int start_task = me * num_tasks_per_proc;
    const int end_task = (me == nproc - 1) ? (start_task + num_tasks_per_proc + remainder - 1) : (start_task + num_tasks_per_proc - 1);
    for (int taskID = start_task; taskID <= end_task; ++taskID)
    {
        // int i = taskID / dim1;
        // int j = taskID % dim1;
        const double *vec;
        int lo = taskID * dim2;
        int hi = (taskID + 1) * dim2 - 1;
        if (me != 0)
        {
            cout << "Process " << me << " access " << lo << ":" << hi << endl;
        }
        NGA_Access(g_vector, &lo, &hi, &vec, NULL);
        if (me != 0)
        {
            cout << " Process " << me << " GNA_Access check" << endl;
        }
        Map<const VectorXd> vec_map(vec, dim2);
        VectorXd vec_result = vec_map.transpose() * matrix;
        NGA_Put(g_vec_result_tensor, &lo, &hi, vec_result.data(), NULL);
        // cout << "Process " << me << " GNA_Put check" << endl;
        NGA_Release(g_vector, &lo, &hi);
    }
    GA_Sync();
    MPI_Barrier(MPI_COMM_WORLD);
    vector<double> vec_result_tensor(tensor_size);
    int lo = 0;
    int hi = tensor_size - 1;
    NGA_Get(g_vec_result_tensor, &lo, &hi, vec_result_tensor.data(), NULL);
    Tensor<double, 3> global_results = ConvertVectorToTensor(vec_result_tensor, dim0, dim1, dim2);
    return global_results;
}
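For reference, everything is driven from a main along these lines (a simplified sketch, not the exact code; the real driver also times the calls and prints the memory estimate that appears in the output below):

#include <iostream>
#include <mpi.h>
#include <Eigen/Dense>
#include <unsupported/Eigen/CXX11/Tensor>
#include "ga.h"
#include "macdecls.h"

using namespace std;
using namespace Eigen;

int main(int argc, char **argv)
{
    MPI_Init(&argc, &argv);
    GA_Initialize();
    // (depending on the GA build, MA_init(...) may also be needed here)

    if (GA_Nodeid() == 0)
    {
        cout << "Global Arrays initialized with " << GA_Nnodes() << " processes." << endl;
    }

    Tensor<double, 3> tensor(10, 10, 10);
    MatrixXd matrix(10, 10);
    tensor.setRandom();
    matrix.setRandom();

    // Serial reference result, then the GA-MPI version that should match it.
    Tensor<double, 3> reference = tensorxmatrix(tensor, matrix);
    Tensor<double, 3> parallel = vectorisedtensorxmatrix_gampi(tensor, matrix);

    GA_Terminate();
    MPI_Finalize();
    return 0;
}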
The code generally works; however, it fails when the program is run with 3 MPI processes and a Tensor<double, 3>(10, 10, 10). Below is the program’s output:
Global Arrays initialized with 3 processes.
Minimum memory required: 4.91738e-05 GB
Time taken by tensorxmatrix: 0.00303842 seconds
Process 1 access 330:339
Process 2 access 660:669
1:check subscript failed:331 not in (335:667) dim=0:Received an Error in Communication
2:check subscript failed:661 not in (668:1000) dim=0:Received an Error in Communication
--------------------------------------------------------------------------
MPI_ABORT was invoked on rank 1 in communicator MPI COMMUNICATOR 3 DUP FROM 0
with errorcode -1.
NOTE: invoking MPI_ABORT causes Open MPI to kill all MPI processes.
You may or may not see output from other processes, depending on
exactly when Open MPI kills them.
--------------------------------------------------------------------------
[Dell-Precision-7920-Tower:3425473] 1 more process has sent help message help-mpi-api.txt / mpi-abort
[Dell-Precision-7920-Tower:3425473] Set MCA parameter "orte_base_help_aggregate" to 0 to see all help / error messages
Note that lo and hi for processes 1 and 2 are [330:339] and [660:669] on their first loop iterations (taskID = 33 and 66 respectively), as shown in the printout. Yet the line NGA_Access(g_vector, &lo, &hi, &vec, NULL); still appears to be accessing the wrong part of g_vector somehow?
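In case it helps with diagnosing this, I was thinking of printing the block of g_vector that each process actually owns, right before the NGA_Access call (a minimal sketch, assuming NGA_Distribution reports the range held by a given process, which is my understanding of the API):

// Diagnostic sketch (would go just before NGA_Access inside the task loop):
// print the block of g_vector owned by this process next to the requested range.
int owned_lo, owned_hi;
NGA_Distribution(g_vector, me, &owned_lo, &owned_hi);
cout << "Process " << me << " owns g_vector[" << owned_lo << ":" << owned_hi
     << "], requested " << lo << ":" << hi << endl;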
I’d appreciate any insight, and any suggestions on how to improve the code’s efficiency are also welcome!
Also, in case it is useful, here are the conversion helpers:
Tensor<double, 3> ConvertVectorToTensor(const vector<double> &vector, const int &dim0, const int &dim1, const int &dim2)
{
    Tensor<double, 3> result_tensor(dim0, dim1, dim2);
    for (int i = 0; i < dim0; ++i)
    {
        for (int j = 0; j < dim1; ++j)
        {
            for (int k = 0; k < dim2; ++k)
            {
                result_tensor(i, j, k) = vector[i * dim1 * dim2 + j * dim2 + k];
            }
        }
    }
    return result_tensor;
}
vector<double> ConvertTensorToVector(const Tensor<double, 3> &tensor)
{
    const auto &dim0 = tensor.dimension(0);
    const auto &dim1 = tensor.dimension(1);
    const auto &dim2 = tensor.dimension(2);
    vector<double> vec(tensor.size());
    for (int i = 0; i < dim0; ++i)
    {
        for (int j = 0; j < dim1; ++j)
        {
            for (int k = 0; k < dim2; ++k)
            {
                vec[i * dim1 * dim2 + j * dim2 + k] = tensor(i, j, k);
            }
        }
    }
    return vec;
}
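The two helpers are meant to be exact inverses of each other, with element (i, j, k) stored at flat index i*dim1*dim2 + j*dim2 + k. For reference, a round-trip check along these lines (a hypothetical test snippet, not part of the program) shows the property they are meant to satisfy:

// Hypothetical sanity check: flatten a tensor, rebuild it, and compare element-wise.
Tensor<double, 3> t(4, 5, 6);
t.setRandom();
vector<double> flat = ConvertTensorToVector(t);
Tensor<double, 3> back = ConvertVectorToTensor(flat, 4, 5, 6);
bool ok = true;
for (int i = 0; i < 4; ++i)
    for (int j = 0; j < 5; ++j)
        for (int k = 0; k < 6; ++k)
            ok = ok && (back(i, j, k) == t(i, j, k));
cout << (ok ? "round-trip OK" : "round-trip mismatch") << endl;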