I’m running a Slurm controller with a couple of GPU nodes. All nodes share the $HOME directory, where Intel MPI is installed. A simple MPI program that allocates GPU memory with cudaMalloc() works correctly when launched directly with mpirun, but fails with srun / sbatch.
The MPI program (mpiCuda.c) looks like this:
#include <mpi.h>
#include <stdio.h>
#include <stdlib.h>
#include <unistd.h>
#include <cuda_runtime.h>
// Function to check for CUDA errors and print detailed information
#define CHECK_CUDA(call, message)                                              \
{                                                                              \
    const cudaError_t error = call;                                            \
    int rank;                                                                  \
    MPI_Comm_rank(MPI_COMM_WORLD, &rank);                                      \
    char hostname[256];                                                        \
    gethostname(hostname, 256);                                                \
    if (error != cudaSuccess) {                                                \
        fprintf(stderr, "Error: %s:%d, ", __FILE__, __LINE__);                 \
        fprintf(stderr, "Rank: %d, Hostname: %s, ", rank, hostname);           \
        fprintf(stderr, "code: %d, reason: %s\n", error, cudaGetErrorString(error)); \
        MPI_Abort(MPI_COMM_WORLD, error);                                      \
    } else {                                                                   \
        printf("Success: %s:%d, %s, Rank: %d, Hostname: %s\n", __FILE__, __LINE__, message, rank, hostname); \
    }                                                                          \
}

int main(int argc, char *argv[]) {
    int rank, size;
    char hostname[256];

    // Initialize MPI
    MPI_Init(&argc, &argv);

    // Get the rank and size
    MPI_Comm_rank(MPI_COMM_WORLD, &rank);
    MPI_Comm_size(MPI_COMM_WORLD, &size);

    // Get the hostname
    gethostname(hostname, 256);

    // Allocate memory on the GPU
    size_t bytes = 1024 * 1024; // 1 MB of memory
    void *d_memory;
    CHECK_CUDA(cudaMalloc(&d_memory, bytes), "cudaMalloc");

    // Free the allocated GPU memory
    cudaFree(d_memory);

    // Finalize MPI
    MPI_Finalize();
    return 0;
}
Since the controller doesn't have a GPU, I log in to one of the GPU compute nodes and compile the program like this:
source /opt/intel/mpi/latest/env/vars.sh
mpicc -o mpiCuda mpiCuda.c -L/usr/local/cuda/lib64 -lcudart
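(For completeness, the CUDA include path can also be given explicitly; the -I path below is an assumption about the install location and is redundant if cuda_runtime.h is already on the default search path.)
source /opt/intel/mpi/latest/env/vars.sh
mpicc -o mpiCuda mpiCuda.c -I/usr/local/cuda/include -L/usr/local/cuda/lib64 -lcudart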
I then log in to the controller and run it on the two compute nodes like this:
source /opt/intel/mpi/latest/env/vars.sh
mpirun -np 8 -host c04-q03-0,c04-q03-1 ./mpiCuda
which works correctly and gives the following output:
Success: mpiCuda.c:45, cudaMalloc, Rank: 7, Hostname: c04-q03-1
Success: mpiCuda.c:45, cudaMalloc, Rank: 5, Hostname: c04-q03-1
Success: mpiCuda.c:45, cudaMalloc, Rank: 4, Hostname: c04-q03-1
Success: mpiCuda.c:45, cudaMalloc, Rank: 6, Hostname: c04-q03-1
Success: mpiCuda.c:45, cudaMalloc, Rank: 3, Hostname: c04-q03-0
Success: mpiCuda.c:45, cudaMalloc, Rank: 2, Hostname: c04-q03-0
Success: mpiCuda.c:45, cudaMalloc, Rank: 1, Hostname: c04-q03-0
Success: mpiCuda.c:45, cudaMalloc, Rank: 0, Hostname: c04-q03-0
Yet, on the controller, when I use srun it gives the following error:
export SLURM_MPI_TYPE=pmi2
srun -p gpu --nodes=2 --ntasks-per-node=2 ./mpiCuda
Error: mpiCuda.c:45, Rank: 3, Hostname: c04-q03-1, code: 999, reason: unknown error
Abort(999) on node 3 (rank 3 in comm 0): application called MPI_Abort(MPI_COMM_WORLD, 999) - process 3
Error: mpiCuda.c:45, Rank: 1, Hostname: c04-q03-0, code: 999, reason: unknown error
Error: mpiCuda.c:45, Rank: 2, Hostname: c04-q03-1, code: 999, reason: unknown error
Abort(999) on node 1 (rank 1 in comm 0): application called MPI_Abort(MPI_COMM_WORLD, 999) - process 1
Abort(999) on node 2 (rank 2 in comm 0): application called MPI_Abort(MPI_COMM_WORLD, 999) - process 2
Error: mpiCuda.c:45, Rank: 0, Hostname: c04-q03-0, code: 999, reason: unknown error
Abort(999) on node 0 (rank 0 in comm 0): application called MPI_Abort(MPI_COMM_WORLD, 999) - process 0
srun: Job step aborted: Waiting up to 32 seconds for job step to finish.
slurmstepd: error: *** STEP 36.0 ON c04-q03-0 CANCELLED AT 2024-07-20T15:58:14 ***
srun: error: c04-q03-1: tasks 2-3: Killed
srun: error: c04-q03-0: tasks 0-1: Killed
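One thing I still want to check is whether the GPUs are visible at all inside the srun step, for example with something like this (a sketch; it assumes nvidia-smi is on the PATH of the compute nodes):
srun -p gpu --nodes=2 --ntasks-per-node=1 bash -c 'hostname; echo "CUDA_VISIBLE_DEVICES=${CUDA_VISIBLE_DEVICES:-unset}"; nvidia-smi -L'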
I also tried using a batch script:
#!/bin/bash
#SBATCH --time=00-00:40:00
#SBATCH --output=output.log
#SBATCH --error=output.log
#SBATCH --nodes=2
#SBATCH --ntasks-per-node=2
#SBATCH --partition=gpu
ldd ./mpiCuda
mpirun ./mpiCuda
but it also produces a similar error:
linux-vdso.so.1 (0x00007fff1ac65000)
libcudart.so.12 => /usr/local/cuda/lib64/libcudart.so.12 (0x000014a1804f3000)
libmpifort.so.12 => /opt/intel/mpi/2021.11/lib/libmpifort.so.12 (0x000014a18013c000)
libmpi.so.12 => /opt/intel/mpi/2021.11/lib/libmpi.so.12 (0x000014a17e604000)
librt.so.1 => /lib64/librt.so.1 (0x000014a17e3fc000)
libpthread.so.0 => /lib64/libpthread.so.0 (0x000014a17e1dc000)
libdl.so.2 => /lib64/libdl.so.2 (0x000014a17dfd8000)
libc.so.6 => /lib64/libc.so.6 (0x000014a17dc02000)
/lib64/ld-linux-x86-64.so.2 (0x000014a1807a3000)
libm.so.6 => /lib64/libm.so.6 (0x000014a17d880000)
Error: mpiCuda.c:45, Rank: 2, Hostname: c04-q03-1, code: 999, reason: unknown error
Abort(999) on node 2 (rank 2 in comm 0): application called MPI_Abort(MPI_COMM_WORLD, 999) - process 2
Error: mpiCuda.c:45, Rank: 3, Hostname: c04-q03-1, code: 999, reason: unknown error
Abort(999) on node 3 (rank 3 in comm 0): application called MPI_Abort(MPI_COMM_WORLD, 999) - process 3
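To narrow down whether the failure is on the CUDA side or the MPI side under Slurm, I am also considering a minimal CUDA-only test without MPI (a sketch; the file name cudaOnly.cu and the nvcc compile step are only for illustration):
#include <stdio.h>
#include <cuda_runtime.h>

int main(void) {
    void *d_memory = NULL;
    // Same 1 MB allocation as the MPI program, but without any MPI calls
    cudaError_t error = cudaMalloc(&d_memory, 1024 * 1024);
    if (error != cudaSuccess) {
        fprintf(stderr, "cudaMalloc failed: code %d, reason: %s\n", error, cudaGetErrorString(error));
        return 1;
    }
    printf("cudaMalloc succeeded\n");
    cudaFree(d_memory);
    return 0;
}
compiled and launched with, for example:
nvcc -o cudaOnly cudaOnly.cu
srun -p gpu --nodes=2 --ntasks-per-node=1 ./cudaOnly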
What am I missing here?