I’m running a Slurm controller with a couple of GPU nodes. All nodes share the $HOME directory, where Intel MPI is installed. A simple MPI program that allocates GPU memory with cudaMalloc() works correctly when launched directly with mpirun, but fails with srun / sbatch.
The MPI program (mpiCuda.c) looks like this:
#include <mpi.h>
#include <stdio.h>
#include <stdlib.h>
#include <unistd.h>
#include <cuda_runtime.h>
// Function to check for CUDA errors and print detailed information
#define CHECK_CUDA(call, message)                                              \
{                                                                              \
    const cudaError_t error = call;                                            \
    int rank;                                                                  \
    MPI_Comm_rank(MPI_COMM_WORLD, &rank);                                      \
    char hostname[256];                                                        \
    gethostname(hostname, 256);                                                \
    if (error != cudaSuccess) {                                                \
        fprintf(stderr, "Error: %s:%d, ", __FILE__, __LINE__);                 \
        fprintf(stderr, "Rank: %d, Hostname: %s, ", rank, hostname);           \
        fprintf(stderr, "code: %d, reason: %s\n", error, cudaGetErrorString(error)); \
        MPI_Abort(MPI_COMM_WORLD, error);                                      \
    } else {                                                                   \
        printf("Success: %s:%d, %s, Rank: %d, Hostname: %s\n", __FILE__, __LINE__, message, rank, hostname); \
    }                                                                          \
}

int main(int argc, char *argv[]) {
    int rank, size;
    char hostname[256];

    // Initialize MPI
    MPI_Init(&argc, &argv);

    // Get the rank and size
    MPI_Comm_rank(MPI_COMM_WORLD, &rank);
    MPI_Comm_size(MPI_COMM_WORLD, &size);

    // Get the hostname
    gethostname(hostname, 256);

    // Allocate memory on the GPU
    size_t bytes = 1024 * 1024; // 1 MB of memory
    void *d_memory;
    CHECK_CUDA(cudaMalloc(&d_memory, bytes), "cudaMalloc");

    // Free the allocated GPU memory
    cudaFree(d_memory);

    // Finalize MPI
    MPI_Finalize();
    return 0;
}
Since the controller doesn't have a GPU, I log in to one of the GPU compute nodes and compile the program like this:
source /opt/intel/mpi/latest/env/vars.sh
mpicc -o mpiCuda mpiCuda.c -L/usr/local/cuda/lib64 -lcudart
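(For completeness, the CUDA include path can also be given explicitly; the -I path below is an assumption about the install location and is redundant if cuda_runtime.h is already on the default search path.)
source /opt/intel/mpi/latest/env/vars.sh
mpicc -o mpiCuda mpiCuda.c -I/usr/local/cuda/include -L/usr/local/cuda/lib64 -lcudart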
I then log in to the controller and run it on the two compute nodes like this:
source /opt/intel/mpi/latest/env/vars.sh
mpirun -np 8 -host c04-q03-0,c04-q03-1 ./mpiCuda
which works correctly and gives the following output:
Success: mpiCuda.c:45, cudaMalloc, Rank: 7, Hostname: c04-q03-1
Success: mpiCuda.c:45, cudaMalloc, Rank: 5, Hostname: c04-q03-1
Success: mpiCuda.c:45, cudaMalloc, Rank: 4, Hostname: c04-q03-1
Success: mpiCuda.c:45, cudaMalloc, Rank: 6, Hostname: c04-q03-1
Success: mpiCuda.c:45, cudaMalloc, Rank: 3, Hostname: c04-q03-0
Success: mpiCuda.c:45, cudaMalloc, Rank: 2, Hostname: c04-q03-0
Success: mpiCuda.c:45, cudaMalloc, Rank: 1, Hostname: c04-q03-0
Success: mpiCuda.c:45, cudaMalloc, Rank: 0, Hostname: c04-q03-0
Yet, on the controller, when I use srun it gives the following error:
export SLURM_MPI_TYPE=pmi2
srun -p gpu --nodes=2 --ntasks-per-node=2 ./mpiCuda
Error: mpiCuda.c:45, Rank: 3, Hostname: c04-q03-1, code: 999, reason: unknown error
Abort(999) on node 3 (rank 3 in comm 0): application called MPI_Abort(MPI_COMM_WORLD, 999) - process 3
Error: mpiCuda.c:45, Rank: 1, Hostname: c04-q03-0, code: 999, reason: unknown error
Error: mpiCuda.c:45, Rank: 2, Hostname: c04-q03-1, code: 999, reason: unknown error
Abort(999) on node 1 (rank 1 in comm 0): application called MPI_Abort(MPI_COMM_WORLD, 999) - process 1
Abort(999) on node 2 (rank 2 in comm 0): application called MPI_Abort(MPI_COMM_WORLD, 999) - process 2
Error: mpiCuda.c:45, Rank: 0, Hostname: c04-q03-0, code: 999, reason: unknown error
Abort(999) on node 0 (rank 0 in comm 0): application called MPI_Abort(MPI_COMM_WORLD, 999) - process 0
srun: Job step aborted: Waiting up to 32 seconds for job step to finish.
slurmstepd: error: *** STEP 36.0 ON c04-q03-0 CANCELLED AT 2024-07-20T15:58:14 ***
srun: error: c04-q03-1: tasks 2-3: Killed
srun: error: c04-q03-0: tasks 0-1: Killed
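One thing I still want to check is whether the GPUs are visible at all inside the srun step, for example with something like this (a sketch; it assumes nvidia-smi is on the PATH of the compute nodes):
srun -p gpu --nodes=2 --ntasks-per-node=1 bash -c 'hostname; echo "CUDA_VISIBLE_DEVICES=${CUDA_VISIBLE_DEVICES:-unset}"; nvidia-smi -L'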
I also tried using a batch script:
#!/bin/bash
#SBATCH --time=00-00:40:00
#SBATCH --output=output.log
#SBATCH --error=output.log
#SBATCH --nodes=2
#SBATCH --ntasks-per-node=2
#SBATCH --partition=gpu
ldd ./mpiCuda
mpirun ./mpiCuda
but it also produces a similar error:
linux-vdso.so.1 (0x00007fff1ac65000)
libcudart.so.12 => /usr/local/cuda/lib64/libcudart.so.12 (0x000014a1804f3000)
libmpifort.so.12 => /opt/intel/mpi/2021.11/lib/libmpifort.so.12 (0x000014a18013c000)
libmpi.so.12 => /opt/intel/mpi/2021.11/lib/libmpi.so.12 (0x000014a17e604000)
librt.so.1 => /lib64/librt.so.1 (0x000014a17e3fc000)
libpthread.so.0 => /lib64/libpthread.so.0 (0x000014a17e1dc000)
libdl.so.2 => /lib64/libdl.so.2 (0x000014a17dfd8000)
libc.so.6 => /lib64/libc.so.6 (0x000014a17dc02000)
/lib64/ld-linux-x86-64.so.2 (0x000014a1807a3000)
libm.so.6 => /lib64/libm.so.6 (0x000014a17d880000)
Error: mpiCuda.c:45, Rank: 2, Hostname: c04-q03-1, code: 999, reason: unknown error
Abort(999) on node 2 (rank 2 in comm 0): application called MPI_Abort(MPI_COMM_WORLD, 999) - process 2
Error: mpiCuda.c:45, Rank: 3, Hostname: c04-q03-1, code: 999, reason: unknown error
Abort(999) on node 3 (rank 3 in comm 0): application called MPI_Abort(MPI_COMM_WORLD, 999) - process 3
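To narrow down whether the failure is on the CUDA side or the MPI side under Slurm, I am also considering a minimal CUDA-only test without MPI (a sketch; the file name cudaOnly.cu and the nvcc compile step are only for illustration):
#include <stdio.h>
#include <cuda_runtime.h>

int main(void) {
    void *d_memory = NULL;
    // Same 1 MB allocation as the MPI program, but without any MPI calls
    cudaError_t error = cudaMalloc(&d_memory, 1024 * 1024);
    if (error != cudaSuccess) {
        fprintf(stderr, "cudaMalloc failed: code %d, reason: %s\n", error, cudaGetErrorString(error));
        return 1;
    }
    printf("cudaMalloc succeeded\n");
    cudaFree(d_memory);
    return 0;
}
compiled and launched with, for example:
nvcc -o cudaOnly cudaOnly.cu
srun -p gpu --nodes=2 --ntasks-per-node=1 ./cudaOnly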
What am I missing here?