void matmul(float *A, float *B, float *C, int M, int N, int K,
int threads_per_process, int mpi_rank, int mpi_world_size) {
int q = (int)sqrt(mpi_world_size);
int block_size_M = M / q;
int block_size_N = N / q;
int block_size_K = K / q;
float *A_block = (float *)malloc(block_size_M * block_size_K * sizeof(float));
float *B_block = (float *)malloc(block_size_K * block_size_N * sizeof(float));
float *C_block = (float *)malloc(block_size_M * block_size_N * sizeof(float));
MPI_Datatype block_type_A, block_type_B;
MPI_Type_vector(block_size_M, block_size_K, K, MPI_FLOAT, &block_type_A);
MPI_Type_commit(&block_type_A);
MPI_Type_vector(block_size_K, block_size_N, N, MPI_FLOAT, &block_type_B);
MPI_Type_commit(&block_type_B);
if (mpi_rank == 0) {
printf("Rank %d: Starting MPI_Scatter for matrix A...n", mpi_rank);
}
MPI_Scatter(A, 1, block_type_A, A_block, block_size_M * block_size_K, MPI_FLOAT, 0, MPI_COMM_WORLD);
if (mpi_rank == 0) {
printf("Rank %d: MPI_Scatter for matrix A completed.n", mpi_rank);
}
if (mpi_rank == 0) {
printf("Rank %d: Starting MPI_Scatter for matrix B...n", mpi_rank);
}
MPI_Scatter(B, 1, block_type_B, B_block, block_size_K * block_size_N, MPI_FLOAT, 0, MPI_COMM_WORLD);
if (mpi_rank == 0) {
printf("Rank %d: MPI_Scatter for matrix B completed.n", mpi_rank);
}
for (int step = 0; step < q; ++step) {
int root = (mpi_rank / q) * q + (mpi_rank / q + step) % q;
printf("Rank %d: Broadcasting A_block, step %d, root %d...n", mpi_rank, step, root);
MPI_Bcast(A_block, block_size_M * block_size_K, MPI_FLOAT, root, MPI_COMM_WORLD);
printf("Rank %d: Broadcast completed, step %d, root %d.n", mpi_rank, step, root);
// Local multiplication with OpenMP
#pragma omp parallel for num_threads(threads_per_process) collapse(2)
for (int i = 0; i < block_size_M; ++i) {
for (int j = 0; j < block_size_N; ++j) {
float sum = 0.0;
for (int k = 0; k < block_size_K; ++k) {
sum += A_block[i * block_size_K + k] * B_block[k * block_size_N + j];
}
C_block[i * block_size_N + j] += sum;
}
}
MPI_Status status;
int send_to = (mpi_rank + q) % (q * q);
int recv_from = (mpi_rank - q + q * q) % (q * q);
printf("Rank %d: Rotating B_block, sending to %d, receiving from %d...n", mpi_rank, send_to, recv_from);
MPI_Sendrecv_replace(B_block, block_size_K * block_size_N, MPI_FLOAT, send_to, 0, recv_from, 0, MPI_COMM_WORLD, &status);
printf("Rank %d: Rotation completed.n", mpi_rank);
}
printf("Rank %d: Gathering results...n", mpi_rank);
MPI_Gather(C_block, block_size_M * block_size_N, MPI_FLOAT, C, 1, block_type_B, 0, MPI_COMM_WORLD);
printf("Rank %d: Results gathered.n", mpi_rank);
free(A_block);
free(B_block);
free(C_block);
MPI_Type_free(&block_type_A);
MPI_Type_free(&block_type_B);
}
Why does the MPI_Scatter call for matrix A generate a segmentation fault? I did the initialization in my main.c.
`> [rank 0] Initializing matrices…Done!
Rank 0: Starting MPI_Scatter for matrix A…
[a06:3241709:0:3241709] Caught signal 11 (Segmentation fault: invalid permissions for mapped object at address 0x7f0ac4021040)
backtrace (tid:3241709) ====
0 /lib/libucs.so.0(ucs_handle_error+0x2dc) [0x7f0ad40a0c1c]
1 /lib/libucs.so.0(+0x30dff) [0x7f0ad40a0dff]
2 /lib/libucs.so.0(+0x31134) [0x7f0ad40a1134]
3 /lib/x86_64-linux-gnu/libc.so.6(+0x18b963) [0x7f0ad7b66963]
4 /usr/local/lib/libopen-pal.so.40(opal_generic_simple_pack+0x3e7) [0x7f0ad785c807]
5 /usr/local/lib/openmpi/mca_pml_ucx.so(+0x86a5) [0x7f0ad5cf16a5]
6 /lib/libucp.so.0(ucp_dt_pack+0x87) [0x7f0ad412c417]
7 /lib/libucp.so.0(+0x84767) [0x7f0ad415c767]
8 /lib/ucx/libuct_ib.so.0(uct_rc_mlx5_ep_am_bcopy+0xb7) [0x7f0acff5d3e7]
9 /lib/libucp.so.0(ucp_rndv_progress_am_bcopy+0x1c3) [0x7f0ad415c993]
10 /lib/ucx/libuct_ib.so.0(uct_rc_ep_process_pending+0x10) [0x7f0acff54490]
11 /lib/libucs.so.0(ucs_arbiter_dispatch_nonempty+0xb6) [0x7f0ad4094a56]
12 /lib/ucx/libuct_ib.so.0(+0x41fee) [0x7f0acff62fee]
13 /lib/libucp.so.0(ucp_worker_progress+0x6a) [0x7f0ad412863a]
14 /usr/local/lib/openmpi/mca_pml_ucx.so(mca_pml_ucx_send+0x3b7) [0x7f0ad5ceefe7]
15 /usr/local/lib/libmpi.so.40(ompi_coll_base_scatter_intra_basic_linear+0xf0) [0x7f0ad7cc7db0]
16 /usr/local/lib/openmpi/mca_coll_tuned.so(ompi_coll_tuned_scatter_intra_dec_fixed+0x69) [0x7f0acf4fd8b9]
17 /usr/local/lib/libmpi.so.40(PMPI_Scatter+0x10d) [0x7f0ad7cab0bd]
18 ./main(+0x30cd) [0x5576687ec0cd]
19 ./main(+0x17d4) [0x5576687ea7d4]
20 /lib/x86_64-linux-gnu/libc.so.6(__libc_start_main+0xf3) [0x7f0ad79ff083]
21 ./main(+0x1b5e) [0x5576687eab5e]=================================
[a06:3241709] *** Process received signal ***
[a06:3241709] Signal: Segmentation fault (11)
[a06:3241709] Signal code: (-6)
[a06:3241709] Failing at address: 0x798c003176ed
[a06:3241709] [ 0] /lib/x86_64-linux-gnu/libc.so.6(+0x43090)[0x7f0ad7a1e090]
[a06:3241709] [ 1] /lib/x86_64-linux-gnu/libc.so.6(+0x18b963)[0x7f0ad7b66963]
[a06:3241709] [ 2] /usr/local/lib/libopen-pal.so.40(opal_generic_simple_pack+0x3e7)[0x7f0ad785c807]
[a06:3241709] [ 3] /usr/local/lib/openmpi/mca_pml_ucx.so(+0x86a5)[0x7f0ad5cf16a5]
[a06:3241709] [ 4] /lib/libucp.so.0(ucp_dt_pack+0x87)[0x7f0ad412c417]
[a06:3241709] [ 5] /lib/libucp.so.0(+0x84767)[0x7f0ad415c767]
[a06:3241709] [ 6] /lib/ucx/libuct_ib.so.0(uct_rc_mlx5_ep_am_bcopy+0xb7)[0x7f0acff5d3e7]
[a06:3241709] [ 7] /lib/libucp.so.0(ucp_rndv_progress_am_bcopy+0x1c3)[0x7f0ad415c993]
[a06:3241709] [ 8] /lib/ucx/libuct_ib.so.0(uct_rc_ep_process_pending+0x10)[0x7f0acff54490]
[a06:3241709] [ 9] /lib/libucs.so.0(ucs_arbiter_dispatch_nonempty+0xb6)[0x7f0ad4094a56]
[a06:3241709] [10] /lib/ucx/libuct_ib.so.0(+0x41fee)[0x7f0acff62fee]
[a06:3241709] [11] /lib/libucp.so.0(ucp_worker_progress+0x6a)[0x7f0ad412863a]
[a06:3241709] [12] /usr/local/lib/openmpi/mca_pml_ucx.so(mca_pml_ucx_send+0x3b7)[0x7f0ad5ceefe7]
[a06:3241709] [13] /usr/local/lib/libmpi.so.40(ompi_coll_base_scatter_intra_basic_linear+0xf0)[0x7f0ad7cc7db0]
[a06:3241709] [14] /usr/local/lib/openmpi/mca_coll_tuned.so(ompi_coll_tuned_scatter_intra_dec_fixed+0x69)[0x7f0acf4fd8b9]
[a06:3241709] [15] /usr/local/lib/libmpi.so.40(PMPI_Scatter+0x10d)[0x7f0ad7cab0bd]
[a06:3241709] [16] ./main(+0x30cd)[0x5576687ec0cd]
[a06:3241709] [17] ./main(+0x17d4)[0x5576687ea7d4]
[a06:3241709] [18] /lib/x86_64-linux-gnu/libc.so.6(__libc_start_main+0xf3)[0x7f0ad79ff083]
[a06:3241709] [19] ./main(+0x1b5e)[0x5576687eab5e]
[a06:3241709] *** End of error message ***`
박종연 is a new contributor to this site. Take care in asking for clarification, commenting, and answering.
Check out our Code of Conduct.