I am trying to use cudaMalloc and cudaMemcpy to allocate and copy dynamic memory to the device, but I get a segmentation fault at runtime. I think the problem is that I never allocate device memory for the double *data member, but I don't know how to do it correctly.
Here is the code:
#include <iostream>
#include <vector>
#include <memory>
#include <cuda_runtime.h>
#define cugo(ans) gpu::execute_cuda_runtime((ans), __FILE__, __LINE__)
namespace calcu {
namespace cpu {
class Nuclide {
public:
    int Z_;
    int A_;
    int Diff_;
    std::vector<double> data;
    Nuclide(int Z, int A, int Diff) : Z_(Z), A_(A), Diff_(Diff) {}
};
std::vector<std::unique_ptr<Nuclide>> nuclides;
}
namespace gpu {
struct gpuNuclide {
    int Z_;
    int A_;
    double *data;
};
gpuNuclide* nuclides = nullptr; // Device pointer
void execute_cuda_runtime(cudaError_t code, const char* fname, int line)
{
    if (code != cudaSuccess) {
        std::cerr << "CUDA error: " << cudaGetErrorString(code) << " at " << fname << ":" << line << std::endl;
        std::exit(EXIT_FAILURE);
    }
}
__global__ void test_print_kernel(gpuNuclide* nuclides) {
    printf("DEVICE GPU: nuclide Z = %d\n", nuclides[0].Z_);
    printf("DEVICE GPU: nuclide Z = %d\n", nuclides[1].Z_);
    printf("DEVICE GPU: nuclide Z = %d\n", nuclides[2].Z_);
    printf("DEVICE GPU: nuclide A = %d\n", nuclides[0].A_);
    printf("DEVICE GPU: nuclide A = %d\n", nuclides[1].A_);
    printf("DEVICE GPU: nuclide A = %d\n", nuclides[2].A_);
    printf("DEVICE GPU: vector[0][0] = %f\n", nuclides[0].data[0]);
}
}
}
int main() {
    using namespace calcu;
    // Populate cpu::nuclides
    cpu::nuclides.push_back(std::make_unique<cpu::Nuclide>(92, 235, 99999));
    cpu::nuclides[0]->data.push_back(1.0);
    cpu::nuclides[0]->data.push_back(1.5);
    cpu::nuclides[0]->data.push_back(2.0);
    cpu::nuclides.push_back(std::make_unique<cpu::Nuclide>(92, 238, 99999));
    cpu::nuclides[1]->data.push_back(1.1);
    cpu::nuclides[1]->data.push_back(555.5);
    cpu::nuclides[1]->data.push_back(7122.0);
    cpu::nuclides[1]->data.push_back(6908.0);
    cpu::nuclides.push_back(std::make_unique<cpu::Nuclide>(94, 240, 99999));
    cpu::nuclides[2]->data.push_back(0.0001);
    cpu::nuclides[2]->data.push_back(0.10006);
    // Allocate memory on the device for all nuclides
    cugo(cudaMalloc(&gpu::nuclides, cpu::nuclides.size() * sizeof(gpu::gpuNuclide)));
    // Copy each nuclide member by member from host to device
    for (size_t i = 0; i < cpu::nuclides.size(); ++i) {
        cugo(cudaMemcpy(&gpu::nuclides[i].Z_, &cpu::nuclides[i]->Z_, sizeof(int), cudaMemcpyHostToDevice));
        cugo(cudaMemcpy(&gpu::nuclides[i].A_, &cpu::nuclides[i]->A_, sizeof(int), cudaMemcpyHostToDevice));
        // Treatment for the data vector
        cugo(cudaMalloc(&gpu::nuclides[i].data, cpu::nuclides[i]->data.size() * sizeof(double)));
        cugo(cudaMemcpy(gpu::nuclides[i].data, cpu::nuclides[i]->data.data(), cpu::nuclides[i]->data.size() * sizeof(double), cudaMemcpyHostToDevice));
    }
    // Launch the kernel
    gpu::test_print_kernel<<<1, 1>>>(gpu::nuclides);
    cugo(cudaDeviceSynchronize());
    // Free device memory
    cugo(cudaFree(gpu::nuclides));
    return 0;
}
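My best guess is that I need to build the gpuNuclide array in host memory first, call cudaMalloc for each data buffer while the struct still lives on the host, and only then copy the whole array to the device, roughly like the untested sketch below. The function name upload_nuclides is just something I made up for the snippet, and it assumes the same structs and cugo macro as above. Is this the right way to do it?

// Untested sketch: stage the structs on the host, then copy them to the device.
void upload_nuclides() {
    using namespace calcu;
    const size_t n = cpu::nuclides.size();

    // Build the array of gpuNuclide in host memory first.
    std::vector<gpu::gpuNuclide> staging(n);
    for (size_t i = 0; i < n; ++i) {
        staging[i].Z_ = cpu::nuclides[i]->Z_;
        staging[i].A_ = cpu::nuclides[i]->A_;
        // cudaMalloc writes the new device pointer into staging[i].data,
        // which is host memory, so this write is done on the host side.
        const size_t bytes = cpu::nuclides[i]->data.size() * sizeof(double);
        cugo(cudaMalloc(&staging[i].data, bytes));
        cugo(cudaMemcpy(staging[i].data, cpu::nuclides[i]->data.data(), bytes, cudaMemcpyHostToDevice));
    }

    // Copy the finished array of structs (now holding device data pointers)
    // to the device in a single transfer.
    cugo(cudaMalloc(&gpu::nuclides, n * sizeof(gpu::gpuNuclide)));
    cugo(cudaMemcpy(gpu::nuclides, staging.data(), n * sizeof(gpu::gpuNuclide), cudaMemcpyHostToDevice));
}

If that is the right idea, I suppose I would also need to cudaFree each staging[i].data buffer before freeing gpu::nuclides at the end, but I am not sure about that either.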