I am using CUDA to accelerate my code, in which I process images one by one in a loop. Each image is processed on the GPU via CUDA.
I referred to cuda-samples to write the code below:
- file name:
my_cuda.cu
#include "cuda_runtime.h"
// Processes one image on the GPU: allocates a host and a device buffer of
// `size` bytes, uploads the host buffer, runs the (elided) device work, and
// downloads the result into h_rgb before releasing every buffer.
//
// Returns 0 on success and -1 if the host allocation or any CUDA runtime call
// fails. `size`, `h_rgb` and `d_rgb` come from the declarations that the post
// omits.
int process_one_image(args)
{
    // note that declaration of some params is omitted.
    unsigned char *h_data = (unsigned char *)malloc(size);
    if (h_data == NULL)
        return -1;

    unsigned char *d_data = NULL;
    cudaError_t err = cudaMalloc((void **)&d_data, size);
    if (err == cudaSuccess)
        err = cudaMemcpy(d_data, h_data, size, cudaMemcpyHostToDevice);
    if (err != cudaSuccess)
    {
        // Only release what was allocated above; d_data is still NULL when
        // cudaMalloc itself failed, and cudaFree(NULL) is a no-op.
        // NOTE(review): if h_rgb / d_rgb are already allocated at this point
        // (their setup is omitted from the post), release them here as well.
        free(h_data);
        cudaFree(d_data);
        return -1;
    }

    // process the d_data on GPU
    ...

    // copy the result from device to host.
    // This blocking copy can also report a failure from earlier asynchronous
    // device work, so its status is what decides the return value.
    err = cudaMemcpy(h_rgb, d_rgb, size, cudaMemcpyDeviceToHost);

    free(h_rgb);
    free(h_data);
    cudaFree(d_rgb);
    cudaFree(d_data);
    return (err == cudaSuccess) ? 0 : -1;
}
In the code above, `cudaMalloc` and `cudaMemcpy` are in the same function, `process_one_image`, and the code runs correctly.
But I want to run this code repeatedly, e.g. in a loop of more than 100 iterations.
So I do not want to call `cudaMalloc` and `cudaFree` every time I process an image.
So I want to change my code into the arrangement below.
- cuda_file:
my_cuda.cu
#include "cuda_runtime.h"
// One-time setup called before the processing loop: allocates a host buffer
// and a device buffer of `size` bytes each.
// NOTE(review): declared `int`, but no return statement is visible.
int initCuda(args) // note the input args are omitted.
{
// note that declaration of some params is omitted.
// NOTE(review): h_data is declared here as a local variable, so this pointer
// goes out of scope when initCuda returns. process_one_image and FinalizeCuda
// also use the name h_data; if that is a file-scope variable, this local
// shadows it and the file-scope one is never assigned here -- confirm against
// the omitted declarations.
unsigned char *h_data = (unsigned char *)malloc(size);
// d_data has no local declaration, so it must be declared elsewhere
// (presumably at file scope). The status returned by cudaMalloc is not checked.
cudaMalloc((void **)&d_data, size);
// NOTE(review): nothing visible here allocates d_rgb or h_rgb, which
// process_one_image copies between and FinalizeCuda releases -- confirm they
// are allocated in the omitted code.
}
// Releases the buffers that are reused across process_one_image calls: the
// device buffers d_data / d_rgb and their host counterparts h_data / h_rgb.
//
// Returns 0 when both device frees succeed, -1 otherwise.
int FinalizeCuda(args)
{
    cudaError_t err_data = cudaFree(d_data);
    cudaError_t err_rgb = cudaFree(d_rgb);

    free(h_data);
    // d_rgb is device memory and has already been released by cudaFree above;
    // passing it to free() is invalid. The host buffer paired with it is h_rgb.
    free(h_rgb);

    return (err_data == cudaSuccess && err_rgb == cudaSuccess) ? 0 : -1;
}
// Processes one image using buffers that were allocated beforehand: uploads
// h_data into d_data, runs the (elided) device work, and downloads d_rgb into
// h_rgb.
//
// Returns 0 on success and -1 as soon as a copy fails, so that a bad or
// unallocated pointer is reported instead of silently leaving h_rgb untouched.
int process_one_image(args)
{
    if (cudaMemcpy(d_data, h_data, size, cudaMemcpyHostToDevice) != cudaSuccess)
        return -1;

    // process the d_data on GPU
    ...

    // copy the result from device to host.
    // This blocking copy can also report a failure from earlier asynchronous
    // device work.
    if (cudaMemcpy(h_rgb, d_rgb, size, cudaMemcpyDeviceToHost) != cudaSuccess)
        return -1;

    return 0;
}
- my_c_code:
c_code.c
#include "my_cuda.cu"
// Drives the whole pipeline: calls initCuda once, calls process_one_image in a
// loop, then calls FinalizeCuda.
// NOTE(review): declared `int`, but no return statement is visible.
int processing_loop(args) // specific args are omitted
{
// Run the one-time allocations before any image is processed.
initCuda(init_args);
// NOTE(review): the visible loop body has no break or return, so as written
// the loop never exits and the FinalizeCuda call below is unreachable --
// presumably the exit condition was omitted from the post.
while (1)
{
// ret is assigned but never read in the visible code.
int ret = process_one_image(args);
}
FinalizeCuda(init_args);
}
Here you can see that I want to call `cudaMalloc` only once, from the C file, in order to accelerate this code, but I find that it does not work correctly. It reports no errors, but I get nothing in `h_rgb`.
My guess is that when `cudaMemcpy` runs, it cannot find the correct address of `d_data`, and so cannot copy to `d_data`.
So how can I fix this bug? Or, is executing `cudaMalloc` only once a proper approach at all?