I am attempting to launch thrust::fill on two different device vectors in parallel on different CUDA streams. However, when I look at the kernel launches in NSight Compute, they appear to be serialized. I am new to CUDA, so I could be doing something wrong or have a misunderstanding. Here is the basic example I am working with. Thank you!
#include <thrust/device_vector.h>
#include <thrust/fill.h>
#include <thrust/sort.h>
#include <thrust/transform.h>
#include <thrust/execution_policy.h>
#define gpuErrchk(ans)
{
gpuAssert((ans), __FILE__, __LINE__);
}
inline void gpuAssert(cudaError_t code, const char* file, int line, bool abort = true)
{
if(code != cudaSuccess)
{
fprintf(stderr, "GPUassert: %s %s %dn", cudaGetErrorString(code), file, line);
if(abort) exit(code);
}
}
int main(void)
{
cudaStream_t stream1, stream2;
gpuErrchk(cudaStreamCreate(&stream1));
gpuErrchk(cudaStreamCreate(&stream2));
const size_t size = 10000000;
int* d_test1_ptr;
int* d_test2_ptr;
gpuErrchk(cudaMalloc((void**)&d_test1_ptr, size * sizeof(int)));
gpuErrchk(cudaMalloc((void**)&d_test2_ptr, size * sizeof(int)));
thrust::device_ptr<int> d_test1(d_test1_ptr);
thrust::device_ptr<int> d_test2(d_test2_ptr);
for(int i = 0; i < 100; i++)
{
thrust::fill(thrust::cuda::par.on(stream1), d_test1, d_test1 + size, 2);
thrust::fill(thrust::cuda::par.on(stream2), d_test2, d_test2 + size, 2);
}
gpuErrchk(cudaStreamSynchronize(stream1));
gpuErrchk(cudaStreamSynchronize(stream2));
gpuErrchk(cudaFree(d_test1_ptr));
gpuErrchk(cudaFree(d_test2_ptr));
gpuErrchk(cudaStreamDestroy(stream1));
gpuErrchk(cudaStreamDestroy(stream2));
std::cout << "Completed execution of dummy functions on different streams." << std::endl;
return 0;
}
Here is the result form NSight. It looks like there is a constant cudaStreamSynchronize call but I am not sure why.
NSight Image
I have looked at this post: /questions/24368197/getting-cuda-thrust-to-use-a-cuda-stream-of-your-choice?newreg=d818bab7b4fb4b5e879450258891e1d7 where it appears their launches are in parallel. I tried even using their exact code but the kernels were still being serialized.
Please let me know if you need more information.
I tried following the directions from this post: text
I tried performing different thrust operations.
I tried creating a custom kernel and using those in parallel. That worked but I would like to use Thrust’s API.
Nicolas Perrault is a new contributor to this site. Take care in asking for clarification, commenting, and answering.
Check out our Code of Conduct.