Objective:
I have two groups of data that need to be copied to the GPU. The first group is large and has a lower priority, while the second is smaller and has a higher priority, such as metadata for a new job. The cudaMemcpy of the low-priority data is issued first. I want the copy of the higher-priority data to be able to preempt the lower-priority one.
Attempted Solution:
To achieve this, I created two CUDA streams with different priorities: one high-priority and one low-priority. I then issued cudaMemcpyAsync()
calls on the stream matching each group's priority. Below is a code example illustrating this approach:
#include <cuda_runtime.h>
#include <stdint.h>
#include <stdio.h>
#include <unistd.h>

int main()
{
    cudaStream_t high, low;

    // Get the CUDA stream priority range: the first value is the least
    // (numerically largest) priority, the second the greatest priority
    int low_priority_range, high_priority_range;
    cudaDeviceGetStreamPriorityRange(&low_priority_range, &high_priority_range);
    printf("low_priority_range: %d, high_priority_range: %d\n",
           low_priority_range,
           high_priority_range);

    // Create streams with different priorities
    cudaStreamCreateWithPriority(&low, cudaStreamNonBlocking, low_priority_range);
    cudaStreamCreateWithPriority(&high, cudaStreamNonBlocking, high_priority_range);

    cudaEvent_t start, stop, start1, stop1;
    cudaEventCreate(&start);
    cudaEventCreate(&start1);
    cudaEventCreate(&stop);
    cudaEventCreate(&stop1);

    void *host1, *host2;
    cudaMallocHost(&host1, 1024ll * 1024 * 1024 * 2); // Pinned host memory for the large (2 GiB) dataset
    cudaMallocHost(&host2, 1024);                     // Pinned host memory for the small (1 KiB) dataset

    void *device1, *device2;
    cudaMalloc(&device1, 1024ll * 1024 * 1024 * 2);   // Device memory for the large dataset
    cudaMalloc(&device2, 1024);                       // Device memory for the small dataset

    cudaEventRecord(start1, low); // Record start time on the low-priority stream
    // Copy the large dataset to the GPU on the low-priority stream
    for (int i = 0; i < 10; i++)
    {
        cudaMemcpyAsync(device1, host1, 1024ll * 1024 * 1024 * 2, cudaMemcpyHostToDevice, low);
    }
    cudaEventRecord(stop1, low); // Record stop time on the low-priority stream

    cudaEventRecord(start, high); // Record start time on the high-priority stream
    // Copy the small dataset to the GPU on the high-priority stream
    for (int i = 0; i < 10; i++)
    {
        cudaMemcpyAsync(device2, host2, 1024, cudaMemcpyHostToDevice, high);
    }
    cudaEventRecord(stop, high); // Record stop time on the high-priority stream

    cudaDeviceSynchronize();

    float high_priority_stream_time, low_priority_stream_time;
    cudaEventElapsedTime(&high_priority_stream_time, start, stop);  // Elapsed time on the high-priority stream
    cudaEventElapsedTime(&low_priority_stream_time, start1, stop1); // Elapsed time on the low-priority stream
    printf("high priority stream time: %f ms\n", high_priority_stream_time);
    printf("low priority stream time: %f ms\n", low_priority_stream_time);

    return 0;
}
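For reference, I build and run it roughly like this (the file name is just mine):

nvcc -O2 stream_priority.cu -o stream_priority
./stream_priority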
Current Outcome and Question:
Upon testing the code on my platform (an NVIDIA A30), I observed that both streams took almost the same amount of time, which indicates that the copies on the high-priority stream only completed after the low-priority copies rather than preempting them.
My question is: how can I ensure that a cudaMemcpy issued later on the high-priority stream actually executes first, i.e., that it runs before (or preempts) the cudaMemcpy on the other stream that is doing the background copy?
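For example, is manually splitting the low-priority transfer into smaller pieces the only option, so that a later high-priority cudaMemcpyAsync has a chance to be interleaved between them? A rough sketch of what I have in mind, reusing host1, device1, and the low/high streams from the code above (the 32 MiB chunk size is an arbitrary choice of mine):

// Hypothetical variation: issue the 2 GiB low-priority copy as many small
// chunked cudaMemcpyAsync calls instead of one large call
const size_t total = 1024ll * 1024 * 1024 * 2;
const size_t chunk = 32ll * 1024 * 1024;
for (size_t off = 0; off < total; off += chunk)
{
    size_t n = (total - off < chunk) ? (total - off) : chunk;
    cudaMemcpyAsync((char *)device1 + off, (char *)host1 + off, n,
                    cudaMemcpyHostToDevice, low);
}
// Small high-priority copy issued later on the high-priority stream
cudaMemcpyAsync(device2, host2, 1024, cudaMemcpyHostToDevice, high);

Is chunking like this the intended workaround, or is there a way to make the later copy jump ahead directly?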