I have an elementary understanding of how GPUs work: SMs, warps, the GPU memory hierarchy, threads, and blocks.
I have a number of questions about a project I'm working on. Explaining every detail would be cumbersome, so I will try to keep the questions understandable for the community while still being useful to me.
I am trying to run a very large number of operations, far more than can physically run in parallel, possibly up into the trillions. These operations are probabilistic and act on dynamic data structures (DDSes), such as linked lists. The DDSes are independent of each other, each full operation (on one DDS) ideally runs on a single thread, and the operations never grow or shrink the DDS. Since threads are addressed by their index, the only layout I can think of is a large array of these DDSes, with each thread manipulating the DDS at its own index. I also used Unified Memory for the DDSes, since copying them from the CPU to the device and back was laborious and caused me a lot of problems.
Questions:
Q1. Is there any way I can queue these threads so that once a thread is done manipulating a DDS, I "reset" that DDS and another thread can begin the same process on it? I am proposing this reset mechanism so that I can reuse a memory location rather than allocating another one. Alternatively I could just free it, but then how would I make a thread use the same location (or any location vacated by a previous thread)? I want to initialize trillions of DDSes in total, but not all in parallel; as many in parallel as possible, perhaps in batches. Another possibility is a thread being "generated" only when needed, which would remove the need for a queue in this scenario.
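To make the reuse idea concrete, here is a minimal sketch of one scheme I am imagining (not my actual code; SLOTS, JOBS, and resetAndProcess are hypothetical placeholders for the real DDS logic): a fixed pool of DDS slots, one per launched thread, plus a global atomic counter that hands out job indices. Each thread resets its own slot in place and pulls the next job until all jobs are consumed, so the "queue" is just the counter.

```
#include <cuda_runtime.h>

#define SLOTS (1 << 20)    // resident DDS slots, one per launched thread
#define JOBS  (1LL << 24)  // total jobs; scales up toward trillions

// Placeholder for one full manipulation: reset the slot's DDS in place,
// then run the probabilistic operation for job `job`.
__device__ void resetAndProcess(int *slotState, long long job) {
    *slotState = 0;               // "reset" the reusable slot
    *slotState += (int)(job & 1); // stand-in for the real work
}

__global__ void workerPool(int *slots, long long totalJobs,
                           unsigned long long *nextJob) {
    int slot = blockIdx.x * blockDim.x + threadIdx.x;  // this thread's slot
    for (;;) {
        // Hand out each job index exactly once across the whole grid.
        unsigned long long job = atomicAdd(nextJob, 1ULL);
        if (job >= (unsigned long long)totalJobs) break;
        resetAndProcess(&slots[slot], (long long)job);
    }
}

int main() {
    int *slots; unsigned long long *nextJob;
    cudaMalloc(&slots, SLOTS * sizeof(int));
    cudaMalloc(&nextJob, sizeof(unsigned long long));
    cudaMemset(nextJob, 0, sizeof(unsigned long long));
    workerPool<<<SLOTS / 256, 256>>>(slots, JOBS, nextJob);
    cudaDeviceSynchronize();
    cudaFree(slots); cudaFree(nextJob);
    return 0;
}
```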
Q2. Is Unified Memory a good idea in this case? It works (I have a working prototype of these manipulations using Unified Memory), but I wonder whether it is a good idea to continue with it.
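If I stay with Unified Memory, one refinement I am aware of is cudaMemPrefetchAsync, which (on platforms that support on-demand page migration) moves the managed pages to the GPU before a kernel touches them instead of relying on page faults. A minimal sketch, with the allocation size chosen arbitrarily:

```
#include <cuda_runtime.h>

int main() {
    float *times;
    size_t bytes = 1000000 * sizeof(float);
    cudaMallocManaged(&times, bytes);

    int device = 0;
    cudaGetDevice(&device);

    // Migrate the managed pages to the GPU before the kernel touches them,
    // instead of paying for on-demand page faults during the run.
    cudaMemPrefetchAsync(times, bytes, device, 0);
    // ... launch kernels that fill `times` here ...

    // Bring the pages back before the CPU reads the results.
    cudaMemPrefetchAsync(times, bytes, cudaCpuDeviceId, 0);
    cudaDeviceSynchronize();

    cudaFree(times);
    return 0;
}
```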
Q3. Regarding Q1, one might wonder what the point is if I'm resetting the DDS. Before resetting it, I will gather the relevant numerics from the DDS (which is what I actually want), possibly into a privatized histogram updated with atomics. If there is a better idea, please let me know.
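By a privatized, atomic histogram I mean the usual pattern where each block keeps a private copy in shared memory, updates it with cheap shared-memory atomics, and flushes it to the global histogram once per block. A minimal sketch (BINS and the stand-in data are assumptions matching the 0..99 values in the listing below):

```
#include <stdio.h>
#include <cuda_runtime.h>

#define BINS 100  // matches the 0..99 node values in the listing below

// Each block keeps a private histogram in shared memory, updates it with
// cheap shared-memory atomics, and flushes it to the global copy once.
__global__ void histogramValues(const int *values, int n, unsigned int *hist) {
    __shared__ unsigned int local[BINS];
    for (int b = threadIdx.x; b < BINS; b += blockDim.x) local[b] = 0;
    __syncthreads();

    for (int i = blockIdx.x * blockDim.x + threadIdx.x; i < n;
         i += gridDim.x * blockDim.x)
        atomicAdd(&local[values[i] % BINS], 1u);
    __syncthreads();

    for (int b = threadIdx.x; b < BINS; b += blockDim.x)
        atomicAdd(&hist[b], local[b]);  // one global atomic per bin per block
}

int main() {
    int n = 1 << 20;
    int *values; unsigned int *hist;
    cudaMallocManaged(&values, n * sizeof(int));
    cudaMallocManaged(&hist, BINS * sizeof(unsigned int));
    for (int i = 0; i < n; ++i) values[i] = i % BINS;  // stand-in data
    cudaMemset(hist, 0, BINS * sizeof(unsigned int));
    histogramValues<<<256, 256>>>(values, n, hist);
    cudaDeviceSynchronize();
    printf("bin 0 count: %u\n", hist[0]);
    cudaFree(values); cudaFree(hist);
    return 0;
}
```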
I will cite the answer to this question in my project, and as a beginner in CUDA I would be very grateful for pointers (no pun intended) in the right direction.
```
#include <stdio.h>
#include <stdlib.h>
#include <time.h>
#include <math.h>
#include <limits.h>
#include <curand_kernel.h>

#define NUM_LISTS 1000000
#define LIST_SIZE 10

struct Node {
    int value;
    Node *next;
};

// Build NUM_LISTS independent linked lists, one per thread, carved out of
// a single pre-allocated pool of nodes.
__global__ void initializeLists(Node** lists, Node* allNodes, int size, unsigned int seed) {
    int idx = blockIdx.x * blockDim.x + threadIdx.x;
    if (idx < NUM_LISTS) {
        curandState state;
        curand_init(seed, idx, 0, &state);
        lists[idx] = &allNodes[idx * size];
        Node* head = lists[idx];
        head->value = curand(&state) % 100;
        for (int i = 1; i < size; ++i) {
            Node* node = &allNodes[idx * size + i];
            node->value = curand(&state) % 100;
            head->next = node;
            head = node;
        }
        head->next = NULL;
    }
}

// Bubble sort on a singly linked list by relinking nodes.
__device__ void sortListsWrapper(Node** head_ref) {
    Node* head = *head_ref;
    if (head == NULL) {
        return;
    }
    bool swapped;
    do {
        swapped = false;
        Node* curr = head;
        Node* prev = NULL;
        Node* next = NULL;
        while (curr->next != NULL) {
            next = curr->next;
            if (curr->value > next->value) {
                swapped = true;
                if (prev != NULL) {
                    prev->next = next;
                } else {
                    head = next;
                }
                curr->next = next->next;
                next->next = curr;
                prev = next;
            } else {
                prev = curr;
                curr = next;
            }
        }
    } while (swapped);
    *head_ref = head;
}

__global__ void sortLists(Node** lists, float* times) {
    int idx = blockIdx.x * blockDim.x + threadIdx.x;
    if (idx < NUM_LISTS) {
        // Device clock64() counts SM clock cycles, not wall-clock time,
        // so the statistics below are reported in cycles.
        long long start = clock64();
        sortListsWrapper(&lists[idx]);
        long long end = clock64();
        times[idx] = (float)(end - start);
    }
}

void calculateStatistics(float* times, int size) {
    float sum = 0.0f, min = times[0], max = times[0], mean, stddev = 0.0f;
    for (int i = 0; i < size; ++i) {
        sum += times[i];
        if (times[i] < min) min = times[i];
        if (times[i] > max) max = times[i];
    }
    mean = sum / size;
    for (int i = 0; i < size; ++i) {
        stddev += (times[i] - mean) * (times[i] - mean);
    }
    stddev = sqrt(stddev / size);
    printf("Average time: %f cycles\n", mean);
    printf("Minimum time: %f cycles\n", min);
    printf("Maximum time: %f cycles\n", max);
    printf("Standard Deviation: %f cycles\n", stddev);
}

int main() {
    Node** lists;
    Node* allNodes;
    float* times;
    // Allocate Unified Memory
    cudaMallocManaged(&lists, NUM_LISTS * sizeof(Node*));
    cudaMallocManaged(&allNodes, NUM_LISTS * LIST_SIZE * sizeof(Node));
    cudaMallocManaged(&times, NUM_LISTS * sizeof(float));
    int threadsPerBlock = 256;
    // Enough blocks to cover every list (a single block covered only 256).
    int blocksPerGrid = (NUM_LISTS + threadsPerBlock - 1) / threadsPerBlock;
    // Initialize lists with random values
    unsigned int seed = time(NULL);
    initializeLists<<<blocksPerGrid, threadsPerBlock>>>(lists, allNodes, LIST_SIZE, seed);
    cudaDeviceSynchronize();
    sortLists<<<blocksPerGrid, threadsPerBlock>>>(lists, times);
    cudaDeviceSynchronize();
    for (int i = 0; i < 10; ++i) {
        Node* current = lists[i];
        printf("List %d: ", i);
        while (current != NULL) {
            printf("%d -> ", current->value);
            current = current->next;
        }
        printf("NULL\n");
    }
    calculateStatistics(times, NUM_LISTS);
    cudaFree(lists);
    cudaFree(allNodes);
    cudaFree(times);
    return 0;
}
```
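For reference, the listing builds as a single .cu file with nvcc (the file name is arbitrary): `nvcc -O2 -o listsort listsort.cu`.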