I was working on a larger program using Nvidia Cuda toolkit but kept receiving illegal memory access errors. I ended up localizing the problem to my accessing of a struct, however, as far as I understood, the struct and all elements of the structs were allocated on the device so there should have been no illegal memory access.
This example program works fine, outputs the correct value and exits with no error:
#include "cuda_runtime.h"
#include "device_launch_parameters.h"
#include <stdio.h>
#include <iostream>
struct test_struct {
int* arr;
void allocate(int size);
void deallocate();
};
void test_struct::allocate(int size) {
cudaMalloc((void**)&arr, size);
}
void test_struct::deallocate() {
cudaFree(arr);
}
__device__ struct test_struct* d_struct;
__global__ void print_arr(test_struct* test) {
printf("%dt", test->arr[0]);
}
int main()
{
cudaMalloc((void**)&d_struct, sizeof(test_struct));
test_struct* h_test;
h_test = (test_struct*)malloc(sizeof(test_struct));
h_test->allocate(64 * sizeof(int));
cudaMemcpy(d_struct, h_test, sizeof(test_struct), cudaMemcpyHostToDevice);
print_arr << <1, 1 >> > (d_struct);
std::cout << cudaGetErrorString(cudaDeviceSynchronize());
h_test->deallocate();
free(h_test);
cudaFree(d_struct);
return 0;
}
However, changing the print_arr method to:
__global__ void print_arr() {
printf("%dt", d_struct->arr[0]);
}
and updating the main method accordingly print_arr << <1, 1 >> > ();
causes an illegal memory access. The rest of the code is exactly the same, but instead of accessing the struct through global memory, we pass it directly into the method. So my question is why does the first code run fine, but changing it to use global memory causes it to crash? As far as I understand it, the pointer to the arr array is the same in both cases.