I instantiate device-only classes with large data members. The data members are then modified by all CUDA threads through class pointers. The program works for a small number of threads (for example, 1872 blocks with 512 threads per block), but larger launches fail with:
GPUassert: an illegal memory access was encountered t13.cu 109
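For a sense of scale, each device-side new float[n] allocation grows with the launch configuration. A rough host-side calculation of the requested sizes (assuming the failing case is the commented-out 16384-block, 256-thread configuration in the code below):
#include <cstdio>
int main() {
    // bytes requested by each in-kernel "new float[n]"
    long long pass_bytes = 1872LL * 512 * sizeof(float);   // working launch, ~3.7 MiB
    long long fail_bytes = 16384LL * 256 * sizeof(float);  // assumed failing launch, 16 MiB
    std::printf("passing: %lld bytes\nfailing: %lld bytes\n", pass_bytes, fail_bytes);
    return 0;
}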
I also ran compute-sanitizer. It reports no errors when the program passes, but when the program fails it prints messages like the following:
========= COMPUTE-SANITIZER
========= Invalid __global__ write of size 4 bytes
========= at 0xe0 in runYourClass(YourClass *)
========= by thread (352,0,0) in block (0,0,0)
========= Address 0x580 is out of bounds
========= and is 8,688,499,328 bytes before the nearest allocation at 0x205e00000 of size 8,388,864 bytes
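For reference, the sanitizer run looks roughly like this (assuming the source file is t13.cu and the executable is named t13; the exact nvcc flags may differ):
nvcc -o t13 t13.cu
compute-sanitizer ./t13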
The entire code is below:
#include <new>
#include <stdio.h>
#include <cuda.h>
#include <cuda_runtime.h>
#include <iostream>
#define gpuErrchk(ans) { gpuAssert((ans), __FILE__, __LINE__); }
inline void gpuAssert(cudaError_t code, const char *file, int line, bool abort=true)
{
if (code != cudaSuccess)
{
fprintf(stderr,"GPUassert: %s %s %dn", cudaGetErrorString(code), file, line);
if (abort) exit(code);
}
}
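// device-only class: X is allocated with in-kernel new in the constructor and written by one thread per element in Fill()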
class MyClass{
public:
__device__ MyClass(int ni) : n(ni) {
X = new float[n];
}
__device__ void Fill() {
int idx = blockIdx.x*blockDim.x+threadIdx.x;
//printf("My id is %d and n is %dn", idx, n);
X[idx] = (float)idx+1.23;
//printf("My id is %d and X is %.2fn", idx, X[idx]);
}
__device__ ~MyClass() {
delete[] X;
X = nullptr;
}
int n;
float *X;
};
class YourClass{
public:
__device__ YourClass(MyClass* mci) : mc(mci) {
m = mc->n;
Y = new float[m];
}
__device__ void Modify() {
int idx = blockIdx.x*blockDim.x+threadIdx.x;
//printf("My id is %d and m is %dn", idx, m);
Y[idx] = (mc->X)[idx]+5.;
//printf("My id is %d and Y is %.2fn", idx, Y[idx]);
}
__device__ void Move(float *f) {
int idx = blockIdx.x*blockDim.x+threadIdx.x;
f[idx] = Y[idx];
}
__device__ ~YourClass() {
delete[] Y;
Y = nullptr;
}
MyClass* mc;
int m;
float *Y;
};
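// kernel wrappers around the device-side objects; the init kernels construct them with placement new from a single thread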
__global__ void initMyClass(MyClass *mcp, int n) {
int idx = blockIdx.x*blockDim.x+threadIdx.x;
if (idx==0) new(mcp) MyClass(n);
}
__global__ void initYourClass(YourClass *ycp, MyClass *mcp) {
int idx = blockIdx.x*blockDim.x+threadIdx.x;
if (idx==0) new(ycp) YourClass(mcp);
}
__global__ void runMyClass(MyClass *mcp) {
mcp->Fill();
}
__global__ void runYourClass(YourClass *ycp) {
ycp->Modify();
}
__global__ void moveYourClassData(YourClass *ycp, float *f) {
ycp->Move(f);
}
__global__ void cleanUpDeviceClasses(MyClass *mcp, YourClass *ycp) {
mcp->~MyClass();
ycp->~YourClass();
}
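// host driver: construct both objects on the device, fill/modify their arrays with one thread per element, copy the result back, then destroy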
int main() {
//int nBLK=16384;
//int nTPB=256;
int nBLK=1872;
int nTPB=512;
int n=nBLK*nTPB;
MyClass *mc;
gpuErrchk( cudaMalloc(&mc, sizeof(MyClass)) );
YourClass *yc;
gpuErrchk( cudaMalloc(&yc, sizeof(YourClass)) );
initMyClass<<<1,1>>>(mc, n);
gpuErrchk( cudaPeekAtLastError() );
gpuErrchk( cudaDeviceSynchronize() );
runMyClass<<<nBLK,nTPB>>>(mc);
gpuErrchk( cudaPeekAtLastError() );
gpuErrchk( cudaDeviceSynchronize() );
initYourClass<<<1,1>>>(yc, mc);
gpuErrchk( cudaPeekAtLastError() );
gpuErrchk( cudaDeviceSynchronize() );
runYourClass<<<nBLK,nTPB>>>(yc);
gpuErrchk( cudaPeekAtLastError() );
gpuErrchk( cudaDeviceSynchronize() );
float *h_result = new float[n];
float *d_result;
gpuErrchk( cudaMalloc(&d_result, n*sizeof(float)) );
moveYourClassData<<<nBLK,nTPB>>>(yc, d_result);
gpuErrchk( cudaPeekAtLastError() );
gpuErrchk( cudaDeviceSynchronize() );
gpuErrchk( cudaMemcpy(h_result, d_result, n*sizeof(float), cudaMemcpyDeviceToHost) );
for (int i=0;i<n;++i) {
std::cout<<"i: "<<i<<", value: "<<h_result[i]<<'n';
}
cleanUpDeviceClasses<<<1,1>>>(mc, yc);
gpuErrchk( cudaPeekAtLastError() );
gpuErrchk( cudaDeviceSynchronize() );
cudaFree(mc);
cudaFree(yc);
cudaFree(d_result);
delete[] h_result;
return 0;
}
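I am not sure whether it is relevant, but the arrays come from in-kernel new, which allocates from the runtime's device heap; its size can be queried with the standard cudaDeviceGetLimit call. A minimal check, separate from the program above:
#include <cstdio>
#include <cuda_runtime.h>
int main() {
    size_t heap_bytes = 0;
    // cudaLimitMallocHeapSize is the heap backing in-kernel malloc/new
    cudaDeviceGetLimit(&heap_bytes, cudaLimitMallocHeapSize);
    std::printf("device malloc/new heap size: %zu bytes\n", heap_bytes);
    return 0;
}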