I am runing these snips under WSL using CUDA 12.1.
snip1
returns CUDA error 101: invalid device ordinal
but snip2
runs well.
It it only because of the positions of cudaMemPrefetchAsync
s?
And what’s the mechanism behind these?
<code>// snip1
#include <curand_kernel.h>
#include <cstdio>
void checkError()
{
cudaError_t err_;
err_ = cudaGetLastError();
if (err_ != cudaSuccess)
{
std::printf("CUDA error %d:%s at %s:%dn", err_, cudaGetErrorString(err_), __FILE__, __LINE__);
exit(EXIT_FAILURE);
}
}
__global__ void init_curand(curandState *states, unsigned long long seed)
{
int i = threadIdx.x + blockIdx.x * blockDim.x;
int j = threadIdx.y + blockIdx.y * blockDim.y;
int idx = i * blockDim.y * gridDim.y + j;
curand_init(seed, idx, 0, &states[idx]);
}
int main()
{
int deviceId;
int numberOfSMs;
cudaGetDevice(&deviceId);
cudaDeviceGetAttribute(&numberOfSMs, cudaDevAttrMultiProcessorCount, deviceId);
printf("Device ID: %dtNumber of SMs: %dn", deviceId, numberOfSMs);
dim3 threadsPerBlock(16, 16);
dim3 numBlocks(8 * numberOfSMs, 8 * numberOfSMs);
const int M = 10;
const int N = 20;
const int bytes = M * N * sizeof(int8_t);
int8_t *noisy;
int8_t *ising1;
int8_t *ising2;
cudaMallocManaged(&noisy, bytes);
cudaMallocManaged(&ising1, bytes);
cudaMallocManaged(&ising2, bytes);
curandState *states;
cudaMalloc(&states, numBlocks.x * threadsPerBlock.x * numBlocks.y * threadsPerBlock.y *
sizeof(curandState));
/* ??? */
cudaMemPrefetchAsync(noisy, bytes, deviceId);
cudaMemPrefetchAsync(ising1, bytes, deviceId);
cudaMemPrefetchAsync(ising2, bytes, deviceId);
init_curand<<<numBlocks, threadsPerBlock>>>(states, time(NULL));
checkError();
return 0;
}
</code>
<code>// snip1
#include <curand_kernel.h>
#include <cstdio>
void checkError()
{
cudaError_t err_;
err_ = cudaGetLastError();
if (err_ != cudaSuccess)
{
std::printf("CUDA error %d:%s at %s:%dn", err_, cudaGetErrorString(err_), __FILE__, __LINE__);
exit(EXIT_FAILURE);
}
}
__global__ void init_curand(curandState *states, unsigned long long seed)
{
int i = threadIdx.x + blockIdx.x * blockDim.x;
int j = threadIdx.y + blockIdx.y * blockDim.y;
int idx = i * blockDim.y * gridDim.y + j;
curand_init(seed, idx, 0, &states[idx]);
}
int main()
{
int deviceId;
int numberOfSMs;
cudaGetDevice(&deviceId);
cudaDeviceGetAttribute(&numberOfSMs, cudaDevAttrMultiProcessorCount, deviceId);
printf("Device ID: %dtNumber of SMs: %dn", deviceId, numberOfSMs);
dim3 threadsPerBlock(16, 16);
dim3 numBlocks(8 * numberOfSMs, 8 * numberOfSMs);
const int M = 10;
const int N = 20;
const int bytes = M * N * sizeof(int8_t);
int8_t *noisy;
int8_t *ising1;
int8_t *ising2;
cudaMallocManaged(&noisy, bytes);
cudaMallocManaged(&ising1, bytes);
cudaMallocManaged(&ising2, bytes);
curandState *states;
cudaMalloc(&states, numBlocks.x * threadsPerBlock.x * numBlocks.y * threadsPerBlock.y *
sizeof(curandState));
/* ??? */
cudaMemPrefetchAsync(noisy, bytes, deviceId);
cudaMemPrefetchAsync(ising1, bytes, deviceId);
cudaMemPrefetchAsync(ising2, bytes, deviceId);
init_curand<<<numBlocks, threadsPerBlock>>>(states, time(NULL));
checkError();
return 0;
}
</code>
// snip1
#include <curand_kernel.h>
#include <cstdio>
void checkError()
{
cudaError_t err_;
err_ = cudaGetLastError();
if (err_ != cudaSuccess)
{
std::printf("CUDA error %d:%s at %s:%dn", err_, cudaGetErrorString(err_), __FILE__, __LINE__);
exit(EXIT_FAILURE);
}
}
__global__ void init_curand(curandState *states, unsigned long long seed)
{
int i = threadIdx.x + blockIdx.x * blockDim.x;
int j = threadIdx.y + blockIdx.y * blockDim.y;
int idx = i * blockDim.y * gridDim.y + j;
curand_init(seed, idx, 0, &states[idx]);
}
int main()
{
int deviceId;
int numberOfSMs;
cudaGetDevice(&deviceId);
cudaDeviceGetAttribute(&numberOfSMs, cudaDevAttrMultiProcessorCount, deviceId);
printf("Device ID: %dtNumber of SMs: %dn", deviceId, numberOfSMs);
dim3 threadsPerBlock(16, 16);
dim3 numBlocks(8 * numberOfSMs, 8 * numberOfSMs);
const int M = 10;
const int N = 20;
const int bytes = M * N * sizeof(int8_t);
int8_t *noisy;
int8_t *ising1;
int8_t *ising2;
cudaMallocManaged(&noisy, bytes);
cudaMallocManaged(&ising1, bytes);
cudaMallocManaged(&ising2, bytes);
curandState *states;
cudaMalloc(&states, numBlocks.x * threadsPerBlock.x * numBlocks.y * threadsPerBlock.y *
sizeof(curandState));
/* ??? */
cudaMemPrefetchAsync(noisy, bytes, deviceId);
cudaMemPrefetchAsync(ising1, bytes, deviceId);
cudaMemPrefetchAsync(ising2, bytes, deviceId);
init_curand<<<numBlocks, threadsPerBlock>>>(states, time(NULL));
checkError();
return 0;
}
<code>// snip2
#include <curand_kernel.h>
#include <cstdio>
void checkError()
{
cudaError_t err_;
err_ = cudaGetLastError();
if (err_ != cudaSuccess)
{
std::printf("CUDA error %d:%s at %s:%dn", err_, cudaGetErrorString(err_), __FILE__, __LINE__);
exit(EXIT_FAILURE);
}
}
__global__ void init_curand(curandState *states, unsigned long long seed)
{
int i = threadIdx.x + blockIdx.x * blockDim.x;
int j = threadIdx.y + blockIdx.y * blockDim.y;
int idx = i * blockDim.y * gridDim.y + j;
curand_init(seed, idx, 0, &states[idx]);
}
int main()
{
int deviceId;
int numberOfSMs;
cudaGetDevice(&deviceId);
cudaDeviceGetAttribute(&numberOfSMs, cudaDevAttrMultiProcessorCount, deviceId);
printf("Device ID: %dtNumber of SMs: %dn", deviceId, numberOfSMs);
dim3 threadsPerBlock(16, 16);
dim3 numBlocks(8 * numberOfSMs, 8 * numberOfSMs);
const int M = 10;
const int N = 20;
const int bytes = M * N * sizeof(int8_t);
int8_t *noisy;
int8_t *ising1;
int8_t *ising2;
cudaMallocManaged(&noisy, bytes);
cudaMallocManaged(&ising1, bytes);
cudaMallocManaged(&ising2, bytes);
curandState *states;
cudaMalloc(&states, numBlocks.x * threadsPerBlock.x * numBlocks.y * threadsPerBlock.y *
sizeof(curandState));
init_curand<<<numBlocks, threadsPerBlock>>>(states, time(NULL));
checkError();
/* ??? */
cudaMemPrefetchAsync(noisy, bytes, deviceId);
cudaMemPrefetchAsync(ising1, bytes, deviceId);
cudaMemPrefetchAsync(ising2, bytes, deviceId);
return 0;
}
</code>
<code>// snip2
#include <curand_kernel.h>
#include <cstdio>
void checkError()
{
cudaError_t err_;
err_ = cudaGetLastError();
if (err_ != cudaSuccess)
{
std::printf("CUDA error %d:%s at %s:%dn", err_, cudaGetErrorString(err_), __FILE__, __LINE__);
exit(EXIT_FAILURE);
}
}
__global__ void init_curand(curandState *states, unsigned long long seed)
{
int i = threadIdx.x + blockIdx.x * blockDim.x;
int j = threadIdx.y + blockIdx.y * blockDim.y;
int idx = i * blockDim.y * gridDim.y + j;
curand_init(seed, idx, 0, &states[idx]);
}
int main()
{
int deviceId;
int numberOfSMs;
cudaGetDevice(&deviceId);
cudaDeviceGetAttribute(&numberOfSMs, cudaDevAttrMultiProcessorCount, deviceId);
printf("Device ID: %dtNumber of SMs: %dn", deviceId, numberOfSMs);
dim3 threadsPerBlock(16, 16);
dim3 numBlocks(8 * numberOfSMs, 8 * numberOfSMs);
const int M = 10;
const int N = 20;
const int bytes = M * N * sizeof(int8_t);
int8_t *noisy;
int8_t *ising1;
int8_t *ising2;
cudaMallocManaged(&noisy, bytes);
cudaMallocManaged(&ising1, bytes);
cudaMallocManaged(&ising2, bytes);
curandState *states;
cudaMalloc(&states, numBlocks.x * threadsPerBlock.x * numBlocks.y * threadsPerBlock.y *
sizeof(curandState));
init_curand<<<numBlocks, threadsPerBlock>>>(states, time(NULL));
checkError();
/* ??? */
cudaMemPrefetchAsync(noisy, bytes, deviceId);
cudaMemPrefetchAsync(ising1, bytes, deviceId);
cudaMemPrefetchAsync(ising2, bytes, deviceId);
return 0;
}
</code>
// snip2
#include <curand_kernel.h>
#include <cstdio>
void checkError()
{
cudaError_t err_;
err_ = cudaGetLastError();
if (err_ != cudaSuccess)
{
std::printf("CUDA error %d:%s at %s:%dn", err_, cudaGetErrorString(err_), __FILE__, __LINE__);
exit(EXIT_FAILURE);
}
}
__global__ void init_curand(curandState *states, unsigned long long seed)
{
int i = threadIdx.x + blockIdx.x * blockDim.x;
int j = threadIdx.y + blockIdx.y * blockDim.y;
int idx = i * blockDim.y * gridDim.y + j;
curand_init(seed, idx, 0, &states[idx]);
}
int main()
{
int deviceId;
int numberOfSMs;
cudaGetDevice(&deviceId);
cudaDeviceGetAttribute(&numberOfSMs, cudaDevAttrMultiProcessorCount, deviceId);
printf("Device ID: %dtNumber of SMs: %dn", deviceId, numberOfSMs);
dim3 threadsPerBlock(16, 16);
dim3 numBlocks(8 * numberOfSMs, 8 * numberOfSMs);
const int M = 10;
const int N = 20;
const int bytes = M * N * sizeof(int8_t);
int8_t *noisy;
int8_t *ising1;
int8_t *ising2;
cudaMallocManaged(&noisy, bytes);
cudaMallocManaged(&ising1, bytes);
cudaMallocManaged(&ising2, bytes);
curandState *states;
cudaMalloc(&states, numBlocks.x * threadsPerBlock.x * numBlocks.y * threadsPerBlock.y *
sizeof(curandState));
init_curand<<<numBlocks, threadsPerBlock>>>(states, time(NULL));
checkError();
/* ??? */
cudaMemPrefetchAsync(noisy, bytes, deviceId);
cudaMemPrefetchAsync(ising1, bytes, deviceId);
cudaMemPrefetchAsync(ising2, bytes, deviceId);
return 0;
}
New contributor
CHEN Yunkai is a new contributor to this site. Take care in asking for clarification, commenting, and answering.
Check out our Code of Conduct.