Thiết kế website giá rẻ

Question

I am runing these snips under WSL using CUDA 12.1.
snip1 returns CUDA error 101: invalid device ordinal but snip2 runs well.
It it only because of the positions of cudaMemPrefetchAsyncs?
And what’s the mechanism behind these?

<code>// snip1

#include <curand_kernel.h>

#include <cstdio>

void checkError()

{

cudaError_t err_;

err_ = cudaGetLastError();

if (err_ != cudaSuccess)

{

std::printf("CUDA error %d:%s at %s:%dn", err_, cudaGetErrorString(err_), __FILE__, __LINE__);

exit(EXIT_FAILURE);

}

__global__ void init_curand(curandState *states, unsigned long long seed)

{

int i = threadIdx.x + blockIdx.x * blockDim.x;

int j = threadIdx.y + blockIdx.y * blockDim.y;

int idx = i * blockDim.y * gridDim.y + j;

curand_init(seed, idx, 0, &states[idx]);

}

int main()

{

int deviceId;

int numberOfSMs;

cudaGetDevice(&deviceId);

cudaDeviceGetAttribute(&numberOfSMs, cudaDevAttrMultiProcessorCount, deviceId);

printf("Device ID: %dtNumber of SMs: %dn", deviceId, numberOfSMs);

dim3 threadsPerBlock(16, 16);

dim3 numBlocks(8 * numberOfSMs, 8 * numberOfSMs);

const int M = 10;

const int N = 20;

const int bytes = M * N * sizeof(int8_t);

int8_t *noisy;

int8_t *ising1;

int8_t *ising2;

cudaMallocManaged(&noisy, bytes);

cudaMallocManaged(&ising1, bytes);

cudaMallocManaged(&ising2, bytes);

curandState *states;

cudaMalloc(&states, numBlocks.x * threadsPerBlock.x * numBlocks.y * threadsPerBlock.y *

sizeof(curandState));

/* ??? */

cudaMemPrefetchAsync(noisy, bytes, deviceId);

cudaMemPrefetchAsync(ising1, bytes, deviceId);

cudaMemPrefetchAsync(ising2, bytes, deviceId);

init_curand<<<numBlocks, threadsPerBlock>>>(states, time(NULL));

checkError();

return 0;

}

</code>

<code>// snip1 #include <curand_kernel.h> #include <cstdio> void checkError() { cudaError_t err_; err_ = cudaGetLastError(); if (err_ != cudaSuccess) { std::printf("CUDA error %d:%s at %s:%dn", err_, cudaGetErrorString(err_), __FILE__, __LINE__); exit(EXIT_FAILURE); } } __global__ void init_curand(curandState *states, unsigned long long seed) { int i = threadIdx.x + blockIdx.x * blockDim.x; int j = threadIdx.y + blockIdx.y * blockDim.y; int idx = i * blockDim.y * gridDim.y + j; curand_init(seed, idx, 0, &states[idx]); } int main() { int deviceId; int numberOfSMs; cudaGetDevice(&deviceId); cudaDeviceGetAttribute(&numberOfSMs, cudaDevAttrMultiProcessorCount, deviceId); printf("Device ID: %dtNumber of SMs: %dn", deviceId, numberOfSMs); dim3 threadsPerBlock(16, 16); dim3 numBlocks(8 * numberOfSMs, 8 * numberOfSMs); const int M = 10; const int N = 20; const int bytes = M * N * sizeof(int8_t); int8_t *noisy; int8_t *ising1; int8_t *ising2; cudaMallocManaged(&noisy, bytes); cudaMallocManaged(&ising1, bytes); cudaMallocManaged(&ising2, bytes); curandState *states; cudaMalloc(&states, numBlocks.x * threadsPerBlock.x * numBlocks.y * threadsPerBlock.y * sizeof(curandState)); /* ??? */ cudaMemPrefetchAsync(noisy, bytes, deviceId); cudaMemPrefetchAsync(ising1, bytes, deviceId); cudaMemPrefetchAsync(ising2, bytes, deviceId); init_curand<<<numBlocks, threadsPerBlock>>>(states, time(NULL)); checkError(); return 0; } </code>

// snip1
#include <curand_kernel.h>

#include <cstdio>

void checkError()
{
  cudaError_t err_;
  err_ = cudaGetLastError();
  if (err_ != cudaSuccess)
  {
    std::printf("CUDA error %d:%s at %s:%dn", err_, cudaGetErrorString(err_), __FILE__, __LINE__);
    exit(EXIT_FAILURE);
  }
}

__global__ void init_curand(curandState *states, unsigned long long seed)
{
  int i = threadIdx.x + blockIdx.x * blockDim.x;
  int j = threadIdx.y + blockIdx.y * blockDim.y;
  int idx = i * blockDim.y * gridDim.y + j;
  curand_init(seed, idx, 0, &states[idx]);
}

int main()
{
  int deviceId;
  int numberOfSMs;

  cudaGetDevice(&deviceId);
  cudaDeviceGetAttribute(&numberOfSMs, cudaDevAttrMultiProcessorCount, deviceId);
  printf("Device ID: %dtNumber of SMs: %dn", deviceId, numberOfSMs);

  dim3 threadsPerBlock(16, 16);
  dim3 numBlocks(8 * numberOfSMs, 8 * numberOfSMs);

  const int M = 10;
  const int N = 20;
  const int bytes = M * N * sizeof(int8_t);
  int8_t *noisy;
  int8_t *ising1;
  int8_t *ising2;
  cudaMallocManaged(&noisy, bytes);
  cudaMallocManaged(&ising1, bytes);
  cudaMallocManaged(&ising2, bytes);
  curandState *states;
  cudaMalloc(&states, numBlocks.x * threadsPerBlock.x * numBlocks.y * threadsPerBlock.y *
                          sizeof(curandState));

  /* ??? */
  cudaMemPrefetchAsync(noisy, bytes, deviceId);
  cudaMemPrefetchAsync(ising1, bytes, deviceId);
  cudaMemPrefetchAsync(ising2, bytes, deviceId);

  init_curand<<<numBlocks, threadsPerBlock>>>(states, time(NULL));
  checkError();

  return 0;
}

<code>// snip2

#include <curand_kernel.h>

#include <cstdio>

void checkError()

{

cudaError_t err_;

err_ = cudaGetLastError();

if (err_ != cudaSuccess)

{

std::printf("CUDA error %d:%s at %s:%dn", err_, cudaGetErrorString(err_), __FILE__, __LINE__);

exit(EXIT_FAILURE);

}

__global__ void init_curand(curandState *states, unsigned long long seed)

{

int i = threadIdx.x + blockIdx.x * blockDim.x;

int j = threadIdx.y + blockIdx.y * blockDim.y;

int idx = i * blockDim.y * gridDim.y + j;

curand_init(seed, idx, 0, &states[idx]);

}

int main()

{

int deviceId;

int numberOfSMs;

cudaGetDevice(&deviceId);

cudaDeviceGetAttribute(&numberOfSMs, cudaDevAttrMultiProcessorCount, deviceId);

printf("Device ID: %dtNumber of SMs: %dn", deviceId, numberOfSMs);

dim3 threadsPerBlock(16, 16);

dim3 numBlocks(8 * numberOfSMs, 8 * numberOfSMs);

const int M = 10;

const int N = 20;

const int bytes = M * N * sizeof(int8_t);

int8_t *noisy;

int8_t *ising1;

int8_t *ising2;

cudaMallocManaged(&noisy, bytes);

cudaMallocManaged(&ising1, bytes);

cudaMallocManaged(&ising2, bytes);

curandState *states;

cudaMalloc(&states, numBlocks.x * threadsPerBlock.x * numBlocks.y * threadsPerBlock.y *

sizeof(curandState));

init_curand<<<numBlocks, threadsPerBlock>>>(states, time(NULL));

checkError();

/* ??? */

cudaMemPrefetchAsync(noisy, bytes, deviceId);

cudaMemPrefetchAsync(ising1, bytes, deviceId);

cudaMemPrefetchAsync(ising2, bytes, deviceId);

return 0;

}

</code>

<code>// snip2 #include <curand_kernel.h> #include <cstdio> void checkError() { cudaError_t err_; err_ = cudaGetLastError(); if (err_ != cudaSuccess) { std::printf("CUDA error %d:%s at %s:%dn", err_, cudaGetErrorString(err_), __FILE__, __LINE__); exit(EXIT_FAILURE); } } __global__ void init_curand(curandState *states, unsigned long long seed) { int i = threadIdx.x + blockIdx.x * blockDim.x; int j = threadIdx.y + blockIdx.y * blockDim.y; int idx = i * blockDim.y * gridDim.y + j; curand_init(seed, idx, 0, &states[idx]); } int main() { int deviceId; int numberOfSMs; cudaGetDevice(&deviceId); cudaDeviceGetAttribute(&numberOfSMs, cudaDevAttrMultiProcessorCount, deviceId); printf("Device ID: %dtNumber of SMs: %dn", deviceId, numberOfSMs); dim3 threadsPerBlock(16, 16); dim3 numBlocks(8 * numberOfSMs, 8 * numberOfSMs); const int M = 10; const int N = 20; const int bytes = M * N * sizeof(int8_t); int8_t *noisy; int8_t *ising1; int8_t *ising2; cudaMallocManaged(&noisy, bytes); cudaMallocManaged(&ising1, bytes); cudaMallocManaged(&ising2, bytes); curandState *states; cudaMalloc(&states, numBlocks.x * threadsPerBlock.x * numBlocks.y * threadsPerBlock.y * sizeof(curandState)); init_curand<<<numBlocks, threadsPerBlock>>>(states, time(NULL)); checkError(); /* ??? */ cudaMemPrefetchAsync(noisy, bytes, deviceId); cudaMemPrefetchAsync(ising1, bytes, deviceId); cudaMemPrefetchAsync(ising2, bytes, deviceId); return 0; } </code>

// snip2
#include <curand_kernel.h>

#include <cstdio>

void checkError()
{
  cudaError_t err_;
  err_ = cudaGetLastError();
  if (err_ != cudaSuccess)
  {
    std::printf("CUDA error %d:%s at %s:%dn", err_, cudaGetErrorString(err_), __FILE__, __LINE__);
    exit(EXIT_FAILURE);
  }
}

__global__ void init_curand(curandState *states, unsigned long long seed)
{
  int i = threadIdx.x + blockIdx.x * blockDim.x;
  int j = threadIdx.y + blockIdx.y * blockDim.y;
  int idx = i * blockDim.y * gridDim.y + j;
  curand_init(seed, idx, 0, &states[idx]);
}

int main()
{
  int deviceId;
  int numberOfSMs;

  cudaGetDevice(&deviceId);
  cudaDeviceGetAttribute(&numberOfSMs, cudaDevAttrMultiProcessorCount, deviceId);
  printf("Device ID: %dtNumber of SMs: %dn", deviceId, numberOfSMs);

  dim3 threadsPerBlock(16, 16);
  dim3 numBlocks(8 * numberOfSMs, 8 * numberOfSMs);

  const int M = 10;
  const int N = 20;
  const int bytes = M * N * sizeof(int8_t);
  int8_t *noisy;
  int8_t *ising1;
  int8_t *ising2;
  cudaMallocManaged(&noisy, bytes);
  cudaMallocManaged(&ising1, bytes);
  cudaMallocManaged(&ising2, bytes);

  curandState *states;
  cudaMalloc(&states, numBlocks.x * threadsPerBlock.x * numBlocks.y * threadsPerBlock.y *
                          sizeof(curandState));

  init_curand<<<numBlocks, threadsPerBlock>>>(states, time(NULL));
  checkError();

  /* ??? */
  cudaMemPrefetchAsync(noisy, bytes, deviceId);
  cudaMemPrefetchAsync(ising1, bytes, deviceId);
  cudaMemPrefetchAsync(ising2, bytes, deviceId);

  return 0;
}

Thiết kế website giá rẻ

Danh mục

Proper usage of cudaMemPrefetchAsync?