Since there are different numbers of INT, FLOAT and DOUBLE cores in the NVIDIA Ampere architecture, I want to see this difference reflected when I do computation.
I have the following script:
#include <cstdio>
#include <cstdlib>
#include <ctime>
#include <cuda_runtime.h>
// Macro to check CUDA errors
#define CHECK_CUDA_ERROR(call)                                                 \
  {                                                                            \
    cudaError_t err = call;                                                    \
    if (err != cudaSuccess) {                                                  \
      fprintf(stderr, "CUDA error at %s %d: %s\n", __FILE__, __LINE__,         \
              cudaGetErrorString(err));                                        \
      exit(EXIT_FAILURE);                                                      \
    }                                                                          \
  }
// CUDA kernel to add two vectors, with extra work in a loop so that the
// computation time is measurable
template <typename T>
__global__ void vectorAdd(T *a, T *b, T *c, unsigned numElements) {
  int i = blockDim.x * blockIdx.x + threadIdx.x;
  if (i < numElements) {
    T temp = 0;
    for (int j = 0; j < 10000; ++j) { // Increased computational load
      temp += a[i] + b[i];
    }
    c[i] = temp;
  }
}
__global__ void warmup() { printf("I am a warmup kernel\n"); }
template <typename T> void createArr(T *&arr, unsigned numElements) {
  size_t size = numElements * sizeof(T);
  arr = (T *)malloc(size);
  for (unsigned i = 0; i < numElements; ++i) {
    arr[i] = static_cast<T>(rand() % 10);
  }
}
int main() {
  srand(time(0));
  unsigned numElements =
      1024 * 1024; // Increase the number of elements to ensure sufficient load
  // Allocate memory for vectors on host
  // unsigned size = numElements * sizeof(int);
  // unsigned size = numElements * sizeof(float);
  unsigned size = numElements * sizeof(double);
  // int *h_a, *h_b, *h_c, *d_a, *d_b, *d_c;
  // float *h_a, *h_b, *h_c, *d_a, *d_b, *d_c;
  double *h_a, *h_b, *h_c, *d_a, *d_b, *d_c;
  createArr(h_a, numElements);
  createArr(h_b, numElements);
  // h_c = (float *)malloc(size);
  // h_c = (int *)malloc(size);
  h_c = (double *)malloc(size);
  // Allocate memory for vectors on device
  CHECK_CUDA_ERROR(cudaMalloc(&d_a, size));
  CHECK_CUDA_ERROR(cudaMalloc(&d_b, size));
  CHECK_CUDA_ERROR(cudaMalloc(&d_c, size));
  // Copy vectors from host to device
  CHECK_CUDA_ERROR(cudaMemcpy(d_a, h_a, size, cudaMemcpyHostToDevice));
  CHECK_CUDA_ERROR(cudaMemcpy(d_b, h_b, size, cudaMemcpyHostToDevice));
  // Create CUDA events for timing
  cudaEvent_t start, stop;
  CHECK_CUDA_ERROR(cudaEventCreate(&start));
  CHECK_CUDA_ERROR(cudaEventCreate(&stop));
  // Launch warmup kernel
  warmup<<<1, 1>>>();
  CHECK_CUDA_ERROR(cudaGetLastError());
  CHECK_CUDA_ERROR(cudaDeviceSynchronize());
  // Launch vector addition kernel and measure time
  int threadsPerBlock = 256;
  int blocksPerGrid = (numElements + threadsPerBlock - 1) / threadsPerBlock;
  CHECK_CUDA_ERROR(cudaEventRecord(start));
  // vectorAdd<int>
  //     <<<blocksPerGrid, threadsPerBlock>>>(d_a, d_b, d_c, numElements);
  // vectorAdd<float>
  //     <<<blocksPerGrid, threadsPerBlock>>>(d_a, d_b, d_c, numElements);
  vectorAdd<double>
      <<<blocksPerGrid, threadsPerBlock>>>(d_a, d_b, d_c, numElements);
  CHECK_CUDA_ERROR(cudaGetLastError());
  CHECK_CUDA_ERROR(cudaEventRecord(stop));
  // Copy result from device to host
  CHECK_CUDA_ERROR(cudaMemcpy(h_c, d_c, size, cudaMemcpyDeviceToHost));
  // Synchronize and calculate elapsed time
  CHECK_CUDA_ERROR(cudaEventSynchronize(stop));
  float milliseconds = 0;
  CHECK_CUDA_ERROR(cudaEventElapsedTime(&milliseconds, start, stop));
  printf("Execution time: %f milliseconds\n", milliseconds);
  // Free device memory
  CHECK_CUDA_ERROR(cudaFree(d_a));
  CHECK_CUDA_ERROR(cudaFree(d_b));
  CHECK_CUDA_ERROR(cudaFree(d_c));
  // Free host memory
  free(h_a);
  free(h_b);
  free(h_c);
  return 0;
}
And my output (one run per type, switching between the commented-out int/float/double lines above and rebuilding each time) is:
Execution time (double): 25.986048 milliseconds
Execution time (int): 0.079872 milliseconds
Execution time (float): 0.377856 milliseconds
When I increase the iteration count of the for loop inside the kernel from 1,000 to 10,000, the values change only for the double and float kernels:
Execution time (double): 25.986048 milliseconds
Execution time (int): 0.079872 milliseconds
Execution time (float): 3.499744 milliseconds
Why does the value for int not change? And is there anything else that I am missing?
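In case it helps with answering: I have not yet looked at what the compiler actually generates for the loop in each of the three cases. I assume I could dump the PTX for comparison with something like the command below (the file name is just whatever I saved the source as), but I have not done that yet.
nvcc -ptx vector_add.cu -o vector_add.ptx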