So I am trying to test how fast kernels execute on the different core types of the GPU. Here is the code:
#include <stdio.h>
#include <stdlib.h>
// CUDA kernel to add two vectors of int32
__global__ void vectorAddInt(int *a, int *b, int *c, int numElements) {
  int i = blockDim.x * blockIdx.x + threadIdx.x;
  if (i < numElements) {
    c[i] = a[i] + b[i];
  }
}
// CUDA kernel to add two vectors of float
__global__ void vectorAddFloat(float *a, float *b, float *c, int numElements) {
  int i = blockDim.x * blockIdx.x + threadIdx.x;
  if (i < numElements) {
    c[i] = a[i] + b[i];
  }
}
// CUDA kernel to add two vectors of double
__global__ void vectorAddDouble(double *a, double *b, double *c,
                                int numElements) {
  int i = blockDim.x * blockIdx.x + threadIdx.x;
  if (i < numElements) {
    c[i] = a[i] + b[i];
  }
}
__global__ void warmup_kernel() {}
int main() {
  int numElements = 32 * 1024; // Number of elements in the vectors
  // Size the buffers for the largest element type (double) so the same
  // allocations can be reused by all three kernels without the fp64
  // kernel running past the end of the buffers
  size_t size = numElements * sizeof(double);
  // Allocate memory for vectors on host
  int *h_a, *h_b, *h_c;
  h_a = (int *)malloc(size);
  h_b = (int *)malloc(size);
  h_c = (int *)malloc(size);
  // Initialize vectors on host
  for (int i = 0; i < numElements; ++i) {
    h_a[i] = i;
    h_b[i] = i;
  }
  // Allocate memory for vectors on device
  int *d_a, *d_b, *d_c;
  cudaMalloc(&d_a, size);
  cudaMalloc(&d_b, size);
  cudaMalloc(&d_c, size);
  // Copy the input vectors from host to device
  cudaMemcpy(d_a, h_a, size, cudaMemcpyHostToDevice);
  cudaMemcpy(d_b, h_b, size, cudaMemcpyHostToDevice);
  // Create CUDA events for timing
  cudaEvent_t start, stop;
  cudaEventCreate(&start);
  cudaEventCreate(&stop);
  // Launch vector addition kernel for int32 and measure time
  int threadsPerBlock = 1024;
  int blocksPerGrid = (numElements + threadsPerBlock - 1) / threadsPerBlock;
  warmup_kernel<<<blocksPerGrid, threadsPerBlock>>>();
  cudaEventRecord(start);
  for (int i = 0; i < 100; i++) {
    vectorAddInt<<<blocksPerGrid, threadsPerBlock>>>(d_a, d_b, d_c,
                                                     numElements);
  }
  cudaEventRecord(stop);
  // Copy result from device to host
  cudaMemcpy(h_c, d_c, size, cudaMemcpyDeviceToHost);
  // Synchronize and calculate elapsed time
  cudaEventSynchronize(stop);
  float milliseconds_int32 = 0;
  cudaEventElapsedTime(&milliseconds_int32, start, stop);
  // Reset device memory and events
  cudaMemset(d_c, 0, size);
  cudaEventRecord(start);
  // Launch vector addition kernel for fp32 and measure time
  for (int i = 0; i < 100; i++) {
    vectorAddFloat<<<blocksPerGrid, threadsPerBlock>>>(
        (float *)d_a, (float *)d_b, (float *)d_c, numElements);
  }
  cudaEventRecord(stop);
  // Copy result from device to host
  cudaMemcpy(h_c, d_c, size, cudaMemcpyDeviceToHost);
  // Synchronize and calculate elapsed time
  cudaEventSynchronize(stop);
  float milliseconds_fp32 = 0;
  cudaEventElapsedTime(&milliseconds_fp32, start, stop);
  // Reset device memory and events
  cudaMemset(d_c, 0, size);
  cudaEventRecord(start);
  // Launch vector addition kernel for fp64 and measure time
  for (int i = 0; i < 100; i++) {
    vectorAddDouble<<<blocksPerGrid, threadsPerBlock>>>(
        (double *)d_a, (double *)d_b, (double *)d_c, numElements);
  }
  cudaEventRecord(stop);
  // Copy result from device to host
  cudaMemcpy(h_c, d_c, size, cudaMemcpyDeviceToHost);
  // Synchronize and calculate elapsed time
  cudaEventSynchronize(stop);
  float milliseconds_fp64 = 0;
  cudaEventElapsedTime(&milliseconds_fp64, start, stop);
printf("Execution time for int32 kernel: %f millisecondsn",
milliseconds_int32 / 100);
printf("Execution time for fp32 kernel: %f millisecondsn",
milliseconds_fp32 / 100);
printf("Execution time for fp64 kernel: %f millisecondsn",
milliseconds_fp64 / 100);
  // Free device memory
  cudaFree(d_a);
  cudaFree(d_b);
  cudaFree(d_c);
  // Free host memory
  free(h_a);
  free(h_b);
  free(h_c);
  return 0;
}
(You can give me some comments on the code as well, in case I am making any mistakes.) I am compiling and running this code on an RTX 3050 (Ampere), and it gives me the following results:
[screenshot: kernel execution times measured on the RTX 3050]
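One thing I am not sure about: I never check the return status of any CUDA call. I was considering wrapping them in a small helper along these lines (just my own sketch, not part of the benchmark above, and the macro name is made up):

// Sketch of an error-checking helper: runs a CUDA runtime call and
// aborts with the error string if it fails.
#define CHECK_CUDA(call)                                              \
  do {                                                                \
    cudaError_t err_ = (call);                                        \
    if (err_ != cudaSuccess) {                                        \
      fprintf(stderr, "CUDA error %s at %s:%d\n",                     \
              cudaGetErrorString(err_), __FILE__, __LINE__);          \
      exit(EXIT_FAILURE);                                             \
    }                                                                 \
  } while (0)

// e.g. CHECK_CUDA(cudaMalloc(&d_a, size));
//      CHECK_CUDA(cudaMemcpy(d_a, h_a, size, cudaMemcpyHostToDevice));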
As per the architecture diagram of Ampere, given here:
https://developer-blogs.nvidia.com/wp-content/uploads/2021/guc/raD52-V3yZtQ3WzOE0Cvzvt8icgGHKXPpN2PS_5MMyZLJrVxgMtLN4r2S2kp5jYI9zrA2e0Y8vAfpZia669pbIog2U9ZKdJmQ8oSBjof6gc4IrhmorT2Rr-YopMlOf1aoU3tbn5Q.png
There are 64 INT32 cores, 64 FP32 cores, and 32 FP64 cores. Is this the reason why the execution times for INT32 and FP32 are almost the same, while FP64 takes longer because there are fewer cores?
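In case it helps interpret the numbers, here is a tiny sketch (my own addition, not in the code above) to print how many SMs the device has, since those core counts are per SM if I read the diagram right:

cudaDeviceProp prop;
cudaGetDeviceProperties(&prop, 0); // query device 0
printf("Device: %s, compute capability %d.%d, %d SMs\n",
       prop.name, prop.major, prop.minor, prop.multiProcessorCount);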
I ask because when I was experimenting with another GPU, a GTX 1660 Ti (Turing architecture), I got these results:
[screenshot: kernel execution times measured on the GTX 1660 Ti]
The results are almost the same, yet Turing has no dedicated FP64 cores. I am a bit confused.
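In case it matters, I was also thinking of trying a compute-heavier variant, so that arithmetic work rather than memory traffic dominates the kernel time, something along these lines (just a sketch of my own; the kernel names, the dependent FMA chain, and the iteration count are arbitrary choices, not part of the benchmark above):

// Each thread runs a long chain of dependent FMAs so that the FP32 / FP64
// arithmetic pipes, rather than memory bandwidth, dominate the kernel time.
__global__ void fmaLoopFloat(float *out, int numElements) {
  int i = blockDim.x * blockIdx.x + threadIdx.x;
  if (i < numElements) {
    float x = 1.0f;
    for (int k = 0; k < 4096; ++k) {
      x = fmaf(x, 1.000001f, 0.5f); // dependent FP32 FMA chain
    }
    out[i] = x; // store the result so the loop is not optimized away
  }
}

__global__ void fmaLoopDouble(double *out, int numElements) {
  int i = blockDim.x * blockIdx.x + threadIdx.x;
  if (i < numElements) {
    double x = 1.0;
    for (int k = 0; k < 4096; ++k) {
      x = fma(x, 1.000001, 0.5); // dependent FP64 FMA chain
    }
    out[i] = x; // store the result so the loop is not optimized away
  }
}

Would comparing these two be a fairer way to see the effect of the FP32 vs FP64 core counts?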