So I am trying to test how fast kernels execute on the different core types of the GPU. Here is the code:
#include <stdio.h>
#include <stdlib.h>
// CUDA kernel to add two vectors of int32
__global__ void vectorAddInt(int *a, int *b, int *c, int numElements) {
  int i = blockDim.x * blockIdx.x + threadIdx.x;
  if (i < numElements) {
    c[i] = a[i] + b[i];
  }
}
// CUDA kernel to add two vectors of float
__global__ void vectorAddFloat(float *a, float *b, float *c, int numElements) {
  int i = blockDim.x * blockIdx.x + threadIdx.x;
  if (i < numElements) {
    c[i] = a[i] + b[i];
  }
}
// CUDA kernel to add two vectors of double
__global__ void vectorAddDouble(double *a, double *b, double *c,
                                int numElements) {
  int i = blockDim.x * blockIdx.x + threadIdx.x;
  if (i < numElements) {
    c[i] = a[i] + b[i];
  }
}
__global__ void warmup_kernel() {}
int main() {
  int numElements = 32 * 1024; // Number of elements in the vectors
  // Size the buffers for the largest element type (double) so the same
  // allocations can be reused by all three kernels without the fp64
  // kernel running past the end of the buffers
  size_t size = numElements * sizeof(double);
  // Allocate memory for vectors on host
  int *h_a, *h_b, *h_c;
  h_a = (int *)malloc(size);
  h_b = (int *)malloc(size);
  h_c = (int *)malloc(size);
  // Initialize vectors on host
  for (int i = 0; i < numElements; ++i) {
    h_a[i] = i;
    h_b[i] = i;
  }
  // Allocate memory for vectors on device
  int *d_a, *d_b, *d_c;
  cudaMalloc(&d_a, size);
  cudaMalloc(&d_b, size);
  cudaMalloc(&d_c, size);
  // Copy the input vectors from host to device
  cudaMemcpy(d_a, h_a, size, cudaMemcpyHostToDevice);
  cudaMemcpy(d_b, h_b, size, cudaMemcpyHostToDevice);
  // Create CUDA events for timing
  cudaEvent_t start, stop;
  cudaEventCreate(&start);
  cudaEventCreate(&stop);
  // Launch vector addition kernel for int32 and measure time
  int threadsPerBlock = 1024;
  int blocksPerGrid = (numElements + threadsPerBlock - 1) / threadsPerBlock;
  warmup_kernel<<<blocksPerGrid, threadsPerBlock>>>();
  cudaEventRecord(start);
  for (int i = 0; i < 100; i++) {
    vectorAddInt<<<blocksPerGrid, threadsPerBlock>>>(d_a, d_b, d_c,
                                                     numElements);
  }
  cudaEventRecord(stop);
  // Copy result from device to host
  cudaMemcpy(h_c, d_c, size, cudaMemcpyDeviceToHost);
  // Synchronize and calculate elapsed time
  cudaEventSynchronize(stop);
  float milliseconds_int32 = 0;
  cudaEventElapsedTime(&milliseconds_int32, start, stop);
  // Reset device memory and events
  cudaMemset(d_c, 0, size);
  cudaEventRecord(start);
  // Launch vector addition kernel for fp32 and measure time
  for (int i = 0; i < 100; i++) {
    vectorAddFloat<<<blocksPerGrid, threadsPerBlock>>>(
        (float *)d_a, (float *)d_b, (float *)d_c, numElements);
  }
  cudaEventRecord(stop);
  // Copy result from device to host
  cudaMemcpy(h_c, d_c, size, cudaMemcpyDeviceToHost);
  // Synchronize and calculate elapsed time
  cudaEventSynchronize(stop);
  float milliseconds_fp32 = 0;
  cudaEventElapsedTime(&milliseconds_fp32, start, stop);
  // Reset device memory and events
  cudaMemset(d_c, 0, size);
  cudaEventRecord(start);
  // Launch vector addition kernel for fp64 and measure time
  for (int i = 0; i < 100; i++) {
    vectorAddDouble<<<blocksPerGrid, threadsPerBlock>>>(
        (double *)d_a, (double *)d_b, (double *)d_c, numElements);
  }
  cudaEventRecord(stop);
  // Copy result from device to host
  cudaMemcpy(h_c, d_c, size, cudaMemcpyDeviceToHost);
  // Synchronize and calculate elapsed time
  cudaEventSynchronize(stop);
  float milliseconds_fp64 = 0;
  cudaEventElapsedTime(&milliseconds_fp64, start, stop);
printf("Execution time for int32 kernel: %f millisecondsn",
milliseconds_int32 / 100);
printf("Execution time for fp32 kernel: %f millisecondsn",
milliseconds_fp32 / 100);
printf("Execution time for fp64 kernel: %f millisecondsn",
milliseconds_fp64 / 100);
  // Free device memory
  cudaFree(d_a);
  cudaFree(d_b);
  cudaFree(d_c);
  // Free host memory
  free(h_a);
  free(h_b);
  free(h_c);
  return 0;
}
(You can give me some comments on the code as well, in case I am making any mistakes.) I am compiling and running this code on an RTX 3050 (Ampere), and it gives me the following results:
[screenshot: kernel execution times measured on the RTX 3050]
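One thing I am not sure about: I never check the return status of any CUDA call. I was considering wrapping them in a small helper along these lines (just my own sketch, not part of the benchmark above, and the macro name is made up):

// Sketch of an error-checking helper: runs a CUDA runtime call and
// aborts with the error string if it fails.
#define CHECK_CUDA(call)                                              \
  do {                                                                \
    cudaError_t err_ = (call);                                        \
    if (err_ != cudaSuccess) {                                        \
      fprintf(stderr, "CUDA error %s at %s:%d\n",                     \
              cudaGetErrorString(err_), __FILE__, __LINE__);          \
      exit(EXIT_FAILURE);                                             \
    }                                                                 \
  } while (0)

// e.g. CHECK_CUDA(cudaMalloc(&d_a, size));
//      CHECK_CUDA(cudaMemcpy(d_a, h_a, size, cudaMemcpyHostToDevice));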
As per the architecture diagram of Ampere, given here:
https://developer-blogs.nvidia.com/wp-content/uploads/2021/guc/raD52-V3yZtQ3WzOE0Cvzvt8icgGHKXPpN2PS_5MMyZLJrVxgMtLN4r2S2kp5jYI9zrA2e0Y8vAfpZia669pbIog2U9ZKdJmQ8oSBjof6gc4IrhmorT2Rr-YopMlOf1aoU3tbn5Q.png
There are 64 INT32 cores, 64 FP32 cores, and 32 FP64 cores. Is this the reason why the execution times for INT32 and FP32 are almost the same, while FP64 takes longer because there are fewer cores?
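In case it helps interpret the numbers, here is a tiny sketch (my own addition, not in the code above) to print how many SMs the device has, since those core counts are per SM if I read the diagram right:

cudaDeviceProp prop;
cudaGetDeviceProperties(&prop, 0); // query device 0
printf("Device: %s, compute capability %d.%d, %d SMs\n",
       prop.name, prop.major, prop.minor, prop.multiProcessorCount);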
I ask because when I was experimenting with another GPU, a GTX 1660 Ti (Turing architecture), I got these results:
[screenshot: kernel execution times measured on the GTX 1660 Ti]
The results are almost the same, yet Turing has no dedicated FP64 cores. I am a bit confused.
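In case it matters, I was also thinking of trying a compute-heavier variant, so that arithmetic work rather than memory traffic dominates the kernel time, something along these lines (just a sketch of my own; the kernel names, the dependent FMA chain, and the iteration count are arbitrary choices, not part of the benchmark above):

// Each thread runs a long chain of dependent FMAs so that the FP32 / FP64
// arithmetic pipes, rather than memory bandwidth, dominate the kernel time.
__global__ void fmaLoopFloat(float *out, int numElements) {
  int i = blockDim.x * blockIdx.x + threadIdx.x;
  if (i < numElements) {
    float x = 1.0f;
    for (int k = 0; k < 4096; ++k) {
      x = fmaf(x, 1.000001f, 0.5f); // dependent FP32 FMA chain
    }
    out[i] = x; // store the result so the loop is not optimized away
  }
}

__global__ void fmaLoopDouble(double *out, int numElements) {
  int i = blockDim.x * blockIdx.x + threadIdx.x;
  if (i < numElements) {
    double x = 1.0;
    for (int k = 0; k < 4096; ++k) {
      x = fma(x, 1.000001, 0.5); // dependent FP64 FMA chain
    }
    out[i] = x; // store the result so the loop is not optimized away
  }
}

Would comparing these two be a fairer way to see the effect of the FP32 vs FP64 core counts?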