Since there are different numbers of INT, FLOAT and DOUBLE cores in the NVIDIA Ampere architecture, I want to see this difference reflected when I do computation.
I have the following script:
#include <cstdio>
#include <cstdlib>
#include <ctime>
#include <cuda_runtime.h>
// Macro to check CUDA errors
#define CHECK_CUDA_ERROR(call)                                                 \
  {                                                                            \
    cudaError_t err = call;                                                    \
    if (err != cudaSuccess) {                                                  \
      fprintf(stderr, "CUDA error at %s %d: %s\n", __FILE__, __LINE__,         \
              cudaGetErrorString(err));                                        \
      exit(EXIT_FAILURE);                                                      \
    }                                                                          \
  }
// CUDA kernel to add two vectors, with extra work in a loop so that the
// computation time is measurable
template <typename T>
__global__ void vectorAdd(T *a, T *b, T *c, unsigned numElements) {
  int i = blockDim.x * blockIdx.x + threadIdx.x;
  if (i < numElements) {
    T temp = 0;
    for (int j = 0; j < 10000; ++j) { // Increased computational load
      temp += a[i] + b[i];
    }
    c[i] = temp;
  }
}
__global__ void warmup() { printf("I am a warmup kernel\n"); }
template <typename T> void createArr(T *&arr, unsigned numElements) {
  size_t size = numElements * sizeof(T);
  arr = (T *)malloc(size);
  for (unsigned i = 0; i < numElements; ++i) {
    arr[i] = static_cast<T>(rand() % 10);
  }
}
int main() {
  srand(time(0));
  unsigned numElements =
      1024 * 1024; // Increase the number of elements to ensure sufficient load
  // Allocate memory for vectors on host
  // unsigned size = numElements * sizeof(int);
  // unsigned size = numElements * sizeof(float);
  unsigned size = numElements * sizeof(double);
  // int *h_a, *h_b, *h_c, *d_a, *d_b, *d_c;
  // float *h_a, *h_b, *h_c, *d_a, *d_b, *d_c;
  double *h_a, *h_b, *h_c, *d_a, *d_b, *d_c;
  createArr(h_a, numElements);
  createArr(h_b, numElements);
  // h_c = (float *)malloc(size);
  // h_c = (int *)malloc(size);
  h_c = (double *)malloc(size);
  // Allocate memory for vectors on device
  CHECK_CUDA_ERROR(cudaMalloc(&d_a, size));
  CHECK_CUDA_ERROR(cudaMalloc(&d_b, size));
  CHECK_CUDA_ERROR(cudaMalloc(&d_c, size));
  // Copy vectors from host to device
  CHECK_CUDA_ERROR(cudaMemcpy(d_a, h_a, size, cudaMemcpyHostToDevice));
  CHECK_CUDA_ERROR(cudaMemcpy(d_b, h_b, size, cudaMemcpyHostToDevice));
  // Create CUDA events for timing
  cudaEvent_t start, stop;
  CHECK_CUDA_ERROR(cudaEventCreate(&start));
  CHECK_CUDA_ERROR(cudaEventCreate(&stop));
  // Launch warmup kernel
  warmup<<<1, 1>>>();
  CHECK_CUDA_ERROR(cudaGetLastError());
  CHECK_CUDA_ERROR(cudaDeviceSynchronize());
  // Launch vector addition kernel and measure time
  int threadsPerBlock = 256;
  int blocksPerGrid = (numElements + threadsPerBlock - 1) / threadsPerBlock;
  CHECK_CUDA_ERROR(cudaEventRecord(start));
  // vectorAdd<int>
  //     <<<blocksPerGrid, threadsPerBlock>>>(d_a, d_b, d_c, numElements);
  // vectorAdd<float>
  //     <<<blocksPerGrid, threadsPerBlock>>>(d_a, d_b, d_c, numElements);
  vectorAdd<double>
      <<<blocksPerGrid, threadsPerBlock>>>(d_a, d_b, d_c, numElements);
  CHECK_CUDA_ERROR(cudaGetLastError());
  CHECK_CUDA_ERROR(cudaEventRecord(stop));
  // Copy result from device to host
  CHECK_CUDA_ERROR(cudaMemcpy(h_c, d_c, size, cudaMemcpyDeviceToHost));
  // Synchronize and calculate elapsed time
  CHECK_CUDA_ERROR(cudaEventSynchronize(stop));
  float milliseconds = 0;
  CHECK_CUDA_ERROR(cudaEventElapsedTime(&milliseconds, start, stop));
  printf("Execution time: %f milliseconds\n", milliseconds);
  // Free device memory
  CHECK_CUDA_ERROR(cudaFree(d_a));
  CHECK_CUDA_ERROR(cudaFree(d_b));
  CHECK_CUDA_ERROR(cudaFree(d_c));
  // Free host memory
  free(h_a);
  free(h_b);
  free(h_c);
  return 0;
}
And my output (one run per type, switching between the commented-out int/float/double lines above and rebuilding each time) is:
Execution time (double): 25.986048 milliseconds
Execution time (int): 0.079872 milliseconds
Execution time (float): 0.377856 milliseconds
When I increase the iteration count of the for loop inside the kernel from 1,000 to 10,000, the values change only for the double and float kernels:
Execution time (double): 25.986048 milliseconds
Execution time (int): 0.079872 milliseconds
Execution time (float): 3.499744 milliseconds
Why does the value for int not change? And is there anything else that I am missing?
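In case it helps with answering: I have not yet looked at what the compiler actually generates for the loop in each of the three cases. I assume I could dump the PTX for comparison with something like the command below (the file name is just whatever I saved the source as), but I have not done that yet.
nvcc -ptx vector_add.cu -o vector_add.ptx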