I’m working on a CUDA kernel for the Adam optimization algorithm, and I’m getting inconsistent results that I suspect come from using float. I understand that float arithmetic can introduce variability, but I’d like to know whether there’s a way to make the results more consistent without switching to double.
#include <cstdio>
#include <cstdlib>
#include <random>
#include <iostream>
#include <cuda_runtime.h>

// for CUDA error handling
static void HandleError(cudaError_t err, const char* file, int line) {
    if (err != cudaSuccess) {
        printf("%s in %s at line %d\n", cudaGetErrorString(err), file, line);
        exit(EXIT_FAILURE);
    }
}
#define HANDLE_ERROR( err ) (HandleError( err, __FILE__, __LINE__ ))
void checkKernelErrors(const char* kernelName) {
    cudaError_t cudaStatus = cudaGetLastError();
    if (cudaStatus != cudaSuccess) {
        fprintf(stderr, "CUDA error after launching %s: %s\n", kernelName, cudaGetErrorString(cudaStatus));
    }
    cudaStatus = cudaDeviceSynchronize();
    if (cudaStatus != cudaSuccess) {
        fprintf(stderr, "cudaDeviceSynchronize returned error after launching %s: %s\n", kernelName, cudaGetErrorString(cudaStatus));
    }
}
__global__ void adam_kernel(float* value, float* m, float* v, float* grad, int size) {
    int idx = blockIdx.x * blockDim.x + threadIdx.x;
    if (idx < size) {
        // exponential moving averages of the gradient and squared gradient
        // (beta1 = 0.95, beta2 = 0.999)
        m[idx] = 0.95f * m[idx] + (1 - 0.95f) * grad[idx];
        v[idx] = 0.999f * v[idx] + (1 - 0.999f) * grad[idx] * grad[idx];
        // update step with learning rate 0.01 and epsilon 1e-8
        value[idx] = 0.01f * m[idx] / (sqrtf(v[idx]) + 1e-8f);
    }
}
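// A variant I'm experimenting with (hypothetical, not launched below): the same
// update written with explicit fmaf() calls. This doesn't add precision bits,
// but it pins down the rounding: the result no longer depends on whether the
// compiler contracts a*b + c into a fused multiply-add (nvcc's -fmad flag),
// which is one common source of float discrepancies between builds.
__global__ void adam_kernel_fma(float* value, float* m, float* v, float* grad, int size) {
    int idx = blockIdx.x * blockDim.x + threadIdx.x;
    if (idx < size) {
        float g = grad[idx];
        float g2 = g * g;
        // m = g + beta1 * (m - g) is the same lerp algebraically, with one fused rounding
        m[idx] = fmaf(0.95f, m[idx] - g, g);
        v[idx] = fmaf(0.999f, v[idx] - g2, g2);
        value[idx] = 0.01f * m[idx] / (sqrtf(v[idx]) + 1e-8f);
    }
}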
// seeded once from std::random_device, so every run draws a different sequence
float randomFrom(float min, float max) {
    static std::random_device rdev;
    static std::default_random_engine re(rdev());
    std::uniform_real_distribution<float> dist(min, max);
    return dist(re);
}
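// Sketch of a compensated (Kahan) summation I'm considering for the host-side
// checksum in main() below: it carries a running correction term so the many
// tiny addends aren't swallowed by the rounding of a plain float accumulator.
// (The helper name is mine; it isn't wired in below.)
float kahanSum(const float* data, int n) {
    float sum = 0.0f;
    float c = 0.0f; // running compensation for lost low-order bits
    for (int i = 0; i < n; ++i) {
        float y = data[i] - c;
        float t = sum + y;
        c = (t - sum) - y; // recovers what rounding dropped from y
        sum = t;
    }
    return sum;
}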
int main()
{
    const int size = 764 * 64;
    // host buffers
    float* value = new float[size];
    float* m = new float[size];
    float* v = new float[size];
    float* grad = new float[size];
    // initialize host data with small random values
    for (int i = 0; i < size; ++i) {
        value[i] = randomFrom(0, 0.00001f);
        m[i] = 0;
        v[i] = 0;
        grad[i] = randomFrom(0, 0.00001f);
    }
    // device buffers
    float* d_value;
    float* d_m;
    float* d_v;
    float* d_grad;
    // allocate device memory
    size_t dev_size = size * sizeof(float);
    HANDLE_ERROR(cudaMalloc(&d_value, dev_size));
    HANDLE_ERROR(cudaMalloc(&d_m, dev_size));
    HANDLE_ERROR(cudaMalloc(&d_v, dev_size));
    HANDLE_ERROR(cudaMalloc(&d_grad, dev_size));
    // copy host to device
    HANDLE_ERROR(cudaMemcpy(d_value, value, dev_size, cudaMemcpyHostToDevice));
    HANDLE_ERROR(cudaMemcpy(d_m, m, dev_size, cudaMemcpyHostToDevice));
    HANDLE_ERROR(cudaMemcpy(d_v, v, dev_size, cudaMemcpyHostToDevice));
    HANDLE_ERROR(cudaMemcpy(d_grad, grad, dev_size, cudaMemcpyHostToDevice));
    // launch kernel
    int threadsPerBlock = 256;
    int blocksPerGrid = (size + threadsPerBlock - 1) / threadsPerBlock;
    adam_kernel<<<blocksPerGrid, threadsPerBlock>>>(d_value, d_m, d_v, d_grad, size);
    checkKernelErrors("adam_kernel");
    // copy results back to host
    HANDLE_ERROR(cudaMemcpy(value, d_value, dev_size, cudaMemcpyDeviceToHost));
    // print a checksum of the results
    // (plain sequential float sum; cf. the kahanSum sketch above)
    float sum = 0;
    for (int i = 0; i < size; ++i) {
        sum += value[i];
    }
    std::cout << "Sum: " << sum << std::endl;
    // free memory
    delete[] value;
    delete[] m;
    delete[] v;
    delete[] grad;
    HANDLE_ERROR(cudaFree(d_value));
    HANDLE_ERROR(cudaFree(d_m));
    HANDLE_ERROR(cudaFree(d_v));
    HANDLE_ERROR(cudaFree(d_grad));
    return 0;
}
I use randomFrom to generate small random values similar to the actual weights and gradients in my full program.
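One detail I'm aware of: randomFrom seeds its engine from std::random_device, so every run draws a different input sequence. For apples-to-apples comparisons between runs I could pin the seed, e.g. (sketch):

static std::default_random_engine re(42); // fixed seed: identical inputs every run

but even so I'd like to understand the float-level variability itself.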
Is there any technique or best practice that can improve the precision or consistency of the results while staying with float, or is something else wrong with my code?
Thanks in advance!