i have this kernel:
__kernel void complete_fcl_feed_forward_opt(__global float* array , __global float* params, int input_size, int output_size,int threads, ulong input_offset, ulong output_offset, ulong weight_offset, ulong bias_offset, ulong array_total_size, ulong max_chunk_size, __local float* local_input) {
int idx = get_global_id(0);
int idx_local = get_local_id(0);
int weight_size = output_size*input_size;
int work_group_size = get_local_size(0);
int i,j,z;
max_chunk_size = 500;
if (idx < threads) {
for(z = 0; z < max_chunk_size; z+=work_group_size){
if(z + idx_local >= max_chunk_size || z+idx_local >= weight_size)
break;
else{
local_input[z + idx_local] = params[weight_offset+z+idx_local];
}
}
}
barrier(CLK_LOCAL_MEM_FENCE);
ulong val_input = array_total_size * idx + input_offset;
ulong val_output = array_total_size * idx + output_offset;
int iter,iter2;
for (iter = 0,iter2 = 1, j= 0; j < output_size; j++) {
float sum = 0.0f;
for (i = 0; i < input_size; i++, iter++) {
if(iter >= max_chunk_size){
barrier(CLK_GLOBAL_MEM_FENCE | CLK_LOCAL_MEM_FENCE);
if (idx < threads) {
for(z = 0; z < max_chunk_size; z+=work_group_size){
if(iter2*max_chunk_size+z+idx_local < weight_size){
local_input[z+idx_local] = params[weight_offset+iter2*max_chunk_size+z+idx_local];
}
else
break;
}
}
barrier(CLK_GLOBAL_MEM_FENCE | CLK_LOCAL_MEM_FENCE);
iter2++;
iter = 0;
}
if (idx < threads) {
sum += array[val_input + i] * local_input[iter];
}
}
if (idx < threads) {
array[val_output + j] += sum + params[bias_offset+j];
}
}
}
i have enqueued the kernel with 1 dimension.
GlobalSize is 1024
LocalSize is 1024
threads is 500
the local_input array has been allocated as
s->ret = clSetKernelArg(s->kernels[KERNEL_COMPLETE_FCL_FEED_FORWARD_OPT],11,sizeof(float)*n_parameters_to_share,NULL);
cl_verification(s->ret,"Error when setting Kernel arguments61");
where n_parameters_to_share = 6144
i run 1 single work group.
max_chunk_size has been set as
s->ret = clSetKernelArg(s->kernels[KERNEL_COMPLETE_FCL_FEED_FORWARD_OPT],10,sizeof(ulong),(void*)&n_parameters_to_share);
cl_verification(s->ret,"Error when setting Kernel arguments60");
It seems the kernel works as intended only if max_chunk_size = threads, otherwise it gives me inconsistency results
The kernel should output a fully-connected neural network.
I sure 100% the threads passed is 500 inside the kernel, the max_chink_size is 6144 inside the kernel the work_group_size is 1024 inside the kernel, the array is of size threadsarray_total_size. At this point i m only dubious if the local_input is 6144sizeof(float) but it should be i ve used the same parameter for allocating the local_input memory.
What’s wrong with this kernel?