Can you help me find out why my CUDA-coded MLP will not learn?

I wanted to write an MLP in CUDA without any dependencies, so I apologise in advance for my messy code. Could you examine my CUDA functions for an obvious mistake that would explain why it will not solve the simple XOR problem, as it should? The error should decrease, but instead it just fluctuates randomly. I tried to write my own CUDA RNG, but for now I am using rand() instead. I have:

  1. Initialised the activations (outputs, A_a), errors (E_a), weights (W_a) and biases (B_a).
  2. Checked the feedforward pass (NN_FF) thoroughly; it is mathematically sound.
  3. Confirmed the weights are being updated and are changing, but evidently not in the direction I want.

Please have a look and see if there is a problem in either NN_FF or NN_BP; the update rule I intend them to implement is sketched below.
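Written as pseudo-C for a single weight connecting input us to output nl:

// output-layer error (target t, activation a):
//   E_out = (t - a) * f'(a)
// error backpropagated to the layer below:
//   E_in += W[nl*NIN+us] * E_out
// weight update with momentum and decay:
//   W[nl*NIN+us] += Lrt * E_out * A_in[us]
//                 + Mmnt * previous_delta
//                 - Decy * previous_weight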
#include <stdio.h>
#include <stdlib.h>
#include <fstream>
#include <iostream>
#include <ctime>
#include <time.h>
#include "1dkern.cu"

using namespace std;

typedef struct{

float *B_a,*A_a,*W_a,*D_a,*P_a,*P_w,*E_a,*V_a;
float *L_a,*WL_a,*C_e;  
int *LU_a,*P_l,*W_l,*A_f;   
int ITER,NLAYS,NN,BS,NVAL,WTOT,NS;
float Lrt,Decy,Mmnt;
}myparam;
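/* Field key (from the comments in mem_alloc() below): A_a activations,
   B_a biases, E_a errors, V_a training values, W_a weights, D_a deltas,
   P_a pre-deltas, P_w pre-weights, A_f activation-function ids,
   P_l/W_l cumulative neuron/weight offsets per layer. L_a, WL_a and C_e
   appear unused in this listing. */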

int *LU_a;

float getlrand(float lower,float upper){

    return ((float) rand() / RAND_MAX) * (upper-lower) + lower;
}

/* Call all Act functions - includes rng for noisy Acts */
__device__ float mytanh_(float x)
{
  float exp2x = expf(2*x);
  return (exp2x - 1) / (exp2x + 1);
}

__device__ float sigmoid(float v)
{
    return 1 / (1 + expf(-v));
}


__device__ float InvSig(const float sum,const float act)
{
    return (act*(1-act)*sum) ;
}

__device__ float ActFun(int type, float v, int dir, float u, int seed){

    if(type==0){            /*Tanh*/
        if(dir==0){
            return mytanh_(v);
        }else{
            return (1-pow((double)mytanh_(u),2.0))*v;
            //return (1-(mytanh_(u)*mytanh_(u)))*v;
            //return (1-mytanh_(u))*v;
        }
    }

    if(type==1){            /*Sigmoid*/
        if(dir==0){
            return 1 / (1 + expf(-v));
            //return sigmoid(v);
        }else{
            return (u*(1-u)*v);
            //return InvSig(v,u);
        }
    }

    if(type==2){            /*Norm: identity in both directions*/
        return v;
    }

    return 0;
}
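// Derivative identities intended by the dir==1 branches above, where u is the
// stored activation and v is the incoming error term:
//   sigmoid: f'(x) = s*(1-s) with s = f(x), computed as u*(1-u)*v
//   tanh:    f'(x) = 1 - tanh(x)^2, computed here by passing u through mytanh_ again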

__device__ float rng(unsigned int m_w,unsigned int m_z){

   m_z = 36969 * (m_z & 65535) + (m_z >> 16);
   m_w = 18000 * (m_w & 65535) + (m_w >> 16);

   return (m_z << 16) + m_w;  /* 32-bit result */
}
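// (rng() is Marsaglia's multiply-with-carry generator and returns the raw
//  32-bit state as a float; mapping it into [0,1) would mean dividing by 2^32,
//  e.g. rng(w,z)/4294967296.0f -- an assumption about the intent, since
//  INIT_PARAM below divides by 1e6 instead.)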

__global__ void INIT_PARAM(myparam PARAM){

    int TW,NW;

    TW = PARAM.W_l[PARAM.NLAYS-1];
    NW = PARAM.P_l[PARAM.NLAYS-1];

    GPU_1D_KERN_LOOP(index,TW){

        const int ls = (index / 1 ) %TW;

        PARAM.W_a[ls] = -(2425-rng(150+ls,40)/1000000);
        PARAM.D_a[ls] = -(2425-rng(150+ls,40)/1000000);
        PARAM.P_w[ls] = -(2425-rng(150+ls,40)/1000000);
        PARAM.P_a[ls] = -(2425-rng(150+ls,40)/1000000);
    }

    __syncthreads();

    GPU_1D_KERN_LOOP(index,NW){

        const int ls = (index / 1 ) %NW;

        PARAM.B_a[ls] = -(2425-rng(150+ls,40)/1000000);
    }
}
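// (INIT_PARAM is commented out at its call site in call_nn(), so the weights
//  actually used come from the host-side getlrand() initialisation in the
//  net constructor.)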



__global__ void NN_FF(myparam PARAM,const int* const __restrict__ LU_a){

//cudaStream_t astream;
//cudaStreamCreateWithFlags(&astream,cudaStreamNonBlocking);

const int NLAYS = PARAM.NLAYS;

for(int LAYER=1;LAYER<NLAYS;LAYER++){

    /* Set constants */
    const int NIN  = LU_a[LAYER*6+0];
    const int NOUT = LU_a[LAYER*6+1];
    const int SIN  = LU_a[LAYER*6+2];
    const int SOUT = LU_a[LAYER*6+3];

    const int NVAL    = PARAM.NVAL;     //Number of training values (XOR or RL)
    const int NUMNET  = PARAM.NN;
    const int ITER    = PARAM.ITER;
    const int BS      = PARAM.BS;
    const int NETSIZE = PARAM.NS;

    if(LAYER==1){

        /* Set inputs */
        GPU_1D_KERN_LOOP(index,NIN*NUMNET*BS){

            const int ls = (index / 1 )       %NIN;     //Outputs / layer size
            const int bs = (index / NIN)      %BS;
            const int nn = (index / NIN / BS) %NUMNET;

            PARAM.A_a[nn*BS*NETSIZE+bs*NETSIZE+ls] = ActFun(PARAM.A_f[LAYER-1],PARAM.V_a[ITER*BS*NVAL+bs*NVAL+ls],0,0,1);
        }
        __syncthreads();
    }

    if(1){

        /* Zero this layer's outputs */
        GPU_1D_KERN_LOOP(index,NOUT*NUMNET*BS){

            const int ls = (index / 1 )       %NOUT;    //Outputs / layer size
            const int bs = (index / NIN)      %BS;
            const int nn = (index / NIN / BS) %NUMNET;

            PARAM.A_a[nn*BS*NETSIZE+bs*NETSIZE+SOUT+ls] = 0;
        }
        __syncthreads();

        /* Feedforward to MLP layer */
        GPU_1D_KERN_LOOP(index,NIN*NOUT*NUMNET*BS){

            const int ls  = (index / 1 )               %NOUT;   //Outputs / layer size
            const int col = (index / NOUT )            %NIN;    //Inputs / size of feature map
            const int bs  = (index / NOUT / NIN )      %BS;
            const int nn  = (index / NOUT / NIN / BS ) %NUMNET;

            atomicAdd(&PARAM.A_a[nn*BS*NETSIZE+bs*NETSIZE+SOUT+ls],PARAM.W_a[PARAM.W_l[LAYER-1]+(col+ls*NIN)]*PARAM.A_a[nn*BS*NETSIZE+bs*NETSIZE+SIN+col]);
            atomicAdd(&PARAM.A_a[nn*BS*NETSIZE+bs*NETSIZE+SOUT+ls],PARAM.B_a[nn*BS*NETSIZE+bs*NETSIZE+SOUT+ls]);
        }
        __syncthreads();

        if(1){

            /* Apply activation */
            GPU_1D_KERN_LOOP(index,NOUT*BS*NUMNET){

                const int ls = (index / 1 )        %NOUT;   //Outputs / layer size
                const int bs = (index / NOUT )     %BS;
                const int nn = (index / NOUT / BS) %NUMNET;

                PARAM.A_a[nn*BS*NETSIZE+bs*NETSIZE+SOUT+ls] = ActFun(PARAM.A_f[LAYER],PARAM.A_a[nn*BS*NETSIZE+bs*NETSIZE+SOUT+ls],0,0,1);
            }
        }
        __syncthreads();

    }
}//Layers loop
}
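// (Per layer, NN_FF is intended to compute A_out = f(W * A_in + B), with the
//  matrix-vector product accumulated through atomicAdd.)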

__global__ void NN_BP(myparam PARAM,const int* const __restrict__ LU_a){

cudaStream_t astream;
cudaStreamCreateWithFlags(&astream,cudaStreamNonBlocking);

const int NLAYS = PARAM.NLAYS;

for(int LAYER=NLAYS-1;LAYER>0;LAYER--){     /*Backprop error*/

    /* Set constants */
    const int NIN  = LU_a[LAYER*6+0];
    const int NOUT = LU_a[LAYER*6+1];
    const int SIN  = LU_a[LAYER*6+2];
    const int SOUT = LU_a[LAYER*6+3];
    const int WSTR = LU_a[LAYER*6+4];
    const int WEND = LU_a[LAYER*6+5];
    //const int WTOT = PARAM.WTOT;
    const int WSZE = WEND-WSTR;

    const int NVAL    = PARAM.NVAL;     //Number of training values (XOR or RL)
    const int NUMNET  = PARAM.NN;
    const int ITER    = PARAM.ITER;
    const int BS      = PARAM.BS;
    const int NETSIZE = PARAM.NS;

    if(LAYER==NLAYS-1){     /*Label-layer error*/

        GPU_1D_KERN_LOOP(index,NOUT*BS*NUMNET){     /*THIS CHECKS OUT*/

            const int nl = (index / 1 )        %NOUT;   //Outputs / layer size
            const int bs = (index / NOUT)      %BS;
            const int nn = (index / NOUT / BS) %NUMNET;

            int ERR_POS = nn*BS*NETSIZE+bs*NETSIZE+SOUT+nl;
            int VAL_POS = ITER*BS*NVAL+bs*NVAL+(NVAL-NOUT)+nl;
            int ACT_POS = nn*BS*NETSIZE+bs*NETSIZE+SOUT+nl;

            PARAM.E_a[ERR_POS] = PARAM.V_a[VAL_POS];
            atomicAdd(&PARAM.E_a[ERR_POS],-PARAM.A_a[ACT_POS]);

            //Add loss function here
            //PARAM.E_a[ERR_POS]=LOSS_FUN(PARAM.LF,PARAM.V_a[VAL_POS],PARAM.A_a[ACT_POS]);
        }
        __syncthreads();

        /* Invert label error */
        GPU_1D_KERN_LOOP(index,NOUT*BS*NUMNET){

            const int nl = (index / 1 )        %NOUT;   //Number of labels
            const int bs = (index / NOUT)      %BS;
            const int nn = (index / NOUT / BS) %NUMNET;

            int ERR_POS = nn*BS*NETSIZE+bs*NETSIZE+SOUT+nl;
            int ACT_POS = nn*BS*NETSIZE+bs*NETSIZE+SOUT+nl;

            PARAM.E_a[ERR_POS] = ActFun(PARAM.A_f[NLAYS-1],PARAM.E_a[ERR_POS],1,PARAM.A_a[ACT_POS],1);
            atomicAdd(&PARAM.B_a[ERR_POS],PARAM.Lrt*ActFun(PARAM.A_f[NLAYS-1],PARAM.E_a[ERR_POS],1,PARAM.A_a[ACT_POS],1));
        }

    }//Label layer
    __syncthreads();

    if(1){

        /* None of this matters when you just learn from the bias!!! */

        /* Initialise lower error */
        GPU_1D_KERN_LOOP(index,NIN*BS*NUMNET){

            const int bs = (index / 1)           %BS;
            const int nn = (index / BS)          %NUMNET;
            const int ls = (index / BS / NUMNET) %NIN;

            PARAM.E_a[nn*BS*NETSIZE+bs*NETSIZE+SIN+ls] = 0;
        }
        __syncthreads();

        /* Backprop error layer to layer */
        GPU_1D_KERN_LOOP(index,NOUT*NIN*BS*NUMNET){

            const int bs = (index / 1)                  %BS;
            const int nn = (index / BS)                 %NUMNET;
            const int nl = (index / BS / NUMNET)        %NOUT;  //Number of labels
            const int us = (index / BS / NUMNET / NOUT) %NIN;   //Layer size

            int ERR_LO  = nn*BS*NETSIZE+bs*NETSIZE+SIN+us;
            int ERR_UP  = nn*BS*NETSIZE+bs*NETSIZE+SOUT+nl;
            int WGT_POS = PARAM.W_l[LAYER-1]+nl*NIN+us;     //Or use WSTR

            atomicAdd(&PARAM.E_a[ERR_LO],PARAM.W_a[WGT_POS]*PARAM.E_a[ERR_UP]);
        }
        __syncthreads();

        GPU_1D_KERN_LOOP(index,NIN*BS*NUMNET){

            const int us = (index / 1 )        %NIN;
            const int bs = (index / NOUT)      %BS;
            const int nn = (index / NOUT / BS) %NUMNET;

            int ERR_POS = nn*BS*NETSIZE+bs*NETSIZE+SIN+us;
            int ACT_POS = nn*BS*NETSIZE+bs*NETSIZE+SIN+us;

            PARAM.E_a[ERR_POS] = ActFun(PARAM.A_f[LAYER],PARAM.E_a[ERR_POS],1,PARAM.A_a[ACT_POS],1);
            atomicAdd(&PARAM.B_a[ERR_POS],PARAM.Lrt*ActFun(PARAM.A_f[LAYER],PARAM.E_a[ERR_POS],1,PARAM.A_a[ACT_POS],1));
        }
        __syncthreads();

        /* Begin weight updates */

        /* Deltas */
        GPU_1D_KERN_LOOP(index,WSZE){

            const int nl = (index / 1 ) % WSZE;     //Size of wgts

            PARAM.D_a[WSTR+nl] = PARAM.W_a[WSTR+nl];
        }
        __syncthreads();

        GPU_1D_KERN_LOOP(index,WSZE){

            const int nl = (index / 1 ) % WSZE;     //Size of wgts

            atomicAdd(&PARAM.D_a[WSTR+nl],-PARAM.P_w[WSTR+nl]);
        }
        __syncthreads();

        /* Pre wgts */
        GPU_1D_KERN_LOOP(index,WSZE){

            const int nl = (index / 1 ) % WSZE;     //Size of wgts

            PARAM.P_w[WSTR+nl] = PARAM.W_a[WSTR+nl];
        }
        __syncthreads();

        /* Wgt update */
        GPU_1D_KERN_LOOP(index,NIN*NOUT*NUMNET*BS){

            const int bs = (index / 1)                  %BS;
            const int nn = (index / BS)                 %NUMNET;
            const int nl = (index / BS / NUMNET)        %NOUT;  //Number of labels
            const int us = (index / BS / NUMNET / NOUT) %NIN;   //Layer size

            atomicAdd(&PARAM.W_a[WSTR+nl*NIN+us],PARAM.Mmnt*PARAM.P_a[WSTR+nl*NIN+us]);     //Deltas
            atomicAdd(&PARAM.W_a[WSTR+nl*NIN+us],PARAM.Lrt*PARAM.E_a[nn*BS*NETSIZE+bs*NETSIZE+SOUT+nl]*PARAM.A_a[nn*BS*NETSIZE+bs*NETSIZE+SIN+us]);  //Lrate
            atomicAdd(&PARAM.W_a[WSTR+nl*NIN+us],-PARAM.Decy*PARAM.P_w[WSTR+nl*NIN+us]);    //Preweights
        }
        __syncthreads();

        /* Pre deltas */
        GPU_1D_KERN_LOOP(index,WSZE){

            const int nl = (index / 1 ) % WSZE;     //Size of wgts

            PARAM.P_a[WSTR+nl] = PARAM.D_a[WSTR+nl];
        }
        __syncthreads();

    }
}//End layers
}

class net{
public:
myparam PARAM_DEV,PARAM_HOST;
int *L_ua,*PLAYR,*LAYERS,*ACTFI,*WPLAYR;
int NLAYS,BS,TI,NUMNET;
float *VAL;
clock_t startTime, endTime;

ofstream errfile;

~net(){

cudaFree(PARAM_DEV.B_a); cudaFree(PARAM_DEV.A_a); cudaFree(PARAM_DEV.W_a); cudaFree(PARAM_DEV.D_a);
cudaFree(PARAM_DEV.P_a); cudaFree(PARAM_DEV.P_w); cudaFree(PARAM_DEV.E_a); cudaFree(PARAM_DEV.V_a);
cudaFree(PARAM_DEV.L_a); cudaFree(PARAM_DEV.WL_a); cudaFree(PARAM_DEV.C_e);
cudaFree(PARAM_DEV.P_l); cudaFree(PARAM_DEV.W_l); cudaFree(PARAM_DEV.A_f);
cudaFree(LU_a); cudaFree(PARAM_DEV.LU_a);

delete[] PARAM_HOST.B_a; delete[] PARAM_HOST.A_a; delete[] PARAM_HOST.W_a; delete[] PARAM_HOST.D_a;
delete[] PARAM_HOST.P_a; delete[] PARAM_HOST.P_w; delete[] PARAM_HOST.E_a; delete[] PARAM_HOST.V_a;
delete[] PARAM_HOST.L_a; delete[] PARAM_HOST.WL_a; delete[] PARAM_HOST.C_e;
delete[] PARAM_HOST.P_l; delete[] PARAM_HOST.W_l; delete[] PARAM_HOST.A_f;

delete[] PLAYR; delete[] LAYERS; delete[] ACTFI; delete[] WPLAYR; delete[] VAL; delete[] L_ua;

}

net(int LAYERS_[],const char* ACTF[],int NLAYERS){

TI = 300;                   //Number of iterations
BS = PARAM_DEV.BS = 1;      //Batch size
PARAM_DEV.ITER = 0;         //Start of iterations
NUMNET = PARAM_DEV.NN = 1;  //Number of parallel nets
PARAM_DEV.Lrt = 0.065; PARAM_DEV.Mmnt = 0.0065; PARAM_DEV.Decy = 0.000065;

errfile.open("errfile.dat",ios::out);

NLAYS = PARAM_DEV.NLAYS = NLAYERS;
LAYERS = new int[NLAYERS];
PLAYR = new int[NLAYERS];
WPLAYR = new int[NLAYERS];
ACTFI = new int[NLAYERS];
L_ua = new int[NLAYS*6];        //Layer Constants

for(int i=0;i<NLAYS;i++){

ACTFI[i]=0;

if(ACTF[i][0]=='T'&&ACTF[i][1]=='A'&&ACTF[i][2]=='N'&&ACTF[i][3]=='H'){ACTFI[i]=0;}
if(ACTF[i][0]=='S'&&ACTF[i][1]=='I'&&ACTF[i][2]=='G'&&ACTF[i][3]=='M'){ACTFI[i]=1;}
if(ACTF[i][0]=='N'&&ACTF[i][1]=='O'&&ACTF[i][2]=='R'&&ACTF[i][3]=='M'){ACTFI[i]=2;}


if(i==0){
PLAYR[0]  = LAYERS_[0];
WPLAYR[0] = 0;
        }
if(i>0){
PLAYR[i]  = PLAYR[i-1]+LAYERS_[i];              //Neurons
WPLAYR[i] = WPLAYR[i-1]+LAYERS_[i]*LAYERS_[i-1];    //Weights   
        }
LAYERS[i] = LAYERS_[i];     
                            }

PARAM_DEV.NVAL = LAYERS[NLAYS-1]+LAYERS[0];         //Set number of Values to learn (3)
PARAM_DEV.WTOT = WPLAYR[NLAYS-1];
PARAM_DEV.NS   = PLAYR[NLAYS-1];                    //Netsize

for(int i=1;i<NLAYS;i++){
if(i>1){
L_ua[i*6+0] = PLAYR[i-1]-PLAYR[i-2];//NIN
L_ua[i*6+1] = PLAYR[i]-PLAYR[i-1];//NOUT
L_ua[i*6+2] = PLAYR[i-2];//SIN
L_ua[i*6+3] = PLAYR[i-1];//SOUT
L_ua[i*6+4] = WPLAYR[i-1];//WSTR
L_ua[i*6+5] = WPLAYR[i];//WEND
    }else{
L_ua[i*6+0] = PLAYR[0];//NIN
L_ua[i*6+1] = PLAYR[1]-PLAYR[0];//NOUT
L_ua[i*6+2] = 0;//SIN
L_ua[i*6+3] = PLAYR[0];//SOUT
L_ua[i*6+4] = WPLAYR[i-1];//WSTR
L_ua[i*6+5] = WPLAYR[i];//WEND
}
}




PARAM_HOST.W_a = new float[WPLAYR[NLAYS-1]];
PARAM_HOST.P_w = new float[WPLAYR[NLAYS-1]];
PARAM_HOST.D_a = new float[WPLAYR[NLAYS-1]];
PARAM_HOST.P_a = new float[WPLAYR[NLAYS-1]];

PARAM_HOST.E_a = new float[NUMNET*BS*PLAYR[NLAYS-1]];
PARAM_HOST.A_a = new float[NUMNET*BS*PLAYR[NLAYS-1]];
PARAM_HOST.B_a = new float[NUMNET*BS*PLAYR[NLAYS-1]];

for(int i=0;i<BS*NUMNET*PLAYR[NLAYS-1];i++){

PARAM_HOST.A_a[i] = 0;
PARAM_HOST.B_a[i] =getlrand(0,1);
PARAM_HOST.E_a[i] = 0;                  
                                        }


VAL  = new float[(LAYERS[NLAYS-1]+LAYERS[0])*BS*TI];
prep_val(VAL,0);
/*Init weights and deltas*/
for(int i=0;i<WPLAYR[NLAYS-1];i++){

    PARAM_HOST.W_a[i] = getlrand(0,1);
    PARAM_HOST.D_a[i] = getlrand(0,1);
    PARAM_HOST.P_a[i] = getlrand(0,1);
    PARAM_HOST.P_w[i] = getlrand(0,1);
}

}



void mem_alloc(){

PARAM_HOST.A_a = new float[BS*NUMNET*PLAYR[NLAYS-1]];

cudaMallocManaged((void **)&PARAM_DEV.A_a, NUMNET*BS*PLAYR[NLAYS-1] * sizeof(float));   //ACT
cudaMallocManaged((void **)&PARAM_DEV.B_a, NUMNET*BS*PLAYR[NLAYS-1] * sizeof(float));   //BIAS
cudaMallocManaged((void **)&PARAM_DEV.E_a, NUMNET*BS*PLAYR[NLAYS-1] * sizeof(float));   //ERR
cudaMallocManaged((void **)&PARAM_DEV.V_a, TI * BS * (LAYERS[0]+LAYERS[NLAYS-1]) * sizeof(float)); //VAL
cudaMallocManaged((void **)&PARAM_DEV.W_a, WPLAYR[NLAYS-1] *  sizeof(float));   //WGT
cudaMallocManaged((void **)&PARAM_DEV.D_a, WPLAYR[NLAYS-1] *  sizeof(float));   //DLT
cudaMallocManaged((void **)&PARAM_DEV.P_a, WPLAYR[NLAYS-1] *  sizeof(float));   //PREDLT
cudaMallocManaged((void **)&PARAM_DEV.P_w, WPLAYR[NLAYS-1] *  sizeof(float));   //PWGT
cudaMallocManaged((void **)&PARAM_DEV.A_f, NLAYS * sizeof(int));    //ACTFI

cudaMemcpy(PARAM_DEV.A_f, ACTFI, NLAYS * sizeof(int), cudaMemcpyHostToDevice);
cudaMemcpy(PARAM_DEV.V_a,VAL, TI * BS * (LAYERS[0]+LAYERS[NLAYS-1]) * sizeof(float), cudaMemcpyHostToDevice);  /*Batch load Values Transfer to device*/
cudaMallocManaged((void **)&PARAM_DEV.P_l, NLAYS *  sizeof(int));   //Pointer to Act layers
cudaMallocManaged((void **)&PARAM_DEV.W_l, NLAYS *  sizeof(int));   //Pointer to Wgt layers
cudaMallocManaged((void **)&LU_a, NLAYS * 6 *  sizeof(int));    //Per-layer constants (NIN,NOUT,SIN,SOUT,WSTR,WEND)


cudaMemcpy(PARAM_DEV.B_a, PARAM_HOST.B_a, NUMNET*BS*PLAYR[NLAYS-1] * sizeof(float), cudaMemcpyHostToDevice);
cudaMemcpy(PARAM_DEV.W_a, PARAM_HOST.W_a, WPLAYR[NLAYS-1] * sizeof(float), cudaMemcpyHostToDevice);
cudaMemcpy(PARAM_DEV.D_a, PARAM_HOST.D_a, WPLAYR[NLAYS-1] * sizeof(float), cudaMemcpyHostToDevice);
cudaMemcpy(PARAM_DEV.P_a, PARAM_HOST.P_a, WPLAYR[NLAYS-1] * sizeof(float), cudaMemcpyHostToDevice);
cudaMemcpy(PARAM_DEV.P_w, PARAM_HOST.P_w, WPLAYR[NLAYS-1] * sizeof(float), cudaMemcpyHostToDevice);

cudaMemcpy(PARAM_DEV.W_l, WPLAYR, NLAYS * sizeof(int), cudaMemcpyHostToDevice);
cudaMemcpy(PARAM_DEV.P_l, PLAYR, NLAYS * sizeof(int), cudaMemcpyHostToDevice);
cudaMemcpy(LU_a, L_ua, NLAYS * 6 * sizeof(int), cudaMemcpyHostToDevice);


                        }

void prep_val(float *VAL,int EPOCH){

float xord[4][3];
xord[0][0]=0; xord[0][1]=0; xord[0][2]=-0.5;
xord[1][0]=0; xord[1][1]=1; xord[1][2]=0.5;
xord[2][0]=1; xord[2][1]=0; xord[2][2]=0.5;
xord[3][0]=1; xord[3][1]=1; xord[3][2]=-0.5;

int select = 0;

for(int i=0;i<BS*TI;i++){

    if(1){
        select = rand()%4;
    }else{
        select++;
        if(select==4) select=0;
    }

    VAL[i*3]   = xord[select][0];
    VAL[i*3+1] = xord[select][1];
    VAL[i*3+2] = xord[select][2];
}

}
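// (prep_val fills VAL with BS*TI randomly chosen XOR samples of 3 floats
//  each: {in0, in1, target}, with targets of -0.5/+0.5 rather than 0/1.)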

void call_nn(){

GpuInit myinit_aconn(1024,1024,1,1);    //GPU grid of 1024 blocks, 1024 threads per block
GpuInit* myinitaconn = &myinit_aconn;


PARAM_DEV.ITER = 0;     /* Cycle through ITER on the device above NN_FF.
                           Each time we run NN_FF we process BS values. */
                        
//INIT_PARAM<<<1024,1024,0,0>>>(PARAM_DEV);                     

if(1){
    cudaMemcpy(PARAM_HOST.W_a, PARAM_DEV.W_a, WPLAYR[NLAYS-1] * sizeof(float), cudaMemcpyDeviceToHost);
    cudaMemcpy(PARAM_HOST.P_w, PARAM_DEV.P_w, WPLAYR[NLAYS-1] * sizeof(float), cudaMemcpyDeviceToHost);
    cudaMemcpy(PARAM_HOST.D_a, PARAM_DEV.D_a, WPLAYR[NLAYS-1] * sizeof(float), cudaMemcpyDeviceToHost);
    cudaMemcpy(PARAM_HOST.P_a, PARAM_DEV.P_a, WPLAYR[NLAYS-1] * sizeof(float), cudaMemcpyDeviceToHost);

    cout<<"\nWeights:";
    for(int i=0;i<WPLAYR[NLAYS-1];i++){ cout<<PARAM_HOST.W_a[i]<<","; }
    cout<<"\n\nPre Weights:";
    for(int i=0;i<WPLAYR[NLAYS-1];i++){ cout<<PARAM_HOST.P_w[i]<<","; }
    cout<<"\n\nDelta Weights:";
    for(int i=0;i<WPLAYR[NLAYS-1];i++){ cout<<PARAM_HOST.D_a[i]<<","; }
    cout<<"\n\nPre Deltas:";
    for(int i=0;i<WPLAYR[NLAYS-1];i++){ cout<<PARAM_HOST.P_a[i]<<","; }
}

//exit(1);

startTime = clock();

for(int iter=0;iter<TI;iter++){
                        
PARAM_DEV.ITER=iter;
                        
NN_FF<<<myinitaconn->grid,myinitaconn->block,0,myinitaconn->stream1>>>(PARAM_DEV,LU_a);


NN_BP<<<myinitaconn->grid,myinitaconn->block,0,myinitaconn->stream1>>>(PARAM_DEV,LU_a);

if(iter>TI-7){
    cudaMemcpy(PARAM_HOST.A_a, PARAM_DEV.A_a, BS*PLAYR[NLAYS-1] * sizeof(float), cudaMemcpyDeviceToHost);
    cudaMemcpy(PARAM_HOST.E_a, PARAM_DEV.E_a, BS*PLAYR[NLAYS-1] * sizeof(float), cudaMemcpyDeviceToHost);

    cout<<"Iter:"<<iter<<"\n";
    for(int j=0;j<BS;j++){
        cout<<"Batch="<<j<<"\n";
        cout<<"[";
        for(int i=0;i<PLAYR[NLAYS-1];i++){
            cout<<"--Act="<<PARAM_HOST.A_a[j*PLAYR[NLAYS-1]+i]<<"--Err=";
            cout<<PARAM_HOST.E_a[j*PLAYR[NLAYS-1]+i]<<"\n";
        }
        cout<<"]";
    }

    cudaMemcpy(PARAM_HOST.W_a, PARAM_DEV.W_a, WPLAYR[NLAYS-1] * sizeof(float), cudaMemcpyDeviceToHost);
    cudaMemcpy(PARAM_HOST.P_w, PARAM_DEV.P_w, WPLAYR[NLAYS-1] * sizeof(float), cudaMemcpyDeviceToHost);
    cudaMemcpy(PARAM_HOST.D_a, PARAM_DEV.D_a, WPLAYR[NLAYS-1] * sizeof(float), cudaMemcpyDeviceToHost);
    cudaMemcpy(PARAM_HOST.P_a, PARAM_DEV.P_a, WPLAYR[NLAYS-1] * sizeof(float), cudaMemcpyDeviceToHost);

    cout<<"\nWeights:";
    for(int i=0;i<WPLAYR[NLAYS-1];i++){ cout<<PARAM_HOST.W_a[i]<<","; }
    cout<<"\n\nPre Weights:";
    for(int i=0;i<WPLAYR[NLAYS-1];i++){ cout<<PARAM_HOST.P_w[i]<<","; }
    cout<<"\n\nDelta Weights:";
    for(int i=0;i<WPLAYR[NLAYS-1];i++){ cout<<PARAM_HOST.D_a[i]<<","; }
    cout<<"\n\nPre Deltas:";
    for(int i=0;i<WPLAYR[NLAYS-1];i++){ cout<<PARAM_HOST.P_a[i]<<","; }
}

if(1){
    float TERR = 0;
    cudaMemcpy(&TERR, PARAM_DEV.E_a+PLAYR[NLAYS-1]-1, sizeof(float), cudaMemcpyDeviceToHost);
    errfile<<sqrt(pow(TERR,2))<<"\n";
}

}//Iter
                                    
endTime = clock();
cout<<"\nCUDA gpu took "<<1000*((endTime-startTime)/(float)CLOCKS_PER_SEC)<<" ms.\n\n*******************\n*******************"<<endl;

errfile.close();

}

};

int main(){

int topo[] = {2,4,1};
//const char* acts[] = {"NORM","TANH","NORM"};
const char* acts[] = {"NORM","SIGM","NORM"};

int nlayers = 3;

net mynet(topo,acts,nlayers);

mynet.mem_alloc();
mynet.call_nn();

cout<<"Hellon";
            }

Here is the included file 1dkern.cu:

#include <algorithm>
#include <complex>
#include <iostream>
#include <math.h>
#include <vector>


/*Begin 1DKern definition*/

/*This is a direct copy of TensorFlow's 1DKern code*/

namespace detail {
template <typename T>
class GpuGridRange {

    struct Iterator {
        __device__ Iterator(T index, T delta) : index_(index), delta_(delta) {}
        __device__ T operator*() const { return index_;}
        __device__ Iterator& operator++() {
            index_ += delta_;
            return *this;
        }

        __device__ bool operator!=(const Iterator& other) const {
            bool greater = index_ > other.index_;
            bool less = index_ < other.index_;
            if(!other.delta_){
            return less;
            }
            if(!delta_){
            return greater;
            }   

        return less || greater;
        }   

        private:
        T index_;
        const T delta_;

    };  //end Iterator struct


    public:
        __device__ GpuGridRange(T begin,T delta,T end)
        : begin_(begin),delta_(delta),end_(end) {}
    
    __device__ Iterator begin() const {return Iterator(begin_,delta_); }
    __device__ Iterator end() const {return Iterator(end_,0);}

    private:
    T begin_;
    T delta_;
    T end_; 
    


};  //end GPU class class
};  //end namespace detail

template <typename T>   //Allows you to use GPU iterator with all data types
__device__ detail::GpuGridRange<T> GpuGridRangeX(T count) {
return detail::GpuGridRange<T>(

    /*begin*/blockIdx.x * blockDim.x + threadIdx.x,
    /*delta*/gridDim.x * blockDim.x, /*end*/count
                );

}

template <typename T>   //Allows you to use GPU iterator with all data types
__device__ detail::GpuGridRange<T> GpuGridRangeY(T count) {
return detail::GpuGridRange<T>(

    /*begin*/blockIdx.y * blockDim.y + threadIdx.y,
    /*delta*/gridDim.y * blockDim.y, /*end*/count
                );

}
template <typename T>   //Allows you to use GPU iterator with all data types
__device__ detail::GpuGridRange<T> GpuGridRangeZ(T count) {
return detail::GpuGridRange<T>(

    /*begin*/blockIdx.z * blockDim.z + threadIdx.z,
    /*delta*/gridDim.z * blockDim.z, /*end*/count
                );

}


#define GPU_1D_KERN_LOOP(i, n) \
  for (int i : ::GpuGridRangeX<int>(n))

#define GPU_AXIS_KERNEL_LOOP(i, n, axis) \
  for (int i : ::GpuGridRange##axis<int>(n))


/*End 1DKern definition*/
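// A minimal usage sketch of the loop macro (not part of the original file):
// each thread starts at its global index and strides by the total thread
// count, so any grid/block shape covers all n elements.
//
//   __global__ void scale(float* data, int n, float s){
//       GPU_1D_KERN_LOOP(index, n){
//           data[index] *= s;
//       }
//   }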


class GpuInit{
public:

dim3 grid;
dim3 block;
cudaStream_t stream1;   //Could be a array of streams for multiple streams

GpuInit(unsigned int gridsizeX,unsigned int blocksizeX,unsigned int gridsizeY,unsigned int blocksizeY){

/*If using GPU_1D_KERN_LOOP */

grid.x = gridsizeX;   grid.y = gridsizeY;
block.x = blocksizeX; block.y= blocksizeY;

cudaStreamCreate(&stream1);


//cout<<"Grid dimensions are: "<<grid.x<<"--"<<grid.y<<"--"<<grid.z<<"n";
//cout<<"Block dimensions are: "<<block.x<<"--"<<block.y<<"--"<<block.z<<"n";

}

        };



__device__ double atomicAdd_(double* address, double val)
{
    unsigned long long int* address_as_ull =
                              (unsigned long long int*)address;
    unsigned long long int old = *address_as_ull, assumed;

    do {
        assumed = old;
        old = atomicCAS(address_as_ull, assumed,
                        __double_as_longlong(val +
                               __longlong_as_double(assumed)));

    // Note: uses integer comparison to avoid hang in case of NaN (since NaN != NaN)
    } while (assumed != old);

    return __longlong_as_double(old);
}
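// (This helper replicates the standard CAS-based fallback for
//  double-precision atomicAdd; on GPUs of compute capability 6.0 and above
//  the built-in atomicAdd(double*, double) can be used directly.)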


This is taken from TensorFlow source.

I have verified most of the calculations in both NN_FF and NN_BP. This is a big ask and I apologise if it is too much to take in. The code is messy, but the CUDA functions are quite straightforward. There is an array, LU_a, which holds the position data in each array relevant to the current layer; I have confirmed it is working and valid for NN_FF. Alas, I cannot find the reason the network will not learn.
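For reference, the six entries LU_a holds per layer, as filled in from L_ua in the net constructor, are:

// LU_a[LAYER*6+0] = NIN   number of inputs to the layer
// LU_a[LAYER*6+1] = NOUT  number of outputs of the layer
// LU_a[LAYER*6+2] = SIN   offset of the layer's input activations in A_a/E_a
// LU_a[LAYER*6+3] = SOUT  offset of the layer's output activations in A_a/E_a
// LU_a[LAYER*6+4] = WSTR  start of the layer's weights in W_a
// LU_a[LAYER*6+5] = WEND  one past the end of the layer's weights in W_a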

