I wanted to write an MLP in CUDA without any dependencies; I apologise in advance for my messy code. Could you examine my CUDA functions for an obvious mistake that would explain why it will not solve the simple XOR problem as it should? The error should decrease, but instead it just stays at a random value. I tried to write my own CUDA RNG but I am using rand() instead. So far I have:
- Initialised the activations/outputs (A_a), errors (E_a), weights (W_a) and biases (B_a)
- Checked the feedforward pass NN_FF thoroughly; it is mathematically sound
- Confirmed the weights are being updated and changing, just not in the direction I want
Please have a look and see whether there is a problem in either NN_FF or NN_BP.
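For reference, the behaviour I am hoping for is what a plain CPU version of the same 2-4-1 XOR network shows: the summed squared error shrinks over the epochs. This is only a minimal sketch for comparison (tanh hidden layer, linear output, plain SGD, made-up hyperparameters), not my actual code:

#include <stdio.h>
#include <stdlib.h>
#include <math.h>
/* Minimal CPU-only 2-4-1 XOR reference: tanh hidden units, linear output,
   targets +-0.5 as in prep_val() below. Purely illustrative. */
int main(){
    const float in[4][2] = {{0,0},{0,1},{1,0},{1,1}};
    const float tgt[4]   = {-0.5f,0.5f,0.5f,-0.5f};
    float w1[4][2], b1[4] = {0,0,0,0}, w2[4], b2 = 0.0f;
    for(int j=0;j<4;j++){
        w2[j] = (float)rand()/RAND_MAX - 0.5f;
        for(int i=0;i<2;i++) w1[j][i] = (float)rand()/RAND_MAX - 0.5f;
    }
    const float lr = 0.1f;
    for(int epoch=0;epoch<2000;epoch++){
        float err = 0.0f;
        for(int s=0;s<4;s++){
            float h[4], out = b2;
            for(int j=0;j<4;j++){
                h[j] = tanhf(w1[j][0]*in[s][0] + w1[j][1]*in[s][1] + b1[j]);
                out += w2[j]*h[j];
            }
            float d = tgt[s] - out;                    /* output error (linear output) */
            err += d*d;
            b2 += lr*d;
            for(int j=0;j<4;j++){
                float dh = d*w2[j]*(1.0f - h[j]*h[j]); /* backprop through tanh */
                w2[j]    += lr*d*h[j];
                b1[j]    += lr*dh;
                w1[j][0] += lr*dh*in[s][0];
                w1[j][1] += lr*dh*in[s][1];
            }
        }
        if(epoch%200==0) printf("epoch %d  sum-squared error %f\n",epoch,err);
    }
}

My CUDA version is below: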
#include <stdio.h>
#include <stdlib.h>
#include <fstream>
#include <iostream>
#include <ctime>
#include <time.h>
#include "1dkern.cu"
using namespace std;
typedef struct{
float *B_a,*A_a,*W_a,*D_a,*P_a,*P_w,*E_a,*V_a; /* biases, activations, weights, deltas, pre-deltas, pre-weights, errors, training values */
float *L_a,*WL_a,*C_e;
int *LU_a,*P_l,*W_l,*A_f; /* layer constants, neuron-layer offsets, weight-layer offsets, activation-function ids */
int ITER,NLAYS,NN,BS,NVAL,WTOT,NS; /* current iteration, layer count, parallel nets, batch size, values per sample, total weights, net size */
float Lrt,Decy,Mmnt; /* learning rate, weight decay, momentum */
}myparam;
int *LU_a;
float getlrand(float lower,float upper){
return ((float) rand()/ RAND_MAX) * (upper-lower) + lower;
}
/* Call all Act functions - includes rng for noisy Acts */
__device__ float mytanh_(float x)
{
float exp2x = expf(2*x);
return (exp2x - 1) / (exp2x + 1);
}
__device__ float sigmoid(float v)
{
return 1 / (1 + expf(-v));
}
__device__ float InvSig(const float sum,const float act)
{
return (act*(1-act)*sum) ;
}
__device__ float ActFun(int type, float v,int dir,float u, int seed){
if(type==0){
/*Tanh*/
if(dir==0){
return mytanh_(v);
}else{
return (1-pow((double)mytanh_(u),2.0))*v;
//return (1-(mytanh_(u)*mytanh_(u)))*v;
//return (1-mytanh_(u))*v;
}
}
if(type==1){
/*Sigmoid*/
if(dir==0){
return 1 / (1 + expf(-v));
//return sigmoid(v);
}else{
return (u*(1-u)*v);
//return InvSig(v,u);
}
}
if(type==2){
/*Norm*/
if(dir==0){
return v;
}else{
return v;
}
}
return 0;
}
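/* A hypothetical self-check of how I call ActFun (not part of my program):
   dir==0 applies the activation to the raw sum, dir==1 multiplies the
   incoming error v by the derivative evaluated from the stored activation u. */
__global__ void ActFun_check(){
float sum = 0.3f, err = 0.1f;
float a = ActFun(1,sum,0,0.0f,1); // sigmoid forward: 1/(1+expf(-0.3))
float d = ActFun(1,err,1,a,1); // backward: a*(1-a)*0.1
printf("a=%f delta=%f\n",a,d);
}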
__device__ float rng(unsigned int m_w,unsigned int m_z){
m_z = 36969 * (m_z & 65535) + (m_z >> 16);
m_w = 18000 * (m_w & 65535) + (m_w >> 16);
return (m_z << 16) + m_w; /* 32-bit result */
}
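/* rng() above returns the raw 32-bit MWC word as a float. A sketch of how I
   could map it into a chosen init range instead of the magic-number
   arithmetic used in INIT_PARAM below (the [-0.5,0.5) range is just an
   assumption for illustration; 4294967296.0f is 2^32): */
__device__ float rng_uniform(unsigned int m_w,unsigned int m_z,float lower,float upper){
m_z = 36969 * (m_z & 65535) + (m_z >> 16);
m_w = 18000 * (m_w & 65535) + (m_w >> 16);
unsigned int r = (m_z << 16) + m_w;
return lower + (upper-lower)*(r/4294967296.0f); // e.g. rng_uniform(seed,40,-0.5f,0.5f)
}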
__global__ void INIT_PARAM(myparam PARAM){
int TW,NW;
TW=PARAM.W_l[PARAM.NLAYS-1];
NW=PARAM.P_l[PARAM.NLAYS-1];
GPU_1D_KERN_LOOP(index,TW){
const int ls = (index / 1 ) %TW;
PARAM.W_a[ls] = -(2425-rng(150+ls,40)/1000000);
PARAM.D_a[ls] = -(2425-rng(150+ls,40)/1000000);
PARAM.P_w[ls] = -(2425-rng(150+ls,40)/1000000);
PARAM.P_a[ls] = -(2425-rng(150+ls,40)/1000000);
}
__syncthreads();
GPU_1D_KERN_LOOP(index,NW){
const int ls = (index / 1 ) %NW;
PARAM.B_a[ls] = -(2425-rng(150+ls,40)/1000000);
}
}
__global__ void NN_FF(myparam PARAM,const int* const __restrict__ LU_a){
//cudaStream_t astream;
//cudaStreamCreateWithFlags(&astream,cudaStreamNonBlocking);
const int NLAYS = PARAM.NLAYS;
for(int LAYER=1;LAYER<NLAYS;LAYER++){
/* Set Constants */
const int NIN = LU_a[LAYER*6+0];
const int NOUT = LU_a[LAYER*6+1];
const int SIN = LU_a[LAYER*6+2];
const int SOUT = LU_a[LAYER*6+3];
const int NVAL = PARAM.NVAL; //Number of Training Values (XoR or RL)
const int NUMNET = PARAM.NN;
const int ITER = PARAM.ITER;
const int BS = PARAM.BS;
const int NETSIZE = PARAM.NS;
if(LAYER==1){
/*Set Inputs*/
GPU_1D_KERN_LOOP(index,NIN*NUMNET*BS){
const int ls = (index / 1 ) %NIN; //Outputs / layer size
const int bs = (index / NIN) %BS;
const int nn = (index / NIN / BS) %NUMNET;
PARAM.A_a[nn*BS*NETSIZE+bs*NETSIZE+ls]=ActFun(PARAM.A_f[LAYER-1],PARAM.V_a[ITER*BS*NVAL+bs*NVAL+ls],0,0,1);
}
__syncthreads();
}
if(1){
GPU_1D_KERN_LOOP(index,NOUT*NUMNET*BS){
const int ls = (index / 1 ) %NOUT; //Outputs / layer size
const int bs = (index / NIN) %BS;
const int nn = (index / NIN / BS) %NUMNET;
PARAM.A_a[nn*BS*NETSIZE+bs*NETSIZE+SOUT+ls]=0;
}
__syncthreads();
//Feedforward to MLP layer
GPU_1D_KERN_LOOP(index, NIN*NOUT*NUMNET*BS){
const int ls = (index / 1 ) %NOUT; //Outputs / layer size
const int col = (index / NOUT ) %NIN; //Inputs / size of feature map
const int bs = (index / NOUT / NIN ) %BS;
const int nn = (index / NOUT / NIN / BS ) %NUMNET;
atomicAdd(&PARAM.A_a[nn*BS*NETSIZE+bs*NETSIZE+SOUT+ls],PARAM.W_a[PARAM.W_l[LAYER-1]+(col+ls*NIN)]*PARAM.A_a[nn*BS*NETSIZE+bs*NETSIZE+SIN+col]);
atomicAdd(&PARAM.A_a[nn*BS*NETSIZE+bs*NETSIZE+SOUT+ls],PARAM.B_a[nn*BS*NETSIZE+bs*NETSIZE+SOUT+ls]);
}
__syncthreads();
if(1){
GPU_1D_KERN_LOOP(index, NOUT*BS*NUMNET){
const int ls = (index / 1 ) %NOUT; //Outputs / layer size
const int bs = (index / NOUT ) %BS;
const int nn = (index / NOUT / BS ) %NUMNET;
PARAM.A_a[nn*BS*NETSIZE+bs*NETSIZE+SOUT+ls] = ActFun(PARAM.A_f[LAYER],PARAM.A_a[nn*BS*NETSIZE+bs*NETSIZE+SOUT+ls],0,0,1);
}
}
__syncthreads();
}
}//Layers Loop
}
__global__ void NN_BP(myparam PARAM,const int* const __restrict__ LU_a){
cudaStream_t astream;
cudaStreamCreateWithFlags(&astream,cudaStreamNonBlocking);
const int NLAYS = PARAM.NLAYS;
for(int LAYER=NLAYS-1;LAYER>0;LAYER--){
/*Backprop Error*/
/* Set Constants */
const int NIN = LU_a[LAYER*6+0];
const int NOUT =LU_a[LAYER*6+1];
const int SIN = LU_a[LAYER*6+2];
const int SOUT =LU_a[LAYER*6+3];
const int WSTR =LU_a[LAYER*6+4];
const int WEND =LU_a[LAYER*6+5];
//const int WTOT = PARAM.WTOT;
const int WSZE = WEND-WSTR;
const int NVAL = PARAM.NVAL; //Number of Training Values (XoR or RL)
const int NUMNET = PARAM.NN;
const int ITER = PARAM.ITER;
const int BS = PARAM.BS;
const int NETSIZE = PARAM.NS;
if(LAYER==NLAYS-1){ /*Label Layer Error*/
GPU_1D_KERN_LOOP(index,NOUT*BS*NUMNET ){ /*THIS CHECKSOUT*/
const int nl = (index / 1 ) %NOUT; //Outputs / layer size
const int bs = (index / NOUT) %BS;
const int nn = (index / NOUT / BS) %NUMNET;
int ERR_POS = nn*BS*NETSIZE+bs*NETSIZE+SOUT+nl;
int VAL_POS = ITER*BS*NVAL+bs*NVAL+(NVAL-NOUT)+nl;
int ACT_POS = nn*BS*NETSIZE+bs*NETSIZE+SOUT+nl;
PARAM.E_a[ERR_POS] = PARAM.V_a[VAL_POS];
atomicAdd(&PARAM.E_a[ERR_POS],-PARAM.A_a[ACT_POS]);
//Add Loss function here
//PARAM.E_a[ERR_POS]=LOSS_FUN(PARAM.LF,PARAM.V_a[VAL_POS],PARAM.A_a[ACT_POS]);
}
__syncthreads();
/* Invert Label error */
GPU_1D_KERN_LOOP(index, NOUT*BS*NUMNET){
const int nl = (index / 1 ) %NOUT; //Number of labels
const int bs = (index / NOUT) %BS;
const int nn = (index / NOUT / BS) %NUMNET;
int ERR_POS = nn*BS*NETSIZE+bs*NETSIZE+SOUT+nl;
int ACT_POS = nn*BS*NETSIZE+bs*NETSIZE+SOUT+nl;
PARAM.E_a[ERR_POS] = ActFun(PARAM.A_f[NLAYS-1],PARAM.E_a[ERR_POS],1,PARAM.A_a[ACT_POS],1);
atomicAdd(&PARAM.B_a[ERR_POS],PARAM.Lrt*ActFun(PARAM.A_f[NLAYS-1],PARAM.E_a[ERR_POS],1,PARAM.A_a[ACT_POS],1));
}
}//Label Layer
__syncthreads();
if(1){
/* None of this matters when you just learn from the bias!!!*/
/* Initialise Lower Error */
GPU_1D_KERN_LOOP(index,NIN*BS*NUMNET){
const int bs = (index / 1) %BS;
const int nn = (index / BS) %NUMNET;
const int ls = (index / BS / NUMNET ) % NIN;
PARAM.E_a[nn*BS*NETSIZE+bs*NETSIZE+SIN+ls]=0;
}
__syncthreads();
/* Backprop Error Layer to Layer */
GPU_1D_KERN_LOOP(index, NOUT*NIN*BS*NUMNET){
const int bs = (index / 1) %BS;
const int nn = (index / BS) %NUMNET;
const int nl = (index / BS / NUMNET ) %NOUT; //Number of labels
const int us = (index / BS / NUMNET / NOUT ) %NIN; //layer size
int ERR_LO = nn*BS*NETSIZE+bs*NETSIZE+SIN+us;
int ERR_UP = nn*BS*NETSIZE+bs*NETSIZE+SOUT+nl;
int WGT_POS = PARAM.W_l[LAYER-1]+nl*NIN+us; //Or use WSTART
atomicAdd(&PARAM.E_a[ERR_LO], PARAM.W_a[WGT_POS]*PARAM.E_a[ERR_UP]);
}
__syncthreads();
GPU_1D_KERN_LOOP(index, NIN*BS*NUMNET){
const int us = (index / 1 ) %NIN;
const int bs = (index / NOUT) %BS;
const int nn = (index / NOUT / BS) %NUMNET;
int ERR_POS = nn*BS*NETSIZE+bs*NETSIZE+SIN+us;
int ACT_POS = nn*BS*NETSIZE+bs*NETSIZE+SIN+us;
PARAM.E_a[ERR_POS] = ActFun(PARAM.A_f[LAYER],PARAM.E_a[ERR_POS],1,PARAM.A_a[ACT_POS],1);
atomicAdd(&PARAM.B_a[ERR_POS],PARAM.Lrt*ActFun(PARAM.A_f[LAYER],PARAM.E_a[ERR_POS],1,PARAM.A_a[ACT_POS],1));
}
__syncthreads();
/*Begin Weight Updates*/
/* Deltas */
GPU_1D_KERN_LOOP(index,WSZE ){
const int nl = (index / 1 ) % WSZE; //SIZE OF WGTS
PARAM.D_a[WSTR+nl] = PARAM.W_a[WSTR+nl];
}
__syncthreads();
GPU_1D_KERN_LOOP(index,WSZE ){
const int nl = (index / 1 ) % WSZE; //SIZE OF WGTS
atomicAdd(&PARAM.D_a[WSTR+nl],-PARAM.P_w[WSTR+nl]);
}
__syncthreads();
/* Pre Wgts */
GPU_1D_KERN_LOOP(index,WSZE ){
const int nl = (index / 1 ) % WSZE; //SIZE OF WGTS
PARAM.P_w[WSTR+nl] = PARAM.W_a[WSTR+nl];
}
__syncthreads();
/* Wgt Update */
GPU_1D_KERN_LOOP(index,NIN*NOUT*NUMNET*BS ){
const int bs = (index / 1) %BS;
const int nn = (index / BS) %NUMNET;
const int nl = (index / BS / NUMNET ) %NOUT; //Number of labels
const int us = (index / BS / NUMNET /NOUT ) %NIN; //layer size
atomicAdd(&PARAM.W_a[WSTR+nl*NIN+us],PARAM.Mmnt*PARAM.P_a[WSTR+nl*NIN+us]); //Deltas
atomicAdd(&PARAM.W_a[WSTR+nl*NIN+us],PARAM.Lrt*PARAM.E_a[nn*BS*NETSIZE+bs*NETSIZE+SOUT+nl]*PARAM.A_a[nn*BS*NETSIZE+bs*NETSIZE+SIN+us]); //Lrate
atomicAdd(&PARAM.W_a[WSTR+nl*NIN+us],-PARAM.Decy*PARAM.P_w[WSTR+nl*NIN+us]); //Preweights
}
__syncthreads();
/* Pre Deltas */
GPU_1D_KERN_LOOP(index,WSZE ){
const int nl = (index / 1 ) % WSZE; //SIZE OF WGTS
PARAM.P_a[WSTR+nl] = PARAM.D_a[WSTR+nl];
}
__syncthreads();
} }//End Layers
}
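/* Written out serially, the weight update the loops above are meant to apply
   for one weight w connecting input unit `us` to output unit `nl` of a layer
   (my reading of the D_a / P_w / P_a bookkeeping, stated here for clarity):
     delta      = w - prev_w                    // D_a
     prev_w     = w                             // P_w
     w         += Mmnt * prev_delta             // momentum term (P_a)
     w         += Lrt  * E[SOUT+nl] * A[SIN+us] // gradient term
     w         -= Decy * prev_w                 // weight decay
     prev_delta = delta                         // P_a for the next iteration
*/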
class net{
public:
myparam PARAM_DEV,PARAM_HOST;
int *L_ua,*PLAYR,*LAYERS,*ACTFI,*WPLAYR;
int NLAYS,BS,TI,NUMNET;
float *VAL;
clock_t startTime, endTime;
ofstream errfile;
~net(){
cudaFree(PARAM_DEV.B_a);cudaFree(PARAM_DEV.A_a);cudaFree(PARAM_DEV.W_a);cudaFree(PARAM_DEV.D_a);cudaFree(PARAM_DEV.P_a);cudaFree(PARAM_DEV.P_w);cudaFree(PARAM_DEV.E_a);cudaFree(PARAM_DEV.V_a);
cudaFree(PARAM_DEV.L_a);cudaFree(PARAM_DEV.WL_a);cudaFree(PARAM_DEV.C_e);
cudaFree(PARAM_DEV.P_l);cudaFree(PARAM_DEV.W_l);cudaFree(PARAM_DEV.A_f);
cudaFree(LU_a);cudaFree(PARAM_DEV.LU_a);
delete[] PARAM_HOST.B_a; delete[] PARAM_HOST.A_a; delete[] PARAM_HOST.W_a; delete[] PARAM_HOST.D_a; delete[] PARAM_HOST.P_a; delete[] PARAM_HOST.P_w; delete[] PARAM_HOST.E_a; delete[] PARAM_HOST.V_a;
delete[] PARAM_HOST.L_a; delete[] PARAM_HOST.WL_a; delete[] PARAM_HOST.C_e;
delete[] PARAM_HOST.P_l; delete[] PARAM_HOST.W_l; delete[] PARAM_HOST.A_f;
delete[] PLAYR; delete[] LAYERS; delete[] ACTFI; delete[] WPLAYR; delete[] VAL; delete[] L_ua;
}
net(int LAYERS_[],const char* ACTF[],int NLAYERS){
TI = 300; //Number of Iterations
BS = PARAM_DEV.BS = 1; //Batch Size
PARAM_DEV.ITER = 0; //Start of Iterations
NUMNET = PARAM_DEV.NN = 1; //Number of Paralell Nets
PARAM_DEV.Lrt = 0.065; PARAM_DEV.Mmnt = 0.0065; PARAM_DEV.Decy = 0.000065;
errfile.open("errfile.dat",ios::out);
NLAYS = PARAM_DEV.NLAYS = NLAYERS;
LAYERS = new int[NLAYERS];
PLAYR = new int[NLAYERS];
WPLAYR = new int[NLAYERS];
ACTFI = new int[NLAYERS];
L_ua = new int[NLAYS*6]; //Layer Constants
for(int i=0;i<NLAYS;i++){
ACTFI[i]=0;
if(ACTF[i][0]=='T'&&ACTF[i][1]=='A'&&ACTF[i][2]=='N'&&ACTF[i][3]=='H'){ACTFI[i]=0;}
if(ACTF[i][0]=='S'&&ACTF[i][1]=='I'&&ACTF[i][2]=='G'&&ACTF[i][3]=='M'){ACTFI[i]=1;}
if(ACTF[i][0]=='N'&&ACTF[i][1]=='O'&&ACTF[i][2]=='R'&&ACTF[i][3]=='M'){ACTFI[i]=2;}
if(i==0){
PLAYR[0] = LAYERS_[0];
WPLAYR[0] = 0;
}
if(i>0){
PLAYR[i] = PLAYR[i-1]+LAYERS_[i]; //Neurons
WPLAYR[i] = WPLAYR[i-1]+LAYERS_[i]*LAYERS_[i-1]; //Weights
}
LAYERS[i] = LAYERS_[i];
}
PARAM_DEV.NVAL = LAYERS[NLAYS-1]+LAYERS[0]; //Set number of Values to learn (3)
PARAM_DEV.WTOT = WPLAYR[NLAYS-1];
PARAM_DEV.NS = PLAYR[NLAYS-1]; //Netsize
for(int i=1;i<NLAYS;i++){
if(i>1){
L_ua[i*6+0] = PLAYR[i-1]-PLAYR[i-2];//NIN
L_ua[i*6+1] = PLAYR[i]-PLAYR[i-1];//NOUT
L_ua[i*6+2] = PLAYR[i-2];//SIN
L_ua[i*6+3] = PLAYR[i-1];//SOUT
L_ua[i*6+4] = WPLAYR[i-1];//WSTR
L_ua[i*6+5] = WPLAYR[i];//WEND
}else{
L_ua[i*6+0] = PLAYR[0];//NIN
L_ua[i*6+1] = PLAYR[1]-PLAYR[0];//NOUT
L_ua[i*6+2] = 0;//SIN
L_ua[i*6+3] = PLAYR[0];//SOUT
L_ua[i*6+4] = WPLAYR[i-1];//WSTR
L_ua[i*6+5] = WPLAYR[i];//WEND
}
}
PARAM_HOST.W_a = new float[WPLAYR[NLAYS-1]];
PARAM_HOST.P_w = new float[WPLAYR[NLAYS-1]];
PARAM_HOST.D_a = new float[WPLAYR[NLAYS-1]];
PARAM_HOST.P_a = new float[WPLAYR[NLAYS-1]];
PARAM_HOST.E_a = new float[NUMNET*BS*PLAYR[NLAYS-1]];
PARAM_HOST.A_a = new float[NUMNET*BS*PLAYR[NLAYS-1]];
PARAM_HOST.B_a = new float[NUMNET*BS*PLAYR[NLAYS-1]];
for(int i=0;i<BS*NUMNET*PLAYR[NLAYS-1];i++){
PARAM_HOST.A_a[i] = 0;
PARAM_HOST.B_a[i] =getlrand(0,1);
PARAM_HOST.E_a[i] = 0;
}
VAL = new float[(LAYERS[NLAYS-1]+LAYERS[0])*BS*TI];
prep_val(VAL,0);
/*Init weights and deltas*/
for(int i=0;i<WPLAYR[NLAYS-1];i++){
PARAM_HOST.W_a[i] = getlrand(0,1);
PARAM_HOST.D_a[i] = getlrand(0,1);
PARAM_HOST.P_a[i] = getlrand(0,1);
PARAM_HOST.P_w[i] = getlrand(0,1);
}
}
void mem_alloc(){
PARAM_HOST.A_a = new float[BS*NUMNET*PLAYR[NLAYS-1]];
cudaMallocManaged((void **)&PARAM_DEV.A_a, NUMNET*BS*PLAYR[NLAYS-1] * sizeof(float)); //ACT
cudaMallocManaged((void **)&PARAM_DEV.B_a, NUMNET*BS*PLAYR[NLAYS-1] * sizeof(float)); //BIAS
cudaMallocManaged((void **)&PARAM_DEV.E_a, NUMNET*BS*PLAYR[NLAYS-1] * sizeof(float)); //ERR
cudaMallocManaged((void **)&PARAM_DEV.V_a, TI * BS * (LAYERS[0]+LAYERS[NLAYS-1]) * sizeof(float)); //VAL
cudaMallocManaged((void **)&PARAM_DEV.W_a, WPLAYR[NLAYS-1] * sizeof(float)); //WGT
cudaMallocManaged((void **)&PARAM_DEV.D_a, WPLAYR[NLAYS-1] * sizeof(float)); //DLT
cudaMallocManaged((void **)&PARAM_DEV.P_a, WPLAYR[NLAYS-1] * sizeof(float)); //PREDLT
cudaMallocManaged((void **)&PARAM_DEV.P_w, WPLAYR[NLAYS-1] * sizeof(float)); //PWGT
cudaMallocManaged((void **)&PARAM_DEV.A_f, NLAYS * sizeof(int)); //ACTFI
cudaMemcpy(PARAM_DEV.A_f, ACTFI, NLAYS * sizeof(int), cudaMemcpyHostToDevice);
cudaMemcpy(PARAM_DEV.V_a,VAL, TI * BS * (LAYERS[0]+LAYERS[NLAYS-1]) * sizeof(float), cudaMemcpyHostToDevice); /*Batch load Values Transfer to device*/
cudaMallocManaged((void **)&PARAM_DEV.P_l, NLAYS * sizeof(int)); //Pointer to Act layers
cudaMallocManaged((void **)&PARAM_DEV.W_l, NLAYS * sizeof(int)); //Pointer to Wgt layers
cudaMallocManaged((void **)&LU_a, NLAYS * 6 * sizeof(int)); //Pointer to Wgt layers
cudaMemcpy(PARAM_DEV.B_a, PARAM_HOST.B_a, NUMNET*BS*PLAYR[NLAYS-1] * sizeof(float), cudaMemcpyHostToDevice);
cudaMemcpy(PARAM_DEV.W_a, PARAM_HOST.W_a, WPLAYR[NLAYS-1] * sizeof(float), cudaMemcpyHostToDevice);
cudaMemcpy(PARAM_DEV.D_a, PARAM_HOST.D_a, WPLAYR[NLAYS-1] * sizeof(float), cudaMemcpyHostToDevice);
cudaMemcpy(PARAM_DEV.P_a, PARAM_HOST.P_a, WPLAYR[NLAYS-1] * sizeof(float), cudaMemcpyHostToDevice);
cudaMemcpy(PARAM_DEV.P_w, PARAM_HOST.P_w, WPLAYR[NLAYS-1] * sizeof(float), cudaMemcpyHostToDevice);
cudaMemcpy(PARAM_DEV.W_l, WPLAYR, NLAYS * sizeof(int), cudaMemcpyHostToDevice);
cudaMemcpy(PARAM_DEV.P_l, PLAYR, NLAYS * sizeof(int), cudaMemcpyHostToDevice);
cudaMemcpy(LU_a, L_ua, NLAYS * 6 * sizeof(int), cudaMemcpyHostToDevice);
}
void prep_val(float *VAL,int EPOCH){
float xord[4][3];
xord[0][0]=0; xord[0][1]=0; xord[0][2]=-0.5;
xord[1][0]=0; xord[1][1]=1; xord[1][2]=0.5;
xord[2][0]=1; xord[2][1]=0; xord[2][2]=0.5;
xord[3][0]=1; xord[3][1]=1; xord[3][2]=-0.5;
int select=0;
for(int i=0;i<BS*TI;i++){
if(1){
select = rand()%4;
}else{
select++;
if(select==4)select=0;
}
VAL[i*3] = xord[select][0];
VAL[i*3+1] = xord[select][1];
VAL[i*3+2] = xord[select][2];
}
}
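/* So each training slot is three consecutive floats: in0, in1, target.
   For example, when select==1 the slot becomes {0.0f, 1.0f, 0.5f}. The
   kernels read it back as V_a[ITER*BS*NVAL + bs*NVAL + ls] with NVAL==3. */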
void call_nn(){
GpuInit myinit_aconn(1024,1024,1,1); //GPU launch: grid of 1024 blocks, 1024 threads per block
GpuInit* myinitaconn;
myinitaconn = &myinit_aconn;
PARAM_DEV.ITER = 0; /* ITER is advanced on the host in the loop below; each NN_FF launch processes BS values */
//INIT_PARAM<<<1024,1024,0,0>>>(PARAM_DEV);
if(1){
cudaMemcpy(PARAM_HOST.W_a, PARAM_DEV.W_a, WPLAYR[NLAYS-1] * sizeof(float), cudaMemcpyDeviceToHost);
cudaMemcpy(PARAM_HOST.P_w, PARAM_DEV.P_w, WPLAYR[NLAYS-1] * sizeof(float), cudaMemcpyDeviceToHost);
cudaMemcpy(PARAM_HOST.D_a, PARAM_DEV.D_a, WPLAYR[NLAYS-1] * sizeof(float), cudaMemcpyDeviceToHost);
cudaMemcpy(PARAM_HOST.P_a, PARAM_DEV.P_a, WPLAYR[NLAYS-1] * sizeof(float), cudaMemcpyDeviceToHost);
cout<<"nWeights:";
for(int i=0;i<WPLAYR[NLAYS-1];i++){
cout<<PARAM_HOST.W_a[i]<<",";
}
cout<<"nnPre Weights:";
for(int i=0;i<WPLAYR[NLAYS-1];i++){
cout<<PARAM_HOST.P_w[i]<<",";
}
cout<<"nnDelta Weights:";
for(int i=0;i<WPLAYR[NLAYS-1];i++){
cout<<PARAM_HOST.D_a[i]<<",";
}
cout<<"nnPre Deltas:";
for(int i=0;i<WPLAYR[NLAYS-1];i++){
cout<<PARAM_HOST.P_a[i]<<",";
}
}
//exit(1);
startTime = clock();
for(int iter=0;iter<TI;iter++){
PARAM_DEV.ITER=iter;
NN_FF<<<myinitaconn->grid,myinitaconn->block,0,myinitaconn->stream1>>>(PARAM_DEV,LU_a);
NN_BP<<<myinitaconn->grid,myinitaconn->block,0,myinitaconn->stream1>>>(PARAM_DEV,LU_a);
if(iter>TI-7){
cudaMemcpy(PARAM_HOST.A_a, PARAM_DEV.A_a, BS*PLAYR[NLAYS-1] * sizeof(float), cudaMemcpyDeviceToHost);
cudaMemcpy(PARAM_HOST.E_a, PARAM_DEV.E_a, BS*PLAYR[NLAYS-1] * sizeof(float), cudaMemcpyDeviceToHost);
cout<<"Iter:"<<iter<<"n";
for(int j=0;j<BS;j++){
cout<<"Batch="<<j<<"n";
cout<<"[";
for(int i=0;i<PLAYR[NLAYS-1];i++){
cout<<"--Act="<<PARAM_HOST.A_a[j*PLAYR[NLAYS-1]+i]<<"--Err=";
cout<<PARAM_HOST.E_a[j*PLAYR[NLAYS-1]+i]<<"n";
}
cout<<"]";
}
cudaMemcpy(PARAM_HOST.W_a, PARAM_DEV.W_a, WPLAYR[NLAYS-1] * sizeof(float), cudaMemcpyDeviceToHost);
cudaMemcpy(PARAM_HOST.P_w, PARAM_DEV.P_w, WPLAYR[NLAYS-1] * sizeof(float), cudaMemcpyDeviceToHost);
cudaMemcpy(PARAM_HOST.D_a, PARAM_DEV.D_a, WPLAYR[NLAYS-1] * sizeof(float), cudaMemcpyDeviceToHost);
cudaMemcpy(PARAM_HOST.P_a, PARAM_DEV.P_a, WPLAYR[NLAYS-1] * sizeof(float), cudaMemcpyDeviceToHost);
cout<<"nWeights:";
for(int i=0;i<WPLAYR[NLAYS-1];i++){
cout<<PARAM_HOST.W_a[i]<<",";
}
cout<<"nnPre Weights:";
for(int i=0;i<WPLAYR[NLAYS-1];i++){
cout<<PARAM_HOST.P_w[i]<<",";
}
cout<<"nnDelta Weights:";
for(int i=0;i<WPLAYR[NLAYS-1];i++){
cout<<PARAM_HOST.D_a[i]<<",";
}
cout<<"nnPre Deltas:";
for(int i=0;i<WPLAYR[NLAYS-1];i++){
cout<<PARAM_HOST.P_a[i]<<",";
}
}
if(1){
float* TERR;
TERR = new float[1];
cudaMemcpy(TERR, PARAM_DEV.E_a+PLAYR[NLAYS-1]-1,sizeof(float), cudaMemcpyDeviceToHost);
errfile<<sqrt(pow(TERR[0],2))<<"\n";
delete[] TERR;
}
}//Iter
endTime = clock();
cout<<"nCUDA gpu took "<<100000*((endTime-startTime)/(float)CLOCKS_PER_SEC) << " ms. nn*******************n*******************" << endl;
errfile.close();
}
};
int main(){
int topo[] = {2,4,1};
//const char* acts[] = {"NORM","TANH","NORM"};
const char* acts[] = {"NORM","SIGM","NORM"};
int nlayers = 3;
net mynet(topo,acts,nlayers);
mynet.mem_alloc();
mynet.call_nn();
cout<<"Hellon";
}
Here is the included 1dkern.cu file:
#include <algorithm>
#include <complex>
#include <iostream>
#include <math.h>
#include <vector>
/*Begin 1DKern definition */
/*This is a direct copy of TensorFlow's 1DKern code*/
namespace detail {
template <typename T>
class GpuGridRange {
struct Iterator {
__device__ Iterator(T index, T delta) : index_(index), delta_(delta) {}
__device__ T operator*() const { return index_;}
__device__ Iterator& operator++() {
index_ += delta_;
return *this;
}
__device__ bool operator!=(const Iterator& other) const {
bool greater = index_ > other.index_;
bool less = index_ < other.index_;
if(!other.delta_){
return less;
}
if(!delta_){
return greater;
}
return less || greater;
}
private:
T index_;
const T delta_;
}; //end Iterator struct
public:
__device__ GpuGridRange(T begin,T delta,T end)
: begin_(begin),delta_(delta),end_(end) {}
__device__ Iterator begin() const {return Iterator(begin_,delta_); }
__device__ Iterator end() const {return Iterator(end_,0);}
private:
T begin_;
T delta_;
T end_;
}; //end GPU class class
}; //end namespace detail
template <typename T> //Allows you to use GPU iterator with all data types
__device__ detail::GpuGridRange<T> GpuGridRangeX(T count) {
return detail::GpuGridRange<T>(
/*begin*/blockIdx.x * blockDim.x + threadIdx.x,
/*delta*/gridDim.x * blockDim.x, /*end*/count
);
}
template <typename T> //Allows you to use GPU iterator with all data types
__device__ detail::GpuGridRange<T> GpuGridRangeY(T count) {
return detail::GpuGridRange<T>(
/*begin*/blockIdx.y * blockDim.y + threadIdx.y,
/*delta*/gridDim.y * blockDim.y, /*end*/count
);
}
template <typename T> //Allows you to use GPU iterator with all data types
__device__ detail::GpuGridRange<T> GpuGridRangeZ(T count) {
return detail::GpuGridRange<T>(
/*begin*/blockIdx.z * blockDim.z + threadIdx.z,
/*delta*/gridDim.z * blockDim.z, /*end*/count
);
}
#define GPU_1D_KERN_LOOP(i, n) \
  for (int i : ::GpuGridRangeX<int>(n))
#define GPU_AXIS_KERNEL_LOOP(i, n, axis) \
  for (int i : ::GpuGridRange##axis<int>(n))
/*End 1DKern definition*/
class GpuInit{
public:
dim3 grid;
dim3 block;
cudaStream_t stream1; //Could be a array of streams for multiple streams
GpuInit(unsigned int gridsizeX,unsigned int blocksizeX,unsigned int gridsizeY,unsigned int blocksizeY){
/*If using GPU_1D_KERN_LOOP */
grid.x = gridsizeX; grid.y = gridsizeY;
block.x = blocksizeX; block.y= blocksizeY;
cudaStreamCreate(&stream1);
//cout<<"Grid dimensions are: "<<grid.x<<"--"<<grid.y<<"--"<<grid.z<<"n";
//cout<<"Block dimensions are: "<<block.x<<"--"<<block.y<<"--"<<block.z<<"n";
}
};
__device__ double atomicAdd__(double* address, double val)
{
unsigned long long int* address_as_ull =
(unsigned long long int*)address;
unsigned long long int old = *address_as_ull, assumed;
do {
assumed = old;
old = atomicCAS(address_as_ull, assumed,
__double_as_longlong(val +
__longlong_as_double(assumed)));
} while (assumed != old);
return __longlong_as_double(old);
}
__device__ double atomicAdd_(double* address, double val)
{
unsigned long long int* address_as_ull =
(unsigned long long int*)address;
unsigned long long int old = *address_as_ull, assumed;
do {
assumed = old;
old = atomicCAS(address_as_ull, assumed,
__double_as_longlong(val +
__longlong_as_double(assumed)));
// Note: uses integer comparison to avoid hang in case of NaN (since NaN != NaN)
} while (assumed != old);
return __longlong_as_double(old);
}
The 1dkern.cu file above is copied from the TensorFlow source.
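For clarity, this is how I read the grid-stride macro and the flat-index decomposition used throughout NN_FF and NN_BP (a standalone hypothetical kernel, not part of the files above):

__global__ void example_kern(float* out,int NOUT,int BS){
GPU_1D_KERN_LOOP(index, NOUT*BS){ // each thread strides over [0, NOUT*BS)
const int ls = (index / 1 ) % NOUT; // fastest-varying: unit within the layer
const int bs = (index / NOUT) % BS; // next: batch element
out[bs*NOUT+ls] = 0.0f;
}
}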
I have verified most of the calculations in both NN_FF and NN_BP. This is a big ask and I apologise if it is too much to take in; the code is messy, but the CUDA functions themselves are quite straightforward. The array LU_a holds the position data for each array relevant to the current layer; I have confirmed it is correct and valid for NN_FF. Alas, I cannot find the reason the network will not learn.
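For the 2-4-1 topology built in main(), the LU_a entries work out as follows (from PLAYR = {2,6,7} and WPLAYR = {0,8,12} computed in the constructor):

// LU_a per layer: {NIN, NOUT, SIN, SOUT, WSTR, WEND}
// Layer 1 (2 inputs -> 4 hidden):  2, 4, 0, 2, 0, 8
// Layer 2 (4 hidden -> 1 output):  4, 1, 2, 6, 8, 12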