// Jacobian neural network learning algorithm
// ==========================================
// ***** This idea was abandoned because the calculations are too complicated and slow.
// TO-DO:
// (1) Calculate the forward Jacobian J^-1 = J1
// (2) Calculate the inverse Jacobian J
// (3) Calculate dJ/dw
// (4) Sort all J1 and dJ/dw components
// (5) Calculate the local gradient as a sum over the (i,j) components
//     (a sketch of a per-layer forward Jacobian, relating to step (1), appears
//      after the forward-prop functions below)
#include <stdio.h>
#include <stdlib.h>
#include <stdbool.h>
#include <math.h>
#include <assert.h>
#include <time.h> // time as random seed in create_NN()
#include "Jacobian-NN.h"
#define Eta 0.01 // learning rate
#define BIASOUTPUT 1.0 // output for bias. It's always 1.
double randomWeight() // generate a random weight in the range [-1.0, +1.0]
{
// return 0.5 + (rand() / (double) RAND_MAX) * 0.01;
return (rand() / (double) RAND_MAX) * 2.0 - 1.0;
}
//******** activation functions and random weight generator ********************//
// Note: sometimes the derivative is calculated in the forward_prop function
double sigmoid(double v)
{
#define Steepness 3.0
return 1.0 / (1.0 + exp(-Steepness * v));
}
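// A possible companion helper (a sketch; nothing in this file calls it): the sigmoid
// derivative, matching the value forward_prop_sigmoid() stores in .grad. For
// σ(v) = 1/(1 + e^(-Steepness∙v)) the identity σ'(v) = Steepness ∙ σ(v) ∙ (1 − σ(v)) holds.
double d_sigmoid(double v)
{
double s = sigmoid(v);
return Steepness * s * (1.0 - s);
}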
// This non-smooth rectifier can help prevent the "vanishing gradient" problem:
// its gradient is piecewise constant (Leakage for v < 0, otherwise 1), so it never vanishes.
double rectifier(double v)
{
#define Leakage 0.1
if (v < 0.0)
return Leakage * v;
// else if (v > 1.0)
// return 1.0 + Leakage * v;
else
return v;
}
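// A possible companion helper (a sketch; forward_prop_ReLU below computes this inline):
// the rectifier's derivative, which is Leakage for negative inputs and 1 otherwise.
double d_rectifier(double v)
{
if (v < 0.0)
return Leakage;
else
return 1.0;
}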
#define Slope 1.0
double softplus(double v)
{
return log(1.0 + exp(Slope * v));
}
double d_softplus(double v)
{
return Slope / (1.0 + exp(-Slope * v));
}
// Despite its name, x2 computes v² + v (its derivative d_x2 is 2v + 1)
double x2(double v)
{
return v * v + v;
}
double d_x2(double v)
{
return 2.0 * v + 1.0;
}
//****************************create neural network*********************//
// GIVEN: how many layers, and how many neurons in each layer
NNET *create_NN(int numLayers, int *neuronsPerLayer)
{
NNET *net = (NNET *) malloc(sizeof (NNET));
srand(time(NULL));
net->numLayers = numLayers;
assert(numLayers >= 3);
net->layers = (LAYER *) malloc(numLayers * sizeof (LAYER));
//construct input layer, no weights
net->layers[0].numNeurons = neuronsPerLayer[0];
net->layers[0].neurons = (NEURON *) malloc(neuronsPerLayer[0] * sizeof (NEURON));
//construct hidden and output layers
for (int l = 1; l < numLayers; ++l) //construct layers
{
net->layers[l].neurons = (NEURON *) malloc(neuronsPerLayer[l] * sizeof (NEURON));
net->layers[l].numNeurons = neuronsPerLayer[l];
for (int n = 0; n < neuronsPerLayer[l]; ++n) // construct each neuron in the layer
{
net->layers[l].neurons[n].weights =
(double *) malloc((neuronsPerLayer[l - 1] + 1) * sizeof (double));
for (int i = 0; i <= neuronsPerLayer[l - 1]; ++i)
{
//construct the weights of this neuron from the previous-layer neurons
//when i == 0, it's the bias weight
net->layers[l].neurons[n].weights[i] = randomWeight();
//net->layers[l].neurons[n].weights[i] = 0.0;
}
}
}
return net;
}
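// Usage sketch (hypothetical values; not called anywhere in this file): create a
// 3-layer network with 2 inputs, 4 hidden neurons and 1 output, then free it.
/*
int topology[] = {2, 4, 1};
NNET *net = create_NN(3, topology);
// ... forward_prop / back_prop here ...
free_NN(net, topology);
*/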
void re_randomize(NNET *net, int numLayers, int *neuronsPerLayer)
{
srand(time(NULL));
for (int l = 1; l < numLayers; ++l) // for each layer
for (int n = 0; n < neuronsPerLayer[l]; ++n) // for each neuron
for (int i = 0; i <= neuronsPerLayer[l - 1]; ++i) // for each weight
net->layers[l].neurons[n].weights[i] = randomWeight();
}
void free_NN(NNET *net, int *neuronsPerLayer)
{
// for input layer
free(net->layers[0].neurons);
// for each hidden layer
int numLayers = net->numLayers;
for (int l = 1; l < numLayers; l++) // for each layer
{
for (int n = 0; n < neuronsPerLayer[l]; n++) // for each neuron in the layer
{
free(net->layers[l].neurons[n].weights);
}
free(net->layers[l].neurons);
}
// free all layers
free(net->layers);
// free the whole net
free(net);
}
//**************************** forward-propagation ***************************//
void forward_prop_sigmoid(NNET *net, int dim_V, double V[])
{
// set the output of input layer
for (int i = 0; i < dim_V; ++i)
net->layers[0].neurons[i].output = V[i];
// calculate output from hidden layers to output layer
for (int l = 1; l < net->numLayers; l++)
{
for (int n = 0; n < net->layers[l].numNeurons; n++)
{
double v = 0; //induced local field for neurons
// calculate v, which is the sum of the product of input and weights
for (int k = 0; k <= net->layers[l - 1].numNeurons; k++)
{
if (k == 0)
v += net->layers[l].neurons[n].weights[k] * BIASOUTPUT;
else
v += net->layers[l].neurons[n].weights[k] *
net->layers[l - 1].neurons[k - 1].output;
}
// For the last layer, skip the sigmoid function
// Note: this idea seems to destroy back-prop convergence
// if (l == net->numLayers - 1)
// net->layers[l].neurons[n].output = v;
// else
double output = sigmoid(v);
net->layers[l].neurons[n].output = output;
// There is a neat trick for calculating σ': for σ(x) = 1/(1 + e^(-kx)), σ'(x) = k ∙ σ(x) ∙ (1 − σ(x)).
// For a simple derivation you can see this post:
// http://math.stackexchange.com/questions/78575/derivative-of-sigmoid-function-sigma-x-frac11e-x
// Therefore the code uses "Steepness * output * (1 - output)" for the value of σ'(summed input),
// because output = σ(summed input), where summed_input_i = Σ_j W_ji ∙ input_j.
net->layers[l].neurons[n].grad = Steepness * output * (1.0 - output);
}
}
}
// Same as above, except with soft_plus activation function
void forward_prop_softplus(NNET *net, int dim_V, double V[])
{
// set the output of input layer
for (int i = 0; i < dim_V; ++i)
net->layers[0].neurons[i].output = V[i];
// calculate output from hidden layers to output layer
for (int l = 1; l < net->numLayers; l++)
{
for (int n = 0; n < net->layers[l].numNeurons; n++)
{
double v = 0.0; // induced local field for neurons
// calculate v, which is the sum of the product of input and weights
for (int k = 0; k <= net->layers[l - 1].numNeurons; k++)
{
if (k == 0)
v += net->layers[l].neurons[n].weights[k] * BIASOUTPUT;
else
v += net->layers[l].neurons[n].weights[k] *
net->layers[l - 1].neurons[k - 1].output;
}
net->layers[l].neurons[n].output = softplus(v);
net->layers[l].neurons[n].grad = d_softplus(v);
}
}
}
// Same as above, except with rectifier activation function
// ReLU = "rectified linear unit"
void forward_prop_ReLU(NNET *net, int dim_V, double V[])
{
// set the output of input layer
for (int i = 0; i < dim_V; ++i)
net->layers[0].neurons[i].output = V[i];
// calculate output from hidden layers to output layer
for (int l = 1; l < net->numLayers; l++)
{
for (int n = 0; n < net->layers[l].numNeurons; n++)
{
double v = 0.0; // induced local field for neurons
// calculate v, which is the sum of the product of input and weights
for (int k = 0; k <= net->layers[l - 1].numNeurons; k++)
{
if (k == 0)
v += net->layers[l].neurons[n].weights[k] * BIASOUTPUT;
else
v += net->layers[l].neurons[n].weights[k] *
net->layers[l - 1].neurons[k - 1].output;
}
net->layers[l].neurons[n].output = rectifier(v);
// This is to prepare for back-prop
if (v < 0.0)
net->layers[l].neurons[n].grad = Leakage;
// if (v > 1.0)
// net->layers[l].neurons[n].grad = Leakage;
else
net->layers[l].neurons[n].grad = 1.0;
}
}
}
// Same as above, except with the x2 activation function (which computes v² + v)
void forward_prop_x2(NNET *net, int dim_V, double V[])
{
// set the output of input layer
for (int i = 0; i < dim_V; ++i)
net->layers[0].neurons[i].output = V[i];
// calculate output from hidden layers to output layer
for (int l = 1; l < net->numLayers; l++)
{
for (int n = 0; n < net->layers[l].numNeurons; n++)
{
double v = 0.0; // induced local field for neurons
// calculate v, which is the sum of the product of input and weights
for (int k = 0; k <= net->layers[l - 1].numNeurons; k++)
{
if (k == 0)
v += net->layers[l].neurons[n].weights[k] * BIASOUTPUT;
else
v += net->layers[l].neurons[n].weights[k] *
net->layers[l - 1].neurons[k - 1].output;
}
net->layers[l].neurons[n].output = x2(v);
net->layers[l].neurons[n].grad = d_x2(v);
}
}
}
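// Hedged sketch, relating to TO-DO step (1) in the header comment. After a forward
// pass, the Jacobian of layer l's outputs with respect to layer (l-1)'s outputs has
// entries J[n][k] = φ'(v_n) ∙ w_{n,k+1}; the bias weight w_{n,0} does not appear, and
// .grad already holds φ'(v_n). The full forward Jacobian of the network would be the
// product of these per-layer matrices. The function name and the caller-supplied
// row-major buffer J (numNeurons × prevNeurons) are illustrative assumptions.
void layer_jacobian(NNET *net, int l, double *J)
{
int rows = net->layers[l].numNeurons;
int cols = net->layers[l - 1].numNeurons;
for (int n = 0; n < rows; ++n)
for (int k = 0; k < cols; ++k)
J[n * cols + k] = net->layers[l].neurons[n].grad *
net->layers[l].neurons[n].weights[k + 1];
}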
//****************************** back-propagation ***************************//
// The error is propagated backwards starting from the output layer, hence the
// name for this algorithm.
// In the update formula, we need to adjust by "η ∙ input ∙ ∇", where η is the learning rate.
// The value of ∇_j = σ'(summed input) Σ_i W_ji ∇_i
// where σ is the sigmoid function, σ' is its derivative. This formula is obtained directly
// from differentiating the error E with respect to the weights W.
// The meaning of del (∇) is the "local gradient". At the output layer, ∇ is equal to
// the derivative σ'(summed inputs) times the error signal, while on hidden layers it is
// equal to the derivative times the weighted sum of the ∇'s from the "next" layers.
// From the algorithmic point of view, ∇ is derivative of the error with respect to the
// summed inputs (for that particular neuron). It changes for every input instance because
// the error is dependent on the NN's raw input. So, for each raw input instance, the
// "local gradient" keeps changing. I have a hypothesis that ∇ will fluctuate wildly
// when the NN topology is "inadequate" to learn the target function.
// Some history:
// It was in 1974-1986 that Paul Werbos, David Rumelhart, Geoffrey Hinton and Ronald Williams
// discovered this algorithm for neural networks, although it has been described by
// Bryson, Denham, and Dreyfus in 1963 and by Bryson and Yu-Chi Ho in 1969 as a solution to
// optimization problems. The book "Talking Nets" interviewed some of these people.
void back_prop(NNET *net, double *errors)
{
int numLayers = net->numLayers;
LAYER lastLayer = net->layers[numLayers - 1];
// calculate gradient for output layer
for (int n = 0; n < lastLayer.numNeurons; ++n)
{
// double output = lastLayer.neurons[n].output;
//for the output layer, ∇ = σ'(summed input) ∙ error
// .grad has been prepared in forward-prop
lastLayer.neurons[n].grad *= errors[n];
}
// calculate gradient for hidden layers
for (int l = numLayers - 2; l > 0; --l) // for each hidden layer
{
for (int n = 0; n < net->layers[l].numNeurons; n++) // for each neuron in layer
{
// double output = net->layers[l].neurons[n].output;
double sum = 0.0;
LAYER nextLayer = net->layers[l + 1];
for (int i = 0; i < nextLayer.numNeurons; i++) // for each weight
{
sum += nextLayer.neurons[i].weights[n + 1] // weights[0] is the bias weight, hence the +1 offset
* nextLayer.neurons[i].grad;
}
// .grad has been prepared in forward-prop
net->layers[l].neurons[n].grad *= sum;
}
}
// update all weights
for (int l = 1; l < numLayers; ++l) // except for 0th layer which has no weights
{
for (int n = 0; n < net->layers[l].numNeurons; n++) // for each neuron
{
net->layers[l].neurons[n].weights[0] += Eta *
net->layers[l].neurons[n].grad * BIASOUTPUT; // bias input is always 1.0
for (int i = 0; i < net->layers[l - 1].numNeurons; i++) // for each weight
{
double inputForThisNeuron = net->layers[l - 1].neurons[i].output;
net->layers[l].neurons[n].weights[i + 1] += Eta *
net->layers[l].neurons[n].grad * inputForThisNeuron;
}
}
}
}
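// Usage sketch for one on-line training step (hypothetical input[] and target[] values).
// With the error defined as (target − output), the update "weights += Eta ∙ grad ∙ input"
// above performs gradient descent on the squared error.
/*
double input[2] = {0.0, 1.0};
double target[1] = {1.0};
double error[1];
forward_prop_sigmoid(net, 2, input);
LAYER lastLayer = net->layers[net->numLayers - 1];
for (int n = 0; n < lastLayer.numNeurons; ++n)
error[n] = target[n] - lastLayer.neurons[n].output;
back_prop(net, error);
*/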