// find-max-NN.c
// Q-learner
// Basically it is a feed-forward neural net that approximates the Q-value function
// used in reinforcement learning. Call this neural network the Q-net.
// Q-net accepts K and K' as input. K is the current state of the Reasoner,
// K' is the "next" state. Q-net's output is the Q-value, i.e. the utility of being
// at state K and making the transition to K'.
// In other words, Q-net approximates the function K × K' → ℝ.
// Given K, K', and Q, Q-net learns via traditional back-prop.
// The special thing about Q-net is that there is another algorithm that computes,
// when given K, the K' that achieves the maximum Q value. This is the optimization part.
// TO-DO:
// * Learn the Q: X,Y → ℝ map, where X = input, Y = output.
// Though only the difference is important, we need to learn specific Q values.
// Would it perform better? This is Q-learning, whereas previously we used V-learning.
// The advantage of Q-learning is that it is model-free, but what is a model?
// Perhaps it is the internal structure of Q?
#include <stdio.h>
#include <stdlib.h>
#include <stdbool.h>
#include <math.h>
#include <assert.h>
#include <time.h>
#include "feedforward-NN.h"
extern double sigmoid(double v);
extern double randomWeight();
extern NNET *create_NN(int numberOfLayers, int *neuronsPerLayer);
extern void forward_prop_sigmoid(NNET *, int, double *);
extern double calc_error(NNET *net, double *Y);
extern void back_prop(NNET *, double *errors);
extern void plot_W(NNET *);
extern void start_W_plot(void);
//************************** prepare Q-net ***********************//
NNET *Qnet;
#define dimK 9
#define N dimK
int QnumLayers = 4;
int QneuronsPerLayer[] = {dimK * 2, 10, 7, 1};
double K[dimK];
void init_Qnet()
{
// int numLayers2 = 5;
//the first layer -- input layer
//the last layer -- output layer
// int neuronsPerLayer[5] = {2, 3, 4, 4, 4};
// int neuronsPerLayer[5] = {18, 18, 15, 10, 1};
// int neuronsPerLayer2[5] = {18, 40, 30, 20, 1};
// create the neural network for back-propagation
// (create_NN() allocates and returns the network, so no separate malloc is needed)
Qnet = create_NN(QnumLayers, QneuronsPerLayer);
start_W_plot();
// return Qnet;
}
// Stub: loading is currently disabled; the commented-out call shows the intended use of loadNet().
void load_Qnet(char *fname)
{
// int numLayers2;
// int *neuronsPerLayer2;
extern NNET * loadNet(int *, int *p[], char *);
// Qnet = loadNet(&numLayers2, &neuronsPerLayer2, fname);
// LAYER lastLayer = Vnet->layers[numLayers - 1];
(void) fname; // unused while loading is disabled
return;
}
// Stub: saving is currently disabled; the commented-out call shows the intended use of saveNet().
void save_Qnet(char *fname)
{
extern void saveNet(NNET *, int, int *, char *, char *);
// saveNet(Qnet, QnumLayers, QneuronsPerLayer, "", fname);
(void) fname; // unused while saving is disabled
}
//************************** Q-learning ***********************//
// Algorithm:
// ---- (Part 1) Acting ----
// At some point in the main algorithm, control is passed here.
// At the current state K, we pick an optimal action according to Q.
// We make the state transition from K to K', getting reward R.
// ---- (Part 2) Learning ----
// Then we use R to update the Q value via the standard Q-learning rule:
// Q(K,K') += η { R + γ max_a Q(K',a) − Q(K,K') }.
// The right-hand side is a ΔQ (the TD error) that needs to be added to Q(K,K').
// The Q-net computes Q(K,K'); its output should be adjusted by ΔQ.
// Thus we use back-prop to adjust the weights in Q-net to achieve this.
// (A usage sketch of the acting/learning loop follows below.)
// ==============================================================
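// The sketch below is NOT part of this file's API; it only illustrates, under the
// assumption of a hypothetical environment callback get_reward(), how Q_act() (Part 1)
// and Q_learn() (Part 2) defined further down are meant to be chained for one step.
/*
void agent_step_sketch(double K1[dimK], int K1i[dimK])
{
double K2[dimK];
int K2i[dimK];
Q_act(K1, K2); // Part 1: pick the K2 that (locally) maximizes Q(K1,K2)
for (int k = 0; k < dimK; ++k)
K2i[k] = (int) K2[k]; // Q_learn() takes integer-valued states
double R = get_reward(); // hypothetical: reward for the transition K1 -> K2
Q_learn(K1i, K2i, R); // Part 2: TD update of the Q-net via back-prop
}
*/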
// Finds Q value by forward-propagation
double getQ(double K[], double K2[])
{
// For input, we need to combine K and K2 into a single vector
double K12[dimK * 2];
for (int k = 0; k < dimK; ++k)
{
K12[k] = (double) K[k];
K12[k + dimK] = (double) K2[k];
}
forward_prop_sigmoid(Qnet, dimK * 2, K12);
LAYER LastLayer = (Qnet->layers[QnumLayers - 1]);
// The last layer has only 1 neuron, which outputs the Q value:
return LastLayer.neurons[0].output;
}
// returns the Euclidean norm (absolute value, or size) of the gradient vector
double norm(double grad[dimK])
{
double r = 0.0;
for (int k = 0; k < dimK; ++k)
r += (grad[k] * grad[k]);
return sqrt(r);
}
// **** Learn a simple Q-value map given specific Q values
// **** Used in network initialization
void train_Q()
{
double S[dimK];
static int count = 0;
double RBF(double *);
for (int i = 0; i < 5000; ++i) { // iterate a few thousand times
for (int k = 0; k < dimK; ++k) {
// each component is random in [-10, +10]
S[k] = (rand() / (float) RAND_MAX) * 20.0 - 10.0;
}
// Note: Qnet's input layer has dimK * 2 neurons, but only dimK values are supplied
// here; whether the remaining inputs are zeroed depends on forward_prop_sigmoid().
forward_prop_sigmoid(Qnet, dimK, S);
LAYER LastLayer = (Qnet->layers[QnumLayers - 1]);
// The last layer has only 1 neuron, which outputs the Q value:
double Q2 = LastLayer.neurons[0].output;
double Q = RBF(S); // target value at the sampled point S (not the global K)
double error[1];
*error = Q - Q2; // desired - actual
back_prop(Qnet, error);
if (++count == 10) {
plot_W(Qnet);
count = 0;
}
}
}
// (Part 2) Q-learning:
// Invoke ordinary back-prop to learn Q.
// On entry, we have just made a transition K1 -> K2 with maximal Q(K1, a: K1->K2)
// and gotten a reward R(K1->K2).
// We need the max value of Q(K2,a), which, beware, is taken from the NEXT state K2.
// The old Q(K1,K2) is adjusted by Q += ΔQ, so the "error" fed to back-prop is ΔQ
// (the TD error).
// Why is the Bellman update needed here? Because we made a transition.
// K2 needs to be re-interpreted in the Sayaka-2 architecture:
// K2 is now an "action", not a state.
// We would need the next state X2, but it is unavailable here.
// If the next state is invalid, it should of course have minus-max value;
// otherwise we can get maxQ(X2).
void Q_learn(int K1[dimK], int K2[dimK], double R)
{
double maxQ(int [dimK], double [dimK]);
double K_out[dimK];
#define Gamma 0.95
#define Eta 0.2
// Convert the integer states to doubles and pack them into one input vector (K1,K2)
double K1d[dimK], K2d[dimK];
double K[dimK * 2];
for (int k = 0; k < dimK; ++k)
{
K1d[k] = (double) K1[k];
K2d[k] = (double) K2[k];
K[k] = K1d[k];
K[k + dimK] = K2d[k];
}
// Calculate the TD error ΔQ = η { R + γ max_a Q(K2,a) − Q(K1,K2) }
double dQ[1];
dQ[0] = Eta * (R + Gamma * maxQ(K2, K_out) - getQ(K1d, K2d));
// Invoke back-prop a few times (perhaps this makes the learning effect stronger?)
for (int i = 0; i < 2; ++i)
{
// Forward-prop Qnet with input (K1,K2), then nudge its output by ΔQ
forward_prop_sigmoid(Qnet, dimK * 2, K);
back_prop(Qnet, dQ);
}
}
// **** Learn a simple V-value map via backprop
/*
void learn_V(int s2[dimK], int s[dimK])
{
double S2[dimK], S[dimK];
for (int j = 0; j < 4; ++j)
{
for (int k = 0; k < dimK; ++k)
{
S2[k] = (double) s2[k];
S[k] = (double) s[k];
}
forward_prop_sigmoid(Qnet, dimK, S2);
LAYER LastLayer = (Qnet->layers[QnumLayers - 1]);
// The last layer has only 1 neuron, which outputs the Q value:
double Q2 = LastLayer.neurons[0].output;
forward_prop_sigmoid(Qnet, dimK, S);
double Q = LastLayer.neurons[0].output;
double error[1];
*error = Q2 - Q;
back_prop(Qnet, error);
}
}
*/
// (Part 1) Q-acting:
// Find K2 that maximizes Q(K,K2). Q is a real number.
// Method: gradient ascent, using numerical differentiation to find the gradient ∇Q = [∂Q/∂K2].
// The approximation formula for each component of ∂Q/∂K2 is: (subscripts omitted)
// ∂Q/∂K2 ≈ { Q(K2 + δ) - Q(K2 - δ) } / 2δ
// TO-DO: Perhaps add multiple random restarts
// Note: this function overwrites the components of K2.
void Q_act(double K[dimK], double K2[dimK])
{
double gradQ[dimK]; // the gradient vector ∇Q = [∂Q/∂K2]
// Start with a random K2, each component in [-1, +1]
for (int k = 0; k < dimK; ++k)
K2[k] = (rand() / (float) RAND_MAX) * 2.0 - 1.0;
do // Loop until the gradient norm falls below the threshold
{
// Find the steepest direction [∂Q/∂K2], using numerical (central) differentiation.
#define delta 0.1
for (int k = 0; k < dimK; ++k)
{
// Create 2 copies of K2 whose k-th component is shifted by +δ / -δ
double K2plus[dimK], K2minus[dimK];
for (int k2 = 0; k2 < dimK; ++k2)
K2plus[k2] = K2minus[k2] = K2[k2];
K2plus[k] += delta;
K2minus[k] -= delta;
gradQ[k] = (getQ(K, K2plus) - getQ(K, K2minus)) / (2 * delta);
}
// Move a little along the gradient direction (ascent, since we maximize Q): K2 += λ ∇Q
#define Lambda 0.1
for (int k = 0; k < dimK; ++k)
K2[k] += (Lambda * gradQ[k]);
}
#define Epsilon 0.005
while (norm(gradQ) > Epsilon);
// Return with the (locally) optimal K2 value
}
// Find the maximum Q(K,K') value at state K, by varying K'.
// Method: gradient ascent, using numerical differentiation to find the gradient [∂Q/∂K'].
// The algorithm is similar to the one above.
// The 2nd argument is an output buffer: it receives the maximizing K'.
double maxQ(int K[dimK], double K2[dimK])
{
double gradQ[dimK]; // the gradient vector ∇Q = [∂Q/∂K2]
double K1[dimK];
double gradSize;
for (int k = 0; k < dimK; ++k)
K1[k] = (double) K[k];
int tries = 0;
// Start with a random K2, each component in [-1, +1]
for (int k = 0; k < dimK; ++k)
K2[k] = (rand() / (float) RAND_MAX) * 2.0 - 1.0;
do // Loop until the gradient norm falls below the threshold (or we give up)
{
// Find the steepest direction [∂Q/∂K2], using numerical (central) differentiation.
#undef delta
#define delta 0.001
for (int k = 0; k < dimK; ++k)
{
// Create 2 copies of K2 whose k-th component is shifted by +δ / -δ
double K2plus[dimK], K2minus[dimK];
for (int k2 = 0; k2 < dimK; ++k2)
K2plus[k2] = K2minus[k2] = K2[k2];
K2plus[k] += delta;
K2minus[k] -= delta;
gradQ[k] = (getQ(K1, K2plus) - getQ(K1, K2minus)) / (2 * delta);
}
// Move a little along the gradient direction (ascent, since we maximize Q): K2 += λ ∇Q
#define Lambda 0.1
for (int k = 0; k < dimK; ++k)
K2[k] += (Lambda * gradQ[k]);
gradSize = norm(gradQ);
// printf("gradient norm = %f\r", gradSize);
++tries;
}
#undef Epsilon
#define Epsilon 0.1
#define MaxTries 20000
while (gradSize > Epsilon && tries < MaxTries);
if (tries >= MaxTries) // give up; still need proper exception handling here
{
printf("fail: ");
// The result in K2 may still be usable?
}
double result = getQ(K1, K2);
printf("%2.3f ", result);
plot_W(Qnet);
return result; // return the (approximate) maximum Q value
}
#define J 3 // number of Gaussians in the sum
#define epsilon 0.01
double c[J][N];
// **** Radial Basis Function ****
// Gaussian: Φ(r) = exp( -(εr)² )
// r = Euclidean distance ‖ x - c ‖ where c is the center
double RBF(double x[]) {
double result = 0.0;
for (int j = 0; j < J; ++j) {
double r2 = 0.0; // squared Euclidean distance ‖x - c_j‖²
for (int n = 0; n < N; ++n)
r2 += pow(x[n] - c[j][n], 2);
result += exp(- epsilon * epsilon * r2); // = exp(-(ε ‖x - c_j‖)²)
}
return result;
}
int main() {
srand((unsigned) time(NULL)); // seed the RNG so each run uses different Gaussians
printf("\x1b[37;1mFind maximum of a trained neural network\x1b[0m\n");
printf("\x1b[32;1m1. Generate random Gaussians\x1b[0m\n");
for (int j = 0; j < J; ++j) {
for (int n = 0; n < N; ++n) {
// each center component is random in [-10, +10]
c[j][n] = (rand() / (float) RAND_MAX) * 20.0 - 10.0;
printf("%0.5f ", c[j][n]);
}
printf("\n");
}
printf("\x1b[32;1m2. Train NN\x1b[0m\n");
for (int n = 0; n < N; ++n)
K[n] = 1.0;
printf("testing RBF at (1,...,1) = %0.5f\n", RBF(K));
init_Qnet();
train_Q();
printf("\x1b[32;1m3. Find maximums of NN by random sampling\x1b[0m\n");
}