main.cpp
/*
CODE STILL UNDER DEVELOPMENT
This program trains a user-defined number of (over-)idealized birds in a pursuit-evasion scenario.
One bird (the pursuer) learns to chase the rest of the flock (the evaders), given a certain
representation of the state.
The state consists of the positions and orientations of all the birds. The speed of every bird is
fixed at every time step (v_pursuer > v_evader).
To learn the optimal policies we employ an actor-critic algorithm with a natural policy gradient.
The birds are trained in an "autocurricula" fashion: the pursuer learns first; after a number of
episodes (fixed to 5000 in the code) all the evaders start learning concurrently.
The natural policy gradient update is performed in Boltzmann.h, while the actor-critic algorithm
itself is implemented in this main file.
The Environment class defines the one-step dynamics and the rewards the agents receive for the
actions they take.
In the DirectedAgent.h class you can find how the observations are defined for the current implementation.
Support for UndirectedObservation (i.e. without distinguishing whether other birds point
towards or away from the agent) is also available, but main needs to be modified slightly.
*/
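/*
For reference, the tabular actor-critic update performed below for every agent i at each time step
(undiscounted, as coded in this file) is:
    delta_i = r_i + V_i(o_i') - V_i(o_i)      (terminal step: delta_i = r_i - V_i(o_i))
    V_i(o_i) <- V_i(o_i) + alpha_w * delta_i
    theta_i  <- updated via update_policy(alpha_t * delta_i, o_i, a_i),
with the natural-gradient step itself implemented in Boltzmann.h (see above).
Here o_i and o_i' are the agent's current and next observations and a_i is the action it took.
*/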
#include "State.h"
#include "Observable.h"
#include "Environment.h"
#include "Reward.h"
#include "RandomWalk.h"
#include "Boltzmann.h"
#include "Bird.h"
#include "V.h"
#include "Signal.h"
#include "Timer.h"
#include "Chase.h"
#include "Agent.h"
#include "ClosestObsDirected.h"
#include "ClosestObsUndirected.h"
#include <stdio.h>
#include <time.h>
#include <stdlib.h>
#include <fstream>   //std::ofstream (used below but not included explicitly)
#include <cmath>     //pow
#include <vector>    //std::vector
#include <tuple>
#include <utility>
#include <string>
#include <list>
#include <memory>
#include <algorithm>
#include <random>
#include <omp.h>
//Defining global variables and which Observers to use
#include "config.h"
int main(){
//Number of vision sectors each agent observes
const std::size_t sectors_num = (evader_meridians.size()-1)*evader_parallels.size();
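//Size of the tabular observation space: (2*sectors_num+1)^(1 + max number of observed neighbours).
//Presumably each observed bird falls into one of 2*sectors_num directed sector classes plus one
//"nothing observed" class; see ClosestObsDirected.h for the exact encoding.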
const std::size_t state_space_dim = pow(2*sectors_num+1,1+std::max(evader_neighbours,pursuer_neighbours));
//Seed the RNG with the current time. Not truly random, but good enough for this program and faster than <random>
srand(time(NULL));
//Print the environment and training information into a log file
std::ofstream log_file;
log_file.open("data/env_info.csv");
log_file << "episodes_num,episodes_length,num_of_birds,state_space_dim,episode_write_step,sectors" << std::endl;
log_file << episodes_num << "," << episode_length << "," << num_of_birds << "," <<
state_space_dim << "," << step_write << "," << sectors_num << std::endl;
//---------------------------------------------------------------------------------
//Initialization of environment and agents
Environment env(num_of_birds, v0_pursuer, v0_evader, friends_range, capture_range,
steering_angle_pursuer, steering_angle_evader, pbc, pursuer_parallels.back(),
std::make_pair(pursuer_meridians.front(),pursuer_meridians.back()),
av_dist,prey_repulsion, attraction_range, prey_attraction);
std::vector<Agt> agents;
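//Agent 0 is the pursuer; agents 1..num_of_birds-1 are the evaders, each with its own Boltzmann policy and Observer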
agents.push_back(Agt(Boltzmann(state_space_dim,3),Observer(pursuer_meridians,pursuer_parallels,0,pursuer_neighbours,pbc)));
for(std::size_t i=1; i < num_of_birds; ++i){
agents.push_back(Agt(Boltzmann(state_space_dim,3),Observer(evader_meridians, evader_parallels,i,evader_neighbours,pbc)));
}
//Pointers used to track states and observations during the run while avoiding deep copies
std::shared_ptr<State> prev_state;
std::shared_ptr<State> next_state;
std::vector<Action> a(num_of_birds);
std::vector<std::shared_ptr<Obs>> prev_obs(num_of_birds);
std::vector<std::shared_ptr<Obs>> next_obs(num_of_birds);
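//Per-agent tabular value function V(o) and the corresponding TD errors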
std::vector<V> v(num_of_birds, V(state_space_dim));
std::vector<double> delta(num_of_birds);
std::pair<Reward,bool> r; //Rewards and end-of-episode flag
//Buffer the data in memory and write everything out in a single pass at the end (writing to RAM is much faster than writing to disk)
std::list<std::pair<std::size_t,State>> traj;
std::list<std::tuple<std::size_t,std::size_t, Obs>> record_obs;
std::list<std::vector<std::size_t>> t_ep;
//Four rows per bird: the state value and the theta of the 3 actions, stored for every state at every logged episode
std::vector<std::vector<double>> value_policy(4*num_of_birds, std::vector<double>(
std::max(state_space_dim,static_cast<std::size_t>(episodes_num*state_space_dim/step_write))));
//Output files of the simulation
std::ofstream traj_file;
std::ofstream episode_file;
std::vector<std::ofstream> value_policy_files(num_of_birds);
//File initialization and header construction
//---------------------------------------------------------------------------------
traj_file.open("data/trajectory.csv");
//Header construction
traj_file << "Episode" << ",";
for(std::size_t i=0; i<num_of_birds; ++i){
traj_file << "x" << i << ",y" << i << ",alpha" << i;
if(i<num_of_birds-1)
traj_file << ",";
}
traj_file << "\n";
episode_file.open("data/episode.csv");
episode_file << "Episode,EndTime" << std::endl;
for(std::size_t i=0; i<num_of_birds; ++i)
{
value_policy_files[i].open("data/value_policy"+std::to_string(i)+".csv");
value_policy_files[i] << "value,";
value_policy_files[i] << "left,";
value_policy_files[i] << "straight,";
value_policy_files[i] << "right";
value_policy_files[i] << std::endl;
}
//---------------------------------------------------------------------------------
//Time step within the current episode
std::size_t t = 0;
//Main training loop of the actor-critic algorithm
for(std::size_t ep=0; ep<episodes_num; ep++){
//State initialization
env.reset();
prev_state = std::make_shared<State>(env.get_state());
for(std::size_t i=0; i<agents.size(); ++i){
prev_obs[i] = std::make_shared<Obs>(agents[i].obs(*prev_state));
}
while(t < episode_length){
//Log data every step_write-th episode
if(ep%step_write == 0){
traj.push_back(std::make_pair(ep,*prev_state));
for(std::size_t i = 0; i<num_of_birds; ++i)
record_obs.push_back(std::make_tuple(ep,i,*prev_obs[i]));
}
//All agents get an observation based on the current state and return an action
for(std::size_t i = 0; i < agents.size(); ++i){
a[i] = (agents[i].act(*prev_state, *prev_obs[i]));
}
next_state = std::make_shared<State>(env.dynamics(a, *prev_state));
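//Compute the observations of the new state (in parallel when compiled with OpenMP)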
#ifndef _OPENMP
for(std::size_t i=0; i<agents.size(); ++i)
next_obs[i] = std::make_shared<Obs>(agents[i].obs(*next_state));
#else
#pragma omp parallel for
for(std::size_t i=0; i<agents.size(); ++i)
next_obs[i] = std::make_shared<Obs>(agents[i].obs(*next_state));
#endif
//r holds the rewards of all the agents in its first component
//and, in its second component, a boolean flag stating whether the episode is over
r = env.reward(*prev_state, static_cast<double>(episode_length), num_of_birds-1);
//If episode is over:
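//(terminal update: the TD target is just the final reward, with no bootstrap from a next observation)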
if(std::get<1>(r) == 1){
for(std::size_t i=0; i<num_of_birds; ++i){
delta[i] = std::get<0>(r)[i] - v[i][*prev_obs[i]];
v[i][*prev_obs[i]] += alpha_w*delta[i];
agents[i].update_policy(alpha_t*delta[i], *prev_obs[i], a[i]);
}
break;
}
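//Otherwise: standard undiscounted TD(0) update, bootstrapping from the value of the next observation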
for(std::size_t i=0; i<num_of_birds; ++i){
delta[i] = std::get<0>(r)[i] + v[i][*next_obs[i]] - v[i][*prev_obs[i]];
v[i][*prev_obs[i]] += alpha_w*delta[i];
agents[i].update_policy(alpha_t*delta[i], *prev_obs[i], a[i]);
}
//State update
prev_state = next_state;
//Observation update
for(std::size_t i=0; i<num_of_birds; ++i)
prev_obs[i] = next_obs[i];
//Update time step
t++;
} //End of a time step of algorithm
//Logging value and policy data
if(ep%step_write == 0){
for(std::size_t i=0; i < num_of_birds; ++i){
for(std::size_t k=0; k<state_space_dim; ++k){
value_policy[i*4][static_cast<int>(ep/step_write)*state_space_dim+k] = v[i][k];
value_policy[i*4+1][static_cast<int>(ep/step_write)*state_space_dim+k] = agents[i].get_policy()->get(k,0);
value_policy[i*4+2][static_cast<int>(ep/step_write)*state_space_dim+k] = agents[i].get_policy()->get(k,1);
value_policy[i*4+3][static_cast<int>(ep/step_write)*state_space_dim+k] = agents[i].get_policy()->get(k,2);
}
}
}
t_ep.push_back(std::vector<std::size_t>{ep,t});
t = 0;
} //End of an episode
//Write the buffered data to file at the end of the run
for(auto it1=traj.begin(); it1!=traj.end(); ++it1){
traj_file << std::get<0>(*it1) << "," << std::get<1>(*it1); //<< "\n";
}
for(auto it=t_ep.begin(); it!=t_ep.end(); ++it)
episode_file << (*it)[0] << "," << (*it)[1] << "\n";
for(std::size_t i=0; i<num_of_birds; ++i){
for(std::size_t k=0; k<value_policy[0].size(); ++k){
value_policy_files[i] << value_policy[4*i][k] << ",";
value_policy_files[i] << value_policy[4*i+1][k] << ",";
value_policy_files[i] << value_policy[4*i+2][k] << ",";
value_policy_files[i] << value_policy[4*i+3][k] << "\n";
}
}
//Closing files
traj_file.close();
episode_file.close();
for(std::size_t i=0; i<num_of_birds; ++i)
value_policy_files[i].close();
return 0;
}