more refinements of deep learning (wip)
JulioJerez committed Sep 15, 2024
1 parent 0eb9121 commit 6671542
Showing 6 changed files with 75 additions and 44 deletions.
@@ -24,8 +24,10 @@

namespace ndAdvancedRobot
{
//#define ND_TRAIN_MODEL
#define CONTROLLER_NAME "ndRobotArmReach-vpg.dnn"
#define ND_TRAIN_MODEL
#define CONTROLLER_NAME "ndRobotArmReach"

#define CONTROLLER_RESUME_TRANING

#define ND_USE_EULERS

@@ -1078,8 +1080,10 @@ namespace ndAdvancedRobot
,m_stopTraining(ndUnsigned32(2000)* ndUnsigned32(1000000))
,m_modelIsTrained(false)
{
char name[256];
m_horizon = ndFloat32(1.0f) / (ndFloat32(1.0f) - m_discountFactor);
m_outFile = fopen("robotArmReach-vpg.csv", "wb");
snprintf(name, sizeof(name), "%s-vpg.csv", CONTROLLER_NAME);
m_outFile = fopen(name, "wb");
fprintf(m_outFile, "vpg\n");

ndBrainAgentContinuePolicyGradient_TrainerMaster::HyperParameters hyperParameters;
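
Side note on the m_horizon line above: for a discount factor \(\gamma < 1\), the discounted reward weights form a geometric series,

	\sum_{t=0}^{\infty} \gamma^t = \frac{1}{1 - \gamma},

which is exactly what m_horizon = 1/(1 - m_discountFactor) evaluates. With the 0.99 discount used by the cartpole demo below, that is an effective horizon of 1/(1 - 0.99) = 100 steps.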
@@ -1102,7 +1106,21 @@

m_master = ndSharedPtr<ndBrainAgentContinuePolicyGradient_TrainerMaster>(new ndBrainAgentContinuePolicyGradient_TrainerMaster(hyperParameters));
m_bestActor = ndSharedPtr<ndBrain>(new ndBrain(*m_master->GetActor()));
m_master->SetName(CONTROLLER_NAME);
snprintf(name, sizeof(name), "%s.dnn", CONTROLLER_NAME);
m_master->SetName(name);

#ifdef CONTROLLER_RESUME_TRANING
char fileName[256];
snprintf(name, sizeof(name), "%s_critic.dnn", CONTROLLER_NAME);
ndGetWorkingFileName(name, fileName);
ndSharedPtr<ndBrain> critic(ndBrainLoad::Load(fileName));
m_master->GetCritic()->CopyFrom(**critic);

snprintf(name, sizeof(name), "%s_actor.dnn", CONTROLLER_NAME);
ndGetWorkingFileName(name, fileName);
ndSharedPtr<ndBrain> actor(ndBrainLoad::Load(fileName));
m_master->GetActor()->CopyFrom(**actor);
#endif

auto SpawnModel = [this, scene, &visualMesh, floor](const ndMatrix& matrix)
{
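
The CONTROLLER_RESUME_TRANING block above reloads the last saved critic and actor before training continues, so it assumes both %s_critic.dnn and %s_actor.dnn checkpoints already exist from a previous run. A slightly more defensive sketch, under the unverified assumption that ndBrainLoad::Load yields a null brain pointer when the file is missing:

#ifdef CONTROLLER_RESUME_TRANING
	// sketch only: skip the copy when no checkpoint is on disk,
	// assuming a failed load leaves the shared pointer null
	snprintf(name, sizeof(name), "%s_actor.dnn", CONTROLLER_NAME);
	ndGetWorkingFileName(name, fileName);
	ndSharedPtr<ndBrain> actor(ndBrainLoad::Load(fileName));
	if (*actor)
	{
		m_master->GetActor()->CopyFrom(**actor);
	}
#endif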
@@ -1120,8 +1138,8 @@

ndInt32 countX = 22;
ndInt32 countZ = 23;
countX = 10;
countZ = 10;
//countX = 10;
//countZ = 10;

// add a hidden battery of model to generate trajectories in parallel
for (ndInt32 i = 0; i < countZ; ++i)
@@ -1241,16 +1259,19 @@ namespace ndAdvancedRobot

if (rewardTrajectory > m_saveScore)
{
char fileName[1024];
char fileName[256];
m_saveScore = ndFloor(rewardTrajectory) + 2.0f;

// save partial controller in case of crash
ndBrain* const actor = m_master->GetActor();
ndGetWorkingFileName("ndRobotArmReach_actor.dnn", fileName);
char name[256];
snprintf(name, sizeof(name), "%s_actor.dnn", CONTROLLER_NAME);
ndGetWorkingFileName(name, fileName);
actor->SaveToFile(fileName);

ndBrain* const critic = m_master->GetCritic();
ndGetWorkingFileName("ndRobotArmReach_critic.dnn", fileName);
snprintf(name, sizeof(name), "%s_critic.dnn", CONTROLLER_NAME);
ndGetWorkingFileName(name, fileName);
critic->SaveToFile(fileName);
}
}
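
The m_saveScore update above acts as a save hysteresis: once a checkpoint is written, the next one requires the trajectory reward to clear ndFloor(rewardTrajectory) + 2.0f, so improvements of less than roughly two reward units do not trigger repeated disk writes. The pattern in isolation (SaveCheckpoint is a hypothetical stand-in for the actor/critic saves above):

if (rewardTrajectory > m_saveScore)
{
	// raise the bar about two reward units above the current score
	m_saveScore = ndFloor(rewardTrajectory) + 2.0f;
	SaveCheckpoint(); // hypothetical helper: the _actor/_critic saves above
}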
@@ -1281,7 +1302,6 @@ namespace ndAdvancedRobot
ndFloat32 m_discountFactor;
ndUnsigned32 m_lastEpisode;
ndUnsigned32 m_stopTraining;

bool m_modelIsTrained;
};
}
10 changes: 5 additions & 5 deletions newton-4.00/applications/ndSandbox/demos/ndCartpoleDiscrete.cpp
@@ -350,7 +350,7 @@ namespace ndCarpole_0
,m_maxScore(ndFloat32(-1.0e10f))
,m_discountFactor(0.99f)
,m_horizon(ndFloat32(1.0f) / (ndFloat32(1.0f) - m_discountFactor))
,m_lastEpisode(-1)
,m_lastEpisode(0xffffffff)
,m_stopTraining(200 * 1000000)
,m_modelIsTrained(false)
{
@@ -482,10 +482,10 @@

virtual void Update(ndDemoEntityManager* const manager, ndFloat32)
{
ndInt32 stopTraining = m_master->GetFramesCount();
ndUnsigned32 stopTraining = m_master->GetFramesCount();
if (stopTraining <= m_stopTraining)
{
ndInt32 episodeCount = m_master->GetEposideCount();
ndUnsigned32 episodeCount = m_master->GetEposideCount();
m_master->OptimizeStep();

episodeCount -= m_master->GetEposideCount();
@@ -537,8 +537,8 @@
ndFloat32 m_maxScore;
ndFloat32 m_discountFactor;
ndFloat32 m_horizon;
ndInt32 m_lastEpisode;
ndInt32 m_stopTraining;
ndUnsigned32 m_lastEpisode;
ndUnsigned32 m_stopTraining;
bool m_modelIsTrained;
};
}
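
On the m_lastEpisode(-1) to m_lastEpisode(0xffffffff) change: the episode and frame counters are now ndUnsigned32, and 0xffffffff is the value -1 wraps to in a 32-bit unsigned integer, so the sentinel keeps the same bit pattern while the signed/unsigned comparison warnings go away. In standard fixed-width types:

#include <cstdint>

// -1 converted to a 32-bit unsigned integer is 2^32 - 1
static_assert(uint32_t(-1) == 0xffffffffu, "unsigned wrap of -1");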
@@ -212,13 +212,7 @@ ndBrainAgentContinuePolicyGradient_Trainer::ndBrainAgentContinuePolicyGradient_Trainer
,m_trajectory(master->m_numberOfActions, master->m_numberOfObservations)
,m_master(master)
,m_randomGenerator(nullptr)
//,m_rd()
//,m_gen(m_rd())
//,m_d(ndFloat32(0.0f), ndFloat32(1.0f))
{
//m_gen.seed(m_master->m_randomSeed);
//m_master->m_randomSeed += 1;

//std::mt19937 m_gen0(m_rd());
//std::mt19937 m_gen1(m_rd());
//m_gen0.seed(m_master->m_randomSeed);
@@ -90,10 +90,6 @@ class ndBrainAgentContinuePolicyGradient_Trainer : public ndBrainAgent
ndBrainVector m_workingBuffer;
ndTrajectoryStep m_trajectory;
ndSharedPtr<ndBrainAgentContinuePolicyGradient_TrainerMaster> m_master;

//mutable std::random_device m_rd;
//mutable std::mt19937 m_gen;
//mutable std::normal_distribution<ndFloat32> m_d;
ndRandomGenerator* m_randomGenerator;

friend class ndBrainAgentContinuePolicyGradient_TrainerMaster;
@@ -164,16 +164,12 @@ ndBrainAgentDiscretePolicyGradient_Trainer::ndBrainAgentDiscretePolicyGradient_Trainer
,m_workingBuffer()
,m_trajectory(master->m_numberOfObservations)
,m_master(master)
,m_rd()
,m_gen(m_rd())
,m_d(ndFloat32(0.0f), ndFloat32(1.0f))
,m_randomGenerator(nullptr)
{
m_gen.seed(m_master->m_randomSeed);
m_master->m_randomSeed += 1;

m_master->m_agents.Append(this);
m_trajectory.SetCount(m_master->m_maxTrajectorySteps + m_master->m_extraTrajectorySteps);
m_trajectory.SetCount(0);
m_randomGenerator = m_master->GetRandomGenerator();
}

ndBrainAgentDiscretePolicyGradient_Trainer::~ndBrainAgentDiscretePolicyGradient_Trainer()
@@ -216,7 +212,8 @@ ndBrainFloat ndBrainAgentDiscretePolicyGradient_Trainer::SelectAction(const ndBr
}
pdf.PushBack(sum);

ndFloat32 r = m_d(m_gen);
ndRandomGenerator& generator = *m_randomGenerator;
ndFloat32 r = generator.m_d(generator.m_gen);
ndInt32 index = m_master->m_numberOfActions - 1;
for (ndInt32 i = index; i >= 0; --i)
{
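
SelectAction accumulates the action probabilities into pdf and walks that cumulative distribution with the draw r, i.e. inverse-CDF categorical sampling. One thing that may be worth double-checking: ndRandomGenerator::m_d (declared in the header below) is a std::normal_distribution, whereas the uniform_real_distribution member it replaces draws on [0, 1), which is what a CDF walk expects. A standalone sketch of the standard pattern:

#include <random>
#include <vector>

// categorical sampling by inverse CDF (illustrative, not the library API)
int SampleAction(const std::vector<float>& probabilities, std::mt19937& gen)
{
	std::uniform_real_distribution<float> uniform(0.0f, 1.0f);
	const float r = uniform(gen);
	float cumulative = 0.0f;
	for (size_t i = 0; i < probabilities.size(); ++i)
	{
		cumulative += probabilities[i];
		if (r <= cumulative)
		{
			return int(i);
		}
	}
	// guard: rounding can leave r a hair above the final cumulative sum
	return int(probabilities.size()) - 1;
}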
@@ -306,8 +303,8 @@ ndBrainAgentDiscretePolicyGradient_TrainerMaster::ndBrainAgentDiscretePolicyGradient_TrainerMaster
,m_criticLearnRate(hyperParameters.m_criticLearnRate)
,m_numberOfActions(hyperParameters.m_numberOfActions)
,m_numberOfObservations(hyperParameters.m_numberOfObservations)
,m_frameCount(0)
,m_framesAlive(0)
,m_frameCount(0)
,m_eposideCount(0)
,m_bashBufferSize(hyperParameters.m_bashBufferSize)
,m_maxTrajectorySteps(hyperParameters.m_maxTrajectorySteps)
@@ -326,8 +323,15 @@ ndBrainAgentDiscretePolicyGradient_TrainerMaster::ndBrainAgentDiscretePolicyGradient_TrainerMaster
{
ndAssert(m_numberOfActions);
ndAssert(m_numberOfObservations);
ndSetRandSeed(m_randomSeed);

m_randomGenerator = new ndBrainAgentDiscretePolicyGradient_Trainer::ndRandomGenerator[size_t(hyperParameters.m_bashTrajectoryCount)];
for (ndInt32 i = 0; i < hyperParameters.m_bashTrajectoryCount; ++i)
{
m_randomSeed++;
m_randomGenerator[i].m_gen.seed(m_randomSeed);
}

m_randomSeed = 0;
// build policy neural net
SetThreadCount(hyperParameters.m_threadsCount);
ndFixSizeArray<ndBrainLayer*, 32> layers;
@@ -418,7 +422,9 @@ ndBrainAgentDiscretePolicyGradient_TrainerMaster::~ndBrainAgentDiscretePolicyGradient_TrainerMaster
{
delete m_baseLineValueTrainers[i];
}

delete m_baseLineValueOptimizer;
delete[] m_randomGenerator;
}

ndBrain* ndBrainAgentDiscretePolicyGradient_TrainerMaster::GetActor()
@@ -441,7 +447,13 @@ void ndBrainAgentDiscretePolicyGradient_TrainerMaster::SetName(const ndString& name)
m_name = name;
}

ndInt32 ndBrainAgentDiscretePolicyGradient_TrainerMaster::GetFramesCount() const
ndBrainAgentDiscretePolicyGradient_Trainer::ndRandomGenerator* ndBrainAgentDiscretePolicyGradient_TrainerMaster::GetRandomGenerator()
{
m_randomSeed = (m_randomSeed + 1) % m_bashTrajectoryCount;
return &m_randomGenerator[m_randomSeed];
}

ndUnsigned32 ndBrainAgentDiscretePolicyGradient_TrainerMaster::GetFramesCount() const
{
return m_frameCount;
}
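
GetRandomGenerator above hands the per-agent generators out round-robin; after the constructor finishes seeding the pool it resets m_randomSeed to 0 and reuses it as the rotating index. The same idea in isolation, with hypothetical names:

#include <random>
#include <vector>

// a pool of independently seeded generators, handed out round-robin
class GeneratorPool
{
	public:
	GeneratorPool(int count, unsigned baseSeed)
		:m_index(0)
		,m_generators(size_t(count))
	{
		for (int i = 0; i < count; ++i)
		{
			// distinct seed per generator, as the trainer master does
			m_generators[size_t(i)].seed(baseSeed + unsigned(i) + 1);
		}
	}

	std::mt19937& Next()
	{
		m_index = (m_index + 1) % int(m_generators.size());
		return m_generators[size_t(m_index)];
	}

	private:
	int m_index;
	std::vector<std::mt19937> m_generators;
};

Giving each agent its own generator, and re-drawing one from the pool at every episode reset as OptimizeStep below does, keeps the parallel trajectories decorrelated without sharing one generator across threads.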
@@ -451,7 +463,7 @@ bool ndBrainAgentDiscretePolicyGradient_TrainerMaster::IsSampling() const
return false;
}

ndInt32 ndBrainAgentDiscretePolicyGradient_TrainerMaster::GetEposideCount() const
ndUnsigned32 ndBrainAgentDiscretePolicyGradient_TrainerMaster::GetEposideCount() const
{
return m_eposideCount;
}
@@ -702,6 +714,7 @@ void ndBrainAgentDiscretePolicyGradient_TrainerMaster::OptimizeStep()
{
agent->SaveTrajectory();
agent->ResetModel();
agent->m_randomGenerator = GetRandomGenerator();
}
m_frameCount++;
m_framesAlive++;
@@ -718,6 +731,7 @@
{
ndBrainAgentDiscretePolicyGradient_Trainer* const agent = node->GetInfo();
agent->m_trajectory.SetCount(0);
agent->ResetModel();
}
}
}
@@ -62,6 +62,14 @@ class ndBrainAgentDiscretePolicyGradient_Trainer : public ndBrainAgent
ndInt32 m_obsevationsSize;
};

class ndRandomGenerator
{
public:
std::mt19937 m_gen;
std::random_device m_rd;
std::normal_distribution<ndFloat32> m_d;
};

ndBrainAgentDiscretePolicyGradient_Trainer(const ndSharedPtr<ndBrainAgentDiscretePolicyGradient_TrainerMaster>& master);
~ndBrainAgentDiscretePolicyGradient_Trainer();

@@ -82,10 +90,7 @@ class ndBrainAgentDiscretePolicyGradient_Trainer : public ndBrainAgent
ndBrainVector m_workingBuffer;
ndTrajectoryStep m_trajectory;
ndSharedPtr<ndBrainAgentDiscretePolicyGradient_TrainerMaster> m_master;

mutable std::random_device m_rd;
mutable std::mt19937 m_gen;
mutable std::uniform_real_distribution<ndFloat32> m_d;
ndRandomGenerator* m_randomGenerator;

friend class ndBrainAgentDiscretePolicyGradient_TrainerMaster;
};
@@ -136,8 +141,8 @@ class ndBrainAgentDiscretePolicyGradient_TrainerMaster : public ndBrainThreadPool
const ndString& GetName() const;
void SetName(const ndString& name);

ndInt32 GetFramesCount() const;
ndInt32 GetEposideCount() const;
ndUnsigned32 GetFramesCount() const;
ndUnsigned32 GetEposideCount() const;

bool IsSampling() const;
ndFloat32 GetAverageScore() const;
@@ -150,6 +155,7 @@
void OptimizePolicy();
void OptimizeCritic();
void UpdateBaseLineValue();
ndBrainAgentDiscretePolicyGradient_Trainer::ndRandomGenerator* GetRandomGenerator();

ndBrain m_actor;
ndBrain m_baseLineValue;
@@ -162,16 +168,17 @@

MemoryStateValues m_stateValues;
ndArray<ndInt32> m_randomPermutation;
ndBrainAgentDiscretePolicyGradient_Trainer::ndRandomGenerator* m_randomGenerator;
ndBrainAgentDiscretePolicyGradient_Trainer::ndTrajectoryStep m_trajectoryAccumulator;

ndBrainFloat m_gamma;
ndBrainFloat m_policyLearnRate;
ndBrainFloat m_criticLearnRate;
ndInt32 m_numberOfActions;
ndInt32 m_numberOfObservations;
ndInt32 m_frameCount;
ndInt32 m_framesAlive;
ndInt32 m_eposideCount;
ndUnsigned32 m_frameCount;
ndUnsigned32 m_eposideCount;
ndInt32 m_bashBufferSize;
ndInt32 m_maxTrajectorySteps;
ndInt32 m_extraTrajectorySteps;
