more refinements of deep learning (wip)
JulioJerez committed Sep 15, 2024
1 parent 0eb9121 commit 6671542
Showing 6 changed files with 75 additions and 44 deletions.
@@ -24,8 +24,10 @@

namespace ndAdvancedRobot
{
//#define ND_TRAIN_MODEL
#define CONTROLLER_NAME "ndRobotArmReach-vpg.dnn"
#define ND_TRAIN_MODEL
#define CONTROLLER_NAME "ndRobotArmReach"

#define CONTROLLER_RESUME_TRANING

#define ND_USE_EULERS

@@ -1078,8 +1080,10 @@ namespace ndAdvancedRobot
,m_stopTraining(ndUnsigned32(2000)* ndUnsigned32(1000000))
,m_modelIsTrained(false)
{
char name[256];
m_horizon = ndFloat32(1.0f) / (ndFloat32(1.0f) - m_discountFactor);
m_outFile = fopen("robotArmReach-vpg.csv", "wb");
snprintf(name, sizeof(name), "%s-vpg.csv", CONTROLLER_NAME);
m_outFile = fopen(name, "wb");
fprintf(m_outFile, "vpg\n");

ndBrainAgentContinuePolicyGradient_TrainerMaster::HyperParameters hyperParameters;
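
Side note on the m_horizon line above: for a discount factor \(\gamma < 1\), the discounted reward weights form a geometric series,

	\sum_{t=0}^{\infty} \gamma^t = \frac{1}{1 - \gamma},

which is exactly what m_horizon = 1/(1 - m_discountFactor) evaluates. With the 0.99 discount used by the cartpole demo below, that is an effective horizon of 1/(1 - 0.99) = 100 steps.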
@@ -1102,7 +1106,21 @@

m_master = ndSharedPtr<ndBrainAgentContinuePolicyGradient_TrainerMaster>(new ndBrainAgentContinuePolicyGradient_TrainerMaster(hyperParameters));
m_bestActor = ndSharedPtr<ndBrain>(new ndBrain(*m_master->GetActor()));
m_master->SetName(CONTROLLER_NAME);
snprintf(name, sizeof(name), "%s.dnn", CONTROLLER_NAME);
m_master->SetName(name);

#ifdef CONTROLLER_RESUME_TRANING
char fileName[256];
snprintf(name, sizeof(name), "%s_critic.dnn", CONTROLLER_NAME);
ndGetWorkingFileName(name, fileName);
ndSharedPtr<ndBrain> critic(ndBrainLoad::Load(fileName));
m_master->GetCritic()->CopyFrom(**critic);

snprintf(name, sizeof(name), "%s_actor.dnn", CONTROLLER_NAME);
ndGetWorkingFileName(name, fileName);
ndSharedPtr<ndBrain> actor(ndBrainLoad::Load(fileName));
m_master->GetActor()->CopyFrom(**actor);
#endif

auto SpawnModel = [this, scene, &visualMesh, floor](const ndMatrix& matrix)
{
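
The CONTROLLER_RESUME_TRANING block above reloads the last saved critic and actor before training continues, so it assumes both %s_critic.dnn and %s_actor.dnn checkpoints already exist from a previous run. A slightly more defensive sketch, under the unverified assumption that ndBrainLoad::Load yields a null brain pointer when the file is missing:

#ifdef CONTROLLER_RESUME_TRANING
	// sketch only: skip the copy when no checkpoint is on disk,
	// assuming a failed load leaves the shared pointer null
	snprintf(name, sizeof(name), "%s_actor.dnn", CONTROLLER_NAME);
	ndGetWorkingFileName(name, fileName);
	ndSharedPtr<ndBrain> actor(ndBrainLoad::Load(fileName));
	if (*actor)
	{
		m_master->GetActor()->CopyFrom(**actor);
	}
#endif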
@@ -1120,8 +1138,8 @@

ndInt32 countX = 22;
ndInt32 countZ = 23;
countX = 10;
countZ = 10;
//countX = 10;
//countZ = 10;

// add a hidden battery of model to generate trajectories in parallel
for (ndInt32 i = 0; i < countZ; ++i)
@@ -1241,16 +1259,19 @@ namespace ndAdvancedRobot

if (rewardTrajectory > m_saveScore)
{
char fileName[1024];
char fileName[256];
m_saveScore = ndFloor(rewardTrajectory) + 2.0f;

// save partial controller in case of crash
ndBrain* const actor = m_master->GetActor();
ndGetWorkingFileName("ndRobotArmReach_actor.dnn", fileName);
char name[256];
snprintf(name, sizeof(name), "%s_actor.dnn", CONTROLLER_NAME);
ndGetWorkingFileName(name, fileName);
actor->SaveToFile(fileName);

ndBrain* const critic = m_master->GetCritic();
ndGetWorkingFileName("ndRobotArmReach_critic.dnn", fileName);
snprintf(name, sizeof(name), "%s_critic.dnn", CONTROLLER_NAME);
ndGetWorkingFileName(name, fileName);
critic->SaveToFile(fileName);
}
}
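
The m_saveScore update above acts as a save hysteresis: once a checkpoint is written, the next one requires the trajectory reward to clear ndFloor(rewardTrajectory) + 2.0f, so improvements of less than roughly two reward units do not trigger repeated disk writes. The pattern in isolation (SaveCheckpoint is a hypothetical stand-in for the actor/critic saves above):

if (rewardTrajectory > m_saveScore)
{
	// raise the bar about two reward units above the current score
	m_saveScore = ndFloor(rewardTrajectory) + 2.0f;
	SaveCheckpoint(); // hypothetical helper: the _actor/_critic saves above
}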
@@ -1281,7 +1302,6 @@ namespace ndAdvancedRobot
ndFloat32 m_discountFactor;
ndUnsigned32 m_lastEpisode;
ndUnsigned32 m_stopTraining;

bool m_modelIsTrained;
};
}
10 changes: 5 additions & 5 deletions newton-4.00/applications/ndSandbox/demos/ndCartpoleDiscrete.cpp
@@ -350,7 +350,7 @@ namespace ndCarpole_0
,m_maxScore(ndFloat32(-1.0e10f))
,m_discountFactor(0.99f)
,m_horizon(ndFloat32(1.0f) / (ndFloat32(1.0f) - m_discountFactor))
,m_lastEpisode(-1)
,m_lastEpisode(0xffffffff)
,m_stopTraining(200 * 1000000)
,m_modelIsTrained(false)
{
@@ -482,10 +482,10 @@

virtual void Update(ndDemoEntityManager* const manager, ndFloat32)
{
ndInt32 stopTraining = m_master->GetFramesCount();
ndUnsigned32 stopTraining = m_master->GetFramesCount();
if (stopTraining <= m_stopTraining)
{
ndInt32 episodeCount = m_master->GetEposideCount();
ndUnsigned32 episodeCount = m_master->GetEposideCount();
m_master->OptimizeStep();

episodeCount -= m_master->GetEposideCount();
@@ -537,8 +537,8 @@
ndFloat32 m_maxScore;
ndFloat32 m_discountFactor;
ndFloat32 m_horizon;
ndInt32 m_lastEpisode;
ndInt32 m_stopTraining;
ndUnsigned32 m_lastEpisode;
ndUnsigned32 m_stopTraining;
bool m_modelIsTrained;
};
}
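
On the m_lastEpisode(-1) to m_lastEpisode(0xffffffff) change: the episode and frame counters are now ndUnsigned32, and 0xffffffff is the value -1 wraps to in a 32-bit unsigned integer, so the sentinel keeps the same bit pattern while the signed/unsigned comparison warnings go away. In standard fixed-width types:

#include <cstdint>

// -1 converted to a 32-bit unsigned integer is 2^32 - 1
static_assert(uint32_t(-1) == 0xffffffffu, "unsigned wrap of -1");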
@@ -212,13 +212,7 @@ ndBrainAgentContinuePolicyGradient_Trainer::ndBrainAgentContinuePolicyGradient_Trainer
,m_trajectory(master->m_numberOfActions, master->m_numberOfObservations)
,m_master(master)
,m_randomGenerator(nullptr)
//,m_rd()
//,m_gen(m_rd())
//,m_d(ndFloat32(0.0f), ndFloat32(1.0f))
{
//m_gen.seed(m_master->m_randomSeed);
//m_master->m_randomSeed += 1;

//std::mt19937 m_gen0(m_rd());
//std::mt19937 m_gen1(m_rd());
//m_gen0.seed(m_master->m_randomSeed);
@@ -90,10 +90,6 @@ class ndBrainAgentContinuePolicyGradient_Trainer : public ndBrainAgent
ndBrainVector m_workingBuffer;
ndTrajectoryStep m_trajectory;
ndSharedPtr<ndBrainAgentContinuePolicyGradient_TrainerMaster> m_master;

//mutable std::random_device m_rd;
//mutable std::mt19937 m_gen;
//mutable std::normal_distribution<ndFloat32> m_d;
ndRandomGenerator* m_randomGenerator;

friend class ndBrainAgentContinuePolicyGradient_TrainerMaster;
@@ -164,16 +164,12 @@ ndBrainAgentDiscretePolicyGradient_Trainer::ndBrainAgentDiscretePolicyGradient_Trainer
,m_workingBuffer()
,m_trajectory(master->m_numberOfObservations)
,m_master(master)
,m_rd()
,m_gen(m_rd())
,m_d(ndFloat32(0.0f), ndFloat32(1.0f))
,m_randomGenerator(nullptr)
{
m_gen.seed(m_master->m_randomSeed);
m_master->m_randomSeed += 1;

m_master->m_agents.Append(this);
m_trajectory.SetCount(m_master->m_maxTrajectorySteps + m_master->m_extraTrajectorySteps);
m_trajectory.SetCount(0);
m_randomGenerator = m_master->GetRandomGenerator();
}

ndBrainAgentDiscretePolicyGradient_Trainer::~ndBrainAgentDiscretePolicyGradient_Trainer()
@@ -216,7 +212,8 @@ ndBrainFloat ndBrainAgentDiscretePolicyGradient_Trainer::SelectAction(const ndBr
}
pdf.PushBack(sum);

ndFloat32 r = m_d(m_gen);
ndRandomGenerator& generator = *m_randomGenerator;
ndFloat32 r = generator.m_d(generator.m_gen);
ndInt32 index = m_master->m_numberOfActions - 1;
for (ndInt32 i = index; i >= 0; --i)
{
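
SelectAction accumulates the action probabilities into pdf and walks that cumulative distribution with the draw r, i.e. inverse-CDF categorical sampling. One thing that may be worth double-checking: ndRandomGenerator::m_d (declared in the header below) is a std::normal_distribution, whereas the uniform_real_distribution member it replaces draws on [0, 1), which is what a CDF walk expects. A standalone sketch of the standard pattern:

#include <random>
#include <vector>

// categorical sampling by inverse CDF (illustrative, not the library API)
int SampleAction(const std::vector<float>& probabilities, std::mt19937& gen)
{
	std::uniform_real_distribution<float> uniform(0.0f, 1.0f);
	const float r = uniform(gen);
	float cumulative = 0.0f;
	for (size_t i = 0; i < probabilities.size(); ++i)
	{
		cumulative += probabilities[i];
		if (r <= cumulative)
		{
			return int(i);
		}
	}
	// guard: rounding can leave r a hair above the final cumulative sum
	return int(probabilities.size()) - 1;
}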
@@ -306,8 +303,8 @@ ndBrainAgentDiscretePolicyGradient_TrainerMaster::ndBrainAgentDiscretePolicyGradient_TrainerMaster
,m_criticLearnRate(hyperParameters.m_criticLearnRate)
,m_numberOfActions(hyperParameters.m_numberOfActions)
,m_numberOfObservations(hyperParameters.m_numberOfObservations)
,m_frameCount(0)
,m_framesAlive(0)
,m_frameCount(0)
,m_eposideCount(0)
,m_bashBufferSize(hyperParameters.m_bashBufferSize)
,m_maxTrajectorySteps(hyperParameters.m_maxTrajectorySteps)
@@ -326,8 +323,15 @@ ndBrainAgentDiscretePolicyGradient_TrainerMaster::ndBrainAgentDiscretePolicyGradient_TrainerMaster
{
ndAssert(m_numberOfActions);
ndAssert(m_numberOfObservations);
ndSetRandSeed(m_randomSeed);

m_randomGenerator = new ndBrainAgentDiscretePolicyGradient_Trainer::ndRandomGenerator[size_t(hyperParameters.m_bashTrajectoryCount)];
for (ndInt32 i = 0; i < hyperParameters.m_bashTrajectoryCount; ++i)
{
m_randomSeed++;
m_randomGenerator[i].m_gen.seed(m_randomSeed);
}

m_randomSeed = 0;
// build policy neural net
SetThreadCount(hyperParameters.m_threadsCount);
ndFixSizeArray<ndBrainLayer*, 32> layers;
@@ -418,7 +422,9 @@ ndBrainAgentDiscretePolicyGradient_TrainerMaster::~ndBrainAgentDiscretePolicyGradient_TrainerMaster
{
delete m_baseLineValueTrainers[i];
}

delete m_baseLineValueOptimizer;
delete[] m_randomGenerator;
}

ndBrain* ndBrainAgentDiscretePolicyGradient_TrainerMaster::GetActor()
@@ -441,7 +447,13 @@ void ndBrainAgentDiscretePolicyGradient_TrainerMaster::SetName(const ndString& name)
m_name = name;
}

ndInt32 ndBrainAgentDiscretePolicyGradient_TrainerMaster::GetFramesCount() const
ndBrainAgentDiscretePolicyGradient_Trainer::ndRandomGenerator* ndBrainAgentDiscretePolicyGradient_TrainerMaster::GetRandomGenerator()
{
m_randomSeed = (m_randomSeed + 1) % m_bashTrajectoryCount;
return &m_randomGenerator[m_randomSeed];
}

ndUnsigned32 ndBrainAgentDiscretePolicyGradient_TrainerMaster::GetFramesCount() const
{
return m_frameCount;
}
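
GetRandomGenerator above hands the per-agent generators out round-robin; after the constructor finishes seeding the pool it resets m_randomSeed to 0 and reuses it as the rotating index. The same idea in isolation, with hypothetical names:

#include <random>
#include <vector>

// a pool of independently seeded generators, handed out round-robin
class GeneratorPool
{
	public:
	GeneratorPool(int count, unsigned baseSeed)
		:m_index(0)
		,m_generators(size_t(count))
	{
		for (int i = 0; i < count; ++i)
		{
			// distinct seed per generator, as the trainer master does
			m_generators[size_t(i)].seed(baseSeed + unsigned(i) + 1);
		}
	}

	std::mt19937& Next()
	{
		m_index = (m_index + 1) % int(m_generators.size());
		return m_generators[size_t(m_index)];
	}

	private:
	int m_index;
	std::vector<std::mt19937> m_generators;
};

Giving each agent its own generator, and re-drawing one from the pool at every episode reset as OptimizeStep below does, keeps the parallel trajectories decorrelated without sharing one generator across threads.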
@@ -451,7 +463,7 @@ bool ndBrainAgentDiscretePolicyGradient_TrainerMaster::IsSampling() const
return false;
}

ndInt32 ndBrainAgentDiscretePolicyGradient_TrainerMaster::GetEposideCount() const
ndUnsigned32 ndBrainAgentDiscretePolicyGradient_TrainerMaster::GetEposideCount() const
{
return m_eposideCount;
}
@@ -702,6 +714,7 @@ void ndBrainAgentDiscretePolicyGradient_TrainerMaster::OptimizeStep()
{
agent->SaveTrajectory();
agent->ResetModel();
agent->m_randomGenerator = GetRandomGenerator();
}
m_frameCount++;
m_framesAlive++;
@@ -718,6 +731,7 @@
{
ndBrainAgentDiscretePolicyGradient_Trainer* const agent = node->GetInfo();
agent->m_trajectory.SetCount(0);
agent->ResetModel();
}
}
}
@@ -62,6 +62,14 @@ class ndBrainAgentDiscretePolicyGradient_Trainer : public ndBrainAgent
ndInt32 m_obsevationsSize;
};

class ndRandomGenerator
{
public:
std::mt19937 m_gen;
std::random_device m_rd;
std::normal_distribution<ndFloat32> m_d;
};

ndBrainAgentDiscretePolicyGradient_Trainer(const ndSharedPtr<ndBrainAgentDiscretePolicyGradient_TrainerMaster>& master);
~ndBrainAgentDiscretePolicyGradient_Trainer();

@@ -82,10 +90,7 @@ class ndBrainAgentDiscretePolicyGradient_Trainer : public ndBrainAgent
ndBrainVector m_workingBuffer;
ndTrajectoryStep m_trajectory;
ndSharedPtr<ndBrainAgentDiscretePolicyGradient_TrainerMaster> m_master;

mutable std::random_device m_rd;
mutable std::mt19937 m_gen;
mutable std::uniform_real_distribution<ndFloat32> m_d;
ndRandomGenerator* m_randomGenerator;

friend class ndBrainAgentDiscretePolicyGradient_TrainerMaster;
};
@@ -136,8 +141,8 @@ class ndBrainAgentDiscretePolicyGradient_TrainerMaster : public ndBrainThreadPool
const ndString& GetName() const;
void SetName(const ndString& name);

ndInt32 GetFramesCount() const;
ndInt32 GetEposideCount() const;
ndUnsigned32 GetFramesCount() const;
ndUnsigned32 GetEposideCount() const;

bool IsSampling() const;
ndFloat32 GetAverageScore() const;
@@ -150,6 +155,7 @@
void OptimizePolicy();
void OptimizeCritic();
void UpdateBaseLineValue();
ndBrainAgentDiscretePolicyGradient_Trainer::ndRandomGenerator* GetRandomGenerator();

ndBrain m_actor;
ndBrain m_baseLineValue;
@@ -162,16 +168,17 @@

MemoryStateValues m_stateValues;
ndArray<ndInt32> m_randomPermutation;
ndBrainAgentDiscretePolicyGradient_Trainer::ndRandomGenerator* m_randomGenerator;
ndBrainAgentDiscretePolicyGradient_Trainer::ndTrajectoryStep m_trajectoryAccumulator;

ndBrainFloat m_gamma;
ndBrainFloat m_policyLearnRate;
ndBrainFloat m_criticLearnRate;
ndInt32 m_numberOfActions;
ndInt32 m_numberOfObservations;
ndInt32 m_frameCount;
ndInt32 m_framesAlive;
ndInt32 m_eposideCount;
ndUnsigned32 m_frameCount;
ndUnsigned32 m_eposideCount;
ndInt32 m_bashBufferSize;
ndInt32 m_maxTrajectorySteps;
ndInt32 m_extraTrajectorySteps;
