diff --git a/newton-4.00/applications/ndSandbox/demos/ndAdvancedIndustrialRobot.cpp b/newton-4.00/applications/ndSandbox/demos/ndAdvancedIndustrialRobot.cpp index c436c4c31..0a100a111 100644 --- a/newton-4.00/applications/ndSandbox/demos/ndAdvancedIndustrialRobot.cpp +++ b/newton-4.00/applications/ndSandbox/demos/ndAdvancedIndustrialRobot.cpp @@ -27,7 +27,7 @@ namespace ndAdvancedRobot #define ND_TRAIN_MODEL #define CONTROLLER_NAME "ndRobotArmReach" - //#define CONTROLLER_RESUME_TRANING + //#define CONTROLLER_RESUME_TRAINING #define ND_USE_EULERS @@ -100,7 +100,7 @@ namespace ndAdvancedRobot #define ND_MAX_X_SPAND ndReal ( 1.5f) #define ND_MIN_Y_SPAND ndReal (-2.2f) #define ND_MAX_Y_SPAND ndReal ( 1.5f) - #define ND_ACTION_SENSITIVITY ndReal (0.05f) + #define ND_ACTION_SENSITIVITY ndReal ( 0.05f) #define ND_DEAD_PENALTY ndReal (-10.0f) @@ -1125,7 +1125,7 @@ namespace ndAdvancedRobot snprintf(name, sizeof(name), "%s.dnn", CONTROLLER_NAME); m_master->SetName(name); - #ifdef CONTROLLER_RESUME_TRANING + #ifdef CONTROLLER_RESUME_TRAINING char fileName[256]; snprintf(name, sizeof(name), "%s_critic.dnn", CONTROLLER_NAME); ndGetWorkingFileName(name, fileName); @@ -1154,8 +1154,8 @@ namespace ndAdvancedRobot ndInt32 countX = 22; ndInt32 countZ = 23; - //countX = 1; - //countZ = 1; + //countX = 10; + //countZ = 10; // add a hidden battery of model to generate trajectories in parallel for (ndInt32 i = 0; i < countZ; ++i) diff --git a/newton-4.00/applications/ndSandbox/demos/ndCartpoleContinue.cpp b/newton-4.00/applications/ndSandbox/demos/ndCartpoleContinue.cpp index 291bb43f2..624a00321 100644 --- a/newton-4.00/applications/ndSandbox/demos/ndCartpoleContinue.cpp +++ b/newton-4.00/applications/ndSandbox/demos/ndCartpoleContinue.cpp @@ -26,7 +26,7 @@ namespace ndCarpole_1 //#define ND_TRAIN_AGENT #define CONTROLLER_NAME "cartpoleContinue" - #define CONTROLLER_RESUME_TRANING + //#define CONTROLLER_RESUME_TRAINING #define D_PUSH_ACCEL ndBrainFloat (15.0f) #define 
D_REWARD_MIN_ANGLE ndBrainFloat (20.0f * ndDegreeToRad) @@ -335,7 +335,7 @@ namespace ndCarpole_1 snprintf(name, sizeof(name), "%s.dnn", CONTROLLER_NAME); m_master->SetName(name); - #ifdef CONTROLLER_RESUME_TRANING + #ifdef CONTROLLER_RESUME_TRAINING char fileName[256]; snprintf(name, sizeof(name), "%s_critic.dnn", CONTROLLER_NAME); ndGetWorkingFileName(name, fileName); diff --git a/newton-4.00/applications/ndSandbox/demos/ndCartpoleDiscrete.cpp b/newton-4.00/applications/ndSandbox/demos/ndCartpoleDiscrete.cpp index 30b967bb2..e8b022fc3 100644 --- a/newton-4.00/applications/ndSandbox/demos/ndCartpoleDiscrete.cpp +++ b/newton-4.00/applications/ndSandbox/demos/ndCartpoleDiscrete.cpp @@ -26,7 +26,7 @@ namespace ndCarpole_0 //#define ND_TRAIN_AGENT #define CONTROLLER_NAME "cartpoleDiscrete" - #define CONTROLLER_RESUME_TRANING + //#define CONTROLLER_RESUME_TRAINING #define D_PUSH_ACCEL ndFloat32 (15.0f) #define D_REWARD_MIN_ANGLE ndFloat32 (20.0f * ndDegreeToRad) @@ -374,7 +374,7 @@ namespace ndCarpole_0 snprintf(name, sizeof(name), "%s.dnn", CONTROLLER_NAME); m_master->SetName(name); - #ifdef CONTROLLER_RESUME_TRANING + #ifdef CONTROLLER_RESUME_TRAINING char fileName[256]; snprintf(name, sizeof(name), "%s_critic.dnn", CONTROLLER_NAME); ndGetWorkingFileName(name, fileName); diff --git a/newton-4.00/sdk/dBrain/ndBrainAgentContinuePolicyGradient_Trainer.cpp b/newton-4.00/sdk/dBrain/ndBrainAgentContinuePolicyGradient_Trainer.cpp index fd3adb286..1301c9a4e 100644 --- a/newton-4.00/sdk/dBrain/ndBrainAgentContinuePolicyGradient_Trainer.cpp +++ b/newton-4.00/sdk/dBrain/ndBrainAgentContinuePolicyGradient_Trainer.cpp @@ -29,6 +29,8 @@ #define ND_CONTINUE_POLICY_GRADIENT_BUFFER_SIZE (1024 * 128) #define ND_CONTINUE_POLICY_GRADIENT_MIN_VARIANCE ndBrainFloat(0.1f) +#define ND_USE_LOG_DEVIATION + //********************************************************************************************* // 
//********************************************************************************************* @@ -83,20 +85,35 @@ class ndBrainAgentContinuePolicyGradient_TrainerMaster::LastActivationLayer : pu void MakePrediction(const ndBrainVector& input, ndBrainVector& output) const { ndBrainLayerActivationTanh::MakePrediction(input, output); - for (ndInt32 i = m_neurons / 2 - 1; i >= 0; --i) - { - output[i + m_neurons / 2] = ndMax(input[i + m_neurons / 2], m_sigma); - } + #ifdef ND_USE_LOG_DEVIATION + for (ndInt32 i = m_neurons / 2 - 1; i >= 0; --i) + { + output[i + m_neurons / 2] = input[i + m_neurons / 2]; + } + #else + for (ndInt32 i = m_neurons / 2 - 1; i >= 0; --i) + { + output[i + m_neurons / 2] = ndMax(input[i + m_neurons / 2], m_sigma); + } + #endif } void InputDerivative(const ndBrainVector& input, const ndBrainVector& output, const ndBrainVector& outputDerivative, ndBrainVector& inputDerivative) const { ndBrainLayerActivationTanh::InputDerivative(input, output, outputDerivative, inputDerivative); - for (ndInt32 i = m_neurons / 2 - 1; i >= 0; --i) - { - inputDerivative[i + m_neurons / 2] = (input[i + m_neurons / 2] > ndBrainFloat(0.0f)) ? ndBrainFloat(1.0f) : ndBrainFloat(0.0f); - inputDerivative[i + m_neurons / 2] *= outputDerivative[i + m_neurons / 2]; - } + #ifdef ND_USE_LOG_DEVIATION + for (ndInt32 i = m_neurons / 2 - 1; i >= 0; --i) + { + //inputDerivative[i + m_neurons / 2] = ndBrainFloat(1.0f); + inputDerivative[i + m_neurons / 2] = outputDerivative[i + m_neurons / 2]; + } + #else + for (ndInt32 i = m_neurons / 2 - 1; i >= 0; --i) + { + inputDerivative[i + m_neurons / 2] = (input[i + m_neurons / 2] > ndBrainFloat(0.0f)) ? 
ndBrainFloat(1.0f) : ndBrainFloat(0.0f); + inputDerivative[i + m_neurons / 2] *= outputDerivative[i + m_neurons / 2]; + } + #endif } ndBrainFloat m_sigma; @@ -553,18 +570,34 @@ void ndBrainAgentContinuePolicyGradient_TrainerMaster::OptimizePolicy() const ndBrainFloat advantage = m_agent->m_trajectoryAccumulator.GetAdvantage(m_index); const ndBrainFloat* const actions = m_agent->m_trajectoryAccumulator.GetActions(m_index); const ndInt32 numberOfActions = m_agent->m_numberOfActions; - for (ndInt32 i = numberOfActions - 1; i >= 0; --i) - { - const ndBrainFloat mean = output[i]; - const ndBrainFloat sigma1 = output[i + numberOfActions]; - const ndBrainFloat sigma2 = sigma1 * sigma1; - const ndBrainFloat sigma3 = sigma2 * sigma1; - const ndBrainFloat num = (actions[i] - mean); - ndAssert(sigma1 >= ND_CONTINUE_POLICY_GRADIENT_MIN_VARIANCE); + + #ifdef ND_USE_LOG_DEVIATION + for (ndInt32 i = numberOfActions - 1; i >= 0; --i) + { + const ndBrainFloat mean = output[i]; + ndAssert(ndExp(output[i + numberOfActions]) > 0.0f); + const ndBrainFloat sigma1 = ndMax (ndExp(output[i + numberOfActions]), ndFloat32(1.0e-2f)); + const ndBrainFloat sigma2 = sigma1 * sigma1; + const ndBrainFloat num = (actions[i] - mean); - loss[i] = advantage * num / sigma2; - loss[i + numberOfActions] = advantage * (num * num / sigma3 - ndBrainFloat(1.0f) / sigma1); - } + // output[i + numberOfActions] is log(sigma): d(logProb)/d(log sigma) = sigma * d(logProb)/d(sigma) = num^2 / sigma^2 - 1 + loss[i] = advantage * num / sigma2; + loss[i + numberOfActions] = advantage * (num * num / sigma2 - ndBrainFloat(1.0f)); + } + #else + for (ndInt32 i = numberOfActions - 1; i >= 0; --i) + { + const ndBrainFloat mean = output[i]; + const ndBrainFloat sigma1 = output[i + numberOfActions]; + const ndBrainFloat sigma2 = 
sigma1 * sigma1; + const ndBrainFloat sigma3 = sigma2 * sigma1; + const ndBrainFloat num = (actions[i] - mean); + ndAssert(sigma1 >= ND_CONTINUE_POLICY_GRADIENT_MIN_VARIANCE); + + loss[i] = advantage * num / sigma2; + loss[i + numberOfActions] = advantage * (num * num / sigma3 - ndBrainFloat(1.0f) / sigma1); + } + #endif } ndBrainTrainer& m_trainer;