Skip to content

Commit

Permalink
using alternate log sigma, instead of direct sigma
Browse files Browse the repository at this point in the history
In theory, this allows for wider exploration at the beginning of training, since sigma is now parameterized in log space instead of clamped directly.
But I need to test to find out.
  • Loading branch information
JulioJerez committed Sep 16, 2024
1 parent 30e90fe commit 486df49
Show file tree
Hide file tree
Showing 4 changed files with 63 additions and 29 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -27,7 +27,7 @@ namespace ndAdvancedRobot
#define ND_TRAIN_MODEL
#define CONTROLLER_NAME "ndRobotArmReach"

//#define CONTROLLER_RESUME_TRANING
//#define CONTROLLER_RESUME_TRAINING

#define ND_USE_EULERS

Expand Down Expand Up @@ -100,7 +100,7 @@ namespace ndAdvancedRobot
#define ND_MAX_X_SPAND ndReal ( 1.5f)
#define ND_MIN_Y_SPAND ndReal (-2.2f)
#define ND_MAX_Y_SPAND ndReal ( 1.5f)
#define ND_ACTION_SENSITIVITY ndReal (0.05f)
#define ND_ACTION_SENSITIVITY ndReal ( 0.05f)

#define ND_DEAD_PENALTY ndReal (-10.0f)

Expand Down Expand Up @@ -1125,7 +1125,7 @@ namespace ndAdvancedRobot
snprintf(name, sizeof(name), "%s.dnn", CONTROLLER_NAME);
m_master->SetName(name);

#ifdef CONTROLLER_RESUME_TRANING
#ifdef CONTROLLER_RESUME_TRAINING
char fileName[256];
snprintf(name, sizeof(name), "%s_critic.dnn", CONTROLLER_NAME);
ndGetWorkingFileName(name, fileName);
Expand Down Expand Up @@ -1154,8 +1154,8 @@ namespace ndAdvancedRobot

ndInt32 countX = 22;
ndInt32 countZ = 23;
//countX = 1;
//countZ = 1;
//countX = 10;
//countZ = 10;

// add a hidden battery of model to generate trajectories in parallel
for (ndInt32 i = 0; i < countZ; ++i)
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -26,7 +26,7 @@ namespace ndCarpole_1
//#define ND_TRAIN_AGENT
#define CONTROLLER_NAME "cartpoleContinue"

#define CONTROLLER_RESUME_TRANING
//#define CONTROLLER_RESUME_TRAINING

#define D_PUSH_ACCEL ndBrainFloat (15.0f)
#define D_REWARD_MIN_ANGLE ndBrainFloat (20.0f * ndDegreeToRad)
Expand Down Expand Up @@ -335,7 +335,7 @@ namespace ndCarpole_1
snprintf(name, sizeof(name), "%s.dnn", CONTROLLER_NAME);
m_master->SetName(name);

#ifdef CONTROLLER_RESUME_TRANING
#ifdef CONTROLLER_RESUME_TRAINING
char fileName[256];
snprintf(name, sizeof(name), "%s_critic.dnn", CONTROLLER_NAME);
ndGetWorkingFileName(name, fileName);
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -26,7 +26,7 @@ namespace ndCarpole_0
//#define ND_TRAIN_AGENT
#define CONTROLLER_NAME "cartpoleDiscrete"

#define CONTROLLER_RESUME_TRANING
//#define CONTROLLER_RESUME_TRAINING

#define D_PUSH_ACCEL ndFloat32 (15.0f)
#define D_REWARD_MIN_ANGLE ndFloat32 (20.0f * ndDegreeToRad)
Expand Down Expand Up @@ -374,7 +374,7 @@ namespace ndCarpole_0
snprintf(name, sizeof(name), "%s.dnn", CONTROLLER_NAME);
m_master->SetName(name);

#ifdef CONTROLLER_RESUME_TRANING
#ifdef CONTROLLER_RESUME_TRAINING
char fileName[256];
snprintf(name, sizeof(name), "%s_critic.dnn", CONTROLLER_NAME);
ndGetWorkingFileName(name, fileName);
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -29,6 +29,8 @@
#define ND_CONTINUE_POLICY_GRADIENT_BUFFER_SIZE (1024 * 128)
#define ND_CONTINUE_POLICY_GRADIENT_MIN_VARIANCE ndBrainFloat(0.1f)

#define ND_USE_LOG_DEVIATION

//*********************************************************************************************
//
//*********************************************************************************************
Expand Down Expand Up @@ -83,20 +85,35 @@ class ndBrainAgentContinuePolicyGradient_TrainerMaster::LastActivationLayer : pu
// Forward pass of the policy head. The first half of the neurons are the
// action means (tanh-squashed by the base class); the second half encode the
// exploration deviation, whose handling depends on the compile-time mode.
void MakePrediction(const ndBrainVector& input, ndBrainVector& output) const
{
	ndBrainLayerActivationTanh::MakePrediction(input, output);
	#ifdef ND_USE_LOG_DEVIATION
	// Log-deviation mode: pass the raw pre-activation through unchanged
	// (identity activation); the consumer exponentiates it, so no lower
	// clamp is needed here.
	for (ndInt32 i = m_neurons / 2 - 1; i >= 0; --i)
	{
		output[i + m_neurons / 2] = input[i + m_neurons / 2];
	}
	#else
	// Direct-sigma mode: clamp the deviation from below at m_sigma so it
	// can never collapse to zero or go negative.
	for (ndInt32 i = m_neurons / 2 - 1; i >= 0; --i)
	{
		output[i + m_neurons / 2] = ndMax(input[i + m_neurons / 2], m_sigma);
	}
	#endif
}

// Backward pass matching MakePrediction: the base class handles the tanh
// derivative for the mean half; the deviation half's derivative depends on
// the compile-time mode.
void InputDerivative(const ndBrainVector& input, const ndBrainVector& output, const ndBrainVector& outputDerivative, ndBrainVector& inputDerivative) const
{
	ndBrainLayerActivationTanh::InputDerivative(input, output, outputDerivative, inputDerivative);
	#ifdef ND_USE_LOG_DEVIATION
	// Identity activation on the deviation half: derivative is 1, so just
	// forward the incoming gradient.
	for (ndInt32 i = m_neurons / 2 - 1; i >= 0; --i)
	{
		inputDerivative[i + m_neurons / 2] = outputDerivative[i + m_neurons / 2];
	}
	#else
	// Derivative of max(input, m_sigma): gradient passes only where the input
	// was on the active (unclamped) side.
	// NOTE(review): the gate tests input > 0 while the forward clamp is at
	// m_sigma — presumably acceptable because m_sigma is small, but confirm.
	for (ndInt32 i = m_neurons / 2 - 1; i >= 0; --i)
	{
		inputDerivative[i + m_neurons / 2] = (input[i + m_neurons / 2] > ndBrainFloat(0.0f)) ? ndBrainFloat(1.0f) : ndBrainFloat(0.0f);
		inputDerivative[i + m_neurons / 2] *= outputDerivative[i + m_neurons / 2];
	}
	#endif
}

ndBrainFloat m_sigma;
Expand Down Expand Up @@ -518,6 +535,7 @@ ndFloat32 ndBrainAgentContinuePolicyGradient_TrainerMaster::GetAverageScore() co
return m_averageScore.GetAverage();
}

#pragma optimize( "", off )
void ndBrainAgentContinuePolicyGradient_TrainerMaster::OptimizePolicy()
{
ndAtomic<ndInt32> iterator(0);
Expand Down Expand Up @@ -553,18 +571,34 @@ void ndBrainAgentContinuePolicyGradient_TrainerMaster::OptimizePolicy()
const ndBrainFloat advantage = m_agent->m_trajectoryAccumulator.GetAdvantage(m_index);
const ndBrainFloat* const actions = m_agent->m_trajectoryAccumulator.GetActions(m_index);
const ndInt32 numberOfActions = m_agent->m_numberOfActions;
for (ndInt32 i = numberOfActions - 1; i >= 0; --i)
{
const ndBrainFloat mean = output[i];
const ndBrainFloat sigma1 = output[i + numberOfActions];
const ndBrainFloat sigma2 = sigma1 * sigma1;
const ndBrainFloat sigma3 = sigma2 * sigma1;
const ndBrainFloat num = (actions[i] - mean);
ndAssert(sigma1 >= ND_CONTINUE_POLICY_GRADIENT_MIN_VARIANCE);

#ifdef ND_USE_LOG_DEVIATION
for (ndInt32 i = numberOfActions - 1; i >= 0; --i)
{
const ndBrainFloat mean = output[i];
ndAssert(ndExp(output[i + numberOfActions]) > 0.0f);
const ndBrainFloat sigma1 = ndMax (ndExp(output[i + numberOfActions]), ndFloat32(1.0e-2f));
const ndBrainFloat sigma2 = sigma1 * sigma1;
const ndBrainFloat sigma3 = sigma2 * sigma1;
const ndBrainFloat num = (actions[i] - mean);

loss[i] = advantage * num / sigma2;
loss[i + numberOfActions] = advantage * (num * num / sigma3 - ndBrainFloat(1.0f) / sigma1);
}
loss[i] = advantage * num / sigma2;
loss[i + numberOfActions] = advantage * (num * num / sigma3 - ndBrainFloat(1.0f) / sigma1);
}
#else
for (ndInt32 i = numberOfActions - 1; i >= 0; --i)
{
const ndBrainFloat mean = output[i];
const ndBrainFloat sigma1 = output[i + numberOfActions];
const ndBrainFloat sigma2 = sigma1 * sigma1;
const ndBrainFloat sigma3 = sigma2 * sigma1;
const ndBrainFloat num = (actions[i] - mean);
ndAssert(sigma1 >= ND_CONTINUE_POLICY_GRADIENT_MIN_VARIANCE);

loss[i] = advantage * num / sigma2;
loss[i + numberOfActions] = advantage * (num * num / sigma3 - ndBrainFloat(1.0f) / sigma1);
}
#endif
}

ndBrainTrainer& m_trainer;
Expand Down

0 comments on commit 486df49

Please sign in to comment.