XGBoost.to_json Unit Tests (#174)
* XGBoost.to_json Unit Tests

 - Unit tests for XGBoost.to_json
 - Bug fix for version 11: the classifier objective must be set to 'crossentropy'
 - Correcting bug #168
 - Solving issue #169

* Correcting xgb bugs

* Correcting to_json bug

* Correcting the char bug in pandas_to_vertica

* Small changes before merging
oualib authored Oct 4, 2021
1 parent 0cbdd09 commit fcfe98b
Showing 7 changed files with 164 additions and 95 deletions.
115 changes: 53 additions & 62 deletions verticapy/learn/ensemble.py
@@ -76,13 +76,11 @@ def to_json(self, path: str = ""):
normalization; while Vertica uses multinomial logistic regression,
XGBoost Python uses Softmax. This difference does not affect the model's
final predictions. Categorical predictors must be encoded.
Parameters
----------
path: str, optional
The path and name of the output file. If a file with the same name
already exists, the function returns an error.
Returns
-------
str
@@ -91,6 +89,9 @@ def to_json(self, path: str = ""):
"""
def xgboost_to_json(model):
def xgboost_dummy_tree_dict(model, c: str = None):
# Dummy trees are used to store the prior probabilities.
# The Python XGBoost API does not use this information and starts
# the training with priors = 0.
condition = ["{} IS NOT NULL".format(elem) for elem in model.X] + ["{} IS NOT NULL".format(model.y)]
model.cursor.execute("SELECT COUNT(*) FROM {} WHERE {} AND {} = '{}'".format(model.input_relation, " AND ".join(condition), model.y, c))
avg = model.cursor.fetchone()[0]
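For context, a minimal usage sketch of the export under test and of reloading it with the Python XGBoost API; the cursor, relation, column, and file names are illustrative assumptions, not part of this commit:

from verticapy.learn.ensemble import XGBoostClassifier
import xgboost as xgb

# 'cur' is assumed to be an open Vertica database cursor.
model = XGBoostClassifier("xgb_titanic", cursor=cur)
model.fit("public.titanic", ["age", "fare"], "survived")

# Writes the JSON export; per the docstring, the call errors if the file
# already exists, and an empty path returns the JSON as a string instead.
model.to_json("xgb_titanic.json")

# Round-trip: load the export with the Python XGBoost API. Predictions
# should match Vertica's, as stated in the docstring above.
booster = xgb.Booster()
booster.load_model("xgb_titanic.json")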
@@ -182,25 +183,29 @@ def xgboost_tree_dict_list(model):
"name": "gbtree",}
def xgboost_learner(model):
condition = ["{} IS NOT NULL".format(elem) for elem in model.X] + ["{} IS NOT NULL".format(model.y)]
n = model.get_attr("tree_count")["tree_count"][0]
if model.type == "XGBoostRegressor" or (len(model.classes_) == 2 and model.classes_[1] == 1 and model.classes_[0] == 0):
objective = "reg:squarederror"
# Computing prior probability
model.cursor.execute("SELECT AVG({}) FROM {} WHERE {}".format(model.y, model.input_relation, " AND ".join(condition)))
bs = model.cursor.fetchone()[0]
if model.type == "XGBoostClassifier":
objective = "binary:logistic"
num_class = "0"
param = "reg_loss_param"
param_val = {"scale_pos_weight":"1"}
attributes_dict = {}
if model.type == "XGBoostRegressor":
attributes_dict = {"scikit_learn": "{\"n_estimators\": " + str(n) + ", \"objective\": \"reg:squarederror\", \"max_depth\": " + str(model.parameters["max_depth"]) + ", \"learning_rate\": " + str(model.parameters["learning_rate"]) + ", \"verbosity\": null, \"booster\": null, \"tree_method\": null, \"gamma\": null, \"min_child_weight\": null, \"max_delta_step\": null, \"subsample\": null, \"colsample_bytree\": null, \"colsample_bylevel\": null, \"colsample_bynode\": null, \"reg_alpha\": null, \"reg_lambda\": null, \"scale_pos_weight\": null, \"base_score\": null, \"missing\": NaN, \"num_parallel_tree\": null, \"kwargs\": {}, \"random_state\": null, \"n_jobs\": null, \"monotone_constraints\": null, \"interaction_constraints\": null, \"importance_type\": \"gain\", \"gpu_id\": null, \"validate_parameters\": null, \"_estimator_type\": \"regressor\"}"}
else:
attributes_dict = {"scikit_learn": "{\"use_label_encoder\": true, \"n_estimators\": " + str(n) + ", \"objective\": \"binary:logistic\", \"max_depth\": " + str(model.parameters["max_depth"]) + ", \"learning_rate\": " + str(model.parameters["learning_rate"]) + ", \"verbosity\": null, \"booster\": null, \"tree_method\": null, \"gamma\": null, \"min_child_weight\": null, \"max_delta_step\": null, \"subsample\": null, \"colsample_bytree\": null, \"colsample_bylevel\": null, \"colsample_bynode\": null, \"reg_alpha\": null, \"reg_lambda\": null, \"scale_pos_weight\": null, \"base_score\": null, \"missing\": NaN, \"num_parallel_tree\": null, \"kwargs\": {}, \"random_state\": null, \"n_jobs\": null, \"monotone_constraints\": null, \"interaction_constraints\": null, \"importance_type\": \"gain\", \"gpu_id\": null, \"validate_parameters\": null, \"classes_\": [0, 1], \"n_classes_\": 2, \"_le\": {\"classes_\": [0, 1]}, \"_estimator_type\": \"classifier\"}"}
else:
n = model.get_attr("tree_count")["tree_count"][0]
objective = "multi:softprob"
bs = 0.5
num_class = str(len(model.classes_))
param = "softmax_multiclass_param"
param_val = {"num_class": num_class}
attributes_dict = {"scikit_learn": "{\"use_label_encoder\": true, \"n_estimators\": " + str(n) + ", \"objective\": \"multi:softprob\", \"max_depth\": " + str(model.parameters["max_depth"]) + ", \"learning_rate\": " + str(model.parameters["learning_rate"]) + ", \"verbosity\": null, \"booster\": null, \"tree_method\": null, \"gamma\": null, \"min_child_weight\": null, \"max_delta_step\": null, \"subsample\": null, \"colsample_bytree\": null, \"colsample_bylevel\": null, \"colsample_bynode\": null, \"reg_alpha\": null, \"reg_lambda\": null, \"scale_pos_weight\": null, \"base_score\": null, \"missing\": NaN, \"num_parallel_tree\": null, \"kwargs\": {}, \"random_state\": null, \"n_jobs\": null, \"monotone_constraints\": null, \"interaction_constraints\": null, \"importance_type\": \"gain\", \"gpu_id\": null, \"validate_parameters\": null, \"classes_\": " + str(model.classes_) + ", \"n_classes_\": " + str(len(model.classes_)) + ", \"_le\": {\"classes_\": " + str(model.classes_) + "}, \"_estimator_type\": \"classifier\"}"}
attributes_dict["scikit_learn"] = attributes_dict["scikit_learn"].replace('"', '++++')
gradient_booster = xgboost_tree_dict_list(model)
return {"attributes": attributes_dict,
"feature_names": [],
@@ -232,40 +237,39 @@ class RandomForestClassifier(MulticlassClassifier, Tree):
function. It is one of the ensemble learning methods for classification
that operates by constructing a multitude of decision trees at
training-time and outputting the class that is the mode of the individual trees' predictions.
Parameters
----------
name: str
    Name of the model. The model will be stored in the DB.
cursor: DBcursor, optional
    Vertica database cursor.
n_estimators: int, optional
    The number of trees in the forest, an integer between 0 and 1000, inclusive.
max_features: int/str, optional
    The number of randomly chosen features from which to pick the best feature
    to split on a given tree node. It can be an integer or one of the two
    following methods.
        auto : square root of the total number of predictors.
        max  : number of predictors.
max_leaf_nodes: int, optional
    The maximum number of leaf nodes a tree in the forest can have, an integer
    between 1 and 1e9, inclusive.
sample: float, optional
    The portion of the input data set that is randomly picked for training each
    tree, a float between 0.0 and 1.0, inclusive.
max_depth: int, optional
    The maximum depth for growing each tree, an integer between 1 and 100,
    inclusive.
min_samples_leaf: int, optional
    The minimum number of samples each branch must have after splitting a node,
    an integer between 1 and 1e6, inclusive. A split that causes fewer
    remaining samples is discarded.
min_info_gain: float, optional
    The minimum threshold for including a split, a float between 0.0 and 1.0,
    inclusive. A split with information gain less than this threshold is
    discarded.
nbins: int, optional
    The number of bins to use for continuous features, an integer between 2
    and 1000, inclusive.
"""

def __init__(
self,
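A constructor sketch matching the documented parameter ranges; the model name, cursor, and values are illustrative:

from verticapy.learn.ensemble import RandomForestClassifier

rfc = RandomForestClassifier(
    "rf_titanic",         # name under which the model is stored in the DB
    cursor=cur,           # open Vertica cursor (assumed)
    n_estimators=20,      # 0 to 1000 trees
    max_features="auto",  # square root of the number of predictors
    sample=0.632,         # fraction of rows picked per tree, 0.0 to 1.0
    max_depth=5,          # 1 to 100
    nbins=32,             # 2 to 1000 bins for continuous features
)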
@@ -307,40 +311,39 @@ class RandomForestRegressor(Regressor, Tree):
function. It is one of the ensemble learning methods for regression that
operates by constructing a multitude of decision trees at training-time
and outputting the average prediction of the individual trees.
Parameters
----------
name: str
    Name of the model. The model will be stored in the DB.
cursor: DBcursor, optional
    Vertica database cursor.
n_estimators: int, optional
    The number of trees in the forest, an integer between 0 and 1000, inclusive.
max_features: int/str, optional
    The number of randomly chosen features from which to pick the best feature
    to split on a given tree node. It can be an integer or one of the two
    following methods.
        auto : square root of the total number of predictors.
        max  : number of predictors.
max_leaf_nodes: int, optional
    The maximum number of leaf nodes a tree in the forest can have, an integer
    between 1 and 1e9, inclusive.
sample: float, optional
    The portion of the input data set that is randomly picked for training each
    tree, a float between 0.0 and 1.0, inclusive.
max_depth: int, optional
    The maximum depth for growing each tree, an integer between 1 and 100,
    inclusive.
min_samples_leaf: int, optional
    The minimum number of samples each branch must have after splitting a node,
    an integer between 1 and 1e6, inclusive. A split that causes fewer
    remaining samples is discarded.
min_info_gain: float, optional
    The minimum threshold for including a split, a float between 0.0 and 1.0,
    inclusive. A split with information gain less than this threshold is
    discarded.
nbins: int, optional
    The number of bins to use for continuous features, an integer between 2
    and 1000, inclusive.
"""

def __init__(
self,
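The regressor takes the same parameters; a short fit sketch under the same assumptions (relation and columns illustrative):

from verticapy.learn.ensemble import RandomForestRegressor

rfr = RandomForestRegressor("rf_fare", cursor=cur, n_estimators=20, max_depth=5)
rfr.fit("public.titanic", ["age", "pclass"], "fare")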
@@ -380,7 +383,6 @@ class XGBoostClassifier(MulticlassClassifier, Tree, XGBoost_to_json):
---------------------------------------------------------------------------
Creates an XGBoostClassifier object using the Vertica XGB_CLASSIFIER
algorithm.
Parameters
----------
name: str
@@ -395,9 +397,6 @@ class XGBoostClassifier(MulticlassClassifier, Tree, XGBoost_to_json):
Number of bins to use for finding splits in each column. More
splits lead to a longer runtime but also to more fine-grained,
and possibly better, splits.
objective: str, optional
The objective/loss function that will be used to iteratively
improve the model.
split_proposal_method: str, optional
Approximate splitting strategy. Can be 'global' or 'local'
('local' is not yet supported).
@@ -428,7 +427,6 @@ def __init__(
max_ntree: int = 10,
max_depth: int = 5,
nbins: int = 32,
objective: str = "squarederror",
split_proposal_method: str = "global",
tol: float = 0.001,
learning_rate: float = 0.1,
@@ -443,7 +441,6 @@ def __init__(
"max_ntree": max_ntree,
"max_depth": max_depth,
"nbins": nbins,
"objective": objective,
"split_proposal_method": split_proposal_method,
"tol": tol,
"learning_rate": learning_rate,
Expand All @@ -463,7 +460,6 @@ class XGBoostRegressor(Regressor, Tree, XGBoost_to_json):
---------------------------------------------------------------------------
Creates an XGBoostRegressor object using the Vertica XGB_REGRESSOR
algorithm.
Parameters
----------
name: str
@@ -478,9 +474,6 @@ class XGBoostRegressor(Regressor, Tree, XGBoost_to_json):
Number of bins to use for finding splits in each column. More
splits lead to a longer runtime but also to more fine-grained,
and possibly better, splits.
objective: str, optional
The objective/loss function that will be used to iteratively
improve the model.
split_proposal_method: str, optional
Approximate splitting strategy. Can be 'global' or 'local'
('local' is not yet supported).
@@ -511,7 +504,6 @@ def __init__(
max_ntree: int = 10,
max_depth: int = 5,
nbins: int = 32,
objective: str = "squarederror",
split_proposal_method: str = "global",
tol: float = 0.001,
learning_rate: float = 0.1,
@@ -526,7 +518,6 @@ def __init__(
"max_ntree": max_ntree,
"max_depth": max_depth,
"nbins": nbins,
"objective": objective,
"split_proposal_method": split_proposal_method,
"tol": tol,
"learning_rate": learning_rate,
@@ -537,4 +528,4 @@ def __init__(
)
cursor = check_cursor(cursor)[0]
self.cursor = cursor
version(cursor=cursor, condition=[10, 1, 0])
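With the 'objective' parameter removed from both constructors, the objective is now fixed internally (the JSON export above emits reg:squarederror, binary:logistic, or multi:softprob depending on the model type). A post-change constructor sketch with illustrative values:

model = XGBoostClassifier(
    "xgb_model",
    cursor=cur,
    max_ntree=10,
    max_depth=5,
    nbins=32,
    split_proposal_method="global",
    tol=0.001,
    learning_rate=0.1,
)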
2 changes: 0 additions & 2 deletions verticapy/learn/tools.py
@@ -405,7 +405,6 @@ def load_model(name: str, cursor=None, input_relation: str = "", test_relation:
int(parameters_dict["max_ntree"]),
int(parameters_dict["max_depth"]),
int(parameters_dict["nbins"]),
parameters_dict["objective"],
parameters_dict["split_proposal_method"],
float(parameters_dict["epsilon"]),
float(parameters_dict["learning_rate"]),
@@ -422,7 +421,6 @@ def load_model(name: str, cursor=None, input_relation: str = "", test_relation:
int(parameters_dict["max_ntree"]),
int(parameters_dict["max_depth"]),
int(parameters_dict["nbins"]),
parameters_dict["objective"],
parameters_dict["split_proposal_method"],
float(parameters_dict["epsilon"]),
float(parameters_dict["learning_rate"]),
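Since the stored parameters no longer include 'objective', reloading a saved model now passes one argument fewer; a hedged sketch (model name assumed):

from verticapy.learn.tools import load_model

model = load_model("xgb_titanic", cursor=cur)  # rebuilds the estimator without an 'objective' entry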
13 changes: 0 additions & 13 deletions verticapy/learn/vmodel.py
@@ -1094,19 +1094,6 @@ def set_params(self, parameters: dict = {}):
model_parameters["max_ntree"] = default_parameters["max_ntree"]
else:
model_parameters["max_ntree"] = self.parameters["max_ntree"]
if "objective" in parameters:
assert str(parameters["objective"]).lower() in [
"squarederror",
], ParameterError(
"Incorrect parameter 'objective'.\nThe objective function must be in (squarederror,), found '{}'.".format(
parameters["objective"]
)
)
model_parameters["objective"] = parameters["objective"]
elif "objective" not in self.parameters:
model_parameters["objective"] = default_parameters["objective"]
else:
model_parameters["objective"] = self.parameters["objective"]
if "split_proposal_method" in parameters:
assert str(parameters["split_proposal_method"]).lower() in [
"global",
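With the 'objective' branch gone, set_params only validates the remaining keys; a short post-change sketch on the classifier from the earlier sketches (values illustrative):

model.set_params({"max_ntree": 20, "nbins": 64})
# 'objective' is no longer among the handled parameters.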