XGBoost.to_json Unit Tests (#174)
* XGBoost.to_json Unit Tests

 - Unit tests for XGBoost.to_json
 - Bug fix for version 11: the classifier objective must be set to 'crossentropy'
 - Correcting bug #168
 - Solving issue #169

* Correcting xgb bugs

* Correcting to_json bug

* Correcting the char bug in pandas_to_vertica

* Small changes before merging
oualib authored Oct 4, 2021
1 parent 0cbdd09 commit fcfe98b
Showing 7 changed files with 164 additions and 95 deletions.
115 changes: 53 additions & 62 deletions verticapy/learn/ensemble.py
@@ -76,13 +76,11 @@ def to_json(self, path: str = ""):
normalization; while Vertica uses multinomial logistic regression,
XGBoost Python uses Softmax. This difference does not affect the model's
final predictions. Categorical predictors must be encoded.
Parameters
----------
path: str, optional
The path and name of the output file. If a file with the same name
already exists, the function returns an error.
Returns
-------
str
@@ -91,6 +89,9 @@ def to_json(self, path: str = ""):
"""
def xgboost_to_json(model):
def xgboost_dummy_tree_dict(model, c: str = None):
# Dummy trees are used to store the prior probabilities.
# The Python XGBoost API does not use this information and starts
# the training with priors = 0.
condition = ["{} IS NOT NULL".format(elem) for elem in model.X] + ["{} IS NOT NULL".format(model.y)]
model.cursor.execute("SELECT COUNT(*) FROM {} WHERE {} AND {} = '{}'".format(model.input_relation, " AND ".join(condition), model.y, c))
avg = model.cursor.fetchone()[0]
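For context, a minimal usage sketch of the export under test and of reloading it with the Python XGBoost API; the cursor, relation, column, and file names are illustrative assumptions, not part of this commit:

from verticapy.learn.ensemble import XGBoostClassifier
import xgboost as xgb

# 'cur' is assumed to be an open Vertica database cursor.
model = XGBoostClassifier("xgb_titanic", cursor=cur)
model.fit("public.titanic", ["age", "fare"], "survived")

# Writes the JSON export; per the docstring, the call errors if the file
# already exists, and an empty path returns the JSON as a string instead.
model.to_json("xgb_titanic.json")

# Round-trip: load the export with the Python XGBoost API. Predictions
# should match Vertica's, as stated in the docstring above.
booster = xgb.Booster()
booster.load_model("xgb_titanic.json")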
@@ -182,25 +183,29 @@ def xgboost_tree_dict_list(model):
"name": "gbtree",}
def xgboost_learner(model):
condition = ["{} IS NOT NULL".format(elem) for elem in model.X] + ["{} IS NOT NULL".format(model.y)]
n = model.get_attr("tree_count")["tree_count"][0]
if model.type == "XGBoostRegressor" or (len(model.classes_) == 2 and model.classes_[1] == 1 and model.classes_[0] == 0):
objective = "reg:squarederror"
# Computing prior probability
model.cursor.execute("SELECT AVG({}) FROM {} WHERE {}".format(model.y, model.input_relation, " AND ".join(condition)))
bs = model.cursor.fetchone()[0]
if model.type == "XGBoostClassifier":
objective = "binary:logistic"
num_class = "0"
param = "reg_loss_param"
param_val = {"scale_pos_weight":"1"}
attributes_dict = {}
if model.type == "XGBoostRegressor":
attributes_dict = {"scikit_learn": "{\"n_estimators\": " + str(n) + ", \"objective\": \"reg:squarederror\", \"max_depth\": " + str(model.parameters["max_depth"]) + ", \"learning_rate\": " + str(model.parameters["learning_rate"]) + ", \"verbosity\": null, \"booster\": null, \"tree_method\": null, \"gamma\": null, \"min_child_weight\": null, \"max_delta_step\": null, \"subsample\": null, \"colsample_bytree\": null, \"colsample_bylevel\": null, \"colsample_bynode\": null, \"reg_alpha\": null, \"reg_lambda\": null, \"scale_pos_weight\": null, \"base_score\": null, \"missing\": NaN, \"num_parallel_tree\": null, \"kwargs\": {}, \"random_state\": null, \"n_jobs\": null, \"monotone_constraints\": null, \"interaction_constraints\": null, \"importance_type\": \"gain\", \"gpu_id\": null, \"validate_parameters\": null, \"_estimator_type\": \"regressor\"}"}
else:
attributes_dict = {"scikit_learn": "{\"use_label_encoder\": true, \"n_estimators\": " + str(n) + ", \"objective\": \"binary:logistic\", \"max_depth\": " + str(model.parameters["max_depth"]) + ", \"learning_rate\": " + str(model.parameters["learning_rate"]) + ", \"verbosity\": null, \"booster\": null, \"tree_method\": null, \"gamma\": null, \"min_child_weight\": null, \"max_delta_step\": null, \"subsample\": null, \"colsample_bytree\": null, \"colsample_bylevel\": null, \"colsample_bynode\": null, \"reg_alpha\": null, \"reg_lambda\": null, \"scale_pos_weight\": null, \"base_score\": null, \"missing\": NaN, \"num_parallel_tree\": null, \"kwargs\": {}, \"random_state\": null, \"n_jobs\": null, \"monotone_constraints\": null, \"interaction_constraints\": null, \"importance_type\": \"gain\", \"gpu_id\": null, \"validate_parameters\": null, \"classes_\": [0, 1], \"n_classes_\": 2, \"_le\": {\"classes_\": [0, 1]}, \"_estimator_type\": \"classifier\"}"}
else:
n = model.get_attr("tree_count")["tree_count"][0]
objective = "multi:softprob"
bs = 0.5
num_class = str(len(model.classes_))
param = "softmax_multiclass_param"
param_val = {"num_class": num_class}
attributes_dict = {"scikit_learn": "{\"use_label_encoder\": true, \"n_estimators\": " + str(n) + ", \"objective\": \"multi:softprob\", \"max_depth\": " + str(model.parameters["max_depth"]) + ", \"learning_rate\": " + str(model.parameters["learning_rate"]) + ", \"verbosity\": null, \"booster\": null, \"tree_method\": null, \"gamma\": null, \"min_child_weight\": null, \"max_delta_step\": null, \"subsample\": null, \"colsample_bytree\": null, \"colsample_bylevel\": null, \"colsample_bynode\": null, \"reg_alpha\": null, \"reg_lambda\": null, \"scale_pos_weight\": null, \"base_score\": null, \"missing\": NaN, \"num_parallel_tree\": null, \"kwargs\": {}, \"random_state\": null, \"n_jobs\": null, \"monotone_constraints\": null, \"interaction_constraints\": null, \"importance_type\": \"gain\", \"gpu_id\": null, \"validate_parameters\": null, \"classes_\": " + str(model.classes_) + ", \"n_classes_\": " + str(len(model.classes_)) + ", \"_le\": {\"classes_\": " + str(model.classes_) + "}, \"_estimator_type\": \"classifier\"}"}
attributes_dict["scikit_learn"] = attributes_dict["scikit_learn"].replace('"', '++++')
gradient_booster = xgboost_tree_dict_list(model)
return {"attributes": attributes_dict,
"feature_names": [],
@@ -232,40 +237,39 @@ class RandomForestClassifier(MulticlassClassifier, Tree):
function. It is one of the ensemble learning methods for classification
that operates by constructing a multitude of decision trees at
training-time and outputting the class that is the mode of the individual trees' predictions.
Parameters
----------
name: str
    Name of the model. The model will be stored in the DB.
cursor: DBcursor, optional
    Vertica database cursor.
n_estimators: int, optional
    The number of trees in the forest, an integer between 0 and 1000, inclusive.
max_features: int/str, optional
    The number of randomly chosen features from which to pick the best feature
    to split on a given tree node. It can be an integer or one of the two
    following methods.
        auto : square root of the total number of predictors.
        max  : number of predictors.
max_leaf_nodes: int, optional
    The maximum number of leaf nodes a tree in the forest can have, an integer
    between 1 and 1e9, inclusive.
sample: float, optional
    The portion of the input data set that is randomly picked for training each
    tree, a float between 0.0 and 1.0, inclusive.
max_depth: int, optional
    The maximum depth for growing each tree, an integer between 1 and 100,
    inclusive.
min_samples_leaf: int, optional
    The minimum number of samples each branch must have after splitting a node,
    an integer between 1 and 1e6, inclusive. A split that causes fewer
    remaining samples is discarded.
min_info_gain: float, optional
    The minimum threshold for including a split, a float between 0.0 and 1.0,
    inclusive. A split with information gain less than this threshold is
    discarded.
nbins: int, optional
    The number of bins to use for continuous features, an integer between 2
    and 1000, inclusive.
"""

def __init__(
self,
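A constructor sketch matching the documented parameter ranges; the model name, cursor, and values are illustrative:

from verticapy.learn.ensemble import RandomForestClassifier

rfc = RandomForestClassifier(
    "rf_titanic",         # name under which the model is stored in the DB
    cursor=cur,           # open Vertica cursor (assumed)
    n_estimators=20,      # 0 to 1000 trees
    max_features="auto",  # square root of the number of predictors
    sample=0.632,         # fraction of rows picked per tree, 0.0 to 1.0
    max_depth=5,          # 1 to 100
    nbins=32,             # 2 to 1000 bins for continuous features
)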
@@ -307,40 +311,39 @@ class RandomForestRegressor(Regressor, Tree):
function. It is one of the ensemble learning methods for regression that
operates by constructing a multitude of decision trees at training-time
and outputting the average prediction of the individual trees.
Parameters
----------
name: str
    Name of the model. The model will be stored in the DB.
cursor: DBcursor, optional
    Vertica database cursor.
n_estimators: int, optional
    The number of trees in the forest, an integer between 0 and 1000, inclusive.
max_features: int/str, optional
    The number of randomly chosen features from which to pick the best feature
    to split on a given tree node. It can be an integer or one of the two
    following methods.
        auto : square root of the total number of predictors.
        max  : number of predictors.
max_leaf_nodes: int, optional
    The maximum number of leaf nodes a tree in the forest can have, an integer
    between 1 and 1e9, inclusive.
sample: float, optional
    The portion of the input data set that is randomly picked for training each
    tree, a float between 0.0 and 1.0, inclusive.
max_depth: int, optional
    The maximum depth for growing each tree, an integer between 1 and 100,
    inclusive.
min_samples_leaf: int, optional
    The minimum number of samples each branch must have after splitting a node,
    an integer between 1 and 1e6, inclusive. A split that causes fewer
    remaining samples is discarded.
min_info_gain: float, optional
    The minimum threshold for including a split, a float between 0.0 and 1.0,
    inclusive. A split with information gain less than this threshold is
    discarded.
nbins: int, optional
    The number of bins to use for continuous features, an integer between 2
    and 1000, inclusive.
"""

def __init__(
self,
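The regressor takes the same parameters; a short fit sketch under the same assumptions (relation and columns illustrative):

from verticapy.learn.ensemble import RandomForestRegressor

rfr = RandomForestRegressor("rf_fare", cursor=cur, n_estimators=20, max_depth=5)
rfr.fit("public.titanic", ["age", "pclass"], "fare")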
@@ -380,7 +383,6 @@ class XGBoostClassifier(MulticlassClassifier, Tree, XGBoost_to_json):
---------------------------------------------------------------------------
Creates an XGBoostClassifier object using the Vertica XGB_CLASSIFIER
algorithm.
Parameters
----------
name: str
@@ -395,9 +397,6 @@ class XGBoostClassifier(MulticlassClassifier, Tree, XGBoost_to_json):
Number of bins to use for finding splits in each column. More
splits lead to a longer runtime but also to more fine-grained,
and possibly better, splits.
objective: str, optional
The objective/loss function that will be used to iteratively
improve the model.
split_proposal_method: str, optional
Approximate splitting strategy. Can be 'global' or 'local'
('local' is not yet supported).
@@ -428,7 +427,6 @@ def __init__(
max_ntree: int = 10,
max_depth: int = 5,
nbins: int = 32,
objective: str = "squarederror",
split_proposal_method: str = "global",
tol: float = 0.001,
learning_rate: float = 0.1,
@@ -443,7 +441,6 @@ def __init__(
"max_ntree": max_ntree,
"max_depth": max_depth,
"nbins": nbins,
"objective": objective,
"split_proposal_method": split_proposal_method,
"tol": tol,
"learning_rate": learning_rate,
Expand All @@ -463,7 +460,6 @@ class XGBoostRegressor(Regressor, Tree, XGBoost_to_json):
---------------------------------------------------------------------------
Creates an XGBoostRegressor object using the Vertica XGB_REGRESSOR
algorithm.
Parameters
----------
name: str
@@ -478,9 +474,6 @@ class XGBoostRegressor(Regressor, Tree, XGBoost_to_json):
Number of bins to use for finding splits in each column. More
splits lead to a longer runtime but also to more fine-grained,
and possibly better, splits.
objective: str, optional
The objective/loss function that will be used to iteratively
improve the model.
split_proposal_method: str, optional
Approximate splitting strategy. Can be 'global' or 'local'
('local' is not yet supported).
@@ -511,7 +504,6 @@ def __init__(
max_ntree: int = 10,
max_depth: int = 5,
nbins: int = 32,
objective: str = "squarederror",
split_proposal_method: str = "global",
tol: float = 0.001,
learning_rate: float = 0.1,
@@ -526,7 +518,6 @@ def __init__(
"max_ntree": max_ntree,
"max_depth": max_depth,
"nbins": nbins,
"objective": objective,
"split_proposal_method": split_proposal_method,
"tol": tol,
"learning_rate": learning_rate,
@@ -537,4 +528,4 @@ def __init__(
)
cursor = check_cursor(cursor)[0]
self.cursor = cursor
version(cursor=cursor, condition=[10, 1, 0])
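With the 'objective' parameter removed from both constructors, the objective is now fixed internally (the JSON export above emits reg:squarederror, binary:logistic, or multi:softprob depending on the model type). A post-change constructor sketch with illustrative values:

model = XGBoostClassifier(
    "xgb_model",
    cursor=cur,
    max_ntree=10,
    max_depth=5,
    nbins=32,
    split_proposal_method="global",
    tol=0.001,
    learning_rate=0.1,
)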
2 changes: 0 additions & 2 deletions verticapy/learn/tools.py
@@ -405,7 +405,6 @@ def load_model(name: str, cursor=None, input_relation: str = "", test_relation:
int(parameters_dict["max_ntree"]),
int(parameters_dict["max_depth"]),
int(parameters_dict["nbins"]),
parameters_dict["objective"],
parameters_dict["split_proposal_method"],
float(parameters_dict["epsilon"]),
float(parameters_dict["learning_rate"]),
@@ -422,7 +421,6 @@ def load_model(name: str, cursor=None, input_relation: str = "", test_relation:
int(parameters_dict["max_ntree"]),
int(parameters_dict["max_depth"]),
int(parameters_dict["nbins"]),
parameters_dict["objective"],
parameters_dict["split_proposal_method"],
float(parameters_dict["epsilon"]),
float(parameters_dict["learning_rate"]),
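Since the stored parameters no longer include 'objective', reloading a saved model now passes one argument fewer; a hedged sketch (model name assumed):

from verticapy.learn.tools import load_model

model = load_model("xgb_titanic", cursor=cur)  # rebuilds the estimator without an 'objective' entry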
13 changes: 0 additions & 13 deletions verticapy/learn/vmodel.py
@@ -1094,19 +1094,6 @@ def set_params(self, parameters: dict = {}):
model_parameters["max_ntree"] = default_parameters["max_ntree"]
else:
model_parameters["max_ntree"] = self.parameters["max_ntree"]
if "objective" in parameters:
assert str(parameters["objective"]).lower() in [
"squarederror",
], ParameterError(
"Incorrect parameter 'objective'.\nThe objective function must be in (squarederror,), found '{}'.".format(
parameters["objective"]
)
)
model_parameters["objective"] = parameters["objective"]
elif "objective" not in self.parameters:
model_parameters["objective"] = default_parameters["objective"]
else:
model_parameters["objective"] = self.parameters["objective"]
if "split_proposal_method" in parameters:
assert str(parameters["split_proposal_method"]).lower() in [
"global",
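With the 'objective' branch gone, set_params only validates the remaining keys; a short post-change sketch on the classifier from the earlier sketches (values illustrative):

model.set_params({"max_ntree": 20, "nbins": 64})
# 'objective' is no longer among the handled parameters.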