From b0c87a50f8e3094feb82a4ced593eabfc26fd8b3 Mon Sep 17 00:00:00 2001 From: Badr <32390048+oualib@users.noreply.github.com> Date: Fri, 27 Aug 2021 12:36:23 +0200 Subject: [PATCH] Adding memModel Trees and some code correction (#161) * Adding memModel Trees and some code correction - memModel Trees (Unit Tests to do) - Adding matrix rotation for PCA (tests available) - simplifying code for vModel TO DO (to complete 0.7.0): - tests for trees - memModel for Naive Bayes (+ tests) * Adding tests for trees TO DO: test for the SQL syntax in vDataFrame * copyedits * reword * Update memmodel.py Co-authored-by: kxu <52899649+ansleis@users.noreply.github.com> --- verticapy/learn/memmodel.py | 411 ++++++++++++++++++++++-- verticapy/learn/tools.py | 52 ++- verticapy/learn/vmodel.py | 5 +- verticapy/stats/tools.py | 6 +- verticapy/tests/vModel/test_memmodel.py | 255 +++++++++++++++ verticapy/tests/vModel/test_tools.py | 13 +- 6 files changed, 698 insertions(+), 44 deletions(-) diff --git a/verticapy/learn/memmodel.py b/verticapy/learn/memmodel.py index 4826b5722..b27f5d5c3 100644 --- a/verticapy/learn/memmodel.py +++ b/verticapy/learn/memmodel.py @@ -50,14 +50,159 @@ # # Standard Python Modules import numpy as np +from collections.abc import Iterable # VerticaPy Modules from verticapy.toolbox import * from verticapy.errors import * # ---# -def predict_from_coef(X: Union[list, np.array], - coefficients: Union[list, np.array], +def predict_from_binary_tree(X: Union[list, np.ndarray], + children_left: list, + children_right: list, + feature: list, + threshold: list, + value: list, + classes: Union[list, np.ndarray] = [], + return_proba: bool = False, + is_regressor: bool = True,): + """ + --------------------------------------------------------------------------- + Predicts using a binary tree model and the input attributes. + + Parameters + ---------- + X: list / numpy.array + Data on which to make the prediction. + children_left: list + A list of node IDs, where children_left[i] is the node id of the left child of node i. + children_right: list + A list of node IDs, children_right[i] is the node id of the right child of node i. + feature: list + A list of features, where feature[i] is the feature to split on for the internal node i. + threshold: list + A list of thresholds, where threshold[i] is the threshold for the internal node i. + value: list + Contains the constant prediction value of each node. + classes: list / numpy.array, optional + The classes for the binary tree model. + return_proba: bool, optional + If set to True, the probability of each class is returned. + is_regressor: bool, optional + If set to True, the parameter 'value' corresponds to the result of + a regression. + + Returns + ------- + numpy.array + Predicted values + """ + check_types([("X", X, [list, np.ndarray,],), + ("children_left", children_left, [list,],), + ("children_right", children_right, [list,],), + ("feature", feature, [list,],), + ("threshold", threshold, [list,],), + ("value", value, [list,],), + ("classes", classes, [list, np.ndarray,],), + ("return_proba", return_proba, [bool,],), + ("is_regressor", is_regressor, [bool,],),]) + def predict_tree(children_left, children_right, feature, threshold, value, node_id, X,): + if children_left[node_id] == children_right[node_id]: + if not(is_regressor) and not(return_proba) and isinstance(value, Iterable): + if isinstance(classes, Iterable) and len(classes) > 0: + return classes[np.argmax(value[node_id])] + else: + return np.argmax(value[node_id]) + else: + return value[node_id] + else: + if (isinstance(threshold[node_id], str) and str(X[feature[node_id]]) == threshold[node_id]) or (not(isinstance(threshold[node_id], str)) and float(X[feature[node_id]]) < float(threshold[node_id])): + return predict_tree(children_left, children_right, feature, threshold, value, children_left[node_id], X) + else: + return predict_tree(children_left, children_right, feature, threshold, value, children_right[node_id], X) + def predict_tree_final(X,): + return predict_tree(children_left, children_right, feature, threshold, value, 0, X,) + return np.apply_along_axis(predict_tree_final, 1, np.array(X)) + +# ---# +def sql_from_binary_tree(X: Union[list, np.ndarray], + children_left: list, + children_right: list, + feature: list, + threshold: list, + value: list, + classes: Union[list, np.ndarray] = [], + return_proba: bool = False, + is_regressor: bool = True,): + """ + --------------------------------------------------------------------------- + Returns the SQL code needed to deploy a binary tree model using its attributes. + + Parameters + ---------- + X: list / numpy.array + Data on which to make the prediction. + children_left: list + A list of node IDs, where children_left[i] is the node id of the left child of node i. + children_right: list + A list of node IDs, children_right[i] is the node id of the right child of node i. + feature: list + A list of features, where feature[i] is the feature to split on for the internal node i. + threshold: list + A list of thresholds, where threshold[i] is the threshold for the internal node i. + value: list + Contains the constant prediction value of each node. If used for classification and if return_proba is set to True, each element of the list must be a sublist + with the probabilities of each classes. + classes: list / numpy.array, optional + The classes for the binary tree model. + return_proba: bool, optional + If set to True, the probability of each class is returned. + is_regressor: bool, optional + If set to True, the parameter 'value' corresponds to the result of + a regression. + + Returns + ------- + str / list + SQL code + """ + check_types([("X", X, [list, np.ndarray,],), + ("children_left", children_left, [list,],), + ("children_right", children_right, [list,],), + ("feature", feature, [list,],), + ("threshold", threshold, [list,],), + ("value", value, [list,],), + ("classes", classes, [list, np.ndarray,],), + ("return_proba", return_proba, [bool,],), + ("is_regressor", is_regressor, [bool,],),]) + def predict_tree(children_left, children_right, feature, threshold, value, node_id, X, prob_ID = 0): + if children_left[node_id] == children_right[node_id]: + if return_proba: + return value[node_id][prob_ID] + else: + if not(is_regressor) and isinstance(classes, Iterable) and len(classes) > 0: + result = classes[np.argmax(value[node_id])] + if isinstance(result, str): + return "'" + result + "'" + else: + return result + else: + return value[node_id] + else: + op = '=' if isinstance(threshold[node_id], str) else '<' + return "(CASE WHEN {} {} '{}' THEN {} ELSE {} END)".format(X[feature[node_id]], + op, threshold[node_id], + predict_tree(children_left, children_right, feature, threshold, value, children_left[node_id], X, prob_ID), + predict_tree(children_left, children_right, feature, threshold, value, children_right[node_id], X, prob_ID)) + if return_proba: + n = max([len(l) if l != None else 0 for l in value]) + return [predict_tree(children_left, children_right, feature, threshold, value, 0, X, i) for i in range(n)] + else: + return predict_tree(children_left, children_right, feature, threshold, value, 0, X,) + +# ---# +def predict_from_coef(X: Union[list, np.ndarray], + coefficients: Union[list, np.ndarray], intercept: float, method: str = "LinearRegression", return_proba: bool = False,): @@ -85,8 +230,8 @@ def predict_from_coef(X: Union[list, np.array], numpy.array Predicted values """ - check_types([("X", X, [list,],), - ("coefficients", coefficients, [list,],), + check_types([("X", X, [list, np.ndarray,],), + ("coefficients", coefficients, [list, np.ndarray,],), ("intercept", intercept, [float, int,],), ("method", method, ["LinearRegression", "LinearSVR", "LogisticRegression", "LinearSVC"],), ("return_proba", return_proba, [bool],),]) @@ -138,10 +283,10 @@ def sql_from_coef(X: list, return sql # ---# -def predict_from_bisecting_kmeans(X: Union[list, np.array], - clusters: Union[list, np.array], - left_child: Union[list, np.array], - right_child: Union[list, np.array], +def predict_from_bisecting_kmeans(X: Union[list, np.ndarray], + clusters: Union[list, np.ndarray], + left_child: Union[list, np.ndarray], + right_child: Union[list, np.ndarray], p: int = 2,): """ --------------------------------------------------------------------------- @@ -167,10 +312,10 @@ def predict_from_bisecting_kmeans(X: Union[list, np.array], numpy.array Predicted values """ - check_types([("X", X, [list,],), - ("clusters", clusters, [list,],), - ("left_child", left_child, [list,],), - ("right_child", right_child, [list,],), + check_types([("X", X, [list, np.ndarray,],), + ("clusters", clusters, [list, np.ndarray,],), + ("left_child", left_child, [list, np.ndarray,],), + ("right_child", right_child, [list, np.ndarray,],), ("p", p, [int,],),]) centroids = np.array(clusters) def predict_tree(right_child, left_child, row, node_id, centroids): @@ -248,11 +393,11 @@ def predict_tree(right_child: list, left_child: list, node_id: int, clusters_dis return sql_final # ---# -def predict_from_clusters(X: Union[list, np.array], - clusters: Union[list, np.array], +def predict_from_clusters(X: Union[list, np.ndarray], + clusters: Union[list, np.ndarray], return_distance_clusters: bool = False, return_proba: bool = False, - classes: Union[list, np.array] = [], + classes: Union[list, np.ndarray] = [], p: int = 2,): """ --------------------------------------------------------------------------- @@ -278,11 +423,11 @@ def predict_from_clusters(X: Union[list, np.array], numpy.array Predicted values """ - check_types([("X", X, [list,],), - ("clusters", clusters, [list,],), + check_types([("X", X, [list, np.ndarray,],), + ("clusters", clusters, [list, np.ndarray,],), ("return_distance_clusters", return_distance_clusters, [bool,],), ("return_proba", return_proba, [bool,],), - ("classes", classes, [list,],), + ("classes", classes, [list, np.ndarray,],), ("p", p, [int,],),]) assert not(return_distance_clusters) or not(return_proba), ParameterError("Parameters 'return_distance_clusters' and 'return_proba' cannot both be set to True.") centroids = np.array(clusters) @@ -377,9 +522,9 @@ def sql_from_clusters(X: list, return sql_final # ---# -def transform_from_pca(X: Union[list, np.array], - principal_components: Union[list, np.array], - mean: Union[list, np.array]): +def transform_from_pca(X: Union[list, np.ndarray], + principal_components: Union[list, np.ndarray], + mean: Union[list, np.ndarray]): """ --------------------------------------------------------------------------- Transforms the data with a PCA model using the input attributes. @@ -398,9 +543,9 @@ def transform_from_pca(X: Union[list, np.array], numpy.array Transformed data """ - check_types([("X", X, [list],), - ("principal_components", principal_components, [list],), - ("mean", mean, [list],),]) + check_types([("X", X, [list, np.ndarray,],), + ("principal_components", principal_components, [list, np.ndarray,],), + ("mean", mean, [list, np.ndarray,],),]) pca_values = np.array(principal_components) result = (X - np.array(mean)) L, n = [], len(principal_components[0]) @@ -682,7 +827,7 @@ class memModel: 'SVD,' 'PCA,' 'BisectingKMeans,' 'KMeans,' 'NaiveBayes,' 'XGBoostClassifier,' 'XGBoostRegressor,' 'RandomForestClassifier,' 'RandomForestRegressor,' 'LinearSVR,' 'LinearSVC,' 'LogisticRegression,' - 'LinearRegression' + 'LinearRegression', 'BinaryTreeRegressor', 'BinaryTreeClassifier' attributes: dict Dictionary which includes all the model's attributes. For OneHotEncoder: {"categories": List of the different feature categories. @@ -709,13 +854,34 @@ class memModel: "mean": List of the input predictors average.} For SVD: {"vectors": Matrix of the right singular vectors. "values": List of the singular values.} - For Normalizer: {"values": List of tuples including the model's attributes. - The required tuple depends on the specified method: - 'zscore': (mean, std) - 'robust_zscore': (median, mad) - 'minmax': (min, max) + For Normalizer: {"values": List of tuples including the model's attributes. + The required tuple depends on the specified method: + 'zscore': (mean, std) + 'robust_zscore': (median, mad) + 'minmax': (min, max) "method": The model's category, one of the following: 'zscore', 'robust_zscore', or 'minmax'.} + For BinaryTreeRegressor, BinaryTreeClassifier: + {children_left: A list of node IDs, where children_left[i] is the node id of the left + A list of node IDs, where child of node i. + children_right: children_right[i] is the node id of the + right child of node i. + feature: A list of features, where feature[i] is the feature to split on, for the internal + node i. + threshold: threshold[i] is the threshold for the internal node i. + value: Contains the constant prediction value of each node. + classes: [Only for Classifier] The classes for the binary tree model.} + For RandomForestClassifier, RandomForestRegressor, XGBoostClassifier, XGBoostRegressor: + {trees: list of memModels of type 'BinaryTreeRegressor' or + 'BinaryTreeClassifier' + learning_rate: [Only for XGBoostClassifier and XGBoostRegressor] + Learning rate. + mean: [Only for XGBoostRegressor] + Average of the response column. + logodds: [Only for XGBoostClassifier] + List of the logodds of the response classes.} + + """ # # Special Methods @@ -737,6 +903,8 @@ def __init__( "XGBoostClassifier", "XGBoostRegressor", "RandomForestClassifier", + "BinaryTreeClassifier", + "BinaryTreeRegressor", "RandomForestRegressor", "LinearSVR", "LinearSVC", @@ -744,7 +912,56 @@ def __init__( "LinearRegression", "NearestCentroids",],),]) attributes_ = {} - if model_type == "OneHotEncoder": + if model_type in ("RandomForestRegressor", "XGBoostRegressor", "RandomForestClassifier", "XGBoostClassifier",): + if ("trees" not in attributes): + raise ParameterError("{}'s attributes must include a list of memModels representing each tree.".format(model_type)) + attributes_["trees"] = [] + for tree in attributes["trees"]: + assert isinstance(tree, memModel), ParameterError("Each tree of the model must be a memModel, found '{}'.".format(type(tree))) + if model_type in ("RandomForestClassifier", "XGBoostClassifier",): + assert tree.model_type_ in ("BinaryTreeClassifier",), ParameterError("Each tree of the model must be a BinaryTreeClassifier, found '{}'.".format(tree.model_type_)) + else: + assert tree.model_type_ in ("BinaryTreeRegressor",), ParameterError("Each tree of the model must be a BinaryTreeRegressor, found '{}'.".format(tree.model_type_)) + attributes_["trees"] += [tree] + represent = "<{}>\n\nntrees = {}".format(model_type, len(attributes_["trees"])) + if model_type == "XGBoostRegressor": + if ("learning_rate" not in attributes or 'mean' not in attributes): + raise ParameterError("{}'s attributes must include the response average and the learning rate.".format(model_type)) + attributes_["mean"] = attributes["mean"] + check_types([("mean", attributes_["mean"], [int, float,],),]) + represent += "\n\nmean = {}".format(attributes_["mean"]) + if model_type == "XGBoostClassifier": + if ("learning_rate" not in attributes or 'logodds' not in attributes): + raise ParameterError("{}'s attributes must include the response classes logodds and the learning rate.".format(model_type)) + attributes_["logodds"] = np.copy(attributes["logodds"]) + check_types([("logodds", attributes_["logodds"], [list,],),]) + represent += "\n\nlogodds = {}".format(attributes_["logodds"]) + if model_type in ("XGBoostRegressor", "XGBoostClassifier",): + attributes_["learning_rate"] = attributes["learning_rate"] + check_types([("learning_rate", attributes_["learning_rate"], [int, float,],),]) + represent += "\n\nlearning_rate = {}".format(attributes_["learning_rate"]) + elif model_type in ("BinaryTreeClassifier", "BinaryTreeRegressor"): + if ("children_left" not in attributes or "children_right" not in attributes or "feature" not in attributes or "threshold" not in attributes or "value" not in attributes): + raise ParameterError("{}'s attributes must include at least the following lists: children_left, children_right, feature, threshold, value.".format(model_type)) + for elem in ("children_left", "children_right", "feature", "threshold", "value",): + if isinstance(attributes[elem], list): + attributes_[elem] = attributes[elem].copy() + else: + attributes_[elem] = np.copy(attributes[elem]) + check_types([("children_left", attributes_["children_left"], [list,],), + ("children_right", attributes_["children_right"], [list,],), + ("feature", attributes_["feature"], [list,],), + ("threshold", attributes_["threshold"], [list,],), + ("value", attributes_["value"], [list,],),]) + represent = "<{}>\n\nchildren_left = {}\n\nchildren_right = {}\n\nfeature = {}\n\nthreshold = {}\n\nvalue =\n{}".format(model_type, attributes_["children_left"], attributes_["children_right"], attributes_["feature"], attributes_["threshold"], attributes_["value"]) + if model_type in ("BinaryTreeClassifier",): + if "classes" not in attributes: + attributes_["classes"] = [] + else: + attributes_["classes"] = np.copy(attributes["classes"]) + check_types([("classes", attributes_["classes"], [list,],),]) + represent += "\n\nclasses = {}".format(attributes_["classes"]) + elif model_type == "OneHotEncoder": if "categories" not in attributes: raise ParameterError("OneHotEncoder's attributes must include a list with all the feature categories for the 'categories' parameter.") attributes_["categories"] = attributes["categories"].copy() @@ -808,7 +1025,7 @@ def __init__( attributes_["mean"] = np.copy(attributes["mean"]) check_types([("principal_components", attributes_["principal_components"], [list,],), ("mean", attributes_["mean"], [list,],),]) - represent = "<{}>\n\nprincipal_components = {}\n\nmean = {}".format(model_type, attributes_["principal_components"], attributes_["mean"]) + represent = "<{}>\n\nprincipal_components = \n{}\n\nmean = {}".format(model_type, attributes_["principal_components"], attributes_["mean"]) elif model_type in ("SVD",): if ("vectors" not in attributes or "values" not in attributes): raise ParameterError("SVD's attributes must include 2 lists: one with all the right singular vectors and one with the singular values of each input feature.") @@ -816,7 +1033,7 @@ def __init__( attributes_["values"] = np.copy(attributes["values"]) check_types([("vectors", attributes_["vectors"], [list,],), ("values", attributes_["values"], [list,],),]) - represent = "<{}>\n\nvectors = {}\n\nvalues = {}".format(model_type, attributes_["vectors"], attributes_["values"]) + represent = "<{}>\n\nvectors = \n{}\n\nvalues = {}".format(model_type, attributes_["vectors"], attributes_["values"]) elif model_type in ("Normalizer",): if ("values" not in attributes or "method" not in attributes): raise ParameterError("Normalizer's attributes must include a list including the model's aggregations and a string representing the model's method.") @@ -826,7 +1043,7 @@ def __init__( ("method", attributes_["method"], ["minmax", "zscore", "robust_zscore",],),]) represent = "<{}>\n\nvalues = {}\n\nmethod = {}".format(model_type, attributes_["values"], attributes_["method"]) else: - raise ParameterError("Model type '{}' is not yet available.".format(self.model_type_)) + raise ParameterError("Model type '{}' is not yet available.".format(model_type)) self.attributes_ = attributes_ self.model_type_ = model_type self.represent_ = represent @@ -888,6 +1105,18 @@ def predict(self, X: list): return predict_from_clusters(X, self.attributes_["clusters"], p=self.attributes_["p"], classes=self.attributes_["classes"]) elif self.model_type_ in ("BisectingKMeans",): return predict_from_bisecting_kmeans(X, self.attributes_["clusters"], self.attributes_["left_child"], self.attributes_["right_child"], p=self.attributes_["p"]) + elif self.model_type_ in ("BinaryTreeRegressor", "BinaryTreeClassifier",): + return predict_from_binary_tree(X, self.attributes_["children_left"], self.attributes_["children_right"], self.attributes_["feature"], self.attributes_["threshold"], self.attributes_["value"], self.attributes_["classes"] if self.model_type_ in ("BinaryTreeClassifier",) else [], is_regressor=self.model_type_ in ("BinaryTreeRegressor",),) + elif self.model_type_ in ("RandomForestRegressor", "XGBoostRegressor",): + result = [tree.predict(X) for tree in self.attributes_["trees"]] + if self.model_type_ in ("RandomForestRegressor",): + return np.average(np.column_stack(result), axis=1) + else: + return np.sum(np.column_stack(result), axis=1) * self.attributes_["learning_rate"] + self.attributes_["mean"] + elif self.model_type_ in ("RandomForestClassifier", "XGBoostClassifier",): + result = np.argmax(self.predict_proba(X), axis=1) + result = np.array([self.attributes_["trees"][0].attributes_["classes"][i] for i in result]) + return result else: raise FunctionError("Method 'predict' is not available for model type '{}'.".format(self.model_type_)) @@ -918,11 +1147,40 @@ def predict_sql(self, X: list): return sql_from_clusters(X, self.attributes_["clusters"], p=self.attributes_["p"], classes=self.attributes_["classes"]) elif self.model_type_ in ("BisectingKMeans",): return sql_from_bisecting_kmeans(X, self.attributes_["clusters"], self.attributes_["left_child"], self.attributes_["right_child"], p=self.attributes_["p"]) + elif self.model_type_ in ("BinaryTreeRegressor", "BinaryTreeClassifier",): + return sql_from_binary_tree(X, self.attributes_["children_left"], self.attributes_["children_right"], self.attributes_["feature"], self.attributes_["threshold"], self.attributes_["value"], self.attributes_["classes"] if self.model_type_ in ("BinaryTreeClassifier",) else [], is_regressor=self.model_type_ in ("BinaryTreeRegressor",),) + elif self.model_type_ in ("RandomForestRegressor", "XGBoostRegressor",): + result = [tree.predict_sql(X) for tree in self.attributes_["trees"]] + if self.model_type_ in ("RandomForestRegressor",): + return "(" + " + ".join(result) + ") / {}".format(len(result)) + else: + return "(" + " + ".join(result) + ") * {} + {}".format(self.attributes_["learning_rate"], self.attributes_["mean"],) + elif self.model_type_ in ("RandomForestClassifier", "XGBoostClassifier",): + classes = self.attributes_["trees"][0].attributes_["classes"] + m = len(classes) + result_proba = self.predict_proba_sql(X,) + if m == 2: + return "(CASE WHEN {} > 0.5 THEN {} ELSE {} END)".format(result_proba[1], classes[1], classes[0]) + else: + sql = [] + for i in range(m): + list_tmp = [] + for j in range(i): + list_tmp += ["{} <= {}".format(result_proba[i], result_proba[j])] + sql += [" AND ".join(list_tmp)] + sql = sql[1:] + sql.reverse() + sql_final = "CASE WHEN {} THEN NULL".format(" OR ".join(["{} IS NULL".format(elem) for elem in X])) + for i in range(m - 1): + class_i = classes[m - i - 1] + sql_final += " WHEN {} THEN {}".format(sql[i], "'{}'".format(class_i) if isinstance(class_i, str) else class_i) + sql_final += " ELSE {} END".format("'{}'".format(classes[0]) if isinstance(classes[0], str) else classes[0]) + return sql_final else: raise FunctionError("Method 'predict_sql' is not available for model type '{}'.".format(self.model_type_)) # ---# - def predict_proba(self, X: list): + def predict_proba(self, X: list,): """ --------------------------------------------------------------------------- Predicts probabilities using the model's attributes. @@ -943,6 +1201,24 @@ def predict_proba(self, X: list): return predict_from_clusters(X, self.attributes_["clusters"], p=self.attributes_["p"], return_proba=True,) elif self.model_type_ in ("NearestCentroids",): return predict_from_clusters(X, self.attributes_["clusters"], p=self.attributes_["p"], classes=self.attributes_["classes"], return_proba=True,) + elif self.model_type_ in ("BinaryTreeClassifier",): + return predict_from_binary_tree(X, self.attributes_["children_left"], self.attributes_["children_right"], self.attributes_["feature"], self.attributes_["threshold"], self.attributes_["value"], self.attributes_["classes"], True, is_regressor=False,) + elif self.model_type_ in ("RandomForestClassifier",): + result, n = 0, len(self.attributes_["trees"]) + for i in range(n): + result_tmp = self.attributes_["trees"][i].predict_proba(X) + result_tmp_arg = np.zeros_like(result_tmp) + result_tmp_arg[np.arange(len(result_tmp)), result_tmp.argmax(1)] = 1 + result += result_tmp_arg + return result / n + elif self.model_type_ in ("XGBoostClassifier",): + result = 0 + for tree in self.attributes_["trees"]: + result += tree.predict_proba(X) + result = self.attributes_["logodds"] + self.attributes_["learning_rate"] * result + result = 1 / (1 + np.exp(- result)) + result /= np.sum(result, axis=1)[:,None] + return result else: raise FunctionError("Method 'predict_proba' is not available for model type '{}'.".format(self.model_type_)) @@ -969,6 +1245,39 @@ def predict_proba_sql(self, X: list): return sql_from_clusters(X, self.attributes_["clusters"], p=self.attributes_["p"], return_proba=True,) elif self.model_type_ in ("NearestCentroids",): return sql_from_clusters(X, self.attributes_["clusters"], p=self.attributes_["p"], classes=self.attributes_["classes"], return_proba=True,) + elif self.model_type_ in ("BinaryTreeClassifier",): + return sql_from_binary_tree(X, self.attributes_["children_left"], self.attributes_["children_right"], self.attributes_["feature"], self.attributes_["threshold"], self.attributes_["value"], self.attributes_["classes"], True, is_regressor=False,) + elif self.model_type_ in ("RandomForestClassifier",): + trees, n, m = [], len(self.attributes_["trees"]), len(self.attributes_["trees"][0].attributes_["classes"]) + for i in range(n): + val = [] + for elem in self.attributes_["trees"][i].attributes_["value"]: + if isinstance(elem, type(None)): + val += [elem] + else: + value_tmp = np.zeros_like([elem]) + value_tmp[np.arange(1), np.array([elem]).argmax(1)] = 1 + val += [list(value_tmp[0])] + tree = memModel("BinaryTreeClassifier", {"children_left": self.attributes_["trees"][i].attributes_["children_left"], + "children_right": self.attributes_["trees"][i].attributes_["children_right"], + "feature": self.attributes_["trees"][i].attributes_["feature"], + "threshold": self.attributes_["trees"][i].attributes_["threshold"], + "value": val, + "classes": self.attributes_["trees"][i].attributes_["classes"],}) + trees += [tree] + result = [trees[i].predict_proba_sql(X) for i in range(n)] + classes_proba = [] + for i in range(m): + classes_proba += ["(" + " + ".join([val[i] for val in result]) + ") / {}".format(n)] + return classes_proba + elif self.model_type_ in ("XGBoostClassifier",): + result, n, m = [], len(self.attributes_["trees"]), len(self.attributes_["trees"][0].attributes_["classes"]) + all_probas = [self.attributes_["trees"][i].predict_proba_sql(X) for i in range(n)] + for i in range(m): + result += ["(1 / (1 + EXP(- ({} + {} * (".format(self.attributes_["logodds"][i], self.attributes_["learning_rate"]) + " + ".join(all_probas[i]) + ")))))"] + sum_result = "(" + " + ".join(result) + ")" + result = [item + " / {}".format(sum_result) for item in result] + return result else: raise FunctionError("Method 'predict_proba_sql' is not available for model type '{}'.".format(self.model_type_)) @@ -1030,3 +1339,35 @@ def transform_sql(self, X: list): else: raise FunctionError("Method 'transform_sql' is not available for model type '{}'.".format(self.model_type_)) + # ---# + def rotate(self, gamma: float = 1.0, q: int = 20, tol: float = 1e-6): + """ + --------------------------------------------------------------------------- + Performs a Oblimin (Varimax, Quartimax) rotation on the the model's + PCA matrix. + + Parameters + ---------- + gamma: float, optional + Oblimin rotation factor, determines the type of rotation. + It must be between 0.0 and 1.0. + gamma = 0.0 results in a Quartimax rotation. + gamma = 1.0 results in a Varimax rotation. + q: int, optional + Maximum number of iterations. + tol: float, optional + The algorithm stops when the Frobenius norm of gradient is less than tol. + + Returns + ------- + self + memModel + """ + from verticapy.learn.tools import matrix_rotation + + if self.model_type_ in ("PCA",): + principal_components = matrix_rotation(self.get_attributes()["principal_components"], gamma, q, tol) + self.set_attributes({"principal_components": principal_components}) + else: + raise FunctionError("Method 'rotate' is not available for model type '{}'.".format(self.model_type_)) + return self diff --git a/verticapy/learn/tools.py b/verticapy/learn/tools.py index 8412e8895..fb075661e 100644 --- a/verticapy/learn/tools.py +++ b/verticapy/learn/tools.py @@ -54,6 +54,8 @@ # Standard Python Modules import numpy as np +from numpy import eye, asarray, dot, sum, diag +from numpy.linalg import svd from typing import Union # @@ -635,4 +637,52 @@ def load_model(name: str, cursor=None, input_relation: str = "", test_relation: model.classes_ = [0, 1] if model_type in ("svm_classifier", "svm_regressor", "logistic_reg", "linear_reg",): model.coef_ = model.get_attr("details") - return model \ No newline at end of file + return model + +# ---# +# This piece of code was taken from +# https://en.wikipedia.org/wiki/Talk:Varimax_rotation +def matrix_rotation(Phi: list, + gamma: float = 1.0, + q: int = 20, + tol: float = 1e-6): + """ +--------------------------------------------------------------------------- +Performs a Oblimin (Varimax, Quartimax) rotation on the the model's +PCA matrix. + +Parameters +---------- +Phi: list / numpy.array + input matrix. +gamma: float, optional + Oblimin rotation factor, determines the type of rotation. + It must be between 0.0 and 1.0. + gamma = 0.0 results in a Quartimax rotation. + gamma = 1.0 results in a Varimax rotation. +q: int, optional + Maximum number of iterations. +tol: float, optional + The algorithm stops when the Frobenius norm of gradient is less than tol. + +Returns +------- +model + The model. + """ + check_types([("Phi", Phi, [list,],), + ("gamma", gamma, [int, float,],), + ("q", q, [int, float,],), + ("tol", tol, [int, float,],),]) + Phi = np.array(Phi) + p,k = Phi.shape + R = eye(k) + d=0 + for i in range(q): + d_old = d + Lambda = dot(Phi, R) + u,s,vh = svd(dot(Phi.T,asarray(Lambda)**3 - (gamma/p) * dot(Lambda, diag(diag(dot(Lambda.T,Lambda)))))) + R = dot(u,vh) + d = sum(s) + if d_old!=0 and d/d_old < 1 + tol: break + return dot(Phi, R) \ No newline at end of file diff --git a/verticapy/learn/vmodel.py b/verticapy/learn/vmodel.py index 38fd86d3e..856ece168 100644 --- a/verticapy/learn/vmodel.py +++ b/verticapy/learn/vmodel.py @@ -51,7 +51,6 @@ # Standard Python Modules import os, warnings import numpy as np -from collections.abc import Iterable from typing import Union # VerticaPy Modules @@ -2387,10 +2386,8 @@ def to_python(self, name: str = "predict", return_proba: bool = False, return_di func = "def {}(X):\n\timport numpy as np\n\t".format(name) if self.type in ("LinearRegression", "LinearSVR", "LogisticRegression", "LinearSVC",): result = "{} + np.sum(np.array({}) * np.array(X), axis=1)".format(self.coef_["coefficient"][0], self.coef_["coefficient"][1:]) - if self.type in ("LogisticRegression",): + if self.type in ("LogisticRegression", "LinearSVC",): func += f"result = 1 / (1 + np.exp(- ({result})))" - elif self.type in ("LinearSVC",): - func += f"result = 1 - 1 / (1 + np.exp({result}))" else: func += "result = " + result if return_proba and self.type in ("LogisticRegression", "LinearSVC",): diff --git a/verticapy/stats/tools.py b/verticapy/stats/tools.py index 7d779cd56..1a521d8a0 100644 --- a/verticapy/stats/tools.py +++ b/verticapy/stats/tools.py @@ -635,16 +635,16 @@ def het_breuschpagan( ): """ --------------------------------------------------------------------------- -Breusch-Pagan test for heteroscedasticity. +Uses the Breusch-Pagan to test a model for heteroskedasticity. Parameters ---------- vdf: vDataFrame Input vDataFrame. eps: str - Input residual vcolumn. + Input residual vColumn. X: list - Exogenous Variables to test the heteroscedasticity on. + The exogenous variables to test. Returns ------- diff --git a/verticapy/tests/vModel/test_memmodel.py b/verticapy/tests/vModel/test_memmodel.py index 8b2bed7a5..37444b23d 100644 --- a/verticapy/tests/vModel/test_memmodel.py +++ b/verticapy/tests/vModel/test_memmodel.py @@ -113,6 +113,12 @@ def test_PCA(self,): assert attributes["principal_components"][0][1] == 0.2 assert attributes["principal_components"][1][0] == 0.7 assert attributes["principal_components"][1][1] == 0.8 + model = model.rotate() + attributes = model.get_attributes() + assert attributes["principal_components"][0][0] == pytest.approx(0.05887149) + assert attributes["principal_components"][0][1] == pytest.approx(0.21571775) + assert attributes["principal_components"][1][0] == pytest.approx(0.01194755) + assert attributes["principal_components"][1][1] == pytest.approx(1.06294744) assert attributes["mean"][0] == 0.9 assert attributes["mean"][1] == 0.8 assert model.model_type_ == "PCA" @@ -339,4 +345,253 @@ def test_BisectingKMeans(self,): assert attributes["p"] == 3 assert model.model_type_ == "BisectingKMeans" + def test_BinaryTreeRegressor(self,): + model = memModel("BinaryTreeRegressor", {"children_left": [1, 3, None, None, None], + "children_right": [2, 4, None, None, None], + "feature": [0, 1, None, None, None], + "threshold": ['female', 30, None, None, None], + "value": [None, None, 3, 11, 1993],}) + prediction = model.predict([['male', 100], ['female', 20] , ['female', 50]]) + assert prediction[0] == pytest.approx(3.0) + assert prediction[1] == pytest.approx(11.0) + assert prediction[2] == pytest.approx(1993.0) + assert model.predict_sql(['sex', 'fare']) == "(CASE WHEN sex = 'female' THEN (CASE WHEN fare < '30' THEN 11 ELSE 1993 END) ELSE 3 END)" + attributes = model.get_attributes() + assert attributes["children_left"][0] == 1 + assert attributes["children_left"][1] == 3 + assert attributes["children_right"][0] == 2 + assert attributes["children_right"][1] == 4 + assert attributes["feature"][0] == 0 + assert attributes["feature"][1] == 1 + assert attributes["threshold"][0] == 'female' + assert attributes["threshold"][1] == 30 + assert attributes["value"][2] == 3 + assert attributes["value"][3] == 11 + assert model.model_type_ == "BinaryTreeRegressor" + + def test_BinaryTreeClassifier(self,): + model = memModel("BinaryTreeClassifier", {"children_left": [1, 3, None, None, None], + "children_right": [2, 4, None, None, None], + "feature": [0, 1, None, None, None], + "threshold": ['female', 30, None, None, None], + "value": [None, None, [0.8, 0.1, 0.1], [0.1, 0.8, 0.1], [0.2, 0.2, 0.6]], + "classes": ['a', 'b', 'c',]}) + prediction = model.predict([['male', 100], ['female', 20] , ['female', 50]]) + assert prediction[0] == 'a' + assert prediction[1] == 'b' + assert prediction[2] == 'c' + assert model.predict_sql(['sex', 'fare']) == "(CASE WHEN sex = 'female' THEN (CASE WHEN fare < '30' THEN 'b' ELSE 'c' END) ELSE 'a' END)" + prediction = model.predict_proba([['male', 100], ['female', 20] , ['female', 50]]) + assert prediction[0][0] == 0.8 + assert prediction[0][1] == 0.1 + assert prediction[0][2] == 0.1 + assert prediction[1][0] == 0.1 + assert prediction[1][1] == 0.8 + assert prediction[1][2] == 0.1 + assert prediction[2][0] == 0.2 + assert prediction[2][1] == 0.2 + assert prediction[2][2] == 0.6 + attributes = model.get_attributes() + assert attributes["children_left"][0] == 1 + assert attributes["children_left"][1] == 3 + assert attributes["children_right"][0] == 2 + assert attributes["children_right"][1] == 4 + assert attributes["feature"][0] == 0 + assert attributes["feature"][1] == 1 + assert attributes["threshold"][0] == 'female' + assert attributes["threshold"][1] == 30 + assert attributes["value"][2][0] == 0.8 + assert attributes["value"][3][0] == 0.1 + model.set_attributes({"classes": [0, 1, 2],}) + attributes = model.get_attributes() + assert attributes["classes"][0] == 0 + assert attributes["classes"][1] == 1 + assert attributes["classes"][2] == 2 + assert model.model_type_ == "BinaryTreeClassifier" + + def test_RandomForestRegressor(self,): + model1 = memModel("BinaryTreeRegressor", {"children_left": [1, 3, None, None, None], + "children_right": [2, 4, None, None, None], + "feature": [0, 1, None, None, None], + "threshold": ['female', 30, None, None, None], + "value": [None, None, 3, 11, 1993],}) + model2 = memModel("BinaryTreeRegressor", {"children_left": [1, 3, None, None, None], + "children_right": [2, 4, None, None, None], + "feature": [0, 1, None, None, None], + "threshold": ['female', 30, None, None, None], + "value": [None, None, -3, -11, -1993],}) + model3 = memModel("BinaryTreeRegressor", {"children_left": [1, 3, None, None, None], + "children_right": [2, 4, None, None, None], + "feature": [0, 1, None, None, None], + "threshold": ['female', 30, None, None, None], + "value": [None, None, 0, 3, 6],}) + model = memModel("RandomForestRegressor", {"trees": [model1, model2, model3]}) + prediction = model.predict([['male', 100], ['female', 20] , ['female', 50]]) + assert prediction[0] == pytest.approx(0.0) + assert prediction[1] == pytest.approx(1.0) + assert prediction[2] == pytest.approx(2.0) + assert model.predict_sql(['sex', 'fare']) == "((CASE WHEN sex = 'female' THEN (CASE WHEN fare < '30' THEN 11 ELSE 1993 END) ELSE 3 END) + (CASE WHEN sex = 'female' THEN (CASE WHEN fare < '30' THEN -11 ELSE -1993 END) ELSE -3 END) + (CASE WHEN sex = 'female' THEN (CASE WHEN fare < '30' THEN 3 ELSE 6 END) ELSE 0 END)) / 3" + attributes = model.get_attributes()["trees"][0].get_attributes() + assert attributes["children_left"][0] == 1 + assert attributes["children_left"][1] == 3 + assert attributes["children_right"][0] == 2 + assert attributes["children_right"][1] == 4 + assert attributes["feature"][0] == 0 + assert attributes["feature"][1] == 1 + assert attributes["threshold"][0] == 'female' + assert attributes["threshold"][1] == 30 + assert attributes["value"][2] == 3 + assert attributes["value"][3] == 11 + assert model.model_type_ == "RandomForestRegressor" + + def test_RandomForestClassifier(self,): + model1 = memModel("BinaryTreeClassifier", {"children_left": [1, 3, None, None, None], + "children_right": [2, 4, None, None, None], + "feature": [0, 1, None, None, None], + "threshold": ['female', 30, None, None, None], + "value": [None, None, [0.8, 0.1, 0.1], [0.1, 0.8, 0.1], [0.1, 0.1, 0.8]], + "classes": ['a', 'b', 'c'],}) + model2 = memModel("BinaryTreeClassifier", {"children_left": [1, 3, None, None, None], + "children_right": [2, 4, None, None, None], + "feature": [0, 1, None, None, None], + "threshold": ['female', 30, None, None, None], + "value": [None, None, [0.7, 0.15, 0.15], [0.2, 0.6, 0.2], [0.2, 0.2, 0.6]], + "classes": ['a', 'b', 'c'],}) + model3 = memModel("BinaryTreeClassifier", {"children_left": [1, 3, None, None, None], + "children_right": [2, 4, None, None, None], + "feature": [0, 1, None, None, None], + "threshold": ['female', 30, None, None, None], + "value": [None, None, [0.3, 0.7, 0.0], [0.0, 0.4, 0.6], [0.9, 0.1, 0.0]], + "classes": ['a', 'b', 'c'],}) + model = memModel("RandomForestClassifier", {"trees": [model1, model2, model3]}) + prediction = model.predict([['male', 100], ['female', 20] , ['female', 50]]) + assert prediction[0] == 'a' + assert prediction[1] == 'b' + assert prediction[2] == 'c' + assert model.predict_sql(['sex', 'fare']) == "CASE WHEN sex IS NULL OR fare IS NULL THEN NULL WHEN ((CASE WHEN sex = 'female' THEN (CASE WHEN fare < '30' THEN 0.0 ELSE 1.0 END) ELSE 0.0 END) + (CASE WHEN sex = 'female' THEN (CASE WHEN fare < '30' THEN 0.0 ELSE 1.0 END) ELSE 0.0 END) + (CASE WHEN sex = 'female' THEN (CASE WHEN fare < '30' THEN 1.0 ELSE 0.0 END) ELSE 0.0 END)) / 3 <= ((CASE WHEN sex = 'female' THEN (CASE WHEN fare < '30' THEN 0.0 ELSE 0.0 END) ELSE 1.0 END) + (CASE WHEN sex = 'female' THEN (CASE WHEN fare < '30' THEN 0.0 ELSE 0.0 END) ELSE 1.0 END) + (CASE WHEN sex = 'female' THEN (CASE WHEN fare < '30' THEN 0.0 ELSE 1.0 END) ELSE 0.0 END)) / 3 AND ((CASE WHEN sex = 'female' THEN (CASE WHEN fare < '30' THEN 0.0 ELSE 1.0 END) ELSE 0.0 END) + (CASE WHEN sex = 'female' THEN (CASE WHEN fare < '30' THEN 0.0 ELSE 1.0 END) ELSE 0.0 END) + (CASE WHEN sex = 'female' THEN (CASE WHEN fare < '30' THEN 1.0 ELSE 0.0 END) ELSE 0.0 END)) / 3 <= ((CASE WHEN sex = 'female' THEN (CASE WHEN fare < '30' THEN 1.0 ELSE 0.0 END) ELSE 0.0 END) + (CASE WHEN sex = 'female' THEN (CASE WHEN fare < '30' THEN 1.0 ELSE 0.0 END) ELSE 0.0 END) + (CASE WHEN sex = 'female' THEN (CASE WHEN fare < '30' THEN 0.0 ELSE 0.0 END) ELSE 1.0 END)) / 3 THEN 'c' WHEN ((CASE WHEN sex = 'female' THEN (CASE WHEN fare < '30' THEN 1.0 ELSE 0.0 END) ELSE 0.0 END) + (CASE WHEN sex = 'female' THEN (CASE WHEN fare < '30' THEN 1.0 ELSE 0.0 END) ELSE 0.0 END) + (CASE WHEN sex = 'female' THEN (CASE WHEN fare < '30' THEN 0.0 ELSE 0.0 END) ELSE 1.0 END)) / 3 <= ((CASE WHEN sex = 'female' THEN (CASE WHEN fare < '30' THEN 0.0 ELSE 0.0 END) ELSE 1.0 END) + (CASE WHEN sex = 'female' THEN (CASE WHEN fare < '30' THEN 0.0 ELSE 0.0 END) ELSE 1.0 END) + (CASE WHEN sex = 'female' THEN (CASE WHEN fare < '30' THEN 0.0 ELSE 1.0 END) ELSE 0.0 END)) / 3 THEN 'b' ELSE 'a' END" + prediction = model.predict_proba([['male', 100], ['female', 20] , ['female', 50]]) + assert prediction[0][0] == pytest.approx(0.66666667) + assert prediction[0][1] == pytest.approx(0.33333333) + assert prediction[0][2] == pytest.approx(0.0) + assert prediction[1][0] == pytest.approx(0.0) + assert prediction[1][1] == pytest.approx(0.66666667) + assert prediction[1][2] == pytest.approx(0.33333333) + assert prediction[2][0] == pytest.approx(0.33333333) + assert prediction[2][1] == pytest.approx(0.0) + assert prediction[2][2] == pytest.approx(0.66666667) + prediction = model.predict_proba_sql(["sex", "fare"]) + assert prediction[0] == "((CASE WHEN sex = 'female' THEN (CASE WHEN fare < '30' THEN 0.0 ELSE 0.0 END) ELSE 1.0 END) + (CASE WHEN sex = 'female' THEN (CASE WHEN fare < '30' THEN 0.0 ELSE 0.0 END) ELSE 1.0 END) + (CASE WHEN sex = 'female' THEN (CASE WHEN fare < '30' THEN 0.0 ELSE 1.0 END) ELSE 0.0 END)) / 3" + assert prediction[1] == "((CASE WHEN sex = 'female' THEN (CASE WHEN fare < '30' THEN 1.0 ELSE 0.0 END) ELSE 0.0 END) + (CASE WHEN sex = 'female' THEN (CASE WHEN fare < '30' THEN 1.0 ELSE 0.0 END) ELSE 0.0 END) + (CASE WHEN sex = 'female' THEN (CASE WHEN fare < '30' THEN 0.0 ELSE 0.0 END) ELSE 1.0 END)) / 3" + assert prediction[2] == "((CASE WHEN sex = 'female' THEN (CASE WHEN fare < '30' THEN 0.0 ELSE 1.0 END) ELSE 0.0 END) + (CASE WHEN sex = 'female' THEN (CASE WHEN fare < '30' THEN 0.0 ELSE 1.0 END) ELSE 0.0 END) + (CASE WHEN sex = 'female' THEN (CASE WHEN fare < '30' THEN 1.0 ELSE 0.0 END) ELSE 0.0 END)) / 3" + attributes = model.get_attributes()["trees"][0].get_attributes() + assert attributes["children_left"][0] == 1 + assert attributes["children_left"][1] == 3 + assert attributes["children_right"][0] == 2 + assert attributes["children_right"][1] == 4 + assert attributes["feature"][0] == 0 + assert attributes["feature"][1] == 1 + assert attributes["threshold"][0] == 'female' + assert attributes["threshold"][1] == 30 + assert attributes["value"][2][0] == 0.8 + assert attributes["value"][3][0] == 0.1 + assert model.model_type_ == "RandomForestClassifier" + + def test_XGBoostRegressor(self,): + model1 = memModel("BinaryTreeRegressor", {"children_left": [1, 3, None, None, None], + "children_right": [2, 4, None, None, None], + "feature": [0, 1, None, None, None], + "threshold": ['female', 30, None, None, None], + "value": [None, None, 3, 11, 1993],}) + model2 = memModel("BinaryTreeRegressor", {"children_left": [1, 3, None, None, None], + "children_right": [2, 4, None, None, None], + "feature": [0, 1, None, None, None], + "threshold": ['female', 30, None, None, None], + "value": [None, None, -3, -11, -1993],}) + model3 = memModel("BinaryTreeRegressor", {"children_left": [1, 3, None, None, None], + "children_right": [2, 4, None, None, None], + "feature": [0, 1, None, None, None], + "threshold": ['female', 30, None, None, None], + "value": [None, None, 0, 3, 6],}) + model = memModel("XGBoostRegressor", {"trees": [model1, model2, model3], + "learning_rate": 0.1, + "mean": 1.0}) + prediction = model.predict([['male', 100], ['female', 20] , ['female', 50]]) + assert prediction[0] == pytest.approx(1.0) + assert prediction[1] == pytest.approx(1.3) + assert prediction[2] == pytest.approx(1.6) + assert model.predict_sql(['sex', 'fare']) == "((CASE WHEN sex = 'female' THEN (CASE WHEN fare < '30' THEN 11 ELSE 1993 END) ELSE 3 END) + (CASE WHEN sex = 'female' THEN (CASE WHEN fare < '30' THEN -11 ELSE -1993 END) ELSE -3 END) + (CASE WHEN sex = 'female' THEN (CASE WHEN fare < '30' THEN 3 ELSE 6 END) ELSE 0 END)) * 0.1 + 1.0" + attributes = model.get_attributes()["trees"][0].get_attributes() + assert attributes["children_left"][0] == 1 + assert attributes["children_left"][1] == 3 + assert attributes["children_right"][0] == 2 + assert attributes["children_right"][1] == 4 + assert attributes["feature"][0] == 0 + assert attributes["feature"][1] == 1 + assert attributes["threshold"][0] == 'female' + assert attributes["threshold"][1] == 30 + assert attributes["value"][2] == 3 + assert attributes["value"][3] == 11 + attributes = model.get_attributes() + assert attributes["learning_rate"] == 0.1 + assert attributes["mean"] == 1.0 + model.set_attributes({"learning_rate": 0.2, "mean": 2.0}) + attributes = model.get_attributes() + assert attributes["learning_rate"] == 0.2 + assert attributes["mean"] == 2.0 + assert model.model_type_ == "XGBoostRegressor" + + def test_XGBoostClassifier(self,): + model1 = memModel("BinaryTreeClassifier", {"children_left": [1, 3, None, None, None], + "children_right": [2, 4, None, None, None], + "feature": [0, 1, None, None, None], + "threshold": ['female', 30, None, None, None], + "value": [None, None, [0.8, 0.1, 0.1], [0.1, 0.8, 0.1], [0.1, 0.1, 0.8]], + "classes": ['a', 'b', 'c'],}) + model2 = memModel("BinaryTreeClassifier", {"children_left": [1, 3, None, None, None], + "children_right": [2, 4, None, None, None], + "feature": [0, 1, None, None, None], + "threshold": ['female', 30, None, None, None], + "value": [None, None, [0.7, 0.15, 0.15], [0.2, 0.6, 0.2], [0.2, 0.2, 0.6]], + "classes": ['a', 'b', 'c'],}) + model3 = memModel("BinaryTreeClassifier", {"children_left": [1, 3, None, None, None], + "children_right": [2, 4, None, None, None], + "feature": [0, 1, None, None, None], + "threshold": ['female', 30, None, None, None], + "value": [None, None, [0.3, 0.7, 0.0], [0.0, 0.4, 0.6], [0.9, 0.1, 0.0]], + "classes": ['a', 'b', 'c'],}) + model = memModel("XGBoostClassifier", {"trees": [model1, model2, model3], + "learning_rate": 0.1, + "logodds": [0.1, 0.12, 0.15]}) + prediction = model.predict([['male', 100], ['female', 20] , ['female', 50]]) + assert prediction[0] == 'a' + assert prediction[1] == 'b' + assert prediction[2] == 'c' + assert model.predict_sql(['sex', 'fare']) == "CASE WHEN sex IS NULL OR fare IS NULL THEN NULL WHEN (1 / (1 + EXP(- (0.15 + 0.1 * ((CASE WHEN sex = 'female' THEN (CASE WHEN fare < '30' THEN 0.0 ELSE 0.9 END) ELSE 0.3 END) + (CASE WHEN sex = 'female' THEN (CASE WHEN fare < '30' THEN 0.4 ELSE 0.1 END) ELSE 0.7 END) + (CASE WHEN sex = 'female' THEN (CASE WHEN fare < '30' THEN 0.6 ELSE 0.0 END) ELSE 0.0 END)))))) / ((1 / (1 + EXP(- (0.1 + 0.1 * ((CASE WHEN sex = 'female' THEN (CASE WHEN fare < '30' THEN 0.1 ELSE 0.1 END) ELSE 0.8 END) + (CASE WHEN sex = 'female' THEN (CASE WHEN fare < '30' THEN 0.8 ELSE 0.1 END) ELSE 0.1 END) + (CASE WHEN sex = 'female' THEN (CASE WHEN fare < '30' THEN 0.1 ELSE 0.8 END) ELSE 0.1 END)))))) + (1 / (1 + EXP(- (0.12 + 0.1 * ((CASE WHEN sex = 'female' THEN (CASE WHEN fare < '30' THEN 0.2 ELSE 0.2 END) ELSE 0.7 END) + (CASE WHEN sex = 'female' THEN (CASE WHEN fare < '30' THEN 0.6 ELSE 0.2 END) ELSE 0.15 END) + (CASE WHEN sex = 'female' THEN (CASE WHEN fare < '30' THEN 0.2 ELSE 0.6 END) ELSE 0.15 END)))))) + (1 / (1 + EXP(- (0.15 + 0.1 * ((CASE WHEN sex = 'female' THEN (CASE WHEN fare < '30' THEN 0.0 ELSE 0.9 END) ELSE 0.3 END) + (CASE WHEN sex = 'female' THEN (CASE WHEN fare < '30' THEN 0.4 ELSE 0.1 END) ELSE 0.7 END) + (CASE WHEN sex = 'female' THEN (CASE WHEN fare < '30' THEN 0.6 ELSE 0.0 END) ELSE 0.0 END))))))) <= (1 / (1 + EXP(- (0.1 + 0.1 * ((CASE WHEN sex = 'female' THEN (CASE WHEN fare < '30' THEN 0.1 ELSE 0.1 END) ELSE 0.8 END) + (CASE WHEN sex = 'female' THEN (CASE WHEN fare < '30' THEN 0.8 ELSE 0.1 END) ELSE 0.1 END) + (CASE WHEN sex = 'female' THEN (CASE WHEN fare < '30' THEN 0.1 ELSE 0.8 END) ELSE 0.1 END)))))) / ((1 / (1 + EXP(- (0.1 + 0.1 * ((CASE WHEN sex = 'female' THEN (CASE WHEN fare < '30' THEN 0.1 ELSE 0.1 END) ELSE 0.8 END) + (CASE WHEN sex = 'female' THEN (CASE WHEN fare < '30' THEN 0.8 ELSE 0.1 END) ELSE 0.1 END) + (CASE WHEN sex = 'female' THEN (CASE WHEN fare < '30' THEN 0.1 ELSE 0.8 END) ELSE 0.1 END)))))) + (1 / (1 + EXP(- (0.12 + 0.1 * ((CASE WHEN sex = 'female' THEN (CASE WHEN fare < '30' THEN 0.2 ELSE 0.2 END) ELSE 0.7 END) + (CASE WHEN sex = 'female' THEN (CASE WHEN fare < '30' THEN 0.6 ELSE 0.2 END) ELSE 0.15 END) + (CASE WHEN sex = 'female' THEN (CASE WHEN fare < '30' THEN 0.2 ELSE 0.6 END) ELSE 0.15 END)))))) + (1 / (1 + EXP(- (0.15 + 0.1 * ((CASE WHEN sex = 'female' THEN (CASE WHEN fare < '30' THEN 0.0 ELSE 0.9 END) ELSE 0.3 END) + (CASE WHEN sex = 'female' THEN (CASE WHEN fare < '30' THEN 0.4 ELSE 0.1 END) ELSE 0.7 END) + (CASE WHEN sex = 'female' THEN (CASE WHEN fare < '30' THEN 0.6 ELSE 0.0 END) ELSE 0.0 END))))))) AND (1 / (1 + EXP(- (0.15 + 0.1 * ((CASE WHEN sex = 'female' THEN (CASE WHEN fare < '30' THEN 0.0 ELSE 0.9 END) ELSE 0.3 END) + (CASE WHEN sex = 'female' THEN (CASE WHEN fare < '30' THEN 0.4 ELSE 0.1 END) ELSE 0.7 END) + (CASE WHEN sex = 'female' THEN (CASE WHEN fare < '30' THEN 0.6 ELSE 0.0 END) ELSE 0.0 END)))))) / ((1 / (1 + EXP(- (0.1 + 0.1 * ((CASE WHEN sex = 'female' THEN (CASE WHEN fare < '30' THEN 0.1 ELSE 0.1 END) ELSE 0.8 END) + (CASE WHEN sex = 'female' THEN (CASE WHEN fare < '30' THEN 0.8 ELSE 0.1 END) ELSE 0.1 END) + (CASE WHEN sex = 'female' THEN (CASE WHEN fare < '30' THEN 0.1 ELSE 0.8 END) ELSE 0.1 END)))))) + (1 / (1 + EXP(- (0.12 + 0.1 * ((CASE WHEN sex = 'female' THEN (CASE WHEN fare < '30' THEN 0.2 ELSE 0.2 END) ELSE 0.7 END) + (CASE WHEN sex = 'female' THEN (CASE WHEN fare < '30' THEN 0.6 ELSE 0.2 END) ELSE 0.15 END) + (CASE WHEN sex = 'female' THEN (CASE WHEN fare < '30' THEN 0.2 ELSE 0.6 END) ELSE 0.15 END)))))) + (1 / (1 + EXP(- (0.15 + 0.1 * ((CASE WHEN sex = 'female' THEN (CASE WHEN fare < '30' THEN 0.0 ELSE 0.9 END) ELSE 0.3 END) + (CASE WHEN sex = 'female' THEN (CASE WHEN fare < '30' THEN 0.4 ELSE 0.1 END) ELSE 0.7 END) + (CASE WHEN sex = 'female' THEN (CASE WHEN fare < '30' THEN 0.6 ELSE 0.0 END) ELSE 0.0 END))))))) <= (1 / (1 + EXP(- (0.12 + 0.1 * ((CASE WHEN sex = 'female' THEN (CASE WHEN fare < '30' THEN 0.2 ELSE 0.2 END) ELSE 0.7 END) + (CASE WHEN sex = 'female' THEN (CASE WHEN fare < '30' THEN 0.6 ELSE 0.2 END) ELSE 0.15 END) + (CASE WHEN sex = 'female' THEN (CASE WHEN fare < '30' THEN 0.2 ELSE 0.6 END) ELSE 0.15 END)))))) / ((1 / (1 + EXP(- (0.1 + 0.1 * ((CASE WHEN sex = 'female' THEN (CASE WHEN fare < '30' THEN 0.1 ELSE 0.1 END) ELSE 0.8 END) + (CASE WHEN sex = 'female' THEN (CASE WHEN fare < '30' THEN 0.8 ELSE 0.1 END) ELSE 0.1 END) + (CASE WHEN sex = 'female' THEN (CASE WHEN fare < '30' THEN 0.1 ELSE 0.8 END) ELSE 0.1 END)))))) + (1 / (1 + EXP(- (0.12 + 0.1 * ((CASE WHEN sex = 'female' THEN (CASE WHEN fare < '30' THEN 0.2 ELSE 0.2 END) ELSE 0.7 END) + (CASE WHEN sex = 'female' THEN (CASE WHEN fare < '30' THEN 0.6 ELSE 0.2 END) ELSE 0.15 END) + (CASE WHEN sex = 'female' THEN (CASE WHEN fare < '30' THEN 0.2 ELSE 0.6 END) ELSE 0.15 END)))))) + (1 / (1 + EXP(- (0.15 + 0.1 * ((CASE WHEN sex = 'female' THEN (CASE WHEN fare < '30' THEN 0.0 ELSE 0.9 END) ELSE 0.3 END) + (CASE WHEN sex = 'female' THEN (CASE WHEN fare < '30' THEN 0.4 ELSE 0.1 END) ELSE 0.7 END) + (CASE WHEN sex = 'female' THEN (CASE WHEN fare < '30' THEN 0.6 ELSE 0.0 END) ELSE 0.0 END))))))) THEN 'c' WHEN (1 / (1 + EXP(- (0.12 + 0.1 * ((CASE WHEN sex = 'female' THEN (CASE WHEN fare < '30' THEN 0.2 ELSE 0.2 END) ELSE 0.7 END) + (CASE WHEN sex = 'female' THEN (CASE WHEN fare < '30' THEN 0.6 ELSE 0.2 END) ELSE 0.15 END) + (CASE WHEN sex = 'female' THEN (CASE WHEN fare < '30' THEN 0.2 ELSE 0.6 END) ELSE 0.15 END)))))) / ((1 / (1 + EXP(- (0.1 + 0.1 * ((CASE WHEN sex = 'female' THEN (CASE WHEN fare < '30' THEN 0.1 ELSE 0.1 END) ELSE 0.8 END) + (CASE WHEN sex = 'female' THEN (CASE WHEN fare < '30' THEN 0.8 ELSE 0.1 END) ELSE 0.1 END) + (CASE WHEN sex = 'female' THEN (CASE WHEN fare < '30' THEN 0.1 ELSE 0.8 END) ELSE 0.1 END)))))) + (1 / (1 + EXP(- (0.12 + 0.1 * ((CASE WHEN sex = 'female' THEN (CASE WHEN fare < '30' THEN 0.2 ELSE 0.2 END) ELSE 0.7 END) + (CASE WHEN sex = 'female' THEN (CASE WHEN fare < '30' THEN 0.6 ELSE 0.2 END) ELSE 0.15 END) + (CASE WHEN sex = 'female' THEN (CASE WHEN fare < '30' THEN 0.2 ELSE 0.6 END) ELSE 0.15 END)))))) + (1 / (1 + EXP(- (0.15 + 0.1 * ((CASE WHEN sex = 'female' THEN (CASE WHEN fare < '30' THEN 0.0 ELSE 0.9 END) ELSE 0.3 END) + (CASE WHEN sex = 'female' THEN (CASE WHEN fare < '30' THEN 0.4 ELSE 0.1 END) ELSE 0.7 END) + (CASE WHEN sex = 'female' THEN (CASE WHEN fare < '30' THEN 0.6 ELSE 0.0 END) ELSE 0.0 END))))))) <= (1 / (1 + EXP(- (0.1 + 0.1 * ((CASE WHEN sex = 'female' THEN (CASE WHEN fare < '30' THEN 0.1 ELSE 0.1 END) ELSE 0.8 END) + (CASE WHEN sex = 'female' THEN (CASE WHEN fare < '30' THEN 0.8 ELSE 0.1 END) ELSE 0.1 END) + (CASE WHEN sex = 'female' THEN (CASE WHEN fare < '30' THEN 0.1 ELSE 0.8 END) ELSE 0.1 END)))))) / ((1 / (1 + EXP(- (0.1 + 0.1 * ((CASE WHEN sex = 'female' THEN (CASE WHEN fare < '30' THEN 0.1 ELSE 0.1 END) ELSE 0.8 END) + (CASE WHEN sex = 'female' THEN (CASE WHEN fare < '30' THEN 0.8 ELSE 0.1 END) ELSE 0.1 END) + (CASE WHEN sex = 'female' THEN (CASE WHEN fare < '30' THEN 0.1 ELSE 0.8 END) ELSE 0.1 END)))))) + (1 / (1 + EXP(- (0.12 + 0.1 * ((CASE WHEN sex = 'female' THEN (CASE WHEN fare < '30' THEN 0.2 ELSE 0.2 END) ELSE 0.7 END) + (CASE WHEN sex = 'female' THEN (CASE WHEN fare < '30' THEN 0.6 ELSE 0.2 END) ELSE 0.15 END) + (CASE WHEN sex = 'female' THEN (CASE WHEN fare < '30' THEN 0.2 ELSE 0.6 END) ELSE 0.15 END)))))) + (1 / (1 + EXP(- (0.15 + 0.1 * ((CASE WHEN sex = 'female' THEN (CASE WHEN fare < '30' THEN 0.0 ELSE 0.9 END) ELSE 0.3 END) + (CASE WHEN sex = 'female' THEN (CASE WHEN fare < '30' THEN 0.4 ELSE 0.1 END) ELSE 0.7 END) + (CASE WHEN sex = 'female' THEN (CASE WHEN fare < '30' THEN 0.6 ELSE 0.0 END) ELSE 0.0 END))))))) THEN 'b' ELSE 'a' END" + prediction = model.predict_proba([['male', 100], ['female', 20] , ['female', 50]]) + assert prediction[0][0] == pytest.approx(0.34171499) + assert prediction[0][1] == pytest.approx(0.33211396) + assert prediction[0][2] == pytest.approx(0.32617105) + assert prediction[1][0] == pytest.approx(0.31948336) + assert prediction[1][1] == pytest.approx(0.34467713) + assert prediction[1][2] == pytest.approx(0.33583951) + assert prediction[2][0] == pytest.approx(0.33286283) + assert prediction[2][1] == pytest.approx(0.32394435) + assert prediction[2][2] == pytest.approx(0.34319282) + prediction = model.predict_proba_sql(["sex", "fare"]) + assert prediction[0] == "(1 / (1 + EXP(- (0.1 + 0.1 * ((CASE WHEN sex = 'female' THEN (CASE WHEN fare < '30' THEN 0.1 ELSE 0.1 END) ELSE 0.8 END) + (CASE WHEN sex = 'female' THEN (CASE WHEN fare < '30' THEN 0.8 ELSE 0.1 END) ELSE 0.1 END) + (CASE WHEN sex = 'female' THEN (CASE WHEN fare < '30' THEN 0.1 ELSE 0.8 END) ELSE 0.1 END)))))) / ((1 / (1 + EXP(- (0.1 + 0.1 * ((CASE WHEN sex = 'female' THEN (CASE WHEN fare < '30' THEN 0.1 ELSE 0.1 END) ELSE 0.8 END) + (CASE WHEN sex = 'female' THEN (CASE WHEN fare < '30' THEN 0.8 ELSE 0.1 END) ELSE 0.1 END) + (CASE WHEN sex = 'female' THEN (CASE WHEN fare < '30' THEN 0.1 ELSE 0.8 END) ELSE 0.1 END)))))) + (1 / (1 + EXP(- (0.12 + 0.1 * ((CASE WHEN sex = 'female' THEN (CASE WHEN fare < '30' THEN 0.2 ELSE 0.2 END) ELSE 0.7 END) + (CASE WHEN sex = 'female' THEN (CASE WHEN fare < '30' THEN 0.6 ELSE 0.2 END) ELSE 0.15 END) + (CASE WHEN sex = 'female' THEN (CASE WHEN fare < '30' THEN 0.2 ELSE 0.6 END) ELSE 0.15 END)))))) + (1 / (1 + EXP(- (0.15 + 0.1 * ((CASE WHEN sex = 'female' THEN (CASE WHEN fare < '30' THEN 0.0 ELSE 0.9 END) ELSE 0.3 END) + (CASE WHEN sex = 'female' THEN (CASE WHEN fare < '30' THEN 0.4 ELSE 0.1 END) ELSE 0.7 END) + (CASE WHEN sex = 'female' THEN (CASE WHEN fare < '30' THEN 0.6 ELSE 0.0 END) ELSE 0.0 END)))))))" + assert prediction[1] == "(1 / (1 + EXP(- (0.12 + 0.1 * ((CASE WHEN sex = 'female' THEN (CASE WHEN fare < '30' THEN 0.2 ELSE 0.2 END) ELSE 0.7 END) + (CASE WHEN sex = 'female' THEN (CASE WHEN fare < '30' THEN 0.6 ELSE 0.2 END) ELSE 0.15 END) + (CASE WHEN sex = 'female' THEN (CASE WHEN fare < '30' THEN 0.2 ELSE 0.6 END) ELSE 0.15 END)))))) / ((1 / (1 + EXP(- (0.1 + 0.1 * ((CASE WHEN sex = 'female' THEN (CASE WHEN fare < '30' THEN 0.1 ELSE 0.1 END) ELSE 0.8 END) + (CASE WHEN sex = 'female' THEN (CASE WHEN fare < '30' THEN 0.8 ELSE 0.1 END) ELSE 0.1 END) + (CASE WHEN sex = 'female' THEN (CASE WHEN fare < '30' THEN 0.1 ELSE 0.8 END) ELSE 0.1 END)))))) + (1 / (1 + EXP(- (0.12 + 0.1 * ((CASE WHEN sex = 'female' THEN (CASE WHEN fare < '30' THEN 0.2 ELSE 0.2 END) ELSE 0.7 END) + (CASE WHEN sex = 'female' THEN (CASE WHEN fare < '30' THEN 0.6 ELSE 0.2 END) ELSE 0.15 END) + (CASE WHEN sex = 'female' THEN (CASE WHEN fare < '30' THEN 0.2 ELSE 0.6 END) ELSE 0.15 END)))))) + (1 / (1 + EXP(- (0.15 + 0.1 * ((CASE WHEN sex = 'female' THEN (CASE WHEN fare < '30' THEN 0.0 ELSE 0.9 END) ELSE 0.3 END) + (CASE WHEN sex = 'female' THEN (CASE WHEN fare < '30' THEN 0.4 ELSE 0.1 END) ELSE 0.7 END) + (CASE WHEN sex = 'female' THEN (CASE WHEN fare < '30' THEN 0.6 ELSE 0.0 END) ELSE 0.0 END)))))))" + assert prediction[2] == "(1 / (1 + EXP(- (0.15 + 0.1 * ((CASE WHEN sex = 'female' THEN (CASE WHEN fare < '30' THEN 0.0 ELSE 0.9 END) ELSE 0.3 END) + (CASE WHEN sex = 'female' THEN (CASE WHEN fare < '30' THEN 0.4 ELSE 0.1 END) ELSE 0.7 END) + (CASE WHEN sex = 'female' THEN (CASE WHEN fare < '30' THEN 0.6 ELSE 0.0 END) ELSE 0.0 END)))))) / ((1 / (1 + EXP(- (0.1 + 0.1 * ((CASE WHEN sex = 'female' THEN (CASE WHEN fare < '30' THEN 0.1 ELSE 0.1 END) ELSE 0.8 END) + (CASE WHEN sex = 'female' THEN (CASE WHEN fare < '30' THEN 0.8 ELSE 0.1 END) ELSE 0.1 END) + (CASE WHEN sex = 'female' THEN (CASE WHEN fare < '30' THEN 0.1 ELSE 0.8 END) ELSE 0.1 END)))))) + (1 / (1 + EXP(- (0.12 + 0.1 * ((CASE WHEN sex = 'female' THEN (CASE WHEN fare < '30' THEN 0.2 ELSE 0.2 END) ELSE 0.7 END) + (CASE WHEN sex = 'female' THEN (CASE WHEN fare < '30' THEN 0.6 ELSE 0.2 END) ELSE 0.15 END) + (CASE WHEN sex = 'female' THEN (CASE WHEN fare < '30' THEN 0.2 ELSE 0.6 END) ELSE 0.15 END)))))) + (1 / (1 + EXP(- (0.15 + 0.1 * ((CASE WHEN sex = 'female' THEN (CASE WHEN fare < '30' THEN 0.0 ELSE 0.9 END) ELSE 0.3 END) + (CASE WHEN sex = 'female' THEN (CASE WHEN fare < '30' THEN 0.4 ELSE 0.1 END) ELSE 0.7 END) + (CASE WHEN sex = 'female' THEN (CASE WHEN fare < '30' THEN 0.6 ELSE 0.0 END) ELSE 0.0 END)))))))" + attributes = model.get_attributes()["trees"][0].get_attributes() + assert attributes["children_left"][0] == 1 + assert attributes["children_left"][1] == 3 + assert attributes["children_right"][0] == 2 + assert attributes["children_right"][1] == 4 + assert attributes["feature"][0] == 0 + assert attributes["feature"][1] == 1 + assert attributes["threshold"][0] == 'female' + assert attributes["threshold"][1] == 30 + assert attributes["value"][2][0] == 0.8 + assert attributes["value"][3][0] == 0.1 + assert model.model_type_ == "XGBoostClassifier" + diff --git a/verticapy/tests/vModel/test_tools.py b/verticapy/tests/vModel/test_tools.py index 5ddb31af9..861287616 100755 --- a/verticapy/tests/vModel/test_tools.py +++ b/verticapy/tests/vModel/test_tools.py @@ -24,6 +24,7 @@ from verticapy.learn.decomposition import * from verticapy.learn.preprocessing import * from verticapy.learn.tsa import * +from verticapy.learn.tools import * import matplotlib.pyplot as plt @@ -291,5 +292,15 @@ def test_load_model(self, base, titanic_vd): #model.drop() base.cursor.execute("DROP SCHEMA load_model_test CASCADE") - + def test_matrix_rotation(self,): + result = matrix_rotation([[0.5, 0.6], [0.1, 0.2]]) + assert result[0][0] == pytest.approx(0.01539405) + assert result[0][1] == pytest.approx(0.78087324) + assert result[1][0] == pytest.approx(0.05549495) + assert result[1][1] == pytest.approx(0.21661097) + result = matrix_rotation([[0.5, 0.6], [0.1, 0.2]], gamma=0.0) + assert result[0][0] == pytest.approx(0.0010429389547800816) + assert result[0][1] == pytest.approx(0.78102427) + assert result[1][0] == pytest.approx(-0.05092405) + assert result[1][1] == pytest.approx(0.21773089) \ No newline at end of file