From 18800d585515611b37905486bda57adfc917445a Mon Sep 17 00:00:00 2001
From: Sean McKay <sean.mckay@freshconsulting.com>
Date: Tue, 14 Mar 2023 10:58:12 -0700
Subject: [PATCH] Add optional "should_preprocess" flag to fit() so sparse
 matrices can skip the preprocessing step.

---
 lazypredict/Supervised.py | 130 ++++++++++++++++++++------------------
 1 file changed, 67 insertions(+), 63 deletions(-)

diff --git a/lazypredict/Supervised.py b/lazypredict/Supervised.py
index bf61208..5739720 100644
--- a/lazypredict/Supervised.py
+++ b/lazypredict/Supervised.py
@@ -40,9 +40,9 @@
     "GaussianProcessClassifier",
     "HistGradientBoostingClassifier",
     "MLPClassifier",
-    "LogisticRegressionCV", 
-    "MultiOutputClassifier", 
-    "MultinomialNB", 
+    "LogisticRegressionCV",
+    "MultiOutputClassifier",
+    "MultinomialNB",
     "OneVsOneClassifier",
     "OneVsRestClassifier",
     "OutputCodeClassifier",
@@ -52,20 +52,20 @@
 
 removed_regressors = [
     "TheilSenRegressor",
-    "ARDRegression", 
-    "CCA", 
-    "IsotonicRegression", 
+    "ARDRegression",
+    "CCA",
+    "IsotonicRegression",
     "StackingRegressor",
-    "MultiOutputRegressor", 
-    "MultiTaskElasticNet", 
-    "MultiTaskElasticNetCV", 
-    "MultiTaskLasso", 
-    "MultiTaskLassoCV", 
-    "PLSCanonical", 
-    "PLSRegression", 
-    "RadiusNeighborsRegressor", 
-    "RegressorChain", 
-    "VotingRegressor", 
+    "MultiOutputRegressor",
+    "MultiTaskElasticNet",
+    "MultiTaskElasticNetCV",
+    "MultiTaskLasso",
+    "MultiTaskLassoCV",
+    "PLSCanonical",
+    "PLSRegression",
+    "RadiusNeighborsRegressor",
+    "RegressorChain",
+    "VotingRegressor",
 ]
 
 CLASSIFIERS = [
@@ -219,7 +219,7 @@ def __init__(
         self.random_state = random_state
         self.classifiers = classifiers
 
-    def fit(self, X_train, X_test, y_train, y_test):
+    def fit(self, X_train, X_test, y_train, y_test, should_preprocess: bool = True):
         """Fit Classification algorithms to X_train and y_train, predict and score on X_test, y_test.
         Parameters
         ----------
@@ -235,6 +235,9 @@ def fit(self, X_train, X_test, y_train, y_test):
         y_test : array-like,
             Testing vectors, where rows is the number of samples
             and columns is the number of features.
+        should_preprocess : bool, default True
+            Whether to apply the column preprocessing step before each model.
+            Set this to False if your matrix is sparse.
         Returns
         -------
         scores : Pandas DataFrame
@@ -257,20 +260,22 @@ def fit(self, X_train, X_test, y_train, y_test):
             X_train = pd.DataFrame(X_train)
             X_test = pd.DataFrame(X_test)
 
-        numeric_features = X_train.select_dtypes(include=[np.number]).columns
-        categorical_features = X_train.select_dtypes(include=["object"]).columns
+        preprocessor = None
+        if should_preprocess:
+            numeric_features = X_train.select_dtypes(include=[np.number]).columns
+            categorical_features = X_train.select_dtypes(include=["object"]).columns
 
-        categorical_low, categorical_high = get_card_split(
-            X_train, categorical_features
-        )
+            categorical_low, categorical_high = get_card_split(
+                X_train, categorical_features
+            )
 
-        preprocessor = ColumnTransformer(
-            transformers=[
-                ("numeric", numeric_transformer, numeric_features),
-                ("categorical_low", categorical_transformer_low, categorical_low),
-                ("categorical_high", categorical_transformer_high, categorical_high),
-            ]
-        )
+            preprocessor = ColumnTransformer(
+                transformers=[
+                    ("numeric", numeric_transformer, numeric_features),
+                    ("categorical_low", categorical_transformer_low, categorical_low),
+                    ("categorical_high", categorical_transformer_high, categorical_high),
+                ]
+            )
 
         if self.classifiers == "all":
             self.classifiers = CLASSIFIERS
@@ -288,17 +293,14 @@ def fit(self, X_train, X_test, y_train, y_test):
         for name, model in tqdm(self.classifiers):
             start = time.time()
             try:
+                steps = []
+                if should_preprocess:
+                    steps.append(("preprocessor", preprocessor))
                 if "random_state" in model().get_params().keys():
-                    pipe = Pipeline(
-                        steps=[
-                            ("preprocessor", preprocessor),
-                            ("classifier", model(random_state=self.random_state)),
-                        ]
-                    )
+                    steps.append(("classifier", model(random_state=self.random_state)))
                 else:
-                    pipe = Pipeline(
-                        steps=[("preprocessor", preprocessor), ("classifier", model())]
-                    )
+                    steps.append(("classifier", model()))
+                pipe = Pipeline(steps=steps)
 
                 pipe.fit(X_train, y_train)
                 self.models[name] = pipe
@@ -349,6 +351,7 @@ def fit(self, X_train, X_test, y_train, y_test):
                 if self.predictions:
                     predictions[name] = y_pred
             except Exception as exception:
+                print(f"{name} got error: {exception}")
                 if self.ignore_warnings is False:
                     print(name + " model failed to execute")
                     print(exception)
@@ -404,7 +407,7 @@ def provide_models(self, X_train, X_test, y_train, y_test):
         Returns
         -------
         models: dict-object,
-            Returns a dictionary with each model pipeline as value 
+            Returns a dictionary with each model pipeline as value
             with key as name of models.
         """
         if len(self.models.keys()) == 0:
@@ -519,7 +522,7 @@ def __init__(
         self.random_state = random_state
         self.regressors = regressors
 
-    def fit(self, X_train, X_test, y_train, y_test):
+    def fit(self, X_train, X_test, y_train, y_test, should_preprocess: bool = True):
         """Fit Regression algorithms to X_train and y_train, predict and score on X_test, y_test.
         Parameters
         ----------
@@ -535,6 +538,9 @@ def fit(self, X_train, X_test, y_train, y_test):
         y_test : array-like,
             Testing vectors, where rows is the number of samples
             and columns is the number of features.
+        should_preprocess : bool, default True
+            Whether to apply the column preprocessing step before each model.
+            Set this to False if your matrix is sparse.
         Returns
         -------
         scores : Pandas DataFrame
@@ -557,20 +563,22 @@ def fit(self, X_train, X_test, y_train, y_test):
             X_train = pd.DataFrame(X_train)
             X_test = pd.DataFrame(X_test)
 
-        numeric_features = X_train.select_dtypes(include=[np.number]).columns
-        categorical_features = X_train.select_dtypes(include=["object"]).columns
+        preprocessor = None
+        if should_preprocess:
+            numeric_features = X_train.select_dtypes(include=[np.number]).columns
+            categorical_features = X_train.select_dtypes(include=["object"]).columns
 
-        categorical_low, categorical_high = get_card_split(
-            X_train, categorical_features
-        )
+            categorical_low, categorical_high = get_card_split(
+                X_train, categorical_features
+            )
 
-        preprocessor = ColumnTransformer(
-            transformers=[
-                ("numeric", numeric_transformer, numeric_features),
-                ("categorical_low", categorical_transformer_low, categorical_low),
-                ("categorical_high", categorical_transformer_high, categorical_high),
-            ]
-        )
+            preprocessor = ColumnTransformer(
+                transformers=[
+                    ("numeric", numeric_transformer, numeric_features),
+                    ("categorical_low", categorical_transformer_low, categorical_low),
+                    ("categorical_high", categorical_transformer_high, categorical_high),
+                ]
+            )
 
         if self.regressors == "all":
             self.regressors = REGRESSORS
@@ -588,18 +596,14 @@ def fit(self, X_train, X_test, y_train, y_test):
         for name, model in tqdm(self.regressors):
             start = time.time()
             try:
+                steps = []
+                if should_preprocess:
+                    steps.append(("preprocessor", preprocessor))
                 if "random_state" in model().get_params().keys():
-                    pipe = Pipeline(
-                        steps=[
-                            ("preprocessor", preprocessor),
-                            ("regressor", model(random_state=self.random_state)),
-                        ]
-                    )
+                    steps.append(("regressor", model(random_state=self.random_state)))
                 else:
-                    pipe = Pipeline(
-                        steps=[("preprocessor", preprocessor), ("regressor", model())]
-                    )
-
+                    steps.append(("regressor", model()))
+                pipe = Pipeline(steps=steps)
                 pipe.fit(X_train, y_train)
                 self.models[name] = pipe
                 y_pred = pipe.predict(X_test)
@@ -681,7 +685,7 @@ def provide_models(self, X_train, X_test, y_train, y_test):
         Returns
         -------
         models: dict-object,
-            Returns a dictionary with each model pipeline as value 
+            Returns a dictionary with each model pipeline as value
             with key as name of models.
         """
         if len(self.models.keys()) == 0: