Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

added support for sparse matrices #424

Open
wants to merge 1 commit into
base: dev
Choose a base branch
from
Open
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
130 changes: 67 additions & 63 deletions lazypredict/Supervised.py
Original file line number Diff line number Diff line change
Expand Up @@ -40,9 +40,9 @@
"GaussianProcessClassifier",
"HistGradientBoostingClassifier",
"MLPClassifier",
"LogisticRegressionCV",
"MultiOutputClassifier",
"MultinomialNB",
"LogisticRegressionCV",
"MultiOutputClassifier",
"MultinomialNB",
"OneVsOneClassifier",
"OneVsRestClassifier",
"OutputCodeClassifier",
Expand All @@ -52,20 +52,20 @@

removed_regressors = [
"TheilSenRegressor",
"ARDRegression",
"CCA",
"IsotonicRegression",
"ARDRegression",
"CCA",
"IsotonicRegression",
"StackingRegressor",
"MultiOutputRegressor",
"MultiTaskElasticNet",
"MultiTaskElasticNetCV",
"MultiTaskLasso",
"MultiTaskLassoCV",
"PLSCanonical",
"PLSRegression",
"RadiusNeighborsRegressor",
"RegressorChain",
"VotingRegressor",
"MultiOutputRegressor",
"MultiTaskElasticNet",
"MultiTaskElasticNetCV",
"MultiTaskLasso",
"MultiTaskLassoCV",
"PLSCanonical",
"PLSRegression",
"RadiusNeighborsRegressor",
"RegressorChain",
"VotingRegressor",
]

CLASSIFIERS = [
Expand Down Expand Up @@ -219,7 +219,7 @@ def __init__(
self.random_state = random_state
self.classifiers = classifiers

def fit(self, X_train, X_test, y_train, y_test):
def fit(self, X_train, X_test, y_train, y_test, should_preprocess: bool = True):
"""Fit Classification algorithms to X_train and y_train, predict and score on X_test, y_test.
Parameters
----------
Expand All @@ -235,6 +235,9 @@ def fit(self, X_train, X_test, y_train, y_test):
y_test : array-like,
Testing vectors, where rows is the number of samples
and columns is the number of features.
should_preprocess : bool, default=True
Whether to add the column preprocessing step to each model pipeline.
Set this to False if your input matrix is sparse.
Returns
-------
scores : Pandas DataFrame
Expand All @@ -257,20 +260,22 @@ def fit(self, X_train, X_test, y_train, y_test):
X_train = pd.DataFrame(X_train)
X_test = pd.DataFrame(X_test)

numeric_features = X_train.select_dtypes(include=[np.number]).columns
categorical_features = X_train.select_dtypes(include=["object"]).columns
preprocessor = None
if should_preprocess:
numeric_features = X_train.select_dtypes(include=[np.number]).columns
categorical_features = X_train.select_dtypes(include=["object"]).columns

categorical_low, categorical_high = get_card_split(
X_train, categorical_features
)
categorical_low, categorical_high = get_card_split(
X_train, categorical_features
)

preprocessor = ColumnTransformer(
transformers=[
("numeric", numeric_transformer, numeric_features),
("categorical_low", categorical_transformer_low, categorical_low),
("categorical_high", categorical_transformer_high, categorical_high),
]
)
preprocessor = ColumnTransformer(
transformers=[
("numeric", numeric_transformer, numeric_features),
("categorical_low", categorical_transformer_low, categorical_low),
("categorical_high", categorical_transformer_high, categorical_high),
]
)

if self.classifiers == "all":
self.classifiers = CLASSIFIERS
Expand All @@ -288,17 +293,14 @@ def fit(self, X_train, X_test, y_train, y_test):
for name, model in tqdm(self.classifiers):
start = time.time()
try:
steps = []
if should_preprocess:
steps.append(("preprocessor", preprocessor))
if "random_state" in model().get_params().keys():
pipe = Pipeline(
steps=[
("preprocessor", preprocessor),
("classifier", model(random_state=self.random_state)),
]
)
steps.append(("classifier", model(random_state=self.random_state)))
else:
pipe = Pipeline(
steps=[("preprocessor", preprocessor), ("classifier", model())]
)
steps.append(("classifier", model()))
pipe = Pipeline(steps=steps)

pipe.fit(X_train, y_train)
self.models[name] = pipe
Expand Down Expand Up @@ -349,6 +351,7 @@ def fit(self, X_train, X_test, y_train, y_test):
if self.predictions:
predictions[name] = y_pred
except Exception as exception:
print(f"{name} got error: {exception}")
if self.ignore_warnings is False:
print(name + " model failed to execute")
print(exception)
Expand Down Expand Up @@ -404,7 +407,7 @@ def provide_models(self, X_train, X_test, y_train, y_test):
Returns
-------
models: dict-object,
Returns a dictionary with each model pipeline as value
Returns a dictionary with each model pipeline as value
with key as name of models.
"""
if len(self.models.keys()) == 0:
Expand Down Expand Up @@ -519,7 +522,7 @@ def __init__(
self.random_state = random_state
self.regressors = regressors

def fit(self, X_train, X_test, y_train, y_test):
def fit(self, X_train, X_test, y_train, y_test, should_preprocess: bool = True):
"""Fit Regression algorithms to X_train and y_train, predict and score on X_test, y_test.
Parameters
----------
Expand All @@ -535,6 +538,9 @@ def fit(self, X_train, X_test, y_train, y_test):
y_test : array-like,
Testing vectors, where rows is the number of samples
and columns is the number of features.
should_preprocess : bool, default=True
Whether to add the column preprocessing step to each model pipeline.
Set this to False if your input matrix is sparse.
Returns
-------
scores : Pandas DataFrame
Expand All @@ -557,20 +563,22 @@ def fit(self, X_train, X_test, y_train, y_test):
X_train = pd.DataFrame(X_train)
X_test = pd.DataFrame(X_test)

numeric_features = X_train.select_dtypes(include=[np.number]).columns
categorical_features = X_train.select_dtypes(include=["object"]).columns
preprocessor = None
if should_preprocess:
numeric_features = X_train.select_dtypes(include=[np.number]).columns
categorical_features = X_train.select_dtypes(include=["object"]).columns

categorical_low, categorical_high = get_card_split(
X_train, categorical_features
)
categorical_low, categorical_high = get_card_split(
X_train, categorical_features
)

preprocessor = ColumnTransformer(
transformers=[
("numeric", numeric_transformer, numeric_features),
("categorical_low", categorical_transformer_low, categorical_low),
("categorical_high", categorical_transformer_high, categorical_high),
]
)
preprocessor = ColumnTransformer(
transformers=[
("numeric", numeric_transformer, numeric_features),
("categorical_low", categorical_transformer_low, categorical_low),
("categorical_high", categorical_transformer_high, categorical_high),
]
)

if self.regressors == "all":
self.regressors = REGRESSORS
Expand All @@ -588,18 +596,14 @@ def fit(self, X_train, X_test, y_train, y_test):
for name, model in tqdm(self.regressors):
start = time.time()
try:
steps = []
if should_preprocess:
steps.append(("preprocessor", preprocessor))
if "random_state" in model().get_params().keys():
pipe = Pipeline(
steps=[
("preprocessor", preprocessor),
("regressor", model(random_state=self.random_state)),
]
)
steps.append(("regressor", model(random_state=self.random_state)))
else:
pipe = Pipeline(
steps=[("preprocessor", preprocessor), ("regressor", model())]
)

steps.append(("regressor", model()))
pipe = Pipeline(steps=steps)
pipe.fit(X_train, y_train)
self.models[name] = pipe
y_pred = pipe.predict(X_test)
Expand Down Expand Up @@ -681,7 +685,7 @@ def provide_models(self, X_train, X_test, y_train, y_test):
Returns
-------
models: dict-object,
Returns a dictionary with each model pipeline as value
Returns a dictionary with each model pipeline as value
with key as name of models.
"""
if len(self.models.keys()) == 0:
Expand Down