Merge pull request #55 from goeckslab/v0.9.0

V0.9.0 - update sklearn to v0.24.*, tensorflow to v2.* etc.
goeckslab · May 11, 2021 · d662fc5 · d662fc5
2 parents a8561b4 + 39b2605
commit d662fc5
Show file tree

Hide file tree

Showing 2,330 changed files with 19,427 additions and 823,772 deletions.
diff --git a/.circleci/config.yml b/.circleci/config.yml
@@ -2,7 +2,7 @@ version: 2
 jobs:
   build:
     docker:
-      - image: circleci/python:3.6.8
+      - image: circleci/python:3.7.10
     steps:
       - checkout
       - run:
@@ -25,7 +25,7 @@ jobs:
           path: test-reports
   tool_lint:
     docker:
-      - image: circleci/python:3.6.8
+      - image: circleci/python:3.7.10
     steps:
       - checkout
       - run:
@@ -39,7 +39,7 @@ jobs:
             planemo lint
   tool_test:
     docker:
-      - image: circleci/python:3.6.8
+      - image: circleci/python:3.7.10
     parallelism: 4
     steps:
       - checkout

diff --git a/MANIFEST.in b/MANIFEST.in
@@ -1,7 +1,7 @@
 include LICENSE
 include README.md
 include requirements.txt
-include galaxy_ml/pk_whitelist.json
+include galaxy_ml/model_persist/pk_whitelist.json
 include galaxy_ml/externals/selene_sdk/sequences/_sequence.pyx
 include galaxy_ml/externals/selene_sdk/targets/_genomic_features.pyx
 recursive-include galaxy_ml/externals/selene_sdk/sequences/data *.bed*
diff --git a/docs/CHANGELOG.md b/docs/CHANGELOG.md
@@ -1,3 +1,18 @@
+### Version 0.9.0
+
+#### Changes
+
+- Updates scikit-learn to v0.24.x, tensorflow to v2.4.x, xgboost to v1.3.x, mlxtend to v0.17.x, skrebate to v0.62, imbalanced-learn to v0.8.x and so on.
+- Makes `load_model` and `save_model` utility methods in the `keras_galaxy_models` module.
+- Refactors `_SafePickler` and moves it from `utils` to `model_persist`.
+- Refactors `dump_model_to_h5` and `load_model_from_h5` to dynamically save and load xgboost and tensorflow models.
+- Replaces pickled models with `h5mlm` models in all tools.
+
+#### Bug Fixes
+
+- 
+
+
 ### Version 0.8.3
 
 #### Changes

diff --git a/galaxy_ml/__init__.py b/galaxy_ml/__init__.py
@@ -1 +1,13 @@
-__version__ = '0.8.3'
+__version__ = '0.9.0'
+
+
+__all__ = (
+    'keras_galaxy_models',
+    'feature_selectors',
+    'preprocessors',
+    'iraps_classifier',
+    'model_validations',
+    'binarize_target',
+    'metrics',
+    'model_persist'
+)
diff --git a/galaxy_ml/binarize_target/_binarize_estimators.py b/galaxy_ml/binarize_target/_binarize_estimators.py
@@ -97,6 +97,8 @@ def fit(self, X, y, sample_weight=None, **fit_params):
             self.n_outputs_ = self.classifier_.n_outputs_
         if hasattr(self.classifier_, 'n_features_'):
             self.n_features_ = self.classifier_.n_features_
+        if hasattr(self.classifier_, 'classes_'):
+            self.classes_ = self.classifier_.classes_
 
         return self
 
@@ -108,7 +110,10 @@ def predict(self, X):
     def decision_function(self, X):
         """Predict using a fitted estimator
         """
-        return self.classifier_.decision_function(X)
+        try:
+            return self.classifier_.decision_function(X)
+        except Exception:
+            raise
 
     def predict_proba(self, X):
         """Predict using a fitted estimator

diff --git a/galaxy_ml/binarize_target/_iraps_classifier.py b/galaxy_ml/binarize_target/_iraps_classifier.py
@@ -16,7 +16,7 @@
 from joblib import Parallel, delayed
 from scipy.stats import ttest_ind
 from sklearn.base import BaseEstimator, RegressorMixin, clone
-from sklearn.feature_selection.univariate_selection import _BaseFilter
+from sklearn.feature_selection._univariate_selection import _BaseFilter
 from sklearn.utils import as_float_array, check_X_y, check_random_state
 from sklearn.utils.validation import check_is_fitted, check_memory
 

diff --git a/galaxy_ml/binarize_target/_scorers.py b/galaxy_ml/binarize_target/_scorers.py
@@ -2,14 +2,15 @@
 from ..utils import get_main_estimator
 from sklearn import metrics
 from sklearn.utils.multiclass import type_of_target
-from sklearn.metrics.scorer import _BaseScorer
+from sklearn.metrics._scorer import _BaseScorer
 
 
 class _BinarizeTargetThresholdScorer(_BaseScorer):
     """
-    Base class to make binarized target specific scorer.
+    Class to make binarized target specific scorer to evaluate decision
+    function output.
     """
-    def __call__(self, clf, X, y, sample_weight=None):
+    def _score(self, method_caller, clf, X, y, sample_weight=None):
         main_estimator = get_main_estimator(clf)
         discretize_value = main_estimator.discretize_value
         less_is_positive = main_estimator.less_is_positive
@@ -24,38 +25,40 @@ def __call__(self, clf, X, y, sample_weight=None):
             raise ValueError("{0} format is not supported".format(y_type))
 
         try:
-            y_score = clf.decision_function(X)
+            y_pred = method_caller(clf, "decision_function", X)
 
             # For multi-output multi-class estimator
-            if isinstance(y_score, list):
-                y_score = np.vstack([p for p in y_score]).T
+            if isinstance(y_pred, list):
+                y_pred = np.vstack([p for p in y_pred]).T
+            elif y_type == "binary" and "pos_label" in self._kwargs:
+                self._check_pos_label(
+                    self._kwargs["pos_label"], clf.classes_
+                )
+                if self._kwargs["pos_label"] == clf.classes_[0]:
+                    # The implicit positive class of the binary classifier
+                    # does not match `pos_label`: we need to invert the
+                    # predictions
+                    y_pred *= -1
 
         except (NotImplementedError, AttributeError):
-            y_score = clf.predict_proba(X)
+            y_pred = method_caller(clf, "predict_proba", X)
 
             if y_type == "binary":
-                if y_score.shape[1] == 2:
-                    y_score = y_score[:, 1]
-                else:
-                    raise ValueError('got predict_proba of shape {},'
-                                     ' but need classifier with two'
-                                     ' classes for {} scoring'.format(
-                                         y_score.shape,
-                                         self._score_func.__name__))
-            elif isinstance(y_score, list):
-                y_score = np.vstack([p[:, -1] for p in y_score]).T
+                y_pred = self._select_proba_binary(y_pred, clf.classes_)
+            elif isinstance(y_pred, list):
+                y_pred = np.vstack([p[:, -1] for p in y_pred]).T
 
         if sample_weight is not None:
-            return self._sign * self._score_func(y_trans, y_score,
+            return self._sign * self._score_func(y_trans, y_pred,
                                                  sample_weight=sample_weight,
                                                  **self._kwargs)
         else:
-            return self._sign * self._score_func(y_trans, y_score,
+            return self._sign * self._score_func(y_trans, y_pred,
                                                  **self._kwargs)
 
+    def _factory_args(self):
+        return ", needs_threshold=True"
 
-# TODO deprecate in next major version
-_BinarizeTargetProbaScorer = _BinarizeTargetThresholdScorer
 
 # roc_auc
 binarize_auc_scorer =\
@@ -80,10 +83,11 @@ def __call__(self, clf, X, y, sample_weight=None):
 
 class _BinarizeTargetPredictScorer(_BaseScorer):
     """
-    Base class to make binarized target specific scorer.
+    Class to make binarized target specific scorer to evaluate predicted
+    target values.
     """
-    def __call__(self, clf, X, y, sample_weight=None):
-        main_estimator = get_main_estimator(clf)
+    def _score(self, method_caller, estimator, X, y, sample_weight=None):
+        main_estimator = get_main_estimator(estimator)
         discretize_value = main_estimator.discretize_value
         less_is_positive = main_estimator.less_is_positive
 
@@ -92,7 +96,7 @@ def __call__(self, clf, X, y, sample_weight=None):
         else:
             y_trans = y > discretize_value
 
-        y_pred = clf.predict(X)
+        y_pred = method_caller(estimator, "predict", X)
         if sample_weight is not None:
             return self._sign * self._score_func(y_trans, y_pred,
                                                  sample_weight=sample_weight,

diff --git a/galaxy_ml/feature_selectors.py b/galaxy_ml/feature_selectors.py
@@ -17,10 +17,10 @@
 
 from sklearn.base import BaseEstimator
 from sklearn.base import MetaEstimatorMixin, clone, is_classifier
-from sklearn.feature_selection.rfe import _rfe_single_fit, RFE, RFECV
+from sklearn.feature_selection._rfe import _rfe_single_fit, RFE, RFECV
 from sklearn.model_selection import check_cv
-from sklearn.metrics.scorer import check_scoring
-from sklearn.utils import check_X_y, safe_indexing, safe_sqr
+from sklearn.metrics._scorer import check_scoring
+from sklearn.utils import check_X_y, safe_sqr
 
 
 __all__ = ('DyRFE', 'DyRFECV', '_MyPipeline', '_MyimbPipeline',

diff --git a/galaxy_ml/iraps_classifier.py b/galaxy_ml/iraps_classifier.py
@@ -9,11 +9,7 @@
                               binarize_auc_scorer,
                               binarize_average_precision_scorer)
 
-from .binarize_target import _BinarizeTargetThresholdScorer \
-                        as _BinarizeTargetProbaScorer
-
 
 __all__ = ('IRAPSCore', 'IRAPSClassifier', 'binarize_auc_scorer',
            'binarize_average_precision_scorer', 'BinarizeTargetClassifier',
-           'BinarizeTargetRegressor', 'BinarizeTargetTransformer',
-           '_BinarizeTargetProbaScorer')
+           'BinarizeTargetRegressor', 'BinarizeTargetTransformer')