Merge pull request #19 from jlgarridol/development

Version 1.0.5.1
jlgarridol · May 20, 2024 · 58ba5b7 · 58ba5b7
2 parents 291179d + 0c8698e
commit 58ba5b7
Show file tree

Hide file tree

Showing 13 changed files with 4,461 additions and 3,104 deletions.
diff --git a/CHANGELOG.md b/CHANGELOG.md
@@ -4,6 +4,12 @@ All notable changes to this project will be documented in this file.
 The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/),
 and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html).
 
+## [1.0.5.1] - 2024-05-20
+
+### Fixed
+
+- Fixed bugs in `artificial_ssl_dataset`: it supports pandas DataFrames again, and `y_unlabeled` now returns the correct values.
+
 ## [1.0.5] - 2024-05-08
 
 ### Added

diff --git a/docs/search.js b/docs/search.js
diff --git a/docs/sslearn.html b/docs/sslearn.html
@@ -61,6 +61,7 @@ <h2>Submodules</h2>
                     <li><a href="sslearn/base.html">base</a></li>
                     <li><a href="sslearn/datasets.html">datasets</a></li>
                     <li><a href="sslearn/model_selection.html">model_selection</a></li>
+                    <li><a href="sslearn/restricted (Copia en conflicto de CROSS-PC 2024-05-14).html">restricted (Copia en conflicto de CROSS-PC 2024-05-14)</a></li>
                     <li><a href="sslearn/restricted.html">restricted</a></li>
                     <li><a href="sslearn/subview.html">subview</a></li>
                     <li><a href="sslearn/utils.html">utils</a></li>
@@ -162,7 +163,7 @@ <h2 id="citing">Citing</h2>
 </span><span id="L-10"><a href="#L-10"><span class="linenos">10</span></a>    <span class="vm">__doc__</span> <span class="o">=</span> <span class="s2">&quot;Semi-Supervised Learning (SSL) is a Python package that provides tools to train and evaluate semi-supervised learning models.&quot;</span>
 </span><span id="L-11"><a href="#L-11"><span class="linenos">11</span></a>
 </span><span id="L-12"><a href="#L-12"><span class="linenos">12</span></a>
-</span><span id="L-13"><a href="#L-13"><span class="linenos">13</span></a><span class="n">__version__</span><span class="o">=</span><span class="s1">&#39;1.0.5&#39;</span>
+</span><span id="L-13"><a href="#L-13"><span class="linenos">13</span></a><span class="n">__version__</span><span class="o">=</span><span class="s1">&#39;1.0.5.1&#39;</span>
 </span><span id="L-14"><a href="#L-14"><span class="linenos">14</span></a><span class="n">__AUTHOR__</span><span class="o">=</span><span class="s2">&quot;José Luis Garrido-Labrador&quot;</span>  <span class="c1"># Author of the package</span>
 </span><span id="L-15"><a href="#L-15"><span class="linenos">15</span></a><span class="n">__AUTHOR_EMAIL__</span><span class="o">=</span><span class="s2">&quot;[email protected]&quot;</span>  <span class="c1"># Author&#39;s email</span>
 </span><span id="L-16"><a href="#L-16"><span class="linenos">16</span></a><span class="n">__URL__</span><span class="o">=</span><span class="s2">&quot;https://pypi.org/project/sslearn/&quot;</span>

diff --git a/docs/sslearn/model_selection.html b/docs/sslearn/model_selection.html
diff --git a/docs/sslearn/restricted (Copia en conflicto de CROSS-PC 2024-05-14).html b/docs/sslearn/restricted (Copia en conflicto de CROSS-PC 2024-05-14).html
diff --git a/docs/sslearn/wrapper.html b/docs/sslearn/wrapper.html
diff --git a/sitemap.xml b/sitemap.xml
@@ -5,6 +5,7 @@
 <url><loc>https://pdoc.dev/docs/</loc></url>
 <url><loc>https://pdoc.dev/docs/sslearn.html</loc></url>
 <url><loc>https://pdoc.dev/docs/sslearn/subview.html</loc></url>
+<url><loc>https://pdoc.dev/docs/sslearn/restricted (Copia en conflicto de CROSS-PC 2024-05-14).html</loc></url>
 <url><loc>https://pdoc.dev/docs/sslearn/model_selection.html</loc></url>
 <url><loc>https://pdoc.dev/docs/sslearn/base.html</loc></url>
 <url><loc>https://pdoc.dev/docs/sslearn/datasets.html</loc></url>

diff --git a/sslearn/__init__.py b/sslearn/__init__.py
@@ -10,7 +10,7 @@
     __doc__ = "Semi-Supervised Learning (SSL) is a Python package that provides tools to train and evaluate semi-supervised learning models."
 
 
-__version__='1.0.5'
+__version__='1.0.5.1'
 __AUTHOR__="José Luis Garrido-Labrador"  # Author of the package
 __AUTHOR_EMAIL__="[email protected]"  # Author's email
 __URL__="https://pypi.org/project/sslearn/"

diff --git a/sslearn/model_selection/_split.py b/sslearn/model_selection/_split.py
@@ -1,6 +1,7 @@
 import sklearn.model_selection as ms
 from sklearn.utils import check_random_state
 import numpy as np
+import pandas as pd
 
 
 class StratifiedKFoldSS():
@@ -108,6 +109,16 @@ def artificial_ssl_dataset(X, y, label_rate=0.1, random_state=None, force_minimu
         "Label rate must be in (0, 1)."
     assert "test_size" not in kwards and "train_size" not in kwards,\
         "Test size and train size are illegal parameters in this method."
+
+    columns = None
+    is_df = False
+    if hasattr(X, "iloc"):
+        is_df = True
+        columns = X.columns
+        X = X.values
+    if hasattr(y, "iloc"):
+        is_df = True
+        y = y.values
 
     indices = np.arange(len(y))
 
@@ -127,6 +138,8 @@ def artificial_ssl_dataset(X, y, label_rate=0.1, random_state=None, force_minimu
     if force_minimum is not None:
         label = np.concatenate((selected, label))
 
+    y_unlabel_original = y[unlabel]
+
     # Create the label and unlabel sets
     X_label, y_label, X_unlabel, y_unlabel = X[label], y[label],\
         X[unlabel], np.array([-1] * len(unlabel))
@@ -135,10 +148,14 @@ def artificial_ssl_dataset(X, y, label_rate=0.1, random_state=None, force_minimu
     X = np.concatenate((X_label, X_unlabel), axis=0)
     y = np.concatenate((y_label, y_unlabel), axis=0)
 
+    if is_df:
+        X = pd.DataFrame(X, columns=columns)
+        y = pd.Series(y)
+
     if indexes:
-        return X, y, X_unlabel, y_unlabel, label, unlabel
+        return X, y, X_unlabel, y_unlabel_original, label, unlabel
 
-    return X, y, X_unlabel, y_unlabel
+    return X, y, X_unlabel, y_unlabel_original
 
 
     """    

diff --git a/sslearn/restricted (Copia en conflicto de CROSS-PC 2024-05-14).py b/sslearn/restricted (Copia en conflicto de CROSS-PC 2024-05-14).py
@@ -0,0 +1,208 @@
+"""Summary of module `sslearn.restricted`:
+
+This module contains classes to train a classifier using the restricted set classification approach.
+
+## Classes
+
+[WhoIsWhoClassifier](#WhoIsWhoClassifier):
+> Who is Who Classifier
+
+## Functions
+
+[conflict_rate](#conflict_rate): 
+> Compute the conflict rate of a prediction, given a set of restrictions.
+[combine_predictions](#combine_predictions): 
+> Combine the predictions of a group of instances to keep the restrictions.
+
+
+"""
+
+import numpy as np
+from sklearn.base import ClassifierMixin, MetaEstimatorMixin, BaseEstimator
+from scipy.optimize import linear_sum_assignment
+import warnings
+import pandas as pd
+
+__all__ = ["conflict_rate", "combine_predictions", "WhoIsWhoClassifier"]
+
class WhoIsWhoClassifier(BaseEstimator, ClassifierMixin, MetaEstimatorMixin):

    def __init__(self, base_estimator, method="hungarian", conflict_weighted=True):
        """
        Who is Who Classifier.

        Kuncheva, L. I., Rodriguez, J. J., & Jackson, A. S. (2017).
        Restricted set classification: Who is there?. <i>Pattern Recognition</i>, 63, 158-170.

        Parameters
        ----------
        base_estimator : ClassifierMixin
            The base estimator to be used for training.
        method : str, optional
            The method used to assign classes inside a group: `greedy` or
            `hungarian` (Hungarian algorithm), by default "hungarian".
        conflict_weighted : bool, default=True
            Forwarded as the `weighted` argument of `conflict_rate`.

        Raises
        ------
        ValueError
            If `method` is not one of the supported methods.
        """
        self.base_estimator = base_estimator
        self.method = method
        self.conflict_weighted = conflict_weighted

        allowed_methods = ["greedy", "hungarian"]
        if self.method not in allowed_methods:
            raise ValueError(f"method {self.method} not supported, use one of {allowed_methods}")

    def fit(self, X, y, instance_group=None, **kwards):
        """Fit the base estimator and, optionally, measure the training conflict rate.

        Parameters
        ----------
        X : {array-like, sparse matrix} of shape (n_samples, n_features)
            The input samples.
        y : array-like of shape (n_samples,)
            The target values.
        instance_group : array-like of shape (n_samples), optional
            The group of each instance. Two instances with the same label are
            not allowed to be in the same group. If None, no conflict rate is
            computed and `conflict_in_train` is set to None.
        **kwards
            Extra arguments forwarded to `base_estimator.fit`.

        Returns
        -------
        self : object
            Returns self.
        """
        fitted = self.base_estimator.fit(X, y, **kwards)
        self.base_estimator = fitted
        self.classes_ = fitted.classes_
        # The conflict rate is measured on the raw (unrestricted) predictions
        # of the base estimator over the training data.
        self.conflict_in_train = (
            None
            if instance_group is None
            else conflict_rate(fitted.predict(X), instance_group, self.conflict_weighted)
        )
        return self

    def conflict_rate(self, X, instance_group):
        """Compute the conflict rate of the base estimator predictions on X.

        Parameters
        ----------
        X : {array-like, sparse matrix} of shape (n_samples, n_features)
            The input samples.
        instance_group : array-like of shape (n_samples)
            The group of each instance. Two instances with the same label are
            not allowed to be in the same group.

        Returns
        -------
        float
            The conflict rate.
        """
        raw_predictions = self.base_estimator.predict(X)
        return conflict_rate(raw_predictions, instance_group, self.conflict_weighted)

    def predict(self, X, instance_group):
        """Predict the class of X while honouring the group restrictions.

        Parameters
        ----------
        X : {array-like, sparse matrix} of shape (n_samples, n_features)
            The input samples.
        instance_group : array-like of shape (n_samples)
            The group of each instance. Two instances with the same label are
            not allowed to be in the same group.

        Returns
        -------
        array-like
            The entries of `classes_` selected by `combine_predictions`.
        """
        probabilities = self.predict_proba(X)
        n_classes = len(self.classes_)
        class_positions = combine_predictions(probabilities, instance_group, n_classes, self.method)
        # combine_predictions works with column positions; map them to labels.
        return self.classes_.take(class_positions)

    def predict_proba(self, X):
        """Predict class probabilities for X with the base estimator.

        Parameters
        ----------
        X : {array-like, sparse matrix} of shape (n_samples, n_features)
            The input samples.

        Returns
        -------
        array-like of shape (n_samples, n_classes)
            The class probabilities of the input samples.
        """
        base = self.base_estimator
        return base.predict_proba(X)
+
+
def conflict_rate(y_pred, restrictions, weighted=True):
    """
    Compute the conflict rate of a prediction, given a set of restrictions.

    Within each restriction group, every instance whose predicted value is
    shared with another instance of the same group counts as conflicted.

    Parameters
    ----------
    y_pred : array-like of shape (n_samples,)
        Predicted target values.
    restrictions : array-like of shape (n_samples,)
        Restrictions for each sample. If two samples have the same restriction, they cannot have the same y.
    weighted : bool, default=True
        If True, return the number of conflicted instances divided by the
        total number of instances. If False, return the sum over groups of
        each group's conflicted fraction.

    Returns
    -------
    conflict rate : float
        The conflict rate.

    Raises
    ------
    ValueError
        If `y_pred` and `restrictions` differ in length.
    """
    n_samples = len(y_pred)
    if n_samples != len(restrictions):
        raise ValueError("y_pred and restrictions must have the same length.")

    def _conflicted_count(values):
        # Instances whose predicted value appears more than once in the group.
        _, occurrences = np.unique(values, return_counts=True)
        return occurrences[occurrences > 1].sum()

    frame = pd.DataFrame({'y_pred': y_pred, 'restrictions': restrictions})
    grouped = frame.groupby('restrictions')
    conflicted = grouped.agg({'y_pred': _conflicted_count})

    if not weighted:
        # NOTE(review): this is a sum (not a mean) of per-group fractions, so
        # it can exceed 1 with several groups - confirm this is intended.
        group_sizes = grouped.count()
        return (conflicted.y_pred / group_sizes.y_pred).sum()

    return conflicted.sum().y_pred / n_samples
+
def combine_predictions(y_probas, instance_group, class_number, method="hungarian"):
    """
    Combine the predictions of a group of instances to keep the restrictions.

    For every group, each instance first gets its most probable class. If two
    instances of the group share a class, the classes are reassigned with the
    chosen `method` so that no class is repeated inside the group.

    Parameters
    ----------
    y_probas : array-like of shape (n_samples, n_classes)
        Class probabilities of every instance.
    instance_group : array-like of shape (n_samples,)
        Group of every instance. Plain lists are accepted.
    class_number : int
        Number of classes. Groups with more instances than classes cannot be
        restricted: they keep the per-instance argmax and a warning is issued.
    method : str, default="hungarian"
        `greedy` or `hungarian`.

    Returns
    -------
    list of int
        Column position of the class assigned to each instance, in the same
        order as the rows of `y_probas`.

    Raises
    ------
    ValueError
        If a group needs reassignment and `method` is not supported.
    """
    y_probas = np.asarray(y_probas)
    # Convert to an array so that `== group` is element-wise even for lists.
    groups = np.asarray(instance_group)
    y_predicted = np.empty(groups.shape[0], dtype=int)

    for group in np.unique(groups):
        mask = groups == group
        probas_matrix = y_probas[mask]
        preds = np.argmax(probas_matrix, axis=1)

        oversized = probas_matrix.shape[0] > class_number
        if oversized or np.unique(preds).size == preds.size:
            if oversized:
                warnings.warn("That the number of instances in the group is greater than the number of classes.", UserWarning)
            # Write through the mask so results stay aligned with the input
            # rows even when the groups are interleaved.
            y_predicted[mask] = preds
            continue

        if method == "greedy":
            assigned = _greedy(probas_matrix)
        elif method == "hungarian":
            assigned = _hungarian(probas_matrix)
        else:
            raise ValueError(f"method {method} not supported, use one of ['greedy', 'hungarian']")

        y_predicted[mask] = assigned

    return y_predicted.tolist()


def _greedy(probas_matrix):
    """Assign classes by repeatedly taking the highest remaining probability.

    Cells of the (n_instances, n_classes) matrix are visited from the highest
    to the lowest probability; a cell is accepted when neither its instance
    nor its class has been used yet.

    Returns a list with one class position per instance (None for an instance
    that could not be assigned, which can only happen when there are more
    instances than classes).
    """
    probas_matrix = np.asarray(probas_matrix)
    n_instances, n_classes = probas_matrix.shape
    order = probas_matrix.reshape(probas_matrix.size,).argsort()[::-1]

    y_pred_group = [None] * n_instances
    pending_instances = set(range(n_instances))
    used_classes = set()
    for flat_index in order:
        # Row-major flattening: flat_index = instance * n_classes + class.
        instance, class_ = divmod(int(flat_index), n_classes)
        if instance in pending_instances and class_ not in used_classes:
            y_pred_group[instance] = class_
            pending_instances.remove(instance)
            used_classes.add(class_)
            if not pending_instances:
                break

    return y_pred_group


def _hungarian(probas_matrix):
    """Assign classes by maximising the summed log-probability.

    Uses `scipy.optimize.linear_sum_assignment` on the log-probabilities.
    Returns a list with one class position per instance.
    """
    probas_matrix = np.asarray(probas_matrix, dtype=float)
    with np.errstate(divide="ignore"):  # log(0) is handled explicitly below
        scores = np.log(probas_matrix)

    impossible = np.isneginf(scores)
    if impossible.any():
        # A zero probability must be the worst choice, never the best one.
        # The penalty is lower than the total of any assignment that avoids
        # zero-probability cells (assuming the inputs are probabilities <= 1).
        finite_scores = scores[~impossible]
        lowest = finite_scores.min() if finite_scores.size else 0.0
        scores[impossible] = (min(lowest, 0.0) - 1.0) * max(scores.shape[0], 1)

    _, col_ind = linear_sum_assignment(scores, maximize=True)
    return col_ind.tolist()
diff --git a/sslearn/wrapper/_co.py b/sslearn/wrapper/_co.py
@@ -109,8 +109,6 @@ def score(self, X, y, sample_weight=None):
         score : float
             Mean accuracy of ``self.predict(X)`` w.r.t. `y`.
         """
-        from .metrics import accuracy_score
-
         return accuracy_score(y, self.predict(X), sample_weight=sample_weight)
 
 

diff --git a/sslearn/wrapper/_self.py b/sslearn/wrapper/_self.py
@@ -1,11 +1,12 @@
 import numpy as np
 import pandas as pd
 from scipy.stats import norm
-from sklearn.base import BaseEstimator, ClassifierMixin
+from sklearn.base import BaseEstimator, MetaEstimatorMixin
 from sklearn.base import clone as skclone
 from sklearn.neighbors import KNeighborsClassifier, kneighbors_graph
 from sklearn.semi_supervised import SelfTrainingClassifier
 from sklearn.utils import check_random_state, resample
+from sklearn.metrics import accuracy_score
 
 from sslearn.utils import calculate_prior_probability, check_classifier
 
@@ -124,7 +125,7 @@ def fit(self, X, y):
         return super().fit(X, y_adapted)
 
 
-class Setred(ClassifierMixin, BaseEstimator):
+class Setred(BaseEstimator, MetaEstimatorMixin):
     """
     **Self-training with Editing.**
     ----------------------------
@@ -365,3 +366,29 @@ def predict_proba(self, X, **kwards):
             The predicted classes
         """
         return self._base_estimator.predict_proba(X, **kwards)
+
+    def score(self, X, y, sample_weight=None):
+        """
+        Return the mean accuracy on the given test data and labels.
+
+        In multi-label classification, this is the subset accuracy
+        which is a harsh metric since you require for each sample that
+        each label set be correctly predicted.
+
+        Parameters
+        ----------
+        X : array-like of shape (n_samples, n_features)
+            Test samples.
+
+        y : array-like of shape (n_samples,) or (n_samples, n_outputs)
+            True labels for `X`.
+
+        sample_weight : array-like of shape (n_samples,), default=None
+            Sample weights.
+
+        Returns
+        -------
+        score : float
+            Mean accuracy of ``self.predict(X)`` w.r.t. `y`.
+        """
+        return accuracy_score(y, self.predict(X), sample_weight=sample_weight)
diff --git a/test/test_model_selection.py b/test/test_model_selection.py
@@ -1,13 +1,20 @@
 import os
 import sys
 import numpy as np
+import pandas as pd
 import pytest
 
 sys.path.append(os.path.join(os.path.dirname(os.path.abspath(__file__)), ".."))
 
 from sslearn.model_selection import (artificial_ssl_dataset, StratifiedKFoldSS)
 from sklearn.datasets import load_iris
 
def test_artificial_ssl_dataset_with_pandas():
    """Check `artificial_ssl_dataset` with pandas inputs.

    Covers the sizes of the unlabeled split plus the two behaviours this
    release fixes: pandas containers are returned for pandas inputs, and the
    returned unlabeled targets are the true labels rather than the -1 marker.
    """
    features, target = load_iris(return_X_y=True)
    X, y, X_unlabel, true_label = artificial_ssl_dataset(
        pd.DataFrame(features), pd.Series(target), label_rate=0.1
    )

    n_unlabeled = X_unlabel.shape[0]
    assert n_unlabeled == true_label.shape[0]
    assert n_unlabeled / X.shape[0] == pytest.approx(0.9)

    # pandas in, pandas out
    assert isinstance(X, pd.DataFrame)
    assert isinstance(y, pd.Series)
    # iris labels are 0..2, so -1 can only be the "unlabeled" placeholder
    assert np.all(np.asarray(true_label) != -1)
+
 def test_artificial_ssl_dataset():
     X, y = load_iris(return_X_y=True)
     X, y, X_unlabel, true_label = artificial_ssl_dataset(X, y, label_rate=0.1)