Skip to content

Commit

Permalink
Merge pull request #19 from jlgarridol/development
Browse files Browse the repository at this point in the history
Version 1.0.5.1
  • Loading branch information
jlgarridol authored May 20, 2024
2 parents 291179d + 0c8698e commit 58ba5b7
Show file tree
Hide file tree
Showing 13 changed files with 4,461 additions and 3,104 deletions.
6 changes: 6 additions & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,12 @@ All notable changes to this project will be documented in this file.
The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/),
and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html).

## [1.0.5.1] - 2024-05-20

### Fixed

- Fixed bugs in `artificial_ssl_dataset`: pandas DataFrames are supported again, and `y_unlabeled` now returns the correct values.

## [1.0.5] - 2024-05-08

### Added
Expand Down
2 changes: 1 addition & 1 deletion docs/search.js

Large diffs are not rendered by default.

3 changes: 2 additions & 1 deletion docs/sslearn.html
Original file line number Diff line number Diff line change
Expand Up @@ -61,6 +61,7 @@ <h2>Submodules</h2>
<li><a href="sslearn/base.html">base</a></li>
<li><a href="sslearn/datasets.html">datasets</a></li>
<li><a href="sslearn/model_selection.html">model_selection</a></li>
<li><a href="sslearn/restricted (Copia en conflicto de CROSS-PC 2024-05-14).html">restricted (Copia en conflicto de CROSS-PC 2024-05-14)</a></li>
<li><a href="sslearn/restricted.html">restricted</a></li>
<li><a href="sslearn/subview.html">subview</a></li>
<li><a href="sslearn/utils.html">utils</a></li>
Expand Down Expand Up @@ -162,7 +163,7 @@ <h2 id="citing">Citing</h2>
</span><span id="L-10"><a href="#L-10"><span class="linenos">10</span></a> <span class="vm">__doc__</span> <span class="o">=</span> <span class="s2">&quot;Semi-Supervised Learning (SSL) is a Python package that provides tools to train and evaluate semi-supervised learning models.&quot;</span>
</span><span id="L-11"><a href="#L-11"><span class="linenos">11</span></a>
</span><span id="L-12"><a href="#L-12"><span class="linenos">12</span></a>
</span><span id="L-13"><a href="#L-13"><span class="linenos">13</span></a><span class="n">__version__</span><span class="o">=</span><span class="s1">&#39;1.0.5&#39;</span>
</span><span id="L-13"><a href="#L-13"><span class="linenos">13</span></a><span class="n">__version__</span><span class="o">=</span><span class="s1">&#39;1.0.5.1&#39;</span>
</span><span id="L-14"><a href="#L-14"><span class="linenos">14</span></a><span class="n">__AUTHOR__</span><span class="o">=</span><span class="s2">&quot;José Luis Garrido-Labrador&quot;</span> <span class="c1"># Author of the package</span>
</span><span id="L-15"><a href="#L-15"><span class="linenos">15</span></a><span class="n">__AUTHOR_EMAIL__</span><span class="o">=</span><span class="s2">&quot;[email protected]&quot;</span> <span class="c1"># Author&#39;s email</span>
</span><span id="L-16"><a href="#L-16"><span class="linenos">16</span></a><span class="n">__URL__</span><span class="o">=</span><span class="s2">&quot;https://pypi.org/project/sslearn/&quot;</span>
Expand Down
438 changes: 227 additions & 211 deletions docs/sslearn/model_selection.html

Large diffs are not rendered by default.

Large diffs are not rendered by default.

5,859 changes: 2,975 additions & 2,884 deletions docs/sslearn/wrapper.html

Large diffs are not rendered by default.

1 change: 1 addition & 0 deletions sitemap.xml
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,7 @@
<url><loc>https://pdoc.dev/docs/</loc></url>
<url><loc>https://pdoc.dev/docs/sslearn.html</loc></url>
<url><loc>https://pdoc.dev/docs/sslearn/subview.html</loc></url>
<url><loc>https://pdoc.dev/docs/sslearn/restricted (Copia en conflicto de CROSS-PC 2024-05-14).html</loc></url>
<url><loc>https://pdoc.dev/docs/sslearn/model_selection.html</loc></url>
<url><loc>https://pdoc.dev/docs/sslearn/base.html</loc></url>
<url><loc>https://pdoc.dev/docs/sslearn/datasets.html</loc></url>
Expand Down
2 changes: 1 addition & 1 deletion sslearn/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,7 +10,7 @@
__doc__ = "Semi-Supervised Learning (SSL) is a Python package that provides tools to train and evaluate semi-supervised learning models."


__version__='1.0.5'
__version__='1.0.5.1'
__AUTHOR__="José Luis Garrido-Labrador" # Author of the package
__AUTHOR_EMAIL__="[email protected]" # Author's email
__URL__="https://pypi.org/project/sslearn/"
Expand Down
21 changes: 19 additions & 2 deletions sslearn/model_selection/_split.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
import sklearn.model_selection as ms
from sklearn.utils import check_random_state
import numpy as np
import pandas as pd


class StratifiedKFoldSS():
Expand Down Expand Up @@ -108,6 +109,16 @@ def artificial_ssl_dataset(X, y, label_rate=0.1, random_state=None, force_minimu
"Label rate must be in (0, 1)."
assert "test_size" not in kwards and "train_size" not in kwards,\
"Test size and train size are illegal parameters in this method."

columns = None
is_df = False
if hasattr(X, "iloc"):
is_df = True
columns = X.columns
X = X.values
if hasattr(y, "iloc"):
is_df = True
y = y.values

indices = np.arange(len(y))

Expand All @@ -127,6 +138,8 @@ def artificial_ssl_dataset(X, y, label_rate=0.1, random_state=None, force_minimu
if force_minimum is not None:
label = np.concatenate((selected, label))

y_unlabel_original = y[unlabel]

# Create the label and unlabel sets
X_label, y_label, X_unlabel, y_unlabel = X[label], y[label],\
X[unlabel], np.array([-1] * len(unlabel))
Expand All @@ -135,10 +148,14 @@ def artificial_ssl_dataset(X, y, label_rate=0.1, random_state=None, force_minimu
X = np.concatenate((X_label, X_unlabel), axis=0)
y = np.concatenate((y_label, y_unlabel), axis=0)

if is_df:
X = pd.DataFrame(X, columns=columns)
y = pd.Series(y)

if indexes:
return X, y, X_unlabel, y_unlabel, label, unlabel
return X, y, X_unlabel, y_unlabel_original, label, unlabel

return X, y, X_unlabel, y_unlabel
return X, y, X_unlabel, y_unlabel_original


"""
Expand Down
208 changes: 208 additions & 0 deletions sslearn/restricted (Copia en conflicto de CROSS-PC 2024-05-14).py
Original file line number Diff line number Diff line change
@@ -0,0 +1,208 @@
"""Summary of module `sslearn.restricted`:
This module contains classes to train a classifier using the restricted set classification approach.
## Classes
[WhoIsWhoClassifier](#WhoIsWhoClassifier):
> Who is Who Classifier
## Functions
[conflict_rate](#conflict_rate):
> Compute the conflict rate of a prediction, given a set of restrictions.
[combine_predictions](#combine_predictions):
> Combine the predictions of a group of instances to keep the restrictions.
"""

import numpy as np
from sklearn.base import ClassifierMixin, MetaEstimatorMixin, BaseEstimator
from scipy.optimize import linear_sum_assignment
import warnings
import pandas as pd

__all__ = ["conflict_rate", "combine_predictions", "WhoIsWhoClassifier"]

class WhoIsWhoClassifier(BaseEstimator, ClassifierMixin, MetaEstimatorMixin):

    def __init__(self, base_estimator, method="hungarian", conflict_weighted=True):
        """
        Who is Who Classifier

        Kuncheva, L. I., Rodriguez, J. J., & Jackson, A. S. (2017).
        Restricted set classification: Who is there?. <i>Pattern Recognition</i>, 63, 158-170.

        Parameters
        ----------
        base_estimator : ClassifierMixin
            Estimator that is fitted and queried for predictions.
        method : str, optional
            Strategy used to assign classes inside a group: `greedy` or
            `hungarian` (Hungarian algorithm), by default "hungarian".
        conflict_weighted : bool, default=True
            Forwarded to `conflict_rate` as its `weighted` argument.

        Raises
        ------
        ValueError
            If `method` is neither `greedy` nor `hungarian`.
        """
        self.base_estimator = base_estimator
        self.method = method
        self.conflict_weighted = conflict_weighted

        supported = ["greedy", "hungarian"]
        if self.method not in supported:
            raise ValueError(f"method {self.method} not supported, use one of {supported}")

    def fit(self, X, y, instance_group=None, **kwards):
        """Fit the base estimator and record the conflict rate on the training data.

        Parameters
        ----------
        X : {array-like, sparse matrix} of shape (n_samples, n_features)
            The input samples.
        y : array-like of shape (n_samples,)
            The target values.
        instance_group : array-like of shape (n_samples), optional
            Group of each instance. Two instances of the same group are not
            allowed to share a label. When None, `conflict_in_train` is set
            to None.
        **kwards
            Extra arguments forwarded to `base_estimator.fit`.

        Returns
        -------
        self : object
            Returns self.
        """
        fitted = self.base_estimator.fit(X, y, **kwards)
        self.base_estimator = fitted
        self.classes_ = fitted.classes_

        if instance_group is None:
            self.conflict_in_train = None
        else:
            train_pred = self.base_estimator.predict(X)
            self.conflict_in_train = conflict_rate(train_pred, instance_group, self.conflict_weighted)
        return self

    def conflict_rate(self, X, instance_group):
        """Compute the conflict rate of the base estimator's predictions for X.

        Parameters
        ----------
        X : {array-like, sparse matrix} of shape (n_samples, n_features)
            The input samples.
        instance_group : array-like of shape (n_samples)
            Group of each instance. Two instances of the same group are not
            allowed to share a label.

        Returns
        -------
        float
            The conflict rate.
        """
        # Inside a method the bare name resolves to the module-level function,
        # not to this method.
        return conflict_rate(self.base_estimator.predict(X), instance_group, self.conflict_weighted)

    def predict(self, X, instance_group):
        """Predict classes for X while respecting the group restrictions.

        Parameters
        ----------
        X : {array-like, sparse matrix} of shape (n_samples, n_features)
            The input samples.
        instance_group : array-like of shape (n_samples)
            Group of each instance. Two instances of the same group are not
            allowed to share a label.

        Returns
        -------
        array-like
            Entries of `classes_` selected by the class indices that
            `combine_predictions` returns.
        """
        probabilities = self.predict_proba(X)
        n_classes = len(self.classes_)
        class_indices = combine_predictions(probabilities, instance_group, n_classes, self.method)
        return self.classes_.take(class_indices)

    def predict_proba(self, X):
        """Predict class probabilities for X with the base estimator.

        Parameters
        ----------
        X : {array-like, sparse matrix} of shape (n_samples, n_features)
            The input samples.

        Returns
        -------
        array-like of shape (n_samples, n_classes)
            The class probabilities of the input samples.
        """
        probabilities = self.base_estimator.predict_proba(X)
        return probabilities


def conflict_rate(y_pred, restrictions, weighted=True):
    """
    Compute the conflict rate of a prediction under a set of restrictions.

    A prediction is in conflict when at least one other sample with the same
    restriction received the same predicted value.

    Parameters
    ----------
    y_pred : array-like of shape (n_samples,)
        Predicted target values.
    restrictions : array-like of shape (n_samples,)
        Restriction (group) of each sample. Two samples with the same
        restriction cannot have the same y.
    weighted : bool, default=True
        If True, return the total number of conflicting predictions divided by
        the number of samples. If False, return the sum over groups of each
        group's fraction of conflicting predictions.

    Returns
    -------
    conflict rate : float
        The conflict rate.

    Raises
    ------
    ValueError
        If `y_pred` and `restrictions` differ in length.
    """
    if len(y_pred) != len(restrictions):
        raise ValueError("y_pred and restrictions must have the same length.")

    def _count_conflicting(labels):
        # Number of predictions whose value occurs more than once in the group.
        _, occurrences = np.unique(labels, return_counts=True)
        return occurrences[occurrences > 1].sum()

    frame = pd.DataFrame({'y_pred': y_pred, 'restrictions': restrictions})
    conflicted = frame.groupby('restrictions').agg({'y_pred': _count_conflicting})

    if not weighted:
        group_sizes = frame.groupby('restrictions').count()
        return (conflicted.y_pred / group_sizes.y_pred).sum()
    return conflicted.sum().y_pred / len(y_pred)

def combine_predictions(y_probas, instance_group, class_number, method="hungarian"):
    """
    Combine the predictions of each group of instances to keep the restrictions.

    Parameters
    ----------
    y_probas : array-like of shape (n_samples, n_classes)
        Class probabilities of every instance.
    instance_group : array-like of shape (n_samples,)
        Group of each instance. Two instances of the same group are not
        allowed to share a class.
    class_number : int
        Number of classes. A group with more instances than this keeps its
        plain argmax predictions and a warning is emitted.
    method : str, optional
        `greedy` or `hungarian`, used only for groups whose argmax predictions
        collide, by default "hungarian".

    Returns
    -------
    list of int
        Predicted class index (column of `y_probas`) for every instance, in
        the same order as the rows of `y_probas`.

    Raises
    ------
    ValueError
        If a group needs conflict resolution and `method` is not supported.
    """
    y_probas = np.asarray(y_probas)
    instance_group = np.asarray(instance_group)

    # Results are written back through the group mask so that position i of
    # the output always belongs to row i of the input, even when the members
    # of a group are not contiguous or the groups are not sorted.
    y_predicted = np.empty(len(instance_group), dtype=int)

    for group in np.unique(instance_group):
        mask = instance_group == group
        probas_matrix = y_probas[mask]
        n_instances = probas_matrix.shape[0]
        preds = np.argmax(probas_matrix, axis=1)

        if n_instances > class_number:
            # The restriction cannot be satisfied: keep the raw predictions.
            warnings.warn("That the number of instances in the group is greater than the number of classes.", UserWarning)
            y_predicted[mask] = preds
        elif len(set(preds.tolist())) == n_instances:
            # No two instances share a class, nothing to resolve.
            y_predicted[mask] = preds
        elif method == "greedy":
            y_predicted[mask] = _greedy(probas_matrix)
        elif method == "hungarian":
            y_predicted[mask] = _hungarian(probas_matrix)
        else:
            raise ValueError(f"method {method} not supported, use one of ['greedy', 'hungarian']")

    return list(y_predicted)

def _greedy(probas_matrix):

probas = probas_matrix.reshape(probas_matrix.size,)
order = probas.argsort()[::-1]

y_pred_group = [None for i in range(probas_matrix.shape[0])]

instance_to_predict = {i for i in range(probas_matrix.shape[0])}
class_predicted = set()
for item in order:
class_ = item % probas_matrix.shape[0]
instance = item // probas_matrix.shape[0]
if instance in instance_to_predict and class_ not in class_predicted:
y_pred_group[instance] = class_
instance_to_predict.remove(instance)
class_predicted.add(class_)

return y_pred_group


def _hungarian(probas_matrix):

costs = np.log(probas_matrix)
costs[costs == -np.inf] = 0 # if proba is 0, then the cost is 0
_, col_ind = linear_sum_assignment(costs, maximize=True)
col_ind = list(col_ind)

return col_ind
2 changes: 0 additions & 2 deletions sslearn/wrapper/_co.py
Original file line number Diff line number Diff line change
Expand Up @@ -109,8 +109,6 @@ def score(self, X, y, sample_weight=None):
score : float
Mean accuracy of ``self.predict(X)`` w.r.t. `y`.
"""
from .metrics import accuracy_score

return accuracy_score(y, self.predict(X), sample_weight=sample_weight)


Expand Down
31 changes: 29 additions & 2 deletions sslearn/wrapper/_self.py
Original file line number Diff line number Diff line change
@@ -1,11 +1,12 @@
import numpy as np
import pandas as pd
from scipy.stats import norm
from sklearn.base import BaseEstimator, ClassifierMixin
from sklearn.base import BaseEstimator, MetaEstimatorMixin
from sklearn.base import clone as skclone
from sklearn.neighbors import KNeighborsClassifier, kneighbors_graph
from sklearn.semi_supervised import SelfTrainingClassifier
from sklearn.utils import check_random_state, resample
from sklearn.metrics import accuracy_score

from sslearn.utils import calculate_prior_probability, check_classifier

Expand Down Expand Up @@ -124,7 +125,7 @@ def fit(self, X, y):
return super().fit(X, y_adapted)


class Setred(ClassifierMixin, BaseEstimator):
class Setred(BaseEstimator, MetaEstimatorMixin):
"""
**Self-training with Editing.**
----------------------------
Expand Down Expand Up @@ -365,3 +366,29 @@ def predict_proba(self, X, **kwards):
The predicted classes
"""
return self._base_estimator.predict_proba(X, **kwards)

def score(self, X, y, sample_weight=None):
    """
    Return the accuracy of ``self.predict(X)`` with respect to `y`.

    Parameters
    ----------
    X : array-like of shape (n_samples, n_features)
        Test samples.
    y : array-like of shape (n_samples,) or (n_samples, n_outputs)
        True labels for `X`.
    sample_weight : array-like of shape (n_samples,), default=None
        Sample weights, forwarded to `accuracy_score`.

    Returns
    -------
    score : float
        Value returned by `accuracy_score` for the predictions of `X`.
    """
    y_pred = self.predict(X)
    return accuracy_score(y, y_pred, sample_weight=sample_weight)
7 changes: 7 additions & 0 deletions test/test_model_selection.py
Original file line number Diff line number Diff line change
@@ -1,13 +1,20 @@
import os
import sys
import numpy as np
import pandas as pd
import pytest

sys.path.append(os.path.join(os.path.dirname(os.path.abspath(__file__)), ".."))

from sslearn.model_selection import (artificial_ssl_dataset, StratifiedKFoldSS)
from sklearn.datasets import load_iris

def test_artificial_ssl_dataset_with_pandas():
    """Check `artificial_ssl_dataset` with pandas inputs at a 10% label rate."""
    features, target = load_iris(return_X_y=True)
    X, y, X_unlabel, true_label = artificial_ssl_dataset(
        pd.DataFrame(features), pd.Series(target), label_rate=0.1
    )
    n_unlabeled = X_unlabel.shape[0]
    # One true label per unlabeled instance.
    assert n_unlabeled == true_label.shape[0]
    # 90% of the instances must end up unlabeled.
    assert n_unlabeled/X.shape[0] == pytest.approx(0.9)

def test_artificial_ssl_dataset():
X, y = load_iris(return_X_y=True)
X, y, X_unlabel, true_label = artificial_ssl_dataset(X, y, label_rate=0.1)
Expand Down

0 comments on commit 58ba5b7

Please sign in to comment.