From 1341ae8c0abc8ef9c8c2d695a6461c4a75ff9794 Mon Sep 17 00:00:00 2001 From: vc1492a Date: Mon, 28 Oct 2024 09:43:00 -0700 Subject: [PATCH 1/3] add python 3.13 test to workflows --- .github/workflows/tests.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/tests.yml b/.github/workflows/tests.yml index eeb6f20..2d619ba 100644 --- a/.github/workflows/tests.yml +++ b/.github/workflows/tests.yml @@ -47,7 +47,7 @@ jobs: strategy: fail-fast: false matrix: - python-version: ["3.7", "3.8", "3.9", "3.10", "3.11", "3.12"] + python-version: ["3.7", "3.8", "3.9", "3.10", "3.11", "3.12", "3.13"] steps: - uses: actions/checkout@v4 From 45e07f6f69915f04a3c9ca7b9e5982f3ce7bb82f Mon Sep 17 00:00:00 2001 From: vc1492a Date: Fri, 1 Nov 2024 14:08:54 -0700 Subject: [PATCH 2/3] some cleanup --- .gitignore | 1 + changelog.md | 4 ++++ 2 files changed, 5 insertions(+) diff --git a/.gitignore b/.gitignore index b80b260..8455419 100644 --- a/.gitignore +++ b/.gitignore @@ -9,6 +9,7 @@ PyNomaly/loop_dev.py *.pyc *.coverage.* .coveragerc +.pypirc # Byte-compiled / optimized / DLL files __pycache__/ diff --git a/changelog.md b/changelog.md index 4204181..46520a8 100644 --- a/changelog.md +++ b/changelog.md @@ -4,6 +4,10 @@ All notable changes to PyNomaly will be documented in this Changelog. The format is based on [Keep a Changelog](http://keepachangelog.com/en/1.0.0/) and adheres to [Semantic Versioning](http://semver.org/spec/v2.0.0.html). 
+## 0.3.4 +### Changed +- Changed source code as necessary to address a [user-reported issue](https://github.com/vc1492a/PyNomaly/issues/49), corrected in [this commit](https://github.com/vc1492a/PyNomaly/commit/bbdd12a318316ca9c7e0272a5b06909f3fc4f9b0) + ## 0.3.3 ### Changed - The implementation of the progress bar to support use when the number of From f89a99cefa37c0ff509545b7cf3d08a59e74ee1d Mon Sep 17 00:00:00 2001 From: vc1492a Date: Sat, 2 Nov 2024 20:32:11 -0700 Subject: [PATCH 3/3] add documentation to clarify distance matrix differences --- readme.md | 18 +++++++++++++++--- tests/test_loop.py | 44 ++++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 59 insertions(+), 3 deletions(-) diff --git a/readme.md b/readme.md index 0abfa6a..7ba30d7 100644 --- a/readme.md +++ b/readme.md @@ -38,7 +38,7 @@ This Python 3 implementation uses Numpy and the formulas outlined in to calculate the Local Outlier Probability of each sample. ## Dependencies -- Python 3.6 - 3.12 +- Python 3.6 - 3.13 - numpy >= 1.16.3 - python-utils >= 2.3.0 - (optional) numba >= 0.45.1 @@ -281,7 +281,12 @@ PyNomaly provides the ability to specify a distance matrix so that any distance metric can be used (a neighbor index matrix must also be provided). This can be useful when wanting to use a distance other than the euclidean. +Note that in order to maintain alignment with the LoOP definition of closest neighbors, +an additional neighbor is added when using [scikit-learn's NearestNeighbors](https://scikit-learn.org/1.5/modules/neighbors.html) since `NearestNeighbors` +includes the point itself when calculating the closest neighbors (whereas the LoOP method does not include distances to the point itself).
+ ```python +import numpy as np from sklearn.neighbors import NearestNeighbors data = np.array([ @@ -293,11 +298,18 @@ data = np.array([ [421.5, 90.3, 50.0] ]) -neigh = NearestNeighbors(n_neighbors=3, metric='hamming') +# Generate distance and neighbor matrices +n_neighbors = 3 # the number of neighbors according to the LoOP definition +neigh = NearestNeighbors(n_neighbors=n_neighbors+1, metric='hamming') neigh.fit(data) d, idx = neigh.kneighbors(data, return_distance=True) -m = loop.LocalOutlierProbability(distance_matrix=d, neighbor_matrix=idx, n_neighbors=3).fit() +# Remove self-distances - you MUST do this to preserve the same results as intended by the definition of LoOP +idx = np.delete(idx, 0, 1) +d = np.delete(d, 0, 1) + +# Fit and return scores +m = loop.LocalOutlierProbability(distance_matrix=d, neighbor_matrix=idx, n_neighbors=n_neighbors+1).fit() scores = m.local_outlier_probabilities ``` diff --git a/tests/test_loop.py b/tests/test_loop.py index ba453e9..ad97bde 100644 --- a/tests/test_loop.py +++ b/tests/test_loop.py @@ -790,3 +790,47 @@ def test_data_flipping() -> None: fit2.norm_prob_local_outlier_factor, decimal=6, ) + + +def test_distance_matrix_consistency(X_n120) -> None: + """ + Test to ensure that the distance matrix is consistent with the neighbor + matrix and that the software is able to handle self-distances.
+ :return: None + """ + + neigh = NearestNeighbors(metric='euclidean') + neigh.fit(X_n120) + distances, indices = neigh.kneighbors(X_n120, n_neighbors=11, return_distance=True) + + # remove the closest neighbor (it's the point itself) from each row in the indices matrix and distances matrix + indices = np.delete(indices, 0, 1) + distances = np.delete(distances, 0, 1) + + # Fit LoOP with and without distance matrix + clf_data = loop.LocalOutlierProbability(X_n120, n_neighbors=10) + clf_dist = loop.LocalOutlierProbability(distance_matrix=distances, neighbor_matrix=indices, n_neighbors=11) + + # Attempt to retrieve scores and check types + scores_data = clf_data.fit().local_outlier_probabilities + scores_dist = clf_dist.fit().local_outlier_probabilities + + # Debugging prints to investigate types and contents + print("Type of scores_data:", type(scores_data)) + print("Type of scores_dist:", type(scores_dist)) + print("Value of scores_data:", scores_data) + print("Value of scores_dist:", scores_dist) + print("Shape of scores_data:", scores_data.shape) + print("Shape of scores_dist:", scores_dist.shape) + + # Convert to arrays if they aren't already + scores_data = np.array(scores_data) if not isinstance(scores_data, np.ndarray) else scores_data + scores_dist = np.array(scores_dist) if not isinstance(scores_dist, np.ndarray) else scores_dist + + # Check shapes and types before assertion + assert scores_data.shape == scores_dist.shape, "Score shapes mismatch" + assert isinstance(scores_data, np.ndarray), "Expected scores_data to be a numpy array" + assert isinstance(scores_dist, np.ndarray), "Expected scores_dist to be a numpy array" + + # Compare scores allowing for minor floating-point differences + assert_array_almost_equal(scores_data, scores_dist, decimal=10, err_msg="Inconsistent LoOP scores due to self-distances")