From 4d5eee0dcc1e206cf11f9e0e2bc6fd4cd22fef81 Mon Sep 17 00:00:00 2001
From: Simone Balducci <93096843+sbaldu@users.noreply.github.com>
Date: Mon, 13 Jan 2025 10:57:20 +0100
Subject: [PATCH] Feature clusterer import method (#76)

* Add `import_clusterer` method
* Add test for new clusterer import
* Add docstring
* Fix "if main" of test file
* Addition to gitignore
---
Review notes (not part of the commit message; ignored by `git am`):
- This patch had been flattened onto a few physical lines. The line structure
  below was restored so that every hunk matches its `@@` line counts and the
  diffstat. Leading indentation and the position of blank context lines were
  reconstructed from Python syntax, so run `git apply --check` before relying
  on it.
- `to_csv` now also writes a `weight` column. `import_clusterer` depends on
  that column order: it reads `cluster_ids` and `is_seed` by name, then passes
  every column except the last two to `_handle_dataframe`.
- `import_clusterer` builds its path as `input_folder + file_name`, so
  `input_folder` must end with a path separator (the test passes './').
- `n_seeds` is computed as `np.sum(is_seed)` in both `run_clue` and
  `import_clusterer`.

 .gitignore                     |  1 +
 CLUEstering/CLUEstering.py     | 63 ++++++++++++++++++++++++++++------
 setup.py                       |  2 +-
 tests/test_clusterer_import.py | 47 +++++++++++++++++++++++++
 4 files changed, 102 insertions(+), 11 deletions(-)
 create mode 100644 tests/test_clusterer_import.py

diff --git a/.gitignore b/.gitignore
index 0c5de73e..97df7d11 100644
--- a/.gitignore
+++ b/.gitignore
@@ -23,3 +23,4 @@ vgcore.*
 *file2.csv
 *_output.csv
 *_passed.py
+*test_sissa_import.csv
diff --git a/CLUEstering/CLUEstering.py b/CLUEstering/CLUEstering.py
index 60afeccd..6e7a2440 100644
--- a/CLUEstering/CLUEstering.py
+++ b/CLUEstering/CLUEstering.py
@@ -713,7 +713,7 @@ def run_clue(self,
         cluster_ids = np.array(cluster_id_is_seed[0])
         is_seed = np.array(cluster_id_is_seed[1])
         clusters = np.unique(cluster_ids)
-        n_seeds = np.sum([1 for i in clusters if i > -1])
+        n_seeds = np.sum(is_seed)
         n_clusters = len(clusters)
 
         cluster_points = [[] for _ in range(n_clusters)]
@@ -1185,24 +1185,67 @@ def to_csv(self, output_folder: str, file_name: str) -> None:
         data = {}
         for i in range(self.clust_data.n_dim):
             data['x' + str(i)] = self.clust_data.coords.T[i]
+        data['weight'] = self.clust_data.weight
         data['cluster_ids'] = self.clust_prop.cluster_ids
         data['is_seed'] = self.clust_prop.is_seed
 
         df_ = pd.DataFrame(data)
         df_.to_csv(out_path,index=False)
 
+    def import_clusterer(self, input_folder: str, file_name: str) -> None:
+        """
+        Imports the results of a previous clustering.
+
+        Parameters
+        ----------
+        input_folder : string
+            Full path to the folder containing the file.
+        file_name : string
+            Name of the file, with the '.csv' suffix.
+
+        Modified attributes
+        -------------------
+        clust_data : clustering_data
+            Properties of the input data.
+        clust_prop : cluster_properties
+            Properties of the clusters found.
+
+        Returns
+        -------
+        None
+        """
+
+        in_path = input_folder + file_name
+        df_ = pd.read_csv(in_path, dtype=float)
+        cluster_ids = np.asarray(df_["cluster_ids"], dtype=int)
+        is_seed = np.array(df_["is_seed"], dtype=int)
+
+        self._handle_dataframe(df_.iloc[:, :-2])
+
+        clusters = np.unique(cluster_ids)
+        n_seeds = np.sum(is_seed)
+        n_clusters = len(clusters)
+
+        cluster_points = [[] for _ in range(n_clusters)]
+        for i in range(self.clust_data.n_points):
+            cluster_points[cluster_ids[i]].append(i)
+
+        points_per_cluster = np.array([len(clust) for clust in cluster_points])
+        self.clust_prop = cluster_properties(n_clusters,
+                                             n_seeds,
+                                             clusters,
+                                             cluster_ids,
+                                             is_seed,
+                                             np.asarray(cluster_points, dtype=object),
+                                             points_per_cluster,
+                                             df_)
+
 if __name__ == "__main__":
-    c = clusterer(0.8, 5, 1.)
-    c.read_data('./blob.csv')
+    c = clusterer(20., 10., 20.)
+    c.read_data('./sissa.csv')
     c.input_plotter()
     c.run_clue(backend="cpu serial", verbose=True)
-    # c.run_clue(backend="cpu tbb", verbose=True)
+    c.run_clue(backend="cpu tbb", verbose=True)
     # c.run_clue(backend="gpu cuda", verbose=True)
     # c.run_clue(backend="gpu hip", verbose=True)
     c.cluster_plotter()
-    # c.to_csv('./','sissa_output_tbb.csv')
-    c.list_devices('cpu serial')
-    c.list_devices('cpu tbb')
-    c.list_devices('gpu cuda')
-    c.list_devices('gpu hip')
-    c.list_devices()
diff --git a/setup.py b/setup.py
index 5e925577..7ccc5e06 100644
--- a/setup.py
+++ b/setup.py
@@ -4,7 +4,7 @@ from setuptools import setup
 import subprocess
 
-__version__ = "2.3.2"
+__version__ = "2.3.2.1"
 
 this_directory = Path(__file__).parent
 long_description = (this_directory/'README.md').read_text()
 
diff --git a/tests/test_clusterer_import.py b/tests/test_clusterer_import.py
new file mode 100644
index 00000000..91b1475e
--- /dev/null
+++ b/tests/test_clusterer_import.py
@@ -0,0 +1,47 @@
+'''
+Test the import of a clusterer from csv file
+'''
+
+from check_result import check_result
+import os
+import sys
+import pandas as pd
+import pytest
+sys.path.insert(1, '../CLUEstering/')
+import CLUEstering as clue
+
+@pytest.fixture
+def sissa():
+    '''
+    Returns the dataframe containing the sissa dataset
+    '''
+    return pd.read_csv("./test_datasets/sissa.csv")
+
+def test_clusterer_import(sissa):
+    '''
+    Try importing a clusterer from csv file and check that it's equal to the original clusterer
+    '''
+    # Check if the output file already exists and if it does, delete it
+    if os.path.isfile('./test_sissa_import.csv'):
+        os.remove('./test_sissa_import.csv')
+
+    c = clue.clusterer(20., 10., 20.)
+    c.read_data(sissa)
+    c.run_clue()
+    c.to_csv('./', 'test_sissa_import.csv')
+
+    d = clue.clusterer(20., 10., 20.)
+    d.import_clusterer('./', 'test_sissa_import.csv')
+
+    assert c.clust_prop == d.clust_prop
+
+if __name__ == "__main__":
+    c = clue.clusterer(20., 10., 20.)
+    c.read_data("./test_datasets/sissa.csv")
+    c.run_clue()
+    c.cluster_plotter()
+    c.to_csv('./', 'test_sissa_import.csv')
+
+    d = clue.clusterer(20., 10., 20.)
+    d.import_clusterer('./', 'test_sissa_import.csv')
+    d.cluster_plotter()