From 4d5eee0dcc1e206cf11f9e0e2bc6fd4cd22fef81 Mon Sep 17 00:00:00 2001
From: Simone Balducci <93096843+sbaldu@users.noreply.github.com>
Date: Mon, 13 Jan 2025 10:57:20 +0100
Subject: [PATCH] Feature clusterer import method (#76)

* Add `import_clusterer` method

* Add test for new clusterer import

* Add docstring

* Fix "if main" of test file

* Addition to gitignore
---
 .gitignore                     |  1 +
 CLUEstering/CLUEstering.py     | 63 ++++++++++++++++++++++++++++------
 setup.py                       |  2 +-
 tests/test_clusterer_import.py | 47 +++++++++++++++++++++++++
 4 files changed, 102 insertions(+), 11 deletions(-)
 create mode 100644 tests/test_clusterer_import.py

diff --git a/.gitignore b/.gitignore
index 0c5de73e..97df7d11 100644
--- a/.gitignore
+++ b/.gitignore
@@ -23,3 +23,4 @@ vgcore.*
 *file2.csv
 *_output.csv
 *_passed.py
+*test_sissa_import.csv
diff --git a/CLUEstering/CLUEstering.py b/CLUEstering/CLUEstering.py
index 60afeccd..6e7a2440 100644
--- a/CLUEstering/CLUEstering.py
+++ b/CLUEstering/CLUEstering.py
@@ -713,7 +713,7 @@ def run_clue(self,
         cluster_ids = np.array(cluster_id_is_seed[0])
         is_seed = np.array(cluster_id_is_seed[1])
         clusters = np.unique(cluster_ids)
-        n_seeds = np.sum([1 for i in clusters if i > -1])
+        n_seeds = np.sum(is_seed)
         n_clusters = len(clusters)
 
         cluster_points = [[] for _ in range(n_clusters)]
@@ -1185,24 +1185,67 @@ def to_csv(self, output_folder: str, file_name: str) -> None:
         data = {}
         for i in range(self.clust_data.n_dim):
             data['x' + str(i)] = self.clust_data.coords.T[i]
+        data['weight'] = self.clust_data.weight
         data['cluster_ids'] = self.clust_prop.cluster_ids
         data['is_seed'] = self.clust_prop.is_seed
 
         df_ = pd.DataFrame(data)
         df_.to_csv(out_path,index=False)
 
+    def import_clusterer(self, input_folder: str, file_name: str) -> None:
+        """
+        Imports the results of a previous clustering from a csv file.
+
+        Parameters
+        ----------
+        input_folder : string
+            Path to the folder containing the file, ending with a separator.
+        file_name : string
+            Name of the file, with the '.csv' suffix.
+
+        Modified attributes
+        -------------------
+        clust_data : clustering_data
+            Properties of the input data.
+        clust_prop : cluster_properties
+            Properties of the clusters found.
+
+        Returns
+        -------
+        None
+        """
+
+        in_path = input_folder + file_name
+        df_ = pd.read_csv(in_path, dtype=float)
+        cluster_ids = np.asarray(df_["cluster_ids"], dtype=int)
+        is_seed = np.asarray(df_["is_seed"], dtype=int)
+        # Drop the result columns by name instead of relying on their position
+        self._handle_dataframe(df_.drop(columns=["cluster_ids", "is_seed"]))
+
+        clusters = np.unique(cluster_ids)
+        n_seeds = np.sum(is_seed)
+        n_clusters = len(clusters)
+
+        cluster_points = [[] for _ in range(n_clusters)]
+        for i in range(self.clust_data.n_points):
+            cluster_points[cluster_ids[i]].append(i)
+
+        points_per_cluster = np.array([len(clust) for clust in cluster_points])
+        self.clust_prop = cluster_properties(n_clusters,
+                                             n_seeds,
+                                             clusters,
+                                             cluster_ids,
+                                             is_seed,
+                                             np.asarray(cluster_points, dtype=object),
+                                             points_per_cluster,
+                                             df_)
+
 if __name__ == "__main__":
-    c = clusterer(0.8, 5, 1.)
-    c.read_data('./blob.csv')
+    c = clusterer(20., 10., 20.)
+    c.read_data('./sissa.csv')
     c.input_plotter()
     c.run_clue(backend="cpu serial", verbose=True)
-    # c.run_clue(backend="cpu tbb", verbose=True)
+    c.run_clue(backend="cpu tbb", verbose=True)
     # c.run_clue(backend="gpu cuda", verbose=True)
     # c.run_clue(backend="gpu hip", verbose=True)
     c.cluster_plotter()
-    # c.to_csv('./','sissa_output_tbb.csv')
-    c.list_devices('cpu serial')
-    c.list_devices('cpu tbb')
-    c.list_devices('gpu cuda')
-    c.list_devices('gpu hip')
-    c.list_devices()
diff --git a/setup.py b/setup.py
index 5e925577..7ccc5e06 100644
--- a/setup.py
+++ b/setup.py
@@ -4,7 +4,7 @@
 from setuptools import setup
 import subprocess
 
-__version__ = "2.3.2"
+__version__ = "2.3.2.1"
 
 this_directory = Path(__file__).parent
 long_description = (this_directory/'README.md').read_text()
diff --git a/tests/test_clusterer_import.py b/tests/test_clusterer_import.py
new file mode 100644
index 00000000..91b1475e
--- /dev/null
+++ b/tests/test_clusterer_import.py
@@ -0,0 +1,47 @@
+'''
+Test the import of a clusterer from csv file
+'''
+
+from check_result import check_result
+import os
+import sys
+import pandas as pd
+import pytest
+sys.path.insert(1, '../CLUEstering/')
+import CLUEstering as clue
+
+@pytest.fixture
+def sissa():
+    '''
+    Returns the sissa dataset as a dataframe (path relative to the working directory)
+    '''
+    return pd.read_csv("./test_datasets/sissa.csv")
+
+def test_clusterer_import(sissa):
+    '''
+    Export a clustering to csv, import it back and check that the properties are unchanged
+    '''
+    # Remove any output file left over from a previous run
+    if os.path.isfile('./test_sissa_import.csv'):
+        os.remove('./test_sissa_import.csv')
+
+    original = clue.clusterer(20., 10., 20.)
+    original.read_data(sissa)
+    original.run_clue()
+    original.to_csv('./', 'test_sissa_import.csv')
+
+    imported = clue.clusterer(20., 10., 20.)
+    imported.import_clusterer('./', 'test_sissa_import.csv')
+
+    assert original.clust_prop == imported.clust_prop
+
+if __name__ == "__main__":
+    original = clue.clusterer(20., 10., 20.)
+    original.read_data("./test_datasets/sissa.csv")
+    original.run_clue()
+    original.cluster_plotter()
+    original.to_csv('./', 'test_sissa_import.csv')
+    # Reload the exported file into a second clusterer and plot it
+    imported = clue.clusterer(20., 10., 20.)
+    imported.import_clusterer('./', 'test_sissa_import.csv')
+    imported.cluster_plotter()