Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Add dump_sparse() method to DataSet #126

Merged
merged 1 commit into from
Mar 1, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
8 changes: 2 additions & 6 deletions CMakeLists.txt
Original file line number Diff line number Diff line change
@@ -1,8 +1,8 @@
cmake_minimum_required(VERSION 3.23..3.27)
cmake_minimum_required(VERSION 3.23..3.29)

project(
hammingdist
VERSION 1.1.0
VERSION 1.2.0
LANGUAGES CXX)

include(CTest)
Expand Down Expand Up @@ -53,10 +53,6 @@ add_subdirectory(ext)
# Build the hamming library
add_subdirectory(src)

# The standalone executable
add_executable(distance distance.cc)
target_link_libraries(distance PUBLIC hamming)

# The python package
if(HAMMING_BUILD_PYTHON)
add_subdirectory(python)
Expand Down
13 changes: 10 additions & 3 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -36,7 +36,13 @@ data = hammingdist.from_fasta("example.fasta", include_x=True)

# The distance data can be accessed point-wise, though looping over all distances might be quite inefficient
print(data[14,42])
```

## Output formats

The constructed distances matrix can then be written to disk in several different formats:

```python
# The data can be written to disk in csv format (default `distance` Ripser format) and retrieved:
data.dump("backup.csv")
retrieval = hammingdist.from_csv("backup.csv")
Expand All @@ -45,12 +51,13 @@ retrieval = hammingdist.from_csv("backup.csv")
data.dump_lower_triangular("lt.txt")
retrieval = hammingdist.from_lower_triangular("lt.txt")

# Or in sparse format (`sparse` Ripser format: a space-delimited triplet `i j d(i,j)`,
# with one line for each pair i > j whose distance is not above the threshold):
data.dump_sparse("sparse.txt", threshold=3)

# If the `remove_duplicates` option was used, the sequence indices can also be written.
# For each input sequence, this prints the corresponding index in the output:
data.dump_sequence_indices("indices.txt")

# Finally, we can pass the data as a list of strings in Python:
data = hammingdist.from_stringlist(["ACGTACGT", "ACGTAGGT", "ATTTACGT"])
```

## Duplicates
Expand Down
20 changes: 0 additions & 20 deletions distance.cc

This file was deleted.

41 changes: 41 additions & 0 deletions include/hamming/hamming.hh
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,11 @@
#include "hamming/hamming_impl.hh"
#include "hamming/hamming_types.hh"
#include "hamming/hamming_utils.hh"
#ifdef HAMMING_WITH_OPENMP
#include <omp.h>
#endif
#include <cmath>
#include <fmt/core.h>
#include <fstream>
#include <sstream>
#include <string>
Expand Down Expand Up @@ -72,6 +76,43 @@ template <typename DistIntType> struct DataSet {
write_lower_triangular(filename, result);
}

// Writes the distances matrix to `filename` in sparse format: one
// space-delimited line "i j d(i,j)" for each lower-triangular entry
// (i > j) whose distance is not above `threshold`.
// Entries above the threshold are omitted entirely, so the file may be empty.
void dump_sparse(const std::string &filename, int threshold) {
  std::ofstream stream(filename);
#ifdef HAMMING_WITH_OPENMP
  // Parallel path: each loop iteration formats a contiguous chunk of
  // `samples_per_thread` rows into a thread-local string, then appends it
  // to the stream under a critical section.
  // NOTE(review): chunks reach the critical section in nondeterministic
  // order, so line order in the output file can vary between runs (each
  // chunk's lines stay contiguous and internally ordered).
  // NOTE(review): `result`/`nsamples` are class members and
  // `samples_per_thread` is constexpr — listing these in a data-sharing
  // clause is compiler-sensitive under OpenMP's rules; confirm this
  // compiles on all supported toolchains.
  constexpr std::size_t samples_per_thread{200};
#pragma omp parallel for schedule(static, 1) default(none)                     \
    shared(result, stream, nsamples, samples_per_thread, threshold)
  for (std::size_t i_start = 1; i_start < nsamples;
       i_start += samples_per_thread) {
    std::size_t i_end{std::min(i_start + samples_per_thread, nsamples)};
    // `result` stores the strict lower triangle row-major: row i begins at
    // flat index i*(i-1)/2 and holds i entries (j = 0..i-1), as the serial
    // loop below demonstrates. Start this chunk's pointer at row i_start.
    std::size_t offset{i_start * (i_start - 1) / 2};
    auto *d = result.data() + offset;
    std::string thread_local_lines;
    for (std::size_t i = i_start; i < i_end; ++i) {
      for (std::size_t j = 0; j < i; ++j) {
        if (*d <= threshold) {
          // cast: DistIntType may be uint8_t, which fmt would format as a char
          thread_local_lines.append(
              fmt::format("{} {} {}\n", i, j, static_cast<int>(*d)));
        }
        ++d;
      }
    }
#pragma omp critical
    stream << thread_local_lines;
  }
#else
  // Serial fallback: walk the lower-triangular distances in storage order,
  // writing qualifying entries directly to the stream.
  auto *d = result.data();
  for (std::size_t i = 1; i < nsamples; ++i) {
    for (std::size_t j = 0; j < i; ++j) {
      if (*d <= threshold) {
        stream << fmt::format("{} {} {}\n", i, j, static_cast<int>(*d));
      }
      ++d;
    }
  }
#endif
}

void dump_sequence_indices(const std::string &filename) {
std::ofstream stream(filename);
if (sequence_indices.empty()) {
Expand Down
2 changes: 1 addition & 1 deletion pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,7 @@ build-backend = "scikit_build_core.build"

[project]
name = "hammingdist"
version = "1.1.0"
version = "1.2.0"
description = "A fast tool to calculate Hamming distances"
readme = "README.md"
license = {text = "MIT"}
Expand Down
8 changes: 8 additions & 0 deletions python/hammingdist.cc
Original file line number Diff line number Diff line change
Expand Up @@ -35,6 +35,10 @@ PYBIND11_MODULE(hammingdist, m) {
&DataSet<DefaultDistIntType>::dump_lower_triangular,
"Dump distances matrix in lower triangular format (comma-delimited, "
"row-major)")
.def("dump_sparse", &DataSet<DefaultDistIntType>::dump_sparse,
py::arg("filename"), py::arg("threshold") = 255,
"Dump distances matrix in sparse format excluding any distances "
"above threshold")
.def("dump_sequence_indices",
&DataSet<DefaultDistIntType>::dump_sequence_indices,
"Dump row index in distances matrix for each input sequence")
Expand All @@ -47,6 +51,10 @@ PYBIND11_MODULE(hammingdist, m) {
.def("dump_lower_triangular", &DataSet<uint16_t>::dump_lower_triangular,
"Dump distances matrix in lower triangular format (comma-delimited, "
"row-major)")
.def("dump_sparse", &DataSet<uint16_t>::dump_sparse, py::arg("filename"),
py::arg("threshold") = 65535,
"Dump distances matrix in sparse format excluding any distances "
"above threshold")
.def("dump_sequence_indices", &DataSet<uint16_t>::dump_sequence_indices,
"Dump row index in distances matrix for each input sequence")
.def("__getitem__", &DataSet<uint16_t>::operator[])
Expand Down
30 changes: 30 additions & 0 deletions python/tests/test_hammingdist.py
Original file line number Diff line number Diff line change
Expand Up @@ -162,3 +162,33 @@ def test_from_fasta_to_lower_triangular(tmp_path, max_distance):
assert len(data) == 2
assert np.allclose(np.fromstring(data[0], sep=","), lower_triangular_dist[0])
assert np.allclose(np.fromstring(data[1], sep=","), lower_triangular_dist[1])


@pytest.mark.parametrize("samples", [2, 3, 5, 11, 54, 120, 532, 981, 1568])
@pytest.mark.parametrize("threshold", [0, 1, 2, 3, 4, 9, 89, 497])
def test_dump_sparse(tmp_path, samples, threshold):
    """dump_sparse writes one `i j d(i,j)` line per lower-triangular entry
    whose distance does not exceed threshold, and an empty file otherwise."""
    # Build a random lower-triangular distances file: row n has n
    # comma-separated entries with values in [0, 255].
    lt_file = str(tmp_path / "lt.txt")
    with open(lt_file, "w") as f:
        for n in range(1, samples):
            f.write(
                ",".join(
                    str(x)
                    for x in np.random.randint(low=0, high=256, size=n, dtype=np.uint32)
                )
            )
            f.write("\n")
    data = hammingdist.from_lower_triangular(lt_file)
    # expected number of sparse entries
    num_below_threshold = np.sum(np.array(data._distances) <= threshold)
    sparse_file = str(tmp_path / "sparse.txt")
    # Fix: previously dump_sparse was only called when num_below_threshold > 0,
    # leaving the empty-output path untested. Always dump, then branch on the
    # expected content.
    data.dump_sparse(sparse_file, threshold)
    if num_below_threshold == 0:
        # No entry is within threshold: the file is created but empty.
        with open(sparse_file) as f:
            assert f.read() == ""
    else:
        sparse = np.loadtxt(sparse_file, delimiter=" ", dtype=np.uint32)
        # np.loadtxt collapses a single-line file to a 1-d array; normalize
        if sparse.ndim == 1:
            sparse = np.array([sparse])
        assert sparse.shape[0] == num_below_threshold
        assert sparse.shape[1] == 3
        # each sparse entry has the correct distance, and is not above threshold
        for e in sparse:
            assert e[2] <= threshold
            assert data[e[0], e[1]] == e[2]