Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Add dump_sparse() method to DataSet #126

Merged
merged 1 commit into from
Mar 1, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
8 changes: 2 additions & 6 deletions CMakeLists.txt
Original file line number Diff line number Diff line change
@@ -1,8 +1,8 @@
cmake_minimum_required(VERSION 3.23..3.27)
cmake_minimum_required(VERSION 3.23..3.29)

project(
hammingdist
VERSION 1.1.0
VERSION 1.2.0
LANGUAGES CXX)

include(CTest)
Expand Down Expand Up @@ -53,10 +53,6 @@ add_subdirectory(ext)
# Build the hamming library
add_subdirectory(src)

# The standalone executable
add_executable(distance distance.cc)
target_link_libraries(distance PUBLIC hamming)

# The python package
if(HAMMING_BUILD_PYTHON)
add_subdirectory(python)
Expand Down
13 changes: 10 additions & 3 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -36,7 +36,13 @@ data = hammingdist.from_fasta("example.fasta", include_x=True)

# The distance data can be accessed point-wise, though looping over all distances might be quite inefficient
print(data[14,42])
```

## Output formats

The constructed distances matrix can then be written to disk in several different formats:

```python
# The data can be written to disk in csv format (default `distance` Ripser format) and retrieved:
data.dump("backup.csv")
retrieval = hammingdist.from_csv("backup.csv")
Expand All @@ -45,12 +51,13 @@ retrieval = hammingdist.from_csv("backup.csv")
data.dump_lower_triangular("lt.txt")
retrieval = hammingdist.from_lower_triangular("lt.txt")

# Or in sparse format (`sparse` Ripser format: a space-delimited triplet `i j d(i,j)`,
# with one line for each pair i > j whose distance is not above the threshold):
data.dump_sparse("sparse.txt", threshold=3)

# If the `remove_duplicates` option was used, the sequence indices can also be written.
# For each input sequence, this prints the corresponding index in the output:
data.dump_sequence_indices("indices.txt")

# Finally, we can pass the data as a list of strings in Python:
data = hammingdist.from_stringlist(["ACGTACGT", "ACGTAGGT", "ATTTACGT"])
```

## Duplicates
Expand Down
20 changes: 0 additions & 20 deletions distance.cc

This file was deleted.

41 changes: 41 additions & 0 deletions include/hamming/hamming.hh
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,11 @@
#include "hamming/hamming_impl.hh"
#include "hamming/hamming_types.hh"
#include "hamming/hamming_utils.hh"
#ifdef HAMMING_WITH_OPENMP
#include <omp.h>
#endif
#include <cmath>
#include <fmt/core.h>
#include <fstream>
#include <sstream>
#include <string>
Expand Down Expand Up @@ -72,6 +76,43 @@ template <typename DistIntType> struct DataSet {
write_lower_triangular(filename, result);
}

// Writes the distances matrix to `filename` in sparse format: one
// space-delimited line "i j d(i,j)" for each lower-triangular entry
// (i > j) whose distance is not above `threshold`.
// Entries above the threshold are omitted entirely, so the file may be empty.
void dump_sparse(const std::string &filename, int threshold) {
  std::ofstream stream(filename);
#ifdef HAMMING_WITH_OPENMP
  // Parallel path: each loop iteration formats a contiguous chunk of
  // `samples_per_thread` rows into a thread-local string, then appends it
  // to the stream under a critical section.
  // NOTE(review): chunks reach the critical section in nondeterministic
  // order, so line order in the output file can vary between runs (each
  // chunk's lines stay contiguous and internally ordered).
  // NOTE(review): `result`/`nsamples` are class members and
  // `samples_per_thread` is constexpr — listing these in a data-sharing
  // clause is compiler-sensitive under OpenMP's rules; confirm this
  // compiles on all supported toolchains.
  constexpr std::size_t samples_per_thread{200};
#pragma omp parallel for schedule(static, 1) default(none)                     \
    shared(result, stream, nsamples, samples_per_thread, threshold)
  for (std::size_t i_start = 1; i_start < nsamples;
       i_start += samples_per_thread) {
    std::size_t i_end{std::min(i_start + samples_per_thread, nsamples)};
    // `result` stores the strict lower triangle row-major: row i begins at
    // flat index i*(i-1)/2 and holds i entries (j = 0..i-1), as the serial
    // loop below demonstrates. Start this chunk's pointer at row i_start.
    std::size_t offset{i_start * (i_start - 1) / 2};
    auto *d = result.data() + offset;
    std::string thread_local_lines;
    for (std::size_t i = i_start; i < i_end; ++i) {
      for (std::size_t j = 0; j < i; ++j) {
        if (*d <= threshold) {
          // cast: DistIntType may be uint8_t, which fmt would format as a char
          thread_local_lines.append(
              fmt::format("{} {} {}\n", i, j, static_cast<int>(*d)));
        }
        ++d;
      }
    }
#pragma omp critical
    stream << thread_local_lines;
  }
#else
  // Serial fallback: walk the lower-triangular distances in storage order,
  // writing qualifying entries directly to the stream.
  auto *d = result.data();
  for (std::size_t i = 1; i < nsamples; ++i) {
    for (std::size_t j = 0; j < i; ++j) {
      if (*d <= threshold) {
        stream << fmt::format("{} {} {}\n", i, j, static_cast<int>(*d));
      }
      ++d;
    }
  }
#endif
}

void dump_sequence_indices(const std::string &filename) {
std::ofstream stream(filename);
if (sequence_indices.empty()) {
Expand Down
2 changes: 1 addition & 1 deletion pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,7 @@ build-backend = "scikit_build_core.build"

[project]
name = "hammingdist"
version = "1.1.0"
version = "1.2.0"
description = "A fast tool to calculate Hamming distances"
readme = "README.md"
license = {text = "MIT"}
Expand Down
8 changes: 8 additions & 0 deletions python/hammingdist.cc
Original file line number Diff line number Diff line change
Expand Up @@ -35,6 +35,10 @@ PYBIND11_MODULE(hammingdist, m) {
&DataSet<DefaultDistIntType>::dump_lower_triangular,
"Dump distances matrix in lower triangular format (comma-delimited, "
"row-major)")
.def("dump_sparse", &DataSet<DefaultDistIntType>::dump_sparse,
py::arg("filename"), py::arg("threshold") = 255,
"Dump distances matrix in sparse format excluding any distances "
"above threshold")
.def("dump_sequence_indices",
&DataSet<DefaultDistIntType>::dump_sequence_indices,
"Dump row index in distances matrix for each input sequence")
Expand All @@ -47,6 +51,10 @@ PYBIND11_MODULE(hammingdist, m) {
.def("dump_lower_triangular", &DataSet<uint16_t>::dump_lower_triangular,
"Dump distances matrix in lower triangular format (comma-delimited, "
"row-major)")
.def("dump_sparse", &DataSet<uint16_t>::dump_sparse, py::arg("filename"),
py::arg("threshold") = 65535,
"Dump distances matrix in sparse format excluding any distances "
"above threshold")
.def("dump_sequence_indices", &DataSet<uint16_t>::dump_sequence_indices,
"Dump row index in distances matrix for each input sequence")
.def("__getitem__", &DataSet<uint16_t>::operator[])
Expand Down
30 changes: 30 additions & 0 deletions python/tests/test_hammingdist.py
Original file line number Diff line number Diff line change
Expand Up @@ -162,3 +162,33 @@ def test_from_fasta_to_lower_triangular(tmp_path, max_distance):
assert len(data) == 2
assert np.allclose(np.fromstring(data[0], sep=","), lower_triangular_dist[0])
assert np.allclose(np.fromstring(data[1], sep=","), lower_triangular_dist[1])


@pytest.mark.parametrize("samples", [2, 3, 5, 11, 54, 120, 532, 981, 1568])
@pytest.mark.parametrize("threshold", [0, 1, 2, 3, 4, 9, 89, 497])
def test_dump_sparse(tmp_path, samples, threshold):
    """dump_sparse writes one `i j d(i,j)` line per lower-triangular entry
    whose distance does not exceed threshold, and an empty file otherwise."""
    # Build a random lower-triangular distances file: row n has n
    # comma-separated entries with values in [0, 255].
    lt_file = str(tmp_path / "lt.txt")
    with open(lt_file, "w") as f:
        for n in range(1, samples):
            f.write(
                ",".join(
                    str(x)
                    for x in np.random.randint(low=0, high=256, size=n, dtype=np.uint32)
                )
            )
            f.write("\n")
    data = hammingdist.from_lower_triangular(lt_file)
    # expected number of sparse entries
    num_below_threshold = np.sum(np.array(data._distances) <= threshold)
    sparse_file = str(tmp_path / "sparse.txt")
    # Fix: previously dump_sparse was only called when num_below_threshold > 0,
    # leaving the empty-output path untested. Always dump, then branch on the
    # expected content.
    data.dump_sparse(sparse_file, threshold)
    if num_below_threshold == 0:
        # No entry is within threshold: the file is created but empty.
        with open(sparse_file) as f:
            assert f.read() == ""
    else:
        sparse = np.loadtxt(sparse_file, delimiter=" ", dtype=np.uint32)
        # np.loadtxt collapses a single-line file to a 1-d array; normalize
        if sparse.ndim == 1:
            sparse = np.array([sparse])
        assert sparse.shape[0] == num_below_threshold
        assert sparse.shape[1] == 3
        # each sparse entry has the correct distance, and is not above threshold
        for e in sparse:
            assert e[2] <= threshold
            assert data[e[0], e[1]] == e[2]