Skip to content

Latest commit

 

History

History
195 lines (129 loc) · 5.2 KB

README.rst

File metadata and controls

195 lines (129 loc) · 5.2 KB

backedarray

Sparse csc and csr arrays backed by on-disk storage in Zarr or HDF5. Allows accessing slices of larger than memory arrays. Inspired by h5sparse and anndata.

Installation

pip install backedarray

Examples

import backedarray as ba
import scipy.sparse
import numpy as np
import h5py
import zarr

Create Dataset

csr_matrix = scipy.sparse.random(100, 50, format="csr", density=0.2)
dense_array = csr_matrix.toarray()

HDF5 Backend

# Write sparse matrix in csc or csr format to hdf5 file
h5_csr_path = 'csr.h5'
with h5py.File(h5_csr_path, "w") as f:
    ba.write_sparse(f.create_group("X"), csr_matrix)

Zarr Backend

# Write sparse matrix in csc or csr format to zarr file
zarr_csr_path = 'csr.zarr'
with zarr.open(zarr_csr_path, mode="w") as f:
    ba.write_sparse(f.create_group("X"), csr_matrix)

Read Dataset

HDF5 Backend

h5_csr_file = h5py.File(h5_csr_path, "r")
h5_csr_disk = ba.open(h5_csr_file["X"])

Zarr Backend

zarr_csr_disk = ba.open(zarr.open(zarr_csr_path)["X"])

Numpy Style Indexing

zarr_csr_disk[1:3].toarray()
array([[0.        , 0.        , 0.        , 0.        , 0.        ,
        0.        , 0.        , 0.        , 0.        , 0.        ,
        0.        , 0.        , 0.        , 0.06275782, 0.        ,
        0.        , 0.        , 0.        , 0.        , 0.        ,
        0.        , 0.61030855, 0.46886635, 0.        , 0.11597629,
        0.        , 0.        , 0.        , 0.23471198, 0.        ,
        0.        , 0.        , 0.        , 0.4911036 , 0.        ,
        0.        , 0.        , 0.        , 0.        , 0.00851426,
        0.        , 0.        , 0.        , 0.        , 0.        ,
        0.        , 0.        , 0.        , 0.        , 0.10065413],
       [0.        , 0.        , 0.        , 0.        , 0.        ,
        0.93545866, 0.        , 0.        , 0.        , 0.        ,
        0.26147665, 0.        , 0.99931215, 0.        , 0.        ,
        0.        , 0.        , 0.18532786, 0.        , 0.69309913,
        0.        , 0.        , 0.        , 0.        , 0.        ,
        0.        , 0.32219088, 0.        , 0.        , 0.        ,
        0.        , 0.        , 0.        , 0.        , 0.        ,
        0.        , 0.        , 0.14121076, 0.        , 0.        ,
        0.        , 0.        , 0.        , 0.        , 0.        ,
        0.70207481, 0.        , 0.        , 0.        , 0.        ]])
h5_csr_disk[2:].toarray()
array([[0.        , 0.        , 0.        , ..., 0.        , 0.        ,
        0.        ],
       [0.        , 0.        , 0.        , ..., 0.        , 0.        ,
        0.        ],
       [0.        , 0.89758627, 0.        , ..., 0.        , 0.        ,
        0.        ],
       ...,
       [0.        , 0.        , 0.        , ..., 0.        , 0.        ,
        0.        ],
       [0.        , 0.        , 0.        , ..., 0.        , 0.        ,
        0.        ],
       [0.81611075, 0.        , 0.        , ..., 0.82151986, 0.        ,
        0.        ]])
h5_csr_disk[...].toarray()
array([[0.        , 0.45873864, 0.        , ..., 0.        , 0.        ,
        0.        ],
       [0.        , 0.        , 0.        , ..., 0.        , 0.        ,
        0.10065413],
       [0.        , 0.        , 0.        , ..., 0.        , 0.        ,
        0.        ],
       ...,
       [0.        , 0.        , 0.        , ..., 0.        , 0.        ,
        0.        ],
       [0.        , 0.        , 0.        , ..., 0.        , 0.        ,
        0.        ],
       [0.81611075, 0.        , 0.        , ..., 0.82151986, 0.        ,
        0.        ]])
h5_csr_file.close()

Append

zarr_csr_disk.append(csr_matrix)
np.testing.assert_array_equal(zarr_csr_disk[...].toarray(), scipy.sparse.vstack((csr_matrix, csr_matrix)).toarray())

Read h5ad files created using anndata

%%bash
if [ ! -f "pbmc3k.h5ad" ]; then
    wget -q https://raw.githubusercontent.com/chanzuckerberg/cellxgene/main/example-dataset/pbmc3k.h5ad
fi
import anndata.experimental
with h5py.File('pbmc3k.h5ad', 'r') as f:
    obs = anndata.experimental.read_elem(f['obs'])
    var = anndata.experimental.read_elem(f['var'])
    X = ba.open(f['X'])