Skip to content

Commit

Permalink
Prepare v0.4.0: Maintenance
Browse files Browse the repository at this point in the history
  • Loading branch information
JWatter committed Jul 15, 2024
1 parent ce8478c commit 457404e
Show file tree
Hide file tree
Showing 33 changed files with 177 additions and 116 deletions.
4 changes: 2 additions & 2 deletions docsource/download_notebooks.py
Original file line number Diff line number Diff line change
Expand Up @@ -18,8 +18,8 @@
with urllib.request.urlopen(repo_url) as f:
response = f.read().decode('utf-8')

pieces = response.split('"notebooks/')[1:]
basenames = [p.split('.ipynb"')[0] for p in pieces]
pieces = response.split('.ipynb">')[1:]
basenames = [p.split('.ipynb</a>')[0] for p in pieces]

# download all notebooks

Expand Down
34 changes: 34 additions & 0 deletions docsource/release_notes/release_notes_0.4.0.rst
Original file line number Diff line number Diff line change
@@ -0,0 +1,34 @@
TACCO 0.4.0 (2024-07-16)
========================

Fixes
-----

- Fix incompatibility with new scipy version 1.14.0 `#20 <https://github.com/simonwm/tacco/issues/20>`__

Breaking changes
----------------

- Adapted to work with anndata>=0.9.0 and pandas>=2.0.0. This involves changes in the handling of dtypes and (floating point) precision, which leads to "small" numerical changes in the results.

- Stronger preservation of input dtype in :func:`tacco.utils.split_beads` with downstream effects on :func:`tacco.tools.split_observations` and :func:`tacco.preprocessing.normalize_platform`.

- :func:`tacco.tools.enrichments`, :func:`tacco.tools.get_contributions`, :func:`tacco.tools.get_compositions`: Always work with float64 to avoid rounding errors as far as possible.

Miscellaneous
-------------

- :func:`tacco.tools.annotate`: Report a clear error message when using bisectioning with integer data

- :func:`tacco.utils.row_scale`, :func:`tacco.utils.col_scale`: Report a clear error message when rescaling integer data in place with floating-point rescaling factors

- :func:`tacco.tools.enrichments`, :func:`tacco.tools.get_contributions`: Deprecate on-the-fly sample split in favour of explicit use of :func:`tacco.utils.split_spatial_samples`.

- Handled pandas `FutureWarning: The default of observed=False is deprecated and will be changed to True in a future version of pandas. Pass observed=False to retain current behavior or observed=True to adopt the future default and silence this warning.`

- Handled anndata `FutureWarning: The dtype argument is deprecated and will be removed in late 2024.` Requires anndata>=0.9.0

- Handled anndata `FutureWarning: Use anndata.concat instead of AnnData.concatenate, AnnData.concatenate is deprecated and will be removed in the future. See the tutorial for concat at: https://anndata.readthedocs.io/en/latest/concatenation.html`

- Handled sklearn ``FutureWarning: The default value of `dual` will change from `True` to `'auto'` in 1.5. Set the value of `dual` explicitly to suppress the warning.``

8 changes: 7 additions & 1 deletion docsource/release_notes/template_release_notes.rst
Original file line number Diff line number Diff line change
Expand Up @@ -16,11 +16,17 @@ Features
- Add :func:`tacco.tools.other_new_func` to implement feature request.

Fixes
--------
-----
.. include resolution of bugs and very unintuitive behaviour here
- Fix some issue `#9 <https://github.com/simonwm/tacco/issues/9>`__

Breaking changes
----------------
.. include changes which alter previous behaviour here
- Support for new AnnData version

Documentation
-------------
.. include documentation updates here
Expand Down
3 changes: 1 addition & 2 deletions environment.yml
Original file line number Diff line number Diff line change
Expand Up @@ -9,12 +9,11 @@ dependencies:
- numba>=0.51.2
- numpy
- matplotlib!=3.7.0
- pyqt!=5.9.2
- seaborn
- sparse_dot_mkl>=0.7.3
- scanpy>=1.7.0
- statsmodels
- anndata
- anndata>=0.9.0
- pandas>=1.1.0
- scipy>=1.6.0
- mkl
Expand Down
3 changes: 1 addition & 2 deletions pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -32,7 +32,6 @@ dynamic = ["version"]

dependencies = [
"requests",
"importlib; python_version == '2.6'",
"joblib",
"numba>=0.51.2",
"numpy",
Expand All @@ -41,7 +40,7 @@ dependencies = [
"sparse_dot_mkl>=0.7.3",
"scanpy>=1.7.0",
"statsmodels",
"anndata",
"anndata>=0.9.0",
"pandas>=1.1.0",
"scipy>=1.6.0",
"mkl",
Expand Down
12 changes: 6 additions & 6 deletions tacco/plots/_plots.py
Original file line number Diff line number Diff line change
Expand Up @@ -1111,7 +1111,7 @@ def scatter(
raise ValueError(f'The `group_key` {group_key!r} is not `None`, but `adata` is not a single `AnnData` instance!')
if group_key not in adata.obs:
raise ValueError(f'The `group_key` {group_key!r} is not available in `adata.obs`!')
adata = { c: adata[df.index] for c, df in adata.obs.groupby(group_key) if len(df) > 0 }
adata = { c: adata[df.index] for c, df in adata.obs.groupby(group_key, observed=False) if len(df) > 0 }

typing_data, adatas, methods, types, colors, coords = _validate_scatter_args(adata, position_key, keys, colors, show_only, method_labels=method_labels, counts_location=counts_location, compositional=compositional)
n_solutions, n_samples, n_types = len(typing_data), len(adatas), len(types)
Expand Down Expand Up @@ -3608,7 +3608,7 @@ def annotated_heatmap(
raise ValueError(f'`var_key` {var_key!r} is not a column of `adata.var`!')
if not hasattr(adata.var[var_key], 'cat'):
print(f'WARNING: `var_key` {var_key!r} is not a categorical column of `adata.var`! Treating it as a categorical column...')
marker = {c: df.index for c,df in adata.var.groupby(var_key)}
marker = {c: df.index for c,df in adata.var.groupby(var_key, observed=False)}

all_marker = [c for l,m in marker.items() for c in m]
# reorder genes to represent the annotation
Expand All @@ -3631,7 +3631,7 @@ def annotated_heatmap(
if not hasattr(adata.obs[obs_key], 'cat'):
print(f'WARNING: `obs_key` {obs_key!r} is not a categorical column of `adata.obs`! Treating it as a categorical column...')

cells = {c: df.index for c,df in adata.obs.groupby(obs_key)}
cells = {c: df.index for c,df in adata.obs.groupby(obs_key, observed=False)}
all_cells = [c for l,m in cells.items() for c in m]
# reorder cells to represent the annotation
adata = adata[all_cells]
Expand Down Expand Up @@ -3846,7 +3846,7 @@ def annotation_coordinate(
if group_key is None:
group_adatas = {'':adata}
else:
group_adatas = {group:adata[df.index] for group,df in adata.obs.groupby(group_key) if len(df)>0 }
group_adatas = {group:adata[df.index] for group,df in adata.obs.groupby(group_key, observed=False) if len(df)>0 }

if annotation_key in adata.obs:
annotation = adata.obs[annotation_key]
Expand Down Expand Up @@ -4012,8 +4012,8 @@ def dotplot(
marker_counts = adata[:,markers].to_df()
if log1p:
marker_counts = np.log1p(marker_counts)
mean_exp = pd.DataFrame({c: marker_counts.loc[df.index].mean(axis=0) for c,df in adata.obs.groupby(group_key) })
mean_pos = pd.DataFrame({c: (marker_counts.loc[df.index] != 0).mean(axis=0) for c,df in adata.obs.groupby(group_key) })
mean_exp = pd.DataFrame({c: marker_counts.loc[df.index].mean(axis=0) for c,df in adata.obs.groupby(group_key, observed=False) })
mean_pos = pd.DataFrame({c: (marker_counts.loc[df.index] != 0).mean(axis=0) for c,df in adata.obs.groupby(group_key, observed=False) })

if marks is not None:
marks = marks.reindex_like(mean_pos)
Expand Down
2 changes: 1 addition & 1 deletion tacco/preprocessing/_platform.py
Original file line number Diff line number Diff line change
Expand Up @@ -83,7 +83,7 @@ def subsample_annotation(

selection = np.concatenate([
utils.complete_choice(df.index, counts.loc[l,'new'], seed=seed)
for l,df in adata.obs.groupby(annotation_key)
for l,df in adata.obs.groupby(annotation_key,observed=False)
])

return adata[selection]
Expand Down
2 changes: 1 addition & 1 deletion tacco/preprocessing/_qc.py
Original file line number Diff line number Diff line change
Expand Up @@ -154,7 +154,7 @@ def filter(
else:
not_constant = adatas[i].X.max(axis=0)!=adatas[i].X.min(axis=0)
if issparse(not_constant):
not_constant = not_constant.A
not_constant = not_constant.toarray()
not_constant = not_constant.flatten()
good_genes = good_genes.intersection(adatas[i].var.index[not_constant])

Expand Down
6 changes: 3 additions & 3 deletions tacco/tools/_OT.py
Original file line number Diff line number Diff line change
Expand Up @@ -36,9 +36,9 @@ def get_minimal_transitions(
):
assert(aa.shape==bb.shape)
if issparse(aa):
aa = aa.A
aa = aa.toarray()
if issparse(bb):
bb = bb.A
bb = bb.toarray()

res = _get_minimal_transitions(aa,bb)

Expand Down Expand Up @@ -98,7 +98,7 @@ def _annotate_OT(
xam = get_minimal_transitions(ax.T, xm)
ax.eliminate_zeros()
ax.data = 1 / ax.data
ma = np.einsum('xam,xa,ax->ma', xam, xa.A, ax.A)
ma = np.einsum('xam,xa,ax->ma', xam, xa.toarray(), ax.toarray())

cell_type = utils.parallel_nnls(ma, om)

Expand Down
9 changes: 6 additions & 3 deletions tacco/tools/_annotate.py
Original file line number Diff line number Diff line change
Expand Up @@ -180,6 +180,9 @@ def _method(adata, reference, annotation_key, annotation_prior, verbose):

if bisection_divisor < 2:
raise ValueError('`bisection_divisor` is smaller than 2!')

if not pd.api.types.is_float_dtype(adata.X):
raise ValueError(f'Bisectioning is only possible for floating point data in `.X`, but it got {adata.X.dtype}!')

def get_bisections(bisections, bisection_divisor):
remaining = [1.0]
Expand Down Expand Up @@ -229,8 +232,8 @@ def get_bisections(bisections, bisection_divisor):
utils.row_scale(reconstruction, cell_prior)

if len(adata.obs.index) == 1 and scipy.sparse.issparse(reconstruction): # edgecase bug in scanpy
adata.X = adata.X.A
reconstruction = reconstruction.A
adata.X = adata.X.toarray()
reconstruction = reconstruction.toarray()
adata.X -= current * reconstruction

del reconstruction
Expand Down Expand Up @@ -408,7 +411,7 @@ def _method(adata, reference, annotation_key, annotation_prior, verbose):
sc.pp.pca(preped, random_state=42, n_comps=min(10,min(preped.shape[0],preped.shape[1])-1))

new_cats = []
for cat, df in reference.obs.groupby(annotation_key):
for cat, df in reference.obs.groupby(annotation_key, observed=False):
_multi_center = min(multi_center, df.shape[0])

X = preped[df.index].obsm['X_pca']
Expand Down
4 changes: 2 additions & 2 deletions tacco/tools/_co_occurrence.py
Original file line number Diff line number Diff line change
Expand Up @@ -346,7 +346,7 @@ def get_labels(adata, key_name, key,):
# split the data into samples to treat separately

if sample_key is not None:
sample_adatas = { sample: adata[df.index] for sample, df in adata.obs.groupby(sample_key) }
sample_adatas = { sample: adata[df.index] for sample, df in adata.obs.groupby(sample_key, observed=False) }
samples = list(sample_adatas.keys())
sample_adatas = list(sample_adatas.values())
sample_labels = [ labels.loc[_adata.obs.index] for _adata in sample_adatas ]
Expand Down Expand Up @@ -791,7 +791,7 @@ def annotation_coordinate(
if sample_key is None:
sample_adatas = [adata]
else:
sample_adatas = [adata[df.index] for sample,df in adata.obs.groupby(sample_key)]
sample_adatas = [adata[df.index] for sample,df in adata.obs.groupby(sample_key,observed=False)]

if distance_key is None:
if verbose > 0:
Expand Down
35 changes: 24 additions & 11 deletions tacco/tools/_enrichments.py
Original file line number Diff line number Diff line change
Expand Up @@ -50,11 +50,17 @@ def get_contributions(
position_key
The `.obsm` key or array-like of `.obs` keys with the position space
coordinates. If `None`, no position splits are performed.
NOTE: Splitting samples spatially on the fly is deprecated. Instead,
use :func:`~tacco.utils.split_spatial_samples` explicitly and supply it
as the `sample_key`.
position_split
The number of splits per spatial dimension before enrichment. Can be a
tuple with the spatial dimension as length to assign a different split
per dimension. If `None`, no position splits are performed. See also
`min_obs`.
NOTE: Splitting samples spatially on the fly is deprecated. Instead,
use :func:`~tacco.utils.split_spatial_samples` explicitly and supply it
as the `sample_key`.
min_obs
The minimum number of observations per sample: if less observations are
available, the sample is not used. This also limits the number of
Expand Down Expand Up @@ -166,7 +172,7 @@ def get_contributions(
if value_key in adata_obs.columns and (value_location is None or value_location == 'obs'):
found.append('obs')
if hasattr(adata_obs[value_key], 'cat'):
obs = pd.get_dummies(adata_obs[value_key])
obs = pd.get_dummies(adata_obs[value_key], dtype=np.uint8)
obs.columns.name = value_key
else:
obs = pd.DataFrame({value_key:adata_obs[value_key]})
Expand Down Expand Up @@ -243,7 +249,7 @@ def get_contributions(

obs.columns.name = value_key.name if hasattr(value_key,'name') and value_key.name is not None else 'value'
obs.columns = obs.columns.astype('category')

if sample_key is None:
sample_column = None
else:
Expand All @@ -252,7 +258,7 @@ def get_contributions(
if reads:
counts = get.counts(adata, counts_location=counts_location, annotation=False, copy=False)
totals = utils.get_sum(counts.X, axis=1)
obs *= totals[:,None]
obs = obs * totals[:,None].astype(np.float64, copy=False)

# prepare positions already here to follow obs filtering in the following steps
positions = None
Expand Down Expand Up @@ -280,7 +286,8 @@ def get_contributions(

# divide spatial samples spatially into subsamples: keeps all the correlation structure
if position_key is not None and position_split is not None:

import warnings
warnings.warn(f'Splitting samples spatially on the fly is deprecated. Instead, use tacco.utils.split_spatial_samples explicitly and supply it as the sample_key.', DeprecationWarning)
sample_column = utils.spatial_split(positions, position_key=positions.columns, sample_key=sample_column, position_split=position_split, min_obs=min_obs)

else: # filter out too small samples
Expand Down Expand Up @@ -308,11 +315,11 @@ def _normalize(x):
sums = x
elif isinstance(reduction, str):
if reduction == 'sum':
sums = x.sum(axis=0, skipna=True)
sums = pd.Series(np.nansum(x.to_numpy(), axis=0, dtype=np.float64), index=x.columns)
elif reduction == 'mean':
sums = x.mean(axis=0, skipna=True)
sums = pd.Series(np.nanmean(x.to_numpy(), axis=0, dtype=np.float64), index=x.columns)
elif reduction == 'median':
sums = x.median(axis=0, skipna=True)
sums = pd.Series(np.nanmedian(x.to_numpy(), axis=0), index=x.columns)
else:
raise ValueError('`reduction` "%s" is not implemented.' % reduction)
else:
Expand All @@ -327,10 +334,10 @@ def _normalize(x):
if normalization == 'sum':
# normalize total to 1 for each groupXsample
#factor = sums.to_numpy().sum(axis=-1, skipna=True)
factor = np.nansum(sums.to_numpy(), axis=-1)
factor = np.nansum(sums.to_numpy(), axis=-1, dtype=np.float64)
elif normalization == 'percent':
# normalize total to 1 for each groupXsample
factor = np.nansum(sums.to_numpy(), axis=-1) / 100
factor = np.nansum(sums.to_numpy(), axis=-1, dtype=np.float64) / 100
elif normalization in ['gmean','clr']:
# normalize by geometric mean for each groupXsample
sums_le_0 = ~(sums>0) # also includes nans
Expand All @@ -344,7 +351,7 @@ def _normalize(x):
min_sums = sums[~sums_le_0].to_numpy().min()
sums = sums.copy()
sums[sums_le_0] = min_sums * 1e-3
factor = stats.gmean(sums,axis=-1)
factor = stats.gmean(sums,axis=-1, dtype=np.float64)
elif normalization in sums.index:
factor = sums[normalization]
else:
Expand All @@ -361,7 +368,7 @@ def _normalize(x):
except Exception as e:
raise ValueError('The supplied `normalization` is neither string nor a working callable!')

compositions = obs.groupby(grouping, group_keys=False).apply(_normalize)
compositions = obs.groupby(grouping, group_keys=False, observed=False).apply(_normalize)

if len(compositions.index) == len(groups.index) and (compositions.index == groups.index).all():
compositions.index = pd.MultiIndex.from_arrays([groups,pd.Series(groups.index,index=groups.index)])
Expand Down Expand Up @@ -480,11 +487,17 @@ def enrichments(
position_key
The `.obsm` key or array-like of `.obs` keys with the position space
coordinates. If `None`, no position splits are performed.
NOTE: Splitting samples spatially on the fly is deprecated. Instead,
use :func:`~tacco.utils.split_spatial_samples` explicitly and supply it
as the `sample_key`.
position_split
The number of splits per spatial dimension before enrichment. Can be a
tuple with the spatial dimension as length to assign a different split
per dimension. If `None`, no position splits are performed. See also
`min_obs`.
NOTE: Splitting samples spatially on the fly is deprecated. Instead,
use :func:`~tacco.utils.split_spatial_samples` explicitly and supply it
as the `sample_key`.
reference_group
The particular group value to which all other groups should be
compared. This group will be compared to the rest. If `None`, all
Expand Down
2 changes: 1 addition & 1 deletion tacco/tools/_find_regions.py
Original file line number Diff line number Diff line change
Expand Up @@ -230,6 +230,6 @@ def get_closest_annotation(batch):
new_anno = pd.Series(anno.iloc[np.argmin(dists, axis=1)].to_numpy(), index=new_pos.index)
return pd.concat([anno, new_anno]).reindex_like(all_anno)

adata.obs[region_key] = batches.groupby(batches).transform(get_closest_annotation)
adata.obs[region_key] = batches.groupby(batches, observed=False).transform(get_closest_annotation)

return adata
2 changes: 1 addition & 1 deletion tacco/tools/_in_silico.py
Original file line number Diff line number Diff line change
Expand Up @@ -234,7 +234,7 @@ def mix_in_silico(
np.around(sample_X, decimals=0, out=sample_X)
if issparse(sample_X):
sample_X.eliminate_zeros()
sample_data = ad.AnnData(X=sample_X, obs=sampling, var=adata.var.copy(), dtype=sample_X.dtype)
sample_data = ad.AnnData(X=sample_X, obs=sampling, var=adata.var.copy())
if platform_log10_mean is not None:
sample_data.var['platform_effect'] = rescaling_factors

Expand Down
2 changes: 1 addition & 1 deletion tacco/tools/_orthology.py
Original file line number Diff line number Diff line change
Expand Up @@ -96,7 +96,7 @@ def _construct_gene2DB_matrix(tax_id):
merge_adata = ad.AnnData(merge_matrix, obs=pd.DataFrame(index=hom_df['Symbol'].cat.categories), var=pd.DataFrame(index=hom_df[homology_key].cat.categories))

hom_df[homology_key] = hom_df[homology_key].astype(str)
merge_adata.var[f'{tax_id} orthologs'] = hom_df.groupby(homology_key)['Symbol'].apply(lambda x: list(x))
merge_adata.var[f'{tax_id} orthologs'] = hom_df.groupby(homology_key, observed=False)['Symbol'].apply(lambda x: list(x))
for dbck in merge_adata.var[merge_adata.var[f'{tax_id} orthologs'].isna()].index:
merge_adata.var.loc[dbck,f'{tax_id} orthologs'] = []

Expand Down
4 changes: 2 additions & 2 deletions tacco/tools/_points.py
Original file line number Diff line number Diff line change
Expand Up @@ -382,7 +382,7 @@ def dense_warning():

distance = None

for anno, obs in adata.obs.groupby(annotation_column):
for anno, obs in adata.obs.groupby(annotation_column,observed=False):

_whole_row = whole_row[obs.index].to_numpy()
_distance = utils.dense_distance_matrix(positions.iloc[_whole_row].to_numpy(), **kw_args)
Expand All @@ -405,7 +405,7 @@ def dense_warning():

try:

for anno, obs in adata.obs.groupby(annotation_column):
for anno, obs in adata.obs.groupby(annotation_column,observed=False):

_whole_row = whole_row[obs.index].to_numpy()

Expand Down
Loading

0 comments on commit 457404e

Please sign in to comment.