Prepare v0.4.0: Maintenance

simonwm · Jul 15, 2024 · 457404e · 457404e
1 parent ce8478c
commit 457404e
Show file tree

Hide file tree

Showing 33 changed files with 177 additions and 116 deletions.
diff --git a/docsource/download_notebooks.py b/docsource/download_notebooks.py
@@ -18,8 +18,8 @@
 with urllib.request.urlopen(repo_url) as f:
     response = f.read().decode('utf-8')
 
-pieces = response.split('"notebooks/')[1:]
-basenames = [p.split('.ipynb"')[0] for p in pieces]
+pieces = response.split('.ipynb">')[1:]
+basenames = [p.split('.ipynb</a>')[0] for p in pieces]
 
 # download all notebooks
 

diff --git a/docsource/release_notes/release_notes_0.4.0.rst b/docsource/release_notes/release_notes_0.4.0.rst
@@ -0,0 +1,34 @@
+TACCO 0.4.0 (2024-07-16)
+========================
+
+Fixes
+-----
+
+- Fix incompatibility with new scipy version 1.14.0 `#20 <https://github.com/simonwm/tacco/issues/20>`__
+
+Breaking changes
+----------------
+
+- Adapted to work with anndata>=0.9.0 and pandas>=2.0.0. This involves changes in the handling of dtypes and (floating point) precision, which leads to "small" numerical changes in the results.
+
+- Stronger preservation of input dtype in :func:`tacco.utils.split_beads` with downstream effects on :func:`tacco.tools.split_observations` and :func:`tacco.preprocessing.normalize_platform`.
+
+- :func:`tacco.tools.enrichments`, :func:`tacco.tools.get_contributions`, :func:`tacco.tools.get_compositions`: Always work with float64 to avoid rounding errors as far as possible.
+
+Miscellaneous
+-------------
+
+- :func:`tacco.tools.annotate`: Report clear error message for using bisectioning with integer data
+
+- :func:`tacco.utils.row_scale`, :func:`tacco.utils.col_scale`: Report clear error message for rescaling integer data in place with floating point rescaling factors
+
+- :func:`tacco.tools.enrichments`, :func:`tacco.tools.get_contributions`: Deprecate on-the-fly sample split in favour of explicit use of :func:`tacco.utils.split_spatial_samples`.
+
+- Handled pandas `FutureWarning: The default of observed=False is deprecated and will be changed to True in a future version of pandas. Pass observed=False to retain current behavior or observed=True to adopt the future default and silence this warning.`
+
+- Handled anndata `FutureWarning: The dtype argument is deprecated and will be removed in late 2024.` Requires anndata>=0.9.0
+
+- Handled anndata `FutureWarning: Use anndata.concat instead of AnnData.concatenate, AnnData.concatenate is deprecated and will be removed in the future. See the tutorial for concat at: https://anndata.readthedocs.io/en/latest/concatenation.html`
+
+- Handled sklearn ``FutureWarning: The default value of `dual` will change from `True` to `'auto'` in 1.5. Set the value of `dual` explicitly to suppress the warning.``
+
diff --git a/docsource/release_notes/template_release_notes.rst b/docsource/release_notes/template_release_notes.rst
@@ -16,11 +16,17 @@ Features
 - Add :func:`tacco.tools.other_new_func` to implement feature request.
 
 Fixes
---------
+-----
 .. include resolution of bugs and very unintuitive behaviour here
 
 - Fix some issue `#9 <https://github.com/simonwm/tacco/issues/9>`__
 
+Breaking changes
+----------------
+.. include changes which alter previous behaviour here
+
+- Support for new AnnData version
+
 Documentation
 -------------
 .. include documentation updates here

diff --git a/environment.yml b/environment.yml
@@ -9,12 +9,11 @@ dependencies:
   - numba>=0.51.2
   - numpy
   - matplotlib!=3.7.0
-  - pyqt!=5.9.2
   - seaborn
   - sparse_dot_mkl>=0.7.3
   - scanpy>=1.7.0
   - statsmodels
-  - anndata
+  - anndata>=0.9.0
   - pandas>=1.1.0
   - scipy>=1.6.0
   - mkl

diff --git a/pyproject.toml b/pyproject.toml
@@ -32,7 +32,6 @@ dynamic = ["version"]
 
 dependencies = [
   "requests",
-  "importlib; python_version == '2.6'",
   "joblib",
   "numba>=0.51.2",
   "numpy",
@@ -41,7 +40,7 @@ dependencies = [
   "sparse_dot_mkl>=0.7.3",
   "scanpy>=1.7.0",
   "statsmodels",
-  "anndata",
+  "anndata>=0.9.0",
   "pandas>=1.1.0",
   "scipy>=1.6.0",
   "mkl",

diff --git a/tacco/plots/_plots.py b/tacco/plots/_plots.py
@@ -1111,7 +1111,7 @@ def scatter(
             raise ValueError(f'The `group_key` {group_key!r} is not `None`, but `adata` is not a single `AnnData` instance!')
         if group_key not in adata.obs:
             raise ValueError(f'The `group_key` {group_key!r} is not available in `adata.obs`!')
-        adata = { c: adata[df.index] for c, df in adata.obs.groupby(group_key) if len(df) > 0 }
+        adata = { c: adata[df.index] for c, df in adata.obs.groupby(group_key, observed=False) if len(df) > 0 }
 
     typing_data, adatas, methods, types, colors, coords = _validate_scatter_args(adata, position_key, keys, colors, show_only, method_labels=method_labels, counts_location=counts_location, compositional=compositional)
     n_solutions, n_samples, n_types = len(typing_data), len(adatas), len(types)
@@ -3608,7 +3608,7 @@ def annotated_heatmap(
                     raise ValueError(f'`var_key` {var_key!r} is not a column of `adata.var`!')
                 if not hasattr(adata.var[var_key], 'cat'):
                     print(f'WARNING: `var_key` {var_key!r} is not a categorical column of `adata.var`! Treating it as a categorical column...')
-                marker = {c: df.index for c,df in adata.var.groupby(var_key)}
+                marker = {c: df.index for c,df in adata.var.groupby(var_key, observed=False)}
 
         all_marker = [c for l,m in marker.items() for c in m]
         # reorder genes to represent the annotation
@@ -3631,7 +3631,7 @@ def annotated_heatmap(
         if not hasattr(adata.obs[obs_key], 'cat'):
             print(f'WARNING: `obs_key` {obs_key!r} is not a categorical column of `adata.obs`! Treating it as a categorical column...')
 
-        cells = {c: df.index for c,df in adata.obs.groupby(obs_key)}
+        cells = {c: df.index for c,df in adata.obs.groupby(obs_key, observed=False)}
         all_cells = [c for l,m in cells.items() for c in m]
         # reorder cells to represent the annotation
         adata = adata[all_cells]
@@ -3846,7 +3846,7 @@ def annotation_coordinate(
     if group_key is None:
         group_adatas = {'':adata}
     else:
-        group_adatas = {group:adata[df.index] for group,df in adata.obs.groupby(group_key) if len(df)>0 }
+        group_adatas = {group:adata[df.index] for group,df in adata.obs.groupby(group_key, observed=False) if len(df)>0 }
 
     if annotation_key in adata.obs:
         annotation = adata.obs[annotation_key]
@@ -4012,8 +4012,8 @@ def dotplot(
     marker_counts = adata[:,markers].to_df()
     if log1p:
         marker_counts = np.log1p(marker_counts)
-    mean_exp = pd.DataFrame({c: marker_counts.loc[df.index].mean(axis=0) for c,df in adata.obs.groupby(group_key) })
-    mean_pos = pd.DataFrame({c: (marker_counts.loc[df.index] != 0).mean(axis=0) for c,df in adata.obs.groupby(group_key) })
+    mean_exp = pd.DataFrame({c: marker_counts.loc[df.index].mean(axis=0) for c,df in adata.obs.groupby(group_key, observed=False) })
+    mean_pos = pd.DataFrame({c: (marker_counts.loc[df.index] != 0).mean(axis=0) for c,df in adata.obs.groupby(group_key, observed=False) })
 
     if marks is not None:
         marks = marks.reindex_like(mean_pos)

diff --git a/tacco/preprocessing/_platform.py b/tacco/preprocessing/_platform.py
@@ -83,7 +83,7 @@ def subsample_annotation(
 
     selection = np.concatenate([
         utils.complete_choice(df.index, counts.loc[l,'new'], seed=seed)
-        for l,df in adata.obs.groupby(annotation_key)
+        for l,df in adata.obs.groupby(annotation_key,observed=False)
     ])
 
     return adata[selection]

diff --git a/tacco/preprocessing/_qc.py b/tacco/preprocessing/_qc.py
@@ -154,7 +154,7 @@ def filter(
                 else:
                     not_constant = adatas[i].X.max(axis=0)!=adatas[i].X.min(axis=0)
                     if issparse(not_constant):
-                        not_constant = not_constant.A
+                        not_constant = not_constant.toarray()
                     not_constant = not_constant.flatten()
                     good_genes = good_genes.intersection(adatas[i].var.index[not_constant])
 

diff --git a/tacco/tools/_OT.py b/tacco/tools/_OT.py
@@ -36,9 +36,9 @@ def get_minimal_transitions(
 ):
     assert(aa.shape==bb.shape)
     if issparse(aa):
-        aa = aa.A
+        aa = aa.toarray()
     if issparse(bb):
-        bb = bb.A
+        bb = bb.toarray()
 
     res = _get_minimal_transitions(aa,bb)
 
@@ -98,7 +98,7 @@ def _annotate_OT(
         xam = get_minimal_transitions(ax.T, xm)
         ax.eliminate_zeros()
         ax.data = 1 / ax.data
-        ma = np.einsum('xam,xa,ax->ma', xam, xa.A, ax.A)
+        ma = np.einsum('xam,xa,ax->ma', xam, xa.toarray(), ax.toarray())
 
         cell_type = utils.parallel_nnls(ma, om)
 

diff --git a/tacco/tools/_annotate.py b/tacco/tools/_annotate.py
@@ -180,6 +180,9 @@ def _method(adata, reference, annotation_key, annotation_prior, verbose):
 
         if bisection_divisor < 2:
             raise ValueError('`bisection_divisor` is smaller than 2!')
+
+        if not pd.api.types.is_float_dtype(adata.X):
+            raise ValueError(f'Bisectioning is only possible for floating point data in `.X`, but it got {adata.X.dtype}!')
 
         def get_bisections(bisections, bisection_divisor):
             remaining = [1.0]
@@ -229,8 +232,8 @@ def get_bisections(bisections, bisection_divisor):
                 utils.row_scale(reconstruction, cell_prior)
 
                 if len(adata.obs.index) == 1 and scipy.sparse.issparse(reconstruction): # edgecase bug in scanpy
-                    adata.X = adata.X.A
-                    reconstruction = reconstruction.A
+                    adata.X = adata.X.toarray()
+                    reconstruction = reconstruction.toarray()
                 adata.X -= current * reconstruction
 
                 del reconstruction
@@ -408,7 +411,7 @@ def _method(adata, reference, annotation_key, annotation_prior, verbose):
         sc.pp.pca(preped, random_state=42, n_comps=min(10,min(preped.shape[0],preped.shape[1])-1))
 
         new_cats = []
-        for cat, df in reference.obs.groupby(annotation_key):
+        for cat, df in reference.obs.groupby(annotation_key, observed=False):
             _multi_center = min(multi_center, df.shape[0])
 
             X = preped[df.index].obsm['X_pca']

diff --git a/tacco/tools/_co_occurrence.py b/tacco/tools/_co_occurrence.py
@@ -346,7 +346,7 @@ def get_labels(adata, key_name, key,):
     # split the data into samples to treat separately
 
     if sample_key is not None:
-        sample_adatas = { sample: adata[df.index] for sample, df in adata.obs.groupby(sample_key) }
+        sample_adatas = { sample: adata[df.index] for sample, df in adata.obs.groupby(sample_key, observed=False) }
         samples = list(sample_adatas.keys())
         sample_adatas = list(sample_adatas.values())
         sample_labels = [ labels.loc[_adata.obs.index] for _adata in sample_adatas ]
@@ -791,7 +791,7 @@ def annotation_coordinate(
     if sample_key is None:
         sample_adatas = [adata]
     else:
-        sample_adatas = [adata[df.index] for sample,df in adata.obs.groupby(sample_key)]
+        sample_adatas = [adata[df.index] for sample,df in adata.obs.groupby(sample_key,observed=False)]
 
     if distance_key is None:
         if verbose > 0:

diff --git a/tacco/tools/_enrichments.py b/tacco/tools/_enrichments.py
@@ -50,11 +50,17 @@ def get_contributions(
     position_key
         The `.obsm` key or array-like of `.obs` keys with the position space
         coordinates. If `None`, no position splits are performed.
+        NOTE: Splitting samples spatially on the fly is deprecated. Instead,
+        use :func:`~tacco.utils.split_spatial_samples` explicitly and supply it
+        as the `sample_key`.
     position_split
         The number of splits per spatial dimension before enrichment. Can be a
         tuple with the spatial dimension as length to assign a different split
         per dimension. If `None`, no position splits are performed. See also
         `min_obs`.
+        NOTE: Splitting samples spatially on the fly is deprecated. Instead,
+        use :func:`~tacco.utils.split_spatial_samples` explicitly and supply it
+        as the `sample_key`.
     min_obs
         The minimum number of observations per sample: if less observations are
         available, the sample is not used. This also limits the number of
@@ -166,7 +172,7 @@ def get_contributions(
         if value_key in adata_obs.columns and (value_location is None or value_location == 'obs'):
             found.append('obs')
             if hasattr(adata_obs[value_key], 'cat'):
-                obs = pd.get_dummies(adata_obs[value_key])
+                obs = pd.get_dummies(adata_obs[value_key], dtype=np.uint8)
                 obs.columns.name = value_key
             else:
                 obs = pd.DataFrame({value_key:adata_obs[value_key]})
@@ -243,7 +249,7 @@ def get_contributions(
 
         obs.columns.name = value_key.name if hasattr(value_key,'name') and value_key.name is not None else 'value'
         obs.columns = obs.columns.astype('category')
-    
+
     if sample_key is None:
         sample_column = None
     else:
@@ -252,7 +258,7 @@ def get_contributions(
     if reads:
         counts = get.counts(adata, counts_location=counts_location, annotation=False, copy=False)
         totals = utils.get_sum(counts.X, axis=1)
-        obs *= totals[:,None]
+        obs = obs * totals[:,None].astype(np.float64, copy=False)
 
     # prepare positions aleady here to follow obs filtering in the following steps
     positions = None
@@ -280,7 +286,8 @@ def get_contributions(
 
     # divide spatial samples spatially into subsamples: keeps all the correlation structure
     if position_key is not None and position_split is not None:
-
+        import warnings
+        warnings.warn(f'Splitting samples spatially on the fly is deprecated. Instead, use tacco.utils.split_spatial_samples explicitly and supply it as the sample_key.', DeprecationWarning)
         sample_column = utils.spatial_split(positions, position_key=positions.columns, sample_key=sample_column, position_split=position_split, min_obs=min_obs)
 
     else: # filter out too small samples
@@ -308,11 +315,11 @@ def _normalize(x):
             sums = x
         elif isinstance(reduction, str):
             if reduction == 'sum':
-                sums = x.sum(axis=0, skipna=True)
+                sums = pd.Series(np.nansum(x.to_numpy(), axis=0, dtype=np.float64), index=x.columns)
             elif reduction == 'mean':
-                sums = x.mean(axis=0, skipna=True)
+                sums = pd.Series(np.nanmean(x.to_numpy(), axis=0, dtype=np.float64), index=x.columns)
             elif reduction == 'median':
-                sums = x.median(axis=0, skipna=True)
+                sums = pd.Series(np.nanmedian(x.to_numpy(), axis=0), index=x.columns)
             else:
                 raise ValueError('`reduction` "%s" is not implemented.' % reduction)
         else:
@@ -327,10 +334,10 @@ def _normalize(x):
             if normalization == 'sum':
                 # normalize total to 1 for each groupXsample
                 #factor = sums.to_numpy().sum(axis=-1, skipna=True)
-                factor = np.nansum(sums.to_numpy(), axis=-1)
+                factor = np.nansum(sums.to_numpy(), axis=-1, dtype=np.float64)
             elif normalization == 'percent':
                 # normalize total to 1 for each groupXsample
-                factor = np.nansum(sums.to_numpy(), axis=-1) / 100
+                factor = np.nansum(sums.to_numpy(), axis=-1, dtype=np.float64) / 100
             elif normalization in ['gmean','clr']:
                 # normalize by geometric mean for each groupXsample
                 sums_le_0 = ~(sums>0) # also includes nans
@@ -344,7 +351,7 @@ def _normalize(x):
                         min_sums = sums[~sums_le_0].to_numpy().min()
                         sums = sums.copy()
                         sums[sums_le_0] = min_sums * 1e-3
-                    factor = stats.gmean(sums,axis=-1)
+                    factor = stats.gmean(sums,axis=-1, dtype=np.float64)
             elif normalization in sums.index:
                 factor = sums[normalization]
             else:
@@ -361,7 +368,7 @@ def _normalize(x):
             except Exception as e:
                 raise ValueError('The supplied `normalization` is neither string nor a working callable!')
 
-    compositions = obs.groupby(grouping, group_keys=False).apply(_normalize)
+    compositions = obs.groupby(grouping, group_keys=False, observed=False).apply(_normalize)
 
     if len(compositions.index) == len(groups.index) and (compositions.index == groups.index).all():
         compositions.index = pd.MultiIndex.from_arrays([groups,pd.Series(groups.index,index=groups.index)])
@@ -480,11 +487,17 @@ def enrichments(
     position_key
         The `.obsm` key or array-like of `.obs` keys with the position space
         coordinates. If `None`, no position splits are performed.
+        NOTE: Splitting samples spatially on the fly is deprecated. Instead,
+        use :func:`~tacco.utils.split_spatial_samples` explicitly and supply it
+        as the `sample_key`.
     position_split
         The number of splits per spatial dimension before enrichment. Can be a
         tuple with the spatial dimension as length to assign a different split
         per dimension. If `None`, no position splits are performed. See also
         `min_obs`.
+        NOTE: Splitting samples spatially on the fly is deprecated. Instead,
+        use :func:`~tacco.utils.split_spatial_samples` explicitly and supply it
+        as the `sample_key`.
     reference_group
         The particular group value to which all other groups should be
         compared. This group will be compared to the rest. If `None`, all

diff --git a/tacco/tools/_find_regions.py b/tacco/tools/_find_regions.py
@@ -230,6 +230,6 @@ def get_closest_annotation(batch):
         new_anno = pd.Series(anno.iloc[np.argmin(dists, axis=1)].to_numpy(), index=new_pos.index)
         return pd.concat([anno, new_anno]).reindex_like(all_anno)
 
-    adata.obs[region_key] = batches.groupby(batches).transform(get_closest_annotation)
+    adata.obs[region_key] = batches.groupby(batches, observed=False).transform(get_closest_annotation)
 
     return adata
diff --git a/tacco/tools/_in_silico.py b/tacco/tools/_in_silico.py
@@ -234,7 +234,7 @@ def mix_in_silico(
                 np.around(sample_X, decimals=0, out=sample_X)
         if issparse(sample_X):
             sample_X.eliminate_zeros()
-        sample_data = ad.AnnData(X=sample_X, obs=sampling, var=adata.var.copy(), dtype=sample_X.dtype)
+        sample_data = ad.AnnData(X=sample_X, obs=sampling, var=adata.var.copy())
         if platform_log10_mean is not None:
             sample_data.var['platform_effect'] = rescaling_factors
 

diff --git a/tacco/tools/_orthology.py b/tacco/tools/_orthology.py
@@ -96,7 +96,7 @@ def _construct_gene2DB_matrix(tax_id):
     merge_adata = ad.AnnData(merge_matrix, obs=pd.DataFrame(index=hom_df['Symbol'].cat.categories), var=pd.DataFrame(index=hom_df[homology_key].cat.categories))
 
     hom_df[homology_key] = hom_df[homology_key].astype(str)
-    merge_adata.var[f'{tax_id} orthologs'] = hom_df.groupby(homology_key)['Symbol'].apply(lambda x: list(x))
+    merge_adata.var[f'{tax_id} orthologs'] = hom_df.groupby(homology_key, observed=False)['Symbol'].apply(lambda x: list(x))
     for dbck in merge_adata.var[merge_adata.var[f'{tax_id} orthologs'].isna()].index:
         merge_adata.var.loc[dbck,f'{tax_id} orthologs'] = []
 

diff --git a/tacco/tools/_points.py b/tacco/tools/_points.py
@@ -382,7 +382,7 @@ def dense_warning():
 
             distance = None
 
-            for anno, obs in adata.obs.groupby(annotation_column):
+            for anno, obs in adata.obs.groupby(annotation_column,observed=False):
 
                 _whole_row = whole_row[obs.index].to_numpy()
                 _distance = utils.dense_distance_matrix(positions.iloc[_whole_row].to_numpy(), **kw_args)
@@ -405,7 +405,7 @@ def dense_warning():
 
             try:
 
-                for anno, obs in adata.obs.groupby(annotation_column):
+                for anno, obs in adata.obs.groupby(annotation_column,observed=False):
 
                     _whole_row = whole_row[obs.index].to_numpy()