diff --git a/simba/tools/_pbg.py b/simba/tools/_pbg.py index 7d0b307..417c5fc 100755 --- a/simba/tools/_pbg.py +++ b/simba/tools/_pbg.py @@ -11,14 +11,8 @@ from pathlib import Path import attr -from torchbiggraph.config import ( - add_to_sys_path, - ConfigFileLoader -) -from torchbiggraph.converters.importers import ( - convert_input_data, - TSVEdgelistReader -) +from torchbiggraph.config import add_to_sys_path, ConfigFileLoader +from torchbiggraph.converters.importers import convert_input_data, TSVEdgelistReader from torchbiggraph.train import train from torchbiggraph.util import ( set_logging_verbosity, @@ -28,32 +22,35 @@ from ._utils import _randomize_matrix from .._settings import settings - -def gen_graph(list_CP=None, - list_PM=None, - list_PK=None, - list_CG=None, - list_CC=None, - list_adata=None, - prefix_C='C', - prefix_P='P', - prefix_M='M', - prefix_K='K', - prefix_G='G', - prefix='E', - layer='simba', - copy=False, - dirname='graph0', - add_edge_weights=None, - use_highly_variable=True, - use_top_pcs=True, - use_top_pcs_CP=None, - use_top_pcs_PM=None, - use_top_pcs_PK=None, - get_marker_significance=False, - fold_null_nodes = 1.0, - ): + +def gen_graph( + list_CP=None, + list_PM=None, + list_PK=None, + list_CG=None, + list_CC=None, + list_KA=None, + list_adata=None, + prefix_C="C", + prefix_P="P", + prefix_M="M", + prefix_K="K", + prefix_A="A", + prefix_G="G", + prefix="E", + layer="simba", + copy=False, + dirname="graph0", + add_edge_weights=None, + use_highly_variable=True, + use_top_pcs=True, + use_top_pcs_CP=None, + use_top_pcs_PM=None, + use_top_pcs_PK=None, + get_marker_significance=False, + fold_null_nodes=1.0, +): """Generate graph for PBG training. Observations and variables of each Anndata object will be encoded @@ -82,6 +79,8 @@ def gen_graph(list_CP=None, A list of anndata objects that store relation between Peaks and Motifs list_PK: `list`, optional (default: None) A list of anndata objects that store relation between Peaks and Kmers + list_KA: `list`, optional (default: None) + A list of anndata objects that store relation between Kmers and Annotations list_CG: `list`, optional (default: None) A list of anndata objects that store RNA-seq data (Cells by Genes) list_CC: `list`, optional (default: None) @@ -148,45 +147,50 @@ def gen_graph(list_CP=None, Statistics of input graph """ - if sum(list(map(lambda x: x is None, - [list_CP, - list_PM, - list_PK, - list_CG, - list_CC, - list_adata]))) == 6: - return 'No graph is generated' - if get_marker_significance: - gen_graph(list_CP=list_CP, - list_PM=list_PM, - list_PK=list_PK, - list_CG=list_CG, - list_CC=list_CC, - prefix_C=prefix_C, - prefix_P=prefix_P, - prefix_M=prefix_M, - prefix_K=prefix_K, - prefix_G=prefix_G, - layer=layer, - copy=copy, - dirname=dirname, - add_edge_weights=add_edge_weights, - use_highly_variable=use_highly_variable, - use_top_pcs=use_top_pcs, - use_top_pcs_CP=use_top_pcs_CP, - use_top_pcs_PM=use_top_pcs_PM, - use_top_pcs_PK=use_top_pcs_PK, - get_marker_significance=False, - ) + if ( + sum( + list( + map( + lambda x: x is None, + [list_CP, list_PM, list_PK, list_KA, list_CG, list_CC, list_adata], + ) + ) + ) + == 7 + ): + return "No graph is generated" + if get_marker_significance: + gen_graph( + list_CP=list_CP, + list_PM=list_PM, + list_PK=list_PK, + list_KA=list_KA, + list_CG=list_CG, + list_CC=list_CC, + prefix_C=prefix_C, + prefix_P=prefix_P, + prefix_M=prefix_M, + prefix_K=prefix_K, + prefix_A=prefix_A, + prefix_G=prefix_G, + layer=layer, + copy=copy, + dirname=dirname, + add_edge_weights=add_edge_weights, + 
use_highly_variable=use_highly_variable, + use_top_pcs=use_top_pcs, + use_top_pcs_CP=use_top_pcs_CP, + use_top_pcs_PM=use_top_pcs_PM, + use_top_pcs_PK=use_top_pcs_PK, + get_marker_significance=False, + ) dirname_orig = dirname dirname += "_with_sig" - filepath = os.path.join(settings.workdir, 'pbg', dirname) - settings.pbg_params['entity_path'] = \ - os.path.join(filepath, "input/entity") - settings.pbg_params['edge_paths'] = \ - [os.path.join(filepath, "input/edge"), ] - settings.pbg_params['entity_path'] = \ - os.path.join(filepath, "input/entity") + filepath = os.path.join(settings.workdir, "pbg", dirname) + settings.pbg_params["entity_path"] = os.path.join(filepath, "input/entity") + settings.pbg_params["edge_paths"] = [ + os.path.join(filepath, "input/edge"), + ] + settings.pbg_params["entity_path"] = os.path.join(filepath, "input/entity") if not os.path.exists(filepath): os.makedirs(filepath) if add_edge_weights is None: @@ -194,52 +198,63 @@ def gen_graph(list_CP=None, add_edge_weights = False else: add_edge_weights = True - def _get_df_edges(adj_mat, df_source, df_dest, adata, relation_id, include_weight = True, weight_scale = 1): + + def _get_df_edges( + adj_mat, + df_source, + df_dest, + adata, + relation_id, + include_weight=True, + weight_scale=1, + ): col_names = ["source", "relation", "destination"] if include_weight: col_names.append("weight") df_edges_x = pd.DataFrame(columns=col_names) - df_edges_x['source'] = df_source.loc[ - adata.obs_names[adj_mat.nonzero()[0]], - 'alias'].values - df_edges_x['relation'] = relation_id - df_edges_x['destination'] = df_dest.loc[ - adata.var_names[adj_mat.nonzero()[1]], - 'alias'].values + df_edges_x["source"] = df_source.loc[ + adata.obs_names[adj_mat.nonzero()[0]], "alias" + ].values + df_edges_x["relation"] = relation_id + df_edges_x["destination"] = df_dest.loc[ + adata.var_names[adj_mat.nonzero()[1]], "alias" + ].values if include_weight: - df_edges_x['weight'] = weight_scale - return(df_edges_x) + df_edges_x["weight"] = weight_scale + return df_edges_x + if list_adata is not None: id_ent = pd.Index([]) # ids of all entities dict_ent_type = dict() ctr_ent = 0 # counter for entity types - entity_alias = pd.DataFrame(columns=['alias']) + entity_alias = pd.DataFrame(columns=["alias"]) dict_graph_stats = dict() if add_edge_weights: col_names = ["source", "relation", "destination", "weight"] else: col_names = ["source", "relation", "destination"] df_edges = pd.DataFrame(columns=col_names) - settings.pbg_params['relations'] = [] + settings.pbg_params["relations"] = [] if get_marker_significance and list_adata: - raise NotImplementedError("Marker gene significance is not yet implemented for graph generation from AnnData list.") + raise NotImplementedError( + "Marker gene significance is not yet implemented for graph generation from AnnData list." 
+ ) for ctr_rel, adata_ori in enumerate(list_adata): obs_names = adata_ori.obs_names var_names = adata_ori.var_names if len(set(obs_names).intersection(id_ent)) == 0: - prefix_i = f'{prefix}{ctr_ent}' + prefix_i = f"{prefix}{ctr_ent}" id_ent = id_ent.union(adata_ori.obs_names) entity_alias_obs = pd.DataFrame( index=obs_names, - columns=['alias'], - data=[f'{prefix_i}.{x}' - for x in range(len(obs_names))]) - settings.pbg_params['entities'][ - prefix_i] = {'num_partitions': 1} + columns=["alias"], + data=[f"{prefix_i}.{x}" for x in range(len(obs_names))], + ) + settings.pbg_params["entities"][prefix_i] = {"num_partitions": 1} dict_ent_type[prefix_i] = obs_names entity_alias = pd.concat( - [entity_alias, entity_alias_obs], - ignore_index=False) + [entity_alias, entity_alias_obs], ignore_index=False + ) obs_type = prefix_i ctr_ent += 1 else: @@ -249,30 +264,32 @@ def _get_df_edges(adj_mat, df_source, df_dest, adata, relation_id, include_weigh break if not set(obs_names).issubset(id_ent): id_ent = id_ent.union(adata_ori.obs_names) - adt_obs_names = list(set(obs_names)-set(item)) + adt_obs_names = list(set(obs_names) - set(item)) entity_alias_obs = pd.DataFrame( index=adt_obs_names, - columns=['alias'], - data=[f'{prefix_i}.{len(item)+x}' - for x in range(len(adt_obs_names))]) + columns=["alias"], + data=[ + f"{prefix_i}.{len(item)+x}" + for x in range(len(adt_obs_names)) + ], + ) dict_ent_type[obs_type] = obs_names.union(adt_obs_names) entity_alias = pd.concat( - [entity_alias, entity_alias_obs], - ignore_index=False) + [entity_alias, entity_alias_obs], ignore_index=False + ) if len(set(var_names).intersection(id_ent)) == 0: - prefix_i = f'{prefix}{ctr_ent}' + prefix_i = f"{prefix}{ctr_ent}" id_ent = id_ent.union(adata_ori.var_names) entity_alias_var = pd.DataFrame( index=var_names, - columns=['alias'], - data=[f'{prefix_i}.{x}' - for x in range(len(var_names))]) - settings.pbg_params['entities'][ - prefix_i] = {'num_partitions': 1} + columns=["alias"], + data=[f"{prefix_i}.{x}" for x in range(len(var_names))], + ) + settings.pbg_params["entities"][prefix_i] = {"num_partitions": 1} dict_ent_type[prefix_i] = var_names entity_alias = pd.concat( - [entity_alias, entity_alias_var], - ignore_index=False) + [entity_alias, entity_alias_var], ignore_index=False + ) var_type = prefix_i ctr_ent += 1 else: @@ -282,63 +299,71 @@ def _get_df_edges(adj_mat, df_source, df_dest, adata, relation_id, include_weigh break if not set(var_names).issubset(id_ent): id_ent = id_ent.union(adata_ori.var_names) - adt_var_names = list(set(var_names)-set(item)) + adt_var_names = list(set(var_names) - set(item)) entity_alias_var = pd.DataFrame( index=adt_var_names, - columns=['alias'], - data=[f'{prefix_i}.{len(item)+x}' - for x in range(len(adt_var_names))]) + columns=["alias"], + data=[ + f"{prefix_i}.{len(item)+x}" + for x in range(len(adt_var_names)) + ], + ) dict_ent_type[var_type] = var_names.union(adt_var_names) entity_alias = pd.concat( - [entity_alias, entity_alias_var], - ignore_index=False) + [entity_alias, entity_alias_var], ignore_index=False + ) # generate edges if layer is not None: if layer in adata_ori.layers.keys(): arr_simba = adata_ori.layers[layer] else: - print(f'`{layer}` does not exist in adata {ctr_rel} ' - 'in `list_adata`.`.X` is being used instead.') + print( + f"`{layer}` does not exist in adata {ctr_rel} " + "in `list_adata`.`.X` is being used instead." 
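A note on the `layer` fallback just above: `gen_graph` expects each AnnData to carry a discretized matrix in `.layers['simba']` and only falls back to `.X` with a warning. A minimal sketch of how that layer is typically prepared, assuming the standard SIMBA preprocessing calls (file path and variable name are illustrative, not part of this diff):

import simba as si

adata_CG = si.read_h5ad("adata_CG.h5ad")  # cells x genes, illustrative path
si.pp.filter_genes(adata_CG, min_n_cells=3)
si.pp.normalize(adata_CG, method="lib_size")  # library-size normalization
si.pp.log_transform(adata_CG)
si.tl.discretize(adata_CG, n_bins=5)  # writes adata_CG.layers['simba']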
+ ) arr_simba = adata_ori.X else: arr_simba = adata_ori.X _row, _col = arr_simba.nonzero() df_edges_x = pd.DataFrame(columns=col_names) - df_edges_x['source'] = entity_alias.loc[ - obs_names[_row], 'alias'].values - df_edges_x['relation'] = f'r{ctr_rel}' - df_edges_x['destination'] = entity_alias.loc[ - var_names[_col], 'alias'].values + df_edges_x["source"] = entity_alias.loc[obs_names[_row], "alias"].values + df_edges_x["relation"] = f"r{ctr_rel}" + df_edges_x["destination"] = entity_alias.loc[ + var_names[_col], "alias" + ].values if add_edge_weights: - df_edges_x['weight'] = \ - arr_simba[_row, _col].A.flatten() - settings.pbg_params['relations'].append({ - 'name': f'r{ctr_rel}', - 'lhs': f'{obs_type}', - 'rhs': f'{var_type}', - 'operator': 'none', - 'weight': 1.0 - }) - dict_graph_stats[f'relation{ctr_rel}'] = { - 'source': obs_type, - 'destination': var_type, - 'n_edges': df_edges_x.shape[0]} + df_edges_x["weight"] = arr_simba[_row, _col].A.flatten() + settings.pbg_params["relations"].append( + { + "name": f"r{ctr_rel}", + "lhs": f"{obs_type}", + "rhs": f"{var_type}", + "operator": "none", + "weight": 1.0, + } + ) + dict_graph_stats[f"relation{ctr_rel}"] = { + "source": obs_type, + "destination": var_type, + "n_edges": df_edges_x.shape[0], + } print( - f'relation{ctr_rel}: ' - f'source: {obs_type}, ' - f'destination: {var_type}\n' - f'#edges: {df_edges_x.shape[0]}') + f"relation{ctr_rel}: " + f"source: {obs_type}, " + f"destination: {var_type}\n" + f"#edges: {df_edges_x.shape[0]}" + ) - df_edges = pd.concat( - [df_edges, df_edges_x], - ignore_index=True) - adata_ori.obs['pbg_id'] = "" - adata_ori.var['pbg_id'] = "" - adata_ori.obs.loc[obs_names, 'pbg_id'] = \ - entity_alias.loc[obs_names, 'alias'].copy() - adata_ori.var.loc[var_names, 'pbg_id'] = \ - entity_alias.loc[var_names, 'alias'].copy() + df_edges = pd.concat([df_edges, df_edges_x], ignore_index=True) + adata_ori.obs["pbg_id"] = "" + adata_ori.var["pbg_id"] = "" + adata_ori.obs.loc[obs_names, "pbg_id"] = entity_alias.loc[ + obs_names, "alias" + ].copy() + adata_ori.var.loc[var_names, "pbg_id"] = entity_alias.loc[ + var_names, "alias" + ].copy() else: # Collect the indices of entities @@ -346,6 +371,7 @@ def _get_df_edges(adj_mat, df_source, df_dest, adata, relation_id, include_weigh ids_genes = pd.Index([]) ids_peaks = pd.Index([]) ids_kmers = pd.Index([]) + ids_annots = pd.Index([]) ids_motifs = pd.Index([]) if list_CP is not None: @@ -355,7 +381,7 @@ def _get_df_edges(adj_mat, df_source, df_dest, adata, relation_id, include_weigh else: flag_top_pcs = use_top_pcs_CP if flag_top_pcs: - adata = adata_ori[:, adata_ori.var['top_pcs']].copy() + adata = adata_ori[:, adata_ori.var["top_pcs"]].copy() else: adata = adata_ori.copy() ids_cells_i = adata.obs.index @@ -372,13 +398,11 @@ def _get_df_edges(adj_mat, df_source, df_dest, adata, relation_id, include_weigh if not flag_included: # create a new set of entities # when not all indices are included - dict_cells[ - f'{prefix_C}{len(dict_cells)+1}'] = \ - ids_cells_i + dict_cells[f"{prefix_C}{len(dict_cells)+1}"] = ids_cells_i ids_peaks = ids_peaks.union(adata.var.index) if get_marker_significance: - n_npeaks = min(int(len(ids_peaks)*fold_null_nodes), len(ids_peaks)) - ids_npeaks = pd.Index([f'n{prefix_P}.{x}' for x in range(n_npeaks)]) + n_npeaks = min(int(len(ids_peaks) * fold_null_nodes), len(ids_peaks)) + ids_npeaks = pd.Index([f"n{prefix_P}.{x}" for x in range(n_npeaks)]) if list_PM is not None: for adata_ori in list_PM: if use_top_pcs_PM is None: @@ -386,14 +410,14 @@ def 
_get_df_edges(adj_mat, df_source, df_dest, adata, relation_id, include_weigh else: flag_top_pcs = use_top_pcs_PM if flag_top_pcs: - adata = adata_ori[:, adata_ori.var['top_pcs']].copy() + adata = adata_ori[:, adata_ori.var["top_pcs"]].copy() else: adata = adata_ori.copy() ids_peaks = ids_peaks.union(adata.obs.index) ids_motifs = ids_motifs.union(adata.var.index) if get_marker_significance: - n_nmotifs = int(len(ids_motifs)*fold_null_nodes) - ids_nmotifs = pd.Index([f'n{prefix_M}.{x}' for x in range(n_nmotifs)]) + n_nmotifs = int(len(ids_motifs) * fold_null_nodes) + ids_nmotifs = pd.Index([f"n{prefix_M}.{x}" for x in range(n_nmotifs)]) if list_PK is not None: for adata_ori in list_PK: if use_top_pcs_PK is None: @@ -401,19 +425,26 @@ def _get_df_edges(adj_mat, df_source, df_dest, adata, relation_id, include_weigh else: flag_top_pcs = use_top_pcs_PK if flag_top_pcs: - adata = adata_ori[:, adata_ori.var['top_pcs']].copy() + adata = adata_ori[:, adata_ori.var["top_pcs"]].copy() else: adata = adata_ori.copy() ids_peaks = ids_peaks.union(adata.obs.index) ids_kmers = ids_kmers.union(adata.var.index) if get_marker_significance: - n_nkmers = int(len(ids_kmers)*fold_null_nodes) - ids_nkmers = pd.Index([f'n{prefix_K}.{x}' for x in range(n_nkmers)]) + n_nkmers = int(len(ids_kmers) * fold_null_nodes) + ids_nkmers = pd.Index([f"n{prefix_K}.{x}" for x in range(n_nkmers)]) + if list_KA is not None: + for adata_ori in list_KA: + adata = adata_ori.copy() + ids_kmers = ids_kmers.union(adata.obs.index) + ids_annots = ids_annots.union(adata.var.index) + if get_marker_significance: + n_nannots = int(len(ids_annots) * fold_null_nodes) + ids_nannots = pd.Index([f"n{prefix_A}.{x}" for x in range(n_nannots)]) if list_CG is not None: for adata_ori in list_CG: if use_highly_variable: - adata = adata_ori[ - :, adata_ori.var['highly_variable']].copy() + adata = adata_ori[:, adata_ori.var["highly_variable"]].copy() else: adata = adata_ori.copy() ids_cells_i = adata.obs.index @@ -430,93 +461,94 @@ def _get_df_edges(adj_mat, df_source, df_dest, adata, relation_id, include_weigh if not flag_included: # create a new set of entities # when not all indices are included - dict_cells[ - f'{prefix_C}{len(dict_cells)+1}'] = \ - ids_cells_i + dict_cells[f"{prefix_C}{len(dict_cells)+1}"] = ids_cells_i ids_genes = ids_genes.union(adata.var.index) if get_marker_significance: - n_ngenes = int(len(ids_genes)*fold_null_nodes) - ids_ngenes = pd.Index([f'n{prefix_G}.{x}' for x in range(n_ngenes)]) + n_ngenes = int(len(ids_genes) * fold_null_nodes) + ids_ngenes = pd.Index([f"n{prefix_G}.{x}" for x in range(n_ngenes)]) - entity_alias = pd.DataFrame(columns=['alias']) + entity_alias = pd.DataFrame(columns=["alias"]) dict_df_cells = dict() # unique cell dataframes for k in dict_cells.keys(): dict_df_cells[k] = pd.DataFrame( index=dict_cells[k], - columns=['alias'], - data=[f'{k}.{x}' for x in range(len(dict_cells[k]))]) - settings.pbg_params['entities'][k] = {'num_partitions': 1} + columns=["alias"], + data=[f"{k}.{x}" for x in range(len(dict_cells[k]))], + ) + settings.pbg_params["entities"][k] = {"num_partitions": 1} entity_alias = pd.concat( - [entity_alias, dict_df_cells[k]], - ignore_index=False) + [entity_alias, dict_df_cells[k]], ignore_index=False + ) if len(ids_genes) > 0: df_genes = pd.DataFrame( - index=ids_genes, - columns=['alias'], - data=[f'{prefix_G}.{x}' for x in range(len(ids_genes))]) - settings.pbg_params['entities'][prefix_G] = {'num_partitions': 1} - entity_alias = pd.concat( - [entity_alias, df_genes], - 
ignore_index=False) + index=ids_genes, + columns=["alias"], + data=[f"{prefix_G}.{x}" for x in range(len(ids_genes))], + ) + settings.pbg_params["entities"][prefix_G] = {"num_partitions": 1} + entity_alias = pd.concat([entity_alias, df_genes], ignore_index=False) if get_marker_significance and (len(ids_ngenes) > 0): df_ngenes = pd.DataFrame( - index=ids_ngenes, - columns=['alias'], - data=ids_ngenes.tolist()) - settings.pbg_params['entities'][f'n{prefix_G}'] = {'num_partitions': 1} - entity_alias = pd.concat([entity_alias, df_ngenes], - ignore_index=False) + index=ids_ngenes, columns=["alias"], data=ids_ngenes.tolist() + ) + settings.pbg_params["entities"][f"n{prefix_G}"] = {"num_partitions": 1} + entity_alias = pd.concat([entity_alias, df_ngenes], ignore_index=False) if len(ids_peaks) > 0: df_peaks = pd.DataFrame( - index=ids_peaks, - columns=['alias'], - data=[f'{prefix_P}.{x}' for x in range(len(ids_peaks))]) - settings.pbg_params['entities'][prefix_P] = {'num_partitions': 1} - entity_alias = pd.concat( - [entity_alias, df_peaks], - ignore_index=False) + index=ids_peaks, + columns=["alias"], + data=[f"{prefix_P}.{x}" for x in range(len(ids_peaks))], + ) + settings.pbg_params["entities"][prefix_P] = {"num_partitions": 1} + entity_alias = pd.concat([entity_alias, df_peaks], ignore_index=False) if get_marker_significance and (len(ids_npeaks) > 0): df_npeaks = pd.DataFrame( - index=ids_npeaks, - columns=['alias'], - data=ids_npeaks.tolist()) - settings.pbg_params['entities'][f'n{prefix_P}'] = {'num_partitions': 1} - entity_alias = pd.concat([entity_alias,df_npeaks], - ignore_index=False) + index=ids_npeaks, columns=["alias"], data=ids_npeaks.tolist() + ) + settings.pbg_params["entities"][f"n{prefix_P}"] = {"num_partitions": 1} + entity_alias = pd.concat([entity_alias, df_npeaks], ignore_index=False) if len(ids_kmers) > 0: df_kmers = pd.DataFrame( - index=ids_kmers, - columns=['alias'], - data=[f'{prefix_K}.{x}' for x in range(len(ids_kmers))]) - settings.pbg_params['entities'][prefix_K] = {'num_partitions': 1} - entity_alias = pd.concat( - [entity_alias, df_kmers], - ignore_index=False) + index=ids_kmers, + columns=["alias"], + data=[f"{prefix_K}.{x}" for x in range(len(ids_kmers))], + ) + settings.pbg_params["entities"][prefix_K] = {"num_partitions": 1} + entity_alias = pd.concat([entity_alias, df_kmers], ignore_index=False) if get_marker_significance and (len(ids_nkmers) > 0): df_nkmers = pd.DataFrame( - index=ids_nkmers, - columns=['alias'], - data=ids_nkmers.tolist()) - settings.pbg_params['entities'][f'n{prefix_K}'] = {'num_partitions': 1} - entity_alias = pd.concat([entity_alias,df_nkmers], - ignore_index=False) + index=ids_nkmers, columns=["alias"], data=ids_nkmers.tolist() + ) + settings.pbg_params["entities"][f"n{prefix_K}"] = {"num_partitions": 1} + entity_alias = pd.concat([entity_alias, df_nkmers], ignore_index=False) + if len(ids_annots) > 0: + df_annots = pd.DataFrame( + index=ids_annots, + columns=["alias"], + data=[f"{prefix_A}.{x}" for x in range(len(ids_annots))], + ) + settings.pbg_params["entities"][prefix_A] = {"num_partitions": 1} + entity_alias = pd.concat([entity_alias, df_annots], ignore_index=False) + if get_marker_significance and (len(ids_nannots) > 0): + df_nannots = pd.DataFrame( + index=ids_nannots, columns=["alias"], data=ids_nannots.tolist() + ) + settings.pbg_params["entities"][f"n{prefix_A}"] = {"num_partitions": 1} + entity_alias = pd.concat([entity_alias, df_nannots], ignore_index=False) if len(ids_motifs) > 0: df_motifs = pd.DataFrame( index=ids_motifs, 
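For orientation, the alias tables assembled above map every original ID to a compact `<prefix>.<position>` token, and the same token is later written back into `.obs['pbg_id']`/`.var['pbg_id']`. A toy illustration with hypothetical gene IDs:

import pandas as pd

ids_genes = pd.Index(["GATA1", "TAL1", "KLF1"])  # hypothetical gene IDs
df_genes = pd.DataFrame(
    index=ids_genes,
    columns=["alias"],
    data=[f"G.{x}" for x in range(len(ids_genes))],
)
print(df_genes)
#       alias
# GATA1   G.0
# TAL1    G.1
# KLF1    G.2
# After training, embeddings are keyed by these aliases, so
# adata.var['pbg_id'] is what links them back to gene names.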
- columns=['alias'], - data=[f'{prefix_M}.{x}' for x in range(len(ids_motifs))]) - settings.pbg_params['entities'][prefix_M] = {'num_partitions': 1} - entity_alias = pd.concat( - [entity_alias, df_motifs], - ignore_index=False) + columns=["alias"], + data=[f"{prefix_M}.{x}" for x in range(len(ids_motifs))], + ) + settings.pbg_params["entities"][prefix_M] = {"num_partitions": 1} + entity_alias = pd.concat([entity_alias, df_motifs], ignore_index=False) if get_marker_significance and (len(ids_nmotifs) > 0): df_nmotifs = pd.DataFrame( - index=ids_nmotifs, - columns=['alias'], - data=ids_nmotifs.tolist()) - settings.pbg_params['entities'][f'n{prefix_M}'] = {'num_partitions': 1} - entity_alias = pd.concat([entity_alias,df_nmotifs], - ignore_index=False) + index=ids_nmotifs, columns=["alias"], data=ids_nmotifs.tolist() + ) + settings.pbg_params["entities"][f"n{prefix_M}"] = {"num_partitions": 1} + entity_alias = pd.concat([entity_alias, df_nmotifs], ignore_index=False) # generate edges dict_graph_stats = dict() if add_edge_weights: @@ -525,12 +557,12 @@ def _get_df_edges(adj_mat, df_source, df_dest, adata, relation_id, include_weigh col_names = ["source", "relation", "destination"] df_edges = pd.DataFrame(columns=col_names) id_r = 0 - settings.pbg_params['relations'] = [] + settings.pbg_params["relations"] = [] if list_CP is not None: for i, adata_ori in enumerate(list_CP): if use_top_pcs: - adata = adata_ori[:, adata_ori.var['top_pcs']].copy() + adata = adata_ori[:, adata_ori.var["top_pcs"]].copy() else: adata = adata_ori.copy() # select reference of cells @@ -541,277 +573,438 @@ def _get_df_edges(adj_mat, df_source, df_dest, adata, relation_id, include_weigh if layer in adata.layers.keys(): arr_simba = adata.layers[layer] else: - print(f'`{layer}` does not exist in anndata {i} ' - 'in `list_CP`.`.X` is being used instead.') + print( + f"`{layer}` does not exist in anndata {i} " + "in `list_CP`.`.X` is being used instead." 
+ ) arr_simba = adata.X else: arr_simba = adata.X if get_marker_significance: - #n_npeaks = int(len(ids_peaks)*fold_null_nodes) - null_matrix = _randomize_matrix(arr_simba, n_npeaks, method='degPreserving') - null_adata = ad.AnnData(obs=adata.obs, var=df_npeaks, layers={"disc":null_matrix}) + # n_npeaks = int(len(ids_peaks)*fold_null_nodes) + null_matrix = _randomize_matrix( + arr_simba, n_npeaks, method="degPreserving" + ) + null_adata = ad.AnnData( + obs=adata.obs, var=df_npeaks, layers={"disc": null_matrix} + ) _row, _col = arr_simba.nonzero() df_edges_x = pd.DataFrame(columns=col_names) - df_edges_x['source'] = df_cells.loc[ - adata.obs_names[_row], 'alias'].values - df_edges_x['relation'] = f'r{id_r}' - df_edges_x['destination'] = df_peaks.loc[ - adata.var_names[_col], 'alias'].values + df_edges_x["source"] = df_cells.loc[ + adata.obs_names[_row], "alias" + ].values + df_edges_x["relation"] = f"r{id_r}" + df_edges_x["destination"] = df_peaks.loc[ + adata.var_names[_col], "alias" + ].values if add_edge_weights: - df_edges_x['weight'] = \ - arr_simba[_row, _col].A.flatten() - settings.pbg_params['relations'].append({ - 'name': f'r{id_r}', - 'lhs': f'{key}', - 'rhs': f'{prefix_P}', - 'operator': 'none', - 'weight': 1.0 - }) - dict_graph_stats[f'relation{id_r}'] = { - 'source': key, - 'destination': prefix_P, - 'n_edges': df_edges_x.shape[0]} + df_edges_x["weight"] = arr_simba[_row, _col].A.flatten() + settings.pbg_params["relations"].append( + { + "name": f"r{id_r}", + "lhs": f"{key}", + "rhs": f"{prefix_P}", + "operator": "none", + "weight": 1.0, + } + ) + dict_graph_stats[f"relation{id_r}"] = { + "source": key, + "destination": prefix_P, + "n_edges": df_edges_x.shape[0], + } print( - f'relation{id_r}: ' - f'source: {key}, ' - f'destination: {prefix_P}\n' - f'#edges: {df_edges_x.shape[0]}') + f"relation{id_r}: " + f"source: {key}, " + f"destination: {prefix_P}\n" + f"#edges: {df_edges_x.shape[0]}" + ) id_r += 1 - df_edges = pd.concat( - [df_edges, df_edges_x], - ignore_index=True) - adata_ori.obs['pbg_id'] = "" - adata_ori.var['pbg_id'] = "" - adata_ori.obs.loc[adata.obs_names, 'pbg_id'] = \ - df_cells.loc[adata.obs_names, 'alias'].copy() - adata_ori.var.loc[adata.var_names, 'pbg_id'] = \ - df_peaks.loc[adata.var_names, 'alias'].copy() + df_edges = pd.concat([df_edges, df_edges_x], ignore_index=True) + adata_ori.obs["pbg_id"] = "" + adata_ori.var["pbg_id"] = "" + adata_ori.obs.loc[adata.obs_names, "pbg_id"] = df_cells.loc[ + adata.obs_names, "alias" + ].copy() + adata_ori.var.loc[adata.var_names, "pbg_id"] = df_peaks.loc[ + adata.var_names, "alias" + ].copy() if get_marker_significance: _col, _row = null_matrix.transpose().nonzero() df_edges_x = pd.DataFrame(columns=col_names) - df_edges_x['destination'] = df_cells.loc[ - null_adata.obs_names[_row], 'alias'].values - df_edges_x['relation'] = f'r{id_r}' - df_edges_x['source'] = df_npeaks.loc[ - null_adata.var_names[_col], 'alias'].values - df_edges_x['weight'] = \ - null_matrix.transpose()[_col, _row].A.flatten() - settings.pbg_params['relations'].append({ - 'name': f'r{id_r}', - 'lhs': f'n{prefix_P}', - 'rhs': f'{key}', - 'operator': 'fix', - 'weight': 1.0, - }) + df_edges_x["destination"] = df_cells.loc[ + null_adata.obs_names[_row], "alias" + ].values + df_edges_x["relation"] = f"r{id_r}" + df_edges_x["source"] = df_npeaks.loc[ + null_adata.var_names[_col], "alias" + ].values + df_edges_x["weight"] = null_matrix.transpose()[ + _col, _row + ].A.flatten() + settings.pbg_params["relations"].append( + { + "name": f"r{id_r}", + "lhs": 
f"n{prefix_P}", + "rhs": f"{key}", + "operator": "fix", + "weight": 1.0, + } + ) print( - f'relation{id_r}: ' - f'source: n{prefix_P}, ' - f'destination: {key}\n' - f'#edges: {df_edges_x.shape[0]}') - dict_graph_stats[f'relation{id_r}'] = { - 'source': f'n{prefix_P}', - 'destination': key, - 'n_edges': df_edges_x.shape[0]} + f"relation{id_r}: " + f"source: n{prefix_P}, " + f"destination: {key}\n" + f"#edges: {df_edges_x.shape[0]}" + ) + dict_graph_stats[f"relation{id_r}"] = { + "source": f"n{prefix_P}", + "destination": key, + "n_edges": df_edges_x.shape[0], + } id_r += 1 - df_edges = pd.concat( - [df_edges, df_edges_x], - ignore_index=True) + df_edges = pd.concat([df_edges, df_edges_x], ignore_index=True) if list_PM is not None: for i, adata_ori in enumerate(list_PM): if use_top_pcs: - adata = adata_ori[:, adata_ori.var['top_pcs']].copy() + adata = adata_ori[:, adata_ori.var["top_pcs"]].copy() else: adata = adata_ori.copy() if layer is not None: if layer in adata.layers.keys(): arr_simba = adata.layers[layer] else: - print(f'`{layer}` does not exist in anndata {i} ' - 'in `list_PM`.`.X` is being used instead.') + print( + f"`{layer}` does not exist in anndata {i} " + "in `list_PM`.`.X` is being used instead." + ) arr_simba = adata.X else: arr_simba = adata.X if get_marker_significance: - n_nmotifs = int(len(ids_motifs)*fold_null_nodes) - null_matrix = _randomize_matrix(arr_simba, n_nmotifs, method='degPreserving') - null_adata = ad.AnnData(obs=adata.obs, var=df_nmotifs, layers={"disc":null_matrix}) + n_nmotifs = int(len(ids_motifs) * fold_null_nodes) + null_matrix = _randomize_matrix( + arr_simba, n_nmotifs, method="degPreserving" + ) + null_adata = ad.AnnData( + obs=adata.obs, var=df_nmotifs, layers={"disc": null_matrix} + ) _row, _col = arr_simba.nonzero() df_edges_x = pd.DataFrame(columns=col_names) - df_edges_x['source'] = df_peaks.loc[ - adata.obs_names[_row], 'alias'].values - df_edges_x['relation'] = f'r{id_r}' - df_edges_x['destination'] = df_motifs.loc[ - adata.var_names[_col], 'alias'].values + df_edges_x["source"] = df_peaks.loc[ + adata.obs_names[_row], "alias" + ].values + df_edges_x["relation"] = f"r{id_r}" + df_edges_x["destination"] = df_motifs.loc[ + adata.var_names[_col], "alias" + ].values if add_edge_weights: - df_edges_x['weight'] = \ - arr_simba[_row, _col].A.flatten() - settings.pbg_params['relations'].append({ - 'name': f'r{id_r}', - 'lhs': f'{prefix_P}', - 'rhs': f'{prefix_M}', - 'operator': 'none', - 'weight': 0.2 - }) + df_edges_x["weight"] = arr_simba[_row, _col].A.flatten() + settings.pbg_params["relations"].append( + { + "name": f"r{id_r}", + "lhs": f"{prefix_P}", + "rhs": f"{prefix_M}", + "operator": "none", + "weight": 0.2, + } + ) else: - settings.pbg_params['relations'].append({ - 'name': f'r{id_r}', - 'lhs': f'{prefix_P}', - 'rhs': f'{prefix_M}', - 'operator': 'none', - 'weight': 0.2 - }) - dict_graph_stats[f'relation{id_r}'] = { - 'source': prefix_P, - 'destination': prefix_M, - 'n_edges': df_edges_x.shape[0]} + settings.pbg_params["relations"].append( + { + "name": f"r{id_r}", + "lhs": f"{prefix_P}", + "rhs": f"{prefix_M}", + "operator": "none", + "weight": 0.2, + } + ) + dict_graph_stats[f"relation{id_r}"] = { + "source": prefix_P, + "destination": prefix_M, + "n_edges": df_edges_x.shape[0], + } print( - f'relation{id_r}: ' - f'source: {prefix_P}, ' - f'destination: {prefix_M}\n' - f'#edges: {df_edges_x.shape[0]}') + f"relation{id_r}: " + f"source: {prefix_P}, " + f"destination: {prefix_M}\n" + f"#edges: {df_edges_x.shape[0]}" + ) id_r += 1 - df_edges 
= pd.concat( - [df_edges, df_edges_x], - ignore_index=True) - adata_ori.obs['pbg_id'] = "" - adata_ori.var['pbg_id'] = "" - adata_ori.obs.loc[adata.obs_names, 'pbg_id'] = \ - df_peaks.loc[adata.obs_names, 'alias'].copy() - adata_ori.var.loc[adata.var_names, 'pbg_id'] = \ - df_motifs.loc[adata.var_names, 'alias'].copy() + df_edges = pd.concat([df_edges, df_edges_x], ignore_index=True) + adata_ori.obs["pbg_id"] = "" + adata_ori.var["pbg_id"] = "" + adata_ori.obs.loc[adata.obs_names, "pbg_id"] = df_peaks.loc[ + adata.obs_names, "alias" + ].copy() + adata_ori.var.loc[adata.var_names, "pbg_id"] = df_motifs.loc[ + adata.var_names, "alias" + ].copy() if get_marker_significance: _col, _row = null_matrix.transpose().nonzero() df_edges_x = pd.DataFrame(columns=col_names) - df_edges_x['destination'] = df_peaks.loc[ - null_adata.obs_names[_row], 'alias'].values - df_edges_x['relation'] = f'r{id_r}' - df_edges_x['source'] = df_nmotifs.loc[ - null_adata.var_names[_col], 'alias'].values - df_edges_x['weight'] = \ - null_matrix.transpose()[_col, _row].A.flatten() - settings.pbg_params['relations'].append({ - 'name': f'r{id_r}', - 'lhs': f'n{prefix_M}', - 'rhs': f'{prefix_P}', - 'operator': 'fix', - 'weight': 1.0, - }) + df_edges_x["destination"] = df_peaks.loc[ + null_adata.obs_names[_row], "alias" + ].values + df_edges_x["relation"] = f"r{id_r}" + df_edges_x["source"] = df_nmotifs.loc[ + null_adata.var_names[_col], "alias" + ].values + df_edges_x["weight"] = null_matrix.transpose()[ + _col, _row + ].A.flatten() + settings.pbg_params["relations"].append( + { + "name": f"r{id_r}", + "lhs": f"n{prefix_M}", + "rhs": f"{prefix_P}", + "operator": "fix", + "weight": 1.0, + } + ) print( - f'relation{id_r}: ' - f'source: n{prefix_M}, ' - f'destination: {prefix_P}\n' - f'#edges: {df_edges_x.shape[0]}') - dict_graph_stats[f'relation{id_r}'] = { - 'source': f'n{prefix_M}', - 'destination': prefix_P, - 'n_edges': df_edges_x.shape[0]} + f"relation{id_r}: " + f"source: n{prefix_M}, " + f"destination: {prefix_P}\n" + f"#edges: {df_edges_x.shape[0]}" + ) + dict_graph_stats[f"relation{id_r}"] = { + "source": f"n{prefix_M}", + "destination": prefix_P, + "n_edges": df_edges_x.shape[0], + } id_r += 1 - df_edges = pd.concat( - [df_edges, df_edges_x], - ignore_index=True) + df_edges = pd.concat([df_edges, df_edges_x], ignore_index=True) if list_PK is not None: for i, adata_ori in enumerate(list_PK): if use_top_pcs: - adata = adata_ori[:, adata_ori.var['top_pcs']].copy() + adata = adata_ori[:, adata_ori.var["top_pcs"]].copy() else: adata = adata_ori.copy() if layer is not None: if layer in adata.layers.keys(): arr_simba = adata.layers[layer] else: - print(f'`{layer}` does not exist in anndata {i} ' - 'in `list_PK`.`.X` is being used instead.') + print( + f"`{layer}` does not exist in anndata {i} " + "in `list_PK`.`.X` is being used instead." 
+ ) + arr_simba = adata.X + else: + arr_simba = adata.X + if get_marker_significance: + n_nkmers = int(len(ids_kmers) * fold_null_nodes) + null_matrix = _randomize_matrix( + arr_simba, n_nkmers, method="degPreserving" + ) + null_adata = ad.AnnData( + obs=adata.obs, var=df_nkmers, layers={"disc": null_matrix} + ) + _row, _col = arr_simba.nonzero() + df_edges_x = pd.DataFrame(columns=col_names) + df_edges_x["source"] = df_peaks.loc[ + adata.obs_names[_row], "alias" + ].values + df_edges_x["relation"] = f"r{id_r}" + df_edges_x["destination"] = df_kmers.loc[ + adata.var_names[_col], "alias" + ].values + if add_edge_weights: + df_edges_x["weight"] = arr_simba[_row, _col].A.flatten() + settings.pbg_params["relations"].append( + { + "name": f"r{id_r}", + "lhs": f"{prefix_P}", + "rhs": f"{prefix_K}", + "operator": "none", + "weight": 0.02, + } + ) + else: + settings.pbg_params["relations"].append( + { + "name": f"r{id_r}", + "lhs": f"{prefix_P}", + "rhs": f"{prefix_K}", + "operator": "none", + "weight": 0.02, + } + ) + print( + f"relation{id_r}: " + f"source: {prefix_P}, " + f"destination: {prefix_K}\n" + f"#edges: {df_edges_x.shape[0]}" + ) + dict_graph_stats[f"relation{id_r}"] = { + "source": prefix_P, + "destination": prefix_K, + "n_edges": df_edges_x.shape[0], + } + + id_r += 1 + df_edges = pd.concat([df_edges, df_edges_x], ignore_index=True) + adata_ori.obs["pbg_id"] = "" + adata_ori.var["pbg_id"] = "" + adata_ori.obs.loc[adata.obs_names, "pbg_id"] = df_peaks.loc[ + adata.obs_names, "alias" + ].copy() + adata_ori.var.loc[adata.var_names, "pbg_id"] = df_kmers.loc[ + adata.var_names, "alias" + ].copy() + if get_marker_significance: + _col, _row = null_matrix.transpose().nonzero() + df_edges_x = pd.DataFrame(columns=col_names) + df_edges_x["destination"] = df_peaks.loc[ + null_adata.obs_names[_row], "alias" + ].values + df_edges_x["relation"] = f"r{id_r}" + df_edges_x["source"] = df_nkmers.loc[ + null_adata.var_names[_col], "alias" + ].values + df_edges_x["weight"] = null_matrix.transpose()[ + _col, _row + ].A.flatten() + settings.pbg_params["relations"].append( + { + "name": f"r{id_r}", + "lhs": f"n{prefix_K}", + "rhs": f"{prefix_P}", + "operator": "fix", + "weight": 0.02, + } + ) + print( + f"relation{id_r}: " + f"source: n{prefix_K}, " + f"destination: {prefix_P}\n" + f"#edges: {df_edges_x.shape[0]}" + ) + dict_graph_stats[f"relation{id_r}"] = { + "source": f"n{prefix_K}", + "destination": prefix_P, + "n_edges": df_edges_x.shape[0], + } + id_r += 1 + df_edges = pd.concat([df_edges, df_edges_x], ignore_index=True) + + if list_KA is not None: + for i, adata_ori in enumerate(list_KA): + adata = adata_ori.copy() + if layer is not None: + if layer in adata.layers.keys(): + arr_simba = adata.layers[layer] + else: + print( + f"`{layer}` does not exist in anndata {i} " + "in `list_KA`.`.X` is being used instead." 
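The kmer-annotation relation consumed by this new block is just another AnnData whose rows are kmers and whose columns are annotation terms. A minimal sketch of constructing such an input by hand (kmer and annotation labels are hypothetical, not part of this diff):

import anndata as ad
import numpy as np
from scipy.sparse import csr_matrix

kmers = ["AAACGT", "CCGTTA", "TTGACC"]  # hypothetical kmer IDs
annots = ["TF_binding", "repeat_element"]  # hypothetical annotation IDs
mat = csr_matrix(
    np.array([[1, 0], [1, 1], [0, 1]], dtype=float)
)  # kmer x annotation membership
adata_KA = ad.AnnData(X=mat)
adata_KA.obs_names = kmers
adata_KA.var_names = annots
adata_KA.layers["simba"] = adata_KA.X.copy()  # layer read by gen_graph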
+ ) arr_simba = adata.X else: arr_simba = adata.X if get_marker_significance: - n_nkmers = int(len(ids_kmers)*fold_null_nodes) - null_matrix = _randomize_matrix(arr_simba, n_nkmers, method='degPreserving') - null_adata = ad.AnnData(obs=adata.obs, var=df_nkmers, layers={"disc":null_matrix}) + n_nannots = int(len(ids_annots) * fold_null_nodes) + null_matrix = _randomize_matrix( + arr_simba, n_nannots, method="degPreserving" + ) + null_adata = ad.AnnData( + obs=adata.obs, var=df_nannots, layers={"disc": null_matrix} + ) _row, _col = arr_simba.nonzero() df_edges_x = pd.DataFrame(columns=col_names) - df_edges_x['source'] = df_peaks.loc[ - adata.obs_names[_row], 'alias'].values - df_edges_x['relation'] = f'r{id_r}' - df_edges_x['destination'] = df_kmers.loc[ - adata.var_names[_col], 'alias'].values + df_edges_x["source"] = df_kmers.loc[ + adata.obs_names[_row], "alias" + ].values + df_edges_x["relation"] = f"r{id_r}" + df_edges_x["destination"] = df_annots.loc[ + adata.var_names[_col], "alias" + ].values if add_edge_weights: - df_edges_x['weight'] = \ - arr_simba[_row, _col].A.flatten() - settings.pbg_params['relations'].append({ - 'name': f'r{id_r}', - 'lhs': f'{prefix_P}', - 'rhs': f'{prefix_K}', - 'operator': 'none', - 'weight': 0.02 - }) + df_edges_x["weight"] = arr_simba[_row, _col].A.flatten() + settings.pbg_params["relations"].append( + { + "name": f"r{id_r}", + "lhs": f"{prefix_K}", + "rhs": f"{prefix_A}", + "operator": "none", + "weight": 0.02, # ? + } + ) else: - settings.pbg_params['relations'].append({ - 'name': f'r{id_r}', - 'lhs': f'{prefix_P}', - 'rhs': f'{prefix_K}', - 'operator': 'none', - 'weight': 0.02 - }) + settings.pbg_params["relations"].append( + { + "name": f"r{id_r}", + "lhs": f"{prefix_K}", + "rhs": f"{prefix_A}", + "operator": "none", + "weight": 0.02, + } + ) print( - f'relation{id_r}: ' - f'source: {prefix_P}, ' - f'destination: {prefix_K}\n' - f'#edges: {df_edges_x.shape[0]}') - dict_graph_stats[f'relation{id_r}'] = { - 'source': prefix_P, - 'destination': prefix_K, - 'n_edges': df_edges_x.shape[0]} + f"relation{id_r}: " + f"source: {prefix_K}, " + f"destination: {prefix_A}\n" + f"#edges: {df_edges_x.shape[0]}" + ) + dict_graph_stats[f"relation{id_r}"] = { + "source": prefix_K, + "destination": prefix_A, + "n_edges": df_edges_x.shape[0], + } id_r += 1 - df_edges = pd.concat( - [df_edges, df_edges_x], - ignore_index=True) - adata_ori.obs['pbg_id'] = "" - adata_ori.var['pbg_id'] = "" - adata_ori.obs.loc[adata.obs_names, 'pbg_id'] = \ - df_peaks.loc[adata.obs_names, 'alias'].copy() - adata_ori.var.loc[adata.var_names, 'pbg_id'] = \ - df_kmers.loc[adata.var_names, 'alias'].copy() + df_edges = pd.concat([df_edges, df_edges_x], ignore_index=True) + adata_ori.obs["pbg_id"] = "" + adata_ori.var["pbg_id"] = "" + adata_ori.obs.loc[adata.obs_names, "pbg_id"] = df_kmers.loc[ + adata.obs_names, "alias" + ].copy() + adata_ori.var.loc[adata.var_names, "pbg_id"] = df_annots.loc[ + adata.var_names, "alias" + ].copy() if get_marker_significance: _col, _row = null_matrix.transpose().nonzero() df_edges_x = pd.DataFrame(columns=col_names) - df_edges_x['destination'] = df_peaks.loc[ - null_adata.obs_names[_row], 'alias'].values - df_edges_x['relation'] = f'r{id_r}' - df_edges_x['source'] = df_nkmers.loc[ - null_adata.var_names[_col], 'alias'].values - df_edges_x['weight'] = \ - null_matrix.transpose()[_col, _row].A.flatten() - settings.pbg_params['relations'].append({ - 'name': f'r{id_r}', - 'lhs': f'n{prefix_K}', - 'rhs': f'{prefix_P}', - 'operator': 'fix', - 'weight': 0.02, - }) + 
df_edges_x["destination"] = df_peaks.loc[ + null_adata.obs_names[_row], "alias" + ].values + df_edges_x["relation"] = f"r{id_r}" + df_edges_x["source"] = df_nkmers.loc[ + null_adata.var_names[_col], "alias" + ].values + df_edges_x["weight"] = null_matrix.transpose()[ + _col, _row + ].A.flatten() + settings.pbg_params["relations"].append( + { + "name": f"r{id_r}", + "lhs": f"n{prefix_K}", + "rhs": f"{prefix_A}", + "operator": "fix", + "weight": 0.02, + } + ) print( - f'relation{id_r}: ' - f'source: n{prefix_K}, ' - f'destination: {prefix_P}\n' - f'#edges: {df_edges_x.shape[0]}') - dict_graph_stats[f'relation{id_r}'] = { - 'source': f'n{prefix_K}', - 'destination': prefix_P, - 'n_edges': df_edges_x.shape[0]} + f"relation{id_r}: " + f"source: n{prefix_K}, " + f"destination: {prefix_A}\n" + f"#edges: {df_edges_x.shape[0]}" + ) + dict_graph_stats[f"relation{id_r}"] = { + "source": f"n{prefix_K}", + "destination": prefix_A, + "n_edges": df_edges_x.shape[0], + } id_r += 1 - df_edges = pd.concat( - [df_edges, df_edges_x], - ignore_index=True) + df_edges = pd.concat([df_edges, df_edges_x], ignore_index=True) if list_CG is not None: for i, adata_ori in enumerate(list_CG): if use_highly_variable: - adata = adata_ori[ - :, adata_ori.var['highly_variable']].copy() + adata = adata_ori[:, adata_ori.var["highly_variable"]].copy() else: adata = adata_ori.copy() # select reference of cells @@ -822,135 +1015,168 @@ def _get_df_edges(adj_mat, df_source, df_dest, adata, relation_id, include_weigh if layer in adata.layers.keys(): arr_simba = adata.layers[layer] else: - print(f'`{layer}` does not exist in anndata {i} ' - 'in `list_CG`.`.X` is being used instead.') + print( + f"`{layer}` does not exist in anndata {i} " + "in `list_CG`.`.X` is being used instead." + ) arr_simba = adata.X else: arr_simba = adata.X if get_marker_significance: - n_ngenes = int(len(ids_genes)*fold_null_nodes) - null_exp_matrix = _randomize_matrix(arr_simba, n_ngenes, method='degPreserving') - null_adata = ad.AnnData(obs=adata.obs, var=df_ngenes, layers={"disc":null_exp_matrix}) + n_ngenes = int(len(ids_genes) * fold_null_nodes) + null_exp_matrix = _randomize_matrix( + arr_simba, n_ngenes, method="degPreserving" + ) + null_adata = ad.AnnData( + obs=adata.obs, var=df_ngenes, layers={"disc": null_exp_matrix} + ) if add_edge_weights: _row, _col = arr_simba.nonzero() df_edges_x = pd.DataFrame(columns=col_names) - df_edges_x['source'] = df_cells.loc[ - adata.obs_names[_row], 'alias'].values - df_edges_x['relation'] = f'r{id_r}' - df_edges_x['destination'] = df_genes.loc[ - adata.var_names[_col], 'alias'].values - df_edges_x['weight'] = \ - arr_simba[_row, _col].A.flatten() - settings.pbg_params['relations'].append({ - 'name': f'r{id_r}', - 'lhs': f'{key}', - 'rhs': f'{prefix_G}', - 'operator': 'none', - 'weight': 1.0, - }) + df_edges_x["source"] = df_cells.loc[ + adata.obs_names[_row], "alias" + ].values + df_edges_x["relation"] = f"r{id_r}" + df_edges_x["destination"] = df_genes.loc[ + adata.var_names[_col], "alias" + ].values + df_edges_x["weight"] = arr_simba[_row, _col].A.flatten() + settings.pbg_params["relations"].append( + { + "name": f"r{id_r}", + "lhs": f"{key}", + "rhs": f"{prefix_G}", + "operator": "none", + "weight": 1.0, + } + ) print( - f'relation{id_r}: ' - f'source: {key}, ' - f'destination: {prefix_G}\n' - f'#edges: {df_edges_x.shape[0]}') - dict_graph_stats[f'relation{id_r}'] = { - 'source': key, - 'destination': prefix_G, - 'n_edges': df_edges_x.shape[0]} + f"relation{id_r}: " + f"source: {key}, " + f"destination: 
{prefix_G}\n" + f"#edges: {df_edges_x.shape[0]}" + ) + dict_graph_stats[f"relation{id_r}"] = { + "source": key, + "destination": prefix_G, + "n_edges": df_edges_x.shape[0], + } id_r += 1 - df_edges = pd.concat( - [df_edges, df_edges_x], - ignore_index=True) + df_edges = pd.concat([df_edges, df_edges_x], ignore_index=True) if get_marker_significance: _col, _row = null_exp_matrix.transpose().nonzero() df_edges_x = pd.DataFrame(columns=col_names) - df_edges_x['destination'] = df_cells.loc[ - null_adata.obs_names[_row], 'alias'].values - df_edges_x['relation'] = f'r{id_r}' - df_edges_x['source'] = df_ngenes.loc[ - null_adata.var_names[_col], 'alias'].values - df_edges_x['weight'] = \ - null_exp_matrix.transpose()[_col, _row].A.flatten() - settings.pbg_params['relations'].append({ - 'name': f'r{id_r}', - 'lhs': f'n{prefix_G}', - 'rhs': f'{key}', - 'operator': 'fix', - 'weight': 1.0, - }) + df_edges_x["destination"] = df_cells.loc[ + null_adata.obs_names[_row], "alias" + ].values + df_edges_x["relation"] = f"r{id_r}" + df_edges_x["source"] = df_ngenes.loc[ + null_adata.var_names[_col], "alias" + ].values + df_edges_x["weight"] = null_exp_matrix.transpose()[ + _col, _row + ].A.flatten() + settings.pbg_params["relations"].append( + { + "name": f"r{id_r}", + "lhs": f"n{prefix_G}", + "rhs": f"{key}", + "operator": "fix", + "weight": 1.0, + } + ) print( - f'relation{id_r}: ' - f'source: n{prefix_G}, ' - f'destination: {key}\n' - f'#edges: {df_edges_x.shape[0]}') - dict_graph_stats[f'relation{id_r}'] = { - 'source': f'n{prefix_G}', - 'destination': key, - 'n_edges': df_edges_x.shape[0]} + f"relation{id_r}: " + f"source: n{prefix_G}, " + f"destination: {key}\n" + f"#edges: {df_edges_x.shape[0]}" + ) + dict_graph_stats[f"relation{id_r}"] = { + "source": f"n{prefix_G}", + "destination": key, + "n_edges": df_edges_x.shape[0], + } id_r += 1 - df_edges = pd.concat( - [df_edges, df_edges_x], - ignore_index=True) + df_edges = pd.concat([df_edges, df_edges_x], ignore_index=True) else: expr_level = np.unique(arr_simba.data) - expr_weight = np.linspace( - start=1, stop=5, num=len(expr_level)) + expr_weight = np.linspace(start=1, stop=5, num=len(expr_level)) for i_lvl, lvl in enumerate(expr_level): _row, _col = (arr_simba == lvl).astype(int).nonzero() df_edges_x = pd.DataFrame(columns=col_names) - df_edges_x['source'] = df_cells.loc[ - adata.obs_names[_row], 'alias'].values - df_edges_x['relation'] = f'r{id_r}' - df_edges_x['destination'] = df_genes.loc[ - adata.var_names[_col], 'alias'].values - settings.pbg_params['relations'].append({ - 'name': f'r{id_r}', - 'lhs': f'{key}', - 'rhs': f'{prefix_G}', - 'operator': 'none', - 'weight': round(expr_weight[i_lvl], 2), - }) + df_edges_x["source"] = df_cells.loc[ + adata.obs_names[_row], "alias" + ].values + df_edges_x["relation"] = f"r{id_r}" + df_edges_x["destination"] = df_genes.loc[ + adata.var_names[_col], "alias" + ].values + settings.pbg_params["relations"].append( + { + "name": f"r{id_r}", + "lhs": f"{key}", + "rhs": f"{prefix_G}", + "operator": "none", + "weight": round(expr_weight[i_lvl], 2), + } + ) print( - f'relation{id_r}: ' - f'source: {key}, ' - f'destination: {prefix_G}\n' - f'#edges: {df_edges_x.shape[0]}') - dict_graph_stats[f'relation{id_r}'] = { - 'source': key, - 'destination': prefix_G, - 'n_edges': df_edges_x.shape[0]} + f"relation{id_r}: " + f"source: {key}, " + f"destination: {prefix_G}\n" + f"#edges: {df_edges_x.shape[0]}" + ) + dict_graph_stats[f"relation{id_r}"] = { + "source": key, + "destination": prefix_G, + "n_edges": 
df_edges_x.shape[0], + } id_r += 1 - df_edges = pd.concat( - [df_edges, df_edges_x], ignore_index=True) + df_edges = pd.concat([df_edges, df_edges_x], ignore_index=True) if get_marker_significance: - # generate null AnnData with cells x null genes - df_edges_v = _get_df_edges((null_exp_matrix == lvl).astype(int).T, - df_ngenes, df_cells, null_adata.transpose(), f'r{id_r}', include_weight=True, weight_scale=lvl) - print(f'relation{id_r}: ' - f'source: n{prefix_G}, ' - f'destination: {key}\n' - f'#edges: {df_edges_v.shape[0]}') - dict_graph_stats[f'relation{id_r}'] = \ - {'source': f'n{prefix_G}', - 'destination': key, - 'n_edges': df_edges_v.shape[0]} - df_edges = pd.concat([df_edges, df_edges_v], - ignore_index=True) - settings.pbg_params['relations'].append( - {'name': f'r{id_r}', - 'lhs': f'n{prefix_G}', - 'rhs': f'{key}', - 'operator': 'fix', - 'weight': round(expr_weight[i_lvl], 2), - }) + # generate null AnnData with cells x null genes + df_edges_v = _get_df_edges( + (null_exp_matrix == lvl).astype(int).T, + df_ngenes, + df_cells, + null_adata.transpose(), + f"r{id_r}", + include_weight=True, + weight_scale=lvl, + ) + print( + f"relation{id_r}: " + f"source: n{prefix_G}, " + f"destination: {key}\n" + f"#edges: {df_edges_v.shape[0]}" + ) + dict_graph_stats[f"relation{id_r}"] = { + "source": f"n{prefix_G}", + "destination": key, + "n_edges": df_edges_v.shape[0], + } + df_edges = pd.concat( + [df_edges, df_edges_v], ignore_index=True + ) + settings.pbg_params["relations"].append( + { + "name": f"r{id_r}", + "lhs": f"n{prefix_G}", + "rhs": f"{key}", + "operator": "fix", + "weight": round(expr_weight[i_lvl], 2), + } + ) id_r += 1 - adata_ori.obs['pbg_id'] = "" - adata_ori.var['pbg_id'] = "" - adata_ori.obs.loc[adata.obs_names, 'pbg_id'] = \ - df_cells.loc[adata.obs_names, 'alias'].copy() - adata_ori.var.loc[adata.var_names, 'pbg_id'] = \ - df_genes.loc[adata.var_names, 'alias'].copy() + adata_ori.obs["pbg_id"] = "" + adata_ori.var["pbg_id"] = "" + adata_ori.obs.loc[adata.obs_names, "pbg_id"] = df_cells.loc[ + adata.obs_names, "alias" + ].copy() + adata_ori.var.loc[adata.var_names, "pbg_id"] = df_genes.loc[ + adata.var_names, "alias" + ].copy() if list_CC is not None: for i, adata in enumerate(list_CC): @@ -965,99 +1191,104 @@ def _get_df_edges(adj_mat, df_source, df_dest, adata, relation_id, include_weigh if layer in adata.layers.keys(): arr_simba = adata.layers[layer] else: - print(f'`{layer}` does not exist in anndata {i} ' - 'in `list_PM`.`.X` is being used instead.') + print( + f"`{layer}` does not exist in anndata {i} " + "in `list_PM`.`.X` is being used instead." 
+ ) arr_simba = adata.X else: arr_simba = adata.X _row, _col = arr_simba.nonzero() # edges between ref and query df_edges_x = pd.DataFrame(columns=col_names) - df_edges_x['source'] = df_cells_obs.loc[ - adata.obs_names[_row], 'alias'].values - df_edges_x['relation'] = f'r{id_r}' - df_edges_x['destination'] = df_cells_var.loc[ - adata.var_names[_col], 'alias'].values + df_edges_x["source"] = df_cells_obs.loc[ + adata.obs_names[_row], "alias" + ].values + df_edges_x["relation"] = f"r{id_r}" + df_edges_x["destination"] = df_cells_var.loc[ + adata.var_names[_col], "alias" + ].values if add_edge_weights: - df_edges_x['weight'] = \ - arr_simba[_row, _col].A.flatten() - settings.pbg_params['relations'].append({ - 'name': f'r{id_r}', - 'lhs': f'{key_obs}', - 'rhs': f'{key_var}', - 'operator': 'none', - 'weight': 1.0 - }) + df_edges_x["weight"] = arr_simba[_row, _col].A.flatten() + settings.pbg_params["relations"].append( + { + "name": f"r{id_r}", + "lhs": f"{key_obs}", + "rhs": f"{key_var}", + "operator": "none", + "weight": 1.0, + } + ) else: - settings.pbg_params['relations'].append({ - 'name': f'r{id_r}', - 'lhs': f'{key_obs}', - 'rhs': f'{key_var}', - 'operator': 'none', - 'weight': 10.0 - }) + settings.pbg_params["relations"].append( + { + "name": f"r{id_r}", + "lhs": f"{key_obs}", + "rhs": f"{key_var}", + "operator": "none", + "weight": 10.0, + } + ) print( - f'relation{id_r}: ' - f'source: {key_obs}, ' - f'destination: {key_var}\n' - f'#edges: {df_edges_x.shape[0]}') - dict_graph_stats[f'relation{id_r}'] = { - 'source': key_obs, - 'destination': key_var, - 'n_edges': df_edges_x.shape[0]} + f"relation{id_r}: " + f"source: {key_obs}, " + f"destination: {key_var}\n" + f"#edges: {df_edges_x.shape[0]}" + ) + dict_graph_stats[f"relation{id_r}"] = { + "source": key_obs, + "destination": key_var, + "n_edges": df_edges_x.shape[0], + } id_r += 1 - df_edges = pd.concat( - [df_edges, df_edges_x], - ignore_index=True) - adata.obs['pbg_id'] = df_cells_obs.loc[ - adata.obs_names, 'alias'].copy() - adata.var['pbg_id'] = df_cells_var.loc[ - adata.var_names, 'alias'].copy() + df_edges = pd.concat([df_edges, df_edges_x], ignore_index=True) + adata.obs["pbg_id"] = df_cells_obs.loc[adata.obs_names, "alias"].copy() + adata.var["pbg_id"] = df_cells_var.loc[adata.var_names, "alias"].copy() - print(f'Total number of edges: {df_edges.shape[0]}') - dict_graph_stats['n_edges'] = df_edges.shape[0] + print(f"Total number of edges: {df_edges.shape[0]}") + dict_graph_stats["n_edges"] = df_edges.shape[0] settings.graph_stats[dirname] = dict_graph_stats print(f'Writing graph file "pbg_graph.txt" to "{filepath}" ...') - df_edges.to_csv(os.path.join(filepath, "pbg_graph.txt"), - header=False, - index=False, - sep='\t') - entity_alias.to_csv(os.path.join(filepath, 'entity_alias.txt'), - header=True, - index=True, - sep='\t') - with open(os.path.join(filepath, 'graph_stats.json'), 'w') as fp: - json.dump(dict_graph_stats, - fp, - sort_keys=True, - indent=4, - separators=(',', ': ')) + df_edges.to_csv( + os.path.join(filepath, "pbg_graph.txt"), header=False, index=False, sep="\t" + ) + entity_alias.to_csv( + os.path.join(filepath, "entity_alias.txt"), header=True, index=True, sep="\t" + ) + with open(os.path.join(filepath, "graph_stats.json"), "w") as fp: + json.dump( + dict_graph_stats, fp, sort_keys=True, indent=4, separators=(",", ": ") + ) print("Finished.") - settings.graph_stats[dirname]['entities'] = settings.pbg_params['entities'] - settings.graph_stats[dirname]['relations'] = settings.pbg_params['relations'] + 
settings.graph_stats[dirname]["entities"] = settings.pbg_params["entities"] + settings.graph_stats[dirname]["relations"] = settings.pbg_params["relations"] if get_marker_significance: - filepath = os.path.join(settings.workdir, 'pbg', dirname_orig) - settings.pbg_params['entity_path'] = \ - os.path.join(filepath, "input/entity") - settings.pbg_params['edge_paths'] = \ - [os.path.join(filepath, "input/edge"), ] - settings.pbg_params['entities'] = settings.graph_stats[dirname_orig]['entities'] - settings.pbg_params['relations'] = settings.graph_stats[dirname_orig]['relations'] + filepath = os.path.join(settings.workdir, "pbg", dirname_orig) + settings.pbg_params["entity_path"] = os.path.join(filepath, "input/entity") + settings.pbg_params["edge_paths"] = [ + os.path.join(filepath, "input/edge"), + ] + settings.pbg_params["entities"] = settings.graph_stats[dirname_orig]["entities"] + settings.pbg_params["relations"] = settings.graph_stats[dirname_orig][ + "relations" + ] if copy: return df_edges else: return None -def pbg_train(dirname=None, - pbg_params=None, - output='model', - auto_wd=True, - save_wd=False, - use_edge_weights=False, - get_marker_significance=False,): +def pbg_train( + dirname=None, + pbg_params=None, + output="model", + auto_wd=True, + save_wd=False, + use_edge_weights=False, + get_marker_significance=False, +): """PBG training Parameters @@ -1093,55 +1324,66 @@ def pbg_train(dirname=None, if pbg_params is None: pbg_params = settings.pbg_params.copy() else: - assert isinstance(pbg_params, dict),\ - "`pbg_params` must be dict" + assert isinstance(pbg_params, dict), "`pbg_params` must be dict" if dirname is None: - filepath = Path(pbg_params['entity_path']).parent.parent.as_posix() + filepath = Path(pbg_params["entity_path"]).parent.parent.as_posix() dirname = os.path.basename(filepath) else: - filepath = os.path.join(settings.workdir, 'pbg', dirname) - pbg_params['checkpoint_path'] = os.path.join(filepath, output) - settings.pbg_params['checkpoint_path'] = pbg_params['checkpoint_path'] + filepath = os.path.join(settings.workdir, "pbg", dirname) + pbg_params["checkpoint_path"] = os.path.join(filepath, output) + settings.pbg_params["checkpoint_path"] = pbg_params["checkpoint_path"] if get_marker_significance: - pbg_train(dirname=dirname, - pbg_params=pbg_params, - output=output, - auto_wd=auto_wd, - save_wd=True, - use_edge_weights=use_edge_weights, - get_marker_significance=False) + pbg_train( + dirname=dirname, + pbg_params=pbg_params, + output=output, + auto_wd=auto_wd, + save_wd=True, + use_edge_weights=use_edge_weights, + get_marker_significance=False, + ) pbg_params = pbg_params.copy() - n_edges = settings.graph_stats[ - os.path.basename(filepath)]['n_edges'] + n_edges = settings.graph_stats[os.path.basename(filepath)]["n_edges"] filepath += "_with_sig" - pbg_params['checkpoint_path'] = os.path.join(filepath, output) - pbg_params['entity_path'] = os.path.join(filepath, "input/entity") - pbg_params['edge_paths'] = [os.path.join(filepath, "input/edge"), ] - pbg_params['relations'] = settings.graph_stats[dirname + "_with_sig"]['relations'] + pbg_params["checkpoint_path"] = os.path.join(filepath, output) + pbg_params["entity_path"] = os.path.join(filepath, "input/entity") + pbg_params["edge_paths"] = [ + os.path.join(filepath, "input/edge"), + ] + pbg_params["relations"] = settings.graph_stats[dirname + "_with_sig"][ + "relations" + ] auto_wd = False - pbg_params['wd'] = settings.pbg_params['wd'] * n_edges / settings.graph_stats[ - os.path.basename(filepath)]['n_edges'] - 
settings.pbg_params['wd'] = pbg_params['wd'] + pbg_params["wd"] = ( + settings.pbg_params["wd"] + * n_edges + / settings.graph_stats[os.path.basename(filepath)]["n_edges"] + ) + settings.pbg_params["wd"] = pbg_params["wd"] if auto_wd: # empirical numbers from simulation experiments - if settings.graph_stats[ - os.path.basename(filepath)]['n_edges'] < 5e7: + if settings.graph_stats[os.path.basename(filepath)]["n_edges"] < 5e7: # optimial wd (0.013) for sample size (2725781) - wd = 0.013 * 2725781 / settings.graph_stats[ - os.path.basename(filepath)]['n_edges'] + wd = ( + 0.013 + * 2725781 + / settings.graph_stats[os.path.basename(filepath)]["n_edges"] + ) else: # optimial wd (0.0004) for sample size (59103481) - wd = 0.0004 * 59103481 / settings.graph_stats[ - os.path.basename(filepath)]['n_edges'] - print(f'Auto-estimated weight decay is {wd:.6E}') - pbg_params['wd'] = wd + wd = ( + 0.0004 + * 59103481 + / settings.graph_stats[os.path.basename(filepath)]["n_edges"] + ) + print(f"Auto-estimated weight decay is {wd:.6E}") + pbg_params["wd"] = wd if save_wd: - settings.pbg_params['wd'] = pbg_params['wd'] + settings.pbg_params["wd"] = pbg_params["wd"] print(f"`.settings.pbg_params['wd']` has been updated to {wd:.6E}") - # to avoid oversubscription issues in workloads # that involve nested parallelism os.environ["OMP_NUM_THREADS"] = "1" @@ -1163,7 +1405,7 @@ def pbg_train(dirname=None, input_edge_paths, TSVEdgelistReader(lhs_col=0, rhs_col=2, rel_col=1, weight_col=3), dynamic_relations=config.dynamic_relations, - ) + ) else: convert_input_data( config.entities, @@ -1173,7 +1415,7 @@ def pbg_train(dirname=None, input_edge_paths, TSVEdgelistReader(lhs_col=0, rhs_col=2, rel_col=1), dynamic_relations=config.dynamic_relations, - ) + ) subprocess_init = SubprocessInitializer() subprocess_init.register(setup_logging, config.verbose) subprocess_init.register(add_to_sys_path, loader.config_dir.name) diff --git a/simba/tools/_post_training.py b/simba/tools/_post_training.py index 8015a78..cc5e123 100755 --- a/simba/tools/_post_training.py +++ b/simba/tools/_post_training.py @@ -6,16 +6,13 @@ from scipy.stats import entropy from sklearn.neighbors import KDTree from scipy.spatial import distance + # import faiss from ._utils import _gini, _get_fdr -def softmax(adata_ref, - adata_query, - T=0.5, - n_top=None, - percentile=0): +def softmax(adata_ref, adata_query, T=0.5, n_top=None, percentile=0): """Softmax-based transformation This will transform query data to reference-comparable data @@ -42,21 +39,32 @@ def softmax(adata_ref, Store #observations × #dimensions softmax transformed data matrix. 
""" - scores_ref_query = np.matmul(adata_ref.X, adata_query.X.T) + scores_ref_query = np.matmul(adata_ref.X, adata_query.X.T) # n_ref x n_query # avoid overflow encountered scores_ref_query = scores_ref_query - scores_ref_query.max() - scores_softmax = np.exp(scores_ref_query/T) / \ - (np.exp(scores_ref_query/T).sum(axis=0))[None, :] + scores_softmax = ( + np.exp(scores_ref_query / T) + / (np.exp(scores_ref_query / T).sum(axis=0))[None, :] + ) if n_top is None: thresh = np.percentile(scores_softmax, q=percentile, axis=0) else: - thresh = (np.sort(scores_softmax, axis=0)[::-1, :])[n_top-1, ] + thresh = (np.sort(scores_softmax, axis=0)[::-1, :])[n_top - 1,] mask = scores_softmax < thresh[None, :] scores_softmax[mask] = 0 # rescale to make scores add up to 1 - scores_softmax = scores_softmax/scores_softmax.sum(axis=0, keepdims=1) + scores_softmax = scores_softmax / scores_softmax.sum(axis=0, keepdims=1) X_query = np.dot(scores_softmax.T, adata_ref.X) - adata_query.layers['softmax'] = X_query + adata_query.layers["softmax"] = X_query + + +def stable_softmax(x): + z = x - np.max(x, axis=0) + numerator = np.exp(z) + denominator = np.sum(numerator, axis=0)[None, :] + softmax = numerator / denominator + + return softmax class SimbaEmbed: @@ -70,16 +78,17 @@ class SimbaEmbed: """ - def __init__(self, - adata_ref, - list_adata_query, - T=0.5, - list_T=None, - percentile=50, - n_top=None, - list_percentile=None, - use_precomputed=True, - ): + def __init__( + self, + adata_ref, + list_adata_query, + T=0.5, + list_T=None, + percentile=50, + n_top=None, + list_percentile=None, + use_precomputed=True, + ): """ Parameters ---------- @@ -105,11 +114,9 @@ def __init__(self, It should correspond to each of query data. Once it's specified, it will override `cutoff`. """ - assert isinstance(list_adata_query, list), \ - "`list_adata_query` must be list" + assert isinstance(list_adata_query, list), "`list_adata_query` must be list" if list_T is not None: - assert isinstance(list_T, list), \ - "`list_T` must be list" + assert isinstance(list_T, list), "`list_T` must be list" self.adata_ref = adata_ref self.list_adata_query = list_adata_query self.T = T @@ -142,7 +149,7 @@ def embed(self): # index=adata_ref.obs.index, # columns=['id_dataset']) obs_all = adata_ref.obs.copy() - obs_all['id_dataset'] = ['ref']*adata_ref.shape[0] + obs_all["id_dataset"] = ["ref"] * adata_ref.shape[0] for i, adata_query in enumerate(list_adata_query): if list_T is not None: param_T = list_T[i] @@ -153,12 +160,15 @@ def embed(self): else: param_percentile = percentile if use_precomputed: - if 'softmax' in adata_query.layers.keys(): - print(f'Reading in precomputed softmax-transformed matrix ' - f'for query data {i};') + if "softmax" in adata_query.layers.keys(): + print( + f"Reading in precomputed softmax-transformed matrix " + f"for query data {i};" + ) else: - print(f'No softmax-transformed matrix exists ' - f'for query data {i}') + print( + f"No softmax-transformed matrix exists " f"for query data {i}" + ) print("Performing softmax transformation;") softmax( adata_ref, @@ -168,16 +178,15 @@ def embed(self): n_top=n_top, ) else: - print(f'Performing softmax transformation ' - f'for query data {i};') + print(f"Performing softmax transformation " f"for query data {i};") softmax( adata_ref, adata_query, T=param_T, percentile=param_percentile, n_top=n_top, - ) - X_all = np.vstack((X_all, adata_query.layers['softmax'])) + ) + X_all = np.vstack((X_all, adata_query.layers["softmax"])) # obs_all = obs_all.append( # pd.DataFrame( # 
 
 
 class SimbaEmbed:
@@ -70,16 +78,17 @@
     """
 
-    def __init__(self,
-                 adata_ref,
-                 list_adata_query,
-                 T=0.5,
-                 list_T=None,
-                 percentile=50,
-                 n_top=None,
-                 list_percentile=None,
-                 use_precomputed=True,
-                 ):
+    def __init__(
+        self,
+        adata_ref,
+        list_adata_query,
+        T=0.5,
+        list_T=None,
+        percentile=50,
+        n_top=None,
+        list_percentile=None,
+        use_precomputed=True,
+    ):
         """
         Parameters
         ----------
@@ -105,11 +114,9 @@ def __init__(self,
             It should correspond to each of query data.
             Once it's specified, it will override `cutoff`.
         """
-        assert isinstance(list_adata_query, list), \
-            "`list_adata_query` must be list"
+        assert isinstance(list_adata_query, list), "`list_adata_query` must be list"
         if list_T is not None:
-            assert isinstance(list_T, list), \
-                "`list_T` must be list"
+            assert isinstance(list_T, list), "`list_T` must be list"
         self.adata_ref = adata_ref
         self.list_adata_query = list_adata_query
         self.T = T
@@ -142,7 +149,7 @@ def embed(self):
         #     index=adata_ref.obs.index,
         #     columns=['id_dataset'])
         obs_all = adata_ref.obs.copy()
-        obs_all['id_dataset'] = ['ref']*adata_ref.shape[0]
+        obs_all["id_dataset"] = ["ref"] * adata_ref.shape[0]
         for i, adata_query in enumerate(list_adata_query):
             if list_T is not None:
                 param_T = list_T[i]
@@ -153,12 +160,15 @@ def embed(self):
             else:
                 param_percentile = percentile
             if use_precomputed:
-                if 'softmax' in adata_query.layers.keys():
-                    print(f'Reading in precomputed softmax-transformed matrix '
-                          f'for query data {i};')
+                if "softmax" in adata_query.layers.keys():
+                    print(
+                        f"Reading in precomputed softmax-transformed matrix "
+                        f"for query data {i};"
+                    )
                 else:
-                    print(f'No softmax-transformed matrix exists '
-                          f'for query data {i}')
+                    print(
+                        f"No softmax-transformed matrix exists " f"for query data {i}"
+                    )
                     print("Performing softmax transformation;")
                     softmax(
                         adata_ref,
@@ -168,16 +178,15 @@ def embed(self):
                         n_top=n_top,
                     )
             else:
-                print(f'Performing softmax transformation '
-                      f'for query data {i};')
+                print(f"Performing softmax transformation " f"for query data {i};")
                 softmax(
                     adata_ref,
                     adata_query,
                     T=param_T,
                     percentile=param_percentile,
                     n_top=n_top,
-                    )
-            X_all = np.vstack((X_all, adata_query.layers['softmax']))
+                )
+            X_all = np.vstack((X_all, adata_query.layers["softmax"]))
             # obs_all = obs_all.append(
             #     pd.DataFrame(
             #         data=[f'query_{i}']*adata_query.shape[0],
@@ -185,22 +194,22 @@ def embed(self):
             #         columns=['id_dataset'])
             #     )
             obs_query = adata_query.obs.copy()
-            obs_query['id_dataset'] = [f'query_{i}']*adata_query.shape[0]
-            obs_all = pd.concat(
-                [obs_all, obs_query], ignore_index=False)
-        adata_all = ad.AnnData(X=X_all,
-                               obs=obs_all)
+            obs_query["id_dataset"] = [f"query_{i}"] * adata_query.shape[0]
+            obs_all = pd.concat([obs_all, obs_query], ignore_index=False)
+        adata_all = ad.AnnData(X=X_all, obs=obs_all)
         return adata_all
 
 
-def embed(adata_ref,
-          list_adata_query,
-          T=0.5,
-          list_T=None,
-          percentile=0,
-          n_top=None,
-          list_percentile=None,
-          use_precomputed=False):
+def embed(
+    adata_ref,
+    list_adata_query,
+    T=0.5,
+    list_T=None,
+    percentile=0,
+    n_top=None,
+    list_percentile=None,
+    use_precomputed=False,
+):
     """Embed a list of query datasets along with reference dataset
     into the same space
 
@@ -230,21 +239,21 @@ def embed(adata_ref,
     softmax: `array_like` (`.layers['softmax']`)
         Store #observations × #dimensions softmax transformed data matrix.
     """
-    SE = SimbaEmbed(adata_ref,
-                    list_adata_query,
-                    T=T,
-                    list_T=list_T,
-                    percentile=percentile,
-                    n_top=n_top,
-                    list_percentile=list_percentile,
-                    use_precomputed=use_precomputed)
+    SE = SimbaEmbed(
+        adata_ref,
+        list_adata_query,
+        T=T,
+        list_T=list_T,
+        percentile=percentile,
+        n_top=n_top,
+        list_percentile=list_percentile,
+        use_precomputed=use_precomputed,
+    )
     adata_all = SE.embed()
     return adata_all
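A hedged usage sketch of `embed` as defined above, on synthetic AnnData objects (the data here are random placeholders; in practice these would be PBG entity embeddings):

import anndata as ad
import numpy as np

rng = np.random.default_rng(0)
adata_C = ad.AnnData(rng.normal(size=(100, 50)))  # reference, e.g. cells
adata_G = ad.AnnData(rng.normal(size=(200, 50)))  # query, e.g. genes

adata_all = embed(adata_ref=adata_C, list_adata_query=[adata_G])
# adata_all stacks reference and softmax-projected query rows;
# adata_all.obs["id_dataset"] is "ref" for reference rows and
# "query_0", "query_1", ... for each query dataset, in list order.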
 
 
-def _compare_entities(adata_ref,
-                      adata_query,
-                      n_top_cells=50,
-                      T=1):
+
+def _compare_entities(adata_ref, adata_query, n_top_cells=50, T=1):
     """Compare the embeddings of two entities by calculating
 
     the following values between reference and query entities:
@@ -288,31 +297,33 @@ def _compare_entities(adata_ref,
     X_ref = adata_ref.X
     X_query = adata_query.X
     X_cmp = np.matmul(X_ref, X_query.T)
-    adata_cmp = ad.AnnData(X=X_cmp,
-                           obs=adata_ref.obs,
-                           var=adata_query.obs)
-    adata_cmp.layers['norm'] = X_cmp \
-        - np.log(np.exp(X_cmp).mean(axis=0)).reshape(1, -1)
-    X_sorted_normed = np.sort(adata_cmp.layers['norm'], axis=0)
-    adata_cmp.layers['softmax'] = np.exp(X_cmp/T) \
-        / np.exp(X_cmp/T).sum(axis=0).reshape(1, -1)
-    adata_cmp.var['max'] = \
-        np.clip(np.sort(adata_cmp.layers['norm'], axis=0)[-n_top_cells:, ],
-                a_min=0,
-                a_max=None).mean(axis=0)
-    adata_cmp.var['maxmin'] = (X_sorted_normed[-n_top_cells:, ]-X_sorted_normed[:n_top_cells,]).mean(axis=0)
-    adata_cmp.var['std'] = np.std(X_cmp, axis=0, ddof=1)
-    adata_cmp.var['gini'] = np.array([_gini(adata_cmp.layers['softmax'][:, i])
-                                      for i in np.arange(X_cmp.shape[1])])
-    adata_cmp.var['entropy'] = entropy(adata_cmp.layers['softmax'])
+    adata_cmp = ad.AnnData(X=X_cmp, obs=adata_ref.obs, var=adata_query.obs)
+    adata_cmp.layers["norm"] = X_cmp - np.log(np.exp(X_cmp).mean(axis=0)).reshape(1, -1)
+    X_sorted_normed = np.sort(adata_cmp.layers["norm"], axis=0)
+    adata_cmp.layers["softmax"] = np.exp(X_cmp / T) / np.exp(X_cmp / T).sum(
+        axis=0
+    ).reshape(1, -1)
+    adata_cmp.var["max"] = np.clip(
+        np.sort(adata_cmp.layers["norm"], axis=0)[-n_top_cells:,], a_min=0, a_max=None
+    ).mean(axis=0)
+    adata_cmp.var["maxmin"] = (
+        X_sorted_normed[-n_top_cells:,] - X_sorted_normed[:n_top_cells,]
+    ).mean(axis=0)
+    adata_cmp.var["std"] = np.std(X_cmp, axis=0, ddof=1)
+    adata_cmp.var["gini"] = np.array(
+        [_gini(adata_cmp.layers["softmax"][:, i]) for i in np.arange(X_cmp.shape[1])]
+    )
+    adata_cmp.var["entropy"] = entropy(adata_cmp.layers["softmax"])
     return adata_cmp
 
 
-def compare_entities(adata_ref,
-                     adata_query,
-                     adata_query_null = None,
-                     n_top_cells=50,
-                     T=1,
-                     ):
+
+def compare_entities(
+    adata_ref,
+    adata_query,
+    adata_query_null=None,
+    n_top_cells=50,
+    T=1,
+):
     """Compare the embeddings of two entities by calculating
 
     the following values between reference and query entities:
@@ -333,7 +344,7 @@ def compare_entities(adata_ref,
         based on softmax probability)
 
     Optionally produces FDR of metrics when null query embedding is provided.
-    Metric score of null embedding will be used as the null disbribution of the 
+    Metric score of null embedding will be used as the null distribution of the
     metrics.
 
     Parameters
@@ -361,13 +372,15 @@ def compare_entities(adata_ref,
     adata_cmp: `AnnData`
         Store reference entity as observations and query entity as variables
     """
-    adata_cmp = _compare_entities(adata_ref, adata_query, n_top_cells = n_top_cells, T = T)
-    if adata_query_null is None:
-        return(adata_cmp)
-
-    adata_cmp_null = _compare_entities(adata_ref, adata_query_null, n_top_cells = n_top_cells, T = T)
-    for metric in ['max', 'std', 'gini', 'entropy']:
-        if metric == 'entropy':
+    adata_cmp = _compare_entities(adata_ref, adata_query, n_top_cells=n_top_cells, T=T)
+    if adata_query_null is None:
+        return adata_cmp
+
+    adata_cmp_null = _compare_entities(
+        adata_ref, adata_query_null, n_top_cells=n_top_cells, T=T
+    )
+    for metric in ["max", "std", "gini", "entropy"]:
+        if metric == "entropy":
             # for entropy, we need the p_value in the opposite direction (lower value if significant)
             p_val, fdr = _get_fdr(-adata_cmp.var[metric], -adata_cmp_null.var[metric])
         else:
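When a null query embedding is supplied, each metric above is turned into an empirical p-value and FDR against the corresponding null scores via `_get_fdr`, with entropy sign-flipped because lower entropy (more concentrated softmax mass) marks a more significant query. A hedged usage sketch; the variable names are placeholders, and `adata_G_null` would hold the embeddings of the null entities that `gen_graph` adds when `get_marker_significance=True`:

adata_cmp = compare_entities(
    adata_ref=adata_C,              # reference (cell) embeddings
    adata_query=adata_G,            # gene embeddings
    adata_query_null=adata_G_null,  # embeddings of the null genes
    n_top_cells=50,
    T=1,
)
# adata_cmp.var holds "max", "std", "gini" and "entropy" per gene, plus the
# p-values/FDRs derived from the null distribution of each metric.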
""" - if sum(list(map(lambda x: x is None, - [entity, pin]))) == 2: + if sum(list(map(lambda x: x is None, [entity, pin]))) == 2: raise ValueError("One of `entity` and `pin` must be specified") - if sum(list(map(lambda x: x is not None, - [entity, pin]))) == 2: + if sum(list(map(lambda x: x is not None, [entity, pin]))) == 2: print("`entity` will be ignored.") if entity is not None: entity = np.array(entity).flatten() - if sum(list(map(lambda x: x is not None, - [layer, obsm]))) == 2: + if sum(list(map(lambda x: x is not None, [layer, obsm]))) == 2: raise ValueError("Only one of `layer` and `obsm` can be used") elif obsm is not None: X = adata.obsm[obsm].copy() @@ -461,29 +472,25 @@ def query(adata, if use_radius: kdt = KDTree(X, metric=metric, **kwargs) if r is None: - r = np.mean(X.max(axis=0) - X.min(axis=0))/5 - ind, dist = kdt.query_radius(pin, - r=r, - sort_results=True, - return_distance=True) + r = np.mean(X.max(axis=0) - X.min(axis=0)) / 5 + ind, dist = kdt.query_radius(pin, r=r, sort_results=True, return_distance=True) df_output = pd.DataFrame() for ii in np.arange(pin.shape[0]): - df_output_ii = adata.obs.iloc[ind[ii], ].copy() - df_output_ii['distance'] = dist[ii] + df_output_ii = adata.obs.iloc[ind[ii],].copy() + df_output_ii["distance"] = dist[ii] if entity is not None: - df_output_ii['query'] = entity[ii] + df_output_ii["query"] = entity[ii] else: - df_output_ii['query'] = ii - df_output = pd.concat( - [df_output, df_output_ii], ignore_index=False) + df_output_ii["query"] = ii + df_output = pd.concat([df_output, df_output_ii], ignore_index=False) if anno_filter is not None: if anno_filter in adata.obs_keys(): if filters is None: filters = df_output[anno_filter].unique().tolist() - df_output.query(f'{anno_filter} == @filters', inplace=True) + df_output.query(f"{anno_filter} == @filters", inplace=True) else: - raise ValueError(f'could not find {anno_filter}') - df_output = df_output.sort_values(by='distance') + raise ValueError(f"could not find {anno_filter}") + df_output = df_output.sort_values(by="distance") else: # assert (metric in ['euclidean', 'dot_product']),\ # "`metric` must be one of ['euclidean','dot_product']" @@ -491,17 +498,13 @@ def query(adata, if anno_filter in adata.obs_keys(): if filters is None: filters = adata.obs[anno_filter].unique().tolist() - ids_filters = \ - np.where(np.isin(adata.obs[anno_filter], filters))[0] + ids_filters = np.where(np.isin(adata.obs[anno_filter], filters))[0] else: - raise ValueError(f'could not find {anno_filter}') + raise ValueError(f"could not find {anno_filter}") else: ids_filters = np.arange(X.shape[0]) kdt = KDTree(X[ids_filters, :], metric=metric, **kwargs) - dist, ind = kdt.query(pin, - k=k, - sort_results=True, - return_distance=True) + dist, ind = kdt.query(pin, k=k, sort_results=True, return_distance=True) # use faiss # X = X.astype('float32') @@ -517,46 +520,47 @@ def query(adata, df_output = pd.DataFrame() for ii in np.arange(pin.shape[0]): - df_output_ii = \ - adata.obs.iloc[ids_filters, ].iloc[ind[ii, ], ].copy() - df_output_ii['distance'] = dist[ii, ] + df_output_ii = adata.obs.iloc[ids_filters,].iloc[ind[ii,],].copy() + df_output_ii["distance"] = dist[ii,] if entity is not None: - df_output_ii['query'] = entity[ii] + df_output_ii["query"] = entity[ii] else: - df_output_ii['query'] = ii - df_output = pd.concat( - [df_output, df_output_ii], ignore_index=False) - df_output = df_output.sort_values(by='distance') - - adata.uns['query'] = dict() - adata.uns['query']['params'] = {'obsm': obsm, - 'layer': layer, - 
 
 
-def find_master_regulators(adata_all,
-                           list_tf_motif=None,
-                           list_tf_gene=None,
-                           metric='euclidean',
-                           anno_filter='entity_anno',
-                           filter_gene='gene',
-                           metrics_gene=None,
-                           metrics_motif=None,
-                           cutoff_gene_max=1.5,
-                           cutoff_gene_gini=0.3,
-                           cutoff_gene_std=None,
-                           cutoff_gene_entropy=None,
-                           cutoff_motif_max=1.5,
-                           cutoff_motif_gini=0.3,
-                           cutoff_motif_std=None,
-                           cutoff_motif_entropy=None,
-                           ):
+def find_master_regulators(
+    adata_all,
+    list_tf_motif=None,
+    list_tf_gene=None,
+    metric="euclidean",
+    anno_filter="entity_anno",
+    filter_gene="gene",
+    metrics_gene=None,
+    metrics_motif=None,
+    cutoff_gene_max=1.5,
+    cutoff_gene_gini=0.3,
+    cutoff_gene_std=None,
+    cutoff_gene_entropy=None,
+    cutoff_motif_max=1.5,
+    cutoff_motif_gini=0.3,
+    cutoff_motif_std=None,
+    cutoff_motif_entropy=None,
+):
     """Find all the master regulators
 
     Parameters
@@ -599,108 +603,108 @@ def find_master_regulators(adata_all,
     df_MR: `pandas.DataFrame`
         Dataframe of master regulators
     """
-    if sum(list(map(lambda x: x is None,
-                    [list_tf_motif, list_tf_gene]))) > 0:
+    if sum(list(map(lambda x: x is None, [list_tf_motif, list_tf_gene]))) > 0:
         return "Please specify both `list_tf_motif` and `list_tf_gene`"
-    assert isinstance(list_tf_motif, list), \
-        "`list_tf_motif` must be list"
-    assert isinstance(list_tf_gene, list), \
-        "`list_tf_gene` must be list"
-    assert len(list_tf_motif) == len(list_tf_gene), \
-        "`list_tf_motif` and `list_tf_gene` must have the same length"
-    assert len(list_tf_motif) == len(set(list_tf_motif)), \
-        "Duplicates are found in `list_tf_motif`"
-
-    genes = adata_all[adata_all.obs[anno_filter] == filter_gene].\
-        obs_names.tolist().copy()
+    assert isinstance(list_tf_motif, list), "`list_tf_motif` must be list"
+    assert isinstance(list_tf_gene, list), "`list_tf_gene` must be list"
+    assert len(list_tf_motif) == len(
+        list_tf_gene
+    ), "`list_tf_motif` and `list_tf_gene` must have the same length"
+    assert len(list_tf_motif) == len(
+        set(list_tf_motif)
+    ), "Duplicates are found in `list_tf_motif`"
+
+    genes = (
+        adata_all[adata_all.obs[anno_filter] == filter_gene].obs_names.tolist().copy()
+    )
     # Master Regulator
-    df_MR = pd.DataFrame(list(zip(list_tf_motif, list_tf_gene)),
-                         columns=['motif', 'gene'])
+    df_MR = pd.DataFrame(
+        list(zip(list_tf_motif, list_tf_gene)), columns=["motif", "gene"]
+    )
     if metrics_motif is not None:
-        print('Adding motif metrics ...')
-        assert isinstance(metrics_motif, pd.DataFrame), \
-            "`metrics_motif` must be pd.DataFrame"
-        df_metrics_motif = metrics_motif.loc[list_tf_motif, ].copy()
-        df_metrics_motif.columns = df_metrics_motif.columns + '_motif'
-        df_MR = df_MR.merge(df_metrics_motif,
-                            how='left',
-                            left_on='motif',
-                            right_index=True)
+        print("Adding motif metrics ...")
+        assert isinstance(
+            metrics_motif, pd.DataFrame
+        ), "`metrics_motif` must be pd.DataFrame"
+        df_metrics_motif = metrics_motif.loc[list_tf_motif,].copy()
+        df_metrics_motif.columns = df_metrics_motif.columns + "_motif"
+        df_MR = df_MR.merge(
how="left", left_on="motif", right_index=True + ) if metrics_gene is not None: - print('Adding gene metrics ...') - assert isinstance(metrics_gene, pd.DataFrame), \ - "`metrics_gene` must be pd.DataFrame" - df_metrics_gene = metrics_gene.loc[list_tf_gene, ].copy() + print("Adding gene metrics ...") + assert isinstance( + metrics_gene, pd.DataFrame + ), "`metrics_gene` must be pd.DataFrame" + df_metrics_gene = metrics_gene.loc[list_tf_gene,].copy() df_metrics_gene.index = list_tf_motif # avoid duplicate genes - df_metrics_gene.columns = df_metrics_gene.columns + '_gene' - df_MR = df_MR.merge(df_metrics_gene, - how='left', - left_on='motif', - right_index=True) - print('Computing distances between TF motifs and genes ...') - dist_MG = distance.cdist(adata_all[df_MR['motif'], ].X, - adata_all[genes, ].X, - metric=metric) - dist_MG = pd.DataFrame(dist_MG, - index=df_MR['motif'].tolist(), - columns=genes) - df_MR.insert(2, 'rank', -1) - df_MR.insert(3, 'dist', -1) + df_metrics_gene.columns = df_metrics_gene.columns + "_gene" + df_MR = df_MR.merge( + df_metrics_gene, how="left", left_on="motif", right_index=True + ) + print("Computing distances between TF motifs and genes ...") + dist_MG = distance.cdist( + adata_all[df_MR["motif"],].X, adata_all[genes,].X, metric=metric + ) + dist_MG = pd.DataFrame(dist_MG, index=df_MR["motif"].tolist(), columns=genes) + df_MR.insert(2, "rank", -1) + df_MR.insert(3, "dist", -1) for i in np.arange(df_MR.shape[0]): - x_motif = df_MR['motif'].iloc[i] - x_gene = df_MR['gene'].iloc[i] - df_MR.loc[i, 'rank'] = dist_MG.loc[x_motif, ].rank()[x_gene] - df_MR.loc[i, 'dist'] = dist_MG.loc[x_motif, x_gene] + x_motif = df_MR["motif"].iloc[i] + x_gene = df_MR["gene"].iloc[i] + df_MR.loc[i, "rank"] = dist_MG.loc[x_motif,].rank()[x_gene] + df_MR.loc[i, "dist"] = dist_MG.loc[x_motif, x_gene] if metrics_gene is not None: - print('filtering master regulators based on gene metrics:') + print("filtering master regulators based on gene metrics:") if cutoff_gene_entropy is not None: - print('entropy') - df_MR = df_MR[df_MR['entropy_gene'] > cutoff_gene_entropy] + print("entropy") + df_MR = df_MR[df_MR["entropy_gene"] > cutoff_gene_entropy] if cutoff_gene_gini is not None: - print('Gini index') - df_MR = df_MR[df_MR['gini_gene'] > cutoff_gene_gini] + print("Gini index") + df_MR = df_MR[df_MR["gini_gene"] > cutoff_gene_gini] if cutoff_gene_max is not None: - print('max') - df_MR = df_MR[df_MR['max_gene'] > cutoff_gene_max] + print("max") + df_MR = df_MR[df_MR["max_gene"] > cutoff_gene_max] if cutoff_gene_std is not None: - print('standard deviation') - df_MR = df_MR[df_MR['std_gene'] > cutoff_gene_std] + print("standard deviation") + df_MR = df_MR[df_MR["std_gene"] > cutoff_gene_std] if metrics_motif is not None: - print('filtering master regulators based on motif metrics:') + print("filtering master regulators based on motif metrics:") if cutoff_motif_entropy is not None: - print('entropy') - df_MR = df_MR[df_MR['entropy_motif'] > cutoff_motif_entropy] + print("entropy") + df_MR = df_MR[df_MR["entropy_motif"] > cutoff_motif_entropy] if cutoff_motif_gini is not None: - print('Gini index') - df_MR = df_MR[df_MR['gini_motif'] > cutoff_motif_gini] + print("Gini index") + df_MR = df_MR[df_MR["gini_motif"] > cutoff_motif_gini] if cutoff_motif_max is not None: - print('max') - df_MR = df_MR[df_MR['max_motif'] > cutoff_motif_max] + print("max") + df_MR = df_MR[df_MR["max_motif"] > cutoff_motif_max] if cutoff_motif_std is not None: - print('standard deviation') - df_MR = 
-            df_MR = df_MR[df_MR['std_motif'] > cutoff_motif_std]
-    df_MR = df_MR.sort_values(by='rank', ignore_index=True)
+            print("standard deviation")
+            df_MR = df_MR[df_MR["std_motif"] > cutoff_motif_std]
+    df_MR = df_MR.sort_values(by="rank", ignore_index=True)
     return df_MR
 
 
-def find_target_genes(adata_all,
-                      adata_PM,
-                      list_tf_motif=None,
-                      list_tf_gene=None,
-                      adata_CP=None,
-                      metric='euclidean',
-                      anno_filter='entity_anno',
-                      filter_peak='peak',
-                      filter_gene='gene',
-                      n_genes=200,
-                      cutoff_gene=None,
-                      cutoff_peak=1000,
-                      use_precomputed=True,
-                      ):
+def find_target_genes(
+    adata_all,
+    adata_PM,
+    list_tf_motif=None,
+    list_tf_gene=None,
+    adata_CP=None,
+    metric="euclidean",
+    anno_filter="entity_anno",
+    filter_peak="peak",
+    filter_gene="gene",
+    n_genes=200,
+    cutoff_gene=None,
+    cutoff_peak=1000,
+    use_precomputed=True,
+):
     """For a given TF, infer its target genes
 
     Parameters
@@ -752,168 +756,169 @@ def find_target_genes(adata_all,
     tf_targets: `dict`, (`adata.uns['tf_targets']`)
         Distances calculated between genes, peaks, and motifs
     """
-    if sum(list(map(lambda x: x is None,
-                    [list_tf_motif, list_tf_gene]))) > 0:
+    if sum(list(map(lambda x: x is None, [list_tf_motif, list_tf_gene]))) > 0:
         return "Please specify both `list_tf_motif` and `list_tf_gene`"
-    assert isinstance(list_tf_motif, list), \
-        "`list_tf_motif` must be list"
-    assert isinstance(list_tf_gene, list), \
-        "`list_tf_gene` must be list"
-    assert len(list_tf_motif) == len(list_tf_gene), \
-        "`list_tf_motif` and `list_tf_gene` must have the same length"
+    assert isinstance(list_tf_motif, list), "`list_tf_motif` must be list"
+    assert isinstance(list_tf_gene, list), "`list_tf_gene` must be list"
+    assert len(list_tf_motif) == len(
+        list_tf_gene
+    ), "`list_tf_motif` and `list_tf_gene` must have the same length"
 
     def isin(a, b):
         return np.array([item in b for item in a])
 
-    print('Preprocessing ...')
-    if use_precomputed and 'tf_targets' in adata_all.uns_keys():
-        print('importing precomputed variables ...')
-        genes = adata_all.uns['tf_targets']['genes']
-        peaks = adata_all.uns['tf_targets']['peaks']
-        peaks_in_genes = adata_all.uns['tf_targets']['peaks_in_genes']
-        dist_PG = adata_all.uns['tf_targets']['dist_PG']
-        overlap_PG = adata_all.uns['tf_targets']['overlap']
+    print("Preprocessing ...")
+    if use_precomputed and "tf_targets" in adata_all.uns_keys():
+        print("importing precomputed variables ...")
+        genes = adata_all.uns["tf_targets"]["genes"]
+        peaks = adata_all.uns["tf_targets"]["peaks"]
+        peaks_in_genes = adata_all.uns["tf_targets"]["peaks_in_genes"]
+        dist_PG = adata_all.uns["tf_targets"]["dist_PG"]
+        overlap_PG = adata_all.uns["tf_targets"]["overlap"]
     else:
-        assert (adata_CP is not None), \
-            '`adata_CP` needs to be specified '\
-            'when no precomputed variable is stored'
-        if 'gene_scores' not in adata_CP.uns_keys():
+        assert adata_CP is not None, (
+            "`adata_CP` needs to be specified " "when no precomputed variable is stored"
+        )
+        if "gene_scores" not in adata_CP.uns_keys():
            print('Please run "si.tl.gene_scores(adata_CP)" first.')
         else:
-            overlap_PG = adata_CP.uns['gene_scores']['overlap'].copy()
-            overlap_PG['peak'] = \
-                overlap_PG[['chr_p', 'start_p', 'end_p']].apply(
-                    lambda row: '_'.join(row.values.astype(str)), axis=1)
-            tuples = list(zip(overlap_PG['symbol_g'], overlap_PG['peak']))
-            multi_indices = pd.MultiIndex.from_tuples(
-                tuples, names=["gene", "peak"])
+            overlap_PG = adata_CP.uns["gene_scores"]["overlap"].copy()
+            overlap_PG["peak"] = overlap_PG[["chr_p", "start_p", "end_p"]].apply(
"_".join(row.values.astype(str)), axis=1 + ) + tuples = list(zip(overlap_PG["symbol_g"], overlap_PG["peak"])) + multi_indices = pd.MultiIndex.from_tuples(tuples, names=["gene", "peak"]) overlap_PG.index = multi_indices - genes = adata_all[adata_all.obs[anno_filter] == filter_gene].\ - obs_names.tolist().copy() - peaks = adata_all[adata_all.obs[anno_filter] == filter_peak].\ - obs_names.tolist().copy() - peaks_in_genes = list(set(overlap_PG['peak'])) - - print(f'#genes: {len(genes)}') - print(f'#peaks: {len(peaks)}') - print(f'#genes-associated peaks: {len(peaks_in_genes)}') - print('computing distances between genes ' - 'and genes-associated peaks ...') + genes = ( + adata_all[adata_all.obs[anno_filter] == filter_gene] + .obs_names.tolist() + .copy() + ) + peaks = ( + adata_all[adata_all.obs[anno_filter] == filter_peak] + .obs_names.tolist() + .copy() + ) + peaks_in_genes = list(set(overlap_PG["peak"])) + + print(f"#genes: {len(genes)}") + print(f"#peaks: {len(peaks)}") + print(f"#genes-associated peaks: {len(peaks_in_genes)}") + print("computing distances between genes " "and genes-associated peaks ...") dist_PG = distance.cdist( - adata_all[peaks_in_genes, ].X, - adata_all[genes, ].X, + adata_all[peaks_in_genes,].X, + adata_all[genes,].X, metric=metric, - ) + ) dist_PG = pd.DataFrame(dist_PG, index=peaks_in_genes, columns=[genes]) print("Saving variables into `.uns['tf_targets']` ...") - adata_all.uns['tf_targets'] = dict() - adata_all.uns['tf_targets']['overlap'] = overlap_PG - adata_all.uns['tf_targets']['dist_PG'] = dist_PG - adata_all.uns['tf_targets']['peaks_in_genes'] = peaks_in_genes - adata_all.uns['tf_targets']['genes'] = genes - adata_all.uns['tf_targets']['peaks'] = peaks - adata_all.uns['tf_targets']['peaks_in_genes'] = peaks_in_genes + adata_all.uns["tf_targets"] = dict() + adata_all.uns["tf_targets"]["overlap"] = overlap_PG + adata_all.uns["tf_targets"]["dist_PG"] = dist_PG + adata_all.uns["tf_targets"]["peaks_in_genes"] = peaks_in_genes + adata_all.uns["tf_targets"]["genes"] = genes + adata_all.uns["tf_targets"]["peaks"] = peaks + adata_all.uns["tf_targets"]["peaks_in_genes"] = peaks_in_genes dict_tf_targets = dict() for tf_motif, tf_gene in zip(list_tf_motif, list_tf_gene): - - print(f'searching for target genes of {tf_motif}') + print(f"searching for target genes of {tf_motif}") motif_peaks = adata_PM.obs_names[adata_PM[:, tf_motif].X.nonzero()[0]] motif_genes = list( - set(overlap_PG[isin(overlap_PG['peak'], motif_peaks)]['symbol_g']) - .intersection(genes)) + set( + overlap_PG[isin(overlap_PG["peak"], motif_peaks)]["symbol_g"] + ).intersection(genes) + ) # rank of the distances from genes to tf_motif - dist_GM_motif = distance.cdist(adata_all[genes, ].X, - adata_all[tf_motif, ].X, - metric=metric) - dist_GM_motif = pd.DataFrame(dist_GM_motif, - index=genes, - columns=[tf_motif]) + dist_GM_motif = distance.cdist( + adata_all[genes,].X, adata_all[tf_motif,].X, metric=metric + ) + dist_GM_motif = pd.DataFrame(dist_GM_motif, index=genes, columns=[tf_motif]) rank_GM_motif = dist_GM_motif.rank(axis=0) # rank of the distances from genes to tf_gene - dist_GG_motif = distance.cdist(adata_all[genes, ].X, - adata_all[tf_gene, ].X, - metric=metric) - dist_GG_motif = pd.DataFrame(dist_GG_motif, - index=genes, - columns=[tf_gene]) + dist_GG_motif = distance.cdist( + adata_all[genes,].X, adata_all[tf_gene,].X, metric=metric + ) + dist_GG_motif = pd.DataFrame(dist_GG_motif, index=genes, columns=[tf_gene]) rank_GG_motif = dist_GG_motif.rank(axis=0) # rank of the distances from peaks to 
         dist_PM_motif = distance.cdist(
-            adata_all[peaks_in_genes, ].X,
-            adata_all[tf_motif, ].X,
-            metric=metric)
-        dist_PM_motif = pd.DataFrame(dist_PM_motif,
-                                     index=peaks_in_genes,
-                                     columns=[tf_motif])
+            adata_all[peaks_in_genes,].X, adata_all[tf_motif,].X, metric=metric
+        )
+        dist_PM_motif = pd.DataFrame(
+            dist_PM_motif, index=peaks_in_genes, columns=[tf_motif]
+        )
         rank_PM_motif = dist_PM_motif.rank(axis=0)
 
         # rank of the distances from peaks to candidate genes
-        cand_genes = \
-            dist_GG_motif[tf_gene].nsmallest(n_genes).index.tolist()\
-            + dist_GM_motif[tf_motif].nsmallest(n_genes).index.tolist()
-        print(f'#candinate genes is {len(cand_genes)}')
-        print('removing duplicate genes ...')
-        print('removing genes that do not contain TF motif ...')
+        cand_genes = (
+            dist_GG_motif[tf_gene].nsmallest(n_genes).index.tolist()
+            + dist_GM_motif[tf_motif].nsmallest(n_genes).index.tolist()
+        )
+        print(f"#candidate genes is {len(cand_genes)}")
+        print("removing duplicate genes ...")
+        print("removing genes that do not contain TF motif ...")
         cand_genes = list(set(cand_genes).intersection(set(motif_genes)))
-        print(f'#candinate genes is {len(cand_genes)}')
+        print(f"#candidate genes is {len(cand_genes)}")
 
         dist_PG_motif = distance.cdist(
-            adata_all[peaks_in_genes, ].X,
-            adata_all[cand_genes, ].X,
-            metric=metric
-            )
-        dist_PG_motif = pd.DataFrame(dist_PG_motif,
-                                     index=peaks_in_genes,
-                                     columns=cand_genes)
+            adata_all[peaks_in_genes,].X, adata_all[cand_genes,].X, metric=metric
+        )
+        dist_PG_motif = pd.DataFrame(
+            dist_PG_motif, index=peaks_in_genes, columns=cand_genes
+        )
         rank_PG_motif = dist_PG_motif.rank(axis=0)
 
         df_tf_targets = pd.DataFrame(index=cand_genes)
-        df_tf_targets['average_rank'] = -1
-        df_tf_targets['has_motif'] = 'no'
-        df_tf_targets['rank_gene_to_TFmotif'] = -1
-        df_tf_targets['rank_gene_to_TFgene'] = -1
-        df_tf_targets['rank_peak_to_TFmotif'] = -1
-        df_tf_targets['rank_peak2_to_TFmotif'] = -1
-        df_tf_targets['rank_peak_to_gene'] = -1
-        df_tf_targets['rank_peak2_to_gene'] = -1
+        df_tf_targets["average_rank"] = -1
+        df_tf_targets["has_motif"] = "no"
+        df_tf_targets["rank_gene_to_TFmotif"] = -1
+        df_tf_targets["rank_gene_to_TFgene"] = -1
+        df_tf_targets["rank_peak_to_TFmotif"] = -1
+        df_tf_targets["rank_peak2_to_TFmotif"] = -1
+        df_tf_targets["rank_peak_to_gene"] = -1
+        df_tf_targets["rank_peak2_to_gene"] = -1
         for i, g in enumerate(cand_genes):
-            g_peaks = list(set(overlap_PG.loc[[g]]['peak']))
+            g_peaks = list(set(overlap_PG.loc[[g]]["peak"]))
             g_motif_peaks = list(set(g_peaks).intersection(motif_peaks))
             if len(g_motif_peaks) > 0:
-                df_tf_targets.loc[g, 'has_motif'] = 'yes'
-                df_tf_targets.loc[g, 'rank_gene_to_TFmotif'] = \
-                    rank_GM_motif[tf_motif][g]
-                df_tf_targets.loc[g, 'rank_gene_to_TFgene'] = \
-                    rank_GG_motif[tf_gene][g]
-                df_tf_targets.loc[g, 'rank_peak_to_TFmotif'] = \
-                    rank_PM_motif.loc[g_peaks, tf_motif].min()
-                df_tf_targets.loc[g, 'rank_peak2_to_TFmotif'] = \
-                    rank_PM_motif.loc[g_motif_peaks, tf_motif].min()
-                df_tf_targets.loc[g, 'rank_peak_to_gene'] = \
-                    rank_PG_motif.loc[g_peaks, g].min()
-                df_tf_targets.loc[g, 'rank_peak2_to_gene'] = \
-                    rank_PG_motif.loc[g_peaks, g].min()
-            if i % int(len(cand_genes)/5) == 0:
-                print(f'completed: {i/len(cand_genes):.1%}')
-        df_tf_targets['average_rank'] = \
-            df_tf_targets[['rank_gene_to_TFmotif',
-                           'rank_gene_to_TFgene']].mean(axis=1)
+                df_tf_targets.loc[g, "has_motif"] = "yes"
+                df_tf_targets.loc[g, "rank_gene_to_TFmotif"] = rank_GM_motif[tf_motif][
+                    g
+                ]
+                df_tf_targets.loc[g, "rank_gene_to_TFgene"] = rank_GG_motif[tf_gene][g]
+                df_tf_targets.loc[g, "rank_peak_to_TFmotif"] = rank_PM_motif.loc[
+                    g_peaks, tf_motif
+                ].min()
+                df_tf_targets.loc[g, "rank_peak2_to_TFmotif"] = rank_PM_motif.loc[
+                    g_motif_peaks, tf_motif
+                ].min()
+                df_tf_targets.loc[g, "rank_peak_to_gene"] = rank_PG_motif.loc[
+                    g_peaks, g
+                ].min()
+                # restrict to motif-containing peaks, mirroring rank_peak2_to_TFmotif
+                df_tf_targets.loc[g, "rank_peak2_to_gene"] = rank_PG_motif.loc[
+                    g_motif_peaks, g
+                ].min()
+            if i % int(len(cand_genes) / 5) == 0:
+                print(f"completed: {i/len(cand_genes):.1%}")
+        df_tf_targets["average_rank"] = df_tf_targets[
+            ["rank_gene_to_TFmotif", "rank_gene_to_TFgene"]
+        ].mean(axis=1)
         if cutoff_peak is not None:
-            print('Pruning candidate genes based on nearby peaks ...')
+            print("Pruning candidate genes based on nearby peaks ...")
             df_tf_targets = df_tf_targets[
-                (df_tf_targets[[
-                    'rank_peak_to_TFmotif',
-                    'rank_peak_to_gene']]
-                 < cutoff_peak).sum(axis=1) > 0]
+                (
+                    df_tf_targets[["rank_peak_to_TFmotif", "rank_peak_to_gene"]]
+                    < cutoff_peak
+                ).sum(axis=1)
+                > 0
+            ]
         if cutoff_gene is not None:
-            print('Pruning candidate genes based on average rank ...')
-            df_tf_targets = df_tf_targets[
-                df_tf_targets['average_rank'] < cutoff_gene]
-        dict_tf_targets[tf_motif] = \
-            df_tf_targets.sort_values(by='average_rank').copy()
+            print("Pruning candidate genes based on average rank ...")
+            df_tf_targets = df_tf_targets[df_tf_targets["average_rank"] < cutoff_gene]
+        dict_tf_targets[tf_motif] = df_tf_targets.sort_values(by="average_rank").copy()
 
     return dict_tf_targets
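`find_target_genes` above scores each candidate gene by the mean of its two gene-level ranks (gene to TF motif, gene to TF gene) and prunes candidates whose peak-level ranks never clear `cutoff_peak`. A minimal sketch of that aggregation step on a plain DataFrame (the function name is a hypothetical stand-in; the column names match those computed above):

import pandas as pd

def aggregate_target_ranks(df_tf_targets, cutoff_peak=1000, cutoff_gene=None):
    # df_tf_targets: one row per candidate gene with the rank columns used
    # above; returns the pruned table sorted by average rank.
    df = df_tf_targets.copy()
    df["average_rank"] = df[["rank_gene_to_TFmotif", "rank_gene_to_TFgene"]].mean(axis=1)
    if cutoff_peak is not None:
        keep = (
            df[["rank_peak_to_TFmotif", "rank_peak_to_gene"]] < cutoff_peak
        ).sum(axis=1) > 0
        df = df[keep]
    if cutoff_gene is not None:
        df = df[df["average_rank"] < cutoff_gene]
    return df.sort_values(by="average_rank")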