remove algorithm specific, cell type specific lines

broadinstitute · Jan 17, 2025 · 82d95cc · 82d95cc
1 parent c4f6b91
commit 82d95cc
Show file tree

Hide file tree

Showing 4 changed files with 57 additions and 55 deletions.
diff --git a/notebooks/Segmentation_QC.ipynb b/notebooks/Segmentation_QC.ipynb
diff --git a/src/celldega/pre/boundary_tile.py b/src/celldega/pre/boundary_tile.py
@@ -59,7 +59,7 @@ def filter_and_save_fine_boundary(coarse_tile, fine_i, fine_j, fine_tile_x_min,
         filename = f"{path_output}/cell_tile_{fine_i}_{fine_j}.parquet"
         fine_tile_cells.to_parquet(filename)
 
-def process_fine_boundaries(coarse_tile, i, j, coarse_tile_x_min, coarse_tile_x_max, coarse_tile_y_min, coarse_tile_y_max, tile_size, path_output, x_min, y_min, n_fine_tiles_x, n_fine_tiles_y):
+def process_fine_boundaries(coarse_tile, i, j, coarse_tile_x_min, coarse_tile_x_max, coarse_tile_y_min, coarse_tile_y_max, tile_size, path_output, x_min, y_min, n_fine_tiles_x, n_fine_tiles_y, max_workers):
     with concurrent.futures.ThreadPoolExecutor(max_workers=max_workers) as executor:
         futures = []
         for fine_i in range(n_fine_tiles_x):
@@ -111,8 +111,6 @@ def get_cell_polygons(technology, path_cell_boundaries, transformation_matrix, p
         grouped["geometry"] = grouped.apply(lambda row: Polygon(zip(row["vertex_x"], row["vertex_y"])), axis=1)
         cells_orig = gpd.GeoDataFrame(grouped, geometry="geometry")[["geometry"]]
 
-        transformation_matrix = pd.read_csv(transformation_matrix).values[:3,:3]
-
     elif technology == "custom":
         cells_orig = gpd.read_parquet(path_cell_boundaries)
 
@@ -175,15 +173,15 @@ def make_cell_boundary_tiles(
 
     transformation_matrix = pd.read_csv(path_transformation_matrix, header=None, sep=" ").values
 
-    gdf_cells = get_cell_polygons(technology, path_output, path_cell_boundaries, transformation_matrix, image_scale, path_meta_cell_micron)
-
-    gdf_cells["center_x"] = gdf_cells.geometry.centroid.x
-    gdf_cells["center_y"] = gdf_cells.geometry.centroid.y
-
     # Ensure the output directory exists
     if not os.path.exists(path_output):
         os.makedirs(path_output)
 
+    gdf_cells = get_cell_polygons(technology, path_cell_boundaries, transformation_matrix, path_output, image_scale, path_meta_cell_micron)
+
+    gdf_cells["center_x"] = gdf_cells.geometry.centroid.x
+    gdf_cells["center_y"] = gdf_cells.geometry.centroid.y
+
     # Calculate tile bounds and fine/coarse tiles
     x_min, x_max = tile_bounds["x_min"], tile_bounds["x_max"]
     y_min, y_max = tile_bounds["y_min"], tile_bounds["y_max"]
@@ -206,4 +204,4 @@ def make_cell_boundary_tiles(
                 (gdf_cells["center_y"] >= coarse_tile_y_min) & (gdf_cells["center_y"] < coarse_tile_y_max)
             ]
             if not coarse_tile.empty:
-                process_fine_boundaries(coarse_tile, i, j, coarse_tile_x_min, coarse_tile_x_max, coarse_tile_y_min, coarse_tile_y_max, tile_size, path_output, x_min, y_min, n_fine_tiles_x, n_fine_tiles_y)
+                process_fine_boundaries(coarse_tile, i, j, coarse_tile_x_min, coarse_tile_x_max, coarse_tile_y_min, coarse_tile_y_max, tile_size, path_output, x_min, y_min, n_fine_tiles_x, n_fine_tiles_y, max_workers)
diff --git a/src/celldega/pre/trx_tile.py b/src/celldega/pre/trx_tile.py
@@ -71,8 +71,7 @@ def filter_and_save_fine_tile(coarse_tile, coarse_i, coarse_j, fine_i, fine_j, f
 
 def transform_transcript_coordinates(technology, path_trx, chunk_size, path_transformation_matrix, image_scale=1):
 
-    #transformation_matrix = np.loadtxt(path_transformation_matrix)
-    transformation_matrix = pd.read_csv(path_transformation_matrix).values[:3,:3]
+    transformation_matrix = np.loadtxt(path_transformation_matrix)
 
     # Load the transcript data based on the technology using Polars
     if technology == "MERSCOPE":

diff --git a/src/celldega/qc/__init__.py b/src/celldega/qc/__init__.py
@@ -85,46 +85,42 @@ def qc_segmentation(transcript_metadata_file, transcript_data_file, cell_polygon
 
     print("segmentation metrics calculation completed")
 
-def mixed_expression_calc(cell_feature_matrix_xenium_directory, cbg_stp_cellpose_default_file, cbg_stp_instanseg_file, cbg_stp_cellpose2_file, t_cell_specific_genes, b_cell_specific_genes):
+def mixed_expression_calc(default_segmentation_segmentation_name, default_segmentation_cell_feature_matrix_path, algorithm_names, algorithm_specific_cbg_files, cell_type_A_specific_genes, cell_type_B_specific_genes, cell_A_name, cell_B_name):
 
-    cbg_stp_cellpose2 = pd.read_parquet(cbg_stp_cellpose2_file)
-    cbg_stp_cellpose_default = pd.read_parquet(cbg_stp_cellpose_default_file)
-    cbg_stp_instanseg = pd.read_parquet(cbg_stp_instanseg_file)
+    cbg_dict = {}
 
-    cbg_xenium = read_cbg_mtx(cell_feature_matrix_xenium_directory)
+    for cbg_file, algorithm_name in zip(algorithm_specific_cbg_files, algorithm_names):
+        cbg_dict[algorithm_name] = pd.read_parquet(cbg_file)
 
-    cbg_dict = {}
-    cbg_dict['xenium_default'] = cbg_xenium
-    cbg_dict['cellpose_default'] = cbg_stp_cellpose_default
-    cbg_dict['instanseg'] = cbg_stp_instanseg
-    cbg_dict['cellpose2'] = cbg_stp_cellpose2
+    cbg_dict[default_segmentation_segmentation_name] = read_cbg_mtx(default_segmentation_cell_feature_matrix_path)
 
     for algorithm_name, cbg in cbg_dict.items():
-        t_cell_overlap = [gene for gene in t_cell_specific_genes if gene in cbg.columns]
-        b_cell_overlap = [gene for gene in b_cell_specific_genes if gene in cbg.columns]
+
+        A_cell_overlap = [gene for gene in cell_type_A_specific_genes if gene in cbg.columns]
+        B_cell_overlap = [gene for gene in cell_type_B_specific_genes if gene in cbg.columns]
 
-        cells_with_t_genes = cbg[t_cell_overlap].sum(axis=1) > 0
-        cells_with_b_genes = cbg[b_cell_overlap].sum(axis=1) > 0
+        cells_with_A_genes = cbg[A_cell_overlap].sum(axis=1) > 0
+        cells_with_B_genes = cbg[B_cell_overlap].sum(axis=1) > 0
 
-        cells_with_both = cbg[cells_with_t_genes & cells_with_b_genes]
+        cells_with_both = cbg[cells_with_A_genes & cells_with_B_genes]
 
-        t_cell_genes_expressed = cells_with_both[t_cell_overlap].apply(
+        A_cell_genes_expressed = cells_with_both[A_cell_overlap].apply(
             lambda row: {gene: int(row[gene]) for gene in row[row > 0].index}, axis=1
         )
 
-        b_cell_genes_expressed = cells_with_both[b_cell_overlap].apply(
+        B_cell_genes_expressed = cells_with_both[B_cell_overlap].apply(
             lambda row: {gene: int(row[gene]) for gene in row[row > 0].index}, axis=1
         )
 
         results = pd.DataFrame({
-            "T-cell genes and transcripts": t_cell_genes_expressed,
-            "B-cell genes and transcripts": b_cell_genes_expressed
+            f"{cell_A_name} genes and transcripts": A_cell_genes_expressed,
+            f"{cell_B_name} genes and transcripts": B_cell_genes_expressed
         }, index=cells_with_both.index)
 
-        results["Total T-cell transcripts"] = t_cell_genes_expressed.apply(lambda x: sum(x.values()))
-        results["Total B-cell transcripts"] = b_cell_genes_expressed.apply(lambda x: sum(x.values()))
+        results[f"Total {cell_A_name} transcripts"] = A_cell_genes_expressed.apply(lambda x: sum(x.values()))
+        results[f"Total {cell_B_name} transcripts"] = B_cell_genes_expressed.apply(lambda x: sum(x.values()))
 
-        results["Total"] = t_cell_genes_expressed.apply(lambda x: sum(x.values())) + b_cell_genes_expressed.apply(lambda x: sum(x.values()))
+        results["Total"] = A_cell_genes_expressed.apply(lambda x: sum(x.values())) + B_cell_genes_expressed.apply(lambda x: sum(x.values()))
         results['Technology'] = algorithm_name
 
         sns.set(style='white', rc={'figure.dpi': 250, 'axes.facecolor': (0, 0, 0, 0), 'figure.facecolor': (0, 0, 0, 0)})
@@ -140,18 +136,18 @@ def mixed_expression_calc(cell_feature_matrix_xenium_directory, cbg_stp_cellpose
         g.map_dataframe(
             lambda data, **kwargs: sns.histplot(
                 data=data,
-                x="Total T-cell transcripts",
-                y="Total B-cell transcripts",
+                x=f"Total {cell_A_name} transcripts",
+                y=f"Total {cell_B_name} transcripts",
                 bins=15,
                 cbar=True,
                 cmap='coolwarm',
                 vmin=1,
-                vmax=data["Total T-cell transcripts"].max(),
+                vmax=data[f"Total {cell_A_name} transcripts"].max(),
                 **kwargs
             )
         )
 
-        g.set_axis_labels("Total T-cell transcripts", "Total B-cell transcripts")
+        g.set_axis_labels(f"Total {cell_A_name} transcripts", f"Total {cell_B_name} transcripts")
         for ax in g.axes.flat:
             ax.xaxis.set_major_locator(plt.MaxNLocator(integer=True))
             ax.yaxis.set_major_locator(plt.MaxNLocator(integer=True))