-
Notifications
You must be signed in to change notification settings - Fork 4
/
Copy path3.feature-select.py
140 lines (116 loc) · 5.26 KB
/
3.feature-select.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
import os
import sys
import pathlib
import argparse
import warnings
import logging
import traceback
import pandas as pd
from pycytominer import feature_select
sys.path.append("config")
from utils import parse_command_args, process_configuration, get_split_aware_site_info
recipe_path = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
sys.path.append(os.path.join(recipe_path, "scripts"))
from io_utils import read_csvs_with_chunksize
# Configure logging: this step writes its log to a "logs" folder located
# alongside the recipe directory.
logfolder = os.path.join(os.path.dirname(recipe_path), "logs")
# exist_ok=True replaces the isdir()/mkdir() pair, which could raise
# FileExistsError if another process created the folder between the check and
# the creation (e.g. several pipeline steps launched at the same time).
os.makedirs(logfolder, exist_ok=True)
logging.basicConfig(
    filename=os.path.join(logfolder, "3.feature-select.log"), level=logging.INFO,
)
def handle_excepthook(exc_type, exc_value, exc_traceback):
    """Log an uncaught exception and echo it to stdout.

    Installed as ``sys.excepthook`` so that any exception that escapes the
    script is recorded in the log file and also shown on the console.

    Args:
        exc_type: Class of the uncaught exception.
        exc_value: The exception instance.
        exc_traceback: Traceback object attached to the exception.
    """
    logging.error("Uncaught exception", exc_info=(exc_type, exc_value, exc_traceback))
    # format_exception() includes the final "ExcType: message" line; the stack
    # frames alone do not tell the user what actually went wrong. Its lines
    # already end with a newline, so they are joined without a separator.
    traceback_details = "".join(
        traceback.format_exception(exc_type, exc_value, exc_traceback)
    )
    print(f"Uncaught Exception: {traceback_details}")


sys.excepthook = handle_excepthook
# Configure experiment: read the command-line arguments, then build the
# configuration for the "profile--feature_select" step from them.
args = parse_command_args()
logging.info(f"Args used:{args}")

batch_id, options_config_file, experiment_config_file, split_step = (
    args.batch_id,
    args.options_config_file,
    args.experiment_config_file,
    args.split_step,
)

# process_configuration returns the config together with the sites it skipped.
configuration_result = process_configuration(
    batch_id,
    step="profile--feature_select",
    options_config=options_config_file,
    experiment_config=experiment_config_file,
)
config, incomplete_sites, errored_sites = configuration_result

logging.info(f"Config used:{config}")
logging.info(f"Skipped incomplete sites during config processing: {incomplete_sites}")
logging.info(f"Skipped errored sites during config processing: {errored_sites}")
# Extract config arguments
split_info = config["experiment"]["split"][split_step]
feature_select_args = config["options"]["profile"]["feature_select"]
perform = feature_select_args["perform"]

# Stop right away when the config disables this step
if not perform:
    sys.exit("Config file set to perform=False, not performing {}".format(__file__))

# Core options shared across the recipe
core_options = config["options"]["core"]
ignore_files = core_options["ignore_files"]
float_format = core_options["float_format"]
compression = core_options["compression"]

# Input/output locations and file names
directories = config["directories"]
input_spotdir = directories["preprocess"]["spots"]
single_cell_input_dir = directories["profile"]["single_cell"]
files = config["files"]
single_cell_file = files["single_file_only_output_file"]
feature_select_input_dir = directories["profile"]["profiles"]
feature_select_input_files = files["normalize_files"]
feature_select_output_files = files["feature_select_files"]

# Whether single cells were written to one combined file
sc_config = config["options"]["profile"]["single_cell"]
singlecell_from_single_file = sc_config["output_one_single_cell_file_only"]

# Feature selection settings
feature_select_operations = feature_select_args["operations"]
feature_select_levels = feature_select_args["levels"]
# NOTE: despite the "drop" in its name, this holds the "use_samples" option
feature_select_drop_samples = feature_select_args["use_samples"]
feature_select_features = feature_select_args["features"]
feature_select_nacutoff = feature_select_args["na_cutoff"]
feature_select_corr_threshold = feature_select_args["corr_threshold"]
force = feature_select_args["force_overwrite"]
def _add_split_suffix(file_path, data_split_site):
    """Return file_path with "_<data_split_site>" inserted before ".csv.gz"."""
    return pathlib.Path(
        file_path.parents[0],
        file_path.name.replace(".csv.gz", f"_{data_split_site}.csv.gz"),
    )


print("Starting 3.feature-select.")
logging.info("Starting 3.feature-select.")

# Every entry of the spots directory that is not listed in ignore_files is
# treated as a site name.
sites = [x.name for x in input_spotdir.iterdir() if x.name not in ignore_files]
site_info_dict = get_split_aware_site_info(
    config["experiment"], sites, split_info, separator="___"
)

for data_split_site in site_info_dict:
    for data_level in feature_select_levels:
        # Single cell profiles are only handled when they come from one
        # combined single cell file.
        if data_level == "single_cell" and not singlecell_from_single_file:
            skip_message = "Feature select operation is not enabled for site-specific single cell files. Skipping."
            warnings.warn(skip_message)
            logging.warning(skip_message)
            continue

        file_to_feature_select = _add_split_suffix(
            feature_select_input_files[data_level], data_split_site
        )
        output_file = _add_split_suffix(
            feature_select_output_files[data_level], data_split_site
        )

        # Honor the force_overwrite option: it was previously read from the
        # config but ignored, so existing results were always overwritten.
        if output_file.exists() and not force:
            exists_message = f"Feature select output {output_file} already exists. Set force_overwrite to True to overwrite. Skipping."
            print(exists_message)
            logging.warning(exists_message)
            continue

        print(
            f"Now performing feature selection for {data_level}...with operations: {feature_select_operations} for split {data_split_site}"
        )
        logging.info(
            f"Performing feature selection for {data_level} with operations: {feature_select_operations} for split {data_split_site}"
        )

        df = read_csvs_with_chunksize(file_to_feature_select)

        # The selected profiles are handed to pycytominer together with the
        # output path and write options.
        feature_select(
            profiles=df,
            features=feature_select_features,
            samples=feature_select_drop_samples,
            operation=feature_select_operations,
            na_cutoff=feature_select_nacutoff,
            corr_threshold=feature_select_corr_threshold,
            output_file=output_file,
            compression_options=compression,
            float_format=float_format,
        )

print("Finished 3.feature-select.")
logging.info("Finished 3.feature-select.")