-
Notifications
You must be signed in to change notification settings - Fork 1
/
Copy pathcreate_subset.py
executable file
·57 lines (38 loc) · 1.33 KB
/
create_subset.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
"""Creates a subset of the data set by only including patients that have a particular value of a nominal feature"""
from sys import argv
from pickle import load
from utils.utils import SUBSET_PATH
def main():
    """Split a CSV data set into one subset file per unique value of a nominal feature.

    Command-line arguments (read from ``argv``):
        argv[1]: path to the original comma-separated data file. Its first
            line is treated as the header and the first column of every
            following line as the patient ID.
        argv[2]: name of the feature map; together with argv[3] it selects
            the pickle file ``data/feat-maps/<cohort>/<feat_map>.p``.
        argv[3]: cohort name.

    The pickle is loaded as a dict and indexed by patient ID to obtain that
    patient's feature value. For every unique value in the map, one file is
    written to ``SUBSET_PATH.format(value)`` (string values are lower-cased
    first) containing the header plus the rows of the patients with that
    value. A value with no matching rows still yields a header-only file.

    Raises:
        IndexError: if fewer than three command-line arguments are given.
        KeyError: if a patient ID in the data file is missing from the
            feature map.
    """
    original_data_file: str = argv[1]
    feat_name: str = argv[2]
    cohort: str = argv[3]

    feat_map_path: str = 'data/feat-maps/{}/{}.p'.format(cohort, feat_name)

    # NOTE: unpickling can execute arbitrary code; only load feature maps
    # that come from a trusted source.
    with open(feat_map_path, 'rb') as f:
        feat_map: dict = load(f)

    unique_values: list = sorted(set(feat_map.values()))
    print('The Feature\'s Unique Values:', *unique_values)

    # One bucket per value, created up front so that values without any
    # matching row still produce an output file. Keying by value replaces a
    # linear list.index() search for every data row.
    subsets: dict = {val: [] for val in unique_values}

    with open(original_data_file, 'r') as f:
        # An empty data file yields an empty header instead of StopIteration.
        headers: str = next(f, '')

        for line in f:
            stripped: str = line.strip()
            if not stripped:
                # Blank lines (e.g. a trailing one) carry no patient ID.
                continue

            ptid: str = stripped.split(',')[0]
            subsets[feat_map[ptid]].append(line)

    # Iterate in sorted order so files are written in a deterministic order.
    # NOTE: values that differ only by case map to the same path after
    # lower-casing, so the later one overwrites the earlier one.
    for val in unique_values:
        label = val.lower() if isinstance(val, str) else val
        subset_path: str = SUBSET_PATH.format(label)
        write_file(subset_path=subset_path, subset=subsets[val], headers=headers)
def write_file(subset_path: str, subset: list, headers: str):
    """Write the header followed by every selected row to ``subset_path``.

    The file is opened in write mode, so any existing content is replaced.
    Rows are written exactly as given; no separators are added.
    """
    with open(subset_path, 'w') as out:
        out.write(headers)
        out.writelines(subset)
# Run the subsetting only when this file is executed as a script, not when
# it is imported as a module.
if __name__ == '__main__':
    main()