-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathrun_me.py
231 lines (176 loc) · 9.43 KB
/
run_me.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
'''
Genetic variant scoring system: the evaluation rules are implemented as an additive score.
Input: TSV file of variants
Output: the entries with the highest and second-highest total scores (i.e. candidate pathogenic and likely pathogenic disease-causing variants), written to TSV files
Author: Hansi Thewarapperuma
Date: 25/12/2023
'''
import MAF
import Clin_sig
import Cons_score
import Func_pred
import Gen_context
import file_process
import Quality
# *********************** EVALUATION RULES *************************************
# Every evaluation label produced by the evaluator modules is mapped to a score
# contribution and the contributions are summed per variant:
#    1 -> the label supports pathogenicity
#    0 -> the label is neutral / carries no evidence about pathogenicity
#   -1 -> the label contradicts pathogenicity
# 'benign' is the only label weighted -2, i.e. it counts twice as strongly
# against pathogenicity as the other contradicting labels.
score_dict = {
    'considerable_impact': 1,
    'deleterious': 1,
    'rare': 1,
    'conserved': 1,
    'pathogenic': 1,
    'quality': 1,
    'vus': 0,
    'conflicting_interpretations_of_pathogenicity': 0,
    'no_considerable_impact': -1,
    'not_deleterious': -1,
    'common': -1,
    'not_conserved': -1,
    'benign': -2,
    'less_quality': -1,
    'unresolved_clinical_significance': 0,
    'N/A': 0,
    'ambiguous_conservation': 0,
    'ambiguous_deleteriousness': 0,
}


def calculate_score(evaluations):
    """Return the total score of one variant.

    Args:
        evaluations: iterable of evaluation labels; each must be a key of
            ``score_dict``.

    Returns:
        The sum of the score contributions of all labels (0 for no labels).

    Raises:
        KeyError: if a label is not defined in ``score_dict``.
    """
    return sum(score_dict[label] for label in evaluations)


def merge_evaluations(result_lists):
    """Merge several evaluator outputs into one dict keyed by variant index.

    Args:
        result_lists: iterable of evaluator outputs. Each output is either
            ``None`` (treated as "no results") or an iterable of tuples whose
            first element is the variant index and whose remaining elements
            are evaluation labels.

    Returns:
        dict mapping each index to the list of all labels collected for it,
        in the order the outputs were supplied.
    """
    merged = {}
    for result_list in result_lists:
        if result_list is None:
            continue
        for index, *evaluation in result_list:
            merged.setdefault(index, []).extend(evaluation)
    return merged


def select_top_two_tiers(results):
    """Split the merged results into the highest and second-highest score tiers.

    Args:
        results: dict mapping variant index to its list of evaluation labels.

    Returns:
        A ``(highest, second_highest)`` tuple of dicts (index -> labels).
        ``second_highest`` is empty when fewer than two distinct total scores
        exist, so a variant is never reported in both tiers.
    """
    # Score every entry exactly once instead of re-scoring it in each pass.
    scores = {index: calculate_score(labels) for index, labels in results.items()}
    distinct_scores = sorted(set(scores.values()), reverse=True)

    def tier(rank):
        if rank >= len(distinct_scores):
            return {}
        wanted = distinct_scores[rank]
        return {index: labels for index, labels in results.items()
                if scores[index] == wanted}

    return tier(0), tier(1)


def write_tsv_file(output_file, indexes, variants_df):
    """Write the selected rows of ``variants_df`` to a tab-separated file.

    Args:
        output_file: path of the TSV file to create.
        indexes: row positions to export; positions outside
            ``[0, len(variants_df))`` are ignored.
        variants_df: DataFrame holding the input variants.

    Returns:
        True if the file was written, False if there was nothing valid to
        write or the write failed (the reason is printed).
    """
    # NOTE(review): indexes are used as positions (.iloc). If the evaluator
    # modules return DataFrame index *labels* and the gene filter does not
    # reset the index, the wrong rows are exported - confirm against
    # file_process and the evaluator modules.
    try:
        valid_indexes = [idx for idx in indexes if 0 <= idx < len(variants_df)]
        if not valid_indexes:
            print("No valid indexes to write.")
            return False
        # index=False drops the row index from the output, so no reset is needed.
        variants_df.iloc[valid_indexes].to_csv(output_file, sep='\t', index=False)
    except (OSError, TypeError, ValueError, IndexError) as e:
        print(f'Error saving TSV file: {e}')
        return False
    print(f'TSV file saved to {output_file}')
    return True


def main():
    """Prompt for a TSV file, score its variants and export the top two tiers."""
    # obtain the user input TSV file
    user_tsv = input('Enter your TSV file path: ')
    input_variants_df = file_process.tsv_to_df_filter_genes(user_tsv)

    # Each evaluator is expected to return a list of tuples containing the
    # index and the evaluation (or None when it has nothing to report).
    evaluators = (
        Func_pred.in_silico_functional_predictions,   # in silico functional predictions
        Cons_score.conservation_scores,               # conservation score
        Clin_sig.evaluate_clinical_significance,      # clinical significance
        MAF.evaluate_minor_allele_freq,               # minor allele frequency
        Gen_context.evaluate_genomic_context,         # genomic context
        Quality.evaluate_quality,                     # quality
    )
    evaluator_outputs = [evaluate(input_variants_df) for evaluate in evaluators]

    # Merge results using the common index
    results_dict = merge_evaluations(evaluator_outputs)

    # Highest score tier and second-highest score tier
    highest_tier, second_highest_tier = select_top_two_tiers(results_dict)

    # Print the counts
    print(f'Count of pathogenic variants detected: {len(highest_tier)}')
    print(f'Count of pathogenic or likely-pathogenic variants detected: {len(second_highest_tier)}')

    pathogenic_indexes = list(highest_tier)
    pathogenic_likely_pathogenic_indexes = list(second_highest_tier)

    # Specify the output file paths
    pathogenic_output_file = 'pathogenic_variants.tsv'
    pathogenic_likely_pathogenic_output_file = 'pathogenic_likely_pathogenic_variants.tsv'

    # Write the output TSV files; confirm only when the write really succeeded
    if pathogenic_indexes and write_tsv_file(
            pathogenic_output_file, pathogenic_indexes, input_variants_df):
        print(f'Pathogenic variants saved to {pathogenic_output_file}')

    if pathogenic_likely_pathogenic_indexes and write_tsv_file(
            pathogenic_likely_pathogenic_output_file,
            pathogenic_likely_pathogenic_indexes,
            input_variants_df):
        print(f'Pathogenic or likely-pathogenic variants saved to {pathogenic_likely_pathogenic_output_file}')


if __name__ == '__main__':
    main()