-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathHALO_TotalObjectResults_splitter.py
147 lines (118 loc) · 5.24 KB
/
HALO_TotalObjectResults_splitter.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
#!/usr/bin/env python3
"""
Copyright 2024 The Regents of the University of Colorado
This program is free software: you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation, either version 3 of the License, or
(at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License
along with this program. If not, see <https://www.gnu.org/licenses/>.
Author: Christian Rickert <[email protected]>
Group: Human Immune Monitoring Shared Resource (HIMSR)
University of Colorado, Anschutz Medical Campus
Title: HALO_TotalObjectResults_splitter
Summary: Reverse the merging process and split merge data
by sample name and/or MSI coordinates
DOI: https://doi.org/10.5281/zenodo.4741394
URL: https://github.com/christianrickert/CU-HIMSR/
Description:
This script can be used to split merged Total_Object_Results.csv files
from Indica Labs' HALO and re-merge them into separate comma-separated
value files using a specific pattern (regular expression) to consolidate
the exported files.
For performance reasons, there is no validation of the number of columns
or order of column labels - the header and lines are written as-is, so
make sure to only split/re-merge compatible data.
Create an import folder (or have the script create one for you) at the
current location and place the TOTAL_Object_Results.csv into there.
The output will be written into only into an empyt (needs to be empty)
export folder.
"""
# imports
import fnmatch
import io
import os
import re
import sys
# functions
def get_files(path="", pat="*", anti="", recurse=False):
"""Iterate through all files in a directory structure and
return a list of matching files.
Keyword arguments:
path -- the path to a directory containing files (default "")
pat -- string pattern that needs to be part of the file name (default "None")
anti -- string pattern that may not be part of the file name (default "None")
recurse -- boolen that allows the function to work recursively (default "False")
"""
FILES = []
for root, dirs, files in os.walk(path):
for file in files:
file = os.path.join(root, file)
if fnmatch.fnmatch(file, pat) and not fnmatch.fnmatch(file, anti):
FILES.append(file)
if not recurse:
break # from `os.walk()`
return FILES
def get_image_name(line="", pattern=""):
"""Uses a regular expression search to match a pattern against a text.
Keyword arguments:
line -- the line of text to find the pattern in
pattern -- the regular expression used in the search
"""
return re.search(pattern, line).group(1)
def unmerge_data(in_path="", out_path=""):
"""Imports data from a text file and writes out all columns on a per-file basis
using the first column's data for labeling of individual export files."""
with open(in_path, "r", encoding="utf-8-sig") as in_file:
print('\tFOLDER: "' + out_path + '"', flush=True)
if not os.path.exists(out_path):
os.mkdir(out_path)
if os.listdir(out_path):
print("OUTPUT PATH NOT EMPTY. EXITING.")
sys.exit(0)
out_file = io.StringIO("")
header = in_file.readline() # read and call `next()` on iterator
for lines, in_line in enumerate(in_file, start=1):
image_name = get_image_name(in_line, pattern=NAME_PATTERN)
if image_name not in known_images:
out_file.close() # previous file
out_file_path = os.path.abspath(
os.path.join(out_path, image_name + ".csv")
)
out_file = open(out_file_path, "a", encoding="utf-8") # current file
out_file.write(header)
known_images.add(image_name)
out_file.write(in_line)
out_file.close() # last file
return (lines, len(known_images))
# constants & variables
EXPORT_FOLDER = r".\export"
FILE_TARGET = "*Total_Object_Results.csv"
IMPORT_FOLDER = r".\import"
# NAME_PATTERN = re.compile(r"(\d{6}\s[\w#&\s\-_\.+]+)(?=_Scan)") # Akoya Polaris
NAME_PATTERN = re.compile(r"\\([^\\]+?)\.[^\\.]+(?=" ")") # generic
VERSION = "HALO_summaryfile_splitter 1.0 (2024-09-25)"
# main program
print(VERSION)
print(os.linesep)
print("UNMERGING files in folder:")
print("-----------------------")
print('FILE: "' + FILE_TARGET + '"')
FILE_COUNT = 0
if not os.path.exists(EXPORT_FOLDER):
os.mkdir(EXPORT_FOLDER)
if not os.path.exists(IMPORT_FOLDER):
os.mkdir(IMPORT_FOLDER)
known_images = set() # keep across multiple merge files
for index, file in enumerate(get_files(IMPORT_FOLDER, FILE_TARGET)):
print("\tFILE: " + file)
unmerged = unmerge_data(in_path=file, out_path=EXPORT_FOLDER)
print("\tLINES: " + str(unmerged[0]), "MATCHES: " + str(unmerged[1]))
FILE_COUNT += 1
print("FILES: " + str(FILE_COUNT))
print(os.linesep)
# WAIT = input("Press ENTER to exit this program.")