diff --git a/MACS3/IO/Parser.py b/MACS3/IO/Parser.py
index 09fd6f81..b8e3057a 100644
--- a/MACS3/IO/Parser.py
+++ b/MACS3/IO/Parser.py
@@ -1,7 +1,7 @@
 # cython: language_level=3
 # cython: profile=True
 # cython: linetrace=True
-# Time-stamp: <2024-10-07 16:08:43 Tao Liu>
+# Time-stamp: <2024-10-16 00:09:32 Tao Liu>
 
 """Module for all MACS Parser classes for input.
 
 Please note that the parsers are for reading the alignment files ONLY.
@@ -28,7 +28,7 @@
 from MACS3.Utilities.Constants import READ_BUFFER_SIZE
 from MACS3.Signal.FixWidthTrack import FWTrack
-from MACS3.Signal.PairedEndTrack import PETrackI
+from MACS3.Signal.PairedEndTrack import PETrackI, PETrackII
 from MACS3.Utilities.Logger import logging
 
 logger = logging.getLogger(__name__)
@@ -1472,3 +1472,156 @@ def fw_parse_line(self, thisline: bytes) -> tuple:
                                     1)
             else:
                 raise StrandFormatError(thisline, thisfields[1])
+
+
+@cython.cclass
+class FragParser(GenericParser):
+    """Parser for Fragment files containing scATAC-seq information.
+
+    Format (tab-separated):
+
+    chromosome  frag_leftend  frag_rightend  barcode  count
+
+    Note: Only the first five columns are used!
+
+    """
+    n = cython.declare(cython.int, visibility='public')
+    d = cython.declare(cython.float, visibility='public')
+
+    @cython.cfunc
+    def skip_first_commentlines(self):
+        """FragParser needs to skip the first several comment lines.
+        """
+        l_line: cython.int
+        thisline: bytes
+
+        for thisline in self.fhd:
+            l_line = len(thisline)
+            if thisline and (thisline[:5] != b"track") \
+               and (thisline[:7] != b"browser") \
+               and (thisline[0] != 35):  # 35 is b"#"
+                break
+
+        # rewind from SEEK_CUR
+        self.fhd.seek(-l_line, 1)
+        return
+
+    @cython.cfunc
+    def pe_parse_line(self, thisline: bytes):
+        """Parse each line, and return chromosome, left and right
+        positions, barcode and count.
+
+        """
+        thisfields: list
+
+        thisline = thisline.rstrip()
+
+        # still, only tab is supported as the delimiter.
+        thisfields = thisline.split(b'\t')
+        try:
+            return (thisfields[0],
+                    atoi(thisfields[1]),
+                    atoi(thisfields[2]),
+                    thisfields[3],
+                    atoi(thisfields[4]))
+        except IndexError:
+            raise Exception("Less than 5 columns found at this line: %s\n" % thisline)
+
+    @cython.ccall
+    def build_petrack2(self):
+        """Build PETrackII from all lines.
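+
+        A minimal usage sketch, mirroring test/test_Parser.py (the
+        file name is only an illustration):
+
+            parser = FragParser("test/test.fragments.tsv.gz")
+            petrack = parser.build_petrack2()
+            petrack.finalize()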
+
+        """
+        chromosome: bytes
+        left_pos: cython.int
+        right_pos: cython.int
+        barcode: bytes
+        count: cython.uchar
+        i: cython.long = 0      # number of fragments
+        m: cython.long = 0      # sum of fragment lengths
+        tmp: bytes = b""
+
+        petrack = PETrackII(buffer_size=self.buffer_size)
+        add_loc = petrack.add_loc
+
+        while True:
+            # for each block of input
+            tmp += self.fhd.read(READ_BUFFER_SIZE)
+            if not tmp:
+                break
+            lines = tmp.split(b"\n")
+            tmp = lines[-1]
+            for thisline in lines[:-1]:
+                (chromosome, left_pos, right_pos, barcode, count) = self.pe_parse_line(thisline)
+                if left_pos < 0 or not chromosome:
+                    continue
+                assert right_pos > left_pos, "Right position must be larger than left position, check your BED file at line: %s" % thisline
+                m += right_pos - left_pos
+                i += 1
+                if i % 1000000 == 0:
+                    info(" %d fragments parsed" % i)
+                add_loc(chromosome, left_pos, right_pos, barcode, count)
+        # last one: parse the leftover partial line in `tmp`
+        if tmp:
+            (chromosome, left_pos, right_pos, barcode, count) = self.pe_parse_line(tmp)
+            if left_pos >= 0 and chromosome:
+                assert right_pos > left_pos, "Right position must be larger than left position, check your BED file at line: %s" % tmp
+                i += 1
+                m += right_pos - left_pos
+                add_loc(chromosome, left_pos, right_pos, barcode, count)
+
+        self.d = cython.cast(cython.float, m) / i
+        self.n = i
+        assert self.d >= 0, "Something went wrong (mean fragment size was negative)"
+
+        self.close()
+        petrack.set_rlengths({"DUMMYCHROM": 0})
+        return petrack
+
+    @cython.ccall
+    def append_petrack(self, petrack):
+        """Append fragments from all lines to an existing PETrackII
+        object, and return the updated object.
+        """
+        chromosome: bytes
+        left_pos: cython.int
+        right_pos: cython.int
+        barcode: bytes
+        count: cython.uchar
+        i: cython.long = 0      # number of fragments
+        m: cython.long = 0      # sum of fragment lengths
+        tmp: bytes = b""
+
+        add_loc = petrack.add_loc
+        while True:
+            # for each block of input
+            tmp += self.fhd.read(READ_BUFFER_SIZE)
+            if not tmp:
+                break
+            lines = tmp.split(b"\n")
+            tmp = lines[-1]
+            for thisline in lines[:-1]:
+                (chromosome, left_pos, right_pos, barcode, count) = self.pe_parse_line(thisline)
+                if left_pos < 0 or not chromosome:
+                    continue
+                assert right_pos > left_pos, "Right position must be larger than left position, check your BED file at line: %s" % thisline
+                m += right_pos - left_pos
+                i += 1
+                if i % 1000000 == 0:
+                    info(" %d fragments parsed" % i)
+                add_loc(chromosome, left_pos, right_pos, barcode, count)
+        # last one: parse the leftover partial line in `tmp`
+        if tmp:
+            (chromosome, left_pos, right_pos, barcode, count) = self.pe_parse_line(tmp)
+            if left_pos >= 0 and chromosome:
+                assert right_pos > left_pos, "Right position must be larger than left position, check your BED file at line: %s" % tmp
+                i += 1
+                m += right_pos - left_pos
+                add_loc(chromosome, left_pos, right_pos, barcode, count)
+
+        self.d = (self.d * self.n + m) / (self.n + i)
+        self.n += i
+
+        assert self.d >= 0, "Something went wrong (mean fragment size was negative)"
+        self.close()
+        petrack.set_rlengths({"DUMMYCHROM": 0})
+        return petrack
diff --git a/MACS3/IO/PeakIO.py b/MACS3/IO/PeakIO.py
index 9ba1496f..0ad8f36c 100644
--- a/MACS3/IO/PeakIO.py
+++ b/MACS3/IO/PeakIO.py
@@ -1,6 +1,6 @@
 # cython: language_level=3
 # cython: profile=True
-# Time-stamp: <2024-10-11 11:13:11 Tao Liu>
+# Time-stamp: <2024-10-15 11:48:33 Tao Liu>
 
 """Module for PeakIO IO classes.
@@ -141,6 +141,24 @@ def __str__(self):
                 self.end,
                 self.score)
 
+    def __getstate__(self):
+        return (self.chrom,
+                self.start,
+                self.end,
+                self.length,
+                self.summit,
+                self.score,
+                self.pileup,
+                self.pscore,
+                self.fc,
+                self.qscore,
+                self.name)
+
+    def __setstate__(self, state):
+        (self.chrom, self.start, self.end, self.length, self.summit,
+         self.score, self.pileup, self.pscore, self.fc,
+         self.qscore, self.name) = state
+
 
 @cython.cclass
 class PeakIO:
diff --git a/MACS3/Signal/BedGraph.py b/MACS3/Signal/BedGraph.py
index b16881f9..baa46af4 100644
--- a/MACS3/Signal/BedGraph.py
+++ b/MACS3/Signal/BedGraph.py
@@ -1,6 +1,6 @@
 # cython: language_level=3
 # cython: profile=True
-# Time-stamp: <2024-10-14 23:47:21 Tao Liu>
+# Time-stamp: <2024-10-15 16:18:23 Tao Liu>
 
 """Module for BedGraph data class.
 
@@ -259,7 +259,7 @@ def add_chrom_data_PV(self,
         self.__data[chromosome] = [pyarray('i', pv['p']),
                                    pyarray('f', pv['v'])]
         minv = pv['v'].min()
-        maxv = pv['p'].max()
+        maxv = pv['v'].max()
         if maxv > self.maxvalue:
             self.maxvalue = maxv
         if minv < self.minvalue:
@@ -1488,6 +1488,1337 @@ def cutoff_analysis(self,
     return ret
 
 
+@cython.cclass
+class bedGraphTrackII:
+    """Class for bedGraph type data.
+
+    In bedGraph, data are represented as continuous non-overlapping
+    regions across the whole genome. I keep this assumption in all
+    the functions. If the data have overlaps, some functions will
+    definitely give incorrect results.
+
+    1. Continuous: the next region should be after the previous one
+    unless they are on different chromosomes;
+
+    2. Non-overlapping: the next region should never overlap the
+    preceding region.
+
+    The way to memorize bedGraph data is to remember the transition
+    points together with the values of their preceding regions. The
+    last data point may exceed the chromosome end, unless a
+    chromosome dictionary is given. Remember that coordinates in
+    bedGraph and in this class are 0-indexed and right-open.
+
+    Different from bedGraphTrackI, we use a numpy array to store the
+    (end) positions and values.
+
+    """
+    __data: dict
+    maxvalue = cython.declare(cython.float, visibility="public")
+    minvalue = cython.declare(cython.float, visibility="public")
+    baseline_value = cython.declare(cython.float, visibility="public")
+    buffer_size: int
+    __size: dict
+
+    def __init__(self,
+                 baseline_value: cython.float = 0,
+                 buffer_size: cython.int = 100000):
+        """
+        baseline_value is the value to fill in the regions not defined
+        in bedGraph. For example, if the bedGraph is like:
+
+        chr1  100 200  1
+        chr1  250 350  2
+
+        Then the region chr1:200..250 should be filled with baseline_value.
+
+        """
+        self.__data = {}
+        self.__size = {}
+        self.maxvalue = -10000000   # initial maximum value is tiny since I want finalize() to update it
+        self.minvalue = 10000000    # initial minimum value is large since I want finalize() to update it
+        self.baseline_value = baseline_value
+        self.buffer_size = buffer_size
+
+    @cython.ccall
+    def add_loc(self, chromosome: bytes,
+                startpos: cython.int,
+                endpos: cython.int,
+                value: cython.float):
+        """Add a chr-start-end-value block into __data dictionary.
+
+        Note, we don't check if add_loc is called continuously on
+        sorted regions without any gap. So we only suggest calling
+        this function within MACS.
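+
+        For example (a sketch of the merging behavior): calling
+        add_loc(b"chr1", 0, 100, 2.0) and then
+        add_loc(b"chr1", 100, 200, 2.0) leaves a single record
+        (200, 2.0), since an adjacent region with the same value is
+        merged by extending the previous end position.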
+
+        """
+        pre_v: cython.float
+        c: cnp.ndarray
+        i: cython.int
+
+        # basic assumption, end pos should > start pos
+
+        if endpos <= 0:
+            return
+        if startpos < 0:
+            startpos = 0
+
+        if chromosome not in self.__data:
+            i = 0
+            # first element in the chromosome
+            self.__data[chromosome] = np.zeros(shape=self.buffer_size,
+                                               dtype=[('p', 'u4'), ('v', 'f4')])
+            c = self.__data[chromosome]
+            if startpos > 0:
+                # start pos is not 0, then add two blocks, the first
+                # with "baseline_value"; the second with "value"
+                c[0] = (startpos, self.baseline_value)
+                i += 1
+
+            c[i] = (endpos, value)
+            i += 1
+        else:
+            c = self.__data[chromosome]
+            i = self.__size[chromosome]
+            # get the preceding region
+            pre_v = c[i-1][1]   # which is quicker? c[i-1][1] or c["v"][i-1]?
+
+            # if this region is next to the previous one.
+            if pre_v == value:
+                # if value is the same, simply extend it.
+                c[i-1][0] = endpos
+            else:
+                if i % self.buffer_size == 0:
+                    self.__data[chromosome].resize(i+self.buffer_size,
+                                                   refcheck=False)
+                # otherwise, add a new region
+                c[i] = (endpos, value)
+                i += 1
+
+        self.__size[chromosome] = i
+
+    @cython.ccall
+    def add_loc_wo_merge(self, chromosome: bytes,
+                         startpos: cython.int,
+                         endpos: cython.int,
+                         value: cython.float):
+        """Add a chr-start-end-value block into __data dictionary.
+
+        Note, we don't check if add_loc is called continuously on
+        sorted regions without any gap. So we only suggest calling
+        this function within MACS.
+
+        This one won't merge nearby ranges with the same value
+        """
+        c: cnp.ndarray
+        i: cython.int
+
+        # basic assumption, end pos should > start pos
+
+        if endpos <= 0:
+            return
+        if startpos < 0:
+            startpos = 0
+
+        if chromosome not in self.__data:
+            i = 0
+            # first element in the chromosome
+            self.__data[chromosome] = np.zeros(shape=self.buffer_size,
+                                               dtype=[('p', 'u4'), ('v', 'f4')])
+            c = self.__data[chromosome]
+            if startpos > 0:
+                # start pos is not 0, then add two blocks, the first
+                # with "baseline_value"; the second with "value"
+                c[0] = (startpos, self.baseline_value)
+                i += 1
+
+            c[i] = (endpos, value)
+            i += 1
+        else:
+            c = self.__data[chromosome]
+            i = self.__size[chromosome]
+
+            if i % self.buffer_size == 0:
+                self.__data[chromosome].resize(i+self.buffer_size,
+                                               refcheck=False)
+            # add a new region without merging
+            c[i] = (endpos, value)
+            i += 1
+
+        self.__size[chromosome] = i
+
+    @cython.ccall
+    def add_chrom_data(self,
+                       chromosome: bytes,
+                       pv: cnp.ndarray):
+        """Add a pv data to a chromosome. Replace the previous data.
+
+        The pv array is a structured numpy array with 'p' (end
+        position) and 'v' (value) fields; it is stored directly,
+        without any conversion.
+
+        Note: no checks for error, use with caution
+        """
+        self.__data[chromosome] = pv
+        self.__size[chromosome] = len(pv)
+
+        return
+
+    @cython.ccall
+    def destroy(self) -> bool:
+        """ destroy content, free memory.
+        """
+        chrs: set
+        chrom: bytes
+
+        chrs = self.get_chr_names()
+        for chrom in sorted(chrs):
+            if chrom in self.__data:
+                self.__data[chrom].resize(self.buffer_size,
+                                          refcheck=False)
+                self.__data[chrom].resize(0,
+                                          refcheck=False)
+                self.__data[chrom] = None
+                self.__data.pop(chrom)
+                self.__size[chrom] = 0
+        return True
+
+    @cython.ccall
+    def finalize(self):
+        """Resize np arrays.
+
+        Note: once this function is called, please do not add any
+        more data. Remember to call it after all the files are read!
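+
+        Concretely, for each chromosome this trims the
+        over-allocated array to its used size, sorts the records by
+        position, and updates self.maxvalue and self.minvalue from
+        the stored values.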
+
+        """
+        c: bytes
+        chrnames: set
+        maxv: cython.float
+        minv: cython.float
+
+        chrnames = self.get_chr_names()
+
+        for c in chrnames:
+            self.__data[c].resize((self.__size[c]), refcheck=False)
+            self.__data[c].sort(order=['p'])
+
+            minv = self.__data[c]['v'].min()
+            maxv = self.__data[c]['v'].max()
+            if maxv > self.maxvalue:
+                self.maxvalue = maxv
+            if minv < self.minvalue:
+                self.minvalue = minv
+        return
+
+    @cython.ccall
+    def get_data_by_chr(self, chromosome: bytes) -> cnp.ndarray:
+        """Return the data for one chromosome.
+
+        The return value is a structured numpy array with 'p' (end
+        position) and 'v' (value) fields, or None if the chromosome
+        is not stored.
+        """
+        if chromosome in self.__data:
+            return self.__data[chromosome]
+        else:
+            return None
+
+    @cython.ccall
+    def get_chr_names(self) -> set:
+        """Return all the chromosome names stored.
+
+        """
+        return set(sorted(self.__data.keys()))
+
+    @cython.ccall
+    def filter_score(self, cutoff: cython.float = 0) -> bool:
+        """Filter using a score cutoff. Any region scoring at or
+        below the cutoff will be removed from the stored transition
+        points.
+
+        Self will be modified.
+        """
+        # new_pre_pos: cython.int
+        chrom: bytes
+        chrs: set
+        d: cnp.ndarray
+        maxv: cython.float
+        minv: cython.float
+
+        chrs = self.get_chr_names()
+        for chrom in sorted(chrs):
+            d = self.__data[chrom]
+            d = d[d['v'] > cutoff]
+            self.__data[chrom] = d
+            self.__size[chrom] = len(d)
+            minv = d['v'].min()
+            maxv = d['v'].max()
+            if maxv > self.maxvalue:
+                self.maxvalue = maxv
+            if minv < self.minvalue:
+                self.minvalue = minv
+        return True
+
+    @cython.ccall
+    def summary(self) -> tuple:
+        """Calculate the sum, total_length, max, min, mean, and std.
+
+        Return a tuple for (sum, total_length, max, min, mean, std).
+
+        """
+        d: cnp.ndarray
+        n_v: cython.long
+        sum_v: cython.float
+        max_v: cython.float
+        min_v: cython.float
+        mean_v: cython.float
+        variance: cython.float
+        tmp: cython.float
+        std_v: cython.float
+        pre_p: cython.int
+        ln: cython.int
+        i: cython.int
+
+        pre_p = 0
+        n_v = 0
+        sum_v = 0
+        max_v = -100000
+        min_v = 100000
+        for d in self.__data.values():
+            # for each chromosome
+            pre_p = 0
+            for i in range(len(d)):
+                # for each region
+                ln = d[i][0]-pre_p
+                sum_v += d[i][1]*ln
+                n_v += ln
+                pre_p = d[i][0]
+            max_v = max(max(d["v"]), max_v)
+            min_v = min(min(d["v"]), min_v)
+        mean_v = sum_v/n_v
+        variance = 0.0
+        for d in self.__data.values():
+            # reset the previous position for each chromosome
+            pre_p = 0
+            for i in range(len(d)):
+                # for each region
+                tmp = d[i][1]-mean_v
+                ln = d[i][0]-pre_p
+                variance += tmp*tmp*ln
+                pre_p = d[i][0]
+
+        variance /= float(n_v-1)
+        std_v = sqrt(variance)
+        return (sum_v, n_v, max_v, min_v, mean_v, std_v)
+
+    @cython.ccall
+    def call_peaks(self,
+                   cutoff: cython.float = 1.0,
+                   min_length: cython.int = 200,
+                   max_gap: cython.int = 50,
+                   call_summits: bool = False):
+        """This function tries to find regions within which scores
+        are continuously higher than a given cutoff.
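+
+        cutoff: regions scoring at or above this value are kept,
+            default 1.0.
+        min_length: minimum length of a peak, default 200.
+        max_gap: maximum gap allowed between nearby regions above the
+            cutoff within one peak, default 50.
+        call_summits: accepted for API compatibility and currently
+            unused here; the summit reported is the median midpoint
+            among the highest-scoring regions in a peak.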
+
+        """
+        i: cython.int
+        chrom: bytes
+        pos: cnp.ndarray
+        value: cnp.ndarray
+        above_cutoff: cnp.ndarray(dtype="bool", ndim=1)
+        above_cutoff_v: cnp.ndarray
+        above_cutoff_endpos: cnp.ndarray
+        above_cutoff_startpos: cnp.ndarray
+        peak_content: list
+
+        chrs = self.get_chr_names()
+        peaks = PeakIO()        # dictionary to save peaks
+
+        for chrom in sorted(chrs):
+            peak_content = []   # to store points above cutoff
+            pos = self.__data[chrom]['p']
+            value = self.__data[chrom]['v']
+
+            above_cutoff = value >= cutoff
+            # scores where score is above cutoff
+            above_cutoff_v = value[above_cutoff]
+            # end positions of regions where score is above cutoff
+            above_cutoff_endpos = pos[above_cutoff]
+            # start positions of regions where score is above cutoff
+            above_cutoff_startpos = pos[np.roll(above_cutoff, -1)]
+
+            if above_cutoff_v.size == 0:
+                # nothing above cutoff
+                continue
+
+            if above_cutoff[0]:
+                # first element > cutoff, fix the first point as 0.
+                # otherwise it would be the last item in
+                # __data[chrom]['p']
+                above_cutoff_startpos[0] = 0
+
+            # first bit of region above cutoff
+            peak_content.append((above_cutoff_startpos[0], above_cutoff_endpos[0], above_cutoff_v[0]))
+            for i in range(1, above_cutoff_startpos.size):
+                if above_cutoff_startpos[i] - peak_content[-1][1] <= max_gap:
+                    # append
+                    peak_content.append((above_cutoff_startpos[i], above_cutoff_endpos[i], above_cutoff_v[i]))
+                else:
+                    # close
+                    self.__close_peak(peak_content,
+                                      peaks,
+                                      min_length,
+                                      chrom)
+                    peak_content = [(above_cutoff_startpos[i], above_cutoff_endpos[i], above_cutoff_v[i]),]
+
+            # save the last peak
+            if not peak_content:
+                continue
+            else:
+                self.__close_peak(peak_content,
+                                  peaks,
+                                  min_length,
+                                  chrom)
+
+        return peaks
+
+    @cython.cfunc
+    def __close_peak(self,
+                     peak_content: list,
+                     peaks,
+                     min_length: cython.int,
+                     chrom: bytes) -> bool:
+        tsummit: list   # list for temporary summits
+        peak_length: cython.int
+        summit: cython.int
+        tstart: cython.int
+        tend: cython.int
+        summit_value: cython.float
+        tvalue: cython.float
+
+        peak_length = peak_content[-1][1]-peak_content[0][0]
+        if peak_length >= min_length:   # if the peak is too small, reject it
+            tsummit = []
+            summit = 0
+            summit_value = 0
+            for (tstart, tend, tvalue) in peak_content:
+                if not summit_value or summit_value < tvalue:
+                    tsummit = [cython.cast(cython.int, (tend+tstart)/2),]
+                    summit_value = tvalue
+                elif summit_value == tvalue:
+                    tsummit.append(cython.cast(cython.int, (tend+tstart)/2))
+            summit = tsummit[cython.cast(cython.int, (len(tsummit)+1)/2)-1]
+            peaks.add(chrom,
+                      peak_content[0][0],
+                      peak_content[-1][1],
+                      summit=summit,
+                      peak_score=summit_value,
+                      pileup=0,
+                      pscore=0,
+                      fold_change=0,
+                      qscore=0
+                      )
+        return True
+
+    @cython.ccall
+    def call_broadpeaks(self,
+                        lvl1_cutoff: cython.float = 500,
+                        lvl2_cutoff: cython.float = 100,
+                        min_length: cython.int = 200,
+                        lvl1_max_gap: cython.int = 50,
+                        lvl2_max_gap: cython.int = 400):
+        """This function tries to find enriched regions where scores
+        are continuously higher than the level 1 cutoff, then links
+        them through regions scoring above the level 2 cutoff, with
+        linkage regions at most lvl2_max_gap long.
+
+        lvl1_cutoff:  cutoff of value at enriched regions, default 500.
+        lvl2_cutoff:  cutoff of value at linkage regions, default 100.
+        min_length :  minimum peak length, default 200.
+        lvl1_max_gap: maximum gap to merge nearby enriched peaks, default 50.
+        lvl2_max_gap: maximum length of linkage regions, default 400.
+
+        Return both general PeakIO object for highly enriched regions
+        and gapped broad regions in BroadPeakIO.
+        """
+        chrom: bytes
+        i: cython.int
+        j: cython.int
+        chrs: set
+        lvl1: PeakContent
+        lvl2: PeakContent       # PeakContent class object
+        lvl1peakschrom: list
+        lvl2peakschrom: list
+
+        assert lvl1_cutoff > lvl2_cutoff, "level 1 cutoff should be larger than level 2."
+        assert lvl1_max_gap < lvl2_max_gap, "level 2 maximum gap should be larger than level 1."
+        lvl1_peaks = self.call_peaks(cutoff=lvl1_cutoff,
+                                     min_length=min_length,
+                                     max_gap=lvl1_max_gap,
+                                     call_summits=False)
+        lvl2_peaks = self.call_peaks(cutoff=lvl2_cutoff,
+                                     min_length=min_length,
+                                     max_gap=lvl2_max_gap,
+                                     call_summits=False)
+        chrs = lvl1_peaks.get_chr_names()
+        broadpeaks = BroadPeakIO()
+        # use lvl2_peaks as linking regions between lvl1_peaks
+        for chrom in sorted(chrs):
+            lvl1peakschrom = lvl1_peaks.get_data_from_chrom(chrom)
+            lvl2peakschrom = lvl2_peaks.get_data_from_chrom(chrom)
+            lvl1peakschrom_next = iter(lvl1peakschrom).__next__
+            tmppeakset = []     # to temporarily store lvl1 region inside a lvl2 region
+            # our assumption is lvl1 regions should be included in lvl2 regions
+            try:
+                lvl1 = lvl1peakschrom_next()
+                for i in range(len(lvl2peakschrom)):
+                    # for each lvl2 peak, find all lvl1 peaks inside
+                    lvl2 = lvl2peakschrom[i]
+                    while True:
+                        if lvl2["start"] <= lvl1["start"] and lvl1["end"] <= lvl2["end"]:
+                            tmppeakset.append(lvl1)
+                            lvl1 = lvl1peakschrom_next()
+                        else:
+                            self.__add_broadpeak(broadpeaks,
+                                                 chrom,
+                                                 lvl2,
+                                                 tmppeakset)
+                            tmppeakset = []
+                            break
+            except StopIteration:
+                self.__add_broadpeak(broadpeaks, chrom, lvl2, tmppeakset)
+                tmppeakset = []
+                for j in range(i+1, len(lvl2peakschrom)):
+                    self.__add_broadpeak(broadpeaks,
+                                         chrom,
+                                         lvl2peakschrom[j],
+                                         tmppeakset)
+        return broadpeaks
+
+    @cython.cfunc
+    def __add_broadpeak(self,
+                        bpeaks,
+                        chrom: bytes,
+                        lvl2peak: PeakContent,
+                        lvl1peakset: list):
+        """Internal function to create broad peak.
+        """
+        start: cython.int
+        end: cython.int
+        blockNum: cython.int
+        blockSizes: bytes
+        blockStarts: bytes
+        thickStart: bytes
+        thickEnd: bytes
+
+        start = lvl2peak["start"]
+        end = lvl2peak["end"]
+
+        # the following code will add those broad/lvl2 peaks with no
+        # strong/lvl1 peaks inside
+        if not lvl1peakset:
+            # will complement by adding 1bp start and end to this region
+            # may change in the future if gappedPeak format was improved.
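+            # i.e., a two-block entry is produced, with 1bp blocks at
+            # the very start and end of the region (blockNum=2,
+            # blockSizes=b"1,1")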
+            bpeaks.add(chrom, start, end,
+                       score=lvl2peak["score"],
+                       thickStart=(b"%d" % start),
+                       thickEnd=(b"%d" % end),
+                       blockNum=2,
+                       blockSizes=b"1,1",
+                       blockStarts=(b"0,%d" % (end-start-1)),
+                       pileup=lvl2peak["pileup"],
+                       pscore=lvl2peak["pscore"],
+                       fold_change=lvl2peak["fc"],
+                       qscore=lvl2peak["qscore"])
+            return bpeaks
+
+        thickStart = b"%d" % lvl1peakset[0]["start"]
+        thickEnd = b"%d" % lvl1peakset[-1]["end"]
+        blockNum = len(lvl1peakset)
+        blockSizes = b",".join([b"%d" % x["length"] for x in lvl1peakset])
+        blockStarts = b",".join([b"%d" % (x["start"]-start) for x in lvl1peakset])
+
+        if int(thickStart) != start:
+            # add 1bp left block
+            thickStart = b"%d" % start
+            blockNum += 1
+            blockSizes = b"1,"+blockSizes
+            blockStarts = b"0,"+blockStarts
+        if int(thickEnd) != end:
+            # add 1bp right block
+            thickEnd = b"%d" % end
+            blockNum += 1
+            blockSizes = blockSizes+b",1"
+            blockStarts = blockStarts + b"," + (b"%d" % (end-start-1))
+
+        bpeaks.add(chrom, start, end,
+                   score=lvl2peak["score"],
+                   thickStart=thickStart,
+                   thickEnd=thickEnd,
+                   blockNum=blockNum,
+                   blockSizes=blockSizes,
+                   blockStarts=blockStarts,
+                   pileup=lvl2peak["pileup"],
+                   pscore=lvl2peak["pscore"],
+                   fold_change=lvl2peak["fc"],
+                   qscore=lvl2peak["qscore"])
+        return bpeaks
+
+    @cython.ccall
+    def refine_peaks(self, peaks):
+        """Based on the given peaks, this function re-evaluates each
+        peak region using the scores in this track, and calls the
+        summit.
+
+        peaks: PeakIO object
+        return: a new PeakIO object
+
+        """
+        pre_p: cython.int
+        p: cython.int
+        peak_s: cython.int
+        peak_e: cython.int
+        v: cython.float
+        chrom: bytes
+        chrs: set
+
+        peaks.sort()
+        new_peaks = PeakIO()
+        chrs = self.get_chr_names()
+        assert isinstance(peaks, PeakIO)
+        chrs = chrs.intersection(set(peaks.get_chr_names()))
+
+        for chrom in sorted(chrs):
+            peaks_chr = peaks.get_data_from_chrom(chrom)
+            peak_content = []
+            # arrays for position and values
+            ps = self.__data[chrom]['p']
+            vs = self.__data[chrom]['v']
+            # assign the next function to a variable to speed up
+            psn = iter(ps).__next__
+            vsn = iter(vs).__next__
+            peakn = iter(peaks_chr).__next__
+
+            # remember previous position in bedgraph/self
+            pre_p = 0
+            p = psn()
+            v = vsn()
+            peak = peakn()
+            peak_s = peak["start"]
+            peak_e = peak["end"]
+            while True:
+                # look for overlap
+                if p > peak_s and peak_e > pre_p:
+                    # now put four coordinates together and pick the middle two
+                    s, e = sorted([p, peak_s, peak_e, pre_p])[1:3]
+                    # add this content
+                    peak_content.append((s, e, v))
+                    # move self/bedGraph
+                    try:
+                        pre_p = p
+                        p = psn()
+                        v = vsn()
+                    except Exception:
+                        # no more value chunk in bedGraph
+                        break
+                elif pre_p >= peak_e:
+                    # close peak
+                    self.__close_peak(peak_content, new_peaks, 0, chrom)
+                    peak_content = []
+                    # move peak
+                    try:
+                        peak = peakn()
+                        peak_s = peak["start"]
+                        peak_e = peak["end"]
+                    except Exception:
+                        # no more peak
+                        break
+                elif peak_s >= p:
+                    # move self/bedgraph
+                    try:
+                        pre_p = p
+                        p = psn()
+                        v = vsn()
+                    except Exception:
+                        # no more value chunk in bedGraph
+                        break
+                else:
+                    raise Exception(f"no way here! prev position:{pre_p}; position:{p}; value:{v}; peak start:{peak_s}; peak end:{peak_e}")
+
+            # save the last peak
+            if peak_content:
+                self.__close_peak(peak_content, new_peaks, 0, chrom)
+        return new_peaks
+
+    @cython.ccall
+    def total(self) -> cython.int:
+        """Return the number of regions in this object.
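+
+        Note: this counts the stored (end position, value) records
+        across all chromosomes, not a length in base pairs.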
+ + """ + t: cython.int + d: cnp.ndarray + + t = 0 + for d in self.__data.values(): + t += len(d) + return t + + # @cython.ccall + # def set_single_value(self, new_value: cython.float): + # """Change all the values in bedGraph to the same new_value, + # return a new bedGraphTrackI. + + # """ + # chrom: bytes + # max_p: cython.int + + # ret = bedGraphTrackI() + # chroms = set(self.get_chr_names()) + # for chrom in sorted(chroms): + # # arrays for position and values + # (p1, v1) = self.get_data_by_chr(chrom) + # # maximum p + # max_p = max(p1) + # # add a region from 0 to max_p + # ret.add_loc(chrom, 0, max_p, new_value) + # return ret + + # @cython.ccall + # def overlie(self, bdgTracks, func: str = "max"): + # """Calculate two or more bedGraphTrackI objects by letting self + # overlying bdgTrack2, with user-defined functions. + + # Transition positions from both bedGraphTrackI objects will be + # considered and combined. For example: + + # #1 bedGraph (self) | #2 bedGraph + # ----------------------------------------------- + # chr1 0 100 0 | chr1 0 150 1 + # chr1 100 200 3 | chr1 150 250 2 + # chr1 200 300 4 | chr1 250 300 4 + + # these two bedGraphs will be combined to have five transition + # points: 100, 150, 200, 250, and 300. So in order to calculate + # two bedGraphs, I pair values within the following regions + # like: + + # chr s e (#1,#2) applied_func_max + # ----------------------------------------------- + # chr1 0 100 (0,1) 1 + # chr1 100 150 (3,1) 3 + # chr1 150 200 (3,2) 3 + # chr1 200 250 (4,2) 4 + # chr1 250 300 (4,4) 4 + + # Then the given 'func' will be applied on each 2-tuple as func(#1,#2) + + # Supported 'func' are "sum", "subtract" (only for two bdg + # objects), "product", "divide" (only for two bdg objects), + # "max", "mean" and "fisher". + + # Return value is a new bedGraphTrackI object. + + # Option: bdgTracks can be a list of bedGraphTrackI objects + # """ + # pre_p: cython.int + # chrom: bytes + + # nr_tracks = len(bdgTracks) + 1 # +1 for self + # assert nr_tracks >= 2, "Specify at least one more bdg objects." + # for i, bdgTrack in enumerate(bdgTracks): + # assert isinstance(bdgTrack, bedGraphTrackI), "bdgTrack{} is not a bedGraphTrackI object".format(i + 1) + + # if func == "max": + # f = max + # elif func == "mean": + # f = mean_func + # elif func == "fisher": + # f = fisher_func + # elif func == "sum": + # f = sum + # elif func == "product": + # f = product_func + # elif func == "subtract": + # if nr_tracks == 2: + # f = subtract_func + # else: + # raise Exception(f"Only one more bdg object is allowed, but provided {nr_tracks-1}") + # elif func == "divide": + # if nr_tracks == 2: + # f = divide_func + # else: + # raise Exception(f"Only one more bdg object is allowed, but provided {nr_tracks-1}") + # else: + # raise Exception("Invalid function {func}! Choose from 'sum', 'subtract' (only for two bdg objects), 'product', 'divide' (only for two bdg objects), 'max', 'mean' and 'fisher'. 
") + + # ret = bedGraphTrackI() + + # common_chr = set(self.get_chr_names()) + # for track in bdgTracks: + # common_chr = common_chr.intersection(set(track.get_chr_names())) + + # for chrom in sorted(common_chr): + # datas = [self.get_data_by_chr(chrom)] + # datas.extend([bdgTracks[i].get_data_by_chr(chrom) for i in range(len(bdgTracks))]) + + # ps, vs, pn, vn = [], [], [], [] + # for data in datas: + # ps.append(data[0]) + # pn.append(iter(ps[-1]).__next__) + # vs.append(data[1]) + # vn.append(iter(vs[-1]).__next__) + + # pre_p = 0 # remember the previous position in the new bedGraphTrackI object ret + # try: + # ps_cur = [pn[i]() for i in range(len(pn))] + # vs_cur = [vn[i]() for i in range(len(pn))] + + # while True: + # # get the lowest position + # lowest_p = min(ps_cur) + + # # at least one lowest position, could be multiple + # locations = [i for i in range(len(ps_cur)) if ps_cur[i] == lowest_p] + + # # add the data until the interval + # ret.add_loc(chrom, pre_p, ps_cur[locations[0]], f(vs_cur)) + + # pre_p = ps_cur[locations[0]] + # for index in locations: + # ps_cur[index] = pn[index]() + # vs_cur[index] = vn[index]() + # except StopIteration: + # # meet the end of either bedGraphTrackI, simply exit + # pass + # return ret + + # @cython.ccall + # def apply_func(self, func) -> bool: + # """Apply function 'func' to every value in this bedGraphTrackI object. + + # *Two adjacent regions with same value after applying func will + # not be merged. + # """ + # i: cython.int + + # for (p, s) in self.__data.values(): + # for i in range(len(s)): + # s[i] = func(s[i]) + # self.maxvalue = func(self.maxvalue) + # self.minvalue = func(self.minvalue) + # return True + + # @cython.ccall + # def p2q(self): + # """Convert pvalue scores to qvalue scores. + + # *Assume scores in this bedGraph are pvalue scores! Not work + # for other type of scores. 
+ # """ + # chrom: bytes + # pos_array: pyarray + # pscore_array: pyarray + # pvalue_stat: dict = {} + # pqtable: dict = {} + # pre_p: cython.long + # this_p: cython.long + # # pre_l: cython.long + # # l: cython.long + # i: cython.long + # nhcal: cython.long = 0 + # N: cython.long + # k: cython.long + # this_l: cython.long + # this_v: cython.float + # # pre_v: cython.float + # v: cython.float + # q: cython.float + # pre_q: cython.float + # f: cython.float + # unique_values: list + + # # calculate frequencies of each p-score + # for chrom in sorted(self.get_chr_names()): + # pre_p = 0 + + # [pos_array, pscore_array] = self.__data[chrom] + + # pn = iter(pos_array).__next__ + # vn = iter(pscore_array).__next__ + + # for i in range(len(pos_array)): + # this_p = pn() + # this_v = vn() + # this_l = this_p - pre_p + # if this_v in pvalue_stat: + # pvalue_stat[this_v] += this_l + # else: + # pvalue_stat[this_v] = this_l + # pre_p = this_p + + # # nhcal += len(pos_array) + + # # nhval = 0 + + # N = sum(pvalue_stat.values()) # total length + # k = 1 # rank + # f = -log10(N) + # # pre_v = -2147483647 + # # pre_l = 0 + # pre_q = 2147483647 # save the previous q-value + + # # calculate qscore for each pscore + # pqtable = {} + # unique_values = sorted(pvalue_stat.keys(), reverse=True) + # for i in range(len(unique_values)): + # v = unique_values[i] + # # l = pvalue_stat[v] + # q = v + (log10(k) + f) + # q = max(0, min(pre_q, q)) # make q-score monotonic + # pqtable[v] = q + # # pre_v = v + # pre_q = q + # # k += l + # nhcal += 1 + + # # convert pscore to qscore + # for chrom in sorted(self.get_chr_names()): + # [pos_array, pscore_array] = self.__data[chrom] + + # for i in range(len(pos_array)): + # pscore_array[i] = pqtable[pscore_array[i]] + + # self.merge_regions() + # return + + # @cython.ccall + # def extract_value(self, bdgTrack2): + # """Extract values from regions defined in bedGraphTrackI class object + # `bdgTrack2`. + + # """ + # pre_p: cython.int + # p1: cython.int + # p2: cython.int + # i: cython.int + # v1: cython.float + # v2: cython.float + # chrom: bytes + + # assert isinstance(bdgTrack2, bedGraphTrackI), "not a bedGraphTrackI object" + + # # 1: region in bdgTrack2; 2: value; 3: length with the value + # ret = [[], pyarray('f', []), pyarray('L', [])] + # radd = ret[0].append + # vadd = ret[1].append + # ladd = ret[2].append + + # chr1 = set(self.get_chr_names()) + # chr2 = set(bdgTrack2.get_chr_names()) + # common_chr = chr1.intersection(chr2) + # for i in range(len(common_chr)): + # chrom = common_chr.pop() + # (p1s, v1s) = self.get_data_by_chr(chrom) # arrays for position and values + # # assign the next function to a viable to speed up + # p1n = iter(p1s).__next__ + # v1n = iter(v1s).__next__ + + # # arrays for position and values + # (p2s, v2s) = bdgTrack2.get_data_by_chr(chrom) + # # assign the next function to a viable to speed up + # p2n = iter(p2s).__next__ + # v2n = iter(v2s).__next__ + # # remember the previous position in the new bedGraphTrackI + # # object ret + # pre_p = 0 + # try: + # p1 = p1n() + # v1 = v1n() + + # p2 = p2n() + # v2 = v2n() + + # while True: + # if p1 < p2: + # # clip a region from pre_p to p1, then set pre_p as p1. + # if v2 > 0: + # radd(str(chrom)+"."+str(pre_p)+"."+str(p1)) + # vadd(v1) + # ladd(p1-pre_p) + # pre_p = p1 + # # call for the next p1 and v1 + # p1 = p1n() + # v1 = v1n() + # elif p2 < p1: + # # clip a region from pre_p to p2, then set + # # pre_p as p2. 
+ # if v2 > 0: + # radd(str(chrom)+"."+str(pre_p)+"."+str(p2)) + # vadd(v1) + # ladd(p2-pre_p) + # pre_p = p2 + # # call for the next p2 and v2 + # p2 = p2n() + # v2 = v2n() + # elif p1 == p2: + # # from pre_p to p1 or p2, then set pre_p as p1 or p2. + # if v2 > 0: + # radd(str(chrom)+"."+str(pre_p)+"."+str(p1)) + # vadd(v1) + # ladd(p1-pre_p) + # pre_p = p1 + # # call for the next p1, v1, p2, v2. + # p1 = p1n() + # v1 = v1n() + # p2 = p2n() + # v2 = v2n() + # except StopIteration: + # # meet the end of either bedGraphTrackI, simply exit + # pass + + # return ret + + # @cython.ccall + # def extract_value_hmmr(self, bdgTrack2): + # """Extract values from regions defined in bedGraphTrackI class object + # `bdgTrack2`. + + # I will try to tweak this function to output only the values of + # bdgTrack1 (self) in the regions in bdgTrack2 + + # This is specifically for HMMRATAC. bdgTrack2 should be a + # bedgraph object containing the bins with value set to + # 'mark_bin' -- the bins in the same region will have the same + # value. + # """ + # # pre_p: cython.int + # p1: cython.int + # p2: cython.int + # i: cython.int + # v1: cython.float + # v2: cython.float + # chrom: bytes + # ret: list + + # assert isinstance(bdgTrack2, bedGraphTrackI), "not a bedGraphTrackI object" + + # # 0: bin location (chrom, position); 1: value; 2: number of bins in this region + # ret = [[], pyarray('f', []), pyarray('i', [])] + # padd = ret[0].append + # vadd = ret[1].append + # ladd = ret[2].append + + # chr1 = set(self.get_chr_names()) + # chr2 = set(bdgTrack2.get_chr_names()) + # common_chr = sorted(list(chr1.intersection(chr2))) + # for i in range(len(common_chr)): + # chrom = common_chr.pop() + # # arrays for position and values + # (p1s, v1s) = self.get_data_by_chr(chrom) + # # assign the next function to a viable to speed up + # p1n = iter(p1s).__next__ + # v1n = iter(v1s).__next__ + + # # arrays for position and values + # (p2s, v2s) = bdgTrack2.get_data_by_chr(chrom) + # # assign the next function to a viable to speed up + # p2n = iter(p2s).__next__ + # v2n = iter(v2s).__next__ + # # remember the previous position in the new bedGraphTrackI + # # object ret + # # pre_p = 0 + # try: + # p1 = p1n() + # v1 = v1n() + + # p2 = p2n() + # v2 = v2n() + + # while True: + # if p1 < p2: + # # clip a region from pre_p to p1, then set pre_p as p1. + # # in this case, we don't output any + # # if v2>0: + # # radd(str(chrom)+"."+str(pre_p)+"."+str(p1)) + # # vadd(v1) + # # ladd(p1-pre_p) + # # pre_p = p1 + # # call for the next p1 and v1 + # p1 = p1n() + # v1 = v1n() + # elif p2 < p1: + # # clip a region from pre_p to p2, then set pre_p as p2. + # if v2 != 0: # 0 means it's a gap region, we should have value > 1 + # padd((chrom, p2)) + # vadd(v1) + # ladd(int(v2)) + # # pre_p = p2 + # # call for the next p2 and v2 + # p2 = p2n() + # v2 = v2n() + # elif p1 == p2: + # # from pre_p to p1 or p2, then set pre_p as p1 or p2. + # if v2 != 0: # 0 means it's a gap region, we should have 1 or -1 + # padd((chrom, p2)) + # vadd(v1) + # ladd(int(v2)) + # # pre_p = p1 + # # call for the next p1, v1, p2, v2. + # p1 = p1n() + # v1 = v1n() + # p2 = p2n() + # v2 = v2n() + # except StopIteration: + # # meet the end of either bedGraphTrackI, simply exit + # pass + + # return ret + + # @cython.ccall + # def make_ScoreTrackII_for_macs(self, bdgTrack2, + # depth1: float = 1.0, + # depth2: float = 1.0): + # """A modified overlie function for MACS v2. + + # effective_depth_in_million: sequencing depth in million after + # duplicates being filtered. 
If + # treatment is scaled down to + # control sample size, then this + # should be control sample size in + # million. And vice versa. + + # Return value is a ScoreTrackII object. + # """ + # # pre_p: cython.int + # p1: cython.int + # p2: cython.int + # v1: cython.float + # v2: cython.float + # chrom: bytes + + # assert isinstance(bdgTrack2, bedGraphTrackI), "bdgTrack2 is not a bedGraphTrackI object" + + # ret = ScoreTrackII(treat_depth=depth1, + # ctrl_depth=depth2) + # retadd = ret.add + + # chr1 = set(self.get_chr_names()) + # chr2 = set(bdgTrack2.get_chr_names()) + # common_chr = chr1.intersection(chr2) + # for chrom in sorted(common_chr): + # # arrays for position and values + # (p1s, v1s) = self.get_data_by_chr(chrom) + # # assign the next function to a viable to speed up + # p1n = iter(p1s).__next__ + # v1n = iter(v1s).__next__ + # # arrays for position and values + # (p2s, v2s) = bdgTrack2.get_data_by_chr(chrom) + # # assign the next function to a viable to speed up + # p2n = iter(p2s).__next__ + # v2n = iter(v2s).__next__ + + # # this is the maximum number of locations needed to be + # # recorded in scoreTrackI for this chromosome. + # chrom_max_len = len(p1s)+len(p2s) + + # ret.add_chromosome(chrom, chrom_max_len) + + # # remember the previous position in the new bedGraphTrackI + # # object ret + # # pre_p = 0 + + # try: + # p1 = p1n() + # v1 = v1n() + + # p2 = p2n() + # v2 = v2n() + + # while True: + # if p1 < p2: + # # clip a region from pre_p to p1, then set pre_p as p1. + # retadd(chrom, p1, v1, v2) + # # pre_p = p1 + # # call for the next p1 and v1 + # p1 = p1n() + # v1 = v1n() + # elif p2 < p1: + # # clip a region from pre_p to p2, then set pre_p as p2. + # retadd(chrom, p2, v1, v2) + # # pre_p = p2 + # # call for the next p2 and v2 + # p2 = p2n() + # v2 = v2n() + # elif p1 == p2: + # # from pre_p to p1 or p2, then set pre_p as p1 or p2. + # retadd(chrom, p1, v1, v2) + # # pre_p = p1 + # # call for the next p1, v1, p2, v2. + # p1 = p1n() + # v1 = v1n() + # p2 = p2n() + # v2 = v2n() + # except StopIteration: + # # meet the end of either bedGraphTrackI, simply exit + # pass + + # ret.finalize() + # # ret.merge_regions() + # return ret + + # @cython.ccall + # def cutoff_analysis(self, + # max_gap: cython.int, + # min_length: cython.int, + # steps: cython.int = 100, + # min_score: cython.float = 0, + # max_score: cython.float = 1000) -> str: + # """ + # Cutoff analysis function for bedGraphTrackI object. + + # This function will try all possible cutoff values on the score + # column to call peaks. Then will give a report of a number of + # metrics (number of peaks, total length of peaks, average + # length of peak) at varying score cutoffs. For each score + # cutoff, the function finds the positions where the score + # exceeds the cutoff, then groups those positions into "peaks" + # based on the maximum allowed gap (max_gap) between consecutive + # positions. If a peak's length exceeds the minimum length + # (min_length), the peak is counted. + + # Parameters + # ---------- + + # max_gap : int32_t + # Maximum allowed gap between consecutive positions above cutoff + + # min_length : int32_t Minimum length of peak + # steps: int32_t + # It will be used to calculate 'step' to increase from min_v to + # max_v (see below). + + # min_score: float32_t + # Minimum score for cutoff analysis. Note1: we will take the + # larger value between the actual minimum value in the BedGraph + # and min_score as min_v. Note2: the min_v won't be included in + # the final result. 
We will try to output the smallest cutoff as + # min_v+step. + + # max_score: float32_t + # Maximum score for cutoff analysis. Note1: we will take the + # smaller value between the actual maximum value in the BedGraph + # and max_score as max_v. Note2: the max_v may not be included + # in the final result. We will only output the cutoff that can + # generate at least 1 peak. + + # Returns + # ------- + + # Cutoff analysis report in str object. + + # Todos + # ----- + + # May need to separate this function out as a class so that we + # can add more ways to analyze the result. Also, we can let this + # function return a list of dictionary or data.frame in that + # way, instead of str object. + # """ + # chrs: set + # peak_content: list + # ret_list: list + # cutoff_list: list + # cutoff_npeaks: list + # cutoff_lpeaks: list + # chrom: bytes + # ret: str + # cutoff: cython.float + # total_l: cython.long + # total_p: cython.long + # i: cython.long + # n: cython.long + # ts: cython.long + # te: cython.long + # lastp: cython.long + # tl: cython.long + # peak_length: cython.long + # # dict cutoff_npeaks, cutoff_lpeaks + # s: cython.float + + # chrs = self.get_chr_names() + + # # midvalue = self.minvalue/2 + self.maxvalue/2 + # # s = float(self.minvalue - midvalue)/steps + # minv = max(min_score, self.minvalue) + # maxv = min(self.maxvalue, max_score) + + # s = float(maxv - minv)/steps + + # # a list of possible cutoff values from minv to maxv with step of s + # cutoff_list = [round(value, 3) for value in np.arange(minv, maxv, s)] + + # cutoff_npeaks = [0] * len(cutoff_list) + # cutoff_lpeaks = [0] * len(cutoff_list) + + # for chrom in sorted(chrs): + # (pos_array, score_array) = self.__data[chrom] + # pos_array = np.array(self.__data[chrom][0]) + # score_array = np.array(self.__data[chrom][1]) + + # for n in range(len(cutoff_list)): + # cutoff = cutoff_list[n] + # total_l = 0 # total length of peaks + # total_p = 0 # total number of peaks + + # # get the regions with scores above cutoffs. This is + # # not an optimized method. It would be better to store + # # score array in a 2-D ndarray? 
+ # above_cutoff = np.nonzero(score_array > cutoff)[0] + # # end positions of regions where score is above cutoff + # above_cutoff_endpos = pos_array[above_cutoff] + # # start positions of regions where score is above cutoff + # above_cutoff_startpos = pos_array[above_cutoff-1] + + # if above_cutoff_endpos.size == 0: + # continue + + # # first bit of region above cutoff + # acs_next = iter(above_cutoff_startpos).__next__ + # ace_next = iter(above_cutoff_endpos).__next__ + + # ts = acs_next() + # te = ace_next() + # peak_content = [(ts, te),] + # lastp = te + + # for i in range(1, above_cutoff_startpos.size): + # ts = acs_next() + # te = ace_next() + # tl = ts - lastp + # if tl <= max_gap: + # peak_content.append((ts, te)) + # else: + # peak_length = peak_content[-1][1] - peak_content[0][0] + # # if the peak is too small, reject it + # if peak_length >= min_length: + # total_l += peak_length + # total_p += 1 + # peak_content = [(ts, te),] + # lastp = te + + # if peak_content: + # peak_length = peak_content[-1][1] - peak_content[0][0] + # # if the peak is too small, reject it + # if peak_length >= min_length: + # total_l += peak_length + # total_p += 1 + # cutoff_lpeaks[n] += total_l + # cutoff_npeaks[n] += total_p + + # # prepare the returnning text + # ret_list = ["score\tnpeaks\tlpeaks\tavelpeak\n"] + # for n in range(len(cutoff_list)-1, -1, -1): + # cutoff = cutoff_list[n] + # if cutoff_npeaks[n] > 0: + # ret_list.append("%.2f\t%d\t%d\t%.2f\n" % (cutoff, + # cutoff_npeaks[n], + # cutoff_lpeaks[n], + # cutoff_lpeaks[n]/cutoff_npeaks[n])) + # ret = ''.join(ret_list) + # return ret + + @cython.cfunc def calculate_elbows(values: cnp.ndarray, threshold: cython.float = 0.01) -> cnp.ndarray: diff --git a/MACS3/Signal/PairedEndTrack.py b/MACS3/Signal/PairedEndTrack.py index d8a7a85f..72c39d91 100644 --- a/MACS3/Signal/PairedEndTrack.py +++ b/MACS3/Signal/PairedEndTrack.py @@ -1,6 +1,6 @@ # cython: language_level=3 # cython: profile=True -# Time-stamp: <2024-10-14 21:13:35 Tao Liu> +# Time-stamp: <2024-10-15 15:56:00 Tao Liu> """Module for filter duplicate tags from paired-end data @@ -23,7 +23,8 @@ from MACS3.Signal.Pileup import (quick_pileup, over_two_pv_array, se_all_in_one_pileup) -from MACS3.Signal.BedGraph import bedGraphTrackI +from MACS3.Signal.BedGraph import (bedGraphTrackI, + bedGraphTrackII) from MACS3.Signal.PileupV2 import (pileup_from_LR_hmmratac, pileup_from_LRC) # ------------------------------------ @@ -997,5 +998,19 @@ def pileup_bdg(self): for chrom in self.get_chr_names(): pv = pileup_from_LRC(self.locations[chrom]) bdg.add_chrom_data_PV(chrom, pv) + return bdg + + @cython.ccall + def pileup_bdg2(self): + """Pileup all chromosome and return a bdg object. + """ + bdg: bedGraphTrackII + pv: cnp.ndarray + bdg = bedGraphTrackII() + for chrom in self.get_chr_names(): + pv = pileup_from_LRC(self.locations[chrom]) + bdg.add_chrom_data(chrom, pv) + # bedGraphTrackII needs to be 'finalized'. 
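+        # (finalize trims the per-chromosome arrays to their used
+        # sizes and sorts them by position)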
+ bdg.finalize() return bdg diff --git a/setup.py b/setup.py index ec4d3735..1ee36921 100644 --- a/setup.py +++ b/setup.py @@ -129,7 +129,7 @@ def main(): include_dirs=numpy_include_dir, extra_compile_args=extra_c_args), Extension("MACS3.Signal.ScoreTrack", - ["MACS3/Signal/ScoreTrack.pyx"], + ["MACS3/Signal/ScoreTrack.py"], libraries=["m"], include_dirs=numpy_include_dir, extra_compile_args=extra_c_args), diff --git a/test/test_PairedEndTrack.py b/test/test_PairedEndTrack.py index baa5b7b1..3723c944 100644 --- a/test/test_PairedEndTrack.py +++ b/test/test_PairedEndTrack.py @@ -1,5 +1,5 @@ #!/usr/bin/env python -# Time-stamp: <2024-10-15 09:23:38 Tao Liu> +# Time-stamp: <2024-10-15 16:07:27 Tao Liu> import unittest from MACS3.Signal.PairedEndTrack import PETrackI, PETrackII @@ -93,9 +93,12 @@ def setUp(self): ] self.pileup_p = np.array([10, 50, 70, 80, 85, 100, 110, 160, 170, 180, 190], dtype="i4") self.pileup_v = np.array([3.0, 4.0, 6.0, 9.0, 11.0, 15.0, 19.0, 18.0, 16.0, 10.0, 6.0], dtype="f4") + self.peak_str = "chrom:chrY start:80 end:180 name:peak_1 score:19 summit:105\n" self.subset_barcodes = {b'0w#AAACGAACAAGTAACA', b"0w#AAACGAACAAGTAAGA"} self.subset_pileup_p = np.array([10, 50, 70, 80, 85, 100, 110, 160, 170, 180, 190], dtype="i4") self.subset_pileup_v = np.array([1.0, 2.0, 4.0, 6.0, 7.0, 8.0, 13.0, 12.0, 10.0, 5.0, 4.0], dtype="f4") + self.subset_peak_str = "chrom:chrY start:100 end:170 name:peak_1 score:13 summit:105\n" + self.t = sum([(x[2]-x[1]) * x[4] for x in self.input_regions]) def test_add_frag(self): @@ -128,3 +131,47 @@ def test_pileup(self): d = bdg.get_data_by_chr(b'chrY') # (p, v) of ndarray np.testing.assert_array_equal(d[0], self.subset_pileup_p) np.testing.assert_array_equal(d[1], self.subset_pileup_v) + + def test_pileup2(self): + pe = PETrackII() + for (c, l, r, b, C) in self.input_regions: + pe.add_loc(c, l, r, b, C) + pe.finalize() + bdg = pe.pileup_bdg2() + d = bdg.get_data_by_chr(b'chrY') # (p, v) of ndarray + np.testing.assert_array_equal(d['p'], self.pileup_p) + np.testing.assert_array_equal(d['v'], self.pileup_v) + + pe_subset = pe.subset(self.subset_barcodes) + bdg = pe_subset.pileup_bdg2() + d = bdg.get_data_by_chr(b'chrY') # (p, v) of ndarray + np.testing.assert_array_equal(d['p'], self.subset_pileup_p) + np.testing.assert_array_equal(d['v'], self.subset_pileup_v) + + def test_callpeak(self): + pe = PETrackII() + for (c, l, r, b, C) in self.input_regions: + pe.add_loc(c, l, r, b, C) + pe.finalize() + bdg = pe.pileup_bdg() # bedGraphTrackI object + peaks = bdg.call_peaks(cutoff=10, min_length=20, max_gap=10) + self.assertEqual(str(peaks), self.peak_str) + + pe_subset = pe.subset(self.subset_barcodes) + bdg = pe_subset.pileup_bdg() + peaks = bdg.call_peaks(cutoff=10, min_length=20, max_gap=10) + self.assertEqual(str(peaks), self.subset_peak_str) + + def test_callpeak2(self): + pe = PETrackII() + for (c, l, r, b, C) in self.input_regions: + pe.add_loc(c, l, r, b, C) + pe.finalize() + bdg = pe.pileup_bdg2() # bedGraphTrackII object + peaks = bdg.call_peaks(cutoff=10, min_length=20, max_gap=10) + self.assertEqual(str(peaks), self.peak_str) + + pe_subset = pe.subset(self.subset_barcodes) + bdg = pe_subset.pileup_bdg2() + peaks = bdg.call_peaks(cutoff=10, min_length=20, max_gap=10) + self.assertEqual(str(peaks), self.subset_peak_str) diff --git a/test/test_Parser.py b/test/test_Parser.py index 9c42b442..09c82c6d 100644 --- a/test/test_Parser.py +++ b/test/test_Parser.py @@ -1,32 +1,54 @@ #!/usr/bin/env python -# Time-stamp: <2019-12-12 14:42:28 taoliu> +# 
Time-stamp: <2024-10-16 00:13:01 Tao Liu> import unittest -from MACS3.IO.Parser import * +from MACS3.IO.Parser import (guess_parser, + BEDParser, + SAMParser, + BAMParser, + FragParser) -class Test_auto_guess ( unittest.TestCase ): - def setUp ( self ): +class Test_auto_guess(unittest.TestCase): + + def setUp(self): self.bedfile = "test/tiny.bed.gz" self.bedpefile = "test/tiny.bedpe.gz" self.samfile = "test/tiny.sam.gz" self.bamfile = "test/tiny.bam" - def test_guess_parser_bed ( self ): - p = guess_parser( self.bedfile ) - self.assertTrue( p.is_gzipped() ) - self.assertTrue( isinstance(p, BEDParser) ) - - def test_guess_parser_sam ( self ): - p = guess_parser( self.samfile ) - self.assertTrue( p.is_gzipped() ) - self.assertTrue( isinstance(p, SAMParser) ) + def test_guess_parser_bed(self): + p = guess_parser(self.bedfile) + self.assertTrue(p.is_gzipped()) + self.assertTrue(isinstance(p, BEDParser)) - def test_guess_parser_bam ( self ): - p = guess_parser( self.bamfile ) - self.assertTrue( p.is_gzipped() ) - self.assertTrue( isinstance(p, BAMParser) ) + def test_guess_parser_sam(self): + p = guess_parser(self.samfile) + self.assertTrue(p.is_gzipped()) + self.assertTrue(isinstance(p, SAMParser)) + def test_guess_parser_bam(self): + p = guess_parser(self.bamfile) + self.assertTrue(p.is_gzipped()) + self.assertTrue(isinstance(p, BAMParser)) +class Test_parsing(unittest.TestCase): + def setUp(self): + self.bedfile = "test/tiny.bed.gz" + self.bedpefile = "test/tiny.bedpe.gz" + self.samfile = "test/tiny.sam.gz" + self.bamfile = "test/tiny.bam" + self.fragfile = "test/test.fragments.tsv.gz" + + def test_fragment_file(self): + p = FragParser(self.fragfile) + petrack = p.build_petrack2() + petrack.finalize() + bdg = petrack.pileup_bdg() + bdg2 = petrack.pileup_bdg2() + peaks = bdg.call_peaks(cutoff=10, min_length=200, max_gap=100) + peaks2 = bdg2.call_peaks(cutoff=10, min_length=200, max_gap=100) + print(peaks) + print(peaks2) diff --git a/test/test_ScoreTrack.py b/test/test_ScoreTrack.py index 41eaeb98..7b7c340e 100644 --- a/test/test_ScoreTrack.py +++ b/test/test_ScoreTrack.py @@ -1,59 +1,62 @@ #!/usr/bin/env python -# Time-stamp: <2024-10-11 10:17:53 Tao Liu> +# Time-stamp: <2024-10-18 15:30:21 Tao Liu> import io import unittest -from numpy.testing import assert_equal, assert_almost_equal, assert_array_equal +from numpy.testing import assert_array_equal # assert_equal, assert_almost_equal -from MACS3.Signal.ScoreTrack import * +import numpy as np +from MACS3.Signal.ScoreTrack import ScoreTrackII, TwoConditionScores from MACS3.Signal.BedGraph import bedGraphTrackI + class Test_TwoConditionScores(unittest.TestCase): def setUp(self): self.t1bdg = bedGraphTrackI() self.t2bdg = bedGraphTrackI() self.c1bdg = bedGraphTrackI() self.c2bdg = bedGraphTrackI() - self.test_regions1 = [(b"chrY",0,70,0.00,0.01), - (b"chrY",70,80,7.00,0.5), - (b"chrY",80,150,0.00,0.02)] - self.test_regions2 = [(b"chrY",0,75,20.0,4.00), - (b"chrY",75,90,35.0,6.00), - (b"chrY",90,150,10.0,15.00)] + self.test_regions1 = [(b"chrY", 0, 70, 0.00, 0.01), + (b"chrY", 70, 80, 7.00, 0.5), + (b"chrY", 80, 150, 0.00, 0.02)] + self.test_regions2 = [(b"chrY", 0, 75, 20.0, 4.00), + (b"chrY", 75, 90, 35.0, 6.00), + (b"chrY", 90, 150, 10.0, 15.00)] for a in self.test_regions1: - self.t1bdg.safe_add_loc(a[0],a[1],a[2],a[3]) - self.c1bdg.safe_add_loc(a[0],a[1],a[2],a[4]) + self.t1bdg.safe_add_loc(a[0], a[1], a[2], a[3]) + self.c1bdg.safe_add_loc(a[0], a[1], a[2], a[4]) for a in self.test_regions2: - self.t2bdg.safe_add_loc(a[0],a[1],a[2],a[3]) - 
self.c2bdg.safe_add_loc(a[0],a[1],a[2],a[4]) - - self.twoconditionscore = TwoConditionScores( self.t1bdg, - self.c1bdg, - self.t2bdg, - self.c2bdg, - 1.0, - 1.0 ) + self.t2bdg.safe_add_loc(a[0], a[1], a[2], a[3]) + self.c2bdg.safe_add_loc(a[0], a[1], a[2], a[4]) + + self.twoconditionscore = TwoConditionScores(self.t1bdg, + self.c1bdg, + self.t2bdg, + self.c2bdg, + 1.0, + 1.0) self.twoconditionscore.build() self.twoconditionscore.finalize() - (self.cat1,self.cat2,self.cat3) = self.twoconditionscore.call_peaks(min_length=10, max_gap=10, cutoff=3) + (self.cat1, self.cat2, self.cat3) = self.twoconditionscore.call_peaks(min_length=10, max_gap=10, cutoff=3) + class Test_ScoreTrackII(unittest.TestCase): def setUp(self): # for initiate scoretrack - self.test_regions1 = [(b"chrY",10,100,10), - (b"chrY",60,10,10), - (b"chrY",110,15,20), - (b"chrY",160,5,20), - (b"chrY",210,20,5)] + self.test_regions1 = [(b"chrY", 10, 100, 10), + (b"chrY", 60, 10, 10), + (b"chrY", 110, 15, 20), + (b"chrY", 160, 5, 20), + (b"chrY", 210, 20, 5)] self.treat_edm = 10 self.ctrl_edm = 5 # for different scoring method - self.p_result = [60.49, 0.38, 0.08, 0.0, 6.41] # -log10(p-value), pseudo count 1 added - self.q_result = [58.17, 0.0, 0.0, 0.0, 5.13] # -log10(q-value) from BH, pseudo count 1 added - self.l_result = [58.17, 0.0, -0.28, -3.25, 4.91] # log10 likelihood ratio, pseudo count 1 added - self.f_result = [0.96, 0.00, -0.12, -0.54, 0.54] # note, pseudo count 1 would be introduced. + self.p_result = [60.49, 0.38, 0.08, 0.0, 6.41] # -log10(p-value), pseudo count 1 added + self.q_result = [58.17, 0.0, 0.0, 0.0, 5.13] # -log10(q-value) from BH, pseudo count 1 added + self.l_result = [58.17, 0.0, -0.28, -3.25, 4.91] # log10 likelihood ratio, pseudo count 1 added + self.f_result = [0.96, 0.00, -0.12, -0.54, 0.54] # note, pseudo count 1 would be introduced. 
self.d_result = [90.00, 0, -5.00, -15.00, 15.00] self.m_result = [10.00, 1.00, 1.50, 0.50, 2.00] # for norm @@ -107,98 +110,97 @@ def setUp(self): chrY 161 210 50 186 20 7.09102 3.5 -1 MACS_peak_2 """ - def assertListAlmostEqual ( self, a, b, places =2 ): - return all( [self.assertAlmostEqual(x, y, places=places) for (x, y) in zip( a, b)] ) + def assertListAlmostEqual(self, a, b, places=2): + return all([self.assertAlmostEqual(x, y, places=places) for (x, y) in zip(a, b)]) def test_compute_scores(self): - s1 = ScoreTrackII( self.treat_edm, self.ctrl_edm ) - s1.add_chromosome( b"chrY", 5 ) + s1 = ScoreTrackII(self.treat_edm, self.ctrl_edm) + s1.add_chromosome(b"chrY", 5) for a in self.test_regions1: - s1.add( a[0],a[1],a[2],a[3] ) + s1.add(a[0], a[1], a[2], a[3]) - s1.set_pseudocount ( 1.0 ) + s1.set_pseudocount(1.0) - s1.change_score_method( ord('p') ) + s1.change_score_method(ord('p')) r = s1.get_data_by_chr(b"chrY") - self.assertListAlmostEqual( [round(x,2) for x in r[3]], self.p_result ) + self.assertListAlmostEqual([round(x, 2) for x in r[3]], self.p_result) - s1.change_score_method( ord('q') ) + s1.change_score_method(ord('q')) r = s1.get_data_by_chr(b"chrY") - self.assertListAlmostEqual( [round(x,2) for x in list(r[3])], self.q_result ) + self.assertListAlmostEqual([round(x, 2) for x in list(r[3])], self.q_result) - s1.change_score_method( ord('l') ) + s1.change_score_method(ord('l')) r = s1.get_data_by_chr(b"chrY") - self.assertListAlmostEqual( [round(x,2) for x in list(r[3])], self.l_result ) + self.assertListAlmostEqual([round(x, 2) for x in list(r[3])], self.l_result) - s1.change_score_method( ord('f') ) + s1.change_score_method(ord('f')) r = s1.get_data_by_chr(b"chrY") - self.assertListAlmostEqual( [round(x,2) for x in list(r[3])], self.f_result ) + self.assertListAlmostEqual([round(x, 2) for x in list(r[3])], self.f_result) - s1.change_score_method( ord('d') ) + s1.change_score_method(ord('d')) r = s1.get_data_by_chr(b"chrY") - self.assertListAlmostEqual( [round(x,2) for x in list(r[3])], self.d_result ) + self.assertListAlmostEqual([round(x, 2) for x in list(r[3])], self.d_result) - s1.change_score_method( ord('m') ) + s1.change_score_method(ord('m')) r = s1.get_data_by_chr(b"chrY") - self.assertListAlmostEqual( [round(x,2) for x in list(r[3])], self.m_result ) + self.assertListAlmostEqual([round(x, 2) for x in list(r[3])], self.m_result) def test_normalize(self): - s1 = ScoreTrackII( self.treat_edm, self.ctrl_edm ) - s1.add_chromosome( b"chrY", 5 ) + s1 = ScoreTrackII(self.treat_edm, self.ctrl_edm) + s1.add_chromosome(b"chrY", 5) for a in self.test_regions1: - s1.add( a[0],a[1],a[2],a[3] ) + s1.add(a[0], a[1], a[2], a[3]) - s1.change_normalization_method( ord('T') ) + s1.change_normalization_method(ord('T')) r = s1.get_data_by_chr(b"chrY") - assert_array_equal( r, self.norm_T ) + assert_array_equal(r, self.norm_T) - s1.change_normalization_method( ord('C') ) + s1.change_normalization_method(ord('C')) r = s1.get_data_by_chr(b"chrY") - assert_array_equal( r, self.norm_C ) + assert_array_equal(r, self.norm_C) - s1.change_normalization_method( ord('M') ) + s1.change_normalization_method(ord('M')) r = s1.get_data_by_chr(b"chrY") - assert_array_equal( r, self.norm_M ) + assert_array_equal(r, self.norm_M) - s1.change_normalization_method( ord('N') ) + s1.change_normalization_method(ord('N')) r = s1.get_data_by_chr(b"chrY") - assert_array_equal( r, self.norm_N ) + assert_array_equal(r, self.norm_N) - def test_writebedgraph ( self ): - s1 = ScoreTrackII( self.treat_edm, self.ctrl_edm 
) - s1.add_chromosome( b"chrY", 5 ) + def test_writebedgraph(self): + s1 = ScoreTrackII(self.treat_edm, self.ctrl_edm) + s1.add_chromosome(b"chrY", 5) for a in self.test_regions1: - s1.add( a[0],a[1],a[2],a[3] ) + s1.add(a[0], a[1], a[2], a[3]) - s1.change_score_method( ord('p') ) + s1.change_score_method(ord('p')) strio = io.StringIO() - s1.write_bedGraph( strio, "NAME", "DESC", 1 ) - self.assertEqual( strio.getvalue(), self.bdg1 ) + s1.write_bedGraph(strio, "NAME", "DESC", 1) + self.assertEqual(strio.getvalue(), self.bdg1) strio = io.StringIO() - s1.write_bedGraph( strio, "NAME", "DESC", 2 ) - self.assertEqual( strio.getvalue(), self.bdg2 ) + s1.write_bedGraph(strio, "NAME", "DESC", 2) + self.assertEqual(strio.getvalue(), self.bdg2) strio = io.StringIO() - s1.write_bedGraph( strio, "NAME", "DESC", 3 ) - self.assertEqual( strio.getvalue(), self.bdg3 ) + s1.write_bedGraph(strio, "NAME", "DESC", 3) + self.assertEqual(strio.getvalue(), self.bdg3) - def test_callpeak ( self ): - s1 = ScoreTrackII( self.treat_edm, self.ctrl_edm ) - s1.add_chromosome( b"chrY", 5 ) + def test_callpeak(self): + s1 = ScoreTrackII(self.treat_edm, self.ctrl_edm) + s1.add_chromosome(b"chrY", 5) for a in self.test_regions1: - s1.add( a[0],a[1],a[2],a[3] ) + s1.add(a[0], a[1], a[2], a[3]) - s1.change_score_method( ord('p') ) - p = s1.call_peaks( cutoff = 0.10, min_length=10, max_gap=10 ) + s1.change_score_method(ord('p')) + p = s1.call_peaks(cutoff=0.10, min_length=10, max_gap=10) strio = io.StringIO() - p.write_to_bed( strio, trackline = False ) - self.assertEqual( strio.getvalue(), self.peak1 ) + p.write_to_bed(strio, trackline=False) + self.assertEqual(strio.getvalue(), self.peak1) strio = io.StringIO() - p.write_to_summit_bed( strio, trackline = False ) - self.assertEqual( strio.getvalue(), self.summit1 ) + p.write_to_summit_bed(strio, trackline=False) + self.assertEqual(strio.getvalue(), self.summit1) strio = io.StringIO() - p.write_to_xls( strio ) - self.assertEqual( strio.getvalue(), self.xls1 ) - + p.write_to_xls(strio) + self.assertEqual(strio.getvalue(), self.xls1)