-
Notifications
You must be signed in to change notification settings - Fork 1
/
Copy pathalignment_overlaps.jl
98 lines (79 loc) · 3.67 KB
/
alignment_overlaps.jl
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
#!/usr/bin/env julia
"""
alignment_overlaps.jl - An alternative implemention of the BioAlignments.jl
BAM overlap.jl code to convert Record types into fragment-based and read-based Interval types
Original code at https://github.com/BioJulia/XAM.jl/blob/develop/src/bam/overlap.jl
By: Shaun Adkins ([email protected])
"""
import BGZFStreams
### OverlapIterator - Extended version of original BAM OverlapIterator
struct OverlapIterator{T}
reader::BAM.Reader{T}
refname::String
interval::UnitRange{Int}
strand_type::String
max_frag_size::UInt
end
function GenomicFeatures.eachoverlap(reader::BAM.Reader, interval::Interval, strand_type::String="no", max_frag_size::UInt=1000)
return GenomicFeatures.eachoverlap(reader, interval.seqname, interval.first:interval.last, strand_type, max_frag_size)
end
function GenomicFeatures.eachoverlap(reader::BAM.Reader, interval, strand_type::String="no", max_frag_size::UInt=1000)
return GenomicFeatures.eachoverlap(reader, convert(Interval, interval), strand_type, max_frag_size)
end
function GenomicFeatures.eachoverlap(reader::BAM.Reader, refname::AbstractString, interval::UnitRange, strand_type::String="no", max_frag_size::UInt=1000)
return OverlapIterator(reader, String(refname), interval, strand_type, max_frag_size)
end
### SuperBAMRecord - BAM Record with additional information
struct SuperBAMRecord
record::BAM.Record
interval::Interval{Bool}
strand::Char
end
record(s::SuperBAMRecord) = s.record
interval(s::SuperBAMRecord) = s.interval
strand(s::SuperBAMRecord) = s.strand
alignment_type(s::SuperBAMRecord) = getalignmenttype(metadata(interval(s)))
# Iterator
# --------
function Base.iterate(iter::OverlapIterator)
refindex = findfirst(isequal(iter.refname), iter.reader.refseqnames)
if refindex === nothing
throw(ArgumentError("GFF sequence name $(iter.refname) is not found in the BAM header"))
end
@assert iter.reader.index !== nothing
chunks = Indexes.overlapchunks(iter.reader.index.index, refindex, iter.interval)
if !isempty(chunks)
seek(iter.reader, first(chunks).start)
end
state = BAM.OverlapIteratorState(refindex, chunks, 1, BAM.Record())
return iterate(iter, state)
end
function Base.iterate(iter::OverlapIterator, state)
@inbounds while state.chunkid ≤ lastindex(state.chunks)
chunk = state.chunks[state.chunkid]
# Ran into issue where the BAM.Reader processed all records but kept going.
# This leads to the stream block_index to exceed the number of blocks. This solves that.
if eof(iter.reader.stream)
break
end
while BGZFStreams.virtualoffset(iter.reader.stream) < chunk.stop
read!(iter.reader, state.record)
# Determine if fragment or read alignment. Fragment alignments may overlap the interval whereas the single read may be to the right
alignmentinterval = get_alignment_interval(state.record, iter.max_frag_size, iter.strand_type)
alignmentinterval === nothing && continue
c = GenomicFeatures.compare_overlap(alignmentinterval, Interval(iter.refname, iter.interval), isless)
if c == 0
alignmentstrand = getstrand(alignmentinterval, isstranded(iter.strand_type))
return SuperBAMRecord(state.record, alignmentinterval, alignmentstrand), state
elseif c > 0
# no more overlapping records in this chunk since records are sorted
break
end
end
state.chunkid += 1
if state.chunkid ≤ lastindex(state.chunks)
seek(iter.reader, state.chunks[state.chunkid].start)
end
end
return nothing
end