Skip to content

Commit

Permalink
Merge pull request #9 from cancerit/feature/mergePaired
Browse files Browse the repository at this point in the history
Modify report, add generation of interleaved fastq from paired
  • Loading branch information
keiranmraine authored Dec 7, 2017
2 parents b26e4f3 + 8a9b3e6 commit 1621fe0
Show file tree
Hide file tree
Showing 7 changed files with 131 additions and 34 deletions.
3 changes: 2 additions & 1 deletion .travis.yml
Original file line number Diff line number Diff line change
Expand Up @@ -16,14 +16,15 @@ install:
- pip install pytest pytest-cov
- pip install progressbar2
- pip install xlrd
- pip install xopen

before_script:
- curl -L https://codeclimate.com/downloads/test-reporter/test-reporter-latest-linux-amd64 > ./cc-test-reporter
- chmod +x ./cc-test-reporter
- ./cc-test-reporter before-build

script:
- pytest --cov-branch --cov-report term --cov=cgp_seq_input_val
- pytest --cov-branch --cov-report term --cov=cgp_seq_input_val --cov-fail-under=50

after_script:
- ./cc-test-reporter after-build --exit-code $TRAVIS_TEST_RESULT
20 changes: 17 additions & 3 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -48,13 +48,24 @@ of:
{
"interleaved": false,
"pairs": 722079,
"possible_encoding": [
"Sanger",
"Illumina 1.8"
],
"quality_ascii_range": [
37,
67
],
"valid_q": true
}
```

Optionally generates a new interleaved (gz) file when paired-fastq is the input.

Various exceptions can occur for malformed files.

The primary purpose is to confirm Sanger/Illumina 1.8+ quality scores.
The primary purpose is to confirm Sanger/Illumina 1.8+ quality scores. Further Information
on Phred encoding can be found [here](https://en.wikipedia.org/wiki/FASTQ_format#Encoding).

#### FASTQ not BAM/CRAM

Expand All @@ -78,6 +89,7 @@ pip install --find-links=~/wheels cgp_seq_input_val

* [progressbar2](http://progressbar-2.readthedocs.io/en/latest/)
* [xlrd](https://github.com/python-excel/xlrd)
* [xopen](https://github.com/marcelm/xopen)

## Development environment

Expand Down Expand Up @@ -110,9 +122,11 @@ cd $PROJECTROOT
hash virtualenv || pip3 install virtualenv
virtualenv -p python3 env
source env/bin/activate
pip install progressbar2
pip install xlrd
pip install -r requirements.txt
python setup.py develop # so bin scripts can find module
## If changed requirements please run:
pip freeze | grep -v `echo ${PWD##*/}` > requirements.txt
```

For testing/coverage (`./run_tests.sh`)
Expand Down
13 changes: 12 additions & 1 deletion cgp_seq_input_val/command_line.py
Original file line number Diff line number Diff line change
Expand Up @@ -88,8 +88,19 @@ def main():
dest='input',
metavar='FILE',
nargs='+',
help='Input manifest in tsv formats',
help='Input FASTQ (optionally gzip compressed)',
required=True)
parser_c.add_argument('-q', '--qc',
dest='qc',
type=int,
default=100000,
help='Assess phred quality scale using N pairs (0=all, slow)',
required=False)
parser_c.add_argument('-o', '--output',
dest='output',
type=str,
help='Output as interleaved FASTQ (ignored for interleaved input)',
required=False)
parser_c.set_defaults(func=validate_seq_files)

args = parser.parse_args()
Expand Down
80 changes: 63 additions & 17 deletions cgp_seq_input_val/seq_validator.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,8 @@

import os
import sys
import gzip
import gzip # only used for reading
from xopen import xopen # only used for writing
import json

# progressbar2
Expand All @@ -14,25 +15,39 @@
from cgp_seq_input_val.error_classes import SeqValidationError
from cgp_seq_input_val.fastq_read import FastqRead

prog_records = 100000
# From: https://en.wikipedia.org/wiki/FASTQ_format#Encoding
Q_RANGES = {'Sanger': [33, 73],
'Solexa': [59, 104],
'Illumina 1.3': [64, 104],
'Illumina 1.5': [67, 105],
'Illumina 1.8': [33, 74]}

PROG_RECORDS = 100000


def validate_seq_files(args):
"""
Top level entry point for validating sequence files.
"""
out_fh = None
try:
file_2 = None
if len(args.input) == 2:
file_2 = args.input[1]
validator = SeqValidator(args.input[0], file_2)
if args.output:
out_fh = xopen(args.output, mode='wt')

validator = SeqValidator(args.input[0], args.qc, out_fh=out_fh, file_b=file_2)
validator.validate()
validator.report(args.report)
except SeqValidationError as ve: # runtime so no functions for message and errno
sys.exit("ERROR: " + str(ve))
# have to catch 2 classes works 3.0-3.3, above 3.3 all IO issues are captured under OSError
except (OSError, IOError) as err:
sys.exit("ERROR (%d): %s - %s" % (err.errno, err.strerror, err.filename))
finally:
if out_fh:
out_fh.close()


class SeqValidator(object):
Expand All @@ -45,18 +60,20 @@ class SeqValidator(object):
progress_pairs - optional, how often to update progress bar [100,000]
- set to 0 to disable
"""
def __init__(self, file_a, file_b=None, progress_pairs=prog_records):
def __init__(self, file_a, qc_reads, file_b=None, out_fh=None, progress_pairs=PROG_RECORDS):
self.progress_pairs = progress_pairs
self.qc_reads = qc_reads
self.file_a = file_a
self.file_b = file_b
self.out_fh = out_fh
self.pairs = 0
# will use this to decide on path
self.is_gzip = False # change open method for fastq
# sam is not supported

# only the min value is actually needed to determine if scaling
# is Sanger or Illumina 1.8+
self.q_min = 1000
self.q_max = -1
self.encodings = []
self._prep()

def __str__(self):
Expand All @@ -65,6 +82,8 @@ def __str__(self):
ret.append('file_b: '+str(self.file_b))
ret.append('is_gzip: '+str(self.is_gzip))
ret.append('q_min: '+str(self.q_min))
ret.append('q_max: '+str(self.q_max))
ret.append('encodings: '+str(self.encodings))
return '\n'.join(ret)

def _prep(self):
Expand Down Expand Up @@ -97,16 +116,28 @@ def validate(self):
else:
self.validate_paired()

def possible_encoding(self):
"""
Converts the ascii quality score range to something useful for debugging
"""
for encoding in Q_RANGES:
if(Q_RANGES[encoding][0] <= self.q_min <= Q_RANGES[encoding][1] and
Q_RANGES[encoding][0] <= self.q_max <= Q_RANGES[encoding][1]):
self.encodings.append(encoding)

def report(self, fp):
"""
Prints json report to the provided file-pointer
Args:
fp - file pointer
"""
self.possible_encoding()
report = {'pairs': self.pairs,
'valid_q': self.q_min == 33,
'interleaved': self.file_a == self.file_b}
'valid_q': self.q_min >= 33 and self.q_max <= 74,
'interleaved': self.file_a == self.file_b,
'possible_encoding': self.encodings,
'quality_ascii_range': [self.q_min, self.q_max]}
json.dump(report, fp, sort_keys=True, indent=4)

def validate_paired(self):
Expand Down Expand Up @@ -146,7 +177,12 @@ def validate_paired(self):
curr_line_b = read_2.last_line
fqh_line_b = read_2.file_pos[1]

self.check_pair(read_1, read_2)
self.check_pair(read_1, read_2, self.qc_reads == 0 or pairs < self.qc_reads)

if self.out_fh:
print(read_1, file=self.out_fh)
print(read_2, file=self.out_fh)

pairs += 1

if bar and pairs % prog_indic == 0:
Expand Down Expand Up @@ -198,11 +234,12 @@ def validate_interleaved(self):
# ensure line increments based on the last line read
fqh_line = read_2.file_pos[1]

self.check_pair(read_1, read_2)
self.check_pair(read_1, read_2, self.qc_reads == 0 or pairs < self.qc_reads)
pairs += 1

if bar and pairs % prog_indic == 0:
bar.update(pairs/prog_indic)

if curr_line == '':
break
self.pairs = pairs
Expand All @@ -211,19 +248,28 @@ def validate_interleaved(self):
if fq_fh is not None and not fq_fh.closed:
fq_fh.close()

def check_pair(self, read_1, read_2):
def qual_range(self, read):
"""
Finds the min and max ascii values from each quality encoding
"""
sorted_qual = list(map(ord, read.qual))
sorted_qual.sort() # faster than sorted(list(...))
if self.q_min > sorted_qual[0]:
self.q_min = sorted_qual[0]

if self.q_max < sorted_qual[-1]:
self.q_max = sorted_qual[-1]

def check_pair(self, read_1, read_2, check_qual):
"""
Compares a pair of reads
Raises:
SeqValidationError
"""
if self.q_min > 33:
# once a min of 33 is achieved it must be sanger/Illumina 1.8+
# may need occasional review.
q_min = min(map(ord, read_1.qual))
if self.q_min > q_min:
self.q_min = q_min
if check_qual:
self.qual_range(read_1)
self.qual_range(read_2)

if read_1.name != read_2.name:
raise SeqValidationError("Fastq record name at line %d should be a \
Expand Down
25 changes: 25 additions & 0 deletions requirements.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1,25 @@
astroid==1.5.3
bottle==0.12.13
colorama==0.3.9
coverage==4.4.1
flake8==3.4.1
flake8-polyfill==1.0.1
isort==4.2.15
lazy-object-proxy==1.3.1
mando==0.6.4
mccabe==0.6.1
nose2==0.6.5
pep8==1.7.0
progressbar2==3.34.3
py==1.4.34
py-cpuinfo==3.3.0
pycodestyle==2.3.1
pyflakes==1.5.0
pytest==3.2.3
pytest-cov==2.5.1
python-utils==2.2.0
radon==2.1.1
six==1.11.0
wrapt==1.10.11
xlrd==1.1.0
xopen==0.3.2
2 changes: 1 addition & 1 deletion setup.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,7 +12,7 @@
'version': '1.2.1',
'python_requires': '>= 3.4',
'setup_requires': ['pytest'],
'install_requires': ['progressbar2', 'xlrd'],
'install_requires': ['progressbar2', 'xlrd', 'xopen'],
'packages': ['cgp_seq_input_val'],
'package_data': {'cgp_seq_input_val': ['config/*.json']},
'entry_points': {
Expand Down
22 changes: 11 additions & 11 deletions tests/test_cgp_seq_input_val_seq_validator.py
Original file line number Diff line number Diff line change
Expand Up @@ -14,70 +14,70 @@ def teardown():

def test_seq_val_i_read_good():
fqi = os.path.join(test_dir, 'good_read_i.fq')
sv = SeqValidator(fqi, None, progress_pairs=1)
sv = SeqValidator(fqi, 1, out_fh=None, file_b=None, progress_pairs=1)
sv.validate()

def test_seq_val_p_read_good():
fq1 = os.path.join(test_dir, 'good_read_1.fq')
fq2 = os.path.join(test_dir, 'good_read_2.fq')
sv = SeqValidator(fq1, fq2, progress_pairs=1)
sv = SeqValidator(fq1, 1, out_fh=None, file_b=fq2, progress_pairs=1)
sv.validate()

def test_seq_val_i_gz_read_good():
fqi = os.path.join(test_dir, 'good_read_i.fq.gz')
sv = SeqValidator(fqi, None, progress_pairs=0)
sv = SeqValidator(fqi, 1, out_fh=None, file_b=None, progress_pairs=0)
sv.validate()
t = str(sv)
sv.report(sys.stdout)

def test_seq_val_p_gz_read_good():
fq1 = os.path.join(test_dir, 'good_read_1.fq.gz')
fq2 = os.path.join(test_dir, 'good_read_2.fq.gz')
sv = SeqValidator(fq1, fq2, progress_pairs=0)
sv = SeqValidator(fq1, 1, out_fh=None, file_b=fq2, progress_pairs=0)
sv.validate()

def test_seq_val_bad_file():
with pytest.raises(SeqValidationError) as e_info:
fqi = os.path.join(test_dir, 'good_read_i.BAD')
sv = SeqValidator(fqi, None, progress_pairs=0)
sv = SeqValidator(fqi, 1, out_fh=None, file_b=None, progress_pairs=0)

def test_seq_val_mismatch_ext():
with pytest.raises(SeqValidationError) as e_info:
fq1 = os.path.join(test_dir, 'good_read_1.fq')
fq2 = os.path.join(test_dir, 'good_read_2.fq.gz')
sv = SeqValidator(fq1, fq2, progress_pairs=0)
sv = SeqValidator(fq1, 1, out_fh=None, file_b=fq2, progress_pairs=0)

def test_seq_val_more_read2():
with pytest.raises(SeqValidationError) as e_info:
fq1 = os.path.join(test_dir, 'good_read_1.fq')
fq2 = os.path.join(test_dir, '2_reads_2.fq')
sv = SeqValidator(fq1, fq2, progress_pairs=0)
sv = SeqValidator(fq1, 1, out_fh=None, file_b=fq2, progress_pairs=0)
sv.validate()

def test_seq_val_more_read1():
with pytest.raises(SeqValidationError) as e_info:
fq1 = os.path.join(test_dir, '2_reads_1.fq')
fq2 = os.path.join(test_dir, 'good_read_2.fq')
sv = SeqValidator(fq1, fq2, progress_pairs=0)
sv = SeqValidator(fq1, 1, out_fh=None, file_b=fq2, progress_pairs=0)
sv.validate()

def test_seq_val_r1_in_2():
with pytest.raises(SeqValidationError) as e_info:
fq1 = os.path.join(test_dir, 'good_read_1.fq')
fq2 = os.path.join(test_dir, 'r1_reads_in_2.fq')
sv = SeqValidator(fq1, fq2, progress_pairs=0)
sv = SeqValidator(fq1, 1, out_fh=None, file_b=fq2, progress_pairs=0)
sv.validate()

def test_seq_val_r2_in_1():
with pytest.raises(SeqValidationError) as e_info:
fq1 = os.path.join(test_dir, 'good_read_2.fq')
fq2 = os.path.join(test_dir, 'r2_reads_in_1.fq')
sv = SeqValidator(fq1, fq2, progress_pairs=0)
sv = SeqValidator(fq1, 1, out_fh=None, file_b=fq2, progress_pairs=0)
sv.validate()

def test_seq_val_fq_name():
with pytest.raises(SeqValidationError) as e_info:
fq1 = os.path.join(test_dir, 'good_read_1.fq')
fq2 = os.path.join(test_dir, 'diff_2.fq')
sv = SeqValidator(fq1, fq2, progress_pairs=0)
sv = SeqValidator(fq1, 1, out_fh=None, file_b=fq2, progress_pairs=0)
sv.validate()

0 comments on commit 1621fe0

Please sign in to comment.