diff --git a/.travis.yml b/.travis.yml index 3734842..faada14 100644 --- a/.travis.yml +++ b/.travis.yml @@ -16,6 +16,7 @@ install: - pip install pytest pytest-cov - pip install progressbar2 - pip install xlrd + - pip install xopen before_script: - curl -L https://codeclimate.com/downloads/test-reporter/test-reporter-latest-linux-amd64 > ./cc-test-reporter @@ -23,7 +24,7 @@ before_script: - ./cc-test-reporter before-build script: - - pytest --cov-branch --cov-report term --cov=cgp_seq_input_val + - pytest --cov-branch --cov-report term --cov=cgp_seq_input_val --cov-fail-under=50 after_script: - ./cc-test-reporter after-build --exit-code $TRAVIS_TEST_RESULT diff --git a/CHANGES.md b/CHANGES.md new file mode 100644 index 0000000..d93c422 --- /dev/null +++ b/CHANGES.md @@ -0,0 +1,32 @@ +# CHANGES + +## 1.3.0 + +* Fixes `valid_q` value of `seq-valid` json report. Always returned false previously. +* Correct command line help text for `seq-valid` input option. +* Extends the `seq-valid` json report to include the ascii range detected and which types of +phred score these align to. +* Additional command line options for `seq-valid` + * `-o | --output` - Generates an interleaved gzip compressed fastq file when input is paired fastq + * `-q | --qc` - Specify the number of pairs to be used when assessing the phred quality range + * Added for performance reasons + +## 1.2.1 + +* More informative exceptions for low-level issues + +## 1.2.0 + +* Changes to command line, using sub commands now +* Changed test framework +* Add travis and codeclimate checks + +## 1.1.0 + +* Fixed issue of `json.loads()` in py<3.5 being unable to decode resource string to utf-8 automatically. +* Adds a UUID to the header of a processed manifest if incoming not set +* Improve documentation + +## 1.0.0 + +Early functional release diff --git a/README.md b/README.md index 0781734..acde45a 100644 --- a/README.md +++ b/README.md @@ -48,13 +48,24 @@ of: { "interleaved": false, "pairs": 722079, + "possible_encoding": [ + "Sanger", + "Illumina 1.8" + ], + "quality_ascii_range": [ + 37, + 67 + ], "valid_q": true } ``` +Optionally generates a new interleaved (gz) file when paired-fastq is the input. + Various exceptions can occur for malformed files. -The primary purpose is to confirm Sanger/Illumina 1.8+ quality scores. +The primary purpose is to confirm Sanger/Illumina 1.8+ quality scores. Further Information +on Phred encoding can be found [here](https://en.wikipedia.org/wiki/FASTQ_format#Encoding). #### FASTQ not BAM/CRAM @@ -70,6 +81,8 @@ Installation is via `pip`. Simply execute with the path to the packaged distrib ```bash pip install --find-links=~/wheels cgp_seq_input_val +# or +pip install https://github.com/cancerit/cgp_seq_input_val/archive/master.tar.gz ``` ### Package Dependancies @@ -78,6 +91,7 @@ pip install --find-links=~/wheels cgp_seq_input_val * [progressbar2](http://progressbar-2.readthedocs.io/en/latest/) * [xlrd](https://github.com/python-excel/xlrd) +* [xopen](https://github.com/marcelm/xopen) ## Development environment @@ -110,19 +124,18 @@ cd $PROJECTROOT hash virtualenv || pip3 install virtualenv virtualenv -p python3 env source env/bin/activate -pip install progressbar2 -pip install xlrd +pip install -r requirements.txt python setup.py develop # so bin scripts can find module + +## If changed requirements please run: +pip freeze | grep -v `echo ${PWD##*/}` > requirements.txt ``` For testing/coverage (`./run_tests.sh`) ``` source env/bin/activate # if not already in env -pip install pytest -pip install pytest-cov -pip install pep8 -pip install radon +pip install -r requirements.txt gem install --user-install mdl ``` diff --git a/cgp_seq_input_val/command_line.py b/cgp_seq_input_val/command_line.py index a6dcbbb..5f9d7d4 100644 --- a/cgp_seq_input_val/command_line.py +++ b/cgp_seq_input_val/command_line.py @@ -88,8 +88,19 @@ def main(): dest='input', metavar='FILE', nargs='+', - help='Input manifest in tsv formats', + help='Input FASTQ (optionally gzip compressed)', required=True) + parser_c.add_argument('-q', '--qc', + dest='qc', + type=int, + default=100000, + help='Assess phred quality scale using N pairs (0=all, slow)', + required=False) + parser_c.add_argument('-o', '--output', + dest='output', + type=str, + help='Output as interleaved FASTQ (ignored for interleaved input)', + required=False) parser_c.set_defaults(func=validate_seq_files) args = parser.parse_args() diff --git a/cgp_seq_input_val/seq_validator.py b/cgp_seq_input_val/seq_validator.py index 04156f1..0bf48e1 100644 --- a/cgp_seq_input_val/seq_validator.py +++ b/cgp_seq_input_val/seq_validator.py @@ -4,7 +4,8 @@ import os import sys -import gzip +import gzip # only used for reading +from xopen import xopen # only used for writing import json # progressbar2 @@ -14,18 +15,29 @@ from cgp_seq_input_val.error_classes import SeqValidationError from cgp_seq_input_val.fastq_read import FastqRead -prog_records = 100000 +# From: https://en.wikipedia.org/wiki/FASTQ_format#Encoding +Q_RANGES = {'Sanger': [33, 73], + 'Solexa': [59, 104], + 'Illumina 1.3': [64, 104], + 'Illumina 1.5': [67, 105], + 'Illumina 1.8': [33, 74]} + +PROG_RECORDS = 100000 def validate_seq_files(args): """ Top level entry point for validating sequence files. """ + out_fh = None try: file_2 = None if len(args.input) == 2: file_2 = args.input[1] - validator = SeqValidator(args.input[0], file_2) + if args.output: + out_fh = xopen(args.output, mode='wt') + + validator = SeqValidator(args.input[0], args.qc, out_fh=out_fh, file_b=file_2) validator.validate() validator.report(args.report) except SeqValidationError as ve: # runtime so no functions for message and errno @@ -33,6 +45,9 @@ def validate_seq_files(args): # have to catch 2 classes works 3.0-3.3, above 3.3 all IO issues are captured under OSError except (OSError, IOError) as err: sys.exit("ERROR (%d): %s - %s" % (err.errno, err.strerror, err.filename)) + finally: + if out_fh: + out_fh.close() class SeqValidator(object): @@ -45,18 +60,20 @@ class SeqValidator(object): progress_pairs - optional, how often to update progress bar [100,000] - set to 0 to disable """ - def __init__(self, file_a, file_b=None, progress_pairs=prog_records): + def __init__(self, file_a, qc_reads, file_b=None, out_fh=None, progress_pairs=PROG_RECORDS): self.progress_pairs = progress_pairs + self.qc_reads = qc_reads self.file_a = file_a self.file_b = file_b + self.out_fh = out_fh self.pairs = 0 # will use this to decide on path self.is_gzip = False # change open method for fastq # sam is not supported - # only the min value is actually needed to determine if scaling - # is Sanger or Illumina 1.8+ self.q_min = 1000 + self.q_max = -1 + self.encodings = [] self._prep() def __str__(self): @@ -65,6 +82,8 @@ def __str__(self): ret.append('file_b: '+str(self.file_b)) ret.append('is_gzip: '+str(self.is_gzip)) ret.append('q_min: '+str(self.q_min)) + ret.append('q_max: '+str(self.q_max)) + ret.append('encodings: '+str(self.encodings)) return '\n'.join(ret) def _prep(self): @@ -97,6 +116,15 @@ def validate(self): else: self.validate_paired() + def possible_encoding(self): + """ + Converts the ascii quality score range to something useful for debugging + """ + for encoding in Q_RANGES: + if(Q_RANGES[encoding][0] <= self.q_min <= Q_RANGES[encoding][1] and + Q_RANGES[encoding][0] <= self.q_max <= Q_RANGES[encoding][1]): + self.encodings.append(encoding) + def report(self, fp): """ Prints json report to the provided file-pointer @@ -104,9 +132,12 @@ def report(self, fp): Args: fp - file pointer """ + self.possible_encoding() report = {'pairs': self.pairs, - 'valid_q': self.q_min == 33, - 'interleaved': self.file_a == self.file_b} + 'valid_q': self.q_min >= 33 and self.q_max <= 74, + 'interleaved': self.file_a == self.file_b, + 'possible_encoding': self.encodings, + 'quality_ascii_range': [self.q_min, self.q_max]} json.dump(report, fp, sort_keys=True, indent=4) def validate_paired(self): @@ -146,7 +177,12 @@ def validate_paired(self): curr_line_b = read_2.last_line fqh_line_b = read_2.file_pos[1] - self.check_pair(read_1, read_2) + self.check_pair(read_1, read_2, self.qc_reads == 0 or pairs < self.qc_reads) + + if self.out_fh: + print(read_1, file=self.out_fh) + print(read_2, file=self.out_fh) + pairs += 1 if bar and pairs % prog_indic == 0: @@ -198,11 +234,12 @@ def validate_interleaved(self): # ensure line increments based on the last line read fqh_line = read_2.file_pos[1] - self.check_pair(read_1, read_2) + self.check_pair(read_1, read_2, self.qc_reads == 0 or pairs < self.qc_reads) pairs += 1 if bar and pairs % prog_indic == 0: bar.update(pairs/prog_indic) + if curr_line == '': break self.pairs = pairs @@ -211,19 +248,28 @@ def validate_interleaved(self): if fq_fh is not None and not fq_fh.closed: fq_fh.close() - def check_pair(self, read_1, read_2): + def qual_range(self, read): + """ + Finds the min and max ascii values from each quality encoding + """ + sorted_qual = list(map(ord, read.qual)) + sorted_qual.sort() # faster than sorted(list(...)) + if self.q_min > sorted_qual[0]: + self.q_min = sorted_qual[0] + + if self.q_max < sorted_qual[-1]: + self.q_max = sorted_qual[-1] + + def check_pair(self, read_1, read_2, check_qual): """ Compares a pair of reads Raises: SeqValidationError """ - if self.q_min > 33: - # once a min of 33 is achieved it must be sanger/Illumina 1.8+ - # may need occasional review. - q_min = min(map(ord, read_1.qual)) - if self.q_min > q_min: - self.q_min = q_min + if check_qual: + self.qual_range(read_1) + self.qual_range(read_2) if read_1.name != read_2.name: raise SeqValidationError("Fastq record name at line %d should be a \ diff --git a/requirements.txt b/requirements.txt new file mode 100644 index 0000000..8d503db --- /dev/null +++ b/requirements.txt @@ -0,0 +1,25 @@ +astroid==1.5.3 +bottle==0.12.13 +colorama==0.3.9 +coverage==4.4.1 +flake8==3.4.1 +flake8-polyfill==1.0.1 +isort==4.2.15 +lazy-object-proxy==1.3.1 +mando==0.6.4 +mccabe==0.6.1 +nose2==0.6.5 +pep8==1.7.0 +progressbar2==3.34.3 +py==1.4.34 +py-cpuinfo==3.3.0 +pycodestyle==2.3.1 +pyflakes==1.5.0 +pytest==3.2.3 +pytest-cov==2.5.1 +python-utils==2.2.0 +radon==2.1.1 +six==1.11.0 +wrapt==1.10.11 +xlrd==1.1.0 +xopen==0.3.2 diff --git a/setup.py b/setup.py index fc92761..96be014 100755 --- a/setup.py +++ b/setup.py @@ -9,10 +9,10 @@ 'url': 'https://github.com/cancerit/cgp_seq_input_val', 'download_url': '', 'author_email': 'cgphelp@sanger.ac.uk', - 'version': '1.2.1', + 'version': '1.3.0', 'python_requires': '>= 3.4', 'setup_requires': ['pytest'], - 'install_requires': ['progressbar2', 'xlrd'], + 'install_requires': ['progressbar2', 'xlrd', 'xopen'], 'packages': ['cgp_seq_input_val'], 'package_data': {'cgp_seq_input_val': ['config/*.json']}, 'entry_points': { diff --git a/tests/test_cgp_seq_input_val_seq_validator.py b/tests/test_cgp_seq_input_val_seq_validator.py index 9f18b56..87f3cc7 100644 --- a/tests/test_cgp_seq_input_val_seq_validator.py +++ b/tests/test_cgp_seq_input_val_seq_validator.py @@ -14,18 +14,18 @@ def teardown(): def test_seq_val_i_read_good(): fqi = os.path.join(test_dir, 'good_read_i.fq') - sv = SeqValidator(fqi, None, progress_pairs=1) + sv = SeqValidator(fqi, 1, out_fh=None, file_b=None, progress_pairs=1) sv.validate() def test_seq_val_p_read_good(): fq1 = os.path.join(test_dir, 'good_read_1.fq') fq2 = os.path.join(test_dir, 'good_read_2.fq') - sv = SeqValidator(fq1, fq2, progress_pairs=1) + sv = SeqValidator(fq1, 1, out_fh=None, file_b=fq2, progress_pairs=1) sv.validate() def test_seq_val_i_gz_read_good(): fqi = os.path.join(test_dir, 'good_read_i.fq.gz') - sv = SeqValidator(fqi, None, progress_pairs=0) + sv = SeqValidator(fqi, 1, out_fh=None, file_b=None, progress_pairs=0) sv.validate() t = str(sv) sv.report(sys.stdout) @@ -33,51 +33,51 @@ def test_seq_val_i_gz_read_good(): def test_seq_val_p_gz_read_good(): fq1 = os.path.join(test_dir, 'good_read_1.fq.gz') fq2 = os.path.join(test_dir, 'good_read_2.fq.gz') - sv = SeqValidator(fq1, fq2, progress_pairs=0) + sv = SeqValidator(fq1, 1, out_fh=None, file_b=fq2, progress_pairs=0) sv.validate() def test_seq_val_bad_file(): with pytest.raises(SeqValidationError) as e_info: fqi = os.path.join(test_dir, 'good_read_i.BAD') - sv = SeqValidator(fqi, None, progress_pairs=0) + sv = SeqValidator(fqi, 1, out_fh=None, file_b=None, progress_pairs=0) def test_seq_val_mismatch_ext(): with pytest.raises(SeqValidationError) as e_info: fq1 = os.path.join(test_dir, 'good_read_1.fq') fq2 = os.path.join(test_dir, 'good_read_2.fq.gz') - sv = SeqValidator(fq1, fq2, progress_pairs=0) + sv = SeqValidator(fq1, 1, out_fh=None, file_b=fq2, progress_pairs=0) def test_seq_val_more_read2(): with pytest.raises(SeqValidationError) as e_info: fq1 = os.path.join(test_dir, 'good_read_1.fq') fq2 = os.path.join(test_dir, '2_reads_2.fq') - sv = SeqValidator(fq1, fq2, progress_pairs=0) + sv = SeqValidator(fq1, 1, out_fh=None, file_b=fq2, progress_pairs=0) sv.validate() def test_seq_val_more_read1(): with pytest.raises(SeqValidationError) as e_info: fq1 = os.path.join(test_dir, '2_reads_1.fq') fq2 = os.path.join(test_dir, 'good_read_2.fq') - sv = SeqValidator(fq1, fq2, progress_pairs=0) + sv = SeqValidator(fq1, 1, out_fh=None, file_b=fq2, progress_pairs=0) sv.validate() def test_seq_val_r1_in_2(): with pytest.raises(SeqValidationError) as e_info: fq1 = os.path.join(test_dir, 'good_read_1.fq') fq2 = os.path.join(test_dir, 'r1_reads_in_2.fq') - sv = SeqValidator(fq1, fq2, progress_pairs=0) + sv = SeqValidator(fq1, 1, out_fh=None, file_b=fq2, progress_pairs=0) sv.validate() def test_seq_val_r2_in_1(): with pytest.raises(SeqValidationError) as e_info: fq1 = os.path.join(test_dir, 'good_read_2.fq') fq2 = os.path.join(test_dir, 'r2_reads_in_1.fq') - sv = SeqValidator(fq1, fq2, progress_pairs=0) + sv = SeqValidator(fq1, 1, out_fh=None, file_b=fq2, progress_pairs=0) sv.validate() def test_seq_val_fq_name(): with pytest.raises(SeqValidationError) as e_info: fq1 = os.path.join(test_dir, 'good_read_1.fq') fq2 = os.path.join(test_dir, 'diff_2.fq') - sv = SeqValidator(fq1, fq2, progress_pairs=0) + sv = SeqValidator(fq1, 1, out_fh=None, file_b=fq2, progress_pairs=0) sv.validate()