Merge branch 'release/1.3.0'

cancerit · Dec 12, 2017 · 6c37df8 · 6c37df8
2 parents dc8470e + fd16cd7
commit 6c37df8
Show file tree

Hide file tree

Showing 8 changed files with 167 additions and 39 deletions.
diff --git a/.travis.yml b/.travis.yml
@@ -16,14 +16,15 @@ install:
   - pip install pytest pytest-cov
   - pip install progressbar2
   - pip install xlrd
+  - pip install xopen
 
 before_script:
   - curl -L https://codeclimate.com/downloads/test-reporter/test-reporter-latest-linux-amd64 > ./cc-test-reporter
   - chmod +x ./cc-test-reporter
   - ./cc-test-reporter before-build
 
 script:
-  - pytest --cov-branch --cov-report term --cov=cgp_seq_input_val
+  - pytest --cov-branch --cov-report term --cov=cgp_seq_input_val --cov-fail-under=50
 
 after_script:
   - ./cc-test-reporter after-build --exit-code $TRAVIS_TEST_RESULT
diff --git a/CHANGES.md b/CHANGES.md
@@ -0,0 +1,32 @@
+# CHANGES
+
+## 1.3.0
+
+* Fixes `valid_q` value of `seq-valid` json report.  Always returned false previously.
+* Correct command line help text for `seq-valid` input option.
+* Extends the `seq-valid` JSON report to include the ASCII range detected and the Phred
+encodings that this range is compatible with.
+* Additional command line options for `seq-valid`
+  * `-o | --output` - Generates an interleaved gzip compressed fastq file when input is paired fastq
+  * `-q | --qc` - Specify the number of pairs to be used when assessing the phred quality range
+    * Added for performance reasons
+
+## 1.2.1
+
+* More informative exceptions for low-level issues
+
+## 1.2.0
+
+* Changes to command line, using sub commands now
+* Changed test framework
+* Add travis and codeclimate checks
+
+## 1.1.0
+
+* Fixed issue of `json.loads()` in py<3.5 being unable to decode resource string to utf-8 automatically.
+* Adds a UUID to the header of a processed manifest if incoming not set
+* Improve documentation
+
+## 1.0.0
+
+Early functional release
diff --git a/README.md b/README.md
@@ -48,13 +48,24 @@ of:
 {
     "interleaved": false,
     "pairs": 722079,
+    "possible_encoding": [
+        "Sanger",
+        "Illumina 1.8"
+    ],
+    "quality_ascii_range": [
+        37,
+        67
+    ],
     "valid_q": true
 }
 ```
 
+Optionally generates a new interleaved (gz) file when paired-fastq is the input.
+
 Various exceptions can occur for malformed files.
 
-The primary purpose is to confirm Sanger/Illumina 1.8+ quality scores.
+The primary purpose is to confirm Sanger/Illumina 1.8+ quality scores.  Further information
+on Phred encoding can be found [here](https://en.wikipedia.org/wiki/FASTQ_format#Encoding).
 
 #### FASTQ not BAM/CRAM
 
@@ -70,6 +81,8 @@ Installation is via `pip`.  Simply execute with the path to the packaged distrib
 
 ```bash
 pip install --find-links=~/wheels cgp_seq_input_val
+# or
+pip install https://github.com/cancerit/cgp_seq_input_val/archive/master.tar.gz
 ```
 
 ### Package Dependencies
@@ -78,6 +91,7 @@ pip install --find-links=~/wheels cgp_seq_input_val
 
 * [progressbar2](http://progressbar-2.readthedocs.io/en/latest/)
 * [xlrd](https://github.com/python-excel/xlrd)
+* [xopen](https://github.com/marcelm/xopen)
 
 ## Development environment
 
@@ -110,19 +124,18 @@ cd $PROJECTROOT
 hash virtualenv || pip3 install virtualenv
 virtualenv -p python3 env
 source env/bin/activate
-pip install progressbar2
-pip install xlrd
+pip install -r requirements.txt
 python setup.py develop # so bin scripts can find module
+
+## If the requirements have changed, please run:
+pip freeze | grep -v `echo ${PWD##*/}` > requirements.txt
 ```
 
 For testing/coverage (`./run_tests.sh`)
 
 ```
 source env/bin/activate # if not already in env
-pip install pytest
-pip install pytest-cov
-pip install pep8
-pip install radon
+pip install -r requirements.txt
 gem install --user-install mdl
 ```
 

diff --git a/cgp_seq_input_val/command_line.py b/cgp_seq_input_val/command_line.py
@@ -88,8 +88,19 @@ def main():
                           dest='input',
                           metavar='FILE',
                           nargs='+',
-                          help='Input manifest in tsv formats',
+                          help='Input FASTQ (optionally gzip compressed)',
                           required=True)
+    parser_c.add_argument('-q', '--qc',
+                          dest='qc',
+                          type=int,
+                          default=100000,
+                          help='Assess phred quality scale using N pairs (0=all, slow)',
+                          required=False)
+    parser_c.add_argument('-o', '--output',
+                          dest='output',
+                          type=str,
+                          help='Output as interleaved FASTQ (ignored for interleaved input)',
+                          required=False)
     parser_c.set_defaults(func=validate_seq_files)
 
     args = parser.parse_args()

diff --git a/cgp_seq_input_val/seq_validator.py b/cgp_seq_input_val/seq_validator.py
@@ -4,7 +4,8 @@
 
 import os
 import sys
-import gzip
+import gzip  # only used for reading
+from xopen import xopen  # only used for writing
 import json
 
 # progressbar2
@@ -14,25 +15,39 @@
 from cgp_seq_input_val.error_classes import SeqValidationError
 from cgp_seq_input_val.fastq_read import FastqRead
 
-prog_records = 100000
+# From: https://en.wikipedia.org/wiki/FASTQ_format#Encoding
+Q_RANGES = {'Sanger': [33, 73],
+            'Solexa': [59, 104],
+            'Illumina 1.3': [64, 104],
+            'Illumina 1.5': [67, 105],
+            'Illumina 1.8': [33, 74]}
+
+PROG_RECORDS = 100000
 
 
 def validate_seq_files(args):
     """
     Top level entry point for validating sequence files.
     """
+    out_fh = None
     try:
         file_2 = None
         if len(args.input) == 2:
             file_2 = args.input[1]
-        validator = SeqValidator(args.input[0], file_2)
+            if args.output:
+                out_fh = xopen(args.output, mode='wt')
+
+        validator = SeqValidator(args.input[0], args.qc, out_fh=out_fh, file_b=file_2)
         validator.validate()
         validator.report(args.report)
     except SeqValidationError as ve:  # runtime so no functions for message and errno
         sys.exit("ERROR: " + str(ve))
     # have to catch 2 classes works 3.0-3.3, above 3.3 all IO issues are captured under OSError
     except (OSError, IOError) as err:
         sys.exit("ERROR (%d): %s - %s" % (err.errno, err.strerror, err.filename))
+    finally:
+        if out_fh:
+            out_fh.close()
 
 
 class SeqValidator(object):
@@ -45,18 +60,20 @@ class SeqValidator(object):
         progress_pairs - optional, how often to update progress bar [100,000]
                        - set to 0 to disable
     """
-    def __init__(self, file_a, file_b=None, progress_pairs=prog_records):
+    def __init__(self, file_a, qc_reads, file_b=None, out_fh=None, progress_pairs=PROG_RECORDS):
         self.progress_pairs = progress_pairs
+        self.qc_reads = qc_reads
         self.file_a = file_a
         self.file_b = file_b
+        self.out_fh = out_fh
         self.pairs = 0
         # will use this to decide on path
         self.is_gzip = False  # change open method for fastq
         # sam is not supported
 
-        # only the min value is actually needed to determine if scaling
-        # is Sanger or Illumina 1.8+
         self.q_min = 1000
+        self.q_max = -1
+        self.encodings = []
         self._prep()
 
     def __str__(self):
@@ -65,6 +82,8 @@ def __str__(self):
         ret.append('file_b: '+str(self.file_b))
         ret.append('is_gzip: '+str(self.is_gzip))
         ret.append('q_min: '+str(self.q_min))
+        ret.append('q_max: '+str(self.q_max))
+        ret.append('encodings: '+str(self.encodings))
         return '\n'.join(ret)
 
     def _prep(self):
@@ -97,16 +116,28 @@ def validate(self):
         else:
             self.validate_paired()
 
+    def possible_encoding(self):
+        """
+        Converts the ascii quality score range to something useful for debugging
+        """
+        for encoding in Q_RANGES:
+            if(Q_RANGES[encoding][0] <= self.q_min <= Q_RANGES[encoding][1] and
+               Q_RANGES[encoding][0] <= self.q_max <= Q_RANGES[encoding][1]):
+                self.encodings.append(encoding)
+
     def report(self, fp):
         """
         Prints json report to the provided file-pointer
 
         Args:
             fp - file pointer
         """
+        self.possible_encoding()
         report = {'pairs': self.pairs,
-                  'valid_q': self.q_min == 33,
-                  'interleaved': self.file_a == self.file_b}
+                  'valid_q': self.q_min >= 33 and self.q_max <= 74,
+                  'interleaved': self.file_a == self.file_b,
+                  'possible_encoding': self.encodings,
+                  'quality_ascii_range': [self.q_min, self.q_max]}
         json.dump(report, fp, sort_keys=True, indent=4)
 
     def validate_paired(self):
@@ -146,7 +177,12 @@ def validate_paired(self):
                 curr_line_b = read_2.last_line
                 fqh_line_b = read_2.file_pos[1]
 
-                self.check_pair(read_1, read_2)
+                self.check_pair(read_1, read_2, self.qc_reads == 0 or pairs < self.qc_reads)
+
+                if self.out_fh:
+                    print(read_1, file=self.out_fh)
+                    print(read_2, file=self.out_fh)
+
                 pairs += 1
 
                 if bar and pairs % prog_indic == 0:
@@ -198,11 +234,12 @@ def validate_interleaved(self):
                 # ensure line increments based on the last line read
                 fqh_line = read_2.file_pos[1]
 
-                self.check_pair(read_1, read_2)
+                self.check_pair(read_1, read_2, self.qc_reads == 0 or pairs < self.qc_reads)
                 pairs += 1
 
                 if bar and pairs % prog_indic == 0:
                     bar.update(pairs/prog_indic)
+
                 if curr_line == '':
                     break
             self.pairs = pairs
@@ -211,19 +248,28 @@ def validate_interleaved(self):
             if fq_fh is not None and not fq_fh.closed:
                 fq_fh.close()
 
-    def check_pair(self, read_1, read_2):
+    def qual_range(self, read):
+        """
+        Finds the min and max ascii values from each quality encoding
+        """
+        sorted_qual = list(map(ord, read.qual))
+        sorted_qual.sort()  # faster than sorted(list(...))
+        if self.q_min > sorted_qual[0]:
+            self.q_min = sorted_qual[0]
+
+        if self.q_max < sorted_qual[-1]:
+            self.q_max = sorted_qual[-1]
+
+    def check_pair(self, read_1, read_2, check_qual):
         """
         Compares a pair of reads
 
         Raises:
             SeqValidationError
         """
-        if self.q_min > 33:
-            # once a min of 33 is achieved it must be sanger/Illumina 1.8+
-            # may need occasional review.
-            q_min = min(map(ord, read_1.qual))
-            if self.q_min > q_min:
-                self.q_min = q_min
+        if check_qual:
+            self.qual_range(read_1)
+            self.qual_range(read_2)
 
         if read_1.name != read_2.name:
             raise SeqValidationError("Fastq record name at line %d should be a \

diff --git a/requirements.txt b/requirements.txt
@@ -0,0 +1,25 @@
+astroid==1.5.3
+bottle==0.12.13
+colorama==0.3.9
+coverage==4.4.1
+flake8==3.4.1
+flake8-polyfill==1.0.1
+isort==4.2.15
+lazy-object-proxy==1.3.1
+mando==0.6.4
+mccabe==0.6.1
+nose2==0.6.5
+pep8==1.7.0
+progressbar2==3.34.3
+py==1.4.34
+py-cpuinfo==3.3.0
+pycodestyle==2.3.1
+pyflakes==1.5.0
+pytest==3.2.3
+pytest-cov==2.5.1
+python-utils==2.2.0
+radon==2.1.1
+six==1.11.0
+wrapt==1.10.11
+xlrd==1.1.0
+xopen==0.3.2
diff --git a/setup.py b/setup.py
@@ -9,10 +9,10 @@
     'url': 'https://github.com/cancerit/cgp_seq_input_val',
     'download_url': '',
     'author_email': '[email protected]',
-    'version': '1.2.1',
+    'version': '1.3.0',
     'python_requires': '>= 3.4',
     'setup_requires': ['pytest'],
-    'install_requires': ['progressbar2', 'xlrd'],
+    'install_requires': ['progressbar2', 'xlrd', 'xopen'],
     'packages': ['cgp_seq_input_val'],
     'package_data': {'cgp_seq_input_val': ['config/*.json']},
     'entry_points': {