Merge pull request #9 from cancerit/feature/mergePaired

Modify report, add generation of interleaved fastq from paired
cancerit · Dec 7, 2017 · 1621fe0 · 1621fe0
2 parents b26e4f3 + 8a9b3e6
commit 1621fe0
Show file tree

Hide file tree

Showing 7 changed files with 131 additions and 34 deletions.
diff --git a/.travis.yml b/.travis.yml
@@ -16,14 +16,15 @@ install:
   - pip install pytest pytest-cov
   - pip install progressbar2
   - pip install xlrd
+  - pip install xopen
 
 before_script:
   - curl -L https://codeclimate.com/downloads/test-reporter/test-reporter-latest-linux-amd64 > ./cc-test-reporter
   - chmod +x ./cc-test-reporter
   - ./cc-test-reporter before-build
 
 script:
-  - pytest --cov-branch --cov-report term --cov=cgp_seq_input_val
+  - pytest --cov-branch --cov-report term --cov=cgp_seq_input_val --cov-fail-under=50
 
 after_script:
   - ./cc-test-reporter after-build --exit-code $TRAVIS_TEST_RESULT
diff --git a/README.md b/README.md
@@ -48,13 +48,24 @@ of:
 {
     "interleaved": false,
     "pairs": 722079,
+    "possible_encoding": [
+        "Sanger",
+        "Illumina 1.8"
+    ],
+    "quality_ascii_range": [
+        37,
+        67
+    ],
     "valid_q": true
 }
 ```
 
+Optionally generates a new interleaved (gz) file when paired-fastq is the input.
+
 Various exceptions can occur for malformed files.
 
-The primary purpose is to confirm Sanger/Illumina 1.8+ quality scores.
+The primary purpose is to confirm Sanger/Illumina 1.8+ quality scores.  Further Information
+on Phred encoding can be found [here](https://en.wikipedia.org/wiki/FASTQ_format#Encoding).
 
 #### FASTQ not BAM/CRAM
 
@@ -78,6 +89,7 @@ pip install --find-links=~/wheels cgp_seq_input_val
 
 * [progressbar2](http://progressbar-2.readthedocs.io/en/latest/)
 * [xlrd](https://github.com/python-excel/xlrd)
+* [xopen](https://github.com/marcelm/xopen)
 
 ## Development environment
 
@@ -110,9 +122,11 @@ cd $PROJECTROOT
 hash virtualenv || pip3 install virtualenv
 virtualenv -p python3 env
 source env/bin/activate
-pip install progressbar2
-pip install xlrd
+pip install -r requirements.txt
 python setup.py develop # so bin scripts can find module
+
+## If changed requirements please run:
+pip freeze | grep -v `echo ${PWD##*/}` > requirements.txt
 ```
 
 For testing/coverage (`./run_tests.sh`)

diff --git a/cgp_seq_input_val/command_line.py b/cgp_seq_input_val/command_line.py
@@ -88,8 +88,19 @@ def main():
                           dest='input',
                           metavar='FILE',
                           nargs='+',
-                          help='Input manifest in tsv formats',
+                          help='Input FASTQ (optionally gzip compressed)',
                           required=True)
+    parser_c.add_argument('-q', '--qc',
+                          dest='qc',
+                          type=int,
+                          default=100000,
+                          help='Assess phred quality scale using N pairs (0=all, slow)',
+                          required=False)
+    parser_c.add_argument('-o', '--output',
+                          dest='output',
+                          type=str,
+                          help='Output as interleaved FASTQ (ignored for interleaved input)',
+                          required=False)
     parser_c.set_defaults(func=validate_seq_files)
 
     args = parser.parse_args()

diff --git a/cgp_seq_input_val/seq_validator.py b/cgp_seq_input_val/seq_validator.py
@@ -4,7 +4,8 @@
 
 import os
 import sys
-import gzip
+import gzip  # only used for reading
+from xopen import xopen  # only used for writing
 import json
 
 # progressbar2
@@ -14,25 +15,39 @@
 from cgp_seq_input_val.error_classes import SeqValidationError
 from cgp_seq_input_val.fastq_read import FastqRead
 
-prog_records = 100000
+# From: https://en.wikipedia.org/wiki/FASTQ_format#Encoding
+Q_RANGES = {'Sanger': [33, 73],
+            'Solexa': [59, 104],
+            'Illumina 1.3': [64, 104],
+            'Illumina 1.5': [67, 105],
+            'Illumina 1.8': [33, 74]}
+
+PROG_RECORDS = 100000
 
 
 def validate_seq_files(args):
     """
     Top level entry point for validating sequence files.
     """
+    out_fh = None
     try:
         file_2 = None
         if len(args.input) == 2:
             file_2 = args.input[1]
-        validator = SeqValidator(args.input[0], file_2)
+            if args.output:
+                out_fh = xopen(args.output, mode='wt')
+
+        validator = SeqValidator(args.input[0], args.qc, out_fh=out_fh, file_b=file_2)
         validator.validate()
         validator.report(args.report)
     except SeqValidationError as ve:  # runtime so no functions for message and errno
         sys.exit("ERROR: " + str(ve))
     # have to catch 2 classes works 3.0-3.3, above 3.3 all IO issues are captured under OSError
     except (OSError, IOError) as err:
         sys.exit("ERROR (%d): %s - %s" % (err.errno, err.strerror, err.filename))
+    finally:
+        if out_fh:
+            out_fh.close()
 
 
 class SeqValidator(object):
@@ -45,18 +60,20 @@ class SeqValidator(object):
         progress_pairs - optional, how often to update progress bar [100,000]
                        - set to 0 to disable
     """
-    def __init__(self, file_a, file_b=None, progress_pairs=prog_records):
+    def __init__(self, file_a, qc_reads, file_b=None, out_fh=None, progress_pairs=PROG_RECORDS):
         self.progress_pairs = progress_pairs
+        self.qc_reads = qc_reads
         self.file_a = file_a
         self.file_b = file_b
+        self.out_fh = out_fh
         self.pairs = 0
         # will use this to decide on path
         self.is_gzip = False  # change open method for fastq
         # sam is not supported
 
-        # only the min value is actually needed to determine if scaling
-        # is Sanger or Illumina 1.8+
         self.q_min = 1000
+        self.q_max = -1
+        self.encodings = []
         self._prep()
 
     def __str__(self):
@@ -65,6 +82,8 @@ def __str__(self):
         ret.append('file_b: '+str(self.file_b))
         ret.append('is_gzip: '+str(self.is_gzip))
         ret.append('q_min: '+str(self.q_min))
+        ret.append('q_max: '+str(self.q_max))
+        ret.append('encodings: '+str(self.encodings))
         return '\n'.join(ret)
 
     def _prep(self):
@@ -97,16 +116,28 @@ def validate(self):
         else:
             self.validate_paired()
 
+    def possible_encoding(self):
+        """
+        Converts the ascii quality score range to something useful for debugging
+        """
+        for encoding in Q_RANGES:
+            if(Q_RANGES[encoding][0] <= self.q_min <= Q_RANGES[encoding][1] and
+               Q_RANGES[encoding][0] <= self.q_max <= Q_RANGES[encoding][1]):
+                self.encodings.append(encoding)
+
     def report(self, fp):
         """
         Prints json report to the provided file-pointer
 
         Args:
             fp - file pointer
         """
+        self.possible_encoding()
         report = {'pairs': self.pairs,
-                  'valid_q': self.q_min == 33,
-                  'interleaved': self.file_a == self.file_b}
+                  'valid_q': self.q_min >= 33 and self.q_max <= 74,
+                  'interleaved': self.file_a == self.file_b,
+                  'possible_encoding': self.encodings,
+                  'quality_ascii_range': [self.q_min, self.q_max]}
         json.dump(report, fp, sort_keys=True, indent=4)
 
     def validate_paired(self):
@@ -146,7 +177,12 @@ def validate_paired(self):
                 curr_line_b = read_2.last_line
                 fqh_line_b = read_2.file_pos[1]
 
-                self.check_pair(read_1, read_2)
+                self.check_pair(read_1, read_2, self.qc_reads == 0 or pairs < self.qc_reads)
+
+                if self.out_fh:
+                    print(read_1, file=self.out_fh)
+                    print(read_2, file=self.out_fh)
+
                 pairs += 1
 
                 if bar and pairs % prog_indic == 0:
@@ -198,11 +234,12 @@ def validate_interleaved(self):
                 # ensure line increments based on the last line read
                 fqh_line = read_2.file_pos[1]
 
-                self.check_pair(read_1, read_2)
+                self.check_pair(read_1, read_2, self.qc_reads == 0 or pairs < self.qc_reads)
                 pairs += 1
 
                 if bar and pairs % prog_indic == 0:
                     bar.update(pairs/prog_indic)
+
                 if curr_line == '':
                     break
             self.pairs = pairs
@@ -211,19 +248,28 @@ def validate_interleaved(self):
             if fq_fh is not None and not fq_fh.closed:
                 fq_fh.close()
 
-    def check_pair(self, read_1, read_2):
+    def qual_range(self, read):
+        """
+        Finds the min and max ascii values from each quality encoding
+        """
+        sorted_qual = list(map(ord, read.qual))
+        sorted_qual.sort()  # faster than sorted(list(...))
+        if self.q_min > sorted_qual[0]:
+            self.q_min = sorted_qual[0]
+
+        if self.q_max < sorted_qual[-1]:
+            self.q_max = sorted_qual[-1]
+
+    def check_pair(self, read_1, read_2, check_qual):
         """
         Compares a pair of reads
 
         Raises:
             SeqValidationError
         """
-        if self.q_min > 33:
-            # once a min of 33 is achieved it must be sanger/Illumina 1.8+
-            # may need occasional review.
-            q_min = min(map(ord, read_1.qual))
-            if self.q_min > q_min:
-                self.q_min = q_min
+        if check_qual:
+            self.qual_range(read_1)
+            self.qual_range(read_2)
 
         if read_1.name != read_2.name:
             raise SeqValidationError("Fastq record name at line %d should be a \

diff --git a/requirements.txt b/requirements.txt
@@ -0,0 +1,25 @@
+astroid==1.5.3
+bottle==0.12.13
+colorama==0.3.9
+coverage==4.4.1
+flake8==3.4.1
+flake8-polyfill==1.0.1
+isort==4.2.15
+lazy-object-proxy==1.3.1
+mando==0.6.4
+mccabe==0.6.1
+nose2==0.6.5
+pep8==1.7.0
+progressbar2==3.34.3
+py==1.4.34
+py-cpuinfo==3.3.0
+pycodestyle==2.3.1
+pyflakes==1.5.0
+pytest==3.2.3
+pytest-cov==2.5.1
+python-utils==2.2.0
+radon==2.1.1
+six==1.11.0
+wrapt==1.10.11
+xlrd==1.1.0
+xopen==0.3.2
diff --git a/setup.py b/setup.py
@@ -12,7 +12,7 @@
     'version': '1.2.1',
     'python_requires': '>= 3.4',
     'setup_requires': ['pytest'],
-    'install_requires': ['progressbar2', 'xlrd'],
+    'install_requires': ['progressbar2', 'xlrd', 'xopen'],
     'packages': ['cgp_seq_input_val'],
     'package_data': {'cgp_seq_input_val': ['config/*.json']},
     'entry_points': {

diff --git a/tests/test_cgp_seq_input_val_seq_validator.py b/tests/test_cgp_seq_input_val_seq_validator.py
@@ -14,70 +14,70 @@ def teardown():
 
 def test_seq_val_i_read_good():
     fqi = os.path.join(test_dir, 'good_read_i.fq')
-    sv = SeqValidator(fqi, None, progress_pairs=1)
+    sv = SeqValidator(fqi, 1, out_fh=None, file_b=None, progress_pairs=1)
     sv.validate()
 
 def test_seq_val_p_read_good():
     fq1 = os.path.join(test_dir, 'good_read_1.fq')
     fq2 = os.path.join(test_dir, 'good_read_2.fq')
-    sv = SeqValidator(fq1, fq2, progress_pairs=1)
+    sv = SeqValidator(fq1, 1, out_fh=None, file_b=fq2, progress_pairs=1)
     sv.validate()
 
 def test_seq_val_i_gz_read_good():
     fqi = os.path.join(test_dir, 'good_read_i.fq.gz')
-    sv = SeqValidator(fqi, None, progress_pairs=0)
+    sv = SeqValidator(fqi, 1, out_fh=None, file_b=None, progress_pairs=0)
     sv.validate()
     t = str(sv)
     sv.report(sys.stdout)
 
 def test_seq_val_p_gz_read_good():
     fq1 = os.path.join(test_dir, 'good_read_1.fq.gz')
     fq2 = os.path.join(test_dir, 'good_read_2.fq.gz')
-    sv = SeqValidator(fq1, fq2, progress_pairs=0)
+    sv = SeqValidator(fq1, 1, out_fh=None, file_b=fq2, progress_pairs=0)
     sv.validate()
 
 def test_seq_val_bad_file():
     with pytest.raises(SeqValidationError) as e_info:
         fqi = os.path.join(test_dir, 'good_read_i.BAD')
-        sv = SeqValidator(fqi, None, progress_pairs=0)
+        sv = SeqValidator(fqi, 1, out_fh=None, file_b=None, progress_pairs=0)
 
 def test_seq_val_mismatch_ext():
     with pytest.raises(SeqValidationError) as e_info:
         fq1 = os.path.join(test_dir, 'good_read_1.fq')
         fq2 = os.path.join(test_dir, 'good_read_2.fq.gz')
-        sv = SeqValidator(fq1, fq2, progress_pairs=0)
+        sv = SeqValidator(fq1, 1, out_fh=None, file_b=fq2, progress_pairs=0)
 
 def test_seq_val_more_read2():
     with pytest.raises(SeqValidationError) as e_info:
         fq1 = os.path.join(test_dir, 'good_read_1.fq')
         fq2 = os.path.join(test_dir, '2_reads_2.fq')
-        sv = SeqValidator(fq1, fq2, progress_pairs=0)
+        sv = SeqValidator(fq1, 1, out_fh=None, file_b=fq2, progress_pairs=0)
         sv.validate()
 
 def test_seq_val_more_read1():
     with pytest.raises(SeqValidationError) as e_info:
         fq1 = os.path.join(test_dir, '2_reads_1.fq')
         fq2 = os.path.join(test_dir, 'good_read_2.fq')
-        sv = SeqValidator(fq1, fq2, progress_pairs=0)
+        sv = SeqValidator(fq1, 1, out_fh=None, file_b=fq2, progress_pairs=0)
         sv.validate()
 
 def test_seq_val_r1_in_2():
     with pytest.raises(SeqValidationError) as e_info:
         fq1 = os.path.join(test_dir, 'good_read_1.fq')
         fq2 = os.path.join(test_dir, 'r1_reads_in_2.fq')
-        sv = SeqValidator(fq1, fq2, progress_pairs=0)
+        sv = SeqValidator(fq1, 1, out_fh=None, file_b=fq2, progress_pairs=0)
         sv.validate()
 
 def test_seq_val_r2_in_1():
     with pytest.raises(SeqValidationError) as e_info:
         fq1 = os.path.join(test_dir, 'good_read_2.fq')
         fq2 = os.path.join(test_dir, 'r2_reads_in_1.fq')
-        sv = SeqValidator(fq1, fq2, progress_pairs=0)
+        sv = SeqValidator(fq1, 1, out_fh=None, file_b=fq2, progress_pairs=0)
         sv.validate()
 
 def test_seq_val_fq_name():
     with pytest.raises(SeqValidationError) as e_info:
         fq1 = os.path.join(test_dir, 'good_read_1.fq')
         fq2 = os.path.join(test_dir, 'diff_2.fq')
-        sv = SeqValidator(fq1, fq2, progress_pairs=0)
+        sv = SeqValidator(fq1, 1, out_fh=None, file_b=fq2, progress_pairs=0)
         sv.validate()