1.5.2

telatin · Jul 4, 2021 · 993eb71 · 993eb71
1 parent 77b210c
commit 993eb71
Show file tree

Hide file tree

Showing 8 changed files with 137 additions and 23 deletions.
diff --git a/docs/releases/v1.5.2.md b/docs/releases/v1.5.2.md
@@ -0,0 +1,33 @@
+# SeqFu v1.5.2
+
+### Changes
+
+* **seqfu cat** has new options to manipulate the sequence name (like `--append STRING`) and to add comments (like `--add-len`, `--add-gc`)
+
+### Splash screen
+```text
+SeqFu - Sequence Fastx Utilities
+version: 1.5.2
+
+  · count [cnt]         : count FASTA/FASTQ reads, pair-end aware
+  · deinterleave [dei]  : deinterleave FASTQ
+  · derep [der]         : feature-rich dereplication of FASTA/FASTQ files
+  · interleave [ilv]    : interleave FASTQ pair ends
+  · lanes [mrl]         : merge Illumina lanes
+  · metadata [met]      : print a table of FASTQ reads (mapping files)
+  · sort [srt]          : sort sequences by size (uniques)
+  · stats [st]          : statistics on sequence lengths
+
+  · cat                 : concatenate FASTA/FASTQ files
+  · grep                : select sequences with patterns
+  · head                : print first sequences
+  · rc                  : reverse complement strings or files
+  · tab                 : tabulate reads to TSV (and viceversa)
+  · tail                : view last sequences
+  · view                : view sequences with colored quality and oligo matches
+
+Add --help after each command to print usage
+
+```
+
+
diff --git a/docs/tools/cat.md b/docs/tools/cat.md
@@ -13,23 +13,37 @@ Concatenate multiple FASTA or FASTQ files.
 Options:
   -k, --skip SKIP        Print one sequence every SKIP [default: 0]
 
+Sequence name:
   -p, --prefix STRING    Rename sequences with prefix + incremental number
-  -s, --strip-comments   Remove comments
-  -z, --strip-name       Remove name
+  -z, --strip-name       Remove the original sequence name
   -b, --basename         Prepend file basename to the sequence name
+  -a, --append STRING    Append this string to the sequence name [default: ]
   --split CHAR           Split basename at this char [default: .]
   --part INT             After splitting the basename, take this part [default: 1]
-
+  --sep STRING           Sequence name fields separator [default: _]
+  --basename-sep STRING  Separate basename from the rest with this [default: _]
+
+Sequence comments:
+  -s, --strip-comments   Remove original sequence comments
+  --comment-sep CHAR     Comment separator [default:  ]
+  --add-len              Add 'len=LENGTH' to the comments
+  --add-initial-len      Add 'initial_len=LENGTH' to the comments
+  --add-gc               Add 'gc=%GC' to the comments
+  --add-initial-gc       Add 'initial_gc=%GC' to the comments
+  --add-name             Add 'original_name=INITIAL_NAME' to the comments
+
+Filtering:
   -m, --min-len INT      Discard sequences shorter than INT [default: 1]
   -x, --max-len INT      Discard sequences longer than INT, 0 to ignore [default: 0]
   --trim-front INT       Trim INT base from the start of the sequence [default: 0]
   --trim-tail INT        Trim INT base from the end of the sequence [default: 0]
   --truncate INT         Keep only the first INT bases, 0 to ignore  [default: 0]
+                         Negative values to print the last INT bases
 
+Output:
   --fasta                Force FASTA output
   --fastq                Force FASTQ output
   --list                 Output a list of sequence names 
-  --sep STRING           Sequence name fields separator [default: _]
   -q, --fastq-qual INT   FASTQ default quality [default: 33]
   -v, --verbose          Verbose output
   -h, --help             Show this help
@@ -43,5 +57,6 @@ Additionally, you can add _standard input_ to the list of input files.
 by adding `-`.
 
 ## Output
-It is possible to mix FASTA and FASTQ files, and by default the program will produce a mixed output. Using `--fasta` or `--fastq` will force a specific output format. For FASTA sequences  a default quality values will be used.
+It is possible to mix FASTA and FASTQ files, and by default the program will produce a mixed output. 
+Using `--fasta` or `--fastq` will force a specific output format. For FASTA sequences, a default quality value will be used.
 Using `--list` the simple list of records matching the criteria will be printed.
diff --git a/releases/changes.md b/releases/changes.md
@@ -1,2 +1,2 @@
-* **seqfu count** now multithreading and redesigned. The output format is identical but  the order of the records is not protected (use **seqfu count-legacy** if needed)
-* **seqfu cat** can print a list of sequences matching the criteria (`--list`)
+
+* **seqfu cat** has new options to manipulate the sequence name (like `--append STRING`) and to add comments (like `--add-len`, `--add-gc`)
diff --git a/releases/v1.5.0.md b/releases/v1.5.0.md
@@ -0,0 +1,2 @@
+* **seqfu count** is now multithreaded and redesigned. The output format is identical, but the order of the records is not preserved (use **seqfu count-legacy** if needed)
+* **seqfu cat** can print a list of sequences matching the criteria (`--list`)
diff --git a/releases/v1.5.2.splashes.json b/releases/v1.5.2.splashes.json
diff --git a/seqfu.nimble b/seqfu.nimble
@@ -1,5 +1,5 @@
 # Package
-version       = "1.5.0"
+version       = "1.5.2"
 author        = "Andrea Telatin"
 description   = "SeqFU command-line tools"
 license       = "MIT"

diff --git a/src/fastx_cat.nim b/src/fastx_cat.nim
@@ -14,23 +14,38 @@ Concatenate multiple FASTA or FASTQ files.
 Options:
   -k, --skip SKIP        Print one sequence every SKIP [default: 0]
 
+Sequence name:
   -p, --prefix STRING    Rename sequences with prefix + incremental number
-  -s, --strip-comments   Remove comments
-  -z, --strip-name       Remove name
+  -z, --strip-name       Remove the original sequence name
+  -a, --append STRING    Append this string to the sequence name [default: ]
+  --sep STRING           Sequence name fields separator [default: _]
+
   -b, --basename         Prepend file basename to the sequence name
   --split CHAR           Split basename at this char [default: .]
   --part INT             After splitting the basename, take this part [default: 1]
-
+  --basename-sep STRING  Separate basename from the rest with this [default: _]
+
+Sequence comments:
+  -s, --strip-comments   Remove original sequence comments
+  --comment-sep CHAR     Comment separator [default:  ]
+  --add-len              Add 'len=LENGTH' to the comments
+  --add-initial-len      Add 'original_len=LENGTH' to the comments
+  --add-gc               Add 'gc=%GC' to the comments
+  --add-initial-gc       Add 'original_gc=%GC' to the comments
+  --add-name             Add 'original_name=INITIAL_NAME' to the comments
+
+Filtering:
   -m, --min-len INT      Discard sequences shorter than INT [default: 1]
   -x, --max-len INT      Discard sequences longer than INT, 0 to ignore [default: 0]
   --trim-front INT       Trim INT base from the start of the sequence [default: 0]
   --trim-tail INT        Trim INT base from the end of the sequence [default: 0]
   --truncate INT         Keep only the first INT bases, 0 to ignore  [default: 0]
+                         Negative values to print the last INT bases
 
+Output:
   --fasta                Force FASTA output
   --fastq                Force FASTQ output
   --list                 Output a list of sequence names 
-  --sep STRING           Sequence name fields separator [default: _]
   -q, --fastq-qual INT   FASTQ default quality [default: 33]
   -v, --verbose          Verbose output
   -h, --help             Show this help
@@ -42,7 +57,13 @@ Options:
     forceFasta = args["--fasta"]
     forceFastq = args["--fastq"]
     defaultQual = parseInt($args["--fastq-qual"])
+
+    let
+      GC_DECIMAL_DIGITS = 2
+
     var
+      appendToName: string
+      appendSuffixToName: bool
       formatList: bool
       skip   : int
       prefix : string
@@ -54,8 +75,13 @@ Options:
       minSeqLen,maxSeqLen: int
       trimFront, trimEnd: int
       truncate: int
+      basenameSeparatorString: string
 
     try:
+      appendToName = $args["--append"]
+      appendSuffixToName = if len(appendToName) > 0: true
+                           else: false
+      basenameSeparatorString = $args["--basename-sep"]
       formatList = args["--list"]
       skip =  parseInt($args["--skip"])
       printBasename = args["--basename"] 
@@ -116,7 +142,18 @@ Options:
           # Print sequence
           currentPrintedSeqs += 1
 
-
+          let 
+            original_name = r.name
+
+          # Remove comments
+          if stripComments:
+            r.comment = ""
+
+          # Add comments if needed
+          if args["--add-initial-len"]:
+            r.comment &= $args["--comment-sep"] & "initial_len=" & $(len(r.seq))
+          if args["--add-initial-gc"]:
+            r.comment &= $args["--comment-sep"] & "initial_gc=" & get_gc(r.seq).formatFloat(ffDecimal, GC_DECIMAL_DIGITS)
 
           ## TRIM FRONT / TAIL
           if trimFront > 0 or trimEnd > 0:
@@ -129,6 +166,10 @@ Options:
             r.seq = r.seq[0 .. truncate-1]
             if len(r.qual) > 0:
               r.qual = r.qual[trimFront .. ^trimEnd]
+          elif truncate < 0:
+            r.seq = r.seq[^(truncate * -1) .. ^1]
+            if len(r.qual) > 0:
+              r.qual = r.qual[^(truncate * -1) .. ^1]
 
           ## DISCARD BY LEN           
           if len(r.seq) < minSeqLen or (maxSeqLen > 0 and len(r.seq) > maxSeqLen):
@@ -149,7 +190,7 @@ Options:
               bn = lastPathPart(filename).split(splitChar)[splitPart]
             else:
               bn = lastPathPart(filename)
-            newName =  bn & separator
+            newName =  bn & basenameSeparatorString
 
           # rename with prefix + counter
           if prefix != "" or printBasename:
@@ -163,20 +204,31 @@ Options:
             if not args["--strip-name"]:
               newName &= r.name
 
-          # Replace name if needed
 
-          if not args["--strip-comments"]:
+
+          # Append suffix to name
+          if appendSuffixToName:
+            newName &= appendToName
+
+          # Replace name if needed
+          r.name = newName
+
+          if not stripComments:
             newName &= "\t" & r.comment
 
-          r.name = newName
 
-          # Remove comments
-          if stripComments:
-            r.comment = ""
 
-          # Print output
 
-
+          ## COMMENTS AFTER TRIMMING
+          if args["--add-len"]:
+            r.comment &= $args["--comment-sep"] & "len=" & $len(r.seq)
+
+          if args["--add-gc"]:
+            r.comment &= $args["--comment-sep"] & "gc=" & get_gc(r.seq).formatFloat(ffDecimal, GC_DECIMAL_DIGITS)
+
+          if args["--add-name"]:
+            r.comment &= $args["--comment-sep"] & "original_name=" & original_name
+          # Print output
           if formatList:
             echo r.name
             continue

diff --git a/src/seqfu_utils.nim b/src/seqfu_utils.nim
@@ -85,7 +85,18 @@ proc printFastxRecord*(s: FastxRecord): string =
     ">" & seqName & "\n" & s.seq 
 
 
-
+proc get_gc*(s: string): float =
+  ## Return the GC fraction of the sequence `s`: the number of G/C
+  ## characters divided by the number of A/C/G/T/U characters.
+  ##
+  ## Matching is case-insensitive. Any other character (for example N)
+  ## is ignored and contributes to neither numerator nor denominator.
+  ##
+  ## Returns 0.0 when `s` holds none of the counted characters (empty or
+  ## all-N input), rather than dividing zero by zero and yielding NaN.
+  var
+    gc_count = 0
+    at_count = 0
+  # Both cases are matched directly, so no uppercase copy of `s` is needed.
+  for c in s:
+    case c
+    of 'G', 'C', 'g', 'c':
+      gc_count += 1
+    of 'A', 'T', 'U', 'a', 't', 'u':
+      at_count += 1
+    else:
+      discard
+
+  let counted = gc_count + at_count
+  if counted == 0:
+    # Guard the 0/0 case: nothing countable means no measurable GC content.
+    return 0.0
+  return float(gc_count) / float(counted)
 
 proc guessR2*(file_R1: string, pattern_R1="auto", pattern_R2="auto", verbose=false): string =
   if not fileExists(file_R1):
Original file line number	Diff line number	Diff line change
		@@ -0,0 +1,2 @@
		* seqfu count is now multithreaded and redesigned. The output format is identical, but the order of the records is not preserved (use seqfu count-legacy if needed)
		* seqfu cat can print a list of sequences matching the criteria (`--list`)