Skip to content

Commit

Permalink
Add depth parameter to fastgFish()
Browse files Browse the repository at this point in the history
 * User-specified number of iterations for Fastg fishing
 * Path to perl script can now be specified by user in function call
  • Loading branch information
kbseah committed Feb 25, 2016
1 parent b4f23a7 commit 58cf8f2
Show file tree
Hide file tree
Showing 7 changed files with 64 additions and 27 deletions.
Binary file added R_source_package/gbtools_2.5.4.tar.gz
Binary file not shown.
65 changes: 47 additions & 18 deletions accessory_scripts/fastg_paths_fishing.pl
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,7 @@

# SPAdes-style Fastg and Scaffold.paths parsing tool
# Brandon Seah ([email protected])
# 2016-02-23
# 2016-02-25

# Given output from SPAdes:
# Fastg assembly graph,
Expand All @@ -21,6 +21,7 @@
my $version="2016-02-23";
my $fastg_file;
my $paths_file;
my $iter_depth = 0; # How many fishing iterations (0 = no limit)
my $iter_count = 0; # Counter for iterations of bait fishing
my $out="fastg_fishing"; # Output file prefix
my $bait_file;
Expand All @@ -35,10 +36,12 @@
my %fished_nodes_hash; # Hash of nodes corresponding to fished edges

## Usage options #################################

if (! @ARGV) { usage(); } # Print usage statement if no arguments
GetOptions (
"fastg|g=s" =>\$fastg_file, # Fastg file of assembly graph
"paths|p=s" =>\$paths_file, # File scaffold.paths or contigs.paths
"paths|p=s" =>\$paths_file, # File scaffold.paths or contigs.paths
"iter|i=i" =>\$iter_depth, # Number of iterations
"output|o=s" =>\$out, # Output prefix
"bait|b=s" =>\$bait_file, # List of scaffold names to fish
"rflag|r" =>\$rflag # Report to stdout if called from R
Expand All @@ -63,7 +66,7 @@
print $outlog_fh "Number of corresponding bait edges: ". scalar @bait_edges_array. "\n";

read_fastg();
perform_fishing_edges();
perform_fishing_edges($iter_depth);
translate_fished_edges_to_nodes();

if ($rflag == 0) { # Save list of fished contigs to file by default
Expand Down Expand Up @@ -99,16 +102,19 @@ sub usage {
print STDERR "These files are produced by SPAdes 3.6.2 onwards \n\n";
print STDERR "Usage: perl $0 \\ \n";
print STDERR "\t -g assembly_graph.fastg \\ \n";
print STDERR "\t -p scaffols.paths \\ \n";
print STDERR "\t -p scaffolds.paths \\ \n";
print STDERR "\t -o output_prefix \\ \n";
print STDERR "\t -b list_of_bait_scaffolds \\ \n";
print STDERR "\t -i [number of fishing iterations] \n";
print STDERR "\t -r [Flag if called from within R] \n";
print STDERR "\n";
print STDERR "-i 0 means fishing iterates until no more contigs retrieved\n\n";
print STDERR "Output: Logfile (prefix.log) and list of fished scaffolds (prefix.scafflist)\n\n";
exit;
}

sub translate_fished_edges_to_nodes { # Convert list of fished edges to list of nodes
sub translate_fished_edges_to_nodes {
# Convert list of fished edges to list of nodes
foreach my $theedge (keys %edge_fishing_hash) {
foreach my $thenode (@{$node_edge_hash{$theedge}}) {
$fished_nodes_hash{$thenode}++;
Expand All @@ -117,26 +123,42 @@ sub translate_fished_edges_to_nodes { # Convert list of fished edges to list of
}

sub perform_fishing_edges {
my $depth = shift @_; # Iteration depth
if ($depth == 0) { # I know this is inelegant
$iter_depth = 2;
}
my $init_count=0; # Counter for no. of fished contigs
my $curr_count=1; # Second counter
while ($init_count != $curr_count) { # Loop until iteratively complete
while ($iter_count <= $iter_depth-1 # Loop until iteratively complete
&& $init_count != $curr_count) { # or when fishing is exhausted
$iter_count++; # Iteration counter
$init_count=0; # Reset counters for each iterative step
$curr_count=0;
foreach my $fishedge (keys %edge_fishing_hash) { # Count edges marked as bait
if (defined $edge_fishing_hash{$fishedge} && $edge_fishing_hash{$fishedge} == 1) {
if ($depth == 0) { # If exhaustive looping,
$iter_depth++; # keep shifting goalposts
}
# Count edges marked as bait
foreach my $fishedge (keys %edge_fishing_hash) {
if (defined $edge_fishing_hash{$fishedge}
&& $edge_fishing_hash{$fishedge} == 1) {
$init_count++;
}
}
foreach my $fastg_entry (keys %fastg_hash) { # For all Fastg entries
if (defined $edge_fishing_hash{$fastg_entry} && $edge_fishing_hash{$fastg_entry} == 1) { # If edge is listed as a bait
# For all Fastg entries
foreach my $fastg_entry (keys %fastg_hash) {
# If edge is listed as a bait
if (defined $edge_fishing_hash{$fastg_entry}
&& $edge_fishing_hash{$fastg_entry} == 1) {
foreach my $connected_edges (@{$fastg_hash{$fastg_entry}}) {
$edge_fishing_hash{$connected_edges} = 1; # Mark those connected edges as bait
# Mark those connected edges as bait
$edge_fishing_hash{$connected_edges} = 1;
}
}
}
foreach my $fishedge (keys %edge_fishing_hash) { # Count edges marked as bait after fishing
if (defined $edge_fishing_hash {$fishedge} && $edge_fishing_hash {$fishedge} == 1) {
# Count edges marked as bait after fishing
foreach my $fishedge (keys %edge_fishing_hash) {
if (defined $edge_fishing_hash {$fishedge}
&& $edge_fishing_hash {$fishedge} == 1) {
$curr_count++;
}
}
Expand All @@ -148,10 +170,12 @@ sub read_fastg {
open(FASTGIN, "<", $fastg_file) or die ("$!\n"); # Read in Fastg file
while (<FASTGIN>) {
chomp;
if ($_ =~ m/^>EDGE_(\d+)_.*:(.+);$/) { # If a Fastg header line
# If a Fastg header line
if ($_ =~ m/^>EDGE_(\d+)_.*:(.+);$/) {
my $current_node = $1;
my $conn_line = $2;
$conn_line =~ s/'//g; # Remove inverted commas, which mark revcomp
# Remove inverted commas, which mark revcomp
$conn_line =~ s/'//g;
my @conn_line_array = split ",", $conn_line;
foreach my $the_header (@conn_line_array) {
if ($the_header =~ m/EDGE_(\d+)_/) {
Expand Down Expand Up @@ -202,10 +226,14 @@ sub hash_nodes_edges {
while (<PATHSIN>) {
chomp;
my $full_header = $_;
if ($full_header =~ m/NODE.*'$/) { # If header for reversed scaffold, ignore
# If header for reversed scaffold, ignore
if ($full_header =~ m/NODE.*'$/) {
$skip_flag = 1;
next;
} elsif ($full_header =~ m/NODE_(\d+)_.*\d+$/) { # If header for forward scaffold...

}
# If header for forward scaffold...
elsif ($full_header =~ m/NODE_(\d+)_.*\d+$/) {
$skip_flag = 0; # Turn off skip flag
$current_node = $1;
$scaffolds_fullnames_hash{$current_node} = $full_header; # Save full header name
Expand All @@ -215,7 +243,8 @@ sub hash_nodes_edges {
next;
} else {
my $edge_line = $_;
$edge_line =~ s/[-\+;]//g; # Remove unneeded chars, only interested in edge IDs
# Remove unneeded chars, only interested in edge IDs
$edge_line =~ s/[-\+;]//g;
my @edge_split = split ",", $edge_line;
foreach my $the_edge (@edge_split) {
push @{$node_edge_hash{$the_edge}}, $current_node; # Hash nodes with edges as key
Expand Down
2 changes: 1 addition & 1 deletion gbtools/DESCRIPTION
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
Package: gbtools
Type: Package
Version: 2.5.3
Version: 2.5.4
Title: Interactive visualization of metagenome assemblies in R
Date: 2016-02-25
Author: Brandon Seah
Expand Down
2 changes: 1 addition & 1 deletion gbtools/R/fastgFish.R
Original file line number Diff line number Diff line change
Expand Up @@ -13,4 +13,4 @@
#' @return Object of class gbtbin
#' @export

fastgFish <- function (x, bin, fastg.file, paths.file, save, file) UseMethod ("fastgFish") # Define generic for fastgFish function
fastgFish <- function (x, bin, fastg.file, paths.file, depth, save, file, script.path) UseMethod ("fastgFish") # Define generic for fastgFish function
11 changes: 7 additions & 4 deletions gbtools/R/fastgFish.gbt.R
Original file line number Diff line number Diff line change
Expand Up @@ -8,27 +8,30 @@
#' @param bin Object of class gbtbin, derived from x above
#' @param fastg.file Fastg formatted assembly graph from SPAdes
#' @param paths Paths file mapping assembly graph edge names to scaffold/contig names
#' @param depth Number of fishing iterations (Default: 0 means iterate until no additional contigs recruited)
#' @param save Logical: Save list of fished contigs to external file? (Default: No)
#' @param file File name to save list of fished contigs, if save=TRUE
#' @param script.path Location of the fastg_paths_fishing.pl script
#' @return Object of class gbtbin
#' @export

fastgFish.gbt <- function (x, # Object of class gbt (parent object of the bin)
bin, # Object of class gbtbin
fastg.file, # Fastg assembly graph
paths.file, # scaffolds.paths or contigs.paths file
depth=0, # Number of iterations to fish (Default 0 - exhaustive)
save=FALSE, # Save result to external file? Default no
file="fished_bin.list" # File name to save result
file="fished_bin.list", # File name to save result
script.path="/home/kbseah/tools/my_scripts/genome-bin-tools/accessory_scripts/fastg_paths_fishing.pl"
) {
## REPLACE THIS PATH WITH SCRIPT PATH ON YOUR LOCAL SYSTEM #############################################
script.path <- "/home/kbseah/tools/my_scripts/genome-bin-tools/accessory_scripts/fastg_paths_fishing.pl"
########################################################################################################
script.path=script.path
command <- "perl"
command.params <- paste(script.path,
"-g", fastg.file,
"-p", paths.file,
"-o /tmp/tmp.fishing_output",
"-b -",
"-i", depth,
"-r")
fished.contigs.list <- system2 (command,
command.params,
Expand Down
2 changes: 1 addition & 1 deletion gbtools/man/fastgFish.Rd
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,7 @@
\alias{fastgFish}
\title{Perform connectivity fishing with Fastg and paths files from SPAdes 3.6.2+}
\usage{
fastgFish(x, bin, fastg.file, paths.file, save, file)
fastgFish(x, bin, fastg.file, paths.file, depth, save, file, script.path)
}
\arguments{
\item{x}{Object of class gbt (parent object of the gbtbin object)}
Expand Down
9 changes: 7 additions & 2 deletions gbtools/man/fastgFish.gbt.Rd
Original file line number Diff line number Diff line change
Expand Up @@ -4,8 +4,9 @@
\alias{fastgFish.gbt}
\title{Perform connectivity fishing with Fastg and paths files from SPAdes 3.6.2+}
\usage{
\method{fastgFish}{gbt}(x, bin, fastg.file, paths.file, save = FALSE,
file = "fished_bin.list")
\method{fastgFish}{gbt}(x, bin, fastg.file, paths.file, depth = 0,
save = FALSE, file = "fished_bin.list",
script.path = "/home/kbseah/tools/my_scripts/genome-bin-tools/accessory_scripts/fastg_paths_fishing.pl")
}
\arguments{
\item{x}{Object of class gbt (parent object of the gbtbin object)}
Expand All @@ -14,10 +15,14 @@

\item{fastg.file}{Fastg formatted assembly graph from SPAdes}

\item{depth}{Number of fishing iterations (Default: 0 means iterate until no additional contigs recruited)}

\item{save}{Logical: Save list of fished contigs to external file? (Default: No)}

\item{file}{File name to save list of fished contigs, if save=TRUE}

\item{script.path}{Location of the fastg_paths_fishing.pl script}

\item{paths}{Paths file mapping assembly graph edge names to scaffold/contig names}
}
\value{
Expand Down

0 comments on commit 58cf8f2

Please sign in to comment.