diff --git a/R_source_package/gbtools_2.5.4.tar.gz b/R_source_package/gbtools_2.5.4.tar.gz new file mode 100644 index 0000000..0f6a190 Binary files /dev/null and b/R_source_package/gbtools_2.5.4.tar.gz differ diff --git a/accessory_scripts/fastg_paths_fishing.pl b/accessory_scripts/fastg_paths_fishing.pl index e44b408..2ae466c 100755 --- a/accessory_scripts/fastg_paths_fishing.pl +++ b/accessory_scripts/fastg_paths_fishing.pl @@ -2,7 +2,7 @@ # SPAdes-style Fastg and Scaffold.paths parsing tool # Brandon Seah (kbseah@mpi-bremen.de) -# 2016-02-23 +# 2016-02-25 # Given output from SPAdes: # Fastg assembly graph, @@ -21,6 +21,7 @@ my $version="2016-02-23"; my $fastg_file; my $paths_file; +my $iter_depth = 0; # How many fishing iterations (0 = no limit) my $iter_count = 0; # Counter for iterations of bait fishing my $out="fastg_fishing"; # Output file prefix my $bait_file; @@ -35,10 +36,12 @@ my %fished_nodes_hash; # Hash of nodes corresponding to fished edges ## Usage options ################################# + if (! @ARGV) { usage(); } # Print usage statement if no arguments GetOptions ( "fastg|g=s" =>\$fastg_file, # Fastg file of assembly graph - "paths|p=s" =>\$paths_file, # File scaffold.paths or contigs.paths + "paths|p=s" =>\$paths_file, # File scaffold.paths or contigs.paths + "iter|i=i" =>\$iter_depth, # Number of iterations "output|o=s" =>\$out, # Output prefix "bait|b=s" =>\$bait_file, # List of scaffold names to fish "rflag|r" =>\$rflag # Report to stdout if called from R @@ -63,7 +66,7 @@ print $outlog_fh "Number of corresponding bait edges: ". scalar @bait_edges_array. "\n"; read_fastg(); -perform_fishing_edges(); +perform_fishing_edges($iter_depth); translate_fished_edges_to_nodes(); if ($rflag == 0) { # Save list of fished contigs to file by default @@ -99,16 +102,19 @@ sub usage { print STDERR "These files are produced by SPAdes 3.6.2 onwards \n\n"; print STDERR "Usage: perl $0 \\ \n"; print STDERR "\t -g assembly_graph.fastg \\ \n"; - print STDERR "\t -p scaffols.paths \\ \n"; + print STDERR "\t -p scaffolds.paths \\ \n"; print STDERR "\t -o output_prefix \\ \n"; print STDERR "\t -b list_of_bait_scaffolds \\ \n"; + print STDERR "\t -i [number of fishing iterations] \n"; print STDERR "\t -r [Flag if called from within R] \n"; print STDERR "\n"; + print STDERR "-i 0 means fishing iterates until no more contigs retrieved\n\n"; print STDERR "Output: Logfile (prefix.log) and list of fished scaffolds (prefix.scafflist)\n\n"; exit; } -sub translate_fished_edges_to_nodes { # Convert list of fished edges to list of nodes +sub translate_fished_edges_to_nodes { + # Convert list of fished edges to list of nodes foreach my $theedge (keys %edge_fishing_hash) { foreach my $thenode (@{$node_edge_hash{$theedge}}) { $fished_nodes_hash{$thenode}++; @@ -117,26 +123,42 @@ sub translate_fished_edges_to_nodes { # Convert list of fished edges to list of } sub perform_fishing_edges { + my $depth = shift @_; # Iteration depth + if ($depth == 0) { # I know this is inelegant + $iter_depth = 2; + } my $init_count=0; # Counter for no. of fished contigs my $curr_count=1; # Second counter - while ($init_count != $curr_count) { # Loop until iteratively complete + while ($iter_count <= $iter_depth-1 # Loop until iteratively complete + && $init_count != $curr_count) { # or when fishing is exhausted $iter_count++; # Iteration counter $init_count=0; # Reset counters for each iterative step $curr_count=0; - foreach my $fishedge (keys %edge_fishing_hash) { # Count edges marked as bait - if (defined $edge_fishing_hash{$fishedge} && $edge_fishing_hash{$fishedge} == 1) { + if ($depth == 0) { # If exhaustive looping, + $iter_depth++; # keep shifting goalposts + } + # Count edges marked as bait + foreach my $fishedge (keys %edge_fishing_hash) { + if (defined $edge_fishing_hash{$fishedge} + && $edge_fishing_hash{$fishedge} == 1) { $init_count++; } } - foreach my $fastg_entry (keys %fastg_hash) { # For all Fastg entries - if (defined $edge_fishing_hash{$fastg_entry} && $edge_fishing_hash{$fastg_entry} == 1) { # If edge is listed as a bait + # For all Fastg entries + foreach my $fastg_entry (keys %fastg_hash) { + # If edge is listed as a bait + if (defined $edge_fishing_hash{$fastg_entry} + && $edge_fishing_hash{$fastg_entry} == 1) { foreach my $connected_edges (@{$fastg_hash{$fastg_entry}}) { - $edge_fishing_hash{$connected_edges} = 1; # Mark those connected edges as bait + # Mark those connected edges as bait + $edge_fishing_hash{$connected_edges} = 1; } } } - foreach my $fishedge (keys %edge_fishing_hash) { # Count edges marked as bait after fishing - if (defined $edge_fishing_hash {$fishedge} && $edge_fishing_hash {$fishedge} == 1) { + # Count edges marked as bait after fishing + foreach my $fishedge (keys %edge_fishing_hash) { + if (defined $edge_fishing_hash {$fishedge} + && $edge_fishing_hash {$fishedge} == 1) { $curr_count++; } } @@ -148,10 +170,12 @@ sub read_fastg { open(FASTGIN, "<", $fastg_file) or die ("$!\n"); # Read in Fastg file while () { chomp; - if ($_ =~ m/^>EDGE_(\d+)_.*:(.+);$/) { # If a Fastg header line + # If a Fastg header line + if ($_ =~ m/^>EDGE_(\d+)_.*:(.+);$/) { my $current_node = $1; my $conn_line = $2; - $conn_line =~ s/'//g; # Remove inverted commas, which mark revcomp + # Remove inverted commas, which mark revcomp + $conn_line =~ s/'//g; my @conn_line_array = split ",", $conn_line; foreach my $the_header (@conn_line_array) { if ($the_header =~ m/EDGE_(\d+)_/) { @@ -202,10 +226,14 @@ sub hash_nodes_edges { while () { chomp; my $full_header = $_; - if ($full_header =~ m/NODE.*'$/) { # If header for reversed scaffold, ignore + # If header for reversed scaffold, ignore + if ($full_header =~ m/NODE.*'$/) { $skip_flag = 1; next; - } elsif ($full_header =~ m/NODE_(\d+)_.*\d+$/) { # If header for forward scaffold... + + } + # If header for forward scaffold... + elsif ($full_header =~ m/NODE_(\d+)_.*\d+$/) { $skip_flag = 0; # Turn off skip flag $current_node = $1; $scaffolds_fullnames_hash{$current_node} = $full_header; # Save full header name @@ -215,7 +243,8 @@ sub hash_nodes_edges { next; } else { my $edge_line = $_; - $edge_line =~ s/[-\+;]//g; # Remove unneeded chars, only interested in edge IDs + # Remove unneeded chars, only interested in edge IDs + $edge_line =~ s/[-\+;]//g; my @edge_split = split ",", $edge_line; foreach my $the_edge (@edge_split) { push @{$node_edge_hash{$the_edge}}, $current_node; # Hash nodes with edges as key diff --git a/gbtools/DESCRIPTION b/gbtools/DESCRIPTION index 615f92d..b5cb5df 100644 --- a/gbtools/DESCRIPTION +++ b/gbtools/DESCRIPTION @@ -1,6 +1,6 @@ Package: gbtools Type: Package -Version: 2.5.3 +Version: 2.5.4 Title: Interactive visualization of metagenome assemblies in R Date: 2016-02-25 Author: Brandon Seah diff --git a/gbtools/R/fastgFish.R b/gbtools/R/fastgFish.R index 7168e29..70904cf 100644 --- a/gbtools/R/fastgFish.R +++ b/gbtools/R/fastgFish.R @@ -13,4 +13,4 @@ #' @return Object of class gbtbin #' @export -fastgFish <- function (x, bin, fastg.file, paths.file, save, file) UseMethod ("fastgFish") # Define generic for fastgFish function \ No newline at end of file +fastgFish <- function (x, bin, fastg.file, paths.file, depth, save, file, script.path) UseMethod ("fastgFish") # Define generic for fastgFish function \ No newline at end of file diff --git a/gbtools/R/fastgFish.gbt.R b/gbtools/R/fastgFish.gbt.R index 7625af6..e41dfe1 100644 --- a/gbtools/R/fastgFish.gbt.R +++ b/gbtools/R/fastgFish.gbt.R @@ -8,8 +8,10 @@ #' @param bin Object of class gbtbin, derived from x above #' @param fastg.file Fastg formatted assembly graph from SPAdes #' @param paths Paths file mapping assembly graph edge names to scaffold/contig names +#' @param depth Number of fishing iterations (Default: 0 means iterate until no additional contigs recruited) #' @param save Logical: Save list of fished contigs to external file? (Default: No) #' @param file File name to save list of fished contigs, if save=TRUE +#' @param script.path Location of the fastg_paths_fishing.pl script #' @return Object of class gbtbin #' @export @@ -17,18 +19,19 @@ fastgFish.gbt <- function (x, # Object of class gbt (parent object of the bin) bin, # Object of class gbtbin fastg.file, # Fastg assembly graph paths.file, # scaffolds.paths or contigs.paths file + depth=0, # Number of iterations to fish (Default 0 - exhaustive) save=FALSE, # Save result to external file? Default no - file="fished_bin.list" # File name to save result + file="fished_bin.list", # File name to save result + script.path="/home/kbseah/tools/my_scripts/genome-bin-tools/accessory_scripts/fastg_paths_fishing.pl" ) { - ## REPLACE THIS PATH WITH SCRIPT PATH ON YOUR LOCAL SYSTEM ############################################# - script.path <- "/home/kbseah/tools/my_scripts/genome-bin-tools/accessory_scripts/fastg_paths_fishing.pl" - ######################################################################################################## + script.path=script.path command <- "perl" command.params <- paste(script.path, "-g", fastg.file, "-p", paths.file, "-o /tmp/tmp.fishing_output", "-b -", + "-i", depth, "-r") fished.contigs.list <- system2 (command, command.params, diff --git a/gbtools/man/fastgFish.Rd b/gbtools/man/fastgFish.Rd index c4ed0b6..bcd2ea9 100644 --- a/gbtools/man/fastgFish.Rd +++ b/gbtools/man/fastgFish.Rd @@ -4,7 +4,7 @@ \alias{fastgFish} \title{Perform connectivity fishing with Fastg and paths files from SPAdes 3.6.2+} \usage{ -fastgFish(x, bin, fastg.file, paths.file, save, file) +fastgFish(x, bin, fastg.file, paths.file, depth, save, file, script.path) } \arguments{ \item{x}{Object of class gbt (parent object of the gbtbin object)} diff --git a/gbtools/man/fastgFish.gbt.Rd b/gbtools/man/fastgFish.gbt.Rd index adf9ad1..d53b444 100644 --- a/gbtools/man/fastgFish.gbt.Rd +++ b/gbtools/man/fastgFish.gbt.Rd @@ -4,8 +4,9 @@ \alias{fastgFish.gbt} \title{Perform connectivity fishing with Fastg and paths files from SPAdes 3.6.2+} \usage{ -\method{fastgFish}{gbt}(x, bin, fastg.file, paths.file, save = FALSE, - file = "fished_bin.list") +\method{fastgFish}{gbt}(x, bin, fastg.file, paths.file, depth = 0, + save = FALSE, file = "fished_bin.list", + script.path = "/home/kbseah/tools/my_scripts/genome-bin-tools/accessory_scripts/fastg_paths_fishing.pl") } \arguments{ \item{x}{Object of class gbt (parent object of the gbtbin object)} @@ -14,10 +15,14 @@ \item{fastg.file}{Fastg formatted assembly graph from SPAdes} +\item{depth}{Number of fishing iterations (Default: 0 means iterate until no additional contigs recruited)} + \item{save}{Logical: Save list of fished contigs to external file? (Default: No)} \item{file}{File name to save list of fished contigs, if save=TRUE} +\item{script.path}{Location of the fastg_paths_fishing.pl script} + \item{paths}{Paths file mapping assembly graph edge names to scaffold/contig names} } \value{