Skip to content

Commit

Permalink
Merge pull request #951 from Ensembl/release/113
Browse files Browse the repository at this point in the history
Release/113
  • Loading branch information
vinay-ebi authored Sep 3, 2024
2 parents 2e0063c + d26c79e commit 3045fbb
Show file tree
Hide file tree
Showing 27 changed files with 259 additions and 120 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -164,7 +164,7 @@ sub run {
-db => 'alphafold',
-db_version => $alpha_version,
-db_file => $self->param('db_dir') . '/accession_ids.csv',
-display_label => 'AlphaFold DB import',
-display_label => 'AFDB-ENSP mapping',
-displayable => '1',
-description => 'Protein features based on AlphaFold predictions, mapped with GIFTS or UniParc'
);
Expand Down
3 changes: 2 additions & 1 deletion modules/Bio/EnsEMBL/Production/Pipeline/GTF/DumpFile.pm
Original file line number Diff line number Diff line change
Expand Up @@ -383,7 +383,8 @@ feature for the position of this on the genome
- cds_start_NF: the coding region start could not be confirmed
- mRNA_end_NF: the mRNA end could not be confirmed
- mRNA_start_NF: the mRNA start could not be confirmed.
- basic: the transcript is part of the gencode basic geneset
- gencode_basic: the transcript is part of the gencode basic geneset
- gencode_primary: the transcript is part of the gencode primary geneset
Comments
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -218,7 +218,7 @@ sub all_hashes {
} ## end foreach my $slice (@slices)

for my $seq_type (keys %$batch) {
for my $attrib_table (keys $batch->{$seq_type}) {
for my $attrib_table (keys %{$batch->{$seq_type}}) {
$attribute_adaptor->store_batch_on_Object($attrib_table, $batch->{$seq_type}->{$attrib_table}, 1000);
}
}
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -292,7 +292,10 @@ sub merge_xrefs {
$obj->{$dbname} = [];
}
for my $ann ( @{ $subobj->{$dbname} } ) {
push $obj->{$dbname}, $self->copy_hash($ann);
if (ref($obj->{$dbname}) ne 'ARRAY') {
$obj->{$dbname} = [];
}
push @{ $obj->{$dbname} }, $self->copy_hash($ann);
}
}
}
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -59,7 +59,7 @@ sub write_output {
my $compara_param = $self->param('compara');
my $cleanup_dir = $self->param('cleanup_dir');

foreach my $pair (keys $sp_config) {
foreach my $pair (keys %{$sp_config}) {
my $compara = $sp_config->{$pair}->{'compara'};
if (defined $compara_param && $compara ne $compara_param) {
print STDERR "Skipping $compara\n";
Expand Down
44 changes: 17 additions & 27 deletions modules/Bio/EnsEMBL/Production/Pipeline/PipeConfig/Base_conf.pm
Original file line number Diff line number Diff line change
Expand Up @@ -66,14 +66,14 @@ sub beekeeper_extra_cmdline_options {
sub resource_classes {
my $self = shift;

## String it together
my %time = (
H => ' --time=1:00:00',
D => ' --time=1-00:00:00',
W => ' --time=7-00:00:00'
);

## String it together
my %time = (H => ' --time=1:00:00',
D => ' --time=1-00:00:00',
W => ' --time=7-00:00:00',);

my %memory = ('100M' => '100',
'200M' => '200',
my %memory = (
'500M' => '500',
'1GB' => '1000',
'2GB' => '2000',
Expand All @@ -89,40 +89,30 @@ sub resource_classes {
);

my $dq = ' --partition=datamover';

my %output = (
#Default is a duplicate of 1GB
'default' => { 'LSF' => '-q ' . $self->o('production_queue'), 'SLURM' => $time{'H'} . ' --mem=' . $memory{'100M'} . 'm' },
'default_D' => { 'LSF' => '-q ' . $self->o('production_queue'), 'SLURM' => $time{'D'} . ' --mem=' . $memory{'100M'} . 'm' },
'default_W' => { 'LSF' => '-q ' . $self->o('production_queue'), 'SLURM' => $time{'W'} . ' --mem=' . $memory{'100M'} . 'm' },
'default' => { 'SLURM' => $time{'H'} . ' --mem=' . $memory{'1GB'} . 'm' },
'default_D' => { 'SLURM' => $time{'D'} . ' --mem=' . $memory{'1GB'} . 'm' },
'default_W' => { 'SLURM' => $time{'W'} . ' --mem=' . $memory{'1GB'} . 'm' },
#Data mover nodes
'dm' => { 'LSF' => '-q ' . $self->o('datamover_queue'), 'SLURM' => $dq . $time{'H'} . ' --mem=' . $memory{'100M'} . 'm' },
'dm_D' => { 'LSF' => '-q ' . $self->o('datamover_queue'), 'SLURM' => $dq . $time{'D'} . ' --mem=' . $memory{'100M'} . 'm' },
'dm_W' => { 'LSF' => '-q ' . $self->o('datamover_queue'), 'SLURM' => $dq . $time{'W'} . ' --mem=' . $memory{'100M'} . 'm' },
'dm32_D' => { 'LSF' => '-q ' . $self->o('datamover_queue') . ' -M 32000 -R "rusage[mem=32000]"', 'SLURM' => $dq . $time{'D'} . ' --mem=' . $memory{'32GB'} . 'm' },
'dmMAX_D' => { 'LSF' => '-q ' . $self->o('datamover_queue') . ' -M 200000 -R "rusage[mem=200000]"', 'SLURM' => $dq . $time{'D'} . ' --mem=' . $memory{'200GB'} . 'm' },
'dm' => { 'SLURM' => $dq . $time{'H'} . ' --mem=' . $memory{'1GB'} . 'm' },
'dm_D' => { 'SLURM' => $dq . $time{'D'} . ' --mem=' . $memory{'1GB'} . 'm' },
'dm_W' => { 'SLURM' => $dq . $time{'W'} . ' --mem=' . $memory{'1GB'} . 'm' },
'dm32_D' => { 'SLURM' => $dq . $time{'D'} . ' --mem=' . $memory{'32GB'} . 'm' },
'dmMAX_D' => { 'SLURM' => $dq . $time{'D'} . ' --mem=' . $memory{'200GB'} . 'm' },
);
#Create a dictionary of all possible time and memory combinations. Format would be:
#2G={
# 'SLURM' => ' --time=1:00:00 --mem=2000m',
# 'LSF' => '-q $self->o(production_queue) -M 2000 -R "rusage[mem=2000]"'
# };

while ((my $time_key, my $time_value) = each(%time)) {
while ((my $memory_key, my $memory_value) = each(%memory)) {
if ($time_key eq 'H') {
$output{$memory_key} = { 'LSF' => '-q ' . $self->o('production_queue') . ' -M ' . $memory_value . ' -R "rusage[mem=' . $memory_value . ']"',
'SLURM' => $time_value . ' --mem=' . $memory_value . 'm' }
$output{$memory_key} = { 'SLURM' => $time_value . ' --mem=' . $memory_value . 'm' };
}
else {
$output{$memory_key . '_' . $time_key} = { 'LSF' => '-q ' . $self->o('production_queue') . ' -M ' . $memory_value . ' -R "rusage[mem=' . $memory_value . ']"',
'SLURM' => $time_value . ' --mem=' . $memory_value . 'm' }
$output{$memory_key . '_' . $time_key} = { 'SLURM' => $time_value . ' --mem=' . $memory_value . 'm' };
}
}
}

return \%output;

}

1;
Original file line number Diff line number Diff line change
Expand Up @@ -75,7 +75,7 @@ sub default_options {

interpro_file => 'names.dat',
interpro2go_file => 'interpro2go',
uniparc_file => 'upidump.lis',
uniparc_file => 'upidump.lis.gz',
mapping_file => 'idmapping_selected.tab.gz',

# Files are retrieved and stored locally with the same name.
Expand Down Expand Up @@ -227,6 +227,30 @@ sub default_options {
ipscan_xml => 'TMHMM',
ipscan_lookup => 0,
},
{
db => 'Phobius',
ipscan_lookup => 1,
ipscan_name => 'Phobius',
ipscan_xml => 'PHOBIUS',
logic_name => 'phobius',
program => 'InterProScan',
},
{
db => 'SignalP_GRAM_POSITIVE',
ipscan_lookup => 1,
ipscan_name => 'SignalP_GRAM_POSITIVE',
ipscan_xml => 'SIGNALP_GRAM_POSITIVE',
logic_name => 'signalp_gram_positive',
program => 'InterProScan',
},
{
db => 'SignalP_GRAM_NEGATIVE',
ipscan_lookup => 1,
ipscan_name => 'SignalP_GRAM_NEGATIVE',
ipscan_xml => 'SIGNALP_GRAM_NEGATIVE',
logic_name => 'signalp_gram_negative',
program => 'InterProScan',
},
#seg replaces low complexity regions in protein sequences with X characters (https://rothlab.ucdavis.edu/genhelp/seg.html)
{
logic_name => 'seg',
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -173,6 +173,7 @@ sub pipeline_analyses {
base_path => $self->o('base_path'),
release => $self->o('release')
},
-max_retry_count => 0,
-flow_into => {
'2->A' => 'dump_xref',
'A->1' => 'schedule_mapping'
Expand All @@ -187,6 +188,7 @@ sub pipeline_analyses {
release => $self->o('release'),
config_file => $self->o('config_file')
},
-max_retry_count => 0,
-flow_into => { 2 => 'align_factory' },
-rc_name => '1GB',
},
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -21,16 +21,27 @@ package Bio::EnsEMBL::Production::Pipeline::ProteinFeatures::LoadUniParc;

use strict;
use warnings;

use IO::Uncompress::Gunzip qw(gunzip $GunzipError);
use File::Basename;

use base ('Bio::EnsEMBL::Production::Pipeline::Common::Base');

sub run {
my ($self) = @_;
my $uniparc_file = $self->param_required('uniparc_file_local');


if (-e $uniparc_file) {

#check if uniparc file is compressed
if ($uniparc_file =~ /\.gz$/){
my $uniparc_file_decompress = $uniparc_file;
$uniparc_file_decompress =~ s/\.gz$//;
gunzip $uniparc_file => $uniparc_file_decompress or $self->throw("gunzip failed: $GunzipError");
#delete compressed file .gz
unlink $uniparc_file or $self->throw("unable to delete $uniparc_file: $!");
$uniparc_file = $uniparc_file_decompress;
}

my $dbh = $self->hive_dbh;
my $sql = "LOAD DATA LOCAL INFILE '$uniparc_file' INTO TABLE uniparc FIELDS TERMINATED BY ' '";
$dbh->do($sql) or self->throw($dbh->errstr);
Expand All @@ -41,9 +52,14 @@ sub run {
my $index_2 = 'ALTER TABLE uniparc ADD KEY md5sum_idx (md5sum) USING HASH';
$dbh->do($index_2) or self->throw($dbh->errstr);

#delete upidump file from pipeline directory after loading into hive db
unlink $uniparc_file or $self->throw("unable to delete $uniparc_file: $!");

} else {
$self->throw("Checksum file '$uniparc_file' does not exist");
}


}

1;
14 changes: 11 additions & 3 deletions modules/Bio/EnsEMBL/Production/Pipeline/Xrefs/Alignment.pm
Original file line number Diff line number Diff line change
Expand Up @@ -84,10 +84,18 @@ sub run {
$exe =~ s/\n//g;
my $command_string = sprintf ("%s --showalignment FALSE --showvulgar FALSE --ryo '%s' --gappedextension FALSE --model 'affine:local' %s --subopt no --query %s --target %s --querychunktotal %s --querychunkid %s", $exe, $ryo, $method, $source, $target, $max_chunks, $chunk);
my $output = `$command_string`;
my @hits = grep {$_ =~ /^xref/} split "\n", $output; # not all lines in output are alignments

while (my $hit = shift @hits) {
print $fh $hit . "\n";
if ($? == 0) {
my @hits = grep {$_ =~ /^xref/} split "\n", $output; # not all lines in output are alignments

while (my $hit = shift @hits) {
print $fh $hit . "\n";
}
} else {
my $job = $self->input_job();
$job->adaptor()->db()->get_LogMessageAdaptor()->store_job_message($job->dbID(), $output, 'WORKER_ERROR');

throw("Exonerate failed with exit_code: $?\n");
}

$fh->close();
Expand Down
17 changes: 12 additions & 5 deletions modules/Bio/EnsEMBL/Production/Pipeline/Xrefs/ScheduleSource.pm
Original file line number Diff line number Diff line change
Expand Up @@ -127,21 +127,28 @@ sub run {
} else {
# Create list of files
opendir(my $dir_handle, $file_name);
my @list_files = readdir($dir_handle);
my @temp_list_files = readdir($dir_handle);
closedir($dir_handle);

my @list_files;
foreach my $file (@temp_list_files) {
next if ($file =~ /^\./);
push(@list_files, $file_name . "/" . $file);
}
if ($preparse) { @list_files = $preparse; }

# For Uniprot and Refseq, files might have been split by species
if (!$preparse && ($name =~ /^Uniprot/ || $name =~ /^RefSeq_peptide/ || $name =~ /^RefSeq_dna/)) {
my $file_prefix = ($name =~ /SPTREMBL/ ? 'uniprot_trembl' : ($name =~ /SWISSPROT/ ? 'uniprot_sprot' : ($name =~ /_dna/ ? 'refseq_rna' : 'refseq_protein')));
@list_files = glob($file_name . "/**/" . $file_prefix . "-" . $species_id);
$_ = basename(dirname($_)) . "/" . basename($_) foreach (@list_files);
my @species_list_files = glob($file_name . "/**/**/**/**/" . $file_prefix . "-" . $species_id);
if (scalar(@species_list_files) > 0) {
@list_files = @species_list_files;
}
}

foreach my $file (@list_files) {
next if ($file =~ /^\./);
$file =~ s/\n//;
$file = $file_name . "/" . $file;
if (!-f $file) { next; }
if (defined $release_file and $file eq $release_file) { next; }

$dataflow_params = {
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -203,7 +203,7 @@
{
"name" : "HGNC",
"parser" : "HGNCParser",
"file" : "https://www.genenames.org/cgi-bin/download?col=gd_hgnc_id&col=gd_app_sym&col=gd_app_name&col=gd_prev_sym&col=gd_aliases&col=gd_pub_eg_id&col=gd_pub_ensembl_id&col=gd_pub_refseq_ids&col=gd_ccds_ids&col=gd_lsdb_links&status=Approved&status_opt=2&where=&order_by=gd_app_sym_sort&format=text&limit=&hgnc_dbtag=on&submit=submit",
"file" : "https://www.genenames.org/cgi-bin/download/custom?col=gd_hgnc_id&col=gd_app_sym&col=gd_app_name&col=gd_prev_sym&col=gd_aliases&col=gd_pub_eg_id&col=gd_pub_ensembl_id&col=gd_pub_refseq_ids&col=gd_ccds_ids&col=gd_lsdb_links&status=Approved&status_opt=2&where=&order_by=gd_app_sym_sort&format=text&limit=&hgnc_dbtag=on&submit=submit",
"db" : "ccds",
"priority" : 3
}
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -226,7 +226,7 @@
{
"name" : "Xenbase",
"parser" : "XenopusJamboreeParser",
"file" : "http://ftp.xenbase.org/pub/GenePageReports/GenePageEnsemblModelMapping.txt",
"file" : "http://ftp.xenbase.org/pub/GenePageReports/GenePageEnsemblModelMapping_4.1.txt",
"priority" : 1
},
{
Expand All @@ -241,7 +241,7 @@
{
"name" : "HGNC",
"parser" : "HGNCParser",
"file" : "https://www.genenames.org/cgi-bin/download?col=gd_hgnc_id&col=gd_app_sym&col=gd_app_name&col=gd_prev_sym&col=gd_aliases&col=gd_pub_eg_id&col=gd_pub_ensembl_id&col=gd_pub_refseq_ids&col=gd_ccds_ids&col=gd_lsdb_links&status=Approved&status_opt=2&where=&order_by=gd_app_sym_sort&format=text&limit=&hgnc_dbtag=on&submit=submit",
"file" : "https://www.genenames.org/cgi-bin/download/custom?col=gd_hgnc_id&col=gd_app_sym&col=gd_app_name&col=gd_prev_sym&col=gd_aliases&col=gd_pub_eg_id&col=gd_pub_ensembl_id&col=gd_pub_refseq_ids&col=gd_ccds_ids&col=gd_lsdb_links&status=Approved&status_opt=2&where=&order_by=gd_app_sym_sort&format=text&limit=&hgnc_dbtag=on&submit=submit",
"db" : "ccds",
"priority" : 3
}
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -254,7 +254,7 @@
{
"name" : "Xenbase",
"parser" : "XenopusJamboreeParser",
"file" : "http://ftp.xenbase.org/pub/GenePageReports/GenePageEnsemblModelMapping.txt",
"file" : "http://ftp.xenbase.org/pub/GenePageReports/GenePageEnsemblModelMapping_4.1.txt",
"priority" : 1
},
{
Expand All @@ -269,7 +269,7 @@
{
"name" : "HGNC",
"parser" : "HGNCParser",
"file" : "https://www.genenames.org/cgi-bin/download?col=gd_hgnc_id&col=gd_app_sym&col=gd_app_name&col=gd_prev_sym&col=gd_aliases&col=gd_pub_eg_id&col=gd_pub_ensembl_id&col=gd_pub_refseq_ids&col=gd_ccds_ids&col=gd_lsdb_links&status=Approved&status_opt=2&where=&order_by=gd_app_sym_sort&format=text&limit=&hgnc_dbtag=on&submit=submit",
"file" : "https://www.genenames.org/cgi-bin/download/custom?col=gd_hgnc_id&col=gd_app_sym&col=gd_app_name&col=gd_prev_sym&col=gd_aliases&col=gd_pub_eg_id&col=gd_pub_ensembl_id&col=gd_pub_refseq_ids&col=gd_ccds_ids&col=gd_lsdb_links&status=Approved&status_opt=2&where=&order_by=gd_app_sym_sort&format=text&limit=&hgnc_dbtag=on&submit=submit",
"db" : "ccds",
"priority" : 3
}
Expand Down
Loading

0 comments on commit 3045fbb

Please sign in to comment.