Skip to content

Commit

Permalink
Merge pull request #951 from Ensembl/release/113
Browse files Browse the repository at this point in the history
Release/113
  • Loading branch information
vinay-ebi authored Sep 3, 2024
2 parents 2e0063c + d26c79e commit 3045fbb
Show file tree
Hide file tree
Showing 27 changed files with 259 additions and 120 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -164,7 +164,7 @@ sub run {
-db => 'alphafold',
-db_version => $alpha_version,
-db_file => $self->param('db_dir') . '/accession_ids.csv',
-display_label => 'AlphaFold DB import',
-display_label => 'AFDB-ENSP mapping',
-displayable => '1',
-description => 'Protein features based on AlphaFold predictions, mapped with GIFTS or UniParc'
);
Expand Down
3 changes: 2 additions & 1 deletion modules/Bio/EnsEMBL/Production/Pipeline/GTF/DumpFile.pm
Original file line number Diff line number Diff line change
Expand Up @@ -383,7 +383,8 @@ feature for the position of this on the genome
- cds_start_NF: the coding region start could not be confirmed
- mRNA_end_NF: the mRNA end could not be confirmed
- mRNA_start_NF: the mRNA start could not be confirmed.
- basic: the transcript is part of the gencode basic geneset
- gencode_basic: the transcript is part of the gencode basic geneset
- gencode_primary: the transcript is part of the gencode primary geneset
Comments
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -218,7 +218,7 @@ sub all_hashes {
} ## end foreach my $slice (@slices)

for my $seq_type (keys %$batch) {
for my $attrib_table (keys $batch->{$seq_type}) {
for my $attrib_table (keys %{$batch->{$seq_type}}) {
$attribute_adaptor->store_batch_on_Object($attrib_table, $batch->{$seq_type}->{$attrib_table}, 1000);
}
}
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -292,7 +292,10 @@ sub merge_xrefs {
$obj->{$dbname} = [];
}
for my $ann ( @{ $subobj->{$dbname} } ) {
push $obj->{$dbname}, $self->copy_hash($ann);
if (ref($obj->{$dbname}) ne 'ARRAY') {
$obj->{$dbname} = [];
}
push @{ $obj->{$dbname} }, $self->copy_hash($ann);
}
}
}
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -59,7 +59,7 @@ sub write_output {
my $compara_param = $self->param('compara');
my $cleanup_dir = $self->param('cleanup_dir');

foreach my $pair (keys $sp_config) {
foreach my $pair (keys %{$sp_config}) {
my $compara = $sp_config->{$pair}->{'compara'};
if (defined $compara_param && $compara ne $compara_param) {
print STDERR "Skipping $compara\n";
Expand Down
44 changes: 17 additions & 27 deletions modules/Bio/EnsEMBL/Production/Pipeline/PipeConfig/Base_conf.pm
Original file line number Diff line number Diff line change
Expand Up @@ -66,14 +66,14 @@ sub beekeeper_extra_cmdline_options {
sub resource_classes {
my $self = shift;

## String it together
my %time = (
H => ' --time=1:00:00',
D => ' --time=1-00:00:00',
W => ' --time=7-00:00:00'
);

## String it together
my %time = (H => ' --time=1:00:00',
D => ' --time=1-00:00:00',
W => ' --time=7-00:00:00',);

my %memory = ('100M' => '100',
'200M' => '200',
my %memory = (
'500M' => '500',
'1GB' => '1000',
'2GB' => '2000',
Expand All @@ -89,40 +89,30 @@ sub resource_classes {
);

my $dq = ' --partition=datamover';

my %output = (
#Default is a duplicate of 1GB
'default' => { 'LSF' => '-q ' . $self->o('production_queue'), 'SLURM' => $time{'H'} . ' --mem=' . $memory{'100M'} . 'm' },
'default_D' => { 'LSF' => '-q ' . $self->o('production_queue'), 'SLURM' => $time{'D'} . ' --mem=' . $memory{'100M'} . 'm' },
'default_W' => { 'LSF' => '-q ' . $self->o('production_queue'), 'SLURM' => $time{'W'} . ' --mem=' . $memory{'100M'} . 'm' },
'default' => { 'SLURM' => $time{'H'} . ' --mem=' . $memory{'1GB'} . 'm' },
'default_D' => { 'SLURM' => $time{'D'} . ' --mem=' . $memory{'1GB'} . 'm' },
'default_W' => { 'SLURM' => $time{'W'} . ' --mem=' . $memory{'1GB'} . 'm' },
#Data mover nodes
'dm' => { 'LSF' => '-q ' . $self->o('datamover_queue'), 'SLURM' => $dq . $time{'H'} . ' --mem=' . $memory{'100M'} . 'm' },
'dm_D' => { 'LSF' => '-q ' . $self->o('datamover_queue'), 'SLURM' => $dq . $time{'D'} . ' --mem=' . $memory{'100M'} . 'm' },
'dm_W' => { 'LSF' => '-q ' . $self->o('datamover_queue'), 'SLURM' => $dq . $time{'W'} . ' --mem=' . $memory{'100M'} . 'm' },
'dm32_D' => { 'LSF' => '-q ' . $self->o('datamover_queue') . ' -M 32000 -R "rusage[mem=32000]"', 'SLURM' => $dq . $time{'D'} . ' --mem=' . $memory{'32GB'} . 'm' },
'dmMAX_D' => { 'LSF' => '-q ' . $self->o('datamover_queue') . ' -M 200000 -R "rusage[mem=200000]"', 'SLURM' => $dq . $time{'D'} . ' --mem=' . $memory{'200GB'} . 'm' },
'dm' => { 'SLURM' => $dq . $time{'H'} . ' --mem=' . $memory{'1GB'} . 'm' },
'dm_D' => { 'SLURM' => $dq . $time{'D'} . ' --mem=' . $memory{'1GB'} . 'm' },
'dm_W' => { 'SLURM' => $dq . $time{'W'} . ' --mem=' . $memory{'1GB'} . 'm' },
'dm32_D' => { 'SLURM' => $dq . $time{'D'} . ' --mem=' . $memory{'32GB'} . 'm' },
'dmMAX_D' => { 'SLURM' => $dq . $time{'D'} . ' --mem=' . $memory{'200GB'} . 'm' },
);
#Create a dictionary of all possible time and memory combinations. Format would be:
#2G={
# 'SLURM' => ' --time=1:00:00 --mem=2000m',
# 'LSF' => '-q $self->o(production_queue) -M 2000 -R "rusage[mem=2000]"'
# };

while ((my $time_key, my $time_value) = each(%time)) {
while ((my $memory_key, my $memory_value) = each(%memory)) {
if ($time_key eq 'H') {
$output{$memory_key} = { 'LSF' => '-q ' . $self->o('production_queue') . ' -M ' . $memory_value . ' -R "rusage[mem=' . $memory_value . ']"',
'SLURM' => $time_value . ' --mem=' . $memory_value . 'm' }
$output{$memory_key} = { 'SLURM' => $time_value . ' --mem=' . $memory_value . 'm' };
}
else {
$output{$memory_key . '_' . $time_key} = { 'LSF' => '-q ' . $self->o('production_queue') . ' -M ' . $memory_value . ' -R "rusage[mem=' . $memory_value . ']"',
'SLURM' => $time_value . ' --mem=' . $memory_value . 'm' }
$output{$memory_key . '_' . $time_key} = { 'SLURM' => $time_value . ' --mem=' . $memory_value . 'm' };
}
}
}

return \%output;

}

1;
Original file line number Diff line number Diff line change
Expand Up @@ -75,7 +75,7 @@ sub default_options {

interpro_file => 'names.dat',
interpro2go_file => 'interpro2go',
uniparc_file => 'upidump.lis',
uniparc_file => 'upidump.lis.gz',
mapping_file => 'idmapping_selected.tab.gz',

# Files are retrieved and stored locally with the same name.
Expand Down Expand Up @@ -227,6 +227,30 @@ sub default_options {
ipscan_xml => 'TMHMM',
ipscan_lookup => 0,
},
{
db => 'Phobius',
ipscan_lookup => 1,
ipscan_name => 'Phobius',
ipscan_xml => 'PHOBIUS',
logic_name => 'phobius',
program => 'InterProScan',
},
{
db => 'SignalP_GRAM_POSITIVE',
ipscan_lookup => 1,
ipscan_name => 'SignalP_GRAM_POSITIVE',
ipscan_xml => 'SIGNALP_GRAM_POSITIVE',
logic_name => 'signalp_gram_positive',
program => 'InterProScan',
},
{
db => 'SignalP_GRAM_NEGATIVE',
ipscan_lookup => 1,
ipscan_name => 'SignalP_GRAM_NEGATIVE',
ipscan_xml => 'SIGNALP_GRAM_NEGATIVE',
logic_name => 'signalp_gram_negative',
program => 'InterProScan',
},
#seg replaces low complexity regions in protein sequences with X characters (https://rothlab.ucdavis.edu/genhelp/seg.html)
{
logic_name => 'seg',
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -173,6 +173,7 @@ sub pipeline_analyses {
base_path => $self->o('base_path'),
release => $self->o('release')
},
-max_retry_count => 0,
-flow_into => {
'2->A' => 'dump_xref',
'A->1' => 'schedule_mapping'
Expand All @@ -187,6 +188,7 @@ sub pipeline_analyses {
release => $self->o('release'),
config_file => $self->o('config_file')
},
-max_retry_count => 0,
-flow_into => { 2 => 'align_factory' },
-rc_name => '1GB',
},
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -21,16 +21,27 @@ package Bio::EnsEMBL::Production::Pipeline::ProteinFeatures::LoadUniParc;

use strict;
use warnings;

use IO::Uncompress::Gunzip qw(gunzip $GunzipError);
use File::Basename;

use base ('Bio::EnsEMBL::Production::Pipeline::Common::Base');

sub run {
my ($self) = @_;
my $uniparc_file = $self->param_required('uniparc_file_local');


if (-e $uniparc_file) {

#check if uniparc file is compressed
if ($uniparc_file =~ /\.gz$/){
my $uniparc_file_decompress = $uniparc_file;
$uniparc_file_decompress =~ s/\.gz$//;
gunzip $uniparc_file => $uniparc_file_decompress or $self->throw("gunzip failed: $GunzipError");
#delete compressed file .gz
unlink $uniparc_file or $self->throw("unable to delete $uniparc_file: $!");
$uniparc_file = $uniparc_file_decompress;
}

my $dbh = $self->hive_dbh;
my $sql = "LOAD DATA LOCAL INFILE '$uniparc_file' INTO TABLE uniparc FIELDS TERMINATED BY ' '";
$dbh->do($sql) or self->throw($dbh->errstr);
Expand All @@ -41,9 +52,14 @@ sub run {
my $index_2 = 'ALTER TABLE uniparc ADD KEY md5sum_idx (md5sum) USING HASH';
$dbh->do($index_2) or self->throw($dbh->errstr);

#delete upidump file from pipeline directory after loading into hive db
unlink $uniparc_file or $self->throw("unable to delete $uniparc_file: $!");

} else {
$self->throw("Checksum file '$uniparc_file' does not exist");
}


}

1;
14 changes: 11 additions & 3 deletions modules/Bio/EnsEMBL/Production/Pipeline/Xrefs/Alignment.pm
Original file line number Diff line number Diff line change
Expand Up @@ -84,10 +84,18 @@ sub run {
$exe =~ s/\n//g;
my $command_string = sprintf ("%s --showalignment FALSE --showvulgar FALSE --ryo '%s' --gappedextension FALSE --model 'affine:local' %s --subopt no --query %s --target %s --querychunktotal %s --querychunkid %s", $exe, $ryo, $method, $source, $target, $max_chunks, $chunk);
my $output = `$command_string`;
my @hits = grep {$_ =~ /^xref/} split "\n", $output; # not all lines in output are alignments

while (my $hit = shift @hits) {
print $fh $hit . "\n";
if ($? == 0) {
my @hits = grep {$_ =~ /^xref/} split "\n", $output; # not all lines in output are alignments

while (my $hit = shift @hits) {
print $fh $hit . "\n";
}
} else {
my $job = $self->input_job();
$job->adaptor()->db()->get_LogMessageAdaptor()->store_job_message($job->dbID(), $output, 'WORKER_ERROR');

throw("Exonerate failed with exit_code: $?\n");
}

$fh->close();
Expand Down
17 changes: 12 additions & 5 deletions modules/Bio/EnsEMBL/Production/Pipeline/Xrefs/ScheduleSource.pm
Original file line number Diff line number Diff line change
Expand Up @@ -127,21 +127,28 @@ sub run {
} else {
# Create list of files
opendir(my $dir_handle, $file_name);
my @list_files = readdir($dir_handle);
my @temp_list_files = readdir($dir_handle);
closedir($dir_handle);

my @list_files;
foreach my $file (@temp_list_files) {
next if ($file =~ /^\./);
push(@list_files, $file_name . "/" . $file);
}
if ($preparse) { @list_files = $preparse; }

# For Uniprot and Refseq, files might have been split by species
if (!$preparse && ($name =~ /^Uniprot/ || $name =~ /^RefSeq_peptide/ || $name =~ /^RefSeq_dna/)) {
my $file_prefix = ($name =~ /SPTREMBL/ ? 'uniprot_trembl' : ($name =~ /SWISSPROT/ ? 'uniprot_sprot' : ($name =~ /_dna/ ? 'refseq_rna' : 'refseq_protein')));
@list_files = glob($file_name . "/**/" . $file_prefix . "-" . $species_id);
$_ = basename(dirname($_)) . "/" . basename($_) foreach (@list_files);
my @species_list_files = glob($file_name . "/**/**/**/**/" . $file_prefix . "-" . $species_id);
if (scalar(@species_list_files) > 0) {
@list_files = @species_list_files;
}
}

foreach my $file (@list_files) {
next if ($file =~ /^\./);
$file =~ s/\n//;
$file = $file_name . "/" . $file;
if (!-f $file) { next; }
if (defined $release_file and $file eq $release_file) { next; }

$dataflow_params = {
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -203,7 +203,7 @@
{
"name" : "HGNC",
"parser" : "HGNCParser",
"file" : "https://www.genenames.org/cgi-bin/download?col=gd_hgnc_id&col=gd_app_sym&col=gd_app_name&col=gd_prev_sym&col=gd_aliases&col=gd_pub_eg_id&col=gd_pub_ensembl_id&col=gd_pub_refseq_ids&col=gd_ccds_ids&col=gd_lsdb_links&status=Approved&status_opt=2&where=&order_by=gd_app_sym_sort&format=text&limit=&hgnc_dbtag=on&submit=submit",
"file" : "https://www.genenames.org/cgi-bin/download/custom?col=gd_hgnc_id&col=gd_app_sym&col=gd_app_name&col=gd_prev_sym&col=gd_aliases&col=gd_pub_eg_id&col=gd_pub_ensembl_id&col=gd_pub_refseq_ids&col=gd_ccds_ids&col=gd_lsdb_links&status=Approved&status_opt=2&where=&order_by=gd_app_sym_sort&format=text&limit=&hgnc_dbtag=on&submit=submit",
"db" : "ccds",
"priority" : 3
}
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -226,7 +226,7 @@
{
"name" : "Xenbase",
"parser" : "XenopusJamboreeParser",
"file" : "http://ftp.xenbase.org/pub/GenePageReports/GenePageEnsemblModelMapping.txt",
"file" : "http://ftp.xenbase.org/pub/GenePageReports/GenePageEnsemblModelMapping_4.1.txt",
"priority" : 1
},
{
Expand All @@ -241,7 +241,7 @@
{
"name" : "HGNC",
"parser" : "HGNCParser",
"file" : "https://www.genenames.org/cgi-bin/download?col=gd_hgnc_id&col=gd_app_sym&col=gd_app_name&col=gd_prev_sym&col=gd_aliases&col=gd_pub_eg_id&col=gd_pub_ensembl_id&col=gd_pub_refseq_ids&col=gd_ccds_ids&col=gd_lsdb_links&status=Approved&status_opt=2&where=&order_by=gd_app_sym_sort&format=text&limit=&hgnc_dbtag=on&submit=submit",
"file" : "https://www.genenames.org/cgi-bin/download/custom?col=gd_hgnc_id&col=gd_app_sym&col=gd_app_name&col=gd_prev_sym&col=gd_aliases&col=gd_pub_eg_id&col=gd_pub_ensembl_id&col=gd_pub_refseq_ids&col=gd_ccds_ids&col=gd_lsdb_links&status=Approved&status_opt=2&where=&order_by=gd_app_sym_sort&format=text&limit=&hgnc_dbtag=on&submit=submit",
"db" : "ccds",
"priority" : 3
}
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -254,7 +254,7 @@
{
"name" : "Xenbase",
"parser" : "XenopusJamboreeParser",
"file" : "http://ftp.xenbase.org/pub/GenePageReports/GenePageEnsemblModelMapping.txt",
"file" : "http://ftp.xenbase.org/pub/GenePageReports/GenePageEnsemblModelMapping_4.1.txt",
"priority" : 1
},
{
Expand All @@ -269,7 +269,7 @@
{
"name" : "HGNC",
"parser" : "HGNCParser",
"file" : "https://www.genenames.org/cgi-bin/download?col=gd_hgnc_id&col=gd_app_sym&col=gd_app_name&col=gd_prev_sym&col=gd_aliases&col=gd_pub_eg_id&col=gd_pub_ensembl_id&col=gd_pub_refseq_ids&col=gd_ccds_ids&col=gd_lsdb_links&status=Approved&status_opt=2&where=&order_by=gd_app_sym_sort&format=text&limit=&hgnc_dbtag=on&submit=submit",
"file" : "https://www.genenames.org/cgi-bin/download/custom?col=gd_hgnc_id&col=gd_app_sym&col=gd_app_name&col=gd_prev_sym&col=gd_aliases&col=gd_pub_eg_id&col=gd_pub_ensembl_id&col=gd_pub_refseq_ids&col=gd_ccds_ids&col=gd_lsdb_links&status=Approved&status_opt=2&where=&order_by=gd_app_sym_sort&format=text&limit=&hgnc_dbtag=on&submit=submit",
"db" : "ccds",
"priority" : 3
}
Expand Down
Loading

0 comments on commit 3045fbb

Please sign in to comment.