docs/presentations/HiveWorkshop_Sept2013/solutions2.tar

==> CompressFiles2_conf.pm <==

=pod

=head1 NAME

    Bio::EnsEMBL::Hive::PipeConfig::CompressFiles2_conf

=head1 SYNOPSIS

    init_pipeline.pl Bio::EnsEMBL::Hive::PipeConfig::CompressFiles2_conf -password <your_password>

    seed_pipeline.pl -url <url> -logic_name find_files -input_id "{ 'directory' => 'dumps', 'only_files' => '*.sql' }"

    seed_pipeline.pl -url <url> -logic_name find_files -input_id "{ 'directory' => '$HOME/ncbi_taxonomy' }"

=head1 DESCRIPTION

    This is an example pipeline put together from three basic building blocks:

    Analysis_1: JobFactory.pm is used to turn the list of files in a given directory into jobs;
                these jobs are sent down branch #2 into the second analysis.

    Analysis_2: SystemCmd.pm is used to run these compression/decompression jobs in parallel.

    Analysis_3: NotifyByEmail.pm is a funnel that sends a report once all the compression jobs have finished.
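
    For illustration only (this paragraph is not part of the original config): each row that
    JobFactory parses from the output of the 'find' command becomes one dataflow event on
    branch #2, roughly equivalent to the following call inside a Runnable; the filename shown
    is hypothetical:

        $self->dataflow_output_id( { 'filename' => 'dumps/core_tables.sql' }, 2 );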

=head1 CONTACT

    Please subscribe to the Hive mailing list (http://listserver.ebi.ac.uk/mailman/listinfo/ehive-users) to discuss Hive-related questions or to be notified of our updates.

=cut
package Bio::EnsEMBL::Hive::PipeConfig::CompressFiles2_conf;
use strict;
use warnings;
use base ('Bio::EnsEMBL::Hive::PipeConfig::HiveGeneric_conf'); # All Hive databases configuration files should inherit from HiveGeneric, directly or indirectly

=head2 pipeline_analyses

    Description : Implements pipeline_analyses() interface method of Bio::EnsEMBL::Hive::PipeConfig::HiveGeneric_conf that defines the structure of the pipeline: analyses, jobs, rules, etc.
                  Here it defines three analyses:

                    * 'find_files'       generates a list of files whose names match the pattern #only_files#.
                                         Each job of this analysis will dataflow (create jobs) via branch #2 into the 'compress_a_file' analysis.

                    * 'compress_a_file'  actually performs the (un)zipping of the files in parallel.

                    * 'report_by_email'  is a funnel that sends a notification email once all 'compress_a_file' jobs in the fan have completed.

=cut
sub pipeline_analyses {
my ($self) = @_;
return [
{ -logic_name => 'find_files',
-module => 'Bio::EnsEMBL::Hive::RunnableDB::JobFactory',
-parameters => {
'inputcmd' => 'find #directory# -type f -name "#only_files#"',
'column_names' => [ 'filename' ],
'directory' => 'pdfs', # directory where both source and target files are located
'only_files' => '*', # any wildcard understood by shell
},
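            # Note (added commentary): in the -flow_into block below, '2->A' fans every
            # branch #2 event into 'compress_a_file' and groups those jobs under
            # semaphore 'A'; 'A->1' then funnels into 'report_by_email', which only
            # becomes runnable once the whole fan has completed.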
-flow_into => {
'2->A' => [ 'compress_a_file' ],
'A->1' => [ 'report_by_email' ],
},
},
{ -logic_name => 'compress_a_file',
-module => 'Bio::EnsEMBL::Hive::RunnableDB::SystemCmd',
-parameters => {
'cmd' => 'gzip #filename#',
},
-analysis_capacity => 4, # limit the number of workers that will be performing jobs in parallel
},
{ -logic_name => 'report_by_email',
-module => 'Bio::EnsEMBL::Hive::RunnableDB::NotifyByEmail',
-meadow_type => 'LOCAL', # this trivial analysis can run locally, without bothering the farm
-parameters => {
'email' => $ENV{'USER'} . '@sanger.ac.uk',
'subject' => 'pipeline has finished',
'text' => 'done compressing files in #directory#',
},
},
];
}
1;

==> CompressFiles3_conf.pm <==

=pod

=head1 NAME

    Bio::EnsEMBL::Hive::PipeConfig::CompressFiles3_conf

=head1 SYNOPSIS

    init_pipeline.pl Bio::EnsEMBL::Hive::PipeConfig::CompressFiles3_conf -password <your_password>

    seed_pipeline.pl -url <url> -logic_name find_files -input_id "{ 'directory' => 'dumps', 'only_files' => '*.sql' }"

    seed_pipeline.pl -url <url> -logic_name find_files -input_id "{ 'directory' => '$HOME/ncbi_taxonomy' }"

=head1 DESCRIPTION

    This is a version of CompressFiles2 that switches on the implicit parameter-stack mechanism and makes the gzip flags configurable. It is put together from three basic building blocks:

    Analysis_1: JobFactory.pm is used to turn the list of files in a given directory into jobs;
                these jobs are sent down branch #2 into the second analysis.

    Analysis_2: SystemCmd.pm is used to run these compression/decompression jobs in parallel.

    Analysis_3: NotifyByEmail.pm is a funnel that sends a report once all the compression jobs have finished.
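
    Because this version switches on the parameter stack (see hive_meta_table below), a
    parameter seeded at 'find_files' becomes visible to the downstream jobs; for example,
    decompression can be requested at seeding time without editing the config
    (an illustrative command, assuming the defaults defined below):

        seed_pipeline.pl -url <url> -logic_name find_files -input_id "{ 'directory' => 'dumps', 'gzip_flags' => '-d' }"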

=head1 CONTACT

    Please subscribe to the Hive mailing list (http://listserver.ebi.ac.uk/mailman/listinfo/ehive-users) to discuss Hive-related questions or to be notified of our updates.

=cut
package Bio::EnsEMBL::Hive::PipeConfig::CompressFiles3_conf;
use strict;
use warnings;
use base ('Bio::EnsEMBL::Hive::PipeConfig::HiveGeneric_conf'); # All Hive databases configuration files should inherit from HiveGeneric, directly or indirectly
sub hive_meta_table {
my ($self) = @_;
return {
%{$self->SUPER::hive_meta_table}, # here we inherit anything from the base class
'hive_use_param_stack' => 1, # switch on the implicit parameter propagation mechanism
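        # (added note) with the stack on, a job can read parameters set by its ancestor
        # jobs, so 'compress_a_file' and 'report_by_email' below can use #gzip_flags#
        # and #directory# without those values being dataflown explicitly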
};
}

=head2 pipeline_analyses

    Description : Implements pipeline_analyses() interface method of Bio::EnsEMBL::Hive::PipeConfig::HiveGeneric_conf that defines the structure of the pipeline: analyses, jobs, rules, etc.
                  Here it defines three analyses:

                    * 'find_files'       generates a list of files whose names match the pattern #only_files#.
                                         Each job of this analysis will dataflow (create jobs) via branch #2 into the 'compress_a_file' analysis.

                    * 'compress_a_file'  actually performs the (un)zipping of the files in parallel.

                    * 'report_by_email'  is a funnel that sends a notification email once all 'compress_a_file' jobs in the fan have completed.

=cut
sub pipeline_analyses {
my ($self) = @_;
return [
{ -logic_name => 'find_files',
-module => 'Bio::EnsEMBL::Hive::RunnableDB::JobFactory',
-parameters => {
'inputcmd' => 'find #directory# -type f -name "#only_files#"',
'column_names' => [ 'filename' ],
'directory' => 'pdfs', # directory where both source and target files are located
'only_files' => '*', # any wildcard understood by shell
},
-flow_into => {
'2->A' => [ 'compress_a_file' ],
'A->1' => [ 'report_by_email' ],
},
},
{ -logic_name => 'compress_a_file',
-module => 'Bio::EnsEMBL::Hive::RunnableDB::SystemCmd',
-parameters => {
'cmd' => 'gzip #gzip_flags# #filename#',
# defaults for the parameters used in this analysis:
'gzip_flags' => '', # can be set to '-d' for decompression
},
-analysis_capacity => 4, # limit the number of workers that will be performing jobs in parallel
},
{ -logic_name => 'report_by_email',
-module => 'Bio::EnsEMBL::Hive::RunnableDB::NotifyByEmail',
-parameters => {
'email' => $ENV{'USER'} . '@sanger.ac.uk',
'subject' => 'pipeline has finished',
'text' => 'done compressing files in #directory#',
},
},
];
}
1;

==> CompressFiles4_conf.pm <==

package Bio::EnsEMBL::Hive::PipeConfig::CompressFiles4_conf;
use strict;
use warnings;
use base ('Bio::EnsEMBL::Hive::PipeConfig::HiveGeneric_conf'); # All Hive databases configuration files should inherit from HiveGeneric, directly or indirectly
sub hive_meta_table {
my ($self) = @_;
return {
%{$self->SUPER::hive_meta_table}, # here we inherit anything from the base class
'hive_use_param_stack' => 1, # switch on the new param_stack mechanism
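        # (added note) the stack is also what lets 'post_compress_size' read #filename#
        # set upstream, without an explicit dataflow of that parameter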
};
}

=head2 pipeline_analyses

    Description : Implements pipeline_analyses() interface method of Bio::EnsEMBL::Hive::PipeConfig::HiveGeneric_conf that defines the structure of the pipeline: analyses, jobs, rules, etc.
                  Here it defines five analyses:

                    * 'find_files'          generates a list of files whose names match the pattern #only_files#.
                                            Each job of this analysis will dataflow (create jobs) via branch #2 into the 'pre_compress_size' analysis.

                    * 'pre_compress_size'   measures each file before compression and flows the size into the 'size' accumulator.

                    * 'compress_a_file'     actually performs the (un)zipping of the files in parallel.

                    * 'post_compress_size'  measures each file after compression and flows the size into the 'comp_size' accumulator.

                    * 'report_by_email'     is a funnel that emails a report, including min/max compressed sizes, once the whole fan has completed.

=cut
sub pipeline_analyses {
my ($self) = @_;
return [
{ -logic_name => 'find_files',
-module => 'Bio::EnsEMBL::Hive::RunnableDB::JobFactory',
-parameters => {
'inputcmd' => 'find #directory# -type f -name "#only_files#"',
'column_names' => [ 'filename' ],
'directory' => 'pdfs', # directory where both source and target files are located
'only_files' => '*', # any wildcard understood by shell
},
-flow_into => {
'2->A' => [ 'pre_compress_size' ], # will create a fan of jobs
'A->1' => [ 'report_by_email' ],
},
},
{ -logic_name => 'pre_compress_size',
-module => 'Bio::EnsEMBL::Hive::RunnableDB::JobFactory', # JobFactory is used here to capture the command's output as dataflow
-parameters => {
'inputcmd' => "wc -c #filename# | sed -e 's/^ *//' ",
'delimiter' => ' ',
'column_names' => [ 'size', 'filename' ],
},
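            # Note (added commentary): on branch 2 below, ':////accu?size={filename}'
            # flows each (size, filename) row into the pipeline-wide 'size' accumulator,
            # building a {filename => size} hash that the semaphored funnel job
            # ('report_by_email') can later read; branch 1 carries on to the actual compression.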
-flow_into => {
2 => [ ':////accu?size={filename}' ],
1 => [ 'compress_a_file' ],
},
},
{ -logic_name => 'compress_a_file',
-module => 'Bio::EnsEMBL::Hive::RunnableDB::SystemCmd',
-parameters => {
'cmd' => 'gzip #gzip_flags# #filename#',
'gzip_flags' => '', # can be set to '-d' for decompression
},
-analysis_capacity => 4, # limit the number of workers that will be performing jobs in parallel
-flow_into => {
1 => [ 'post_compress_size' ],
},
},
{ -logic_name => 'post_compress_size',
-module => 'Bio::EnsEMBL::Hive::RunnableDB::JobFactory', # JobFactory is used here to capture the command's output as dataflow
-parameters => {
'inputcmd' => "wc -c #filename#.gz | sed -e 's/^ *//' ",
'delimiter' => ' ',
'column_names' => [ 'comp_size', 'comp_filename' ],
},
-flow_into => {
2 => [ ':////accu?comp_size={comp_filename}' ],
},
},
{ -logic_name => 'report_by_email',
-module => 'Bio::EnsEMBL::Hive::RunnableDB::NotifyByEmail',
-parameters => {
'email' => $ENV{'USER'} . '@ebi.ac.uk',
'subject' => 'pipeline has finished',
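                # Note (added commentary): '#expr(...)expr#' is evaluated as a Perl
                # expression at runtime, after '#comp_size#' has been substituted with
                # the accumulated {comp_filename => comp_size} hash. A hypothetical
                # worked example of what the two lines below compute:
                #     my %comp_size = ( 'pdfs/a.pdf.gz' => 12345, 'pdfs/b.pdf.gz' => 6789 );
                #     my $min = min values %comp_size;    # 6789
                #     my $max = max values %comp_size;    # 12345
                # (min/max behave as List::Util's; the original config relies on their
                # being available inside #expr# expressions)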
'min_comp_size' => '#expr(min values %{#comp_size#})expr#',
'max_comp_size' => '#expr(max values %{#comp_size#})expr#',
'text' => 'done compressing files in #directory#, compressed sizes between #min_comp_size# and #max_comp_size#',
},
},
];
}
1;