Ensembl/ensembl-hive
docs/presentations/HiveWorkshop_Sept2013/solutions2.tar

CompressFiles2_conf.pm

=pod

=head1 NAME

  Bio::EnsEMBL::Hive::PipeConfig::CompressFiles2_conf

=head1 SYNOPSIS

    init_pipeline.pl Bio::EnsEMBL::Hive::PipeConfig::CompressFiles2_conf -password <your_password>

    seed_pipeline.pl -url <url> -logic_name find_files -input_id "{ 'directory' => 'dumps', 'only_files' => '*.sql' }"

    seed_pipeline.pl -url <url> -logic_name find_files -input_id "{ 'directory' => '$HOME/ncbi_taxonomy' }"

=head1 DESCRIPTION

    This is an example pipeline put together from three basic building blocks:

    Analysis_1: JobFactory.pm is used to turn the list of files in a given directory into jobs

        these jobs are sent down branch #2 into the second analysis
        (a sketch of the underlying dataflow call is shown below)

    Analysis_2: SystemCmd.pm is used to run these compression/decompression jobs in parallel.

    Analysis_3: NotifyByEmail.pm sends a notification e-mail once the whole group of compression jobs has completed.

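    For illustration only, here is roughly what the JobFactory runnable does for every
    file it finds: each dataflow_output_id() call on branch #2 becomes one job of the
    next analysis. The runnable below is a hypothetical stand-in written for this
    sketch (non-recursive, no #only_files# filtering), not a module shipped with eHive:

        package Bio::EnsEMBL::Hive::RunnableDB::FindFilesSketch;   # hypothetical name

        use strict;
        use warnings;

        use base ('Bio::EnsEMBL::Hive::Process');

        sub run {
            my $self = shift;

            my $directory = $self->param_required('directory');

            foreach my $filename (glob("$directory/*")) {
                # one dataflow event on branch #2 == one new 'compress_a_file' job:
                $self->dataflow_output_id( { 'filename' => $filename }, 2 );
            }
        }

        1;
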
=head1 CONTACT

    Please subscribe to the Hive mailing list:  http://listserver.ebi.ac.uk/mailman/listinfo/ehive-users  to discuss Hive-related questions or to be notified of our updates

=cut


package Bio::EnsEMBL::Hive::PipeConfig::CompressFiles2_conf;

use strict;
use warnings;

use base ('Bio::EnsEMBL::Hive::PipeConfig::HiveGeneric_conf');  # All Hive database configuration files should inherit from HiveGeneric, directly or indirectly


=head2 pipeline_analyses

    Description : Implements pipeline_analyses() interface method of Bio::EnsEMBL::Hive::PipeConfig::HiveGeneric_conf that defines the structure of the pipeline: analyses, jobs, rules, etc.
                  Here it defines three analyses:

                    * 'find_files'          generates a list of files whose names match the pattern #only_files#
                                            Each job of this analysis will dataflow (create jobs) via branch #2 into the 'compress_a_file' analysis.

                    * 'compress_a_file'     actually performs the (un)zipping of the files in parallel

                    * 'report_by_email'     runs only once the whole group of 'compress_a_file' jobs has completed and sends a short notification e-mail

=cut

sub pipeline_analyses {
    my ($self) = @_;
    return [
        {   -logic_name => 'find_files',
            -module     => 'Bio::EnsEMBL::Hive::RunnableDB::JobFactory',
            -parameters => {
                'inputcmd'     => 'find #directory# -type f -name "#only_files#"',
                'column_names' => [ 'filename' ],

                'directory'     => 'pdfs',  # directory where both source and target files are located
                'only_files'    => '*',     # any wildcard understood by shell
            },
            -flow_into => {
                '2->A' => [ 'compress_a_file' ],    # fan: one semaphored job per dataflow event on branch #2
                'A->1' => [ 'report_by_email' ],    # funnel: runs only once the whole fan has completed
            },
        },

        {   -logic_name    => 'compress_a_file',
            -module        => 'Bio::EnsEMBL::Hive::RunnableDB::SystemCmd',
            -parameters    => {
                'cmd'       => 'gzip #filename#',
            },
            -analysis_capacity => 4,            # limit the number of workers that will be performing jobs in parallel
        },

        {   -logic_name    => 'report_by_email',
            -module        => 'Bio::EnsEMBL::Hive::RunnableDB::NotifyByEmail',
            -meadow_type   => 'LOCAL',              # run this analysis on the local machine rather than submitting it to the farm
            -parameters    => {
                'email'   => $ENV{'USER'} . '@sanger.ac.uk',
                'subject' => 'pipeline has finished',
                'text'      => 'done compressing files in #directory#',
            },
        },
    ];
}

1;

CompressFiles3_conf.pm

=pod

=head1 NAME

  Bio::EnsEMBL::Hive::PipeConfig::CompressFiles3_conf

=head1 SYNOPSIS

    init_pipeline.pl Bio::EnsEMBL::Hive::PipeConfig::CompressFiles3_conf -password <your_password>

    seed_pipeline.pl -url <url> -logic_name find_files -input_id "{ 'directory' => 'dumps', 'only_files' => '*.sql' }"

    seed_pipeline.pl -url <url> -logic_name find_files -input_id "{ 'directory' => '$HOME/ncbi_taxonomy' }"

=head1 DESCRIPTION

    This is an example pipeline put together from three basic building blocks:

    Analysis_1: JobFactory.pm is used to turn the list of files in a given directory into jobs

        these jobs are sent down branch #2 into the second analysis

    Analysis_2: SystemCmd.pm is used to run these compression/decompression jobs in parallel.

    Analysis_3: NotifyByEmail.pm sends a notification e-mail once the whole group of compression jobs has completed.

=head1 CONTACT

    Please subscribe to the Hive mailing list:  http://listserver.ebi.ac.uk/mailman/listinfo/ehive-users  to discuss Hive-related questions or to be notified of our updates

=cut


package Bio::EnsEMBL::Hive::PipeConfig::CompressFiles3_conf;

use strict;
use warnings;

use base ('Bio::EnsEMBL::Hive::PipeConfig::HiveGeneric_conf');  # All Hive database configuration files should inherit from HiveGeneric, directly or indirectly


sub hive_meta_table {
    my ($self) = @_;
    return {
        %{$self->SUPER::hive_meta_table},       # here we inherit anything from the base class

        'hive_use_param_stack'  => 1,           # switch on the implicit parameter propagation mechanism
    };
}


=head2 pipeline_analyses

    Description : Implements pipeline_analyses() interface method of Bio::EnsEMBL::Hive::PipeConfig::HiveGeneric_conf that defines the structure of the pipeline: analyses, jobs, rules, etc.
                  Here it defines three analyses:

                    * 'find_files'          generates a list of files whose names match the pattern #only_files#
                                            Each job of this analysis will dataflow (create jobs) via branch #2 into the 'compress_a_file' analysis.

                    * 'compress_a_file'     actually performs the (un)zipping of the files in parallel
                                            (the 'gzip_flags' parameter selects between compression and decompression)

                    * 'report_by_email'     runs only once the whole group of 'compress_a_file' jobs has completed and sends a short notification e-mail

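    With 'hive_use_param_stack' switched on in hive_meta_table() above, a job can also
    read the parameters of its ancestor jobs without them being dataflowed explicitly:
    'compress_a_file' jobs only receive 'filename' on branch #2, yet 'directory' from
    the seeding job remains visible to them. A hypothetical runnable substituted for
    SystemCmd could therefore do something like this (a sketch for illustration only,
    not part of this pipeline):

        sub run {
            my $self = shift;

            my $filename   = $self->param_required('filename');    # dataflowed here on branch #2
            my $directory  = $self->param('directory');            # inherited from the seeding job via the param stack
            my $gzip_flags = $self->param('gzip_flags') // '';     # analysis-level default, '' or '-d'

            system("gzip $gzip_flags $filename") == 0
                or die "gzip failed on '$filename' (found under '$directory')";
        }
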
=cut

sub pipeline_analyses {
    my ($self) = @_;
    return [
        {   -logic_name => 'find_files',
            -module     => 'Bio::EnsEMBL::Hive::RunnableDB::JobFactory',
            -parameters => {
                'inputcmd'     => 'find #directory# -type f -name "#only_files#"',
                'column_names' => [ 'filename' ],

                'directory'     => 'pdfs',  # directory where both source and target files are located
                'only_files'    => '*',     # any wildcard understood by shell
            },
            -flow_into => {
                '2->A' => [ 'compress_a_file' ],
                'A->1' => [ 'report_by_email' ],
            },
        },

        {   -logic_name    => 'compress_a_file',
            -module        => 'Bio::EnsEMBL::Hive::RunnableDB::SystemCmd',
            -parameters    => {
                'cmd'       => 'gzip #gzip_flags# #filename#',

                    # defaults for the parameters used in this analysis:
                'gzip_flags'    => '',      # can be set to '-d' for decompression
            },
            -analysis_capacity => 4,            # limit the number of workers that will be performing jobs in parallel
        },

        {   -logic_name    => 'report_by_email',
            -module        => 'Bio::EnsEMBL::Hive::RunnableDB::NotifyByEmail',
            -parameters    => {
                'email'   => $ENV{'USER'} . '@sanger.ac.uk',
                'subject' => 'pipeline has finished',
                'text'      => 'done compressing files in #directory#',
            },
        },
    ];
}

1;

CompressFiles4_conf.pm

package Bio::EnsEMBL::Hive::PipeConfig::CompressFiles4_conf;

use strict;
use warnings;

use base ('Bio::EnsEMBL::Hive::PipeConfig::HiveGeneric_conf');  # All Hive database configuration files should inherit from HiveGeneric, directly or indirectly


sub hive_meta_table {
    my ($self) = @_;
    return {
        %{$self->SUPER::hive_meta_table},       # here we inherit anything from the base class

        'hive_use_param_stack'  => 1,           # switch on the new param_stack mechanism
    };
}


=head2 pipeline_analyses

    Description : Implements pipeline_analyses() interface method of Bio::EnsEMBL::Hive::PipeConfig::HiveGeneric_conf that defines the structure of the pipeline: analyses, jobs, rules, etc.
                  Here it defines five analyses:

                    * 'find_files'              generates a list of files whose names match the pattern #only_files#
                                                Each job of this analysis will dataflow (create jobs) via branch #2 into the 'pre_compress_size' analysis.

                    * 'pre_compress_size'       records the size of each file before compression in the 'size' accumulator, then flows into 'compress_a_file'

                    * 'compress_a_file'         actually performs the (un)zipping of the files in parallel

                    * 'post_compress_size'      records the size of each compressed file in the 'comp_size' accumulator

                    * 'report_by_email'         runs only once the whole group of jobs above has completed and reports the smallest and largest compressed sizes by e-mail (see the sketch below)

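    The accumulated values come back to the funnel job 'report_by_email' as ordinary
    parameters: 'comp_size' arrives as a hash keyed by the compressed file names, which
    is what the #expr(...)expr# substitutions in the 'report_by_email' parameters below
    operate on. A hypothetical runnable could read the same accumulator like this
    (a sketch for illustration only, not part of this pipeline):

        sub fetch_input {
            my $self = shift;

            # 'comp_size' is filled by the 'post_compress_size' fan jobs:
            # a hashref of { compressed_file_name => size_in_bytes }
            my %comp_size = %{ $self->param_required('comp_size') };

            my @sizes = sort { $a <=> $b } values %comp_size;

            $self->param('min_comp_size', $sizes[0]);       # same value as the #expr(min ...)expr# below
            $self->param('max_comp_size', $sizes[-1]);      # same value as the #expr(max ...)expr# below
        }
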
=cut

sub pipeline_analyses {
    my ($self) = @_;
    return [
        {   -logic_name => 'find_files',
            -module     => 'Bio::EnsEMBL::Hive::RunnableDB::JobFactory',
            -parameters => {
                'inputcmd'     => 'find #directory# -type f -name "#only_files#"',
                'column_names' => [ 'filename' ],

                'directory'     => 'pdfs',  # directory where both source and target files are located
                'only_files'    => '*',     # any wildcard understood by shell
            },
            -flow_into => {
                '2->A' => [ 'pre_compress_size' ],     # will create a fan of jobs
                'A->1' => [ 'report_by_email' ],       # funnel job, blocked until the whole fan has completed
            },
        },

        {   -logic_name    => 'pre_compress_size',
            -module        => 'Bio::EnsEMBL::Hive::RunnableDB::JobFactory', # JobFactory is used here to capture the command's output
            -parameters    => {
                'inputcmd'      => "wc -c #filename# | sed -e 's/^ *//' ",
                'delimiter'     => ' ',
                'column_names'  => [ 'size', 'filename' ],
            },
            -flow_into => {
                2 => [ ':////accu?size={filename}' ],   # accumulate the original file size, keyed by file name
                1 => [ 'compress_a_file' ],             # autoflow into the actual compression step
            },
        },

        {   -logic_name    => 'compress_a_file',
            -module        => 'Bio::EnsEMBL::Hive::RunnableDB::SystemCmd',
            -parameters    => {
                'cmd'           => 'gzip #gzip_flags# #filename#',

                'gzip_flags'    => '',      # can be set to '-d' for decompression
            },
            -analysis_capacity => 4,            # limit the number of workers that will be performing jobs in parallel
            -flow_into => {
                1 => [ 'post_compress_size' ],
            },
        },

        {   -logic_name    => 'post_compress_size',
            -module        => 'Bio::EnsEMBL::Hive::RunnableDB::JobFactory', # JobFactory is used here to capture the command's output
            -parameters    => {
                'inputcmd'      => "wc -c #filename#.gz | sed -e 's/^ *//' ",
                'delimiter'     => ' ',
                'column_names'  => [ 'comp_size', 'comp_filename' ],
            },
            -flow_into => {
                2 => [ ':////accu?comp_size={comp_filename}' ],     # accumulate the compressed file size, keyed by compressed file name
            },
        },

        {   -logic_name    => 'report_by_email',
            -module        => 'Bio::EnsEMBL::Hive::RunnableDB::NotifyByEmail',
            -parameters    => {
                'email'   => $ENV{'USER'} . '@ebi.ac.uk',
                'subject' => 'pipeline has finished',
                'min_comp_size' => '#expr(min values %{#comp_size#})expr#',     # smallest compressed size collected in the accumulator
                'max_comp_size' => '#expr(max values %{#comp_size#})expr#',     # largest compressed size collected in the accumulator
                'text'    => 'done compressing files in #directory#, compressed sizes between #min_comp_size# and #max_comp_size#',
            },
        },

    ];
}

1;