my ($self) = @_;
return [
## First analysis: PECAN
{ -logic_name => 'pecan',
-module => 'Bio::EnsEMBL::Hive::RunnableDB::SystemCmd',
-parameters => {
# The cmd parameter is required by the SystemCmd module. It defines the command line to be run.
# Note that some values are written between #hashes#. These will be substituted with the corresponding input values.
'cmd' => 'java -cp /soft/pecan_v0.8/pecan_v0.8.jar bp.pecan.Pecan -E "#tree_string#" -F #input_files# -G #msa_file#',
},
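# For illustration, with the first input_id below the substituted command would be:
#   java -cp /soft/pecan_v0.8/pecan_v0.8.jar bp.pecan.Pecan -E "(((HUMAN,(MOUSE,RAT)),COW),OPOSSUM);" -F human.fa mouse.fa rat.fa cow.fa opossum.fa -G pecan_no_chicken.mfa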
-hive_capacity => 200, # max. number of parallel jobs
-input_ids => [
# Each input_id is a new job for this analysis. Here we define the tree_string, the input_files
# and the msa_file for three different jobs.
{
'tree_string' => '(((HUMAN,(MOUSE,RAT)),COW),OPOSSUM);',
'input_files' => 'human.fa mouse.fa rat.fa cow.fa opossum.fa',
'msa_file' => "pecan_no_chicken.mfa",
},
{
'tree_string' => '((((HUMAN,MOUSE),COW),OPOSSUM),CHICKEN);',
'input_files' => 'human.fa mouse.fa cow.fa opossum.fa chicken.fa',
'msa_file' => "pecan_no_rat.mfa",
},
{
'tree_string' => '(((HUMAN,COW),OPOSSUM),CHICKEN);',
'input_files' => 'human.fa cow.fa opossum.fa chicken.fa',
'msa_file' => "pecan_no_rodents.mfa",
},
],
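# Each of the three hashes above seeds one 'pecan' job, so three alignments (one without
# chicken, one without rat, one without either rodent) can run in parallel, subject to
# the -hive_capacity limit above.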
-flow_into => {
# Dataflow rule: once a 'pecan' job is done, it will create a new 'gerp_col' job.
# On branch 1 the new job inherits the input_id of its parent, so the 'tree_string',
# 'input_files' and 'msa_file' values are all passed on to the new 'gerp_col' job
# (only 'msa_file' is actually required by it).
1 => [ 'gerp_col' ],
},
},
## Second analysis: GERP_COL
{ -logic_name => 'gerp_col',
-module => 'Bio::EnsEMBL::Hive::RunnableDB::SystemCmd',
-parameters => {
# In this case, #msa_file# comes from the parent 'pecan' job.
'cmd' => 'gerpcol -t tree.nw -f #msa_file# -a -e HUMAN',
},
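# For illustration, a 'gerp_col' job flowed from the first 'pecan' job above would run:
#   gerpcol -t tree.nw -f pecan_no_chicken.mfa -a -e HUMAN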
-hive_capacity => 20, # max. number of parallel jobs
-flow_into => {
# Dataflow rule, branch 1. The new job inherits the input_id of its parent, i.e. the
# 'tree_string', 'input_files' and 'msa_file' values are used to create a new
# 'gerp_elem' job (only 'msa_file' is actually required by it).
1 => [ 'gerp_elem' ],
},
},
## Third analysis: GERP_ELEM
{ -logic_name => 'gerp_elem',
-module => 'Bio::EnsEMBL::Hive::RunnableDB::SystemCmd',
-parameters => {
# In this case, #msa_file# comes from the parent 'gerp_col' job, which in turn inherited it from its own parent 'pecan' job.
'cmd' => 'gerpelem -f #msa_file#.rates -c chr13 -s 32878016 -x .bed',
},
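# For illustration, a 'gerp_elem' job descending from the first 'pecan' job above would run:
#   gerpelem -f pecan_no_chicken.mfa.rates -c chr13 -s 32878016 -x .bed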
-hive_capacity => 200, # max. number of parallel jobs
},
];
}
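# A minimal sketch (not part of the original code) of the PipeConfig module that would wrap
# the pipeline_analyses() method above. The package name and the database URL are
# hypothetical placeholders; exact script options may vary between eHive versions.
#
#   package PecanGerp_conf;
#
#   use strict;
#   use warnings;
#   use base ('Bio::EnsEMBL::Hive::PipeConfig::HiveGeneric_conf');
#
#   sub pipeline_analyses {
#       # ... the method shown above ...
#   }
#
#   1;
#
# The pipeline would then typically be initialised and run with the standard eHive scripts:
#
#   init_pipeline.pl PecanGerp_conf -pipeline_url mysql://user:pass@host/pecan_gerp_db
#   beekeeper.pl -url mysql://user:pass@host/pecan_gerp_db -loop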