Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

run mcl and inflate results #4

Merged
merged 2 commits into from
Apr 23, 2013
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 2 additions & 0 deletions dist.ini
Original file line number Diff line number Diff line change
Expand Up @@ -14,6 +14,8 @@ repository.type = git
requires = blastp
requires = makeblastdb
requires = cd-hit
requires = mcl
requires = mcxdeblast

[@Basic]
[PruneCruft]
Expand Down
73 changes: 71 additions & 2 deletions lib/Bio/PanGenome.pm
Original file line number Diff line number Diff line change
@@ -1,8 +1,77 @@
package Bio::PanGenome;

# ABSTRACT: Create a pan genome

=head1 SYNOPSIS

Create a pan genome

=cut

use Moose;
# ABSTRACT: Pan Genomes
use Bio::PanGenome::ParallelAllAgainstAllBlast;
use Bio::PanGenome::CombinedProteome;
use Bio::PanGenome::External::Cdhit;
use Bio::PanGenome::External::Mcl;
use Bio::PanGenome::InflateClusters;

# Input protein FASTA files; handed to CombinedProteome as proteome_files.
has 'fasta_files' => ( is => 'rw', isa => 'ArrayRef' );
# Name of the final output file; passed to InflateClusters as output_file.
has 'output_filename' => ( is => 'rw', isa => 'Str', default => 'clustered_proteins' );
# Job runner name forwarded to ParallelAllAgainstAllBlast.
has 'job_runner' => ( is => 'rw', isa => 'Str', default => 'LSF' );
# Executable names forwarded to ParallelAllAgainstAllBlast.
has 'makeblastdb_exec' => ( is => 'rw', isa => 'Str', default => 'makeblastdb' );
has 'blastp_exec' => ( is => 'rw', isa => 'Str', default => 'blastp' );
# Executable names forwarded to External::Mcl; read-only, so set them at construction.
has 'mcxdeblast_exec' => ( is => 'ro', isa => 'Str', default => 'mcxdeblast' );
has 'mcl_exec' => ( is => 'ro', isa => 'Str', default => 'mcl' );

# Drive the full pipeline, one stage after another:
#   CombinedProteome -> Cdhit -> ParallelAllAgainstAllBlast -> Mcl -> InflateClusters
# Stages hand results to each other through the fixed intermediate file names
# listed below; only the final output name comes from the object.
# Returns whatever InflateClusters->inflate returns.
sub run {
    my ($self) = @_;

    # Intermediate file names shared between the stages.
    my %intermediate = (
        combined => 'combined_files.faa',
        cd_hit   => 'clustered.faa',
        blast    => 'blast_results',
        mcl      => 'uninflated_mcl_groups',
    );

    # Stage 1: merge all input proteomes into one file.
    my $combiner = Bio::PanGenome::CombinedProteome->new(
        proteome_files        => $self->fasta_files,
        output_filename       => $intermediate{combined},
        apply_unknowns_filter => 1,
    );
    $combiner->create_combined_proteome_file;

    # Stage 2: run Cdhit on the combined file.
    my $cdhit = Bio::PanGenome::External::Cdhit->new(
        input_file  => $intermediate{combined},
        output_base => $intermediate{cd_hit},
    );
    $cdhit->run();

    # Stage 3: all-against-all blast of the Cdhit output.
    my $blaster = Bio::PanGenome::ParallelAllAgainstAllBlast->new(
        fasta_file              => $intermediate{cd_hit},
        blast_results_file_name => $intermediate{blast},
        job_runner              => $self->job_runner,
        makeblastdb_exec        => $self->makeblastdb_exec,
        blastp_exec             => $self->blastp_exec,
    );
    $blaster->run();

    # Stage 4: run MCL over the blast results.
    my $mcl_runner = Bio::PanGenome::External::Mcl->new(
        blast_results   => $intermediate{blast},
        mcxdeblast_exec => $self->mcxdeblast_exec,
        mcl_exec        => $self->mcl_exec,
        output_file     => $intermediate{mcl},
    );
    $mcl_runner->run();

    # Stage 5: combine the Cdhit cluster file with the MCL groups into the
    # final output file.
    my $inflater = Bio::PanGenome::InflateClusters->new(
        clusters_filename => $cdhit->clusters_filename,
        mcl_filename      => $intermediate{mcl},
        output_file       => $self->output_filename,
    );

    # TODO: Cleanup files (intermediate files are currently left in place).
    return $inflater->inflate();
}

no Moose;
__PACKAGE__->meta->make_immutable;

1;
1;
95 changes: 45 additions & 50 deletions lib/Bio/PanGenome/CommandLine/CreatePanGenome.pm
Original file line number Diff line number Diff line change
@@ -1,47 +1,59 @@
package Bio::PanGenome::CommandLine::CreatePanGenome;

# ABSTRACT: Create a pan genome from a set of proteins in a FASTA file
# ABSTRACT: Take in FASTA files of proteins and cluster them

=head1 SYNOPSIS

Create a pan genome from a set of proteins in a FASTA file
Take in FASTA files of proteins and cluster them

=cut

use Moose;
use Getopt::Long qw(GetOptionsFromArray);
use Bio::PanGenome::CombinedProteome;
use Bio::PanGenome::External::Cdhit;
use Bio::PanGenome::External::Makeblastdb;
use Bio::PanGenome::External::Blastp;
use Bio::PanGenome::GGFile;
use Bio::PanGenome;

has 'args' => ( is => 'ro', isa => 'ArrayRef', required => 1 );
has 'script_name' => ( is => 'ro', isa => 'Str', required => 1 );
has 'help' => ( is => 'rw', isa => 'Bool', default => 0 );

has 'fasta_files' => ( is => 'rw', isa => 'ArrayRef' );
has 'output_filename' => ( is => 'rw', isa => 'Str' );
has 'args' => ( is => 'ro', isa => 'ArrayRef', required => 1 );
has 'script_name' => ( is => 'ro', isa => 'Str', required => 1 );
has 'help' => ( is => 'rw', isa => 'Bool', default => 0 );

has '_error_message' => ( is => 'rw', isa => 'Str' );
# Input FASTA files to cluster.
has 'fasta_files' => ( is => 'rw', isa => 'ArrayRef' );
# Name of the final output file (-o / --output).
has 'output_filename' => ( is => 'rw', isa => 'Str', default => 'clustered_proteins' );
# Job runner name (-j / --job_runner).
has 'job_runner' => ( is => 'rw', isa => 'Str', default => 'LSF' );
# External executables. All of these are 'rw' because BUILD overrides the
# defaults through the accessor after parsing the command line options; a
# read-only attribute would make that call die.
has 'makeblastdb_exec' => ( is => 'rw', isa => 'Str', default => 'makeblastdb' );
has 'blastp_exec' => ( is => 'rw', isa => 'Str', default => 'blastp' );
has 'mcxdeblast_exec' => ( is => 'rw', isa => 'Str', default => 'mcxdeblast' );
has 'mcl_exec' => ( is => 'rw', isa => 'Str', default => 'mcl' );

has '_error_message' => ( is => 'rw', isa => 'Str' );

sub BUILD {
my ($self) = @_;

my ( $fasta_files, $output_filename, $help );
my ( $fasta_files, $output_filename, $job_runner, $makeblastdb_exec,$mcxdeblast_exec,$mcl_exec, $blastp_exec, $help );

GetOptionsFromArray(
$self->args,
'o|output=s' => \$output_filename,
'h|help' => \$help,
'o|output=s' => \$output_filename,
'j|job_runner=s' => \$job_runner,
'm|makeblastdb_exec=s' => \$makeblastdb_exec,
'b|blastp_exec=s' => \$blastp_exec,
'd|mcxdeblast_exec' => \$mcxdeblast_exec,
'c|mcl_exec' => \$mcl_exec,
'h|help' => \$help,
);

$self->output_filename($output_filename) if ( defined($output_filename) );


if ( @{ $self->args } == 0 ) {
$self->_error_message("Error: You need to provide at least 1 FASTA file");
$self->_error_message("Error: You need to provide a FASTA file");
}

$self->output_filename($output_filename) if ( defined($output_filename) );
$self->job_runner($job_runner) if ( defined($job_runner) );
$self->makeblastdb_exec($makeblastdb_exec) if ( defined($makeblastdb_exec) );
$self->blastp_exec($blastp_exec) if ( defined($blastp_exec) );
$self->mcxdeblast_exec($mcxdeblast_exec) if ( defined($mcxdeblast_exec) );
$self->mcl_exec($mcl_exec) if ( defined($mcl_exec) );

for my $filename ( @{ $self->args } ) {
if ( !-e $filename ) {
$self->_error_message("Error: Cant access file $filename");
Expand All @@ -60,46 +72,29 @@ sub run {
print $self->_error_message . "\n";
die $self->usage_text;
}

my $combined_proteome_obj = Bio::PanGenome::CombinedProteome->new(
proteome_files => $self->fasta_files,
output_filename => 'combined_proteome.faa'

my $pan_genome_obj = Bio::PanGenome->new(
fasta_files => $self->fasta_files,
output_filename => $self->output_filename,
job_runner => $self->job_runner,
makeblastdb_exec => $self->makeblastdb_exec,
blastp_exec => $self->blastp_exec
);
$combined_proteome_obj->create_combined_proteome_file;
print "Created combined file:\n";
my $percentage_sequences_ignored = (($combined_proteome_obj->number_of_sequences_ignored/$combined_proteome_obj->number_of_sequences_seen)*100);
print $percentage_sequences_ignored." percent of sequences ignored\n";

print "Clustering the data:\n";
my $cdhit_obj = Bio::PanGenome::External::Cdhit->new( input_file => 'combined_proteome.faa', output_base => 'clustered.faa');
$cdhit_obj->run();

print "Creating a blast database:\n";
my $blast_database= Bio::PanGenome::External::Makeblastdb->new(fasta_file => 'clustered.faa');
$blast_database->run();

print "Blasting all against all:\n";
my $blastp_obj = Bio::PanGenome::External::Blastp->new(
fasta_file => 'clustered.faa',
blast_database => $blast_database->output_database,
output_file => 'results.out'
);
$blastp_obj->run();

$pan_genome_obj->run();
}

sub usage_text {
my ($self) = @_;

return <<USAGE;
Usage: create_pan_geneome [options]
Create a pan genome from a set of proteins in a FASTA file

# Create a pan genome from some FASTA files
create_pan_geneome *.faa
Take in FASTA files of proteins and cluster them
# Take in FASTA files of proteins and cluster them
create_pan_geneome example.faa

# Provide an output filename
create_pan_geneome -o outputfile.faa *.faa
create_pan_geneome -o results *.faa

# This help message
create_pan_geneome -h
Expand Down
6 changes: 6 additions & 0 deletions lib/Bio/PanGenome/External/Cdhit.pm
Original file line number Diff line number Diff line change
Expand Up @@ -28,6 +28,12 @@ has '_length_difference_cutoff' => ( is => 'ro', isa => 'Num', default => 0
has '_sequence_identity_threshold' => ( is => 'ro', isa => 'Num', default => 0.99 );
has '_logging' => ( is => 'ro', isa => 'Str', default => '2> /dev/null' );

# Name of the cluster file that goes with this run's output:
# the output base with a '.clstr' extension appended.
sub clusters_filename {
    my ($self) = @_;

    my @name_parts = ( $self->output_base, 'clstr' );
    return join( '.', @name_parts );
}

sub _command_to_run {
my ($self) = @_;
return join(
Expand Down
54 changes: 54 additions & 0 deletions lib/Bio/PanGenome/External/Mcl.pm
Original file line number Diff line number Diff line change
@@ -0,0 +1,54 @@
package Bio::PanGenome::External::Mcl;

# ABSTRACT: Wrapper around MCL which takes in blast results and outputs clustered results

=head1 SYNOPSIS

Wrapper around MCL which takes in blast results and outputs clustered results

use Bio::PanGenome::External::Mcl;

my $mcl= Bio::PanGenome::External::Mcl->new(
blast_results => 'db',
mcxdeblast_exec => 'mcxdeblast',
mcl_exec => 'mcl',
output_file => 'output.groups'
);

$mcl->run();

=cut

use Moose;

# Blast results file given to mcxdeblast as its input argument (required).
has 'blast_results' => ( is => 'ro', isa => 'Str', required => 1 );
# Names of the two executables that make up the pipeline.
has 'mcxdeblast_exec' => ( is => 'ro', isa => 'Str', default => 'mcxdeblast' );
has 'mcl_exec' => ( is => 'ro', isa => 'Str', default => 'mcl' );
# File name passed to mcl with -o.
has 'output_file' => ( is => 'ro', isa => 'Str', default => 'output_groups' );

# Value passed to mcl with -I.
has '_inflation_value' => ( is => 'ro', isa => 'Num', default => 1.5 );
# Shell redirection appended to the end of the command line; the default
# discards stderr.
has '_logging' => ( is => 'ro', isa => 'Str', default => '2> /dev/null' );

# Build the shell command line as a single space-separated string:
# mcxdeblast reads the blast results and its output is piped into mcl,
# which writes to output_file. The _logging redirection is appended last.
sub _command_to_run {
    my ($self) = @_;

    # Left-hand side of the pipe.
    my @deblast_stage = (
        $self->mcxdeblast_exec,
        '-m9',
        '--line-mode=abc',
        $self->blast_results,
    );

    # Right-hand side of the pipe; '-' makes mcl read from stdin.
    my @mcl_stage = (
        $self->mcl_exec,
        '-',
        '--abc',
        '-I', $self->_inflation_value,
        '-o', $self->output_file,
        $self->_logging,
    );

    return join( " ", @deblast_stage, '|', @mcl_stage );
}

# Execute the mcxdeblast | mcl pipeline through the shell.
# Always returns 1, as before, so existing callers are unaffected. A failure
# to launch or a non-zero exit is now reported with a warning: the default
# _logging setting discards stderr, so otherwise a failed run would be
# completely silent.
sub run {
    my ($self) = @_;

    my $command = $self->_command_to_run;
    my $status  = system($command);

    if ( $status == -1 ) {
        # The command could not be started at all.
        warn "Failed to execute MCL pipeline ($command): $!\n";
    }
    elsif ( $status != 0 ) {
        # High byte is the exit code, low 7 bits the terminating signal.
        warn sprintf(
            "MCL pipeline exited with status %d (signal %d): %s\n",
            $status >> 8, $status & 127, $command
        );
    }

    1;
}

no Moose;
__PACKAGE__->meta->make_immutable;
1;
69 changes: 0 additions & 69 deletions lib/Bio/PanGenome/GGFile.pm

This file was deleted.

Loading