#!/bin/env perl =pod =head1 NAME clustalparser - SOME KIND OF DESCRIPTION =head1 SYNOPSIS # parse alignments cat FILE | ./clustalparser ./clustalparser -in FILE # get help or see man page ./clustalparser [-help] [-man] =head1 DESCRIPTION You should describe what this does. =head1 OPTIONS =head2 -in FILE Specify input file. You can also pipe the file via STDIN cat FILE | ./clustalparser =head2 -help Get brief help =head2 -man Show man page. =cut use strict; use diagnostics; use warnings FATAL=>"all"; use Carp; use Config::General; use Cwd qw(getcwd abs_path); use Data::Dumper; use File::Basename; use FindBin; use Getopt::Long; use List::MoreUtils qw(uniq minmax); use Math::Round qw(round nearest); use Math::VecStat qw(sum min max average); use Pod::Usage; use Statistics::Basic qw(median stddev); use Storable; use lib "$FindBin::RealBin"; use lib "$FindBin::RealBin/../lib"; use lib "$FindBin::RealBin/lib"; our (%OPT,%CONF,$conf); our @COMMAND_LINE = ( "in=s", "configfile=s", "help", "cdump", "man", "debug"); # read and parse configuration file parse_config(); sub validateconfiguration { } # _START # This is a boilerplate Perl script for you to edit to build up a clustal parser # # Run it first as # # ./clustalpaser -h # # and then using the alignment file # # cat ../../data/alignment.aln | ./clustalparser my $data = read_file($CONF{in}); # dump the data structure we created from the # alignments to STDOUT printdumper($data); # Continue with the script here # ... ################################################################ # Convert a list of numbers into a # list of contiguous ranges defined by [min,max] # # [ [min1,max1], [min2,max2], ...] sub list2ranges { my @x = sort {$a <=> $b} @_; my $ranges = []; # I wrote two approaches to solve this problem. The first # is more like C code and the second is more Perlish. # Change the 1 to a 0 in the outer if loop to use the second approach. # Which one reads easier? if(1) { # This one is more like C for ( my $i = 0; $i < @x; $i++) { my $min = $i; for my $j ($i+1..@x-1) { if($x[$j] - $x[$j-1] > 1) { last; } $i = $j; } push @$ranges, [$min,$i]; } } else { # This one is more Perlish my $i = 0; my $max; while(@x) { my $x = shift @x; if(! defined $ranges->[-1] || $x - $max > 1) { push @$ranges, [$i,$i]; } else { $ranges->[-1][1] = $i; } $max = $x; $i++; } } return $ranges; } # Read the alignment file. For now, all it does # is read the alignment lines and store them # as a hash, keyed by the sequence id. sub read_file { my $file = shift; my $inputhandle = get_handle($file); my $data; while(<$inputhandle>) { # skip the first line, which is a header # this is one of those rare times that a special variable # other than $_ or @_ is usedful. The $. gives the # line number of the file handle that is currently being read. next if $. == 1; # skip comments and blank lines next if /^\s*\#/ ||/^\s*$/; # trim newline chomp; # you now have the fields of this line my ($id,$seq,$cumul) = split; # put these values into a hash and push it onto the list push @$data, {id=>$id, seq=>$seq, cumul=>$cumul}; } return $data; } # _END sub get_handle { my $file = shift; my $h; if($file) { die "No such file [$file]" unless -e $file; open(FILE,$file); $h = \*FILE; } else { $h = \*STDIN; } return $h; } # HOUSEKEEPING ############################################################### sub dump_config { printdumper(\%OPT,\%CONF); } sub parse_config { my $dump_debug_level = 3; GetOptions(\%OPT,@COMMAND_LINE); pod2usage() if $OPT{help}; pod2usage(-verbose=>2) if $OPT{man}; loadconfiguration($OPT{configfile}); populateconfiguration(); # copy command line options to config hash validateconfiguration(); if ($CONF{cdump}) { $Data::Dumper::Indent = 2; $Data::Dumper::Quotekeys = 0; $Data::Dumper::Terse = 0; $Data::Dumper::Sortkeys = 1; $Data::Dumper::Varname = "OPT"; printdumper(\%OPT); $Data::Dumper::Varname = "CONF"; printdumper(\%CONF); exit; } } sub populateconfiguration { for my $var (keys %OPT) { $CONF{$var} = $OPT{$var}; } repopulateconfiguration(\%CONF); } sub repopulateconfiguration { my ($node,$parent_node_name) = shift; return unless ref($node) eq "HASH"; for my $key (keys %$node) { my $value = $node->{$key}; if (ref($value) eq "HASH") { repopulateconfiguration($value,$key); } elsif (ref($value) eq "ARRAY") { for my $item (@$value) { repopulateconfiguration($item,$key); } } elsif (defined $value) { my $new_value = parse_field($value,$key,$parent_node_name,$node); $node->{$key} = $new_value; } } } sub parse_field { my ($str,$key,$parent_node_name,$node) = @_; # replace configuration field # conf(LEAF,LEAF,...) while ( $str =~ /(conf\(\s*(.+?)\s*\))/g ) { my ($template,$leaf) = ($1,$2); if (defined $template && defined $leaf) { my @leaf = split(/\s*,\s*/,$leaf); my $new_template; if (@leaf == 2 && $leaf[0] eq ".") { $new_template = $node->{$leaf[1]}; } else { $new_template = fetch_conf(@leaf); } $str =~ s/\Q$template\E/$new_template/g; } } if ($str =~ /\s*eval\s*\(\s*(.+)\s*\)/) { my $fn = $1; $str = eval $fn; if ($@) { die "could not parse configuration parameter [$@]"; } } return $str; } sub fetch_configuration { my @config_path = @_; my $node = \%CONF; if(! @config_path) { return \%CONF; } for my $path_element (@config_path) { if (! exists $node->{$path_element}) { return undef; } else { $node = $node->{$path_element}; } } return $node; } sub fetch_conf { return fetch_configuration(@_); } ################################################################ # # sub loadconfiguration { my $file = shift; if (defined $file) { if (-e $file && -r _) { # provided configuration file exists and can be read $file = abs_path($file); } else { confess "The configuration file [$file] passed with -configfile does not exist or cannot be read."; } } else { # otherwise, try to automatically find a configuration file my ($scriptname,$path,$suffix) = fileparse($0); my $cwd = getcwd(); my $bindir = $FindBin::RealBin; my $userdir = $ENV{HOME}; my @candidate_files = ( "$cwd/$scriptname.conf", "$cwd/etc/$scriptname.conf", "$cwd/../etc/$scriptname.conf", "$bindir/$scriptname.conf", "$bindir/etc/$scriptname.conf", "$bindir/../etc/$scriptname.conf", "$userdir/.$scriptname.conf", ); my @additional_files = (); for my $candidate_file (@additional_files,@candidate_files) { #printinfo("configsearch",$candidate_file); if (-e $candidate_file && -r _) { $file = $candidate_file; #printinfo("configfound",$candidate_file); last; } } } if (defined $file) { $OPT{configfile} = $file; $conf = new Config::General( -ConfigFile=>$file, -IncludeRelative=>1, -IncludeAgain=>1, -ExtendedAccess=>1, -AllowMultiOptions=>"yes", -LowerCaseNames=>1, -AutoTrue=>1 ); %CONF = $conf->getall; } } sub printdebug { printerr(@_) if defined $CONF{debug}; } sub printinfo { print join(" ",map { defined $_ ? $_ : "_undef_" } @_),"\n"; } sub printfinfo { my ($fmt,@args) = @_; @args = map { defined $_ ? $_ : "_undef_" } @args; printf("$fmt\n",@args); } sub printerr { print STDERR join(" ",map { defined $_ ? $_ : "_undef_" } @_),"\n"; } sub printdumper { print Dumper(@_); } sub printdumperq { printdumper(@_); exit; } =pod =head1 HISTORY =over =item * 17 Oct 2018 Boilerplate created. =back =head1 AUTHOR Martin Krzywinski =head1 CONTACT Martin Krzywinski Genome Sciences Center BC Cancer Research Center 100-570 W 7th Ave Vancouver BC V5Z 4S6 mkweb.bcgsc.ca martink@bcgsc.ca =cut