#!/usr/bin/env perl =pod =head1 NAME parsemaf - create Circos link file from .maf =head1 SYNOPSIS # get a list of strains cat ../../lecture.2/1/ebola.multiz.txt | ./parsemaf # dump the internal alignment data structure to # understand how the alignments are stored during parsing cat ../../lecture.2//1/ebola.multiz.txt | ./parsemaf -dump # report alignments between a reference strain and all other strains cat ../../lecture.2/1/ebola.multiz.txt | ./parsemaf -g1 Zaire_1995 # If the strain defined by -g1 is not defined, then a list of strains # will be shown that match the string as a regular expression cat ../../lecture.2/1/ebola.multiz.txt | ./parsemaf -g1 Gab # report alignments between two strains cat ../../lecture.2/1/ebola.multiz.txt | ./parsemaf -g1 Zaire_1995 -g2 Reston_1996 =head1 DESCRIPTION Generates a Circos link file from .maf alignments =head1 OUTPUT Individual alignments between two strains are reported on each line in Circos link format CHR1 START1 END1 CHR2 START2 END2 {options} =cut # These two lines are critical and you should always include them in # all your scripts. The first requires that we declare variables. The # second generates all warnings. use strict; use warnings FATAL=>"all"; # We'll use the uniq() function to get a list of unique elements # in a list from this module. Perl does not have such a function built-in. use List::MoreUtils qw(uniq); # Some modules that the script needs for its housekeeping, such as # command line parameter parsing, displaying the man page and so on. # Don't worry about this for now. use Carp; use Cwd qw(getcwd abs_path); use Config::General; use Data::Dumper; use File::Basename; use FindBin; use Getopt::Long; use Pod::Usage; our (%OPT,%CONF,$conf); # Here we define the list of command line parameters that we can # define. our @COMMAND_LINE = ("g1=s", "g2=s", "karyotype", "dump", "configfile=s", "help", "cdump", "man", "debug"); # Read and parse configuration file. Don't worry about this parse_config(); # _START ################################################################ # Make sure you understand everything below all the way to the # housekeeping section. # Read about the MAF format at http://www.bx.psu.edu/~dcking/man/maf.xhtml # This will be our alignment list reference. It's declared as # a scalar because it will be a reference to a list. While it # doesn't have to be a reference, get used to using references. # # Alternatively, we could have done this # # my @alignments; # ... # push @alignments, ... # ... # $alignments[-1]{strain}{$strain} = ... # # By using a list variable saves you from having to dereference # it with @$alignments or using -> to pull out an element. # # Last element using @alignments list: $alignments[-1] # Last element using $alignments as list reference: $alignments->[-1] # # We can always make a list reference by # # my $alignments = \@alignments; my $alignments; # Read from STDIN until end of stream. This way the script expects # a pipe from another process, like cat # # cat .../1/ebola.multiz.txt | ./parsemaf # while(my $line = <>) { # remove the trailing new line chomp $line; # if this file matches this regular expression, then it is # the start of an alignment block if($line =~ /^a score=(.+)/) { # push a new alignment hash onto the list # for now, only the score is defined push @$alignments, {score=>$1}; } elsif($line =~ /^s /) { # this is an individual alignment line and belongs # to the current alignment block, which is the last # element in the @$alignment list. # chop up the line into fields using any whitespace as delimiter my @tok = split(" ",$line); # assign the first four elements of the tokens to # individual variables. By putting 'undef' as the # first element we discard the first element in # the assignment. Another way to do this would be # # my ($strain,$start,$size,$strand,$genome_size) = @tok[1..5]; my (undef,$strain,$start,$size,$strand,$genome_size) = @tok; # if the strain is in the format STR.STR (repeated string) # joined by a period, throw out everything after # the period if( $strain =~ /(.+)[.]\1/) { $strain =~ s/[.].*//; } # assign this alignment line to the last (current) alignment # block. We could have made a list out of these but it's # even more convenient to key them by the strain $alignments->[-1]{strain}{$strain} = { start=>$start,size=>$size,strain=>$strain,genome_size=>$genome_size }; } } printdebug("parsed",int(@$alignments),"alignments"); # The parsing above could have been done by changing the # input record separator in Perl, which is accessible using # the special variable $/ # # The default value of this variable depends on the operating # system. For more details see https://perldoc.perl.org/perlvar.html # # I urge you against using these special variables (with the # exception of $_ and @_) and get in the habit of doing things yourself. # Your code will be much more legible. Some of these variables # actually slow down your code. # Just dump the data structure to STDOUT and quit. This # helps understand what's happening under the hood. # # The alignments are a list. Each list item is a hash of this form # # { score => SCORE, # strain => { STRAIN1 => { start=>START1,size=>SIZE1,strain=>STRAIN1 }, # STRAIN2 => { start=>START2,size=>SIZE2,strain=>STRAIN2 }, # STRAIN3 => { start=>START3,size=>SIZE3,strain=>STRAIN3 }, # ... # } # } printdumperq($alignments) if $CONF{dump}; # Make a unique list of strains. This is a compound line made up of # several nested functions. Each is a simple step and this is # a powerful Perl idiom when the steps are combined together. # # the list of alignments # @$alignments # # a list of strains (which are keys in the alignment hash) in a given alignment # keys %{$a->{strain}} # # a list of strains from all alignments # here we're basically mapping the above step to each alignment # map { keys %{$->{strain}} } @$alignments # # a list of unique strains # uniq ( map { keys %{$->{strain}} } @$alignments ) my @strains = uniq ( map { keys %{$_->{strain}} } @$alignments ); # If -g1 is not set, we just list the strains and quit. # I've defined printinfo() below as a wrapper for space-delimited # print that prints _undef_ for arguments that are not defined. if(! $CONF{g1}) { printinfo("*** These strains are available:"); for my $strain (sort @strains) { printinfo($strain); } exit; } # If we defined -g1 and/or -g2 make sure that we've seen these in the file my $quit; for my $param (qw(g1 g2)) { # die if we have defined the genome we want but it's not in the list of strains. if($CONF{$param} && ! grep($_ eq $CONF{$param}, @strains)) { printinfo("ERROR: There are no alignments for -$param [$CONF{$param}]"); # Even if there are strains that match $CONF{g1} or $CONF{g2} we can # produce a list of strains that match it as a regular expression to help the user # find a strain. my @matching_strains = grep($_ =~ /$CONF{$param}/, @strains); if(@matching_strains) { printinfo("Strains that match your input $CONF{$param} as a regular expression are"); for (@matching_strains) { printinfo(" $_"); } } else { printinfo("No strains match your input $CONF{$param} as a regular expression."); } $quit = 1; } } die if $quit; # Now we print out the alignments in Circos' link format my $seen_strain; for my $alignment (@$alignments) { # alignment to our reference my $a1 = $alignment->{strain}{$CONF{g1}}; # the reference may be absent from this alignment, in which case go to next alignment next unless $a1; # for this alignment block, make a list of alignments to match up with # the reference. if we have defined -g2 STRAIN2 then it's the alignment for # STRAIN2 (which may be absent). Othewise it's a list of all strains in this alignment. my @a2 = $CONF{g2} ? $alignment->{strain}{$CONF{g2}} : values %{$alignment->{strain}}; # Loop over all the alignments in this block for my $a2 (@a2) { # Move to the next alignment line if this one is not defined. This shouldn't # happen but it's safe to check. next if ! $a2; # Report the karyotype if we used -karyotype. These will be printed # to STDERR. The reason for this is that we can write to two # files at the same time by redirecting STDOUT and STDERR to # different files # # ./parsemaf > stdout.txt 2> stderr.txt # # Or we can combine the two # # ./parsemaf > everything 2>&1 if($CONF{karyotype} && ! $seen_strain->{ $a1->{strain} }++) { printerr(sprintf("chr - %s %s 0 %d black", $a1->{strain},$a1->{strain},$a1->{genome_size})); } if($CONF{karyotype} && ! $seen_strain->{ $a2->{strain} }++) { printerr(sprintf("chr - %s %s 0 %d black", $a2->{strain},$a2->{strain},$a2->{genome_size})); } # We're reporting alignment pairs, so skip if we're asking for self-alignment next if $a1->{strain} eq $a2->{strain}; printinfo( $a1->{strain}, $a1->{start}, $a1->{start} + $a1->{size} - 1, $a2->{strain}, $a2->{start}, $a2->{start} + $a2->{size} - 1, sprintf("offset=%d",$a2->{start}-$a1->{start}), ); } } # _END exit; # HOUSEKEEPING ############################################################### sub validateconfiguration { } sub dump_config { printdumper(\%OPT,\%CONF); } sub parse_config { my $dump_debug_level = 3; GetOptions(\%OPT,@COMMAND_LINE); pod2usage() if $OPT{help}; pod2usage(-verbose=>2) if $OPT{man}; loadconfiguration($OPT{configfile}); populateconfiguration(); # copy command line options to config hash validateconfiguration(); if ($CONF{cdump}) { $Data::Dumper::Indent = 2; $Data::Dumper::Quotekeys = 0; $Data::Dumper::Terse = 0; $Data::Dumper::Sortkeys = 1; $Data::Dumper::Varname = "OPT"; printdumper(\%OPT); $Data::Dumper::Varname = "CONF"; printdumper(\%CONF); exit; } } sub populateconfiguration { for my $var (keys %OPT) { $CONF{$var} = $OPT{$var}; } repopulateconfiguration(\%CONF); } sub repopulateconfiguration { my ($node,$parent_node_name) = shift; return unless ref($node) eq "HASH"; for my $key (keys %$node) { my $value = $node->{$key}; if (ref($value) eq "HASH") { repopulateconfiguration($value,$key); } elsif (ref($value) eq "ARRAY") { for my $item (@$value) { repopulateconfiguration($item,$key); } } elsif (defined $value) { my $new_value = parse_field($value,$key,$parent_node_name,$node); $node->{$key} = $new_value; } } } sub parse_field { my ($str,$key,$parent_node_name,$node) = @_; # replace configuration field # conf(LEAF,LEAF,...) while ( $str =~ /(conf\(\s*(.+?)\s*\))/g ) { my ($template,$leaf) = ($1,$2); if (defined $template && defined $leaf) { my @leaf = split(/\s*,\s*/,$leaf); my $new_template; if (@leaf == 2 && $leaf[0] eq ".") { $new_template = $node->{$leaf[1]}; } else { $new_template = fetch_conf(@leaf); } $str =~ s/\Q$template\E/$new_template/g; } } if ($str =~ /\s*eval\s*\(\s*(.+)\s*\)/) { my $fn = $1; $str = eval $fn; if ($@) { die "could not parse configuration parameter [$@]"; } } return $str; } sub fetch_configuration { my @config_path = @_; my $node = \%CONF; if(! @config_path) { return \%CONF; } for my $path_element (@config_path) { if (! exists $node->{$path_element}) { return undef; } else { $node = $node->{$path_element}; } } return $node; } sub fetch_conf { return fetch_configuration(@_); } ################################################################ # # sub loadconfiguration { my $file = shift; if (defined $file) { if (-e $file && -r _) { # provided configuration file exists and can be read $file = abs_path($file); } else { confess "The configuration file [$file] passed with -configfile does not exist or cannot be read."; } } else { # otherwise, try to automatically find a configuration file my ($scriptname,$path,$suffix) = fileparse($0); my $cwd = getcwd(); my $bindir = $FindBin::RealBin; my $userdir = $ENV{HOME}; my @candidate_files = ( "$cwd/$scriptname.conf", "$cwd/etc/$scriptname.conf", "$cwd/../etc/$scriptname.conf", "$bindir/$scriptname.conf", "$bindir/etc/$scriptname.conf", "$bindir/../etc/$scriptname.conf", "$userdir/.$scriptname.conf", ); my @additional_files = (); for my $candidate_file (@additional_files,@candidate_files) { #printinfo("configsearch",$candidate_file); if (-e $candidate_file && -r _) { $file = $candidate_file; #printinfo("configfound",$candidate_file); last; } } } if (defined $file) { $OPT{configfile} = $file; $conf = new Config::General( -ConfigFile=>$file, -IncludeRelative=>1, -IncludeAgain=>1, -ExtendedAccess=>1, -AllowMultiOptions=>"yes", -LowerCaseNames=>1, -AutoTrue=>1 ); %CONF = $conf->getall; } } sub printdebug { printerr(@_) if defined $CONF{debug}; } sub printinfo { print join(" ",map { defined $_ ? $_ : "_undef_" } @_),"\n"; } sub printfinfo { my ($fmt,@args) = @_; @args = map { defined $_ ? $_ : "_undef_" } @args; printf("$fmt\n",@args); } sub printerr { print STDERR join(" ",map { defined $_ ? $_ : "_undef_" } @_),"\n"; } sub printdumper { print Dumper(@_); } sub printdumperq { printdumper(@_); exit; } =pod =head1 HISTORY =over =item * 27 Jun 2018 Added some modules and printdumperq. =item * 24 Jun 2014 Fixed config dump. =item * 12 May 2014 Added printfinfo(). =back =head1 AUTHOR Martin Krzywinski =head1 CONTACT Martin Krzywinski Genome Sciences Center BC Cancer Research Center 100-570 W 7th Ave Vancouver BC V5Z 4S6 mkweb.bcgsc.ca martink@bcgsc.ca =cut