#!/usr/bin/env perl

=pod

=head1 NAME

parsemaf - create Circos link file from .maf

=head1 SYNOPSIS

  # get a list of strains
  cat ../../lecture.2/1/ebola.multiz.txt | ./parsemaf

  # dump the internal alignment data structure to 
  # understand how the alignments are stored during parsing
  cat ../../lecture.2//1/ebola.multiz.txt | ./parsemaf -dump

  # report alignments between a reference strain and all other strains
  cat ../../lecture.2/1/ebola.multiz.txt | ./parsemaf -g1 Zaire_1995

  # If the strain defined by -g1 is not defined, then a list of strains
  # will be shown that match the string as a regular expression
  cat ../../lecture.2/1/ebola.multiz.txt | ./parsemaf -g1 Gab

  # report alignments between two strains
  cat ../../lecture.2/1/ebola.multiz.txt | ./parsemaf -g1 Zaire_1995 -g2 Reston_1996

=head1 DESCRIPTION

Generates a Circos link file from .maf alignments

=head1 OUTPUT

Individual alignments between two strains are reported on each line in Circos link format

  CHR1 START1 END1 CHR2 START2 END2 {options}

=cut

# These two lines are critical and you should always include them in
# all your scripts. The first requires that we declare variables. The
# second generates all warnings.
use strict;
use warnings FATAL=>"all";

# We'll use the uniq() function to get a list of unique elements
# in a list from this module. Perl does not have such a function built-in.
use List::MoreUtils qw(uniq);

# Some modules that the script needs for its housekeeping, such as
# command line parameter parsing, displaying the man page and so on.
# Don't worry about this for now.
use Carp;
use Cwd qw(getcwd abs_path);
use Config::General;
use Data::Dumper;
use File::Basename;
use FindBin;
use Getopt::Long;
use Pod::Usage;

our (%OPT,%CONF,$conf);

# Here we define the list of command line parameters that we can 
# define.
our @COMMAND_LINE = ("g1=s",
                     "g2=s",
                     "karyotype",
                     "dump",
										 "configfile=s",
										 "help",
										 "cdump",
										 "man",
										 "debug");

# Read and parse configuration file. Don't worry about this
parse_config();

# _START
################################################################
# Make sure you understand everything below all the way to the
# housekeeping section.

# Read about the MAF format at http://www.bx.psu.edu/~dcking/man/maf.xhtml

# This will be our alignment list reference. It's declared as
# a scalar because it will be a reference to a list. While it
# doesn't have to be a reference, get used to using references.
#
# Alternatively, we could have done this
#
# my @alignments;
# ...
# push @alignments, ...
# ...
# $alignments[-1]{strain}{$strain} = ...
#
# By using a list variable saves you from having to dereference
# it with @$alignments or using -> to pull out an element.
#
# Last element using @alignments list:              $alignments[-1]
# Last element using $alignments as list reference: $alignments->[-1]
#
# We can always make a list reference by 
#
# my $alignments = \@alignments;

my $alignments;

# Read from STDIN until end of stream. This way the script expects
# a pipe from another process, like cat
#
# cat .../1/ebola.multiz.txt | ./parsemaf
#
while(my $line = <>) {
  # remove the trailing new line
  chomp $line;
  # if this file matches this regular expression, then it is
  # the start of an alignment block
  if($line =~ /^a score=(.+)/) {
    # push a new alignment hash onto the list
    # for now, only the score is defined
    push @$alignments, {score=>$1};
  } elsif($line =~ /^s /) {
    # this is an individual alignment line and belongs
    # to the current alignment block, which is the last
    # element in the @$alignment list. 

    # chop up the line into fields using any whitespace as delimiter
    my @tok = split(" ",$line);
    # assign the first four elements of the tokens to 
    # individual variables. By putting 'undef' as the
    # first element we discard the first element in
    # the assignment. Another way to do this would be
    #
    # my ($strain,$start,$size,$strand,$genome_size) = @tok[1..5];
    my (undef,$strain,$start,$size,$strand,$genome_size) = @tok;
    # if the strain is in the format STR.STR (repeated string)
    # joined by a period, throw out everything after
    # the period
    if( $strain =~ /(.+)[.]\1/) {
      $strain =~ s/[.].*//;
    }
    # assign this alignment line to the last (current) alignment
    # block. We could have made a list out of these but it's
    # even more convenient to key them by the strain 
    $alignments->[-1]{strain}{$strain} = 
      { start=>$start,size=>$size,strain=>$strain,genome_size=>$genome_size };
  }
}
printdebug("parsed",int(@$alignments),"alignments");

# The parsing above could have been done by changing the
# input record separator in Perl, which is accessible using
# the special variable $/
#
# The default value of this variable depends on the operating
# system. For more details see https://perldoc.perl.org/perlvar.html
#
# I urge you against using these special variables (with the
# exception of $_ and @_) and get in the habit of doing things yourself. 
# Your code will be much more legible. Some of these variables
# actually slow down your code.

# Just dump the data structure to STDOUT and quit. This
# helps understand what's happening under the hood.
#
# The alignments are a list. Each list item is a hash of this form
#
#  { score  => SCORE,
#    strain => { STRAIN1 => { start=>START1,size=>SIZE1,strain=>STRAIN1 },
#                STRAIN2 => { start=>START2,size=>SIZE2,strain=>STRAIN2 },
#                STRAIN3 => { start=>START3,size=>SIZE3,strain=>STRAIN3 },
#                ...
#              }
#  }

printdumperq($alignments) if $CONF{dump};

# Make a unique list of strains. This is a compound line made up of
# several nested functions. Each is a simple step and this is
# a powerful Perl idiom when the steps are combined together.
#
# the list of alignments
# @$alignments  
#
# a list of strains (which are keys in the alignment hash) in a given alignment
# keys %{$a->{strain}}
#
# a list of strains from all alignments
# here we're basically mapping the above step to each alignment
# map { keys %{$->{strain}} } @$alignments
#
# a list of unique strains
# uniq ( map { keys %{$->{strain}} } @$alignments )
my @strains = uniq ( map { keys %{$_->{strain}} } @$alignments );

# If -g1 is not set, we just list the strains and quit. 
# I've defined printinfo() below as a wrapper for space-delimited
# print that prints _undef_ for arguments that are not defined.
if(! $CONF{g1}) {
  printinfo("*** These strains are available:");
  for my $strain (sort @strains) {
    printinfo($strain);
  }
  exit;
}

# If we defined -g1 and/or -g2 make sure that we've seen these in the file
my $quit;
for my $param (qw(g1 g2)) {
  # die if we have defined the genome we want but it's not in the list of strains.
  if($CONF{$param} && ! grep($_ eq $CONF{$param}, @strains)) {
    printinfo("ERROR: There are no alignments for -$param [$CONF{$param}]");
    # Even if there are strains that match $CONF{g1} or $CONF{g2} we can 
    # produce a list of strains that match it as a regular expression to help the user 
    # find a strain.
    my @matching_strains = grep($_ =~ /$CONF{$param}/, @strains);
    if(@matching_strains) {
      printinfo("Strains that match your input $CONF{$param} as a regular expression are");
      for (@matching_strains) {
        printinfo("  $_");
      } 
    } else {
      printinfo("No strains match your input $CONF{$param} as a regular expression.");
    }
    $quit = 1;
  }
}
die if $quit;

# Now we print out the alignments in Circos' link format
my $seen_strain;
for my $alignment (@$alignments) {
  # alignment to our reference
  my $a1 = $alignment->{strain}{$CONF{g1}};
  # the reference may be absent from this alignment, in which case go to next alignment
  next unless $a1;
  # for this alignment block, make a list of alignments to match up with
  # the reference. if we have defined -g2 STRAIN2 then it's the alignment for 
  # STRAIN2 (which may be absent). Othewise it's a list of all strains in this alignment.
  my @a2 = $CONF{g2} ? $alignment->{strain}{$CONF{g2}} : values %{$alignment->{strain}};

  # Loop over all the alignments in this block
  for my $a2 (@a2) {
    # Move to the next alignment line if this one is not defined. This shouldn't 
    # happen but it's safe to check.
    next if ! $a2;

    # Report the karyotype if we used -karyotype. These will be printed
    # to STDERR. The reason for this is that we can write to two 
    # files at the same time by redirecting STDOUT and STDERR to 
    # different files
    #
    # ./parsemaf > stdout.txt 2> stderr.txt
    #
    # Or we can combine the two
    #
    # ./parsemaf > everything 2>&1

    if($CONF{karyotype} && ! $seen_strain->{ $a1->{strain} }++) {
      printerr(sprintf("chr - %s %s 0 %d black",
                       $a1->{strain},$a1->{strain},$a1->{genome_size}));
    }
    if($CONF{karyotype} && ! $seen_strain->{ $a2->{strain} }++) {
      printerr(sprintf("chr - %s %s 0 %d black",
                       $a2->{strain},$a2->{strain},$a2->{genome_size}));
    }

    # We're reporting alignment pairs, so skip if we're asking for self-alignment
    next if $a1->{strain} eq $a2->{strain};
    printinfo(
              $a1->{strain},
              $a1->{start},
              $a1->{start} + $a1->{size} - 1,
              $a2->{strain},
              $a2->{start},
              $a2->{start} + $a2->{size} - 1,
              sprintf("offset=%d",$a2->{start}-$a1->{start}),
             );
  }
}

# _END

exit;

# HOUSEKEEPING ###############################################################

sub validateconfiguration {

}

sub dump_config {
	printdumper(\%OPT,\%CONF);
}

sub parse_config {
  my $dump_debug_level = 3;
  GetOptions(\%OPT,@COMMAND_LINE);
  pod2usage() if $OPT{help};
  pod2usage(-verbose=>2) if $OPT{man};
  loadconfiguration($OPT{configfile});
  populateconfiguration();	# copy command line options to config hash
  validateconfiguration(); 
  if ($CONF{cdump}) {
    $Data::Dumper::Indent    = 2;
    $Data::Dumper::Quotekeys = 0;
    $Data::Dumper::Terse     = 0;
    $Data::Dumper::Sortkeys  = 1;
    $Data::Dumper::Varname = "OPT";
    printdumper(\%OPT);
    $Data::Dumper::Varname = "CONF";
    printdumper(\%CONF);
    exit;
  }
}

sub populateconfiguration {
  for my $var (keys %OPT) {
    $CONF{$var} = $OPT{$var};
  }
  repopulateconfiguration(\%CONF);
}

sub repopulateconfiguration {
  my ($node,$parent_node_name) = shift;
  return unless ref($node) eq "HASH";
  for my $key (keys %$node) {
		my $value = $node->{$key};
		if (ref($value) eq "HASH") {
			repopulateconfiguration($value,$key);
		} elsif (ref($value) eq "ARRAY") {
			for my $item (@$value) {
				repopulateconfiguration($item,$key);
			}
		} elsif (defined $value) {
			my $new_value = parse_field($value,$key,$parent_node_name,$node);
			$node->{$key} = $new_value;
		}
	}
}

sub parse_field {
	my ($str,$key,$parent_node_name,$node) = @_;
	# replace configuration field
	# conf(LEAF,LEAF,...)
	while ( $str =~ /(conf\(\s*(.+?)\s*\))/g ) {
		my ($template,$leaf) = ($1,$2);
		if (defined $template && defined $leaf) {
			my @leaf         = split(/\s*,\s*/,$leaf);
			my $new_template;
			if (@leaf == 2 && $leaf[0] eq ".") {
				$new_template = $node->{$leaf[1]};
			} else {
				$new_template = fetch_conf(@leaf);
			}
			$str =~ s/\Q$template\E/$new_template/g;
		}
	}
	if ($str =~ /\s*eval\s*\(\s*(.+)\s*\)/) {
		my $fn = $1;
		$str = eval $fn;
		if ($@) {
			die "could not parse configuration parameter [$@]";
		}
	}
	return $str;
}

sub fetch_configuration {
	my @config_path = @_;
	my $node        = \%CONF;
	if(! @config_path) {
		return \%CONF;
	}
	for my $path_element (@config_path) {
		if (! exists $node->{$path_element}) {
	    return undef;
		} else {
	    $node = $node->{$path_element};
		}
	}
	return $node;
}

sub fetch_conf {
	return fetch_configuration(@_);
}

################################################################
#
#

sub loadconfiguration {
  my $file = shift;
  if (defined $file) {
    if (-e $file && -r _) {
      # provided configuration file exists and can be read
      $file = abs_path($file);
    } else {
      confess "The configuration file [$file] passed with -configfile does not exist or cannot be read.";
    }
  } else {
    # otherwise, try to automatically find a configuration file
    my ($scriptname,$path,$suffix) = fileparse($0);
    my $cwd     = getcwd();
    my $bindir  = $FindBin::RealBin;
    my $userdir = $ENV{HOME};
    my @candidate_files = (
													 "$cwd/$scriptname.conf",
													 "$cwd/etc/$scriptname.conf",
													 "$cwd/../etc/$scriptname.conf",
													 "$bindir/$scriptname.conf",
													 "$bindir/etc/$scriptname.conf",
													 "$bindir/../etc/$scriptname.conf",
													 "$userdir/.$scriptname.conf",
													);
    my @additional_files = ();
    for my $candidate_file (@additional_files,@candidate_files) {
			#printinfo("configsearch",$candidate_file);
			if (-e $candidate_file && -r _) {
				$file = $candidate_file;
				#printinfo("configfound",$candidate_file);
				last;
			}
    }
  }
  if (defined $file) {
    $OPT{configfile} = $file;
    $conf = new Config::General(
																-ConfigFile=>$file,
																-IncludeRelative=>1,
																-IncludeAgain=>1,
																-ExtendedAccess=>1,
																-AllowMultiOptions=>"yes",
																-LowerCaseNames=>1,
																-AutoTrue=>1
															 );
    %CONF = $conf->getall;
  }
}

sub printdebug {
	printerr(@_) if defined $CONF{debug};
}

sub printinfo {
	print join(" ",map { defined $_ ? $_ : "_undef_" } @_),"\n";
}

sub printfinfo {
	my ($fmt,@args) = @_;
	@args = map { defined $_ ? $_ : "_undef_" } @args;
	printf("$fmt\n",@args);
}

sub printerr {
	print STDERR join(" ",map { defined $_ ? $_ : "_undef_" } @_),"\n";
}

sub printdumper {
	print Dumper(@_);
}

sub printdumperq {
  printdumper(@_);
  exit;
}

=pod 

=head1 HISTORY

=over

=item * 27 Jun 2018

Added some modules and printdumperq.

=item * 24 Jun 2014

Fixed config dump.

=item * 12 May 2014

Added printfinfo().

=back 

=head1 AUTHOR

Martin Krzywinski

=head1 CONTACT

Martin Krzywinski
Genome Sciences Center
BC Cancer Research Center
100-570 W 7th Ave
Vancouver BC V5Z 4S6

mkweb.bcgsc.ca
martink@bcgsc.ca

=cut