#!/bin/env perl =pod =head1 NAME coverage.analytical - report analytical read coverage statistics for aneuploid genomes =head1 SYNOPSIS coverage.analytical -r REDUNDANCY -numchr NUMCHROMOSOMES -k L,k [-tol FLOAT] [-help] [-man] =head1 DESCRIPTION Reports analytical coverage statistics for sequencing reads for a given redundancy and aneuploidy. =head1 OPTIONS =head2 -r REDUNDANCY This controls the n-fold redundancy of sequencing. For example, if you sequence at 10x, use -r 10. =head2 -k L,k Compute k-mer frequency given read length L and k (e.g. for L=50 and k=47, each read will contribute 4 k-mers). This is specific to the ABySS pipeline in which coverage is represented by reads but the actual distribution is one of k-mers, where each k-mer contributes to coverage at its start. =head2 -numchr NUMCHROMOSOMES This controls the number of alleles for a given position. If you are interested in statistics for a diploid genome (as you probably should be), use -numchr 2. =head1 HISTORY =over =item * 5 Jan 2010 Started. =back =head1 REFERENCE Wendl, M.C. and R.K. Wilson. 2008. Aspects of coverage in medical DNA sequencing. BMC Bioinformatics 9: 239. =head1 BUGS =head1 AUTHOR Martin Krzywinski =head1 CONTACT Martin Krzywinski Genome Sciences Centre Vancouver BC Canada www.bcgsc.ca martink@bcgsc.ca =cut ################################################################ # # Copyright 2002-2010 Martin Krzywinski # # This file is part of the Genome Sciences Centre Perl code base. # # This script is free software; you can redistribute it and/or modify # it under the terms of the GNU General Public License as published by # the Free Software Foundation; either version 2 of the License, or # (at your option) any later version. # # This script is distributed in the hope that it will be useful, # but WITHOUT ANY WARRANTY; without even the implied warranty of # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the # GNU General Public License for more details. # # You should have received a copy of the GNU General Public License # along with this script; if not, write to the Free Software # Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA # ################################################################ ################################################################ # Martin Krzywinski (martink@bcgsc.ca) # 2010 ################################################################ use strict; use Config::General; use Data::Dumper; use File::Basename; use FindBin; use Getopt::Long; use IO::File; use List::Util; use List::MoreUtils; use Math::VecStat qw(sum min max average); use Pod::Usage; use Set::IntSpan; use Statistics::Descriptive; use Storable; use Time::HiRes qw(gettimeofday tv_interval); use lib "$FindBin::RealBin"; use lib "$FindBin::RealBin/../lib"; use lib "$FindBin::RealBin/lib"; use vars qw(%OPT %CONF); ################################################################ # # *** YOUR MODULE IMPORTS HERE # ################################################################ GetOptions(\%OPT, "redundancy=f", "numchr=i", "k=s", "configfile=s","help","man","debug+"); pod2usage() if $OPT{help}; pod2usage(-verbose=>2) if $OPT{man}; loadconfiguration($OPT{configfile}); populateconfiguration(); # copy command line options to config hash validateconfiguration(); if($CONF{debug} > 1) { $Data::Dumper::Pad = "debug parameters"; $Data::Dumper::Indent = 1; $Data::Dumper::Quotekeys = 0; $Data::Dumper::Terse = 1; print Dumper(\%CONF); } my $rho = $CONF{redundancy}; my $h = $CONF{numchr}; my $tol = $CONF{tol}; my ($rlen,$kmer) = split(",",$CONF{k}); if($rlen) { $rho *= ($rlen-$kmer+1)/$rlen; } printinfo($rho); for my $phi (0 .. 5*$rho) { my $phi_or_more = coverage($rho,$phi,$h); my $phi_or_less = 1 - coverage($rho,$phi,$h); my $phi_exactly = $phi_or_more - coverage($rho,$phi+1,$h); next if $phi_or_less < $tol && $phi_exactly < $tol; last if $phi_or_less > 1-$tol && $phi_exactly < $tol; if(!$rlen) { printinfo($rho,$phi,$phi_exactly,$phi_or_less,$phi_or_more); } else { printinfo($rho,$phi * $rlen/($rlen-$kmer+1) ,$phi_exactly,$phi_or_less,$phi_or_more); } } sub coverage { my ($rho,$phi,$h) = @_; my $sum=0; for my $k (0..$phi-1) { $sum += 1/factorial($k) * ($rho/$h)**$k* exp(-$rho/$h) } return (1-$sum)**$h; } sub factorial { my $v = shift; my $res = 1; while ($v > 1) { $res *= $v; $v--; } return $res; } sub validateconfiguration { $CONF{redundancy} ||= 50; $CONF{numchr} ||= 1; $CONF{tol} ||= 0.00001; } ################################################################ # # *** DO NOT EDIT BELOW THIS LINE *** # ################################################################ sub populateconfiguration { foreach my $key (keys %OPT) { $CONF{$key} = $OPT{$key}; } repopulateconfiguration(\%CONF); } sub repopulateconfiguration { my $root = shift; for my $key (keys %$root) { my $value = $root->{$key}; if(ref($value) eq "HASH") { repopulateconfiguration($value); } elsif (ref($value) eq "ARRAY") { for my $item (@$value) { repopulateconfiguration($item); } } else { while($value =~ /__([^_].+?)__/g) { my $source = "__" . $1 . "__"; my $target = eval $1; $value =~ s/\Q$source\E/$target/g; } $root->{$key} = $value; } } } sub loadconfiguration { my $file = shift; my ($scriptname) = fileparse($0); if(-e $file && -r _) { # great the file exists } elsif (-e "/home/$ENV{LOGNAME}/.$scriptname.conf" && -r _) { $file = "/home/$ENV{LOGNAME}/.$scriptname.conf"; } elsif (-e "$FindBin::RealBin/$scriptname.conf" && -r _) { $file = "$FindBin::RealBin/$scriptname.conf"; } elsif (-e "$FindBin::RealBin/etc/$scriptname.conf" && -r _) { $file = "$FindBin::RealBin/etc/$scriptname.conf"; } elsif (-e "$FindBin::RealBin/../etc/$scriptname.conf" && -r _) { $file = "$FindBin::RealBin/../etc/$scriptname.conf"; } else { return undef; } $OPT{configfile} = $file; my $conf = new Config::General(-ConfigFile=>$file, -AllowMultiOptions=>"yes", -LowerCaseNames=>1, -AutoTrue=>1); %CONF = $conf->getall; } sub printdebug { printinfo("debug",@_) if $CONF{debug}; } sub printinfo { printf("%s\n",join(" ",@_)); } sub printdumper { printinfo(Dumper(@_)); }