#!/home/martink/bin/perl =pod =head1 NAME coverage - simulate read coverage of a haploid genome =head1 SYNOPSIS coverage -g GENOMESIZE -l READLENGTH -r REDUNDANCY -occupancy -freq [-fragment FRAGMENTLENGTH] [-nozero] [-help] [-man] [-brief] =head1 DESCRIPTION Runs a Monte Carlo simulation to calculate distribution of coverage of randomly and uniformly sampled reads across a genome. =head1 OPTIONS =head2 -g GENOMESIZE Specifies the genome size in the simulation. You will not want to use a full mammalian genome size (3e9) here. Depending on your precision requirements, a genome size 1,000 to 10,000 times the size of a read may be sufficient. =head2 -l READLENGTH The length of the read that will be covering the genome. This can be any positive, non-zero number. =head2 -r REDUNDANCY Controls the number of reads that will be sampled. Reads are sampled randomly and uniformly. The start or end of a read may extend past the genome, as to ensure a non-zero intersection between the read and the genome. =head2 -fragment FRAGMENTLENGTH If you want to simulate coverage by paired-end reads, use -f to specify the fragment length. If 2*READLENGTH > FRAGMENTLENGTH, know that your reads will overlap. --------------------- FRAGMENTLENGTH XXXXXXXXXXXX READLENGTH XXXXXXXXXXXX READLENGTH ^^^ overlap =head2 -occupancy Report occupancy statistics. For each genome position, the coverage (i.e. number of overlapping reads) is reported. Occupancy statistics are prefixed with "occupancy". > coverage -occupancy -g 1000 -l 1 -r 50 ... occupancy 3 46 occupancy 4 48 occupancy 5 46 occupancy 6 50 occupancy 7 55 ... In this example, position indexed by 3 is covered by 46 reads. =head2 -freq Report frequency statistics. The a distribution of the occupancy of each position is shown. Frequency statistics are prefixed with "frequency". > coverage -frequency -g 1000 -l 1 -r 50 ... frequency 46 43 0.0430 frequency 47 65 0.0650 frequency 48 48 0.0480 frequency 49 58 0.0580 ... In this example, the number of positions covered by 46 reads is 43, which is 4.3% of the total genome. =head2 -nonzero Any reported lines with zero occupancy or frequency are not displayed. =head2 -brief Does not include "occupancy" or "frequency" stirng in the output. =head1 HISTORY =over =item * 6 Jan 2010 Started. =back =head1 BUGS =head1 AUTHOR Martin Krzywinski =head1 CONTACT Martin Krzywinski Genome Sciences Centre Vancouver BC Canada www.bcgsc.ca martink@bcgsc.ca =cut ################################################################ # # Copyright 2002-2010 Martin Krzywinski # # This file is part of the Genome Sciences Centre Perl code base. # # This script is free software; you can redistribute it and/or modify # it under the terms of the GNU General Public License as published by # the Free Software Foundation; either version 2 of the License, or # (at your option) any later version. # # This script is distributed in the hope that it will be useful, # but WITHOUT ANY WARRANTY; without even the implied warranty of # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the # GNU General Public License for more details. # # You should have received a copy of the GNU General Public License # along with this script; if not, write to the Free Software # Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA # ################################################################ ################################################################ # Martin Krzywinski (martink@bcgsc.ca) # 2010 ################################################################ use strict; use Config::General; use Data::Dumper; use File::Basename; use FindBin; use Getopt::Long; use IO::File; use List::Util; use List::MoreUtils; use Math::VecStat qw(sum min max average); use Pod::Usage; use Set::IntSpan; use Statistics::Descriptive; use Storable; use Time::HiRes qw(gettimeofday tv_interval); use lib "$FindBin::RealBin"; use lib "$FindBin::RealBin/../lib"; use lib "$FindBin::RealBin/lib"; use vars qw(%OPT %CONF); ################################################################ # # *** YOUR MODULE IMPORTS HERE # ################################################################ GetOptions(\%OPT, "genome=i", "length=i", "redundancy=i", "fragment=i", "nozero", "occupancy", "frequency", "brief", "configfile=s","help","man","debug+"); pod2usage() if $OPT{help}; pod2usage(-verbose=>2) if $OPT{man}; loadconfiguration($OPT{configfile}); populateconfiguration(); # copy command line options to config hash validateconfiguration(); if($CONF{debug} > 1) { $Data::Dumper::Pad = "debug parameters"; $Data::Dumper::Indent = 1; $Data::Dumper::Quotekeys = 0; $Data::Dumper::Terse = 1; print Dumper(\%CONF); } my $nreads = int ( $CONF{genome} * $CONF{redundancy} / $CONF{length} ); $nreads /= 2 if $CONF{fragment}; my @genome = map { 0 } ( 0.. $CONF{genome}-1 ); printdebug("genome size",$CONF{genome},"read length",$CONF{length},"num reads",$nreads); for (1..$nreads) { # start of the read can be anywhere within [-length+1,genome] # this samples read position in such a way as to always create a non-zero # intersection between the read and the genome span my $start = int(rand($CONF{genome}+$CONF{length}-1))-$CONF{length}+1; my $end = $start + $CONF{length} - 1; printdebug("+read1",$start,$end); addcoverage(\@genome,$start,$end); if($CONF{fragment}) { # if this is a paired-end library, then add coverage of another read # and the end of the fragment # # ----------------------------------- fragment # ^start ^end # rrrrrrrrrrr read 1 # rrrrrrrrrrr read 2 # ^start ^end my $end2 = $start + $CONF{fragment} - 1; my $start2 = $end2 - $CONF{length} + 1; addcoverage(\@genome,$start2,$end2); printdebug("+read2",$start2,$end2,$end2-$start+1); } } # if occupancy information is desired, then every genome # position is reported, along with read coverage at that position if($CONF{occupancy}) { for my $i (0..@genome-1) { next if $CONF{nozero} && ! $genome[$i]; my @report = ($i,$genome[$i]); unshift @report,"occupancy" unless $CONF{brief}; printinfo(@report); } } # if coverage frequency distribution is desired (essentially # a histogram of occupanyc), then each coverage level is reported along # with the number of times it is seen if($CONF{frequency}) { my $hist; map { $hist->{$_}++ } @genome; my @freq = sort {$a <=> $b} keys %$hist; for my $f (0..max(@freq)) { next if $CONF{nozero} && ! $hist->{$f}; my @report = sprintf("%d %d %.4f",$f,$hist->{$f},$hist->{$f}/$CONF{genome}); unshift @report,"frequency" unless $CONF{brief}; printinfo(@report); } } sub addcoverage { my ($list,$start,$end) = @_; if($start == $end) { $list->[$start]++ if $start >= 0; } else { map { $list->[$_]++ } (max(0,$start)..min(@$list-1,$end)); } } sub validateconfiguration { $CONF{genome} ||= 1000; $CONF{length} ||= 1; $CONF{redundancy} ||= 50; $CONF{frequency} = 1 if ! $CONF{frequency} && ! $CONF{occupancy}; } ################################################################ # # *** DO NOT EDIT BELOW THIS LINE *** # ################################################################ sub populateconfiguration { foreach my $key (keys %OPT) { $CONF{$key} = $OPT{$key}; } repopulateconfiguration(\%CONF); } sub repopulateconfiguration { my $root = shift; for my $key (keys %$root) { my $value = $root->{$key}; if(ref($value) eq "HASH") { repopulateconfiguration($value); } elsif (ref($value) eq "ARRAY") { for my $item (@$value) { repopulateconfiguration($item); } } else { while($value =~ /__([^_].+?)__/g) { my $source = "__" . $1 . "__"; my $target = eval $1; $value =~ s/\Q$source\E/$target/g; } $root->{$key} = $value; } } } sub loadconfiguration { my $file = shift; my ($scriptname) = fileparse($0); if(-e $file && -r _) { # great the file exists } elsif (-e "/home/$ENV{LOGNAME}/.$scriptname.conf" && -r _) { $file = "/home/$ENV{LOGNAME}/.$scriptname.conf"; } elsif (-e "$FindBin::RealBin/$scriptname.conf" && -r _) { $file = "$FindBin::RealBin/$scriptname.conf"; } elsif (-e "$FindBin::RealBin/etc/$scriptname.conf" && -r _) { $file = "$FindBin::RealBin/etc/$scriptname.conf"; } elsif (-e "$FindBin::RealBin/../etc/$scriptname.conf" && -r _) { $file = "$FindBin::RealBin/../etc/$scriptname.conf"; } else { return undef; } $OPT{configfile} = $file; my $conf = new Config::General(-ConfigFile=>$file, -AllowMultiOptions=>"yes", -LowerCaseNames=>1, -AutoTrue=>1); %CONF = $conf->getall; } sub printdebug { printinfo("debug",@_) if $CONF{debug}; } sub printinfo { printf("%s\n",join(" ",@_)); }