#!/bin/env perl =pod =head1 NAME histogram - generate a simple ASCII histogram of a distribution of numbers =head1 SYNOPSIS cat numbers.txt | ./histogram [-help] [-man] cat numbers.txt | ./histogram -min 0 -max 100 cat numbers.txt | ./histogram -min 0 -max 100 -binsize 10 cat numbers.txt | ./histogram -min 0 -max 100 -nbins 20 # sqrtscale the ascii distribution cat numbers.txt | ./histogram .. -sqrt cat numbers.txt | ./histogram .. -sqrt 0.25 =head1 DESCRIPTION =head1 OPTIONS =cut use strict; use diagnostics; use warnings FATAL=>"all"; use Carp; use Config::General; use Cwd qw(getcwd abs_path); use Data::Dumper; use File::Basename; use FindBin; use Getopt::Long; use Math::Round qw(round nearest); use Math::VecStat qw(sum min max average); use Pod::Usage; use Statistics::Basic qw(median stddev); use lib "$FindBin::RealBin"; use lib "$FindBin::RealBin/../lib"; use lib "$FindBin::RealBin/lib"; our (%OPT,%CONF,$conf); our @COMMAND_LINE = ("col=i", "nbins=i", "binsize=i", "min=f", "max=f", "sqrt:f", "pmin=f", "pmax=f", "height=i", "file=s", "configfile=s", "help", "cdump", "man", "debug"); our $VERSION = 0.01; # common and custom module imports below use List::MoreUtils qw(uniq minmax); # read and parse configuration file parse_config(); sub validateconfiguration { $CONF{col} ||= 0; $CONF{height} ||= 25; $CONF{nbins} = 20 if ! $CONF{nbins} && ! $CONF{binsize}; } # _START # read the values from STDIN and use the -col COLUMN, by default 0 my @values; while(my $line = <>) { chomp $line; next if $line =~ /^\#/; my @tok = split(" ",$line); if(defined $CONF{col} && ! defined $tok[ $CONF{col} ]) { die "The line [$line] has no number in column [$CONF{col}]"; } push @values, $tok[ $CONF{col}]; } # get min and max of values my ($min,$max) = minmax(@values); # change min/max range if redefined via -min and/or -max $min = $CONF{min} if defined $CONF{min};# && $CONF{min} > $min; $max = $CONF{max} if defined $CONF{max};# && $CONF{max} < $max; # *very* quick and dirty percentile if -pmin or -pmax used if($CONF{pmin} || $CONF{pmax}) { my @sorted = sort {$a <=> $b} @values; $min = $sorted[ $CONF{pmin}/100 * (@sorted-1) ] if $CONF{pmin}; $max = $sorted[ $CONF{pmax}/100 * (@sorted-1) ] if $CONF{pmax}; } # determine bin size from either the -binsize command-line parameter # or from -nbins. At least one of these is defined -- we made sure of # that in validateconfiguration() my $bin_size; if($CONF{binsize}) { $bin_size = $CONF{binsize}; $CONF{nbins} = round(($max-$min)/$CONF{binsize}); } else { $bin_size = ($max-$min)/$CONF{nbins}; } # populate bins my $bins; for my $i (0..$CONF{nbins}-1) { my $start = $min + $i * $bin_size; my $end = $min + ($i+1) * $bin_size; my $n = grep($_ >= $start && ($_ < $end || ( $i == $CONF{nbins}-1 && $_ <= $end)), @values); push @$bins, { start=>$start, i=>$i, end=>$end, size=>$bin_size, n=>$n }; } my $histogram_width = $CONF{height}; my $maxn = max(map { $_->{n} } @$bins); my $sumn = sum(map { $_->{n} } @$bins); my $cumuln = 0; my $bin_count_width = max(map { length($_->{n}) } @$bins); my $bin_count_fmt = sprintf("%%%dd",2+$bin_count_width); my $bin_width_f_max = max(map { $_->{n}/$maxn } @$bins); # report number and fraction of values smaller than requested minimum printinfo(sprintf("%10s %10.4f>$bin_count_fmt %6.3f","", $min, int(grep($_ < $min, @values)), int(grep($_ < $min, @values))/@values)); # report each bin: min, max, count, fractional count, cumulative count for my $bin (@$bins) { $cumuln += $bin->{n}; my $binwidth = $bin->{n}*$histogram_width/$maxn; if(defined $CONF{sqrt}) { my $power = $CONF{sqrt} || 0.5; my $f = $bin->{n}/$maxn; $binwidth = $f**$power/$bin_width_f_max**$power * $histogram_width; } printinfo(sprintf("%10.4f %10.4f $bin_count_fmt %6.3f %6.3f %-${histogram_width}s", $bin->{start}, $bin->{end}, $bin->{n}, $bin->{n}/$sumn, $cumuln/$sumn, "*"x$binwidth)); } # report number and fraction of values larger than requested maximum printinfo(sprintf("%10s %10.4f<%6d %6.3f","", $max, int(grep($_ > $max, @values)), int(grep($_ > $max, @values))/@values)); # aggregate stats for full data set printinfo(sprintf("%8s %12d","n",int(@values))); printinfo(sprintf("%8s %12.5f","average",average(@values))); printinfo(sprintf("%8s %12.5f","sd",stddev(@values))); printinfo(sprintf("%8s %12.5f","min",min(@values))); printinfo(sprintf("%8s %12.5f","max",max(@values))); printinfo(sprintf("%8s %12.5f","sum",sum(@values))); # _END exit; ################################################################ sub read_file { my $file = shift; my $inputhandle = get_handle($file); my $data; while(<$inputhandle>) { next if /^\s*\#/; next if /^\s*$/g; chomp; my @tok = split; push @$data, \@tok; } return $data; } sub get_handle { my $file = shift; my $h; if($file) { die "No such file [$file]" unless -e $file; open(FILE,$file); $h = \*FILE; } else { $h = \*STDIN; } return $h; } # HOUSEKEEPING ############################################################### sub dump_config { printdumper(\%OPT,\%CONF); } sub parse_config { my $dump_debug_level = 3; GetOptions(\%OPT,@COMMAND_LINE); pod2usage() if $OPT{help}; pod2usage(-verbose=>2) if $OPT{man}; loadconfiguration($OPT{configfile}); populateconfiguration(); # copy command line options to config hash validateconfiguration(); if ($CONF{cdump}) { $Data::Dumper::Indent = 2; $Data::Dumper::Quotekeys = 0; $Data::Dumper::Terse = 0; $Data::Dumper::Sortkeys = 1; $Data::Dumper::Varname = "OPT"; printdumper(\%OPT); $Data::Dumper::Varname = "CONF"; printdumper(\%CONF); exit; } } sub populateconfiguration { for my $var (keys %OPT) { $CONF{$var} = $OPT{$var}; } repopulateconfiguration(\%CONF); } sub repopulateconfiguration { my ($node,$parent_node_name) = shift; return unless ref($node) eq "HASH"; for my $key (keys %$node) { my $value = $node->{$key}; if (ref($value) eq "HASH") { repopulateconfiguration($value,$key); } elsif (ref($value) eq "ARRAY") { for my $item (@$value) { repopulateconfiguration($item,$key); } } elsif (defined $value) { my $new_value = parse_field($value,$key,$parent_node_name,$node); $node->{$key} = $new_value; } } } sub parse_field { my ($str,$key,$parent_node_name,$node) = @_; # replace configuration field # conf(LEAF,LEAF,...) while ( $str =~ /(conf\(\s*(.+?)\s*\))/g ) { my ($template,$leaf) = ($1,$2); if (defined $template && defined $leaf) { my @leaf = split(/\s*,\s*/,$leaf); my $new_template; if (@leaf == 2 && $leaf[0] eq ".") { $new_template = $node->{$leaf[1]}; } else { $new_template = fetch_conf(@leaf); } $str =~ s/\Q$template\E/$new_template/g; } } if ($str =~ /\s*eval\s*\(\s*(.+)\s*\)/) { my $fn = $1; $str = eval $fn; if ($@) { die "could not parse configuration parameter [$@]"; } } return $str; } sub fetch_configuration { my @config_path = @_; my $node = \%CONF; if(! @config_path) { return \%CONF; } for my $path_element (@config_path) { if (! exists $node->{$path_element}) { return undef; } else { $node = $node->{$path_element}; } } return $node; } sub fetch_conf { return fetch_configuration(@_); } ################################################################ # # sub loadconfiguration { my $file = shift; if (defined $file) { if (-e $file && -r _) { # provided configuration file exists and can be read $file = abs_path($file); } else { confess "The configuration file [$file] passed with -configfile does not exist or cannot be read."; } } else { # otherwise, try to automatically find a configuration file my ($scriptname,$path,$suffix) = fileparse($0); my $cwd = getcwd(); my $bindir = $FindBin::RealBin; my $userdir = $ENV{HOME}; my @candidate_files = ( "$cwd/$scriptname.conf", "$cwd/etc/$scriptname.conf", "$cwd/../etc/$scriptname.conf", "$bindir/$scriptname.conf", "$bindir/etc/$scriptname.conf", "$bindir/../etc/$scriptname.conf", "$userdir/.$scriptname.conf", ); my @additional_files = (); for my $candidate_file (@additional_files,@candidate_files) { #printinfo("configsearch",$candidate_file); if (-e $candidate_file && -r _) { $file = $candidate_file; #printinfo("configfound",$candidate_file); last; } } } if (defined $file) { $OPT{configfile} = $file; $conf = new Config::General( -ConfigFile=>$file, -IncludeRelative=>1, -IncludeAgain=>1, -ExtendedAccess=>1, -AllowMultiOptions=>"yes", -LowerCaseNames=>1, -AutoTrue=>1 ); %CONF = $conf->getall; } } sub printdebug { printerr(@_) if defined $CONF{debug}; } sub printinfo { print join(" ",map { defined $_ ? $_ : "_undef_" } @_),"\n"; } sub printfinfo { my ($fmt,@args) = @_; @args = map { defined $_ ? $_ : "_undef_" } @args; printf("$fmt\n",@args); } sub printerr { print STDERR join(" ",map { defined $_ ? $_ : "_undef_" } @_),"\n"; } sub printdumper { print Dumper(@_); } sub printdumperq { printdumper(@_); exit; } =pod =head1 HISTORY =over =item * 27 Jun 2018 Added some modules and printdumperq. =item * 24 Jun 2014 Fixed config dump. =item * 12 May 2014 Added printfinfo(). =back =head1 AUTHOR Martin Krzywinski =head1 CONTACT Martin Krzywinski Genome Sciences Center BC Cancer Research Center 100-570 W 7th Ave Vancouver BC V5Z 4S6 mkweb.bcgsc.ca martink@bcgsc.ca =cut