#!/usr/local/bin/perl ################################################################ # # POD documentation # ################################################################ =pod =head1 NAME fetchratdata - scrape rat map data from MDC and dump to STDOUT =head1 SYNOPSIS grabdata -chromosome CHR =head1 DESCRIPTION =cut use strict; use LWP::Simple; use HTML::TreeBuilder; use Pod::Usage; use Data::Dumper; use Graph::Undirected; use GraphViz; use Getopt::Long; use vars qw(%CONF); GetOptions(\%CONF,"chromosome=s"); pod2usage(-verbose=>2) if ! $CONF{chromosome}; my $url = "http://www.mdc-berlin.de/ratgenome/data/MDC-Map-" . sprintf("%02d",$CONF{chromosome}) . ".html"; my %bac_to_yacs; # load HTML page as text my $html = get($url); # construct HTML tree my $tree = HTML::TreeBuilder->new_from_content($html); # extract the table of interest from the HTML tree my ($table) = grep($_->attr("rules") eq "none", $tree->find_by_tag_name("table")); # create a list of all rows my @rows = $table->find_by_tag_name("tr"); # for each row, extract values from the pertinent columns my $nbsp = chr(160); ROW: foreach my $row (@rows) { my @cols = $row->find_by_tag_name("td"); next unless @cols == 7; # get text from clumns 2=contig, 3=bac , 6=yac,yac,yac,yac (my $contig = $cols[2]->as_text) =~ s/^\s*$nbsp//; (my $bacname = $cols[3]->as_text) =~ s/^\s*$nbsp//; my $yacnames = $cols[6]->as_text; my @yacnames = split(/,/,$yacnames); map { $_ =~ s/^\s*$nbsp// } @yacnames; # make sure the bac name has the right format if ($bacname !~ /(RPCI3[12]\.\d{1,3}[a-z]\d{1,2})/) { print "debug malformed bacname $bacname on chr $CONF{chromosome} ctg $contig\n"; next ROW; } else { $bacname = $1; } # remember a list of yacs for each bac push ( @{$bac_to_yacs{$bacname}}, @yacnames ); print "debug chr $CONF{chromosome} ctg $contig bac $bacname linked to ",scalar(@yacnames)," yacs\n"; } # create a graph of bac-yac relationships my $graph = Graph::Undirected->new(); my $graphviz = GraphViz->new(directed=>0, width=>50, height=>50, epsilon=>0.05, node=>{ width=>0.2, height=>0.2, shape=>"circle", fontsize=>2, style=>"filled", fontname=>"/usr/local/fonts/ttf/trebuc.ttf" }, ); foreach my $bac (keys %bac_to_yacs) { my @yacs = @{$bac_to_yacs{$bac}}; (my $baclabel = lc $bac) =~ s/^rpci3//; foreach my $yac (@yacs) { print "debug adding edge $bac $yac\n"; # for computation $graph->add_edge($bac,$yac); # for vizualization # we'll transform the YAC name to something shorter # WIBRy933.123a12->W123A12 (my $yaclabel = lc $yac) =~ s/^([a-z]).+?\./$1/; $graphviz->add_edge($baclabel,$yaclabel,minlength=>0.25); } } open(GRAPH,">/home/martink/www/htdocs/tmp/bacyac.png"); print GRAPH $graphviz->as_png; close(GRAPH); my @groups = $graph->strongly_connected_components; foreach my $group_idx (0..@groups-1) { my @vertices = @{$groups[$group_idx]}; foreach my $vertex_idx (0..@vertices-1) { printf("%d %d %s\n",$group_idx,$vertex_idx,$vertices[$vertex_idx]); } }