<title>Perl Workshops - Learning Perl

#!/usr/local/bin/perl -w # file tagger.pl -- extract potential SAGE tags from the # virtual transcriptome, then # 1) count the total number of potential tags # 2) identify and list all tags that could result from internal polyA tracts # # The raw code -- see below for detailed comments # use strict; my $file = shift or die "Usage: ./tagger.pl input_file\n"; die "File $file not found\n" unless -e $file; my $lines = `cat $file`; my @lines = split '>', $lines; shift @lines; my $polyA = 0; my $total = 0; for ( @lines ) { my @rows = split "\n", $_; my ( $gene ) = $rows[0] =~ /^(\S+)/; shift @rows; my $seq = uc join '', @rows; my @frags = reverse split 'CATG', $seq; pop @frags; $total += @frags; my $pos = 0; for my $frag ( @frags ) { my $tag = substr $frag, 0, 10; next unless ++$pos > 1; while ( $frag =~ /(\w{10})/g ) { pos $frag -= 9; if ( polyA($1) ) { $polyA++; print "$gene\t$tag\t$pos\n"; last; } } } } warn "\n\nThere were a total of $total potential tags\n", "$polyA may be primed by internal poly_A tracts\n\n"; sub polyA { $_ = shift; return tr/A/A/ > 7 ? 1 : 0 } __END__ # the same code with comments... # load the strict pragma -- this will make sure you follow # a lot of common sense programming rules that will make your # code much easier to read and debug. Use this pragma for every script you # write and you won't learn nearly as many bad habits that will come # back to haunt you later. use strict; # get the filename (entered as a command-line argument) my $file = shift # or die with a complaint about not getting one or die "Usage: ./tagger.pl input_file\n"; # die if the input file does not exist die "File $file not found\n" unless -e $file; # get all of the predicted transcripts my $lines = `cat $file`; # split the file into individual fasta records my @lines = split '>', $lines; # remove the first (empty) element from the list shift @lines; # set the counters to zero my $total = 0; my $polyA = 0; # get the tags for ( @lines ) { # split the record into individual lines my @rows = split "\n", $_; # get the gene name (first word in the first line) my ( $gene ) = $rows[0] =~ /^(\S+)/; # remove the id line shift @rows; # merge the remainder into a sequence string my $seq = uc join '', @rows; # digest the sequence with NlaIII and reverse the fragment order my @frags = reverse split 'CATG', $seq; # remove the 5'-most fragment (does not have an NlaII site at 5' end) pop @frags; # add the number of fragments (tags) to the tally $total += @frags; # extract the tags by position -- numbered upstream from polyA tail # set the position counter to zero my $pos = 0; # examine each of the fragments for my $frag ( @frags ) { # extract the 10 bp tag (excluding the NlaII site) my $tag = substr $frag, 0, 10; # increment the position and skip position 1 next unless ++$pos > 1; # find all possible contiguous blocks of 10 bp while ( $frag =~ /(\w{10})/g ) { # reset the register to nucleotide n + 1 pos $frag -= 9; # does the subroutine 'polyA' return a true value? if ( polyA($1) ) { # if yes, increment $polyA $polyA++; # and print the tag info to STDOUT print "$gene\t$tag\t$pos\n"; # and abort the while loop last; } # otherwise, go on to the next 10 bp block } # end of while ( $frag =~ /(\w{10})/g ) loop } # end of for my $frag ( @frags ) loop } # end of for ( @lines ) loop # print the summary to STDERR (will show on screen even if STDOUT # is redirected to a file) warn "\n\nThere were a total of $total potential tags\n", "$polyA may be primed by internal poly_A tracts\n\n"; # this subroutine checks a block of 10 nt to see if it is >= 80% 'A' sub polyA { # get the 10 bp string passed as an argument $_ = shift; # this determines the output of the subroutine return # first, count the 'A's -- a side-effect of the 'tr' operator # is that it returns the count of transliterations performed tr/A/A/ # are there eight or more? > 7 ? # return true if the answer is yes 1 # otherwise : # return false 0 }

course 0.1.0.1

legend

course code

categories

levels

lecture code viewer

downloads

1 | First Look at Perl | 0.1.0.1.1