#!/bin/bash
#
# Derive gene lists, summary statistics and Circos data tracks from a table
# of human RefSeq genes.
#
# Input:
#   $DATA/genes.human.refseq.txt
#       Whitespace/tab-delimited gene table. The columns used here are
#       3 (chromosome), 5 (start), 6 (end), 9 (number of exons) and
#       13 (gene name).
#
# Output (written to the current directory):
#   genes.43000.txt              genes on canonical chromosomes
#   genes.19300.txt              one entry per unique gene name
#   genes.3500.txt               one entry per unique 3-letter name prefix
#   histogram.gene.size.txt      histogram of gene sizes, in kb
#   count.bychr.txt              number of genes on each chromosome
#   track.genes.txt              Circos track: CHR START END NUM_EXONS name=GENENAME
#   prefixes.count.txt           number of genes for each 3-letter name prefix
#   track.gene.count.<N>mb.txt   gene counts in N Mb windows, one file per window size
#
# Environment:
#   CIRCOS   Circos installation directory. Optional; the default below is
#            used when it is unset or empty.

# Stop at the first failing command or pipeline stage, and on any unset
# variable, rather than silently leaving empty or partial output files.
set -euo pipefail

DATA=.
SCRIPTS=../../../scripts

# Change this to the directory where Circos is installed, or export CIRCOS
# before running this script.
CIRCOS=${CIRCOS:-/home/martink/work/circos/svn/}

# Window sizes, in Mb, for the gene count tracks.
readonly -a BINS=(1 5 10 20)

die() {
  printf '%s\n' "$*" >&2
  exit 1
}

genes_in="$DATA/genes.human.refseq.txt"
[[ -r "$genes_in" ]] || die "cannot read input file: $genes_in"

# About 43,000 genes that are on canonical chromosomes.
# Drop comment lines, then drop entries matching random, _alt, none or chrUn
# (case-insensitive). Extended regular expressions (-E) are used so that the
# alternation operator | does not need escaping.
# https://stackoverflow.com/questions/6775904/grepping-using-the-alternative-operator
echo "Creating list of 43,000 genes"
grep -v "#" "$genes_in" \
  | grep -v -i -E "random|_alt|none|chrun" > genes.43000.txt

# Gene entries with unique names (column 13).
echo "Creating list of 19,300 genes (unique names)"
sort -u -k 13,13 genes.43000.txt > genes.19300.txt

# Gene entries with unique first 3 letter substrings in their name.
# The gene name is prepended to each line so that uniq can compare on the
# first three characters; the input is already ordered by gene name.
echo "Creating list of 3,500 genes (unique 3-letter prefix)"
awk '{print $13,$0}' genes.19300.txt | uniq -w 3 > genes.3500.txt

# Histogram of gene size: (end - start + 1) / 1000 gives the size in kb.
echo "Creating histogram of gene sizes (in kb)"
awk '{print ($6-$5+1)/1000}' genes.19300.txt \
  | "$SCRIPTS/histogram" -min 0 -max 200 -binsize 10 > histogram.gene.size.txt

# Number of genes on each chromosome, most populated first.
echo "Counting genes on each chromosome"
cut -d $'\t' -f 3 genes.19300.txt \
  | sort \
  | uniq -c \
  | sort -nr > count.bychr.txt

# Create a gene track with fields
#   CHR START END NUM_EXONS name=GENENAME
# The chromosome is the first field of each output line, so the chr -> hs
# rename is anchored to the start of the line and cannot touch a gene name.
echo "Creating Circos data file of gene positions"
awk '{print $3,$5,$6,$9,"name="$13}' genes.19300.txt \
  | sed 's/^chr/hs/' > track.genes.txt

# Prefixes. This step reads track.genes.txt, so it has to run after the track
# has been written; otherwise it would see a missing or stale file.
echo "Counting genes for each 3-letter prefix"
cut -d " " -f 5 track.genes.txt \
  | sed 's/name=//' \
  | cut -c 1-3 \
  | sort \
  | uniq -c \
  | sort -rn > prefixes.count.txt

# Count genes in windows of each size listed in BINS.
# ${CIRCOS%/} strips one trailing slash so the tool path has no "//".
for ws in "${BINS[@]}"; do
  echo "Creating Circos data file of gene counts in $ws Mb windows"
  "${CIRCOS%/}/tools/resample/bin/resample" -bin "${ws}e6" -count \
    < track.genes.txt > "track.gene.count.${ws}mb.txt"
done