#!/bin/bash

# Directory that holds the input table segdup.human.txt (current directory by default).
DATA="."
# Relative path to a scripts directory; not referenced in the visible part of this script.
SCRIPTS="../../../scripts"

# Circos installation directory -- edit this to match the local install.
# Not referenced in the visible part of this script.
CIRCOS="/home/martink/work/circos/svn/"

# Convert the segmental duplication table into a Circos link track.
#
# Pipeline stages, in order:
#   1. drop every line that contains a '#' (this removes the header/comment lines),
#   2. keep tab-separated columns 2,3,4 and 8,9,10 (tab is cut's default delimiter),
#   3. drop links touching random, unanchored (chrUn), _alt or "none" sequences
#      (case-insensitive match anywhere in the remaining columns),
#   4. rename every "chr" to "hs".
#
# "$DATA" is quoted so that the input directory may contain whitespace.
echo "Creating Circos segmental duplication links"
grep -v '#' "$DATA/segdup.human.txt" \
  | cut -f 2,3,4,8,9,10 \
  | grep -v -i 'random\|chrun\|_alt\|none' \
  | sed 's/chr/hs/g' > track.segdup.all.txt

# Size distribution of the links, in kb.
# For every link the value (end - start - 1) / 1000 is computed from columns 3 and 2
# and fed to the external histogram.v2 tool (bins of width 1 between 1 and 20).
echo "Creating histogram of segmental duplication size (in kb)"
awk '{ print ($3 - $2 - 1) / 1000 }' track.segdup.all.txt \
  | histogram.v2 -min 1 -max 20 -binsize 1 > histogram.segdup.size.txt

# Rank every link by size within its source chromosome.
#
# For each distinct chromosome found in column 1 of track.segdup.all.txt:
#   1. select the links whose first column is exactly that chromosome,
#   2. prefix each link with its size (column 3 minus column 2),
#   3. sort largest first and append sizerank=N, where N is the awk record
#      number (N = 1 for the largest link of that chromosome).
# The combined stream is then sorted by size across all chromosomes and the
# helper size column is removed before writing track.segdup.indexed.txt.
#
# Everything is streamed through pipes, no scratch file is used: a file left
# over from an interrupted earlier run can therefore not leak stale links
# into the output, and no file in the working directory is deleted.

echo "Creating Circos segmental duplication links with size rank for each chromosome"

segdup_links=track.segdup.all.txt

cut -f 1 "$segdup_links" \
  | sort -u \
  | while IFS= read -r chr; do
      # Nothing to rank for an empty chromosome name.
      [[ -n "$chr" ]] || continue
      # Exact comparison on field 1, so hs1 never picks up hs10 and the name
      # is never interpreted as a regular expression.
      awk -F '\t' -v chr="$chr" 'BEGIN { OFS = "\t" } $1 == chr { print $3 - $2, $0 }' "$segdup_links" \
        | sort -nr \
        | awk 'BEGIN { OFS = "\t" } { print $0, "sizerank=" NR }'
    done \
  | sort -nr \
  | cut -f 2- > track.segdup.indexed.txt