#!/usr/bin/python
import re
import sys
def swap_ab(string):
newstr = ''
for i in range(0, 8, 2):
temp = string[i:i + 2]
if temp == 'ba':
temp = 'ab'
newstr += temp
return newstr
binsize = 250000
bin = 0
# 1 #
bins = [] # array of hashes/dictionaries
bins.append({})
bins[0]['id'] = 0
bins[0]['hp'] = 0
bins[0]['hm'] = 0
bins[0]['ni'] = 0
size = 0
fourbases = ['A', 'T', 'C', 'G']
genotypes = {
'ababaaaa' : 'ID', # identical
'aaabaaaa' : 'IDHM', # identical or haplomaternal
'aaababab' : 'IDHM', # identical or haplomaternal
'abaaaaaa' : 'IDHP', # identical or haplopaternal
'abaaabab' : 'IDHP', # identical or haplopaternal
'ababaaab' : 'HMHP', # haplomaternal or haplopaternal
'abababab' : 'IDNI', # identical or non-identical
'abaaaaab' : 'HMNI', # haplomaternal or non-identical
'ababaabb' : 'NI', # non-identical
'aaabaaab' : 'HPNI' # haplopaternal or non-identical
}
for line in open('quartet.txt'):
line = line.rstrip()
# input format is
# chr4,1646,T,T,C,T,T,T,T,T,C
columns = re.split(',', line)
pos = int(columns[1])
# 2 #
bin = int(pos / binsize)
while bin >= len(bins):
bins.append({})
bins[len(bins) - 1]['id'] = 0
bins[len(bins) - 1]['hp'] = 0
bins[len(bins) - 1]['hm'] = 0
bins[len(bins) - 1]['ni'] = 0
quartet = ''
for i in range(3, 11):
quartet += columns[i]
count = 0
bases = []
# 3 #
if re.search('A', quartet):
count += 1
bases.append('A')
if re.search('T', quartet):
count += 1
bases.append('T')
if re.search('C', quartet):
count += 1
bases.append('C')
if re.search('G', quartet):
count += 1
bases.append('G')
if count == 2: # if the genotype string has
# exactly two different bases
# which is the most common base?
parents = quartet[0:4]
# select one of the two bases and count how
# many times is occurs in the parents
pcount = float(parents.count(bases[0]))
if pcount >= 2:
quartet = re.sub(bases[0], 'a', quartet)
quartet = re.sub(bases[1], 'b', quartet)
if pcount < 2:
quartet = re.sub(bases[1], 'a', quartet)
quartet = re.sub(bases[0], 'b', quartet)
# replace all ba with ab
quartet = swap_ab(quartet)
# check if the $quartet genotype is any
# of the keys in %genotypes
if quartet in genotypes:
if re.search('ID', genotypes[quartet]):
bins[bin]['id'] += 1
if re.search('HP', genotypes[quartet]):
bins[bin]['hp'] += 1
if re.search('HM', genotypes[quartet]):
bins[bin]['hm'] += 1
if re.search('NI', genotypes[quartet]):
bins[bin]['ni'] += 1
# Finally print all the data in bins
print 'bin\tidentical\thaplopaternal\thaplomaternal\tnonidentical'
for i in range(0, len(bins) - 1):
sys.stdout.write(str(i))
sys.stdout.write('\t')
sys.stdout.write(str(bins[i]['id']))
sys.stdout.write('\t')
sys.stdout.write(str(bins[i]['hp']))
sys.stdout.write('\t')
sys.stdout.write(str(bins[i]['hm']))
sys.stdout.write('\t')
sys.stdout.write(str(bins[i]['ni']))
print ''