#!/usr/bin/python

import re
import sys

def swap_ab(string):
    newstr = ''
    for i in range(0, 8, 2):
        temp = string[i:i + 2]
        if temp == 'ba':
            temp = 'ab'
        newstr += temp

    return newstr


binsize = 250000

bin = 0

# 1 #

bins = []  # array of hashes/dictionaries
bins.append({})
bins[0]['id'] = 0
bins[0]['hp'] = 0
bins[0]['hm'] = 0
bins[0]['ni'] = 0

size = 0
fourbases = ['A', 'T', 'C', 'G']

genotypes = {
    'ababaaaa' : 'ID',      # identical
    'aaabaaaa' : 'IDHM',    # identical or haplomaternal
    'aaababab' : 'IDHM',    # identical or haplomaternal
    'abaaaaaa' : 'IDHP',    # identical or haplopaternal
    'abaaabab' : 'IDHP',    # identical or haplopaternal
    'ababaaab' : 'HMHP',    # haplomaternal or haplopaternal
    'abababab' : 'IDNI',    # identical or non-identical
    'abaaaaab' : 'HMNI',    # haplomaternal or non-identical
    'ababaabb' : 'NI',      # non-identical
    'aaabaaab' : 'HPNI'     # haplopaternal or non-identical
    }

for line in open('quartet.txt'):
    line = line.rstrip()

    # input format is
    # chr4,1646,T,T,C,T,T,T,T,T,C

    columns = re.split(',', line)

    pos = int(columns[1])

    # 2 #

    bin = int(pos / binsize)
    while bin >= len(bins):
        bins.append({})
        bins[len(bins) - 1]['id'] = 0
        bins[len(bins) - 1]['hp'] = 0
        bins[len(bins) - 1]['hm'] = 0
        bins[len(bins) - 1]['ni'] = 0

    quartet = ''

    for i in range(3, 11):
        quartet += columns[i]

    count = 0
    bases = []

    # 3 #

    if re.search('A', quartet):
        count += 1
        bases.append('A')

    if re.search('T', quartet):
        count += 1
        bases.append('T')

    if re.search('C', quartet):
        count += 1
        bases.append('C')

    if re.search('G', quartet):
        count += 1
        bases.append('G')

    if count == 2:  # if the genotype string has
                    # exactly two different bases
                    # which is the most common base?

        parents = quartet[0:4]

        # select one of the two bases and count how
        # many times is occurs in the parents

        pcount = float(parents.count(bases[0]))

        if pcount >= 2:

            quartet = re.sub(bases[0], 'a', quartet)
            quartet = re.sub(bases[1], 'b', quartet)

        if pcount < 2:

            quartet = re.sub(bases[1], 'a', quartet)
            quartet = re.sub(bases[0], 'b', quartet)

        # replace all ba with ab

        quartet = swap_ab(quartet)

        # check if the $quartet genotype is any
        # of the keys in %genotypes

        if quartet in genotypes:

            if re.search('ID', genotypes[quartet]):
                bins[bin]['id'] += 1
            if re.search('HP', genotypes[quartet]):
                bins[bin]['hp'] += 1
            if re.search('HM', genotypes[quartet]):
                bins[bin]['hm'] += 1
            if re.search('NI', genotypes[quartet]):
                bins[bin]['ni'] += 1

# Finally print all the data in bins

print 'bin\tidentical\thaplopaternal\thaplomaternal\tnonidentical'

for i in range(0, len(bins) - 1):
    sys.stdout.write(str(i))
    sys.stdout.write('\t')
    sys.stdout.write(str(bins[i]['id']))
    sys.stdout.write('\t')
    sys.stdout.write(str(bins[i]['hp']))
    sys.stdout.write('\t')
    sys.stdout.write(str(bins[i]['hm']))
    sys.stdout.write('\t')
    sys.stdout.write(str(bins[i]['ni']))
    print ''