#!/usr/bin/python

# obtain pairwise distances from snp data,
# counting sites where at least one allele is different

import re

# Labels of the individuals whose SNPs are compared.
# The SNP data file holds their columns in this same order.
humans = [
    'YH',  # Han Chinese
    'SJK',  # Seong-Jin Kim
    'JW',  # James Watson
    'CV',  # Craig Venter
    'NA18507',  # Yoruban of 1000 Genomes project
    'NA12891',  # of Central European origin
    'ABT',  # Archbishop Desmond Tutu
    'KB1',  # Bushmen individual
    'chimp',  # chimpanzee
]


# 1 #
# build the 10 x 10 distance matrix as a list of rows,
# every cell (the diagonal included) starting at zero;
# each row is its own list so cells can be updated independently

diff = [[0] * 10 for _ in range(10)]

# read the snp data from file
#
# Each non-blank line is split on single spaces and columns 1..9 are
# compared pairwise; column 0 is never read here (presumably an SNP
# identifier -- TODO confirm against snp.txt).
# The 'with' block closes the file even if a malformed line raises.

with open('snp.txt') as snp_file:
    for line in snp_file:
        line = line.rstrip()
        if not line:
            # tolerate blank lines (e.g. an empty last line) instead of
            # failing with an IndexError on the column lookup below
            continue
        columns = line.split(' ')

        # 2 #
        # visit every unordered pair (i, j) with 1 <= i < j <= 9

        for i in range(1, 9):
            for j in range(i + 1, 10):

                # 3 #
                # a site counts as a difference when the two
                # column strings are not identical

                if columns[i] != columns[j]:
                    diff[i][j] += 1

                    # 4 #
                    # to produce a symmetric matrix

                    diff[j][i] += 1

# 5 #
# print a header for PHYLIP format
# with the number of species
#
# Every print call below receives exactly one pre-built string, so the
# Python 2 print statement and the Python 3 print function emit the
# same bytes.

print('    ' + str(len(humans)))

# print the matrix data
# rows and columns of diff are addressed from index 1, so the
# individual at humans[i - 1] is read from row i

for i, name in enumerate(humans, 1):

    # 6 #
    # row label: at most 7 characters of the name,
    # blank-padded to a fixed width of 10

    label = name[0:7].ljust(10)
    row = ' '.join([str(diff[i][j]) for j in range(1, len(humans) + 1)])

    # each row ends with a single blank before the newline;
    # this is part of the established output layout
    print(label + row + ' ')