#!/usr/bin/python
# obtain pairwise distances from snp data,
# counting sites where at least one allele is different
import re
# Labels of the sequenced samples. The order matters: it is the order
# in which the samples' SNP columns appear in the SNP data file.
humans = [
    'YH',       # Han Chinese
    'SJK',      # Seong-Jin Kim
    'JW',       # James Watson
    'CV',       # Craig Venter
    'NA18507',  # Yoruban, 1000 Genomes project
    'NA12891',  # Central European origin
    'ABT',      # Archbishop Desmond Tutu
    'KB1',      # Bushmen individual
    'chimp',    # chimpanzee
]
# 1 #
# Initialize the distance matrix with a zero in every cell; the
# diagonal cells are never incremented and therefore stay zero.
# The matrix is 10 x 10 so that rows and columns can be addressed by
# the 1-based sample index (1..9); row 0 and column 0 are unused.
# Every row is created by its own expression so that the rows are
# independent lists (a plain [[0] * 10] * 10 would alias one row).
diff = [[0] * 10 for _ in range(10)]
# read the snp data from file; the with-statement guarantees that
# the file is closed again, even if a malformed line raises
with open('snp.txt') as snp_file:
    for line in snp_file:
        line = line.rstrip()
        if not line:
            # a blank line (e.g. at the end of the file) has no
            # columns to compare; skip it instead of failing with
            # an IndexError below
            continue
        columns = line.split(' ')
        # 2 #
        # visit every unordered pair (i, j) with i < j of the columns
        # 1..9; column 0 is never compared (presumably a site
        # identifier -- confirm against the layout of snp.txt)
        for i in range(1, 9):
            for j in range(i + 1, 10):
                # 3 #
                # the two column values are compared as strings, so
                # any textual difference counts as one differing site
                if columns[i] != columns[j]:
                    diff[i][j] += 1
                    # 4 #
                    # to produce a symmetric matrix
                    diff[j][i] += 1
# 5 #
# print a header for PHYLIP format
# with the number of species
# NOTE: every print below is a call with exactly one argument, which
# produces the same output under the Python 2 print statement and
# the Python 3 print function.
n_samples = len(humans)
print('  %d' % n_samples)
# print the matrix data
for i in range(1, n_samples + 1):
    # 6 #
    # the sample name is cut to at most 7 characters and padded with
    # blanks to a fixed width of 10 characters in front of the
    # distance values
    name = humans[i - 1][:7].ljust(10)
    # samples are indexed from 1, so column 0 of the matrix is skipped
    distances = ' '.join([str(d) for d in diff[i][1:n_samples + 1]])
    # the trailing blank keeps the output byte-identical to what the
    # former chain of "print x," statements emitted
    print(name + distances + ' ')