18/snp.pl


#!/usr/bin/perl -w

# obtain pairwise distances from snp data,
# counting sites where at least one allele is different

use strict;

my @humans = (

    # SNPs appear in the SNP data file in columns in this order

    'YH',         # Han chinese
    'SJK',        # Seong-Jin Kim
    'JW',         # James Watson
    'CV',         # Craig Venter
    'NA18507',    # Yoruban of 1000 Genomes project
    'NA12891',    # Of Central European origin
    'ABT',        # Archbishop Desmond Tutu
    'KB1',        # Bushmen individual
    'chimp'       # chimpanzee
);

my @diff;

# 1 #
# initialize the distance matrix with zero values
# for the diagonal cells
for ( my $i = 1 ; $i < 10 ; $i++ ) {
    $diff[$i][$i] = 0;
}

# read the snp data from file

my $infile = 'snp.txt';
open( IN, $infile ) or die "Oops, could not open $infile";
while (<IN>) {
    chomp;
    my @columns = split;

    # 2 #
    for ( my $i = 1 ; $i < 9 ; $i++ ) {
        for ( my $j = $i + 1 ; $j < 10 ; $j++ ) {

            # 3 #
            if ( $columns[$i] ne $columns[$j] ) {
                $diff[$i][$j]++;

                # 4 #
                # to produce a symmetric matrix
                $diff[$j][$i]++;
            }
        }
    }
}
close IN;

# 5 #
# print a header for PHYLIP format
# with the number of species

print "    9\n";

# print the matrix data

for ( my $i = 1 ; $i < 10 ; $i++ ) {

    # 6 #
    my $txt = $humans[ $i - 1 ];
    $txt = substr( $txt, 0, 7 );
    print "$txt";
    my $len   = 10 - length($txt);
    my $short = ' ' x $len;
    print "$short";
    for ( my $j = 1 ; $j < 10 ; $j++ ) {
        print "$diff[$i][$j] ";
    }
    print "\n";
}