quartet.pl


#!/usr/bin/perl -w

use strict;

my $binsize = 250000;
my $bin;

# 1 #
my @bins;    # array of hashes

my @fourbases = ( 'A', 'T', 'C', 'G' );

my %genotypes = (
   'ababaaaa' => 'ID',      # identical
   'aaabaaaa' => 'IDHM',    # identical or haplomaternal
   'aaababab' => 'IDHM',    # identical or haplomaternal
   'abaaaaaa' => 'IDHP',    # identical or haplopaternal
   'abaaabab' => 'IDHP',    # identical or haplopaternal
   'ababaaab' => 'HMHP',    # haplomaternal or haplopaternal
   'abababab' => 'IDNI',    # identical or non-identical
   'abaaaaab' => 'HMNI',    # haplomaternal or non-identical
   'ababaabb' => 'NI',      # non-identical
   'aaabaaab' => 'HPNI'     # haplopaternal or non-identical
);

my $infile = 'quartet.txt';
open(IN, $infile) or die "Could not open $infile\n";
while (<IN>) {
   chomp;

   # input format is
   # chr4,1646,T,T,C,T,T,T,T,T,C

   my @columns = split(',');
   my $pos = $columns[1];

   # 2 #
   $bin = int( $pos / $binsize );

   my $quartet = '';
   for ( my $i = 3 ; $i < 11 ; $i++ ) {
        $quartet .= $columns[$i];
   }

   my $count = 0;
   my @bases = ();

   # 3 #
   foreach my $base (@fourbases) {
      if ( $quartet =~ /$base/ ) {
         $count++;
         push @bases, $base;
      }
    }

   if ( $count == 2 ) {    # if the genotype string has
                           # exactly two different bases

      # which is the most common base?
      my $parents = substr( $quartet, 0, 4 );

      # select one of the two bases and count how
      # many times it occurs in the parents
      my $pcount = ( $parents =~ s/$bases[0]//g );
      if ( $pcount >= 2 ) {
         $quartet =~ s/$bases[0]/a/g;
         $quartet =~ s/$bases[1]/b/g;
      }
      if ( $pcount < 2 ) {
         $quartet =~ s/$bases[1]/a/g;
         $quartet =~ s/$bases[0]/b/g;
      }

      # replace all ba with ab
      $quartet = swap_ab($quartet);

      # check %genotypes
      if ($genotypes{$quartet}) {
         if ( $genotypes{$quartet} =~ /ID/ ) { $bins[$bin]{id}++; }
         if ( $genotypes{$quartet} =~ /HP/ ) { $bins[$bin]{hp}++; }
         if ( $genotypes{$quartet} =~ /HM/ ) { $bins[$bin]{hm}++; }
         if ( $genotypes{$quartet} =~ /NI/ ) { $bins[$bin]{ni}++; }
      }
   }
}

close IN;

# Some of the elements in @bins are undefined.
# This is to set these to zero instead.
for ( my $i = 0 ; $i <= $#bins ; $i++ ) {
   unless ( $bins[$i]{id} ) { $bins[$i]{id} = 0; }
   unless ( $bins[$i]{ni} ) { $bins[$i]{ni} = 0; }
   unless ( $bins[$i]{hp} ) { $bins[$i]{hp} = 0; }
   unless ( $bins[$i]{hm} ) { $bins[$i]{hm} = 0; }
}

# Finally print all the data in @bins
print "bin\tidentical\thaplopaternal\thaplomaternal\tnonidentical\n";
for ( my $i = 0 ; $i <= $#bins ; $i++ ) {
   print "$i\t$bins[$i]{id}\t$bins[$i]{hp}\t";
   print "$bins[$i]{hm}\t$bins[$i]{ni}\n";
}

sub swap_ab {
   my $str    = $_[0];
   my $newstr = '';
   for ( my $i = 0 ; $i < 8 ; $i = $i + 2 ) {
      my $temp = substr( $str, $i, 2 );
      if ( $temp eq 'ba' ) { $temp = 'ab'; }
      $newstr .= $temp;
    }
    return $newstr;
}