17/pattern_search.pl
#!/usr/bin/perl -w
use strict;
my $id = '';
my $seq = '';
my $rev = '';
my $strand = '';
my $x = 0;
my $infile = 'chlorophyta_genomes.fa';
open IN, $infile or die "Oops, could not open $infile\n";
while (<IN>) {
chomp;
if (/^>/) {
if ($id ne '') {
# 1 #
pattern( $id, $seq, '+' );
# 2 #
$rev = $seq;
$rev = reverse($seq);
$rev =~ tr/AGCT/TCGA/;
pattern( $id, $rev, '-' );
}
$id = $_;
$seq = '';
}
else {
$seq .= $_;
}
}
close IN;
pattern( $id, $seq, '+' );
$rev = $seq;
$rev = reverse($seq);
$rev =~ tr/AGCT/TCGA/;
pattern( $id, $rev, '-' );
sub pattern {
my ( $id, $seq, $strand ) = @_;
my $gi = '';
# 3 #
if ( $id =~ /gi\|(\d+)\|.*\|.*\| (.*)(chloroplast|plastid)/ ) {
$gi = $1;
my $org = $2;
# 4 #
$org =~ s/ /_/g;
$gi = $gi . '_' . $org;
}
my $len = length($seq);
# 5 #
for ( my $i = 0 ; $i < $len - 100 ; $i++ ) {
my $subseq = substr( $seq, $i, 100 );
# 6 #
if ( $subseq =~ /.{42}AG[AG].(.)(.).{4}(.)(.).AGCA[AG].{40}/ ) {
if ( ( pair( $1, $4 ) ) && ( pair( $2, $3 ) ) ) {
$x++;
my $newid = $x . '_' . $gi . "$strand";
print ">$newid\n$subseq\n";
}
}
}
}
sub pair {
my ( $base1, $base2 ) = @_;
if ( ( ( $base1 eq 'G' ) && ( $base2 eq 'C' ) )
|| ( ( $base1 eq 'G' ) && ( $base2 eq 'T' ) )
|| ( ( $base1 eq 'A' ) && ( $base2 eq 'T' ) )
|| ( ( $base1 eq 'C' ) && ( $base2 eq 'G' ) )
|| ( ( $base1 eq 'T' ) && ( $base2 eq 'A' ) )
|| ( ( $base1 eq 'T' ) && ( $base2 eq 'G' ) ) )
{
return 1;
}
}