17/pattern_search.pl


#!/usr/bin/perl -w

use strict;

my $id     = '';
my $seq    = '';
my $rev    = '';
my $strand = '';
my $x      = 0;

my $infile = 'chlorophyta_genomes.fa';
open IN, $infile or die "Oops, could not open $infile\n";
while (<IN>) {
    chomp;
    if (/^>/) {
        if ($id  ne '') {  

            # 1 #
            pattern( $id, $seq, '+' );

            # 2 #
            $rev = $seq;
            $rev = reverse($seq);
            $rev =~ tr/AGCT/TCGA/;
            pattern( $id, $rev, '-' );
        }
        $id  = $_;
        $seq = '';
    }
    else {
        $seq .= $_;
    }
}
close IN;

pattern( $id, $seq, '+' );
$rev = $seq;
$rev = reverse($seq);
$rev =~ tr/AGCT/TCGA/;
pattern( $id, $rev, '-' );

sub pattern {
    my ( $id, $seq, $strand ) = @_;
    my $gi = '';
    
    # 3 #
    if ( $id =~ /gi\|(\d+)\|.*\|.*\| (.*)(chloroplast|plastid)/ ) {
        $gi = $1;
        my $org = $2;
 
       # 4 #
        $org =~ s/ /_/g;
        $gi = $gi . '_' . $org;
    }
    my $len = length($seq);

    # 5 #
    for ( my $i = 0 ; $i < $len - 100 ; $i++ ) {
        my $subseq = substr( $seq,  $i, 100 );

         # 6 #
        if ( $subseq =~ /.{42}AG[AG].(.)(.).{4}(.)(.).AGCA[AG].{40}/ ) {
            if ( ( pair( $1, $4 ) ) && ( pair( $2, $3 ) ) ) {
                $x++;
                my $newid = $x . '_' . $gi . "$strand";
                print ">$newid\n$subseq\n";
            }    
        }    
    }    
}    

sub pair {
    my ( $base1, $base2 ) = @_;
    if (   ( ( $base1 eq 'G' ) && ( $base2 eq 'C' ) )
        || ( ( $base1 eq 'G' ) && ( $base2 eq 'T' ) )
        || ( ( $base1 eq 'A' ) && ( $base2 eq 'T' ) )
        || ( ( $base1 eq 'C' ) && ( $base2 eq 'G' ) )
        || ( ( $base1 eq 'T' ) && ( $base2 eq 'A' ) )
        || ( ( $base1 eq 'T' ) && ( $base2 eq 'G' ) ) )
    {
        return 1;
    }
}