#!/usr/bin/python

import string
import re
import sys

complement = string.maketrans('ATCG', 'TAGC')
myid = ''
seq = ''
rev = ''
strand = ''
global x
x = 0


def pattern(myid, seq, strand):
    global x
    gi = ''

    # 3 #

    match = re.search('gi\|(\d+)\|.*\|.*\| (.*)(chloroplast|plastid)',
                      myid)
    if match:
        gi = match.group(1)
        org = match.group(2)

    # 4 #

        org = org.replace(' ', '_')
        gi = gi + '_' + org

    length = len(seq)

    # 5 #

    for i in range(0, length - 100):
        subseq = seq[i:i + 100]

         # 6 #

        match = re.search('.{42}AG[AG].(.)(.).{4}(.)(.).AGCA[AG].{40}',
                          subseq)
        if match:
            if pair(match.group(1), match.group(4)) \
                and pair(match.group(2), match.group(3)):
                x += 1
                newmyid = '>' + str(x) + '_' + gi + strand
                print newmyid
                print subseq


def pair(base1, base2):

    if     base1 == 'G' and base2 == 'C' \
        or base1 == 'G' and base2 == 'T' \
        or base1 == 'A' and base2 == 'T' \
        or base1 == 'C' and base2 == 'G' \
        or base1 == 'T' and base2 == 'A' \
        or base1 == 'T' and base2 == 'G':

        return 1


for line in open('chlorophyta_genomes.fa'):

    line = line.rstrip()

    if re.search('^>', line):
        if myid != '':

            # 1 #

            strand = '+'
            pattern(myid, seq, strand)

            # 2 #

            rev = seq.translate(complement)[::-1]
            strand = '-'
            pattern(myid, rev, strand)

        myid = line
        seq = ''
    else:

        seq += line

strand = '+'
pattern(myid, seq, strand)

rev = seq.translate(complement)[::-1]
strand = '-'
pattern(myid, rev, strand)