#!/usr/bin/python
import re
# Scan the sequence stored in 'mrna.fa' for fixed-length windows that
#   * start with AA and end with TT,
#   * have a G+C fraction between 0.3 and 0.5 (inclusive), and
#   * contain no run of four identical bases and no run of six G/C bases,
# and print the 0-based offset and the text of every window that qualifies.

# Length, in bases, of each window that is examined.
WINDOW_LENGTH = 23

# Inclusive bounds on the fraction of G and C bases in an accepted window.
MIN_GC_FRACTION = 0.3
MAX_GC_FRACTION = 0.5

# Disqualifying runs: four identical bases in a row, or six consecutive
# positions holding G or C.  Compiled once instead of five searches per window.
BAD_RUN = re.compile(r'A{4}|T{4}|C{4}|G{4}|[GC]{6}')


def read_sequence(path):
    """Return the sequence stored in the FASTA-style file at *path*.

    Every line containing '>' is skipped as a header; the remaining lines
    are stripped of trailing whitespace and concatenated in file order.
    """
    with open(path) as handle:
        # join() avoids the quadratic cost of repeated string concatenation.
        return ''.join(line.rstrip() for line in handle if '>' not in line)


def is_candidate(window):
    """Return True if *window* passes the end, GC-content and run filters."""
    # First two positions must be AA and the last two TT.
    if not (window.startswith('AA') and window.endswith('TT')):
        return False
    # Fraction of G and C bases; float() keeps true division on Python 2.
    gc_fraction = (window.count('G') + window.count('C')) / float(len(window))
    if not MIN_GC_FRACTION <= gc_fraction <= MAX_GC_FRACTION:
        return False
    # Reject stretches of As, Ts, Cs or Gs and six-position G/C regions.
    return BAD_RUN.search(window) is None


def find_candidates(seq, window_length=WINDOW_LENGTH):
    """Yield ``(offset, window)`` for every qualifying window of *seq*.

    *offset* is the 0-based start of the window.  Nothing is yielded when
    *seq* is shorter than *window_length*.
    """
    # The "+ 1" includes the final window, which ends on the last base; the
    # previous bound of len(seq) - window_length silently skipped it.
    for start in range(len(seq) - window_length + 1):
        window = seq[start:start + window_length]
        if is_candidate(window):
            yield start, window


def main(path='mrna.fa'):
    """Print the offset and text of every qualifying window found in *path*."""
    for start, window in find_candidates(read_sequence(path)):
        # Same bytes as the Python 2 statement ``print i, '\t', testseq``
        # (which emits no space after the tab), but valid on Python 2 and 3.
        print('%d \t%s' % (start, window))


if __name__ == '__main__':
    main()