#!/usr/bin/python
import string
import re
import sys
complement = string.maketrans('ATCG', 'TAGC')
myid = ''
seq = ''
rev = ''
strand = ''
global x
x = 0
def pattern(myid, seq, strand):
global x
gi = ''
# 3 #
match = re.search('gi\|(\d+)\|.*\|.*\| (.*)(chloroplast|plastid)',
myid)
if match:
gi = match.group(1)
org = match.group(2)
# 4 #
org = org.replace(' ', '_')
gi = gi + '_' + org
length = len(seq)
# 5 #
for i in range(0, length - 100):
subseq = seq[i:i + 100]
# 6 #
match = re.search('.{42}AG[AG].(.)(.).{4}(.)(.).AGCA[AG].{40}',
subseq)
if match:
if pair(match.group(1), match.group(4)) \
and pair(match.group(2), match.group(3)):
x += 1
newmyid = '>' + str(x) + '_' + gi + strand
print newmyid
print subseq
def pair(base1, base2):
if base1 == 'G' and base2 == 'C' \
or base1 == 'G' and base2 == 'T' \
or base1 == 'A' and base2 == 'T' \
or base1 == 'C' and base2 == 'G' \
or base1 == 'T' and base2 == 'A' \
or base1 == 'T' and base2 == 'G':
return 1
for line in open('chlorophyta_genomes.fa'):
line = line.rstrip()
if re.search('^>', line):
if myid != '':
# 1 #
strand = '+'
pattern(myid, seq, strand)
# 2 #
rev = seq.translate(complement)[::-1]
strand = '-'
pattern(myid, rev, strand)
myid = line
seq = ''
else:
seq += line
strand = '+'
pattern(myid, seq, strand)
rev = seq.translate(complement)[::-1]
strand = '-'
pattern(myid, rev, strand)