#!/usr/bin/python
import re
# Print the parts of the cmsearch input sequences that were reported as
# hits: the sequences come from pattern_search.out, the hits from the
# cmsearch output in chlorophyta.tab.


def _read_sequences(path):
    """Read the sequences that were used as input to cmsearch.

    Lines starting with '>' open a new record; the rest of that line is
    the record id.  Every following non-empty line is appended to that
    record, so a sequence wrapped over several lines is kept whole and a
    blank line cannot wipe out a sequence that was already read.

    If an id occurs more than once, the last record with data wins.
    Data lines that precede the first header are ignored.

    :param path: name of the file to read.
    :returns: dict mapping record id to its sequence string.
    """
    sequences = {}
    current = None
    fresh = False  # True until the first data line of a record is stored
    with open(path) as handle:
        for raw in handle:
            line = raw.rstrip()
            if line.startswith('>'):
                current = line[1:]
                fresh = True
            elif line and current is not None:
                if fresh:
                    sequences[current] = line
                    fresh = False
                else:
                    sequences[current] += line
    return sequences


def main(seq_path='pattern_search.out', tab_path='chlorophyta.tab',
         model='SRP_bact', min_score=20):
    """Print the hit region of every cmsearch hit scoring above min_score.

    A hit line is any line of tab_path that contains the model name
    followed by seven space-separated fields.  Field 1 is used as the
    sequence id, fields 2 and 3 as the 1-based inclusive begin and end
    of the hit, and field 6 as the score.  For each accepted hit two
    lines are printed: '>ID BEG: b END: e' and the sub-sequence.

    :param seq_path: file with the sequences that were given to cmsearch.
    :param tab_path: file with the cmsearch output.
    :param model: model name that starts the fields of a hit line.
    :param min_score: hits are printed only if their score is greater.
    :raises KeyError: if an accepted hit names an id missing in seq_path.
    :raises ValueError: if a begin, end or score field is not numeric.
    """
    sequences = _read_sequences(seq_path)
    # Compiled once, outside the loop; all seven fields must be present
    # even though only fields 1, 2, 3 and 6 are used.
    hit_re = re.compile(re.escape(model) + r' +(\S+)' * 7)
    with open(tab_path) as handle:
        for line in handle:
            match = hit_re.search(line)
            if not match:
                continue
            name = match.group(1)
            beg = int(match.group(2))
            end = int(match.group(3))
            score = float(match.group(6))
            if not score > min_score:
                continue
            # NOTE(review): when beg > end the slice below is empty or
            # truncated; presumably such hits are on the reverse strand
            # and would need reverse-complementing -- confirm.
            region = sequences[name][beg - 1:end]
            # A single pre-formatted string keeps the output identical
            # under both the print statement and the print function.
            print('>%s BEG: %d END: %d' % (name, beg, end))
            print(region)


if __name__ == '__main__':
    main()