#!/usr/bin/python
import re
def parse_coordinate_list(field):
    """Convert a comma-separated coordinate field into a list of ints.

    A single trailing ',' (as in ``"10,20,"``) is dropped before splitting.

    Raises:
        ValueError: if any remaining item is not a valid integer
            (this includes an empty field).
    """
    if field.endswith(','):
        field = field[:-1]  # remove the trailing ','
    return [int(item) for item in field.split(',')]


def load_exon_annotations(path):
    """Build a position -> annotation lookup from a tab-separated file.

    Lines starting with '#' are skipped, as are lines that have no value
    in column 8 (the protein identifier) or that are too short to have
    that column at all (e.g. blank lines).

    For every remaining line, columns 6 and 7 are read as comma-separated
    exon start and end coordinates. Every integer position from each start
    through each end (inclusive) is mapped to the string
    ``"<column 0> <column 2> <column 8>"`` (name, strand, protein ID).
    When two lines cover the same position, the later line wins.

    NOTE(review): the end coordinate is treated as inclusive, exactly as
    the original script did -- confirm this matches the coordinate
    convention of the input file.

    Args:
        path: path of the annotation file.

    Returns:
        dict mapping int position to the annotation string.

    Raises:
        ValueError: if a coordinate is not an integer.
    """
    genome = {}
    with open(path) as handle:
        for line in handle:
            if line.startswith('#'):
                continue
            columns = line.rstrip('\r\n').split('\t')
            # Skip records without a protein identifier (or without the
            # column entirely) instead of failing with IndexError.
            if len(columns) < 9 or not columns[8]:
                continue
            # Built once per record: every position of every exon shares it.
            label = ' '.join((columns[0], columns[2], columns[8]))
            exon_starts = parse_coordinate_list(columns[6])
            exon_ends = parse_coordinate_list(columns[7])
            # zip pairs starts with ends; surplus entries on either side
            # are ignored.
            for start, stop in zip(exon_starts, exon_ends):
                for position in range(start, stop + 1):
                    genome[position] = label
    return genome


def find_annotated_positions(genome, path):
    """Yield ``(position, annotation)`` for query lines that hit *genome*.

    Each non-blank line of the tab-separated file at *path* must carry an
    integer position in column 1. Lines whose position is not a key of
    *genome* produce nothing. Results are yielded in file order.

    Args:
        genome: dict as returned by ``load_exon_annotations``.
        path: path of the query file.

    Raises:
        IndexError: if a non-blank line has no column 1.
        ValueError: if column 1 is not an integer.
    """
    with open(path) as handle:
        for line in handle:
            if not line.strip():
                continue  # tolerate blank lines (e.g. at end of file)
            columns = line.rstrip('\r\n').split('\t')
            pos = int(columns[1])
            if pos in genome:
                yield pos, genome[pos]


def main(annotation_path='chr7.txt', query_path='bushmen.out'):
    """Print ``"<position> <annotation>"`` for every annotated query position.

    The defaults are the file names the script has always read.
    """
    genome = load_exon_annotations(annotation_path)
    for pos, label in find_annotated_positions(genome, query_path):
        # A single pre-formatted string prints identically under the
        # Python 2 print statement and the Python 3 print function.
        print('%d %s' % (pos, label))


if __name__ == '__main__':
    main()