# get_hbd_tracts.py
# work with results of max_hbd.py, which has rs column then one hbd prob per pair
# output discovered hbd tracts
import sys
if len(sys.argv)!=3:
for i in range(50): print "*",
print
print "This program takes results from max_hbd.py and creates a summary of discovered HBD tracts."
print "By default, a position is HBD if the probability is > 0.5."
print "Output is one line per tract, with id, start index, end index, genetic length."
print "Start index and end index are integers from 1 to number of markers."
print
print "Usage: python get_hbd_tracts.py my.max_hbd my_genetic.markers (pipe output to desired file)"
for i in range(50): print "*",
print
raise SystemExit
thresh1 = 0.5 # a tract must exceed this probability somewhere in the region
thresh2 = 0.5 # the tract ends when the IBD probability falls below this value
infile = open(sys.argv[1])
markerfile = open(sys.argv[2])
# read in genetic marker distances
rs_dist = {}
for line in markerfile.xreadlines():
bits = line.split()
rs_dist[bits[0]] = float(bits[1])
bits = infile.readline().split()
snps = bits[1:]
nsnps = len(snps)
# read in hbd info
for line in infile.xreadlines():
bits = line.split()
# calculate lengths
lengths = [0]*nsnps
hbds = [float(x) for x in bits[1:]]
for j in range(nsnps):
if hbds[j]>thresh1 and lengths[j]==0:
length1 = 0
this_pos = rs_dist[snps[j]]
pos1 = this_pos
start1 = j
for k in range(j-1,-1,-1):
if hbds[k]> thresh2:
length1 = length1 + 1
pos1 = rs_dist[snps[k]]
start1 = k
else:
break
length2 = 0
pos2 = this_pos
for k in range(j+1,nsnps):
if hbds[k] > thresh2:
length2 = length2 + 1
pos2 = rs_dist[snps[k]]
else:
break
current_length = length1+length2+1
current_dist = pos2 - pos1
for k in range(j-length1,j+length2+1):
lengths[k] = current_length
print bits[0], start1+1,start1+current_length, current_dist