# get_ibd_tracts.py
# work with results of max_ibd.py, which has rs column then one ibd prob per pair
# output discovered tracts of IBD
import sys
if len(sys.argv)!=3:
for i in range(50): print "*",
print
print "This program takes results from max_ibd.py and creates a summary of discovered IBD tracts."
print "By default, a position is IBD if the probability is > 0.5."
print "Output is one line per tract, with id1, id2, start index, end index, genetic length."
print "Start index and end index are integers from 1 to number of markers."
print
print "Usage: python get_ibd_tracts.py my.max_ibd my_genetic.markers (pipe output to desired file)"
for i in range(50): print "*",
print
raise SystemExit
thresh1 = 0.5 # a tract must exceed this probability somewhere in the region
thresh2 = 0.5 # the tract ends when the IBD probability falls below this value
nlines = -1 # nlines to process while debugging if positive number
infile = open(sys.argv[1])
markerfile = open(sys.argv[2])
# read in genetic marker distances
rs_dist = {}
for line in markerfile.xreadlines():
bits = line.split()
rs_dist[bits[0]] = float(bits[1])
# get id info
ids = []
for i in range(2):
bits = infile.readline().split()
ids.append(bits[1:])
# read in ibd info
ibds = []; snps = []
i = 0
for line in infile.xreadlines():
if nlines > 0 and i > nlines:
break
bits = line.split()
snps.append(bits[0])
this_ibds = []
for j in range(1,len(bits)):
this_ibds.append(float(bits[j]))
ibds.append(this_ibds)
i = i + 1
npairs = len(ibds[0])
nsnps = len(snps)
# calculate lengths
positive_lengths = []
positive_starts = []
lengths = [[0 for i in range(npairs)] for j in range(nsnps)]
positive_dists = []
positive_ids1 = []; positive_ids2 = []
for i in range(npairs):
for j in range(nsnps):
if ibds[j][i]>thresh1 and lengths[j][i]==0:
length1 = 0
start1 = j
this_pos = rs_dist[snps[j]]
pos1 = this_pos
for k in range(j-1,-1,-1):
if ibds[k][i]> thresh2:
length1 = length1 + 1
pos1 = rs_dist[snps[k]]
start1 = k
else:
break
length2 = 0
pos2 = this_pos
for k in range(j+1,nsnps):
if ibds[k][i] > thresh2:
length2 = length2 + 1
pos2 = rs_dist[snps[k]]
else:
break
current_length = length1+length2+1
current_dist = pos2 - pos1
for k in range(j-length1,j+length2+1):
lengths[k][i] = current_length
positive_lengths.append(current_length)
positive_dists.append(current_dist)
positive_starts.append(start1)
positive_ids1.append(ids[0][i])
positive_ids2.append(ids[1][i])
for id1,id2,start,length,dist in zip(positive_ids1,positive_ids2,positive_starts,positive_lengths,positive_dists):
print id1,id2,start+1,start+length,dist