# get_ibd_tracts.py # work with results of max_ibd.py, which has rs column then one ibd prob per pair # output discovered tracts of IBD import sys if len(sys.argv)!=3: for i in range(50): print "*", print print "This program takes results from max_ibd.py and creates a summary of discovered IBD tracts." print "By default, a position is IBD if the probability is > 0.5." print "Output is one line per tract, with id1, id2, start index, end index, genetic length." print "Start index and end index are integers from 1 to number of markers." print print "Usage: python get_ibd_tracts.py my.max_ibd my_genetic.markers (pipe output to desired file)" for i in range(50): print "*", print raise SystemExit thresh1 = 0.5 # a tract must exceed this probability somewhere in the region thresh2 = 0.5 # the tract ends when the IBD probability falls below this value nlines = -1 # nlines to process while debugging if positive number infile = open(sys.argv[1]) markerfile = open(sys.argv[2]) # read in genetic marker distances rs_dist = {} for line in markerfile.xreadlines(): bits = line.split() rs_dist[bits[0]] = float(bits[1]) # get id info ids = [] for i in range(2): bits = infile.readline().split() ids.append(bits[1:]) # read in ibd info ibds = []; snps = [] i = 0 for line in infile.xreadlines(): if nlines > 0 and i > nlines: break bits = line.split() snps.append(bits[0]) this_ibds = [] for j in range(1,len(bits)): this_ibds.append(float(bits[j])) ibds.append(this_ibds) i = i + 1 npairs = len(ibds[0]) nsnps = len(snps) # calculate lengths positive_lengths = [] positive_starts = [] lengths = [[0 for i in range(npairs)] for j in range(nsnps)] positive_dists = [] positive_ids1 = []; positive_ids2 = [] for i in range(npairs): for j in range(nsnps): if ibds[j][i]>thresh1 and lengths[j][i]==0: length1 = 0 start1 = j this_pos = rs_dist[snps[j]] pos1 = this_pos for k in range(j-1,-1,-1): if ibds[k][i]> thresh2: length1 = length1 + 1 pos1 = rs_dist[snps[k]] start1 = k else: break length2 = 0 pos2 = this_pos for k in range(j+1,nsnps): if ibds[k][i] > thresh2: length2 = length2 + 1 pos2 = rs_dist[snps[k]] else: break current_length = length1+length2+1 current_dist = pos2 - pos1 for k in range(j-length1,j+length2+1): lengths[k][i] = current_length positive_lengths.append(current_length) positive_dists.append(current_dist) positive_starts.append(start1) positive_ids1.append(ids[0][i]) positive_ids2.append(ids[1][i]) for id1,id2,start,length,dist in zip(positive_ids1,positive_ids2,positive_starts,positive_lengths,positive_dists): print id1,id2,start+1,start+length,dist