# get_hbd_tracts.py # work with results of max_hbd.py, which has rs column then one hbd prob per pair # output discovered hbd tracts import sys if len(sys.argv)!=3: for i in range(50): print "*", print print "This program takes results from max_hbd.py and creates a summary of discovered HBD tracts." print "By default, a position is HBD if the probability is > 0.5." print "Output is one line per tract, with id, start index, end index, genetic length." print "Start index and end index are integers from 1 to number of markers." print print "Usage: python get_hbd_tracts.py my.max_hbd my_genetic.markers (pipe output to desired file)" for i in range(50): print "*", print raise SystemExit thresh1 = 0.5 # a tract must exceed this probability somewhere in the region thresh2 = 0.5 # the tract ends when the IBD probability falls below this value infile = open(sys.argv[1]) markerfile = open(sys.argv[2]) # read in genetic marker distances rs_dist = {} for line in markerfile.xreadlines(): bits = line.split() rs_dist[bits[0]] = float(bits[1]) bits = infile.readline().split() snps = bits[1:] nsnps = len(snps) # read in hbd info for line in infile.xreadlines(): bits = line.split() # calculate lengths lengths = [0]*nsnps hbds = [float(x) for x in bits[1:]] for j in range(nsnps): if hbds[j]>thresh1 and lengths[j]==0: length1 = 0 this_pos = rs_dist[snps[j]] pos1 = this_pos start1 = j for k in range(j-1,-1,-1): if hbds[k]> thresh2: length1 = length1 + 1 pos1 = rs_dist[snps[k]] start1 = k else: break length2 = 0 pos2 = this_pos for k in range(j+1,nsnps): if hbds[k] > thresh2: length2 = length2 + 1 pos2 = rs_dist[snps[k]] else: break current_length = length1+length2+1 current_dist = pos2 - pos1 for k in range(j-length1,j+length2+1): lengths[k] = current_length print bits[0], start1+1,start1+current_length, current_dist