# get_hbd_tracts.py

# work with results of max_hbd.py, which has rs column then one hbd prob per pair
# output discovered hbd tracts


import sys

if len(sys.argv)!=3:
    for i in range(50): print "*",
    print
    print "This program takes results from max_hbd.py and creates a summary of discovered HBD tracts."
    print "By default, a position is HBD if the probability is > 0.5."
    print "Output is one line per tract, with id, start index, end index, genetic length."
    print "Start index and end index are integers from 1 to number of markers."
    print
    print "Usage: python get_hbd_tracts.py my.max_hbd my_genetic.markers (pipe output to desired file)"
    for i in range(50): print "*",
    print
    raise SystemExit

thresh1 = 0.5 # a tract must exceed this probability somewhere in the region
thresh2 = 0.5 # the tract ends when the IBD probability falls below this value

infile = open(sys.argv[1])
markerfile = open(sys.argv[2])


# read in genetic marker distances
rs_dist = {}
for line in markerfile.xreadlines():
    bits = line.split()
    rs_dist[bits[0]] = float(bits[1])

bits = infile.readline().split()
snps = bits[1:]
nsnps = len(snps)

# read in hbd info
for line in infile.xreadlines():
    bits = line.split()

    # calculate lengths
    lengths = [0]*nsnps 
    hbds = [float(x) for x in bits[1:]]
    for j in range(nsnps):
        if hbds[j]>thresh1 and lengths[j]==0:
            length1 = 0
            this_pos = rs_dist[snps[j]]
            pos1 = this_pos
            start1 = j
            for k in range(j-1,-1,-1):
                if hbds[k]> thresh2:
                    length1 = length1 + 1
                    pos1 = rs_dist[snps[k]]
                    start1 = k
                else:
                    break
            length2 = 0
            pos2 = this_pos
            for k in range(j+1,nsnps):
                if hbds[k] > thresh2:
                    length2 = length2 + 1
                    pos2 = rs_dist[snps[k]]
                else:
                    break
            current_length = length1+length2+1
            current_dist = pos2 - pos1
            for k in range(j-length1,j+length2+1):
                lengths[k] = current_length
            print bits[0], start1+1,start1+current_length, current_dist