## I changed the densities to go from dmin to dmax instead of from 0 to dmax
## I changed the code from post <- pmax(p*fw/(p*fw + (1-p)*fb),0)
## to post <- ifelse(dd$partners==0,0, ifelse(dd$partners==2,1,pmax(p*fw/(p*fw + (1-p)*fb),0)
## I changed iter from T,F to 0,1,2. See documentation below.
## I added 'unknown' option to include unknown transmission data with training
## data when determining f0 and f1
##
## Rachel Debes
## 11/7/11
seqlevel <- function(dd,p0=0.5,eps=.0001,iter=2,unknown=T){
#######################################################
# This is an empirical Bayes-like algorithm to evaluate the probability
# of linkage of individual sequence pairs. It takes training data from known
# linked and unlinked sequences to develop distributions of linked distances
# and unlinked distances. It then uses this information in Bayes formula
# to estimate the posterior probability of linkage for samples of unknown
# linkage status. The process is iterative in that the prior probability
# of linkage is updated by summing over the posterior probabilities (from the
# unknown status cases) and then recomputing the posteriors. If you want a
# straight Bayes with fixed prior, set iter=F
#
# INPUT:
# dd is a dataframe with the following components
# distance - distance between the pair of sequences
# partners -
# = 0 means this is a pair of sequences from different individuals who are not partners;
# category 0 is treated as unlinked training data (prob. of linkage fixed at 0)
# = 1 means this is a pair of sequences from PIP partners; these are the ones we want to
# make a decision about; posterior probability estimates are developed for the sequences
# in category 1
# = 2 means this is a pair of sequences from the same index case
# = 3 means this is a pair of sequences from different individuals who are known to be linked
# categories 2 and 3 are treated as linked training data (prob. of linkage fixed at 1)
# Note: categories 2 and 3 are treated identically - as known linked sequences
#
#
# p0 is the prior probability of linkage (if iterating, just set to 0.5 to start)
#
# eps is the convergence criteria
#
# iter = 0, algorithm is Bayes and does not update the prior
# = 1, algorithm is an iterative Bayes. The posterior probabilities are
# iteratively updated using a fixed prior and then used in the next
# iteration to calculate the updated linked and unlinked densities.
# Thus the Bayes calculation involves a fixed prior and iteratively
# updated linked and unlinked densities.
# but the prior is fixed and is used in the traditional Bayes calculation
# = 2, algorithm is empirical Bayes and iteratively updates the prior
# probability of linkage based on proportion of category 1 partners that are
# classified as linked
# unknown -
# = F, unknown data is not included with the training data to determine f0
# and f1; in this case, the function will iterate only once and will be
# a true Bayes model
# = T, unknown data is included with the training data with weights p
#
#
# Note: if iter = 1 and unknown=F, then this is has the same result as iter = 0
#
# OUTPUT:
# A list with components
# post - posterior probabilities of linkage for partners == 1; only useful/meaningful for
# partners == 1
# p - sum of the posterior probability of linkage in the partners == 1 group after
# convergence; this is effectively the prior that was used in the last iteration
# fwithin - distribution object (from R function density) representing the linked distances
# fbetween - distribution object (from R function density) representing the unlinked distances
#
# Jim Hughes (jphughes@u.washington.edu)
# 6/5/09
########################################################
# Get initial values for p, fwithin and fbetween
innerfunction <- function(dd,p0,eps,iter=2,unknown=T){
dmax <- max(dd$distance)
dmin <- min(dd$distance)
p <- p0
# Estimate density for within person sequences
w <- ifelse(dd$partners>=2,1,0)
w <- w/sum(w)
fwithin <- density(dd$distance,from=dmin,to=dmax,weight=w)
fw <- approx(fwithin$x,fwithin$y,dd$distance,method="linear")$y
# Estimate density for between person sequences
w <- ifelse(dd$partners==0,1,0)
w <- w/sum(w)
fbetween <- density(dd$distance,from=dmin,to=dmax,weight=w)
fb <- approx(fbetween$x,fbetween$y,dd$distance,method="linear")$y
post <- ifelse(dd$partners==0,0, ifelse(dd$partners>=2,1,pmax(p*fw/(p*fw + (1-p)*fb),0)))
if (iter > 0){
repeat{
# E step
if(iter==2) p <- sum(post[dd$partners==1])/sum(dd$partners==1)
if(unknown){
wt <- ifelse(dd$partners==0,0,ifelse(dd$partners>=2,1,post))
# M step
w <- wt/sum(wt)
fwithin <- density(dd$distance,from=dmin,to=dmax,weight=w)
fw <- approx(fwithin$x,fwithin$y,dd$distance,method="linear")$y
w <- (1-wt)/sum(1-wt)
fbetween <- density(dd$distance,from=dmin,to=dmax,weight=w)
fb <- approx(fbetween$x,fbetween$y,dd$distance,method="linear")$y
}
# check for convergence
newpost <- ifelse(dd$partners==0,0,ifelse(dd$partners>=2,1,pmax(p*fw/(p*fw + (1-p)*fb),0)))
if (sum((post-newpost)[dd$partners==1]^2)/sum(dd$partners==1) < eps) break
post <- newpost
}
}
# post gives posterior probabilities of linkage
return(list(post=post[dd$partners==1],p=p,fwithin=fwithin,fbetween=fbetween))
}
results <- innerfunction(dd,p0,eps,iter,unknown)
return(results)
}