Main Page/Research/MSB/Scripts/preprocess myexp.R

From phurvitz
< Main Page‎ | Research‎ | MSB‎ | Scripts
Revision as of 00:00, 27 January 2009 by Phil Hurvitz (talk | contribs)
Jump to: navigation, search
# Preprocess the MyExperience files.
# If necessary, handle on a per-subject basis.
#
# This top-level section sets up the globals that process.myexp() reads:
#   myexp.files - names of the per-subject MyExperience CSV files
#   outdir      - directory that receives the combined output file
#   sms.split   - SMS records split into one data frame per subject ID

# output format should be:
# sid,myexp_phone_localtime,whatactivity,howlong,locationtype,currentactivity,
#    travelingpurpose,travelingstart,travelingend,chaining,local.gpstime,utc.gpstime,tsquality

# NOTE(review): this executes R code fetched over plain HTTP at run time;
# consider a local, version-controlled copy. fix.colnames()/unfix.colnames(),
# used further down, are not defined in this file and presumably come from here.
source("http://gis.washington.edu/phurvitz/R/functions.R")

# go into the combined MyExperience dir and list each myexper.csv file
# (the working directory is changed because the file names collected below
# are relative to it and are read later by process.myexp())
myexp.dir <- "~/public_html/msb/processed_data/downloaded_data/myexperience_merge"
setwd(myexp.dir)
# list files; the pattern is a regular expression, so escape the dot and anchor
# the end to avoid picking up names such as "x.myexper.csv.bak"
myexp.files <- list.files(".", pattern="myexper\\.csv$")

# output dir
outdir <- "~/public_html/msb/processed_data"

# get the SMS records
# (process.sms() is not defined in this file; presumably it is defined by the
# script sourced here -- TODO confirm)
source("~/public_html/msb/tools/preprocess_sms.R")
sms.master <- process.sms()
# one data frame of SMS records per subject ID, named by the sid value
sms.split <- split(sms.master, sms.master$sid)

# For each MyExperience timestamp, find the row index of the SMS record whose
# phone time is closest to it without being earlier. Returns an integer vector
# the same length as myexp.times, with NA where no such SMS record exists.
.match.later.sms <- function(myexp.times, sms.times) {
    vapply(seq_along(myexp.times), function(j) {
        # negative (or zero) difference == SMS sent at or after the MyExp record
        dt <- as.numeric(difftime(myexp.times[j], sms.times, units="secs"))
        candidates <- which(dt <= 0)
        if (length(candidates) == 0) {
            # no SMS at/after this record (or the timestamp is NA): no match
            return(NA_integer_)
        }
        # the largest non-positive difference has the smallest absolute deviation
        candidates[which.max(dt[candidates])]
    }, integer(1))
}

# Conflate every subject's MyExperience records with that subject's SMS
# records, write the combined table to <outdir>/myexp.csv and return it.
#
# Reads these globals:
#   myexp.files - MyExperience CSV file names, relative to the working
#                 directory; the text before the first "." is the subject ID
#   sms.split   - list of per-subject SMS data frames, named by subject ID
#   outdir      - directory that receives myexp.csv
# fix.colnames()/unfix.colnames() are defined outside this file; they are used
# here to obtain the in-R and on-disk column names respectively (presumably a
# "_" <-> "." conversion -- TODO confirm).
#
# Each MyExperience record is paired with the closest SMS record that is not
# earlier than it (by phone time) and then joined to the SMS table on
# sms.net.localtime. Records with no such SMS, files that do not exist, files
# with no rows and subjects with no SMS records are skipped.
#
# Returns the combined data frame, or NULL (with a warning, and without
# writing a file) when nothing could be matched.
process.myexp <- function() {
    cat("Processing MyExperience data....")
    # one merged data frame per subject; bound together once after the loop
    merged <- list()

    for (myexp.file in myexp.files) {
        if (!file.exists(myexp.file)) {
            next
        }
        # read in the myexp file
        myexp <- read.csv(myexp.file, as.is=TRUE)
        sid <- unlist(strsplit(myexp.file, "\\."))[1]
        cat(paste(sid, ",", sep=""))
        colnames(myexp) <- fix.colnames(myexp)
        if (nrow(myexp) == 0) {
            next
        }
        # fix timestamp field
        myexp$myexp.phone.localtime <- as.POSIXct(myexp$myexp.phone.localtime)

        # get the SMS for this subject; [[ ]] matches the name exactly, whereas
        # $ would partially match (e.g. sid "1" picking up subject "12")
        sms <- sms.split[[sid]]
        if (is.null(sms)) {
            next
        }

        # closest SMS record at/after each MyExperience record (NA if none)
        # NOTE: all matches are conflated regardless of the SMS tsquality
        # value; the original filter on tsquality == "A" was disabled.
        sms.rec <- .match.later.sms(myexp$myexp.phone.localtime, sms$sms.phone.localtime)
        matched <- !is.na(sms.rec)
        if (!any(matched)) {
            next
        }
        myexp <- myexp[matched, , drop=FALSE]

        # join key; the character round trip is kept from the original code
        myexp$sms.net.localtime <-
            as.POSIXct(as.character(sms$sms.net.localtime[sms.rec[matched]]))

        # drop the field that would otherwise be duplicated by the join
        myexp$sid <- NULL

        # join
        myexp.sms <- merge(myexp, sms, by="sms.net.localtime")

        merged[[length(merged) + 1]] <- myexp.sms
    }
    cat("\n")

    if (length(merged) == 0) {
        warning("no MyExperience records could be matched to SMS records; nothing written",
                call.=FALSE)
        return(NULL)
    }

    # combine into a master table for all subjects
    myexp.master <- do.call(rbind, merged)
    # harmless when the suffixed columns are absent (which() yields no index)
    colnames(myexp.master)[which(names(myexp.master)=="sms.phone.localtime.x")] <- "sms.phone.localtime"
    colnames(myexp.master)[which(names(myexp.master)=="sid.x")] <- "sid"

    # write out a file using the on-disk column names, then switch back
    outfile <- file.path(outdir, "myexp.csv")
    colnames(myexp.master) <- unfix.colnames(myexp.master)
    write.csv(myexp.master, file=outfile, row.names=FALSE)
    colnames(myexp.master) <- fix.colnames(myexp.master)

    # return the master table
    return(myexp.master)
}