# Main Page/Research/MSB/Scripts/preprocess myexp.R
# From phurvitz
# Preprocess the MyExperience files.
# If necessary, handle on a per-subject basis.
# Output format should be:
#   sid,myexp_phone_localtime,whatactivity,howlong,locationtype,currentactivity,
#   travelingpurpose,travelingstart,travelingend,chaining,local.gpstime,utc.gpstime,tsquality

# Shared helper functions. fix.colnames() and unfix.colnames(), used below, are
# not defined in this file and are presumably provided here -- TODO confirm.
source("http://gis.washington.edu/phurvitz/R/functions.R")

# go into the combined MyExperience dir and read each myexper.csv file
myexp.dir <- "~/public_html/msb/processed_data/downloaded_data/myexperience_merge"
setwd(myexp.dir)

# list files (names are relative to the working directory set above)
myexp.files <- list.files(".", pattern="myexper.csv")

# output dir
outdir <- "~/public_html/msb/processed_data"

# get the SMS records and split them into one table per subject ID
source("~/public_html/msb/tools/preprocess_sms.R")
sms.master <- process.sms()
sms.split <- split(sms.master, sms.master$sid)

# Find the SMS record that follows a MyExperience record most closely in time.
#
# Args:
#   myexp.time: a single MyExperience timestamp.
#   sms.times:  vector of SMS phone timestamps for the same subject.
#
# Returns:
#   The index into sms.times of the record whose time is at or after
#   myexp.time with the smallest gap (first one on ties), or NA_integer_ when
#   no such record exists (no SMS at/after myexp.time, or missing timestamps).
nearest.later.sms <- function(myexp.time, sms.times) {
  # negative or zero difference == SMS is at or after the MyExperience record
  dt <- as.numeric(difftime(myexp.time, sms.times, units = "secs"))
  candidates <- which(!is.na(dt) & dt <= 0)
  if (length(candidates) == 0) {
    return(NA_integer_)
  }
  # the largest (closest to zero) non-positive difference is the nearest SMS
  candidates[which.max(dt[candidates])]
}

# Conflate every subject's MyExperience records with that subject's SMS
# records and write the combined table to <outdir>/myexp.csv.
#
# Reads the globals myexp.files, sms.split and outdir defined above. For each
# file the subject ID is the part of the file name before the first ".". Each
# MyExperience record is matched to the nearest SMS record at or after it (by
# phone local time) and then joined to the SMS table on sms.net.localtime.
# Subjects without SMS records are skipped; MyExperience records with no SMS
# at or after them are dropped rather than joined to an unrelated record.
#
# Returns:
#   The combined data frame for all subjects (with fix.colnames() column
#   names), or NULL (with a warning) when no records could be combined.
process.myexp <- function() {
  cat("Processing MyExperience data....")

  # one joined table per subject; bound together once after the loop
  myexp.parts <- vector("list", length(myexp.files))

  for (i in seq_along(myexp.files)) {
    # read in the myexp file
    myexp.file <- myexp.files[i]
    if (!file.exists(myexp.file)) {
      next
    }
    myexp <- read.csv(myexp.file, as.is = TRUE)

    sid <- unlist(strsplit(myexp.file, "\\."))[1]
    cat(paste(sid, ",", sep = ""))
    colnames(myexp) <- fix.colnames(myexp)

    # fix timestamp field
    myexp$myexp.phone.localtime <- as.POSIXct(myexp$myexp.phone.localtime)

    # get the SMS for this subject; [[ ]] matches the name exactly, whereas
    # `$` would partially match (e.g. "s1" picking up "s10")
    sms <- sms.split[[sid]]
    if (is.null(sms)) {
      next
    }

    # for every MyExperience record, the row of the matching SMS record
    match.idx <- vapply(
      seq_len(nrow(myexp)),
      function(j) {
        nearest.later.sms(myexp$myexp.phone.localtime[j], sms$sms.phone.localtime)
      },
      integer(1)
    )

    # NOTE: timestamp quality (sms$tsquality) is deliberately not used to
    # filter matches; the original quality check was disabled.

    # Copy the join key straight from the SMS table so it is exactly the value
    # stored there (no character round trip that could alter the timestamp).
    myexp$sms.net.localtime <- sms$sms.net.localtime[match.idx]

    # drop fields that would otherwise collide with the SMS columns in the join
    myexp$sid <- NULL
    myexp$sms.phone.localtime <- NULL

    # keep only the records that found an SMS match
    myexp <- myexp[!is.na(match.idx), , drop = FALSE]

    # join
    myexp.parts[[i]] <- merge(myexp, sms, by = "sms.net.localtime")
  }
  cat("\n")

  # combine into a master table for all subjects
  myexp.master <- do.call(rbind, Filter(Negate(is.null), myexp.parts))
  if (is.null(myexp.master)) {
    warning("No MyExperience records could be combined; nothing written.",
            call. = FALSE)
    return(NULL)
  }

  # normalise column names in case the join produced suffixed duplicates
  names(myexp.master)[names(myexp.master) == "sms.phone.localtime.x"] <- "sms.phone.localtime"
  names(myexp.master)[names(myexp.master) == "sid.x"] <- "sid"

  # write out a file using the "unfixed" column names, then restore the
  # fixed names for the returned table
  outfile <- file.path(outdir, "myexp.csv")
  colnames(myexp.master) <- unfix.colnames(myexp.master)
  write.csv(myexp.master, file = outfile, row.names = FALSE)
  colnames(myexp.master) <- fix.colnames(myexp.master)

  # return the master table
  myexp.master
}