# Main Page/Research/MSB/Scripts/preprocess sms.R
# From phurvitz
# < Main Page | Research | MSB | Scripts
# Revision as of 01:58, 17 January 2009 by Phil Hurvitz (talk | contribs)
# preprocess SMS messages to a standard format
# output data format will be
# sid kb_logged msb_secs gps_lock_last sms_phone_localtime sms_net_localtime sms_phone_localdate local.gpstime utc.gpstime tsquality
# s09 63936 3974 -9 2007-12-07 13:40:01 2007-12-07 13:42:07 2007-12-07 2007-12-07 13:40:01 2007-12-07 21:40:01 A
# GPS quality: A=absolute, E1=estimate (good), E2=estimate (not too good), G=guess
#
# NOTE(review): the column list above comes from the original header; the code
# below builds sid, kb.logged, msb.secs, gps.lock.last, gps.week, gps.ms,
# sms.phone.localtime, sms.net.localtime, local.gpstime, utc.gpstime and
# tsquality (no sms_phone_localdate) -- confirm which one is authoritative.
#
# Arguments:
#   masterdir  directory holding the input files "sms6.csv" and "sms8.csv";
#              the combined output "sms.csv" is written to the same directory.
# Returns:
#   a data frame with one row per retained SMS record (the SMS 6 records
#   followed by the SMS 8 records), with the column names passed through
#   unfix.colnames() and then fix.colnames() again.
# Side effects:
#   prints progress to the console, sources helper functions from a URL and
#   writes <masterdir>/sms.csv.
process.sms <- function(
    masterdir = "/home/phurvitz/public_html/msb/processed_data") {
  # an informative message
  cat("Processing SMS data.... ")

  # provides fix.colnames() and unfix.colnames()
  # NOTE(review): this runs remote code fetched over plain HTTP at every call;
  # consider vendoring a local, reviewed copy of functions.R.
  source("http://gis.washington.edu/phurvitz/R/functions.R")

  #----------------------------- local helpers -----------------------------

  # Most frequent value of a numeric vector (NAs ignored); NA when there is
  # nothing to tabulate.
  most.common <- function(x) {
    counts <- table(x)
    if (length(counts) == 0) {
      return(NA_real_)
    }
    as.numeric(names(sort(counts, decreasing = TRUE)[1]))
  }

  # Seconds since 1970-01-01 00:00:00 UTC -> POSIXct displayed in the session
  # time zone. The epoch is stated explicitly so the result depends neither
  # on the locale nor on how the platform resolves the zone name "PST".
  from.unix <- function(secs) {
    stamp <- as.POSIXct(as.numeric(secs), origin = "1970-01-01", tz = "UTC")
    attr(stamp, "tzone") <- ""
    stamp
  }

  # Parse phone timestamps such as "01:40:01 PM 12/07/07" or
  # "01:40:01 PM 12/07/2007"; the year is whatever follows the last "/".
  parse.phone.time <- function(x) {
    year.len <- nchar(sub(".*/", "", x))
    two.digit <- !is.na(year.len) & year.len == 2
    stamp <- rep(as.POSIXct(NA), length(x))
    stamp[two.digit] <- as.POSIXct(
      strptime(x[two.digit], "%I:%M:%S %p %m/%d/%y"))
    stamp[!two.digit] <- as.POSIXct(
      strptime(x[!two.digit], "%I:%M:%S %p %m/%d/%Y"))
    stamp
  }

  # Timestamp quality from the modal phone-vs-network difference (seconds):
  # E2 when the rounded difference is between 1 and 2 hours, E1 when it is
  # non-zero and at most 3660 s, otherwise G (anyone's guess).
  classify.quality <- function(mode.s) {
    if (is.na(mode.s)) {
      return(NA_character_)
    }
    mode.h <- round(mode.s / 3600, 2)
    if (abs(mode.h) > 1 && abs(mode.h) < 2) {
      "E2"
    } else if (abs(mode.s) > 0 && abs(mode.s) <= 3660) {
      "E1"
    } else {
      "G"
    }
  }

  # Read one master file, put the columns in the standard order, drop records
  # with bad subject names and convert both timestamp columns to POSIXct.
  read.sms <- function(csv.name) {
    raw <- read.csv(file.path(masterdir, csv.name), as.is = TRUE)
    # fix the column names
    colnames(raw) <- fix.colnames(raw)
    # placeholders: with() only falls back to these when the file itself has
    # no column of that name (e.g. gps.week / gps.ms in the 6-field file)
    gps.week <- gps.ms <- local.gpstime <- utc.gpstime <- tsquality <- NA
    std <- with(raw, data.frame(
      sid, kb.logged, msb.secs, gps.lock.last, gps.week, gps.ms,
      sms.phone.localtime, sms.net.localtime,
      local.gpstime, utc.gpstime, tsquality,
      stringsAsFactors = FALSE))
    # remove those with bad subject names
    std <- std[grepl("^s", std$sid), ]
    # standardize dates
    std$sms.phone.localtime <- parse.phone.time(std$sms.phone.localtime)
    std$sms.net.localtime <- as.POSIXct(
      strptime(std$sms.net.localtime, "%Y-%b-%d %H:%M:%S"))
    std
  }

  #===================================SMS 6==========================
  # the 6-field file carries no GPS time, so GPS time is estimated from the
  # phone clock corrected by its modal offset from the network clock
  sms <- read.sms("sms6.csv")

  # split into SIDs
  sms.sidlist <- split(sms, sms$sid)
  sms6.parts <- vector("list", length(sms.sidlist))

  # handle each SID
  for (i in seq_along(sms.sidlist)) {
    sid <- names(sms.sidlist)[i]
    # an informative message
    cat(sid, ",", sep = "")
    sms.sid <- sms.sidlist[[i]]

    # time differences (phone minus network), and their mode
    td.s <- as.numeric(difftime(sms.sid$sms.phone.localtime,
                                sms.sid$sms.net.localtime, units = "secs"))
    mode.td.s <- most.common(td.s)

    sms.sid$tsquality <- classify.quality(mode.td.s)

    # set the timestamps: network time for guesses, otherwise the phone time
    # shifted by the modal difference
    gps.secs <- ifelse(sms.sid$tsquality == "G",
                       as.numeric(sms.sid$sms.net.localtime),
                       as.numeric(sms.sid$sms.phone.localtime) - mode.td.s)
    sms.sid$local.gpstime <- from.unix(gps.secs)

    sms6.parts[[i]] <- sms.sid
  }
  sms6 <- do.call(rbind, sms6.parts)

  # no GPS time should be later than a phone network time, so if this is the
  # case, replace with phone net time
  if (!is.null(sms6)) {
    too.late <- sms6$local.gpstime > sms6$sms.net.localtime
    sms6$local.gpstime <- from.unix(
      ifelse(too.late,
             as.numeric(sms6$sms.net.localtime),
             as.numeric(sms6$local.gpstime)))
  }

  #===================================SMS 8==========================
  # now handle the sms8 records, which carry GPS week / millisecond fields
  sms <- read.sms("sms8.csv")
  sms <- sms[order(sms$sid), ]

  # leap-seconds offset for GPS
  # NOTE(review): -14 s presumably matches the 2006-2008 GPS-UTC offset;
  # confirm before reusing on data from another period.
  gps.offset <- -14
  # start of the GPS epoch
  start.epoch <- as.POSIXct("1980-01-06", tz = "GMT")
  sms.gps.gmt <- start.epoch +
    (sms$gps.week * 7 * 24 * 60 * 60 + sms$gps.ms / 1000 + gps.offset)
  sms$local.gpstime <- sms.gps.gmt
  attr(sms$local.gpstime, "tzone") <- NULL

  # subjects whose records include the bad GPS week
  badlist <- paste("s",
    c(11, 13, 15, 16, 17, 18, 19, 20, 22, 23, 24, 27, 28, 29, 30, 31, 32, 33,
      34, 35, 37, 38, 39, 45, 46, 51, 52, 54, 55, 56, 58, 61, 62, 63, 64, 66,
      67, 68, 69, 70),
    sep = "")
  # GPS week value the original code treats as "bad"
  bad.gps.week <- 1340

  # split into SIDs
  sms.sidlist <- split(sms, sms$sid)
  sms8.parts <- vector("list", length(sms.sidlist))

  # process each subject
  for (i in seq_along(sms.sidlist)) {
    sid <- names(sms.sidlist)[i]
    # an informative message
    cat(sid, ",", sep = "")

    # handle ugly cases
    if (sid == "s14") {
      # no records, really.
      next
    }

    sms.sid <- sms.sidlist[[i]]

    # time differences (phone minus GPS), and their mode
    td.s <- as.numeric(difftime(sms.sid$sms.phone.localtime,
                                sms.sid$local.gpstime, units = "secs"))
    mode.td.s <- most.common(td.s)

    if (sid == "s29") {
      # one record seems to have a wrong timestamp
      sms.sid[28, "local.gpstime"] <- sms.sid[28, "local.gpstime"] + 3600
    }
    if (sid == "s61") {
      sms.sid[75, "local.gpstime"] <- sms.sid[75, "sms.net.localtime"]
    }
    if (sid %in% c("s46", "s61")) {
      # the majority have no GPS lock, so make a subset of time differences
      # for locked measurements
      td.s <- td.s[abs(td.s) < 100]
      mode.td.s <- most.common(td.s)
    }
    if (sid == "s50") {
      # none of the SMS have a fix
      sms.sid$local.gpstime <- sms.sid$sms.phone.localtime
      sms.sid$tsquality <- "G"
    }

    # handle good cases
    if (sid == "s12") {
      sms.sid$tsquality <- "A"
    }

    # handle bad cases
    if (sid %in% badlist) {
      # if the GPS week is bad, make the GPS time from the mode difference
      # from phone time
      week.is.bad <- sms.sid$gps.week == bad.gps.week
      sms.sid$local.gpstime <- from.unix(
        ifelse(week.is.bad,
               as.numeric(sms.sid$sms.phone.localtime) - mode.td.s,
               as.numeric(sms.sid$local.gpstime)))
      sms.sid$tsquality <- ifelse(week.is.bad, "E1", "A")
    }

    sms8.parts[[i]] <- sms.sid
  }

  # an informative message
  cat("\n")

  # bind everything once (SMS 6 first so its column types lead); skipped
  # subjects left NULL slots, which are dropped here
  sms.master <- do.call(
    rbind, Filter(Negate(is.null), c(list(sms6), sms8.parts)))
  if (is.null(sms.master)) {
    stop("no valid SMS records found in ", masterdir, call. = FALSE)
  }

  # make a field of UTC time
  sms.master$utc.gpstime <- sms.master$local.gpstime
  attr(sms.master$utc.gpstime, "tzone") <- "UTC"

  # write a CSV file
  outfile <- file.path(masterdir, "sms.csv")
  colnames(sms.master) <- unfix.colnames(sms.master)
  write.csv(sms.master, file = outfile, row.names = FALSE, quote = FALSE)
  colnames(sms.master) <- fix.colnames(sms.master)

  sms.master
}

sms.master <- process.sms()

# leftover interactive inspection of a single subject: `sid` is not defined by
# this script, so only run it when the session already has one
if (exists("sid")) {
  sms.master[sms.master$sid == sid, ]
}