Main Page/Research/MSB/Scripts/preprocess sms.R
From phurvitz
< Main Page | Research | MSB | Scripts
Revision as of 05:20, 26 January 2009 by Phil Hurvitz (talk | contribs)
# preprocess SMS messages to a standard format
# output data format will be
# sid kb_logged msb_secs gps_lock_last sms_phone_localtime sms_net_localtime sms_phone_localdate gps.localtime gps.utctime tsquality
# s09 63936 3974 -9 2007-12-07 13:40:01 2007-12-07 13:42:07 2007-12-07 2007-12-07 13:40:01 2007-12-07 21:40:01 A
# GPS quality: A=absolute, E1=estimate (good), E2=estimate (not too good), G=guess, B=bad
#
# Arguments:
#   masterdir      directory that holds the input files sms6.csv and sms8.csv;
#                  the combined output sms.csv is written to the same directory
#   functions.src  file name or URL passed to source(); it must define
#                  fix.colnames() and unfix.colnames(), which are called below
# Value:
#   a data frame with the processed SMS6 records followed by the processed
#   SMS8 records, including the added gps.utctime and timediff.* columns
# Side effects:
#   prints progress with cat(), sources functions.src, writes <masterdir>/sms.csv
process.sms <- function(masterdir = "/home/phurvitz/public_html/msb/processed_data",
                        functions.src = "http://gis.washington.edu/phurvitz/R/functions.R") {
    # an informative message
    cat("Processing SMS data.... ")
    # NOTE(review): the default sources R code over plain http at run time;
    # pass a vetted local copy through functions.src where possible
    source(functions.src)

    # most frequent value of x, as a number (NA when x has no non-NA values)
    modal.value <- function(x) {
        as.numeric(names(sort(table(x), decreasing = TRUE)[1]))
    }

    # convert the phone and network timestamp strings of an SMS table to POSIXct
    standardize.times <- function(sms) {
        phone.raw <- sms$sms.phone.localtime
        # the phone timestamp ends in a 2- or 4-digit year; choose the format
        # from the width of the text after the last "/"
        year.width <- nchar(sub(".*/", "", phone.raw))
        year2char <- !is.na(year.width) & year.width == 2
        # built as a plain vector so that a table with zero rows is handled too
        phone.time <- rep(as.POSIXct("2000-01-01"), nrow(sms))
        phone.time[year2char] <- as.POSIXct(strptime(phone.raw[year2char], "%I:%M:%S %p %m/%d/%y"))
        phone.time[!year2char] <- as.POSIXct(strptime(phone.raw[!year2char], "%I:%M:%S %p %m/%d/%Y"))
        sms$sms.phone.localtime <- phone.time
        sms$sms.net.localtime <- as.POSIXct(strptime(sms$sms.net.localtime, "%Y-%b-%d %H:%M:%S"))
        sms
    }

    #===================================SMS 6==========================
    # read in the sms master file (6 fields)
    sms.master.file.6 <- file.path(masterdir, "sms6.csv")
    sms <- read.csv(sms.master.file.6, as.is = TRUE)

    # fix the column names
    colnames(sms) <- fix.colnames(sms)

    # initialize some new columns (picked up by with() below when the file
    # itself has no column of that name)
    gps.week <- gps.ms <- gps.localtime <- gps.utctime <- tsquality <- NA

    # standardize column order
    sms <- with(sms, data.frame(sid, kb.logged, msb.secs, gps.lock.last, gps.week, gps.ms,
                                sms.phone.localtime, sms.net.localtime, gps.localtime,
                                gps.utctime, tsquality, stringsAsFactors = FALSE))

    # remove those with bad subject names
    sms <- sms[grep("^s", sms$sid), ]

    # standardize dates
    sms <- standardize.times(sms)

    # split into SIDs
    sms.sidlist <- split(sms, sms$sid)
    sms.master <- NULL

    # handle each SID; rbind() is kept inside the loop on purpose so that the
    # row names and column attributes of the result stay as they always were
    for (i in seq_along(sms.sidlist)) {
        # get the list from this SID
        sid <- names(sms.sidlist)[i]
        # an informative message
        cat(sid, ",", sep = "")
        sms.sid <- sms.sidlist[[i]]

        # time differences between phone and net time
        td.s <- as.numeric(with(sms.sid, difftime(sms.phone.localtime, sms.net.localtime, units = "secs")))

        # what time difference is the mode?
        mode.td.s <- modal.value(td.s)
        mode.td.h <- round(mode.td.s / 3600, 2)

        # put in classes based on the match. E1 is ~ 1 hr, E2 is 1-2 hr, G is anyone's guess
        # (ifelse rather than if/else so that an NA mode yields NA instead of an error)
        sms.sid$tsquality <- ifelse((abs(mode.td.h) > 1 & abs(mode.td.h) < 2), "E2",
                                    ifelse((abs(mode.td.s) > 0 & abs(mode.td.s) <= 3660), "E1", "G"))

        # set the timestamps; ifelse() drops the POSIXct class, so the result is
        # plain seconds that are added back onto the 1970 origin below
        gpsunixtime <- with(sms.sid, ifelse(tsquality == "G",
                                            sms.sid$sms.net.localtime,
                                            sms.sid$sms.phone.localtime - mode.td.s))
        # NOTE(review): "PST" may not be a time zone name R recognizes on every
        # platform - confirm the intended zone
        gps.localtime <- as.POSIXct(strptime("1jan1970", "%d%b%Y", tz = "PST") + gpsunixtime)
        attr(gps.localtime, "tzone") <- ""
        sms.sid$gps.localtime <- gps.localtime

        sms.master <- rbind(sms.master, sms.sid)
    }

    # no GPS time should be later than a phone network time, so if this is the case, replace with phone net time
    # (skipped when no SMS6 subject was processed and sms.master is still NULL)
    if (!is.null(sms.master)) {
        sms.master$gps.localtime <- with(sms.master, ifelse(gps.localtime > sms.net.localtime,
                                                            sms.net.localtime, gps.localtime))
        sms.master$gps.localtime <- as.POSIXct(sms.master$gps.localtime, origin = "1970-01-01", tz = "PST")
        attr(sms.master$gps.localtime, "tzone") <- ""
    }

    #===================================SMS 8==========================
    # now handle the sms8 records
    sms.master.file.8 <- file.path(masterdir, "sms8.csv")
    sms <- read.csv(sms.master.file.8, as.is = TRUE)
    colnames(sms) <- fix.colnames(sms)
    # reset the placeholders; gps.localtime was overwritten in the SMS6 loop
    gps.localtime <- gps.utctime <- tsquality <- NA
    sms <- with(sms, data.frame(sid, kb.logged, msb.secs, gps.lock.last, gps.week, gps.ms,
                                sms.phone.localtime, sms.net.localtime, gps.localtime,
                                gps.utctime, tsquality, stringsAsFactors = FALSE))

    # remove those with bad subject names
    sms <- sms[grep("^s", sms$sid), ]

    # standardize dates
    sms <- standardize.times(sms)
    sms <- sms[order(sms$sid), ]

    # leap-seconds offset for GPS
    gps.offset <- -14
    # start of the GPS epoch
    start.epoch <- strptime("1980-01-06", "%Y-%m-%d", "GMT")
    # GPS week and millisecond-of-week converted to a timestamp
    sms.gps.gmt <- start.epoch + (sms$gps.week * 7 * 24 * 60 * 60 + sms$gps.ms / 1000 + gps.offset)
    sms$gps.localtime <- sms.gps.gmt
    attr(sms$gps.localtime, "tzone") <- NULL

    # split into SIDs
    sms.sidlist <- split(sms, sms$sid)

    # subjects whose records get the "bad GPS week" treatment below
    badlist <- paste("s", c(11,13,15,16,17,18,19,20,21,22,23,24,25,27,28,29,30,31,32,33,34,35,37,
                            38,39,45,46,51,52,54,55,56,58,61,62,63,64,66,67,68,69,70), sep = "")

    # process each subject
    for (i in seq_along(sms.sidlist)) {
    #for (i in 6){
        # get the list from this SID
        sid <- names(sms.sidlist)[i]
        # an informative message
        cat(sid, ",", sep = "")
        sms.sid <- sms.sidlist[[i]]

        # time differences
        td.s <- as.numeric(with(sms.sid, difftime(sms.phone.localtime, gps.localtime, units = "secs")))

        # what time difference is the mode?
        mode.td.s <- modal.value(td.s)

        # handle ugly cases
        if (sid == "s14") {
            # no records, really.
            next
        }

        if (sid == "s26") {
            # the first half are mostly good
            sms.sid[1:15, "tsquality"] <- "A"
            # record 8 is strange
            sms.sid[8, "gps.localtime"] <- sms.sid[7, "gps.localtime"] + sms.sid[8, "msb.secs"] - sms.sid[7, "msb.secs"]
            sms.sid[8, "tsquality"] <- "E2"
            # record 16 is bad
            sms.sid[16, "tsquality"] <- "B"
            # but 17:22 are good
            sms.sid[17:22, "tsquality"] <- "A"
            # and the phone time goes awry at record 22+
            # so set the GPS time to the network localtime
            sms.sid[22:nrow(sms.sid), "gps.localtime"] <- sms.sid[22:nrow(sms.sid), "sms.net.localtime"]
            sms.sid[22:nrow(sms.sid), "tsquality"] <- "B"
        }

        if (sid %in% c("s46", "s61")) {
            # the majority have no GPS lock, so make a subset of time differences for locked measurements
            td.s <- td.s[abs(td.s) < 100]
            mode.td.s <- modal.value(td.s)
        }

        if (sid == "s50") {
            # none of the SMS have a fix
            sms.sid$gps.localtime <- sms.sid$sms.phone.localtime
            sms.sid$tsquality <- "G"
        }

        # handle good cases
        if (sid == "s12") {
            sms.sid$tsquality <- "A"
        }

        # handle bad cases
        if (sid %in% badlist) {
            # if the GPS week is bad, make the GPS time from the mode difference from phone time
            sms.sid$gps.localtime <- ifelse(sms.sid$gps.week == 1340,
                                            sms.sid$sms.phone.localtime - mode.td.s,
                                            sms.sid$gps.localtime)
            # ifelse() returned plain seconds; restore the POSIXct class
            sms.sid$gps.localtime <- as.POSIXct(sms.sid$gps.localtime, origin = "1970-01-01", tz = "GMT")
            attr(sms.sid$gps.localtime, "tzone") <- ""
            sms.sid$tsquality <- ifelse(sms.sid$gps.week == 1340, "E1", "A")
        }

        if (sid == "s61") {
            sms.sid[75, "gps.localtime"] <- sms.sid[75, "sms.net.localtime"]
        }

        if (sid == "s29") {
            # one record seems to have a wrong timestamp
            sms.sid[28, "gps.localtime"] <- sms.sid[28, "gps.localtime"] + 3600
        }

        sms.master <- rbind(sms.master, sms.sid)
    }

    # an informative message
    cat("\n")

    # make a field of UTC time
    sms.master$gps.utctime <- sms.master$gps.localtime
    attr(sms.master$gps.utctime, "tzone") <- "UTC"

    # calculate the difference in seconds between the phone and GPS time
    sms.master$timediff.sec <- with(sms.master, difftime(gps.localtime, sms.phone.localtime, units = "secs"))
    sms.master$timediff.hr <- with(sms.master, difftime(gps.localtime, sms.phone.localtime, units = "hours"))
    sms.master$timediff.day <- with(sms.master, difftime(gps.localtime, sms.phone.localtime, units = "days"))

    # write a CSV file; column names are switched with unfix.colnames() for
    # the file only and restored before the data frame is returned
    outfile <- file.path(masterdir, "sms.csv")
    colnames(sms.master) <- unfix.colnames(sms.master)
    write.csv(sms.master, file = outfile, row.names = FALSE, quote = FALSE)
    colnames(sms.master) <- fix.colnames(sms.master)

    sms.master
}

#sms.master <- process.sms()
#sms.m1 <- sms.master
#sms.m1$kb.logged <- NULL
#sids <- unique(sms.m1$sid)
#for (i in 1:length(sids)) {
#sid <- sids[i]
#cmd <- paste(sid, " <- sms.m1[sms.m1$sid==", "\"", sid, "\"", ",]", sep="")
#eval(parse(text=cmd))
#cmd <- paste("rownames(", sid, ") <- NULL", sep="")
#eval(parse(text=cmd))
#}
#
#ss <- split(sms.m1, sms.m1$sid)
#for (i in 1:length(ss)) {
#dat <- ss[[i]]
#rownames(dat) <- NULL
#print(dat)
#}