Difference between revisions of "Main Page/Research/MSB/Scripts/preprocess sms.R"
From phurvitz
Phil Hurvitz (talk | contribs) |
Phil Hurvitz (talk | contribs) |
||
Line 3: | Line 3: | ||
# output data format will be | # output data format will be | ||
− | # sid kb_logged msb_secs gps_lock_last sms_phone_localtime sms_net_localtime sms_phone_localdate | + | # sid kb_logged msb_secs gps_lock_last sms_phone_localtime sms_net_localtime sms_phone_localdate gps.localtime gps.utctime tsquality |
# s09 63936 3974 -9 2007-12-07 13:40:01 2007-12-07 13:42:07 2007-12-07 2007-12-07 13:40:01 2007-12-07 21:40:01 A | # s09 63936 3974 -9 2007-12-07 13:40:01 2007-12-07 13:42:07 2007-12-07 2007-12-07 13:40:01 2007-12-07 21:40:01 A | ||
− | # GPS quality: A=absolute, E1=estimate (good), E2=estimate (not too good), G=guess | + | # GPS quality: A=absolute, E1=estimate (good), E2=estimate (not too good), G=guess, B=badG |
process.sms <- function() { | process.sms <- function() { | ||
Line 22: | Line 22: | ||
colnames(sms) <- fix.colnames(sms) | colnames(sms) <- fix.colnames(sms) | ||
# initialize some new columns | # initialize some new columns | ||
− | gps.week <- gps.ms <- | + | gps.week <- gps.ms <- gps.localtime <- gps.utctime <- tsquality <- NA |
# standardize column order | # standardize column order | ||
sms <- with(sms, | sms <- with(sms, | ||
data.frame(sid, kb.logged, msb.secs, gps.lock.last, gps.week, gps.ms, sms.phone.localtime, sms.net.localtime, | data.frame(sid, kb.logged, msb.secs, gps.lock.last, gps.week, gps.ms, sms.phone.localtime, sms.net.localtime, | ||
− | + | gps.localtime, gps.utctime, tsquality, stringsAsFactors=F)) | |
# remove those with bad subject names | # remove those with bad subject names | ||
Line 48: | Line 48: | ||
# get the list from this SID | # get the list from this SID | ||
sid <- names(sms.sidlist)[i] | sid <- names(sms.sidlist)[i] | ||
− | # an informative message | + | # an informative message |
cat(paste(sid, ",", sep="")) | cat(paste(sid, ",", sep="")) | ||
sms.sid <- sms.sidlist[[i]] | sms.sid <- sms.sidlist[[i]] | ||
− | # time differences | + | # time differences between phone and net time |
td.s <- as.numeric(with(sms.sid, difftime(sms.phone.localtime,sms.net.localtime, units="s"))) | td.s <- as.numeric(with(sms.sid, difftime(sms.phone.localtime,sms.net.localtime, units="s"))) | ||
# what time difference is the mode? | # what time difference is the mode? | ||
Line 62: | Line 62: | ||
# set the timestamps | # set the timestamps | ||
gpsunixtime <- with(sms.sid, ifelse(tsquality=="G", sms.sid$sms.net.localtime, sms.sid$sms.phone.localtime - mode.td.s)) | gpsunixtime <- with(sms.sid, ifelse(tsquality=="G", sms.sid$sms.net.localtime, sms.sid$sms.phone.localtime - mode.td.s)) | ||
− | + | gps.localtime <- as.POSIXct(strptime("1jan1970", "%d%b%Y", tz="PST") + gpsunixtime) | |
− | attr( | + | attr(gps.localtime, "tzone") <- "" |
− | sms.sid$ | + | sms.sid$gps.localtime <- gps.localtime |
sms.master <- rbind(sms.master, sms.sid) | sms.master <- rbind(sms.master, sms.sid) | ||
} | } | ||
# no GPS time should be later than a phone network time, so if this is the case, replace with phone net time | # no GPS time should be later than a phone network time, so if this is the case, replace with phone net time | ||
− | sms.master$ | + | sms.master$gps.localtime <- with(sms.master, ifelse(gps.localtime > sms.net.localtime, sms.net.localtime, gps.localtime)) |
− | sms.master$ | + | sms.master$gps.localtime <- as.POSIXct(sms.master$gps.localtime, origin="1970-01-01", tz="PST") |
− | attr(sms.master$ | + | attr(sms.master$gps.localtime, "tzone") <- "" |
#===================================SMS 8========================== | #===================================SMS 8========================== | ||
Line 77: | Line 77: | ||
sms <- read.csv(sms.master.file.8, as.is=T) | sms <- read.csv(sms.master.file.8, as.is=T) | ||
colnames(sms) <- fix.colnames(sms) | colnames(sms) <- fix.colnames(sms) | ||
− | + | gps.localtime <- gps.utctime <- tsquality <- NA | |
sms <- with(sms, | sms <- with(sms, | ||
data.frame(sid, kb.logged, msb.secs, gps.lock.last, gps.week, gps.ms, sms.phone.localtime, sms.net.localtime, | data.frame(sid, kb.logged, msb.secs, gps.lock.last, gps.week, gps.ms, sms.phone.localtime, sms.net.localtime, | ||
− | + | gps.localtime, gps.utctime, tsquality, stringsAsFactors=F)) | |
# remove those with bad subject names | # remove those with bad subject names | ||
Line 101: | Line 101: | ||
start.epoch <- strptime("1980-01-06", "%Y-%m-%d", "GMT") | start.epoch <- strptime("1980-01-06", "%Y-%m-%d", "GMT") | ||
sms.gps.gmt <- start.epoch + (sms$gps.week * 7 * 24 * 60 * 60 + sms$gps.ms / 1000 + gps.offset) | sms.gps.gmt <- start.epoch + (sms$gps.week * 7 * 24 * 60 * 60 + sms$gps.ms / 1000 + gps.offset) | ||
− | sms$ | + | sms$gps.localtime <- sms.gps.gmt |
− | attr(sms$ | + | attr(sms$gps.localtime, "tzone") <- NULL |
# split into SIDs | # split into SIDs | ||
Line 116: | Line 116: | ||
sms.sid <- sms.sidlist[[i]] | sms.sid <- sms.sidlist[[i]] | ||
# time differences | # time differences | ||
− | td.s <- as.numeric(with(sms.sid, difftime(sms.phone.localtime, | + | td.s <- as.numeric(with(sms.sid, difftime(sms.phone.localtime,gps.localtime, units="s"))) |
# what time difference is the mode? | # what time difference is the mode? | ||
mode.td.s <- as.numeric(names(sort(table(td.s), T)[1])) | mode.td.s <- as.numeric(names(sort(table(td.s), T)[1])) | ||
Line 126: | Line 126: | ||
next() | next() | ||
} | } | ||
− | if (sid==" | + | if (sid=="s26") { |
− | # | + | # the first half are mostly good |
− | sms.sid[ | + | sms.sid[1:15,"tsquality"] <- "A" |
− | + | # record 8 is strange | |
− | + | sms.sid[8,"gps.localtime"] <- sms.sid[7,"gps.localtime"] + sms.sid[8,"msb.secs"] - sms.sid[7,"msb.secs"] | |
− | sms.sid[ | + | sms.sid[8,"tsquality"] <- "E2" |
+ | # record 16 is bad | ||
+ | sms.sid[16,"tsquality"] <- "B" | ||
+ | # but 17:22 are good | ||
+ | sms.sid[17:22,"tsquality"] <- "A" | ||
+ | # and the phone time goes awry at record 22+ | ||
+ | # so set the GPS time to the network localtime | ||
+ | sms.sid[22:nrow(sms.sid),"gps.localtime"] <- sms.sid[22:nrow(sms.sid),"sms.net.localtime"] | ||
+ | sms.sid[22:nrow(sms.sid),"tsquality"] <- "B" | ||
} | } | ||
if (sid=="s46" | sid=="s61") { | if (sid=="s46" | sid=="s61") { | ||
Line 140: | Line 148: | ||
if (sid=="s50") { | if (sid=="s50") { | ||
# none of the SMS have a fix | # none of the SMS have a fix | ||
− | sms.sid$ | + | sms.sid$gps.localtime <- sms.sid$sms.phone.localtime |
sms.sid$tsquality <- "G" | sms.sid$tsquality <- "G" | ||
} | } | ||
Line 148: | Line 156: | ||
} | } | ||
# handle bad cases | # handle bad cases | ||
− | badlist <- paste("s", c(11,13,15,16,17,18,19,20,22,23,24,27,28,29,30,31,32,33,34,35,37, | + | badlist <- paste("s", c(11,13,15,16,17,18,19,20,21,22,23,24,25,27,28,29,30,31,32,33,34,35,37, |
38,39,45,46,51,52,54,55,56,58,61,62,63,64,66,67,68,69,70), sep="") | 38,39,45,46,51,52,54,55,56,58,61,62,63,64,66,67,68,69,70), sep="") | ||
if (!is.na(match(sid, badlist ))) { | if (!is.na(match(sid, badlist ))) { | ||
# if the GPS week is bad, make the GPS time from the mode difference from phone time | # if the GPS week is bad, make the GPS time from the mode difference from phone time | ||
− | sms.sid$ | + | sms.sid$gps.localtime <- ifelse (sms.sid$gps.week==1340, sms.sid$sms.phone.localtime - mode.td.s, sms.sid$gps.localtime) |
− | sms.sid$ | + | sms.sid$gps.localtime <- as.POSIXct(sms.sid$gps.localtime, origin="1970-01-01", tz="GMT") |
− | attr(sms.sid$ | + | attr(sms.sid$gps.localtime, "tzone") <- "" |
sms.sid$tsquality <- ifelse(sms.sid$gps.week==1340, "E1", "A") | sms.sid$tsquality <- ifelse(sms.sid$gps.week==1340, "E1", "A") | ||
+ | } | ||
+ | if (sid=="s61") { | ||
+ | sms.sid[75,"gps.localtime"] <- sms.sid[75,"sms.net.localtime"] | ||
+ | } | ||
+ | if (sid=="s29") { | ||
+ | # one record seems to have a wrong timestamp | ||
+ | sms.sid[28,"gps.localtime"] <- sms.sid[28,"gps.localtime"] + 3600 | ||
} | } | ||
sms.master <- rbind(sms.master, sms.sid) | sms.master <- rbind(sms.master, sms.sid) | ||
Line 162: | Line 177: | ||
cat("\n") | cat("\n") | ||
# make a field of UTC time | # make a field of UTC time | ||
− | sms.master$ | + | sms.master$gps.utctime <- sms.master$gps.localtime |
− | attr(sms.master$ | + | attr(sms.master$gps.utctime, "tzone") <- "UTC" |
+ | # calculate the difference in seconds between the phone and GPS time | ||
+ | sms.master$timediff.sec <- with(sms.master, difftime(gps.localtime, sms.phone.localtime, units="s")) | ||
+ | sms.master$timediff.hr <- with(sms.master, difftime(gps.localtime, sms.phone.localtime, units="h")) | ||
+ | sms.master$timediff.day <- with(sms.master, difftime(gps.localtime, sms.phone.localtime, units="d")) | ||
# write a CSV file | # write a CSV file | ||
outfile <- paste(masterdir, "sms.csv", sep="/") | outfile <- paste(masterdir, "sms.csv", sep="/") | ||
Line 172: | Line 191: | ||
} | } | ||
− | + | #sms.master <- process.sms() | |
− | sms.master <- process.sms() | + | #sms.m1 <- sms.master |
− | sms.master[sms. | + | #sms.m1$kb.logged <- NULL |
+ | #sids <- unique(sms.m1$sid) | ||
+ | #for (i in 1:length(sids)) { | ||
+ | #sid <- sids[i] | ||
+ | #cmd <- paste(sid, " <- sms.m1[sms.m1$sid==", "\"", sid, "\"", ",]", sep="") | ||
+ | #eval(parse(text=cmd)) | ||
+ | #cmd <- paste("rownames(", sid, ") <- NULL", sep="") | ||
+ | #eval(parse(text=cmd)) | ||
+ | #} | ||
+ | # | ||
+ | #ss <- split(sms.m1, sms.m1$sid) | ||
+ | #for (i in 1:length(ss)) { | ||
+ | #dat <- ss[[i]] | ||
+ | #rownames(dat) <- NULL | ||
+ | #print(dat) | ||
+ | #} | ||
</pre> | </pre> |
Revision as of 05:20, 26 January 2009
# preprocess SMS messages to a standard format
# output data format will be
# sid kb_logged msb_secs gps_lock_last sms_phone_localtime sms_net_localtime sms_phone_localdate gps.localtime gps.utctime tsquality
# s09 63936 3974 -9 2007-12-07 13:40:01 2007-12-07 13:42:07 2007-12-07 2007-12-07 13:40:01 2007-12-07 21:40:01 A
# GPS quality: A=absolute, E1=estimate (good), E2=estimate (not too good), G=guess, B=bad

# Build the master SMS table from the 6-field (sms6.csv) and 8-field
# (sms8.csv) exports, estimate a GPS timestamp and a timestamp-quality code
# for every record, write the result to <masterdir>/sms.csv and return it.
#
# Args:
#   masterdir: directory holding sms6.csv and sms8.csv; sms.csv is written
#              there as well. Defaults to the original hard-coded location.
#
# Returns:
#   A data frame with the standardized columns plus gps.utctime and the
#   timediff.sec / timediff.hr / timediff.day columns.
#
# Side effects: sources a remote helper script, prints progress with cat(),
# and writes <masterdir>/sms.csv.
process.sms <- function(masterdir = "/home/phurvitz/public_html/msb/processed_data") {
  # an informative message
  cat("Processing SMS data.... ")

  # NOTE(review): this runs code fetched over plain HTTP on every call;
  # fix.colnames() and unfix.colnames() used below are expected to come from
  # it. Consider vendoring the file or fetching it over HTTPS.
  source("http://gis.washington.edu/phurvitz/R/functions.R")

  # Most frequent value of x, as a number (NA when x has no non-NA values).
  modal.value <- function(x) {
    as.numeric(names(sort(table(x), decreasing = TRUE)[1]))
  }

  # Read one master CSV from masterdir and bring it to the standard layout:
  # fixed column order, only subjects whose id starts with "s", and both
  # timestamp columns parsed to POSIXct.
  read.sms <- function(csv.name) {
    sms <- read.csv(file.path(masterdir, csv.name), as.is = TRUE)
    # fix the column names
    colnames(sms) <- fix.colnames(sms)
    # Columns the file does not provide are picked up from these NA
    # placeholders; with() looks in the data first, then in this frame.
    gps.week <- gps.ms <- gps.localtime <- gps.utctime <- tsquality <- NA
    # standardize column order
    sms <- with(
      sms,
      data.frame(sid, kb.logged, msb.secs, gps.lock.last, gps.week, gps.ms,
                 sms.phone.localtime, sms.net.localtime,
                 gps.localtime, gps.utctime, tsquality,
                 stringsAsFactors = FALSE)
    )
    # remove those with bad subject names
    sms <- sms[grep("^s", sms$sid), ]

    # standardize dates: the phone timestamp ends in either a 2- or a
    # 4-digit year, so each group is parsed with its own format
    phone.raw <- sms$sms.phone.localtime
    year2char <- nchar(sub(".*/", "", phone.raw)) == 2
    phone.time <- rep(as.POSIXct("2000-01-01"), nrow(sms))
    phone.time[year2char] <- as.POSIXct(
      strptime(phone.raw[year2char], "%I:%M:%S %p %m/%d/%y")
    )
    phone.time[!year2char] <- as.POSIXct(
      strptime(phone.raw[!year2char], "%I:%M:%S %p %m/%d/%Y")
    )
    sms$sms.phone.localtime <- phone.time
    sms$sms.net.localtime <- as.POSIXct(
      strptime(sms$sms.net.localtime, "%Y-%b-%d %H:%M:%S")
    )
    sms
  }

  #===================================SMS 6==========================
  # read in the sms master file (6 fields)
  sms <- read.sms("sms6.csv")

  # split into SIDs
  sms.sidlist <- split(sms, sms$sid)
  sms.master <- NULL

  # handle each SID (seq_along so that an empty list is simply skipped)
  for (i in seq_along(sms.sidlist)) {
    # get the list from this SID
    sid <- names(sms.sidlist)[i]
    # an informative message
    cat(paste0(sid, ","))
    sms.sid <- sms.sidlist[[i]]

    # time differences between phone and net time
    td.s <- as.numeric(with(
      sms.sid,
      difftime(sms.phone.localtime, sms.net.localtime, units = "secs")
    ))
    # what time difference is the mode?
    mode.td.s <- modal.value(td.s)
    mode.td.h <- round(mode.td.s / 3600, 2)

    # put in classes based on the match. E1 is ~ 1 hr, E2 is 1-2 hr,
    # G is anyone's guess. ifelse() is kept (rather than if/else) so that an
    # NA mode yields an NA quality code instead of an error.
    sms.sid$tsquality <- ifelse(
      abs(mode.td.h) > 1 & abs(mode.td.h) < 2,
      "E2",
      ifelse(abs(mode.td.s) > 0 & abs(mode.td.s) <= 3660, "E1", "G")
    )

    # set the timestamps; ifelse() drops the POSIXct class, leaving seconds
    # since the Unix epoch
    gpsunixtime <- ifelse(
      sms.sid$tsquality == "G",
      sms.sid$sms.net.localtime,
      sms.sid$sms.phone.localtime - mode.td.s
    )
    # Rebuild POSIXct from the epoch in UTC. The previous code added the
    # seconds to strptime("1jan1970", "%d%b%Y", tz="PST"): "PST" is not a
    # valid zone name and "%b" is locale dependent, so the result was only
    # right where the unknown zone happened to be treated as UTC.
    gps.localtime <- as.POSIXct(gpsunixtime, origin = "1970-01-01", tz = "UTC")
    # an empty tzone makes the value print in the session's local time zone
    attr(gps.localtime, "tzone") <- ""
    sms.sid$gps.localtime <- gps.localtime

    sms.master <- rbind(sms.master, sms.sid)
  }

  # no GPS time should be later than a phone network time, so if this is the
  # case, replace with phone net time
  if (!is.null(sms.master)) {
    sms.master$gps.localtime <- with(
      sms.master,
      ifelse(gps.localtime > sms.net.localtime, sms.net.localtime, gps.localtime)
    )
    sms.master$gps.localtime <- as.POSIXct(
      sms.master$gps.localtime, origin = "1970-01-01", tz = "UTC"
    )
    attr(sms.master$gps.localtime, "tzone") <- ""
  }

  #===================================SMS 8==========================
  # now handle the sms8 records
  sms <- read.sms("sms8.csv")
  sms <- sms[order(sms$sid), ]

  # leap-seconds offset for GPS
  gps.offset <- -14
  # start of the GPS epoch
  start.epoch <- as.POSIXct("1980-01-06", format = "%Y-%m-%d", tz = "GMT")
  # GPS week number and milliseconds into the week -> absolute time
  sms.gps.gmt <- start.epoch +
    (sms$gps.week * 7 * 24 * 60 * 60 + sms$gps.ms / 1000 + gps.offset)
  sms$gps.localtime <- sms.gps.gmt
  attr(sms$gps.localtime, "tzone") <- NULL

  # split into SIDs
  sms.sidlist <- split(sms, sms$sid)

  # subjects whose records include the bad GPS week 1340 (loop invariant)
  badlist <- paste0(
    "s",
    c(11, 13, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 27, 28, 29, 30, 31,
      32, 33, 34, 35, 37, 38, 39, 45, 46, 51, 52, 54, 55, 56, 58, 61, 62, 63,
      64, 66, 67, 68, 69, 70)
  )

  # process each subject
  for (i in seq_along(sms.sidlist)) {
    # get the list from this SID
    sid <- names(sms.sidlist)[i]
    # an informative message
    cat(paste0(sid, ","))
    sms.sid <- sms.sidlist[[i]]

    # time differences between phone and GPS time
    td.s <- as.numeric(with(
      sms.sid,
      difftime(sms.phone.localtime, gps.localtime, units = "secs")
    ))
    # what time difference is the mode?
    mode.td.s <- modal.value(td.s)

    # handle ugly cases
    if (sid == "s14") {
      # no records, really.
      next
    }
    if (sid == "s26") {
      # the first half are mostly good
      sms.sid[1:15, "tsquality"] <- "A"
      # record 8 is strange: rebuild it from record 7 plus the elapsed
      # msb.secs between the two records
      sms.sid[8, "gps.localtime"] <- sms.sid[7, "gps.localtime"] +
        sms.sid[8, "msb.secs"] - sms.sid[7, "msb.secs"]
      sms.sid[8, "tsquality"] <- "E2"
      # record 16 is bad
      sms.sid[16, "tsquality"] <- "B"
      # but 17:22 are good
      sms.sid[17:22, "tsquality"] <- "A"
      # and the phone time goes awry at record 22+
      # so set the GPS time to the network localtime
      # NOTE(review): record 22 is in both ranges, so it ends up "B";
      # confirm whether the bad run starts at 22 or 23.
      late <- 22:nrow(sms.sid)
      sms.sid[late, "gps.localtime"] <- sms.sid[late, "sms.net.localtime"]
      sms.sid[late, "tsquality"] <- "B"
    }
    if (sid %in% c("s46", "s61")) {
      # the majority have no GPS lock, so make a subset of time differences
      # for locked measurements
      td.s <- td.s[abs(td.s) < 100]
      mode.td.s <- modal.value(td.s)
    }
    if (sid == "s50") {
      # none of the SMS have a fix
      sms.sid$gps.localtime <- sms.sid$sms.phone.localtime
      sms.sid$tsquality <- "G"
    }

    # handle good cases
    if (sid == "s12") {
      sms.sid$tsquality <- "A"
    }

    # handle bad cases
    if (sid %in% badlist) {
      # if the GPS week is bad, make the GPS time from the mode difference
      # from phone time
      bad.week <- sms.sid$gps.week == 1340
      sms.sid$gps.localtime <- ifelse(
        bad.week,
        sms.sid$sms.phone.localtime - mode.td.s,
        sms.sid$gps.localtime
      )
      # ifelse() returned plain seconds; restore the POSIXct class
      sms.sid$gps.localtime <- as.POSIXct(
        sms.sid$gps.localtime, origin = "1970-01-01", tz = "GMT"
      )
      attr(sms.sid$gps.localtime, "tzone") <- ""
      sms.sid$tsquality <- ifelse(bad.week, "E1", "A")
    }
    if (sid == "s61") {
      sms.sid[75, "gps.localtime"] <- sms.sid[75, "sms.net.localtime"]
    }
    if (sid == "s29") {
      # one record seems to have a wrong timestamp
      sms.sid[28, "gps.localtime"] <- sms.sid[28, "gps.localtime"] + 3600
    }

    sms.master <- rbind(sms.master, sms.sid)
  }
  # an informative message
  cat("\n")

  # make a field of UTC time (same instants, displayed in UTC)
  sms.master$gps.utctime <- sms.master$gps.localtime
  attr(sms.master$gps.utctime, "tzone") <- "UTC"

  # calculate the difference between the phone and GPS time
  sms.master$timediff.sec <- with(
    sms.master, difftime(gps.localtime, sms.phone.localtime, units = "secs")
  )
  sms.master$timediff.hr <- with(
    sms.master, difftime(gps.localtime, sms.phone.localtime, units = "hours")
  )
  sms.master$timediff.day <- with(
    sms.master, difftime(gps.localtime, sms.phone.localtime, units = "days")
  )

  # write a CSV file; column names are switched with unfix.colnames() for
  # the file and restored with fix.colnames() for the returned data frame
  outfile <- file.path(masterdir, "sms.csv")
  colnames(sms.master) <- unfix.colnames(sms.master)
  write.csv(sms.master, file = outfile, row.names = FALSE, quote = FALSE)
  colnames(sms.master) <- fix.colnames(sms.master)

  sms.master
}

#sms.master <- process.sms()
#sms.m1 <- sms.master
#sms.m1$kb.logged <- NULL
#sids <- unique(sms.m1$sid)
#for (i in 1:length(sids)) {
#sid <- sids[i]
#cmd <- paste(sid, " <- sms.m1[sms.m1$sid==", "\"", sid, "\"", ",]", sep="")
#eval(parse(text=cmd))
#cmd <- paste("rownames(", sid, ") <- NULL", sep="")
#eval(parse(text=cmd))
#}
#
#ss <- split(sms.m1, sms.m1$sid)
#for (i in 1:length(ss)) {
#dat <- ss[[i]]
#rownames(dat) <- NULL
#print(dat)
#}