# Main Page/Research/MSB/Scripts/preprocess sms.R

# From phurvitz
# < Main Page | Research | MSB | Scripts
# Jump to: navigation, search
# preprocess SMS messages to a standard format

# output data format will be
# sid kb_logged msb_secs gps_lock_last sms_phone_localtime   sms_net_localtime sms_phone_localdate      gps.localtime          gps.utctime tsquality
# s09     63936     3974            -9 2007-12-07 13:40:01 2007-12-07 13:42:07          2007-12-07 2007-12-07 13:40:01 2007-12-07 21:40:01         A
# GPS quality: A=absolute, E1=estimate (good), E2=estimate (not too good), G=guess, B=bad

# Build the standardized SMS master table.
#
# Reads the two SMS master files (sms6.csv and sms8.csv) from the hard-coded
# master directory, parses the phone and network timestamps, assigns each
# record a GPS local time and a timestamp-quality code (tsquality), writes the
# combined table to sms.csv in the same directory, and returns it.
#
# Arguments: none (all paths are hard-coded in the body).
# Returns: the combined data frame (SMS 6 records followed by SMS 8 records)
#   with gps.utctime and timediff.sec / timediff.hr / timediff.day added.
# Side effects: sources functions.R from a remote URL, prints progress with
#   cat(), and overwrites <masterdir>/sms.csv.
process.sms <- function() {
    # an informative message
    cat("Processing SMS data.... ")
    # NOTE(review): this fetches and executes remote code over plain HTTP on
    # every call. Presumably it supplies fix.colnames() and unfix.colnames(),
    # which are not defined in this file -- confirm, and consider a local copy.
    source("http://gis.washington.edu/phurvitz/R/functions.R")
    # where are files?
    masterdir <- "/home/phurvitz/public_html/msb/processed_data"

    # ---- local helpers ----------------------------------------------------

    # Value with the highest count in x: the first entry of the frequency
    # table after sorting it in decreasing order. NA when the table is empty.
    modal.value <- function(x) {
        as.numeric(names(sort(table(x), decreasing=TRUE)[1]))
    }

    # Convert the phone and network timestamp strings of an SMS table to
    # POSIXct. Phone times come as "HH:MM:SS AM/PM m/d/yy" or ".../yyyy";
    # the text after the last "/" tells which year format a row uses.
    standardize.dates <- function(sms) {
        year2char <- nchar(sub(".*/", "", sms$sms.phone.localtime))==2
        # temporary POSIXct column, filled per year format and then dropped
        sms$x <- as.POSIXct("2000-01-01")
        sms[year2char,]$x <- as.POSIXct(strptime(sms[year2char,]$sms.phone.localtime, "%I:%M:%S %p %m/%d/%y"))
        sms[!year2char,]$x <- as.POSIXct(strptime(sms[!year2char,]$sms.phone.localtime, "%I:%M:%S %p %m/%d/%Y"))
        sms$sms.phone.localtime <- sms$x
        sms$x <- NULL
        sms$sms.net.localtime <- as.POSIXct(strptime(sms$sms.net.localtime, "%Y-%b-%d %H:%M:%S"))
        sms
    }

    #===================================SMS 6==========================
    # read in the sms master file (6 fields)
    sms.master.file.6 <- file.path(masterdir, "sms6.csv")
    sms <- read.csv(sms.master.file.6, as.is=TRUE)
    # fix the column names
    colnames(sms) <- fix.colnames(sms)
    # initialize some new columns; with() below looks in sms first and falls
    # back to these NA placeholders for any column the file does not provide
    gps.week <- gps.ms <- gps.localtime <- gps.utctime <- tsquality <- NA
    # standardize column order
    sms <- with(sms,
        data.frame(sid, kb.logged, msb.secs, gps.lock.last, gps.week, gps.ms, sms.phone.localtime, sms.net.localtime,
        gps.localtime, gps.utctime, tsquality, stringsAsFactors=FALSE))

    # remove those with bad subject names
    sms <- sms[grep("^s", sms$sid),]

    # standardize dates
    sms <- standardize.dates(sms)

    # split into SIDs
    sms.sidlist <- split(sms, sms$sid)
    sms.master <- NULL

    # handle each SID
    for (i in seq_along(sms.sidlist)) {
        # get the list from this SID
        sid <- names(sms.sidlist)[i]
        # an informative message
        cat(paste0(sid, ","))
        sms.sid <- sms.sidlist[[i]]
        # time differences between phone and net time
        td.s <- as.numeric(with(sms.sid, difftime(sms.phone.localtime,sms.net.localtime, units="s")))
        # what time difference is the mode?
        mode.td.s <- modal.value(td.s)
        mode.td.h <- round(mode.td.s/3600, 2)
        # put in classes based on the match. E1 is ~ 1 hr, E2 is 1-2 hr, G is anyone's guess
        # (ifelse is kept on purpose: it yields NA, not an error, when the mode is NA)
        sms.sid$tsquality <- ifelse((abs(mode.td.h) > 1 & abs(mode.td.h) < 2), "E2",
                                  ifelse((abs(mode.td.s) > 0 & abs(mode.td.s) <= 3660), "E1",
                                  "G"))
        # set the timestamps: network time for guesses, otherwise phone time
        # shifted by the modal offset. ifelse() drops the POSIXct class, so
        # the result is a plain number of seconds.
        gpsunixtime <- with(sms.sid, ifelse(tsquality=="G", sms.sid$sms.net.localtime, sms.sid$sms.phone.localtime - mode.td.s))
        # NOTE(review): "PST" may not be a recognized time-zone name on every
        # platform -- confirm the origin resolves as intended.
        gps.localtime <- as.POSIXct(strptime("1jan1970", "%d%b%Y", tz="PST") + gpsunixtime)
        attr(gps.localtime, "tzone") <- ""
        sms.sid$gps.localtime <- gps.localtime
        # rbind accumulation is kept so the row names of the result are unchanged
        sms.master <- rbind(sms.master, sms.sid)
    }
    # no GPS time should be later than a phone network time, so if this is the case, replace with phone net time
    sms.master$gps.localtime <- with(sms.master, ifelse(gps.localtime > sms.net.localtime, sms.net.localtime, gps.localtime))
    # ifelse() returned plain numbers; turn them back into POSIXct
    sms.master$gps.localtime <- as.POSIXct(sms.master$gps.localtime, origin="1970-01-01", tz="PST")
    attr(sms.master$gps.localtime, "tzone") <- ""

    #===================================SMS 8==========================
    # now handle the sms8 records
    sms.master.file.8 <- file.path(masterdir, "sms8.csv")
    sms <- read.csv(sms.master.file.8, as.is=TRUE)
    colnames(sms) <- fix.colnames(sms)
    # reset the placeholders (gps.localtime was overwritten in the loop above)
    gps.localtime <- gps.utctime <- tsquality <- NA
    sms <- with(sms,
        data.frame(sid, kb.logged, msb.secs, gps.lock.last, gps.week, gps.ms, sms.phone.localtime, sms.net.localtime,
        gps.localtime, gps.utctime, tsquality, stringsAsFactors=FALSE))

    # remove those with bad subject names
    sms <- sms[grep("^s", sms$sid),]

    # standardize dates
    sms <- standardize.dates(sms)
    sms <- sms[order(sms$sid),]

    # leap-seconds offset for GPS
    gps.offset <- -14

    # start of the GPS epoch
    start.epoch <- strptime("1980-01-06", "%Y-%m-%d", "GMT")
    # GPS week number and milliseconds-in-week converted to a timestamp
    sms.gps.gmt <- start.epoch + (sms$gps.week * 7 * 24 * 60 * 60 + sms$gps.ms / 1000 + gps.offset)
    sms$gps.localtime <- sms.gps.gmt
    attr(sms$gps.localtime, "tzone") <- NULL

    # split into SIDs
    sms.sidlist <- split(sms, sms$sid)

    # subjects whose records with a bad GPS week get an estimated GPS time
    # (constant, so built once rather than on every iteration)
    badlist <- paste0("s", c(11,13,15,16,17,18,19,20,21,22,23,24,25,27,28,29,30,31,32,33,34,35,37,
        38,39,45,46,51,52,54,55,56,58,61,62,63,64,66,67,68,69,70))

    # process each subject
    for (i in seq_along(sms.sidlist)) {
        # get the list from this SID
        sid <- names(sms.sidlist)[i]
        # an informative message
        cat(paste0(sid, ","))
        sms.sid <- sms.sidlist[[i]]
        # time differences
        td.s <- as.numeric(with(sms.sid, difftime(sms.phone.localtime,gps.localtime, units="s")))
        # what time difference is the mode?
        mode.td.s <- modal.value(td.s)
        mode.td.h <- round(mode.td.s/3600, 2)
        # handle ugly cases
        if (sid=="s14") {
            # no records, really.
            next
        }
        if (sid=="s26") {
            # the first half are mostly good
            sms.sid[1:15,"tsquality"] <- "A"
            # record 8 is strange: rebuild its GPS time from record 7 plus the
            # difference in msb.secs between the two records
            sms.sid[8,"gps.localtime"] <- sms.sid[7,"gps.localtime"] + sms.sid[8,"msb.secs"] - sms.sid[7,"msb.secs"]
            sms.sid[8,"tsquality"] <- "E2"
            # record 16 is bad
            sms.sid[16,"tsquality"] <- "B"
            # but 17:22 are good
            sms.sid[17:22,"tsquality"] <- "A"
            # and the phone time goes awry at record 22+
            # so set the GPS time to the network localtime
            sms.sid[22:nrow(sms.sid),"gps.localtime"] <- sms.sid[22:nrow(sms.sid),"sms.net.localtime"]
            sms.sid[22:nrow(sms.sid),"tsquality"] <- "B"
        }
        if (sid %in% c("s46", "s61")) {
            # the majority have no GPS lock, so make a subset of time differences for locked measurements
            td.s <- td.s[abs(td.s)<100]
            mode.td.s <- modal.value(td.s)
        }
        if (sid=="s50") {
            # none of the SMS have a fix
            sms.sid$gps.localtime <- sms.sid$sms.phone.localtime
            sms.sid$tsquality <- "G"
        }
        # handle good cases
        if (sid=="s12") {
            sms.sid$tsquality <- "A"
        }
        # handle bad cases
        if (sid %in% badlist) {
            # if the GPS week is bad, make the GPS time from the mode difference from phone time
            sms.sid$gps.localtime <- ifelse (sms.sid$gps.week==1340, sms.sid$sms.phone.localtime - mode.td.s, sms.sid$gps.localtime)
            # ifelse() returned plain numbers; turn them back into POSIXct
            sms.sid$gps.localtime <- as.POSIXct(sms.sid$gps.localtime, origin="1970-01-01", tz="GMT")
            attr(sms.sid$gps.localtime, "tzone") <- ""
            sms.sid$tsquality <- ifelse(sms.sid$gps.week==1340, "E1", "A")
        }
        if (sid=="s61") {
            sms.sid[75,"gps.localtime"] <- sms.sid[75,"sms.net.localtime"]
        }
        if (sid=="s29") {
            # one record seems to have a wrong timestamp
            sms.sid[28,"gps.localtime"] <- sms.sid[28,"gps.localtime"] + 3600
        }
        sms.master <- rbind(sms.master, sms.sid)
    }
    # an informative message
    cat("\n")
    # make a field of UTC time (same instants, displayed in UTC)
    sms.master$gps.utctime <- sms.master$gps.localtime
    attr(sms.master$gps.utctime, "tzone") <- "UTC"
    # calculate the difference between the GPS and phone time in seconds, hours and days
    sms.master$timediff.sec <- with(sms.master, difftime(gps.localtime, sms.phone.localtime, units="s"))
    sms.master$timediff.hr <- with(sms.master, difftime(gps.localtime, sms.phone.localtime, units="h"))
    sms.master$timediff.day <- with(sms.master, difftime(gps.localtime, sms.phone.localtime, units="d"))
    # write a CSV file using the "unfixed" column names, then restore the
    # fixed names on the returned data frame
    outfile <- file.path(masterdir, "sms.csv")
    colnames(sms.master) <- unfix.colnames(sms.master)
    write.csv(sms.master, file=outfile, row.names=FALSE, quote=FALSE)
    colnames(sms.master) <- fix.colnames(sms.master)
    return(sms.master)
}

#sms.master <- process.sms()
#sms.m1 <- sms.master
#sms.m1$kb.logged <- NULL
#sids <- unique(sms.m1$sid)
#for (i in 1:length(sids)) {
   #sid <- sids[i]
   #cmd <- paste(sid, " <- sms.m1[sms.m1$sid==", "\"", sid, "\"", ",]", sep="")
   #eval(parse(text=cmd))
   #cmd <- paste("rownames(", sid, ") <- NULL", sep="")
   #eval(parse(text=cmd))
#}
#
#ss <- split(sms.m1, sms.m1$sid)
  #for (i in 1:length(ss)) {
     #dat <- ss[[i]]
     #rownames(dat) <- NULL
     #print(dat)
  #}