# Main Page/Research/MSB/Scripts/preprocess sms.R
#
# From phurvitz
# < Main Page | Research | MSB | Scripts
# Revision as of 01:58, 17 January 2009 by Phil Hurvitz (talk | contribs)
# (diff) <- Older revision | Latest revision (diff) | Newer revision -> (diff)
# Jump to: navigation, search
# preprocess SMS messages to a standard format

# output data format will be
# sid kb_logged msb_secs gps_lock_last sms_phone_localtime   sms_net_localtime sms_phone_localdate      local.gpstime          utc.gpstime tsquality
# s09     63936     3974            -9 2007-12-07 13:40:01 2007-12-07 13:42:07          2007-12-07 2007-12-07 13:40:01 2007-12-07 21:40:01         A
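# GPS quality (tsquality column): A=absolute, E1=estimate (good), E2=estimate (not too good), G=guess
# NOTE: the sample above is abbreviated; the table built below also carries the
# gps.week and gps.ms columns and has no sms_phone_localdate column.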

# Build the standardized SMS master table from the sms6.csv and sms8.csv inputs.
#
# Reads both master files from the processed-data directory, normalizes the
# column layout and the timestamp formats, derives a GPS time for every record
# together with a timestamp-quality code (A, E1, E2 or G), writes the combined
# table to sms.csv in the same directory, and returns it.
#
# Depends on fix.colnames() and unfix.colnames(), which are loaded from the
# remote functions.R on every call.
# NOTE(review): sourcing code over plain HTTP at run time executes whatever the
# server returns -- consider keeping a local copy of functions.R.
#
# Takes no arguments.
# Returns: a data.frame holding the records of both input files, one row per
# retained SMS record, with the "fixed" (R-style) column names.
process.sms <- function() {
    # an informative message
    cat("Processing SMS data.... ")
    source("http://gis.washington.edu/phurvitz/R/functions.R")
    # where are files?
    masterdir <- "/home/phurvitz/public_html/msb/processed_data"

    # Read one SMS master file and bring it to the common layout:
    # standard column order, only subject ids starting with "s",
    # and both text timestamps parsed to POSIXct.
    read.sms <- function(csv.name) {
        sms <- read.csv(file.path(masterdir, csv.name), as.is = TRUE)
        # fix the column names
        colnames(sms) <- fix.colnames(sms)
        # placeholders; with() only falls back to these for columns that the
        # input file does not provide itself
        gps.week <- gps.ms <- local.gpstime <- utc.gpstime <- tsquality <- NA
        # standardize column order
        sms <- with(sms,
            data.frame(sid, kb.logged, msb.secs, gps.lock.last, gps.week, gps.ms,
                sms.phone.localtime, sms.net.localtime,
                local.gpstime, utc.gpstime, tsquality, stringsAsFactors = FALSE))

        # remove those with bad subject names
        sms <- sms[grep("^s", sms$sid), ]

        # standardize dates: the phone timestamp ends in either a 2-digit or a
        # 4-digit year, so each group is parsed with its own format
        year2char <- nchar(sub(".*/", "", sms$sms.phone.localtime)) == 2
        sms$x <- as.POSIXct("2000-01-01")
        sms[year2char, ]$x <- as.POSIXct(strptime(sms[year2char, ]$sms.phone.localtime, "%I:%M:%S %p %m/%d/%y"))
        sms[!year2char, ]$x <- as.POSIXct(strptime(sms[!year2char, ]$sms.phone.localtime, "%I:%M:%S %p %m/%d/%Y"))
        sms$sms.phone.localtime <- sms$x
        sms$x <- NULL
        sms$sms.net.localtime <- as.POSIXct(strptime(sms$sms.net.localtime, "%Y-%b-%d %H:%M:%S"))
        sms
    }

    # Most frequent value of a numeric vector (first one in table order on ties).
    modal.value <- function(v) {
        as.numeric(names(sort(table(v), decreasing = TRUE)[1]))
    }

    #===================================SMS 6==========================
    # read in the sms master file (6 fields)
    sms <- read.sms("sms6.csv")

    # split into SIDs
    sms.sidlist <- split(sms, sms$sid)
    sms.master <- NULL

    # handle each SID
    for (i in seq_along(sms.sidlist)) {
        # get the list from this SID
        sid <- names(sms.sidlist)[i]
        # an informative message
        cat(paste0(sid, ","))
        sms.sid <- sms.sidlist[[i]]
        # time differences between phone clock and network clock, in seconds
        td.s <- as.numeric(with(sms.sid, difftime(sms.phone.localtime, sms.net.localtime, units = "secs")))
        # what time difference is the mode?
        mode.td.s <- modal.value(td.s)
        mode.td.h <- round(mode.td.s / 3600, 2)
        # put in classes based on the match. E1 is ~ 1 hr, E2 is 1-2 hr, G is anyone's guess
        # (ifelse is kept on purpose: an NA mode yields an NA class instead of an error)
        sms.sid$tsquality <- ifelse(abs(mode.td.h) > 1 & abs(mode.td.h) < 2, "E2",
                                  ifelse(abs(mode.td.s) > 0 & abs(mode.td.s) <= 3660, "E1",
                                  "G"))
        # set the timestamps: network time for guesses, otherwise the phone time
        # shifted by the modal offset; ifelse() drops the POSIXct class, so the
        # result is plain seconds since 1970-01-01 UTC
        gpsunixtime <- with(sms.sid, ifelse(tsquality == "G", sms.net.localtime, sms.phone.localtime - mode.td.s))
        # rebuild POSIXct from the UTC epoch (not from a "PST" midnight, which
        # would be 8 hours off wherever that zone name is actually honoured)
        local.gpstime <- as.POSIXct(gpsunixtime, origin = "1970-01-01", tz = "UTC")
        # empty tzone = display in the session's local time zone
        attr(local.gpstime, "tzone") <- ""
        sms.sid$local.gpstime <- local.gpstime
        sms.master <- rbind(sms.master, sms.sid)
    }
    # no GPS time should be later than a phone network time, so if this is the case, replace with phone net time
    sms.master$local.gpstime <- with(sms.master, ifelse(local.gpstime > sms.net.localtime, sms.net.localtime, local.gpstime))
    sms.master$local.gpstime <- as.POSIXct(sms.master$local.gpstime, origin = "1970-01-01", tz = "UTC")
    attr(sms.master$local.gpstime, "tzone") <- ""

    #===================================SMS 8==========================
    # now handle the sms8 records (these carry gps.week and gps.ms)
    sms <- read.sms("sms8.csv")
    sms <- sms[order(sms$sid), ]

    # leap-seconds offset for GPS
    gps.offset <- -14

    # start of the GPS epoch
    start.epoch <- strptime("1980-01-06", "%Y-%m-%d", tz = "GMT")
    # GPS week number and milliseconds-in-week converted to an absolute time
    sms.gps.gmt <- start.epoch + (sms$gps.week * 7 * 24 * 60 * 60 + sms$gps.ms / 1000 + gps.offset)
    sms$local.gpstime <- sms.gps.gmt
    attr(sms$local.gpstime, "tzone") <- NULL

    # split into SIDs
    sms.sidlist <- split(sms, sms$sid)

    # subjects whose records with GPS week 1340 need an estimated GPS time
    badlist <- paste0("s", c(11,13,15,16,17,18,19,20,22,23,24,27,28,29,30,31,32,33,34,35,37,
        38,39,45,46,51,52,54,55,56,58,61,62,63,64,66,67,68,69,70))

    # process each subject
    for (i in seq_along(sms.sidlist)) {
        # get the list from this SID
        sid <- names(sms.sidlist)[i]
        # an informative message
        cat(paste0(sid, ","))
        # handle ugly cases
        if (sid == "s14") {
            # no records, really.
            next
        }
        sms.sid <- sms.sidlist[[i]]
        # time differences between phone clock and GPS-derived time, in seconds
        td.s <- as.numeric(with(sms.sid, difftime(sms.phone.localtime, local.gpstime, units = "secs")))
        # what time difference is the mode?
        mode.td.s <- modal.value(td.s)
        if (sid == "s29") {
            # one record seems to have a wrong timestamp
            sms.sid[28, "local.gpstime"] <- sms.sid[28, "local.gpstime"] + 3600
        }
        if (sid == "s61") {
            # hand-picked record: take the network time as its GPS time
            sms.sid[75, "local.gpstime"] <- sms.sid[75, "sms.net.localtime"]
        }
        if (sid %in% c("s46", "s61")) {
            # the majority have no GPS lock, so make a subset of time differences for locked measurements
            td.s <- td.s[abs(td.s) < 100]
            mode.td.s <- modal.value(td.s)
        }
        if (sid == "s50") {
            # none of the SMS have a fix
            sms.sid$local.gpstime <- sms.sid$sms.phone.localtime
            sms.sid$tsquality <- "G"
        }
        # handle good cases
        if (sid == "s12") {
            sms.sid$tsquality <- "A"
        }
        # handle bad cases
        if (sid %in% badlist) {
            # if the GPS week is bad, make the GPS time from the mode difference from phone time
            sms.sid$local.gpstime <- ifelse(sms.sid$gps.week == 1340, sms.sid$sms.phone.localtime - mode.td.s, sms.sid$local.gpstime)
            # ifelse() returned plain epoch seconds; restore the POSIXct class
            sms.sid$local.gpstime <- as.POSIXct(sms.sid$local.gpstime, origin = "1970-01-01", tz = "GMT")
            attr(sms.sid$local.gpstime, "tzone") <- ""
            sms.sid$tsquality <- ifelse(sms.sid$gps.week == 1340, "E1", "A")
        }
        sms.master <- rbind(sms.master, sms.sid)
    }
    # an informative message
    cat("\n")
    # make a field of UTC time: same instants, displayed in UTC
    sms.master$utc.gpstime <- sms.master$local.gpstime
    attr(sms.master$utc.gpstime, "tzone") <- "UTC"
    # write a CSV file using the "unfixed" column names, then switch back
    outfile <- file.path(masterdir, "sms.csv")
    colnames(sms.master) <- unfix.colnames(sms.master)
    write.csv(sms.master, file = outfile, row.names = FALSE, quote = FALSE)
    colnames(sms.master) <- fix.colnames(sms.master)
    sms.master
}


# run the preprocessing and keep the combined table
sms.master <- process.sms()
# Show the records of one subject. `sid` is never assigned at the top level of
# this script (it only exists as a local inside process.sms), so the lookup is
# done only when the caller has defined it; unguarded, the script stops with
# "object 'sid' not found" right after the CSV has been written.
if (exists("sid")) {
    sms.master[sms.master$sid == sid, ]
}