######################################################### ## HF/Fx Library ## ## Created by: Bingcheng Yan ## ## Last Updated on: 11/07/2003 ## ######################################################### # # Corrected S-Plus functions # aggregateSeries # # Visualization # 1. plotByDays # # # # DATA LOADING # 1. TAQLoad # 2. OlsenLoad # # # TIMESERIES AND DATA MANUPULATION # 1. rbindtimeSeries # 2. reorderTS # 3. is.tsBW # 4. tsBW # 5. ExchangeHoursOnly # 6. FxBizWeekOnly # 7. diff.withinDay # 8. diff.withinWeek # 9. align.withinWeek # 10. align.withinDay # 11. aggregateSeriesHF # 12. OvernightSmpl # # DESCRIPTIVE # 1. SmoothAcrossIntervs # 2. tableSmoother # # DIAGNOSTIC # 1. cusumLife # 2. cusumPlot # # VARIABLE CONSTRUCTION # 1. DurationInInterv # 2. PriceChgInInterv # 3. getSpread # 4. getMidQuote # 5. aggregateTradeTypes # 6. DetermInterp # 7. naDuration # 8. numNAs # 9. tradeDirec # 10. Genr.RealVol ############################################### # a function to generate realized volatility # with specified span for realized volatiltiy, # and span for returns over which realized volatility # is computed ############################################### Genr.RealVol = function(ts, interv.type = "daily", bound.hours = c("9:30", "16:00"), rv.span, rt.span, how.align = "before") { # ts: one-column log price level timeSeries # interv.type: "daily" or "weekly" for exchange based trading data and Fx type data respectively. # bound.hours: a two-element character vector in the format of "HH:MM" # the start and closing hours of the interval. In case of daily interval, # they are the start and closing hours of the exchange. In case of weekly interval, they # are the start hour of the biz week on Sunday and the closing hour of the biz week on # Friday. # rv.span: timeSpan object, the time span over which to compute # the realized volatility, e.g. timeSpan("5m") # rt.span: timeSpan object, the time span to define the returns # used to compute the realized volatility, e.g. timeSpan("1m") # rt.span must be smaller than or equal to rv.span # how.align: how to align, default "before", using the most recent observation, # see help for align. # Output: realized volatility in the same unit as price changes # Validation if(!is(ts, "timeSeries")) stop(paste(deparse(substitute(ts)), "must be a timeSeries!")) if(ncol(ts)>1) stop(paste(deparse(substitute(ts)), "can only be a one-column log price!")) if(!is(rv.span, "timeSpan")) stop("rv.span must be a timeSpan!") if(!is(rt.span, "timeSpan")) stop("rt.span must be a timeSpan!") if (rv.span < rt.span) stop("rv.span can not be smaller than rt.span!") # Check for NA in ts na.idx = is.na(ts@data) ts = timeSeries(sub(ts@data, !na.idx,), pos= ts@positions[!na.idx]) if(interv.type == "daily"){ ts.aligned = align.withinDay(ts, exch.hours = bound.hours, by = rt.span, k.by = NULL, how = how.align) ts.diff = diff.withinDay(ts.aligned, bizhrs.cleaned = T) } else if(interv.type == "weekly"){ ts.aligned = align.withinWeek(ts, bizweek.hours = bound.hours, by = rt.span, k.by = NULL, how = how.align, fullweek = T) ts.diff = diff.withinWeek(ts.aligned, bizhrs.cleaned = T) } else stop ("interv.type must be either daily or weekly") rt = ts.diff@data pos.rt = ts.diff@positions # compute realized vol if(is(rt, "data.frame")) rt = unlist(rt) rt.mat = matrix(as.vector(rt), nrow = round(rv.span/rt.span)) rv = sqrt(colSums(rt.mat^2)) # prepare the pos for realized vol idx = matrix(1:nrow(ts.diff), nrow = round(rv.span/rt.span))[round(rv.span/rt.span),] pos.rv = pos.rt[idx] timeSeries(data = data.frame(RealizedVol = rv), pos = pos.rv) } ############################################### # a function to excerpt data within an overnight interval # Whether the boundary positions will be included is controlled by # start.include and close.include ############################################### OvernightSmpl = function(ts, smpl.hours = c("22:00", "8:00"), start.include = T, close.include =T) { # a function to excerpt data within an overnight interval # Whether the boundary positions will be included is controlled by # start.include and close.include # smpl.hours: a two-element character vector in the format of "HH:MM", # specifying the starting and closing hour and minute of an # overnight interval if(!is(ts, "timeSeries")) stop("ts must be a timeSeries object") smpl.hrsSpan = timeSpan(smpl.hours, in.format = "%H:%M[:%S[:%N]]", format = "%02H%02M%02S%03N") if(any(is.na(smpl.hrsSpan))) stop("smpl.hours must be in the format of HH:MM:SS!") # boundary hours bound.times = as.numeric(as.character(smpl.hrsSpan)) start.time = bound.times[1] close.time = bound.times[2] pos = positions(ts) hms.list = .Call("time_to_hour_min_sec", pos, timeZoneList(), COPY = c(F, F), CLASSES = c("timeDate", "list")) pos.time = hms.list[[1]]*10^7 + hms.list[[2]]*10^5 + hms.list[[3]]*10^3 + hms.list[[4]] if(start.include) if(close.include) idx = (pos.time >= start.time) | (pos.time <= close.time) else idx = (pos.time >= start.time) | (pos.time < close.time) else if(close.include) idx = (pos.time > start.time) | (pos.time <= close.time) else idx = (pos.timie > start.time) | (pos.time < close.time) timeSeries(sub(ts@data, idx,), pos = pos[idx]) } ################################################################################### # function to assign buy/sell direction indicators to trades ################################################################################### tradeDirec = function(trade, mq, timeLag = "5s") { # trade: a one-column timeSeries for trading prices # mq: a one-column timeSeries for mid quotes # timeLag: either timeSpan or character specifying the time span, e.g. "5s", # if(!is(mq, "timeSeries") | !is(trade, "timeSeries") | ncol(mq) > 1 | ncol(trade) > 1) stop(paste(deparse(substitute(mq)), "and", deparse(substitute(trade)), "must be one-column timeSeries!")) if(!is(timeLag, "timeSpan")){ timeLag = timeSpan(timeLag) if(is.na(timeLag)) stop("timeLag must be either timeSpan or the character specifying the time span, e.g \"5s\"") } pos.mq = positions(mq) pos.trade = positions(trade) rng.mq = range(pos.mq) rng.trade = range(pos.trade) # separate data into daily intervals and assign trade direction days.mq = timeSeq(from=rng.mq[1], to=rng.mq[2], by = "days", align.by = T, extend = T) poscut.mq = as.integer(cut(pos.mq, days.mq, include.lowest = T)) poscats.mq = unique(poscut.mq) days.trade = timeSeq(from=rng.trade[1], to=rng.trade[2], by="days", align.by = T, extend = T) poscut.trade = as.integer(cut(pos.trade, days.trade, include.lowest = T)) poscats.trade = unique(poscut.trade) if(!identical(days.mq, days.trade) | !identical(poscats.mq, poscats.trade)) stop("Mid quotes and trading prices do not correspond to the same trading days") poscats = poscats.trade # utility function to assign buy/sell direction each day direc.Day = function(mq, trade, timeLag) { pos.trade = positions(trade) posLag = pos.trade - timeLag pos.mq = positions(mq) mq.prevail = seriesData(align(mq, pos = posLag, how = "before", error.how = "nearest")) trade.price = seriesData(trade) ans.tmp = rep(0, nrow(trade)) ans.tmp[trade.price > mq.prevail]= 1 ans.tmp[trade.price < mq.prevail] = -1 # idex for the trades whose prevailing quotes pos are # before the first quote observed in the day, they are # assigned "undetermined" ans.tmp[posLag < pos.mq[1]]= 0 timeSeries(data.frame(BuySellDirec = ans.tmp), pos = pos.trade) } #find out the start and end index of each block in the original trade and quote series ini.idx.mq = match(poscats, poscut.mq) last.idx.mq = c(ini.idx.mq[-1], length(poscut.mq)+1)-1 ini.idx.trade = match(poscats, poscut.trade) last.idx.trade = c(ini.idx.trade[-1], length(poscut.trade)+1)-1 do.call("rbindtimeSeries", lapply(1:length(poscats), function(i, mq, trade, ini.idx.mq, last.idx.mq, ini.idx.trade, last.idx.trade, timeLag, func.direc.Day) { idx.mq = ini.idx.mq[i]:last.idx.mq[i] idx.trade = ini.idx.trade[i]:last.idx.trade[i] func.direc.Day(mq=timeSeries(sub(mq@data, idx.mq,),pos=positions(mq)[idx.mq]), trade=timeSeries(sub(trade@data, idx.trade,),pos=positions(trade)[idx.trade]), timeLag = timeLag) }, mq, trade, ini.idx.mq, last.idx.mq, ini.idx.trade, last.idx.trade, timeLag, direc.Day)) } ################################################################################### # Corrected and accelerated version of aggregateSeries ################################################################################### aggregateSeries = function(x, pos, FUN, moving = F, together = F, drop.empty = T, include.ends = F, adj, offset, colnames, by, k.by = 1, week.align = NULL, holidays = timeDate()) { # Aggregate series object to new positions origpos <- positions(x) rng <- range(origpos) is.cal <- is(origpos, "positionsCalendar") # construct positions if missing if(!moving){ # take care of positions if(missing(pos)){ userPos = F if(is(by, "character")){ if(by == "weeks" & is.null(week.align)) stop("If by = \"weeks\" week.align must be supplied") pos <- timeSeq(from = rng[1], to = rng[2], by = by, k.by = k.by, align.by = T, extend = T, week.align = week.align, holidays = holidays) extendPos = T } else{ pos <- timeSeq(from = rng[1], to = rng[2], by = by) extendPos = F } } else userPos = T } if(moving) { if(!missing(pos)) warning("pos argument ignored for moving aggregation") len <- length(origpos) if(len < moving) { if(is.cal) return(timeSeries()) else return(signalSeries()) } # set up positions for returns and offsets poslen <- len - moving + 1 pos <- origpos[1:poslen] if(moving > 1) diffpos <- diff(as(origpos, "numeric"), moving - 1) else diffpos <- rep(0, len) whichpos <- T # call aggregation function newdat <- do.call("rbind", lapply(1:poslen, function(rowstart, numrows, x, nc, func, together) { cols <- if(together) func(sub(x, rowstart:(rowstart + numrows - 1), )) else if(nc > 0) sapply(1:nc, function(col, x, fun) { fun(sub(x, , col)) } , sub(x, rowstart:(rowstart + numrows - 1), ), func) else numeric(0) cols <- as.rectangular(cols) if(is(cols, "matrix") && numRows(cols) != 1) cols <- matrix(cols, nrow = 1) cols } , moving, x@data, numCols(x), FUN, together)) } else { # call cut to figure out which positions go in which bins posnums <- as(pos, "numeric") diffpos <- diff(posnums) if(any(is.na(diffpos)) || any(diffpos < 0)) stop("Positions for aggregation must be monotonically increasing without NA values") # get right bin endpoints newpos <- pos bpnum = length(newpos)# number of breakpoints # adjust the breakpoints or data if necessary if(userPos || !extendPos){ # whether include.ends is only a concern # for these two cases if(include.ends) { # modify the ends of breakpoints newpos[1] <- min(newpos[1], rng[1]) newpos[bpnum] <- max(newpos[bpnum], rng[2]) } else { # chop the data based on the range of newpos idx = origpos >= newpos[1] & origpos <= newpos[bpnum] if(!any(idx)) stop("The range of supplied positions does not cover any data at all!") x = x[idx,] origpos <- positions(x) rng <- range(origpos) } } pos = newpos[1:(bpnum-1)] # the pos now serve the lower end of each bin poslen = length(pos) diffpos = diff(newpos) poscut <- as.integer(cut(origpos, newpos, include.lowest = T)) if(!drop.empty) { poscats <- seq(along = pos) whichpos <- T } else { poscats <- unique(poscut) whichpos <- sort(match(poscats, seq(along = pos), nomatch = 0)) } # find out the start and end index of each block in the orignal series ini.idx = match(poscats, poscut, nomatch = 0) last.idx = rep(0, length(ini.idx)) last.idx[ini.idx!=0] = c((ini.idx[ini.idx!=0])[-1], length(poscut)+1)-1 if (is.vector(x@data)) x@data = as.matrix(x@data) # apply the function to get new series data newdat <- do.call("rbind", lapply(1:length(poscats), function(i, x, nc, ini.idx, last.idx, func, together) { cols <- if(together) func(sub(x, ini.idx[i]:last.idx[i], )) else if(nc > 0) sapply(1:nc, function(col, x, fun) { fun(sub(x, , col)) } , sub(x, ini.idx[i]:last.idx[i], ), func) else numeric(0) cols <- as.rectangular(cols) if(is(cols, "matrix") && numRows(cols) != 1) cols <- matrix(cols, nrow = 1) cols } , x@data, numCols(x), ini.idx, last.idx, FUN, together)) } # add offset or adj to positions if(!missing(offset)) pos <- pos + offset else if(!missing(adj) && (poslen > 1)) { pos <- pos + adj * diffpos } # rbind these rows together and put into a series # newdat <- do.call("rbind", newdat) if(missing(colnames)) colnames <- colIds(x@data) if(!is.null(colnames) && (numCols(newdat) == length(colnames))) colIds(newdat) <- colnames if(is.cal) timeSeries(pos = pos[whichpos], data = newdat) else signalSeries(pos = pos[whichpos], data = newdat) } ######################################################################################### # A function to smooth high-frequency data across trading intervals, i.e. daily or weekly ######################################################################################### SmoothAcrossIntervs = function(ts, n.subinterv, n.interv, FUN) { # ts: a one-column timeseries of high-frequency data with fixed number of subintervals in # each trading interval, daily or weekly. # n.subinterv: the fixed number of subintervals in each tradeing interval # n.interv: the number of tradeing intervals, days or weeks, involved in ts # FUN: the choice of smoothing function, valid choices are "mean", "median", # "loess.smooth", "supsmu". # Note that the nrow of ts has to be the product of n.subinterv and n.interv if(!is(ts, "timeSeries")) stop(paste(deparse(substitute(ts)), "must be a timeSeries object!")) if(nrow(ts) != n.subinterv * n.interv) stop(paste("The length of", deparse(substitute(ts)), "must be equal to the product of n.subinterv and n.interv")) if(ncol(ts) > 1) stop(paste(deparse(substitute(ts)), "can only have one column of data to smooth!")) pos = positions(ts) data = seriesData(ts) data.vec = as.vector(as.matrix(data)) data.mat = matrix(data.vec, ncol = n.interv) if(!is(FUN, "character")) FUN = deparse(substitute(FUN)) smuResult = tableSmoother(y = data.mat, FUN = FUN) timeSeries(data = data.frame(smuResult), pos = pos[1:n.subinterv]) } ###################################################################### # Function to generate bid-ask spread ###################################################################### getSpread = function(ask, bid, ticksize = NULL) { # ask: a one-column timeSeries for ask quotes # bid: a one-column timeSeries for bid quotes # ask and bid must have the same length since they are paired if(!is(ask, "timeSeries") | !is(bid, "timeSeries") | ncol(ask) > 1 | ncol(bid) > 1) stop(paste(deparse(substitute(ask)), "and", deparse(substitute(bid)), "must be one-column timeSeries!")) if( nrow(ask) != nrow(bid)) stop(paste("The length of", deparse(substitute(ask)), "must be equal to that of", deparse(substitute(bid)))) pos = positions(ask) ask.data = seriesData(ask) bid.data = seriesData(bid) spread.data = ask.data - bid.data ans = timeSeries(data = data.frame(Spread = spread.data), pos = pos) # compute spreads in ticks if required if(!is.null(ticksize)) ans = round(ans/ticksize) return(ans) } ###################################################################### # Function to generate mid quote for bid and ask quotes ###################################################################### getMidQuote = function(ask, bid) { # ask: a one-column timeSeries for ask quotes # bid: a one-column timeSeries for bid quotes # ask and bid must have the same length since they are paired if(!is(ask, "timeSeries") | !is(bid, "timeSeries") | ncol(ask) > 1 | ncol(bid) > 1) stop(paste(deparse(substitute(ask)), "and", deparse(substitute(bid)), "must be one-column timeSeries!")) if( nrow(ask) != nrow(bid)) stop(paste("The length of", deparse(substitute(ask)), "must be equal to that of", deparse(substitute(bid)))) pos = positions(ask) ask.data = seriesData(ask) bid.data = seriesData(bid) midquote.data = (ask.data + bid.data)/2 timeSeries(data = data.frame(MidQuote = midquote.data), pos = pos) } ###################################################################### # Function to load TAQ trade or quote data sets from ASCII file # it outputs a timeSeries object ###################################################################### TAQLoad = function(file="", type= "trade", fields.list=NULL, sep="|", skip=1, date.format = "%2d%m%Y") { # file: path of the TAQ data file # type: has to be either "trade", or "quote" # fields.list: a list object supplying field names and data types in the # TAQ data file, in the format of the "what" list used in scan( ). # The default is to use the included "trade" or # "quote" fields.list. If the fields.list is provided, the "type" # argument is ignored. However, the supplied fields.list must contain # the components of "Date" and "Time". # sep: the separator used in the TAQ data file, to be passed to scan( ) # skip: the number of rows to skip when reading the data # The data set to be loaded must have the date format as "ddmmyyyy" and the time # format as the number of seconds since the midnight of the day. if (is.null(fields.list)){ if (type == "trade") fields.list = list(Cond="", Ex="", Symbol="", Corr=0, G127=0, Price=0, Size=0, Date="", Seq=0, Time=0) else if (type == "quote") fields.list = list(Ex="", MMID="", Symbol="", Bid=0, BidSize=0, Mode=0, Ask=0, AskSize=0, Date="", Seq=0, Time=0) else stop("Type has to be either trade or quote!") } else { if ((!is.element("Date", names(fields.list))) | (!is.element("Time", names(fields.list)))) stop("Fields of Date and Time must be in the supplided fields.list!") } df = as.data.frame(scan(file = file, what = fields.list, sep = sep, skip = skip, multi.line=TRUE, strip.white=TRUE), stringsAsFactors = F) dates.tmp = timeDate(charvec = df[, "Date"], in.format = date.format, format = "%m/%d/%Y %H:%02M:%02S", zone = "Eastern") times.tmp = df[, "Time"] / (60*60*24) td.tmp = dates.tmp + times.tmp ans = timeSeries(data = df[, setdiff(colIds(df), c("Date","Time"))], pos = td.tmp) ans } ############################################################################################## # for Olsen ASCI file stored at specified location, the function load those data into splus and # store them in time Series ############################################################################################## OlsenLoad = function(file, fields.list = list(Date = "", Time = "", Bid = 0, Ask = 0, Institution = "")) { # for Olsen ASCI file stored at specified location, the function load those data into splus and # store them in time Series if ((!is.element("Date", names(fields.list))) | (!is.element("Time", names(fields.list)))) stop("Fields of Date and Time must be in the supplided fields.list!") df = as.data.frame(scan(file = file, what = fields.list, multi.line=TRUE, strip.white=TRUE), stringsAsFactors = F) td.input = paste(df[, "Date"], df[, "Time"], sep = " ") td = timeDate(charvec = td.input, in.format = "%d.%m.%Y %H:%M:%S", format = "%m/%d/%Y %H:%02M:%02S", zone = "GMT") ans = timeSeries(data = df[, setdiff(colIds(df), c("Date","Time"))], pos = td) ans } ################################################################################################ # a function to align high frequency data within each trading day, e.g. stock exchange price data. # Aligning only uses data within each day and then concatenate algined series for each day together # #################################################################################################### align.withinDay = function(ts, exch.hours = c("9:30", "16:00"), start.include = T, close.include =T, by, k.by = 1, how) { # exch.hours: a two-element character vector in the format of "HH:MM", # specifying the starting hour and minute of an exhange # trading day (the first element) and the closing hour and minute # of the trading day (the second element) # by: "minutes", "days", etc. see help for align # k.by: the units of by, see help for align # how: how to align, see help for align # Whether the boundary positions will be included is controlled by # start.include and close.include # VALUE: an aligned timeSeries within the defined trading hours each day if(!is(ts, "timeSeries")) stop("ts must be a timeSeries object") # utility functions day.align = function(ts.d, daystart.hour, dayclose.hour, by, k.by, how) { # ts.d: timeSeries data within one trading day pos.d = positions(ts.d) #the day align start.pos is the daystart.hour of the day start.pos = timeDate(paste(as.character(timeSeq(pos.d[1], length.out=1, format = "%m/%d/%Y")), daystart.hour), zone = pos.d@time.zone) start.pos@time.zone = "GMT" #the day align end.pos is the dayclose.hour of the day end.pos = timeDate(paste(as.character(timeSeq(pos.d[1], length.out =1, format = "%m/%d/%Y")), dayclose.hour), zone = pos.d@time.zone) end.pos@time.zone = "GMT" align.pos = timeSeq(from = start.pos, to = end.pos, by = by, k.by = k.by, format = pos.d@format, zone = "GMT") align.pos@time.zone = pos.d@time.zone align(ts.d, pos = align.pos, how = how, error.how = "nearest") } # cleaning observations outside the boundary hours, e.g. exchange hours ts = ExchangeHoursOnly(ts, exch.hours = exch.hours, start.include = start.include, close.include = close.include) daystart.hour = exch.hours[1] dayclose.hour = exch.hours[2] origpos = positions(ts) rng = range(origpos) # call cut to slice each day data and apply the day.align() pos = timeSeq(from = rng[1], to = rng[2], by = "days", align.by = T, extend = T) poscut = as.integer(cut(origpos, pos, include.lowest=T)) poscats = unique(poscut) #find out the start and end index of each block in the original series ini.idx = match(poscats, poscut) last.idx = c(ini.idx[-1], length(poscut)+1)-1 do.call("rbindtimeSeries", lapply(1:length(poscats), function(i, ts, ini.idx, last.idx, func.dayalign, daystart.hour, dayclose.hour, by, k.by, how) { idx = ini.idx[i]:last.idx[i] func.dayalign(timeSeries(sub(ts@data,idx,), pos=positions(ts)[idx]), daystart.hour=daystart.hour, dayclose.hour = dayclose.hour, by = by, k.by = k.by, how = how) }, ts, ini.idx, last.idx, day.align, daystart.hour, dayclose.hour, by, k.by, how)) } ########################################################################################################## # a function to align high frequency data within each biz week, e.g. FX data. Alignment only uses data within # each week and then concatenate algined series for each week together # ####################################################################################################### align.withinWeek = function(ts, bizweek.hours = c("22:00", "22:00"), start.include = T, close.include =T, by, k.by = 1, how, fullweek = F) { # bizweek.hours: a two-element character vector in the format of "HH:MM", # specifying the starting hour and minute of a new # biz week on Sunday (the first element) and the closing hour and minute # of the biz week on Friday (the second element) # by: "minutes", "days", etc. see help for align # k.by: the units of by, see help for align # how: how to align, see help for align # fullweek: a logical, if T, the aligment positions for each week will run # from the biz week start hour on Sun to the biz week close hour # on Friday. Otherwise, the alignment positions for the first # week will begin from the biz week start hour on Sun if the first # data position is on Sun, or from the midnight 00:00 of the day # if the first data position is on non-Sun; the alignment positions # for the last week will end at the biz week close hour on Fri # if the last data position is on Fri, or end at the midnight # 00:00 of the next day if the last data position is on non-Fri # Whether the boundary positions will be included is controlled by # start.include and close.include if(!is(ts, "timeSeries")) stop("ts must be a timeSeries object") # utility functions week.align = function(ts.w, find.start = T, find.end = T, weekstart.hour, weekclose.hour, by, k.by, how) { # ts.w: timeSeries data within one week (sun-fri) # find.start, find.end: logical, if this week is the 1st or last week # or both (one-week only) or neither (mid-weeks) findStartpos = function(firstpos, weekstart.hour) { # function takes a first timeDate position of one week data # returns the align start.pos of the week: the weekstart.hour on # Sun if the week data starts from Sun, or the 00:00 of the first # day in the week data origformat = firstpos@format firstpos@format = "%m/%d/%Y" if(wdydy(firstpos)[,"weekday"] == 0) # the week starts from Sun start.pos = timeDate(paste(as.character(firstpos), weekstart.hour), format = origformat, zone=firstpos@time.zone) else start.pos = timeDate(as.character(firstpos), format = origformat, zone=firstpos@time.zone) start.pos } findEndpos = function(lastpos, weekclose.hour) { # function takes a last timeDate position of one week data # returns the align end.pos of the week: the weekclose.hour on # Fri if the week data ends on Fri, or the 00:00 of the day after # the last day in the week data origformat = lastpos@format lastpos@format = "%m/%d/%Y" if (wdydy(lastpos)[,"weekday"] == 5) #the week ends on Fri end.pos = timeDate(paste(as.character(lastpos), weekclose.hour), format = origformat, zone = lastpos@time.zone) else end.pos = timeDate(as.character(lastpos), format = origformat, zone=lastpos@time.zone) +1 end.pos } pos.w = positions(ts.w) rng.w = range(pos.w) if (find.start) start.pos = findStartpos(rng.w[1], weekstart.hour) else #the week align start.pos is the weekstart.hour on Sunday of the week start.pos = timeDate(paste(as.character(timeSeq(from = rng.w[1], to = rng.w[1], by = "weeks", align.by = T, extend = T, week.align = "Sun", format = "%m/%d/%Y")[1]), weekstart.hour), zone=pos.w@time.zone) start.pos@time.zone = "GMT" if (find.end) end.pos = findEndpos(rng.w[2], weekclose.hour) else #the week align end.pos is the weekclose.hour on Friday of the week end.pos = timeDate(paste(as.character(timeSeq(from = rng.w[2], to = rng.w[2], by = "weeks", align.by = T, extend = T, week.align = "Sat", format = "%m/%d/%Y")[2]-1), weekclose.hour), zone=pos.w@time.zone) end.pos@time.zone = "GMT" align.pos = timeSeq(from = start.pos, to = end.pos, by = by, k.by = k.by, format = pos.w@format, zone = "GMT") align.pos@time.zone = pos.w@time.zone return(align(ts.w, pos = align.pos, how = how, error.how = "nearest")) } # cleaning observations outside the bizweek boundary hours, e.g. late Fri, Sat, and early Sun ts = FxBizWeekOnly(ts, bizweek.hours = bizweek.hours, start.include = start.include, close.include = close.include) weekstart.hour = bizweek.hours[1] weekclose.hour = bizweek.hours[2] origpos = positions(ts) rng = range(origpos) # call cut to slice each week data and apply the week.align() pos = timeSeq(from = rng[1], to = rng[2], by = "weeks", align.by = T, extend = T, week.align = "Sat") poscut = as.integer(cut(origpos, pos, include.lowest=T)) poscats = unique(poscut) n.weeks = length(poscats) ini.idx = match(poscats, poscut) last.idx = c(ini.idx[-1], length(poscut)+1)-1 if(fullweek) return(do.call("rbindtimeSeries", lapply(1:length(poscats), function(i, ts, ini.idx, last.idx, func.weekalign, weekstart.hour, weekclose.hour, by, k.by, how) { idx = ini.idx[i]:last.idx[i] func.weekalign(timeSeries(sub(ts@data,idx,), pos=positions(ts)[idx]), find.start = F, find.end = F, weekstart.hour=weekstart.hour, weekclose.hour = weekclose.hour, by = by, k.by = k.by, how = how) }, ts, ini.idx, last.idx, week.align, weekstart.hour, weekclose.hour, by, k.by, how))) if(n.weeks < 2){ # only 1 week of data # both align start and end positions to be determined return(week.align(ts, find.start = T, find.end = T, weekstart.hour=weekstart.hour, weekclose.hour = weekclose.hour, by = by, k.by = k.by, how = how)) } else{ # 2 or more weeks of data # the align start.pos of the 1st week and # the align end.pos of the last week to be determined # the 1st week ans.1stw = week.align(ts[ini.idx[1]:last.idx[1],], find.start = T, find.end = F, weekstart.hour=weekstart.hour, weekclose.hour = weekclose.hour, by = by, k.by = k.by, how = how) # the last week ans.lastw = week.align(ts[rev(ini.idx)[1]:rev(last.idx)[1], ], find.start = F, find.end = T, weekstart.hour=weekstart.hour, weekclose.hour = weekclose.hour, by = by, k.by = k.by, how = how) if(n.weeks > 2){ # there are mid weeks return(do.call("rbindtimeSeries", c(list(ans.1stw), lapply(2:(length(poscats)-1), function(i, ts, ini.idx, last.idx, func.weekalign, weekstart.hour, weekclose.hour, by, k.by, how) { idx = ini.idx[i]:last.idx[i] func.weekalign(timeSeries(sub(ts@data,idx,), pos=positions(ts)[idx]), find.start = F, find.end = F, weekstart.hour=weekstart.hour, weekclose.hour = weekclose.hour, by = by, k.by = k.by, how = how) }, ts, ini.idx, last.idx, week.align, weekstart.hour, weekclose.hour, by, k.by, how), list(ans.lastw)))) } else return(do.call("rbindtimeSeries", list(ans.1stw, ans.lastw))) } stop("Unlikely things happened! DEBUG!") } #################################################################################### # Taking difference for HF timeSeries within each day, e.g. stock exchange price data # by removing the first observation of differenced data for each day ############################################################################# diff.withinDay = function(ts, bizhrs.cleaned = F, exch.hours = c("9:30", "16:00"), start.include = T, close.include =T) { # bizhrs.cleaned: a logical if the data has been cleaned for observations outside the daily trading hours # exch.hours: ignored if bizhrs.cleaned = T # a two-element character vector in the format of "HH:MM", # specifying the starting hour and minute of an exhange # trading day (the first element) and the closing hour and minute # of the trading day (the second element) # # Whether the boundary positions will be included is controlled by # start.include and close.include (ignored if bizhrs.cleaned = T) if(!is(ts, "timeSeries")) stop("ts must be a timeSeries object") # loading the FinMetrics module, since the diff() function only works # properly under the FinMetrics module module(finmetrics, first = F) # cleaning observations outside the boundary hours, e.g. exchange hours if(!bizhrs.cleaned) ts = ExchangeHoursOnly(ts, exch.hours = exch.hours, start.include = start.include, close.include = close.include) # taking difference ts.diff = diff(ts) pos.diff = positions(ts.diff) # build an aux time series with row seq as data # then call aggregateSeries to find out the row num for the 1st obs each day ts.aux = timeSeries(data = seq(along = pos.diff), pos = pos.diff) idx.1st = seriesData(aggregateSeries(ts.aux, by = "days", FUN=function(x) {x[1]}, together = T)) if (nrow(idx.1st) < 2) # only one day is involved # no need to proceed return(ts.diff) # the row num for the 1st obs each day since the 2nd day idx.1st = idx.1st[-1, ] timeSeries(sub(ts.diff@data, -idx.1st,), pos = pos.diff[-idx.1st]) } ############################################################################# # Taking difference for HF timeSeries within each biz week, e.g. Fx price data # by removing the first observation of differenced data for each week ############################################################################### diff.withinWeek = function(ts, bizhrs.cleaned = F, bizweek.hours = c("22:00", "22:00"), start.include = T, close.include =T) { # bizhrs.cleaned: a logical if the data has been cleaned for observations outside the biz week hours # bizweek.hours: ignored if bizhrs.cleaned = T # a two-element character vector in the format of "HH:MM", # specifying the starting hour and minute of a new # biz week on Sunday (the first element) and the closing hour and minute # of the biz week on Friday (the second element) # # Whether the boundary positions will be included is controlled by # start.include and close.include (ignored if bizhrs.cleaned = T) if(!is(ts, "timeSeries")) stop("ts must be a timeSeries object") # loading the FinMetrics module, since the diff() function only works # properly under the FinMetrics modlue module(finmetrics, first = F) # cleaning observations outside the bizweek boundary hours, e.g. late Fri, Sat, and early Sun if(!bizhrs.cleaned) ts = FxBizWeekOnly(ts, bizweek.hours = bizweek.hours, start.include = start.include, close.include = close.include) # taking difference ts.diff = diff(ts) pos.diff = positions(ts.diff) # build an aux time series with row seq as data # then call aggregateSeries to find out the row num for the 1st obs each week ts.aux = timeSeries(data = seq(along = pos.diff), pos = pos.diff) idx.1st = seriesData(aggregateSeries(ts.aux, by = "weeks", week.align = "Sat", FUN=function(x) {x[1]}, together = T)) if (nrow(idx.1st) < 2) # only one week is involved # no need to proceed return(ts.diff) # the row num for the 1st obs each week since the 2nd week idx.1st = idx.1st[-1, ] timeSeries(sub(ts.diff@data, -idx.1st,), pos = pos.diff[-idx.1st]) } ############################################### # a function to excerpt data within exchange trading hours # Whether the boundary positions will be included is controlled by # start.include and close.include ############################################### ExchangeHoursOnly = function(ts, exch.hours = c("9:30", "16:00"), start.include = T, close.include =T) { # a function to excerpt data within exchange trading hours # Whether the boundary positions will be included is controlled by # start.include and close.include # exch.hours: a two-element character vector in the format of "HH:MM", # specifying the starting hour and minute of an exhange # trading day (the first element) and the closing hour and minute # of the trading day (the second element) if(!is(ts, "timeSeries")) stop("ts must be a timeSeries object") exch.hrsSpan = timeSpan(exch.hours, in.format = "%H:%M[:%S[:%N]]", format = "%02H%02M%02S%03N") if(any(is.na(exch.hrsSpan))) stop("exch.hours must be in the format of HH:MM:SS!") # boundary hours bound.times = sort(as.numeric(as.character(exch.hrsSpan))) start.time = bound.times[1] close.time = bound.times[2] pos = positions(ts) hms.list = .Call("time_to_hour_min_sec", pos, timeZoneList(), COPY = c(F, F), CLASSES = c("timeDate", "list")) pos.time = hms.list[[1]]*10^7 + hms.list[[2]]*10^5 + hms.list[[3]]*10^3 + hms.list[[4]] if(start.include) if(close.include) idx = (pos.time >= start.time) & (pos.time <= close.time) else idx = (pos.time >= start.time) & (pos.time < close.time) else if(close.include) idx = (pos.time > start.time) & (pos.time <= close.time) else idx = (pos.timie > start.time) & (pos.time < close.time) weekdays = .Call("time_to_weekday", pos, timeZoneList(), COPY = c(F, F), CLASSES = c("timeDate", "list")) idx = which(idx & !(weekdays == 0 | weekdays == 6)) timeSeries(sub(ts@data, idx,), pos = pos[idx]) } ############################################### # a function to excerpt data within the Fx bizweek # Whether the boundary positions will be included is controlled by # start.include and close.include ############################################### FxBizWeekOnly = function(ts, bizweek.hours = c("22:00", "22:00"), start.include = T, close.include =T) { # bizweek.hours: a two-element character vector in the format of "HH:MM", # specifying the starting hour and minute of a new # biz week on Sunday (the first element) and the closing hour and minute # of the biz week on Friday (the second element) if(!is(ts, "timeSeries")) stop("ts must be a timeSeries object") bizweek.hrsSpan = timeSpan(bizweek.hours, in.format = "%H:%M[:%S[:%N]]", format = "%02H%02M%02S%03N") if(any(is.na(bizweek.hrsSpan))) stop("bizweek.hours must be in the format of HH:MM:SS!") # boundary hours bound.times = as.numeric(as.character(bizweek.hrsSpan)) sundayStart.time = bound.times[1] fridayClose.time = bound.times[2] pos = positions(ts) hms.list = .Call("time_to_hour_min_sec", pos, timeZoneList(), COPY = c(F, F), CLASSES = c("timeDate", "list")) pos.time = hms.list[[1]]*10^7 + hms.list[[2]]*10^5 + hms.list[[3]]*10^3 + hms.list[[4]] # index for Fridays, Saturdays and Sundays weekdays = .Call("time_to_weekday", pos, timeZoneList(), COPY = c(F, F), CLASSES = c("timeDate", "list")) Fri.idx = weekdays == 5 Sat.idx = weekdays == 6 Sun.idx = weekdays == 0 if(start.include){ earlySun.idx = (pos.time < sundayStart.time) & Sun.idx if(close.include) lateFri.idx = (pos.time > fridayClose.time) & Fri.idx else lateFri.idx = (pos.time >= fridayClose.time) & Fri.idx } else { earlySun.idx = (pos.time <= sundayStart.time) & Sun.idx if(close.include) lateFri.idx = (pos.time > fridayClose.time) & Fri.idx else lateFri.idx = (pos.time >= fridayClose.time) & Fri.idx } idx = which(!(lateFri.idx | Sat.idx | earlySun.idx)) timeSeries(sub(ts@data, idx,), pos = pos[idx]) } ############################################### # a function to give out a logical vector indicate # if the positions (of timeSeries) are between given start and end positions ############################################### is.tsBW = function(x, start.pos, close.pos, start.include = T, close.include = T) { # x: timeDate or timeSeries # start.pos, and close.pos: each is a one-element timeDate object if(!is(x, "timeSeries") & !is(x,"timeDate") ) stop("x must be a timeSeries or timeDate object") if(!is(start.pos, "timeDate") | !is(close.pos, "timeDate") | length(start.pos) > 1 | length(close.pos) > 1) stop("start.pos and close.pos must be one-element timeDate object") if(is.na(start.pos) | is.na(close.pos)) stop("start.pos and close.pos can not be NA") if(is(x, "timeSeries")) pos = x@positions else pos = x if(start.include) if(close.include) ans = (pos >= start.pos) & (pos <= close.pos) else ans = (pos >= start.pos) & (pos < close.pos) else if(close.include) ans = (pos > start.pos) & (pos <= close.pos) else ans = (pos > start.pos) & (pos < close.pos) ans } ############################################### # a function to subscribe the part of a timeSeries # between the given start and end positions. If no observation # is within the specified interval, a NULL is returned and a warning # is reported. ############################################### tsBW = function(x, start.pos, close.pos, start.include = T, close.include = T) { # x: timeSeries # start.pos, and close.pos: each is a one-element timeDate object if(is(x, "timeSeries")) pos = x@positions else stop("x must be a timeSeries object") foundIdx = is.tsBW(x, start.pos = start.pos, close.pos = close.pos, start.include = start.include, close.include = close.include) if(any(foundIdx)) # there are some observations withing the interval return(timeSeries(sub(x@data, foundIdx,), pos = pos[foundIdx])) else{ warning("No observation is within the interval.") return(NULL) } } ############################################### # a function to plot intraday data by days. # useful for exploratory analysis and data diagnosis # ############################################### plotByDays = function(ts, coltoplot=1, daysToPlot = NULL, days.max = NULL, ...) { # ts: timeSeries to plot # coltoplot: column(s) to plot (column number, name, or logical vector allowed) # daysToPlot: a character vector specifying which days to plot. The format of days is # "m/d/Y". If daysToPlot is provided, days.max is ignored. # days.max: if daysToPlot is not given, the function is to plot data by days in order. # if days.max is given, it plots the first days.max days. Otherwise, the function # plots all days in data. # ...: other arguments passed to the plot function # If a day does not have a minimum of 2 observations to plot, a warning will be issued. if(is(ts, "timeSeries")) pos = ts@positions else stop("ts must be a timeSeries object") pos@format = "%02m/%02d/%Y" poschar = as.character(pos) days.data = unique(poschar) if(!is.null(daysToPlot)){ # user specified the days to plot n.daystoplot = length(daysToPlot) if(is(daysToPlot, "timeDate")) daysToPlot@format = pos@format else daysToPlot = timeDate(daysToPlot, zone = pos@time.zone, format = pos@format) daysToPlot = as.character(daysToPlot) } else{ # plot the days in data in order daysToPlot = days.data if(!is.null(days.max)) # plot the first days.max days n.daystoplot = min(length(days.data), days.max) else # plot all days in order n.daystoplot = length(days.data) } tmp = lapply(1:n.daystoplot, function(i, daysToPlot, poschar, ts, coltoplot, ...) { idx = poschar == daysToPlot[i] if (sum(idx) < 2) warning(paste(daysToPlot[i], "does not have the minimum of 2 observations in data to plot")) else plot(timeSeries(sub(ts@data,idx,coltoplot), pos=positions(ts)[idx]), plot.args=list(type="p"), x.axis.args=list(time.of.day.style="24:00"), ...) return() }, daysToPlot, poschar, ts, coltoplot, ...) return() } ############################################# ## a function to rbind timeSeries ############################################ rbindtimeSeries = function(...) { #...: any number of timeSeries objects, should be in the same class, ncol, variabls, etc args = list(...) if(!all(sapply(args, is, "timeSeries"))) stop("All arguments must be timeSeries") isDataVector = is.vector(seriesData(args[[1]])) if(isDataVector) newdata = do.call("c", lapply(args, FUN = seriesData)) else newdata = do.call("rbind", lapply(args, function(ts) { tsData = seriesData(ts) if(is(tsData, "matrix")){ rowIds(tsData) = character(0) return(tsData) } else { # data.frame attr(tsData, "dup.row.names") = T return(tsData) } })) newpos = do.call("concat", lapply(args, FUN = positions)) ans = timeSeries(newdata, pos = newpos) ans } ################################################################################################## ### a function to reorder a timeSeries according to its positions ################################################################################################### reorderTS = function(x) { x.pos = positions(x) x.data = seriesData(x) pos.order = order(x.pos) x.pos = x.pos[pos.order] x.data = sub(x.data, pos.order,) if (!is.null(rowIds(x.data))) { x.len = nrow(x.data) rowIds(x.data) = as.character(1:x.len) } ans = timeSeries(x.data, pos=x.pos) ans } ################################################################################## # A function to smooth data in table form (data.frame or matrix) across columns ################################################################################ tableSmoother = function(y, FUN, ...) { # the function is to smooth data in table form (data.frame or matrix) across columns # e.g. rows are timeblocks and cols are different days and the tableSmoother smooth # data across days. # the table form data can be obtained by, e.g. aggregateSeries(..., drop.empty=F) # or by reshape long vector of muti-days data into table form # y: data.frame or matrix, NA allowed # FUN: smoothing function to use. choices are "mean", "median", "loess.smooth", # or "supsmu" # ...: any other arguments to be passed to loess.smooth or supsmu # VALUE: a smoothed vector if( (!is.matrix(y))&(!is.data.frame(y))) stop (" Data has to be in table form, either data.frame or matrix!") if (is.null(switch(FUN, mean=0, median=0, loess.smooth=0, supsmu=0))) stop (" The smoothing function has to be one of mean, median, loess.smooth, or supsmu!") if (FUN=="mean") ans = as.vector(rowMeans(y, na.rm=T)) if (FUN=="median") ans = as.vector(apply(as.matrix(y), 1, FUN=FUN, na.rm=T)) if ( (FUN=="loess.smooth")|(FUN=="supsmu")) { n = nrow(y) y = as.vector(as.matrix(y)) x = rep(1:n, length= length(y)) na.idx = is.na(y) y = y[!na.idx] x = x[!na.idx] if (FUN=="loess.smooth") ans = loess.smooth(x, y, evaluation = n, ...)$y else ans = supsmu(x, y, ...)$y } ans } ################################################################################## ## A function to compute volatility (std.dev) of a given timeSeries for a specified timeinterval using aggregaeSeries ## The function return a dataframe with dimensions of block times and days ############################################################################# VTable = function(x.ts, by="minutes", k.by, overlapping=T) { # x.ts: the one-variable timeSeries data # by: time interval units # k.by: time interval length in "by" # overlapping: logical flag, if true, compute half-interval overlapping volatility pos = positions(x.ts) x.timeZone = pos@time.zone # Prepare colIds for the Volatility table pos@format = "%b.%02d.%Y" table.colnames = unique(as.character(pos)) # create an abbreviation of time units time.unit.abbrev = switch(by, minutes="m", seconds="s", hours="h") # create timeDate of days in data pos@format = "%02m/%02d/%Y" x.days=unique(as.character(pos)) smpl.blocks = timeDate(x.days, zone=x.timeZone) numDays=length(smpl.blocks) # Safeguard feature: write each day results onto disk(working directory) with temp.names, then cbind, and finally remove those # intermediate objects # create temp.names temp.names = paste("day", 1:numDays,"ans",sep="") for (i in 1:numDays) { # Extract the ith day data smpl = (pos >=smpl.blocks[i]) & (pos < (smpl.blocks[i]+1)) y = x.ts[smpl,] # construct the ith day aggregation positions (starting from t0) aggPos.1 = timeSeq(from= smpl.blocks[i], to= (smpl.blocks[i]+1), by=by, k.by=k.by, zone=x.timeZone) aggPos.1 = aggPos.1[!(aggPos.1 >= (smpl.blocks[i]+1))] # make sure all aggregation positions are within the ith day # volitility computing aggY = aggregateSeries(y, pos=aggPos.1, FUN=colStdevs, drop.empty=F) # computing overlapping volatility if desired, if (overlapping) { # construct the ith day aggregation positions (starting from t0+ half aggregation interval) aggPos.2 = aggPos.1 + timeSpan(paste(k.by, time.unit.abbrev, sep=""))/2 aggPos.2 = aggPos.2[!(aggPos.2 >= (smpl.blocks[i]+1))] # make sure all aggregation positions are within the ith day # data adjustment idx = positions(y) >= aggPos.2[1] y = y[idx] # volitility computing aggY.2 = aggregateSeries(y, pos=aggPos.2, FUN=colStdevs, drop.empty=F) # concatenate two aggregated series and reorder aggY = adhereTS(aggY, aggY.2) aggY= reorderTS(aggY) } # Prepare the ith talbe column aggData = seriesData(aggY) colIds(aggData)= table.colnames[i] pos.Y = positions(aggY) pos.Y@format = "%02H:%02M:%02S" rowIds(aggData)= as.character(pos.Y) #write ith day results into working directory with temp.names[i] assign(temp.names[i], aggData, where=1) } # Construct the table for (i in 1:numDays) { if (i==1) ans = get(temp.names[i], where=1) else ans = cbind(ans, get(temp.names[i], where=1)) } ans = as.data.frame(ans) # finalize the operation and remove intermediate objects remove(temp.names, where=1) ans } ############################################################################ # A function to aggregate B/S trade type dummy variables # over a certain interval/a certain number of trades proceeding the most recent # quote # ################################################################################## aggregateTradeTypes=function(trdDummy.ts, quotes.ts, by="seconds",k.by=20) { # trdDummy.ts: a timeSeries object with B/S dummy at the data slot # quotes.ts: positions of qutoes # by: "seconds", "minutes", "hours", or "trades" # k.by: number of time units or trades trades.pos = positions(trdDummy.ts) trdDummy = seriesData(trdDummy.ts) quotes.pos = positions(quotes.ts) quotes.len = length(quotes.pos) ans = matrix(NA, quotes.len, 1) if (by!="trades") { unitType = switch(by, seconds="s", minutes="m", hours="h") agg.span=timeSpan(paste(k.by, unitType, sep="")) quotes.pos.lag = quotes.pos - agg.span for ( i in 1:quotes.len) { smpl=(trades.pos >= quotes.pos.lag[i])&(trades.pos < quotes.pos[i]) if (sum(smpl)==0) next ans[i,1]=sum(trdDummy[smpl,]) } } else # aggregation by trades { for (i in 1:quotes.len) { smpl= trades.pos < pos[i] n= sum(smpl) ans[i,1]=sum(trdDummy[(n-k.by+1):n,]) } } ans=timeSeries(as.data.frame(ans), pos=quotes.pos) ans } ################################################################################### # A function to plot the stats with 95% upper and lower bounds ################################################################################# cusumPlot=function(x) { # Function to plot the cusum stats with upper and lower bounds # x: one-column timeSeries object of cusum stats n=nrow(x) factor=0.948*sqrt(n) pos=positions(x) y=cbind(seriesData(x), seq(factor,3*factor,length=n),seq(-factor,-3*factor,length=n)) colIds(y)=c("CUMSUM", "upper","lower") y=timeSeries(y,pos=pos) invisible(plot(y, plot.args=list(lty=c(1,2,2),col=c(2,1,1)),main=colIds(x),reference.grid=F)) } ############################################################################################################## # A function to compute the duration (life) of cusum stats # i.e. the duration of cusum stats before first time hitting the boundary ############################################################################################################## cusumLife=function(x) { # Function to compute the duration (life) in minutes before the model parameters estimates # give ill behaved filtering and forecasting (cusum stats hitting the 95% bounds) # x: one-column timeSeries object of cusum stats n=nrow(x) pos=positions(x) factor=0.948*sqrt(n) viol.idx=abs(seriesData(x)) > seq(factor,3*factor,length=n) if (sum(viol.idx)!=0) # violation occured { pos.viol=pos[viol.idx] life=pos.viol[1]-pos[1] } else # violation never occured in the testing sample life=pos[n]-pos[1] # Transform "life" from timespan class to numeric with the unit of minutes life@format="%s" life.min=as.numeric(as.character(life))%/%60 life.min } ######################################################################################3 # A function to compute time series of durations within specified intervals, daily or weekly. ################################################################################## DurationInInterv=function(x, units="seconds", interv.type = "daily", bound.hours = c("9:30", "16:00")) { # compute time series of durations within specified intervals, daily or weekly. # # x: a timeseries object or a timeDate object # units: durations can be either expressed in units of seconds, milliseconds, or minutes, the default is by seconds. # interv.type: "daily" or "weekly" for exchange based trading data and Fx type data respectively. # bound.hours: a two-element character vector in the format of "HH:MM" # the start and closing hours of the interval. In case of daily interval, # they are the start and closing hours of the exchange. In case of weekly interval, they # are the start hour of the biz week on Sunday and the closing hour of the biz week on # Friday. if(!is(x, "timeSeries") & !is(x,"timeDate") ) stop(paste(deparse(substitute(x)), "must be a timeSeries or timeDate object")) if(is(x, "timeSeries")) pos = positions(x) else pos = x ts.tmp = timeSeries(data = data.frame(pos), pos = pos) if (units == "seconds") factor = 86400 else if (units == "milliseconds") factor = 86400000 else if (units == "minutes") factor = 1440 else stop ("Duration units must be one of seconds, milliseconds, or minutes") if (interv.type == "daily") ans = diff.withinDay(ts = ts.tmp, exch.hours = bound.hours, start.include = T, close.include = T) else if (interv.type == "weekly") ans = diff.withinWeek(ts = ts.tmp, bizweek.hours = bound.hours, start.include = T, close.include = T) else stop ("interv.type must be one of daily or weekly") # the duration from differencing is in fraction of day, # adjust for desired units. ans = round(ans * factor) colIds(ans) = paste("Duration", "in", units, sep=".") return(ans) } ###################################################################################### # A function to compute time series of price changes (in approriate ticks, if desired) within # specified intervals, daily or weekly. ################################################################################## PriceChgInInterv=function(x, ticksize = NULL, interv.type = "daily", bound.hours = c("9:30", "16:00")) { # compute time series of price changes (in approriate tick sizes, if desired) # within specified intervals, daily or weekly. # # x: a timeseries of price levels # ticksize: the default is NULL to simply compute price change in original price levels. # If ticksize is supplied, the price changes will be divided by the ticksize to compute # price changes in ticks. # interv.type: "daily" or "weekly" for exchange based trading data and Fx type data respectively. # bound.hours: a two-element character vector in the format of "HH:MM" # the start and closing hours of the interval. In case of daily interval, # they are the start and closing hours of the exchange. In case of weekly interval, they # are the start hour of the biz week on Sunday and the closing hour of the biz week on # Friday. if(!is(x, "timeSeries")) stop(paste(deparse(substitute(x)), "must be a timeSeries object")) if (interv.type == "daily") ans = diff.withinDay(ts = x, exch.hours = bound.hours, start.include = T, close.include = T) else if (interv.type == "weekly") ans = diff.withinWeek(ts = x, bizweek.hours = bound.hours, start.include = T, close.include = T) else stop ("interv.type must be one of daily or weekly") # compute price changes in ticks if required if (!is.null(ticksize)) ans = round(ans / ticksize) return(ans) } ######################################################################################### # The function extends the aggregatesSeries function in S-Plus for high-frequency data # by fixing the start and end aggregation positions based on trading conventions: # daily business hours for exchange trading, and bizweek for Fx trading. ############################################################################################## aggregateSeriesHF = function(ts, interv.type = "daily", bound.hours = c("9:30", "16:00"), by, k.by, FUN, adj, drop.empty = F, ...) { # The function extends the aggregatesSeries function in S-Plus for high-frequency data # by fixing the start and end aggregation positions based on trading conventions: # daily business hours for exchange trading, and bizweek for Fx trading. # interv.type: "daily" or "weekly" for exchange based trading data and Fx type data respectively. # bound.hours: a two-element character vector in the format of "HH:MM" # the start and closing hours of the interval. In case of daily interval, # they are the start and closing hours of the exchange. In case of weekly interval, they # are the start hour of the biz week on Sunday and the closing hour of the biz week on # Friday. # FUN: aggregation function to be applied to data in each block # adj: 0 or 1, adjusting the aggregated data toward the lower end of aggregation block or upper end. # ...: other parameters to be passed to aggregateSeries() if(!is(ts, "timeSeries")) stop(paste(deparse(substitute(ts)), "must be a timeSeries object")) if((adj!=0) & (adj!=1)) stop("adj must be either 0 or 1") # utility functions for Fx data findStartpos = function(firstpos, weekstart.hour) { # function takes a first timeDate position of data # returns the start.pos of the week: the weekstart.hour on # Sun origformat = firstpos@format firstpos@format = "%m/%d/%Y" if(wdydy(firstpos)[,"weekday"] == 0) # the week starts from Sun start.pos = timeDate(paste(as.character(firstpos), weekstart.hour), format = origformat, zone=firstpos@time.zone) else{ pos.tmp = timeSeq(from = firstpos, to = firstpos, by = "weeks", align.by = T, extend = T, week.align = "Sun", format = "%m/%d/%Y")[1] start.pos = timeDate(paste(as.character(pos.tmp), weekstart.hour), format = origformat, zone=firstpos@time.zone) } start.pos } findEndpos = function(lastpos, weekclose.hour) { # function takes a last timeDate position of data # returns the end.pos of the week: the weekclose.hour on # Fri origformat = lastpos@format lastpos@format = "%m/%d/%Y" if (wdydy(lastpos)[,"weekday"] == 5) #the week ends on Fri end.pos = timeDate(paste(as.character(lastpos), weekclose.hour), format = origformat, zone = lastpos@time.zone) else{ pos.tmp = timeSeq(from = lastpos, to = lastpos, by = "weeks", align.by = T, extend = T, week.align = "Fri", format = "%m/%d/%Y")[2] end.pos = timeDate(paste(as.character(pos.tmp), weekclose.hour), format = origformat, zone = lastpos@time.zone) } end.pos } start.hour = bound.hours[1] close.hour = bound.hours[2] if (interv.type == "daily"){ # exchange based data # clean data ts = ExchangeHoursOnly(ts, exch.hours = bound.hours) origpos = positions(ts) rng = range(origpos) # Figure out the start and end positions for aggregation pos.tmp = rng[1] pos.tmp@format = "%m/%d/%Y" start.pos = timeDate(paste(as.character(pos.tmp), start.hour), format = origpos@format, zone= origpos@time.zone) start.pos@time.zone = "GMT" pos.tmp = rng[2] pos.tmp@format = "%m/%d/%Y" end.pos = timeDate(paste(as.character(pos.tmp), close.hour), format = origpos@format, zone= origpos@time.zone) end.pos@time.zone = "GMT" newpos = timeSeq(from = start.pos, to = end.pos, by = by, k.by = k.by, format = origpos@format, zone="GMT") newpos@time.zone = origpos@time.zone # eliminate those off hours and weekend aggregation blocks aux.ts = timeSeries(data = seq(along= newpos), pos = newpos) aux.ts = ExchangeHoursOnly(aux.ts, exch.hours = bound.hours) newpos = positions(aux.ts) ans = aggregateSeries(ts, pos = newpos, FUN=FUN, adj = adj, drop.empty = drop.empty,...) if(adj ==1) ans = ExchangeHoursOnly(ans, exch.hours = bound.hours, start.include = F) else ans = ExchangeHoursOnly(ans, exch.hours = bound.hours, close.include = F) } else if (interv.type == "weekly"){ # Fx type data # clean data ts = FxBizWeekOnly(ts, bizweek.hours = bound.hours) origpos = positions(ts) rng = range(origpos) start.pos = findStartpos(rng[1], start.hour) start.pos@time.zone = "GMT" end.pos = findEndpos(rng[2], close.hour) end.pos@time.zone = "GMT" newpos = timeSeq(from = start.pos, to = end.pos, by = by, k.by = k.by, format = origpos@format, zone = "GMT") newpos@time.zone = origpos@time.zone # eliminate those weekend aggregation blocks aux.ts = timeSeries(data = seq(along= newpos), pos = newpos) aux.ts = FxBizWeekOnly(aux.ts, bizweek.hours = bound.hours) newpos = positions(aux.ts) ans = aggregateSeries(ts, pos = newpos, FUN = FUN, adj = adj, drop.empty = drop.empty,...) if (adj ==1) ans = FxBizWeekOnly(ans, bizweek.hours = bound.hours, start.include = F) else ans = FxBizWeekOnly(ans, bizweek.hours = bound.hours, close.include = F) } else stop ("interv.type must be one of daily or weekly") ans } ######################################################################################### # A function to linearly interplate the values of daily deterministic components in paramters at the positions of # data obersvations. ############################################################################################## DetermInterp= function(data.pos, daily.determ) { # data.pos: a timeDate object, from the positions of data time series # daily.determ: a timeSeries object of daily deterministic patterns of a set of parameters, # typically output from certain smoother across days. In fact, the date information of this time # series is ignored and the intra-day time positions are actually used. Each column of the timeSeries # corresponds to a particular parameter # A critical assumption is that the daily deterministic pattern covers the positions range for each day data # output: a timeSeries object with data.pos as postitions and # linearly interplated daily deterministic values at those postions # identify the days involved in data observations data.pos@format = "%02m/%02d/%Y" data.days = unique(as.character(data.pos)) numDays = length(data.days) # identify the time blocks in daily deterministic components determ.pos = positions(daily.determ) determ.pos@format = "%02H:%02M:%02S" determ.blocks = unique(as.character(determ.pos)) numBlocks = length(determ.blocks) # record data and determ timezones info data.pos.timeZone = data.pos@time.zone determ.pos.timeZone = determ.pos@time.zone if (data.pos.timeZone != determ.pos.timeZone) stop ("Data observations and daily deterministic patterns must be in the same time zones!") # Create a new determ series to match days in data observations pos.strings = paste(rep(data.days, each=numBlocks), rep(determ.blocks, numDays), sep=" ") determ.newpos = timeDate(pos.strings, zone = data.pos.timeZone) para.names = colIds(daily.determ) determ.data = seriesData(daily.determ) determ.newdata = matrix(rep(as.vector(t(as.matrix(determ.data))), numDays), by.row=T, ncol=length(para.names)) colIds(determ.newdata) = para.names determ.newdata = as.data.frame(determ.newdata) determ.newSeries = timeSeries(determ.newdata, pos= determ.newpos) # linearly interplate deterministic components values for data positions ans = align(determ.newSeries, pos= data.pos, how="interp", error.how="nearest") ans } ############################################################################### # A function to compute the number of NAs between two adjecent non-NA quotes ########################################################################## numNAs = function(x) { # x: a timeseries of quotes with NAs # output: a timeseries of the number of NAs between two adjecent # non-NA quotes if(!is(x, "timeSeries")) stop("x must be a timeSeries object") numrow=nrow(x) pos=positions(x) temp=as.matrix(seq(1,numrow, by=1)) na.idx=is.na(seriesData(x)) temp.nonna=temp[!na.idx,1,drop=F] pos.nonna=pos[!na.idx] ans=diff(temp.nonna)-1 colIds(ans)="NumNAs" ans=timeSeries(as.data.frame(ans), pos=pos.nonna[-1]) ans } ############################################################################ # A function to compute NA duration, i.e. how long observations have been in NA state ############################################################################## naDuration = function (x.ts) { # The function taks a time series of observations with NAs and return a time series # of equal length (and the same timedate position) of the input. The values of the resulting # timeseries indicate that at each time position, how long (duration) observations have been in # NA state ## E.x. # Input Positions:P1 P2 P3 P4 P5 P6 P7 P8 P9 P10 P11 P12 # Value: nonNA NA nonNA NA NA nonNA NA NA NA NA nonNA nonNA # Output Positions:P1 P2 P3 P4 P5 P6 P7 P8 P9 P10 P11 P12 # Value: 0 0 0 0 P5-P4 0 0 P8-P7 P9-P7 P10-P7 0 0 # data info x.pos = positions(x.ts) x.data = seriesData(x.ts) x.row = nrow(x.ts) # Initialize results ans = matrix(0, x.row,1) colIds(ans) = "naDuration" # construct aux sequence number vector aux = seq(1, x.row, by=1) # identify non-na data sequency number nona.idx = !is.na(x.data) nona.seqNum = aux[nona.idx] # finishing "bracketing" nona.seqNum = c(0, nona.seqNum, x.row+1) # NA duration is only defined for those blocks #(bracketed by two nonNAs) that have two or more # consecutive NAs, find them key.seqNum = nona.seqNum[c(F,diff(nona.seqNum)>2)] key.seqNum.idx = match(key.seqNum, nona.seqNum) for (i in key.seqNum.idx) { dur.temp = duration(x.ts[(nona.seqNum[i-1]+1):(nona.seqNum[i]-1),]) ans[(nona.seqNum[i-1]+2):(nona.seqNum[i]-1),] = cumsum(as.matrix(seriesData(dur.temp))) } ans = as.data.frame(ans) ans = timeSeries(ans, pos =x.pos) }