# descriptiveStatistics.r # # script file for computing descriptive statistics # # author: Eric Zivot # created: September 18, 2008 # revised: October 13, 2008 # # Core R functions used: # # acf compute sample autocovariances or autocorrelations # apply apply function to rows or columns of matrix # args determine agruments of a function # boxplot compute boxplot # cbind combine data objects vertically # class determine class of object # colIds get column names from object # cor compute sample correlation matrix # density compute smoothed histogram # end get end date of time series # help invoke help system # hist compute histogram # legend add legend to graph # length compute column length of matrix # library load R package # mean compute sample mean # names show names of object # par set graphics parameters # plot generic plot function # points add points to a graph # qqline add line to qq-plot # qqnorm qq-plot against normal distribution # qt compute quantiles of student t distribution # rlnorm generate random data from log-normal distribution # rt generate random data from student t distribution # scale standardize a vector of data # pnorm compute normal CDF # seq generate sequence of numbers # sort sort data # start get start date of time series # stdev compute sample standard deviation # ts.plot time series plot # var compute sample variance or covariance matrix # ? invoke help system # # Rpackages used # TSA Time Series Analysis # skewness sample skewness # kurtosis sample kurtosis # acf modifies R function acf() to make plot start # at lag 1 instead of lag 0 # tseries Time series and computational finance # get.hist.quote load data from Yahoo! # zoo # plot.zoo plot zoo object # load packages library("TSA") library("tseries") library("zoo") # get monthly adjusted closing price data on MSFT and SP500 from Yahoo # using the tseries function get.hist.quote. Set sample to Jan 1998 through # Dec 2007. Note: if you are not careful with the start and end dates # or if you set the retclass to "ts" then results might look weird MSFT.prices = get.hist.quote(instrument="msft", start="1998-01-01", end="2007-12-31", quote="AdjClose", provider="yahoo", origin="1970-01-01", compression="m", retclass="zoo") class(MSFT.prices) colnames(MSFT.prices) start(MSFT.prices) end(MSFT.prices) # change date index class to yearmon # index(MSFT.prices) = as.yearmon(index(MSFT.prices)) # note: x-axis on graphs do not look good if this is done SP500.prices = get.hist.quote(instrument="^gspc", start="1998-01-01", end="2007-12-31", quote="AdjClose", provider="yahoo", origin="1970-01-01", compression="m", retclass="zoo") # change date index class to yearmon # index(SP500.prices) = as.yearmon(index(SP500.prices)) # # 2. compute cc returns # MSFT = diff(log(MSFT.prices)) SP500 = diff(log(SP500.prices)) MSFTSP500 = merge(MSFT,SP500) colnames(MSFTSP500) class(MSFT) class(SP500) class(MSFTSP500) # convert zoo data to matrix data for non time series analysis # many R functions do not have methods implemented for zoo objects # and results may be unpredictable MSFT.mat = coredata(MSFT) colnames(MSFT.mat) = "MSFT" SP500.mat = coredata(SP500) colnames(SP500.mat) = "SP500" MSFTSP500.mat = coredata(MSFTSP500) colnames(MSFTSP500) = c("MSFT","SP500") # # 3. time plots # # look at help file for plot method for zoo objects ?plot.zoo # plot individual prices plot(MSFT.prices,main="Monthly closing price of MSFT", ylab="Price") plot(SP500.prices,main="Monthly closing price of SP500", ylab="Price") # put returns on the same plot in separate panels plot(MSFTSP500, main="Monthly cc returns on MSFT and SP500") # put returns on same plot in one panel and add a horizontal line plot(MSFTSP500, plot.type="single", main="Monthly cc returns on MSFT and SP500", col = c("red", "blue")) abline(h=0) legend(x="topleft", legend=c("MSFT","SP500"), col=c("red","blue")) # # 5. create simulated iid Gaussian data with same mean and SD # as MSFT # set.seed(123) gwn = rnorm(length(MSFT),mean=mean(MSFT),sd=sd(MSFT)) par(mfrow=c(2,1)) plot(MSFT,main="Monthly cc returns on MSFT") abline(h=0) ts.plot(gwn,main="Gaussian data with same mean and sd as MSFT") abline(h=0) par(mfrow=c(1,1)) # # 6. histograms # # use hist() command # note: hist() does not have a method for objects of class zoo # must use coredata to extract data prior to using hist args(hist) ?hist hist(MSFT.mat,main="Histogram of MSFT monthly cc returns", col="slateblue1") # scale histogram so that total area = 1 hist(MSFT.mat,main="Histogram of MSFT monthly returns", probability=T) hist(gwn,main="Histogram of simulated Gaussian data", col="slateblue1") hist(SP500.mat,main="Histogram of SP500 monthly cc returns", col="slateblue1") # plot both histograms on same page # note different scales par(mfrow=c(2,1)) hist(MSFT.mat,main="Histogram of MSFT monthly returns", col="slateblue1") hist(SP500.mat,main="Histogram of SP500 monthly returns", col="slateblue1") par(mfrow=c(1,1)) par(mfrow=c(1,2)) hist(MSFT.mat,main="MSFT") hist(SP500.mat,main="SP500") par(mfrow=c(1,1)) # use same breakpoints for both histograms MSFT.breaks = hist(MSFT.mat,plot=F) MSFT.breaks par(mfrow=c(2,1)) hist(MSFT.mat,main="MSFT", col="slateblue1") hist(SP500.mat,main="SP500", col="slateblue1", breaks=MSFT.breaks$breaks) par(mfrow=c(1,1)) # Use RMetrics histPlot for zoo objects - note Trellis display # note common scale for histograms! args(histPlot) histPlot(MSFT, add.fit=F, rug=F) # # 7. smoothed histograms # # use density() command ?density args(density) MSFT.density = density(MSFT.mat) class(MSFT.density) names(MSFT.density) MSFT.density plot(MSFT.density,type="l",xlab="monthly return", col="orange", lwd=2, ylab="density estimate",main="Smoothed histogram") # put histogram and density plot on same graph hist(MSFT.mat,main="Histogram and smoothed density", col="slateblue1", probability=T) points(MSFT.density,type="l", col="orange", lwd=2) SP500.density = density(SP500) plot(SP500.density,type="l",xlab="monthly return", ylab="density estimate",main="Smoothed histogram") # combine density plots on one graph hist(SP500.mat,main="Histogram and smoothed density", probability=T) points(SP500.density,type="l") # # 8. Empirical quantiles and summary statistics # # use quantile() function ?quantile args(quantile) quantile(MSFT.mat) qnorm(p=c(0.01,0.05), mean=mean(MSFT.mat), sd=sd(MSFT.mat)) quantile(MSFT.mat,probs=c(0.01,0.05)) qnorm(p=c(0.01,0.05), mean=mean(SP500.mat), sd=sd(SP500.mat)) quantile(SP500.mat,probs=c(0.01,0.05)) # note: empirical quantiles are useful for computing value-at-risk (VaR) # compute median median(MSFT.mat) # compute interquartile range IQR quantile(MSFT.mat,probs=0.75) - quantile(MSFT.mat,probs=0.25) # use mean, var, sd, skewness and kurtosis functions mean(MSFT.mat) var(MSFT.mat) sd(MSFT.mat) skewness(MSFT.mat) kurtosis(MSFT.mat) # kurtosis function actually computes excess kurtosis kurtosis(MSFT) + 3 # note: summary is a generic function with several methods summary(MSFT.mat) # Use apply to compute statistics for columns apply(MSFTSP500.mat, 2, mean) apply(MSFTSP500.mat, 2, sd) apply(MSFTSP500.mat, 2, skewness) apply(MSFTSP500.mat, 2, kurtosis) # # 9. empirical distribution function # # compute and plot empirical distribution function for simulated gaussian data n1 = length(gwn) plot(sort(gwn),(1:n1)/n1,type="s",ylim=c(0,1), col="slateblue1", lwd=2, main="Empirical CDF of Gaussian data", ylab="#x(i) <= x") # compare empirical cdf to standard normal cdf for simulated gaussian data z1 = scale(gwn) # standardize to have mean zero and sd 1 n1 = length(gwn) F.hat = 1:n1/n1 # empirical cdf x1 = sort(z1) # sort from smallest to largest y1 = pnorm(x1) # compute standard normal cdf at x # The following plot options are used # type determine type of plot: "l" is line plot; "s" is step plot # lty line type: 1 is solid line; 3 is dot-dashed line # lwd line thickness: higher values give thicker lines # col line color: 1 is black, 2 is blue etc. # For help on plot options, see help(par) plot(x1,y1,main="Empirical CDF vs. Normal CDF for Gaussian data", type="l",lwd=2,xlab="standardized gwn",ylab="CDF") points(x1,F.hat, type="s", lty=1, lwd=3, col="orange") legend(x="topleft",legend=c("Normal CDF","Empirical CDF"), lty=c(1,1), lwd=2, col=c("black","orange")) # compare empirical cdf to standard normal cdf for MSFT returns z1 = scale(MSFT.mat) # standardize to have mean zero and sd 1 n1 = length(MSFT.mat) F.hat = 1:n1/n1 # empirical cdf x1 = sort(z1) # sort from smallest to largest y1 = pnorm(x1) # compute standard normal cdf at x plot(x1,y1,main="Empirical CDF vs. Normal CDF for MSFT returns", type="l",lwd=2,xlab="standardized MSFT returns",ylab="CDF") points(x1,F.hat, type="s", lty=1, lwd=3, col="orange") legend(x="topleft",legend=c("Normal CDF","Empirical CDF"), lty=c(1,1), lwd=c(2,3), col=c("black","orange")) # compare empirical cdf to standard normal cdf for SP500 returns z1 = scale(SP500.mat) # standardize to have mean zero and sd 1 n1 = length(SP500.mat) F.hat = 1:n1/n1 # empirical cdf x1 = sort(z1) # sort from smallest to largest y1 = pnorm(x1) # compute standard normal cdf at x plot(x1,y1,main="Empirical CDF vs. Normal CDF for SP500 returns", type="l",lwd=2,xlab="standardized SP500 returns",ylab="CDF") points(x1,F.hat, type="s", lty=1, lwd=3, col="orange") legend(x="topleft",legend=c("Normal CDF","Empirical CDF"), lty=c(1,1), lwd=c(2,3), col=c("black","orange")) # # 10. QQ plots: compare empirical quantiles to those from normal distribution # par(mfrow=c(2,2)) # 4 panel layout: 2 rows and 2 columns qqnorm(gwn, main="Gaussian data") qqline(gwn) qqnorm(MSFT.mat, main="MSFT") qqline(MSFT.mat) qqnorm(SP500.mat, main="SP500") qqline(SP500.mat) par(mfrow=c(1,1)) # Data for student t with 3 df: tails fatter than normal set.seed(123) tdata = rt(100,df=3) # Student-t with 3 df gdata = rnorm(100) # N(0,1) data xx = seq(from=-5,to=5,length=100) # data for log-normal: asymmetric distribution lndata = rlnorm(100) yy = seq(from=0, to = 7, length=100) par(mfrow=c(2,2)) # 1st plot plot(xx,dnorm(xx),type="l", lwd=2, main="Normal and Student-t with 3 df", xlab = "z, t", ylab = "pdf") points(xx,dt(xx,df=3), type="l", col="orange", lwd=3) legend(x="topright", legend=c("Normal","Student-t"), lty=c(1,1), col=c("black","orange"), lwd=c(2,3)) # 2nd plot plot(yy,dnorm(yy,mean=4,sd=1),type="l", lwd=2, ylim=c(0,0.7), main="Normal and Log-Normal", xlab = "z, exp(z)", ylab = "pdf") points(yy,dlnorm(yy), type="l", lwd=3, col="orange") legend(x="topright", legend=c("Normal","Log-Normal"), lty=c(1,1), col=c("black","orange"), lwd=c(2,3)) # 3rd plot qqnorm(tdata) qqline(tdata) # 4th plot qqnorm(lndata) qqline(lndata) par(mfrow=c(1,1)) # # 11. outliers # # create MSFT return data polluted by outlier MSFT.new = MSFT.mat colnames(MSFT.new) = "MSFT.new" MSFT.new[20] = -0.9 par(mfrow=c(2,1)) ts.plot(MSFT.new,main="Monthly cc returns on MSFT polluted by outlier") abline(h=0) hist(MSFT.new, col="slateblue1") par(mfrow=c(1,1)) # compare summary statistic for MSFT returns and returns polluted by outlier tmp = cbind(MSFT.mat, MSFT.new) apply(tmp, 2, mean) apply(tmp, 2, sd) apply(tmp, 2, skewness) apply(tmp, 2, kurtosis) # # 12. boxplots # # use boxplot() function ?boxplot args(boxplot) boxplot(MSFT.mat,outchar=T,main="Boxplot of monthly cc returns on Microsoft", ylab="monthly cc return", col="slateblue1") boxplot(SP500,outchar=T,main="Boxplot of monthly cc returns on SP500", ylab="monthly cc return") boxplot(gwn,MSFT.mat,SP500.mat,names=c("gwn","MSFT","SP500"),outchar=T, col="slateblue1", main="Comparison of return distributions", ylab="monthly cc return") # # 13. graphically summarize data (see Smith notes and Carmona book) # par(mfrow=c(2,2)) hist(MSFT.mat,main="MSFT monthly cc returns", probability=T, ylab="cc return", col="slateblue1") boxplot(MSFT.mat,outchar=T, ylab="cc return", col="slateblue1") plot(MSFT.density,type="l",xlab="cc return", col="slateblue1", lwd=2, ylab="density estimate", main="Smoothed density") qqnorm(MSFT.mat) qqline(MSFT.mat) par(mfrow=c(1,1)) # # 14. bivariate graphical summaries # # bivariate scatterplot plot(MSFT.mat,SP500.mat,main="Monthly cc returns on MSFT and SP500", col="slateblue1") abline(h=mean(SP500.mat)) # horizontal line at SP500 mean abline(v=mean(MSFT.mat)) # vertical line at MSFT mean # all pairwise scatterplots pairs(cbind(gwn,MSFT.mat,SP500.mat), col="slateblue1") # compute covariance and correlation matrix var(cbind(gwn,MSFT.mat,SP500.mat)) cor(cbind(gwn,MSFT.mat,SP500.mat)) # # 15. Time series descriptive statistics # # sample autocovariances and autocorrelations ?acf args(acf) # autocorrelations for simulated gaussian data and SP500 par(mfrow=c(3,1)) tmp=acf(gwn) tmp=acf(MSFT.mat) tmp=acf(SP500.mat) par(mfrow=c(1,1))