Effective graphs for data display

Introduction
Figure 1
Figure 2
Figure 3
Figure 4
Figure 5
Figure 6
Figure 7
Figure 8
Figure 9
Figure 10
Figure 11
Figure 12

Introduction

This page contains R code and output from the examples in the paper. In all cases, the data are random samples from the freely-available NHANES dataset, and the variables used are;

BPXSAR: systolic blood pressure (mmHg)
BPXDAR: diastolic blood pressure (mmHg)
BPXDI1, BPXDI2: two diastolic blood pressure readings
race_ethc: race/ethnicity, coded as Hispanic, White non-Hispanic, Black non-Hispanic and Other
DR1TFOLA: folate intake (\(\mu\)g/day)
gender: sex, coded as Male/Female
BMXBMI: body mass index (kg/m\(^2\))
RIDAGEY: age (years)

To run the code, the following packages must first be installed;

beeswarm, vioplot, splines, lattice: install.packages(c("beeswarm","vioplot","lattice"))
snippets: install.packages(pkgs="snippets",repos="http://www.rforge.net/")

Following this, cut-and-paste the code for each figure into an R session.

Figure 1

nhanessmall <- read.csv("http://faculty.washington.edu/kenrice/heartgraphs/nhanessmall.csv", na=".")
with(nhanessmall, stripchart(BPXSAR, vertical=FALSE, pch=1, xlab="Systolic BP (mmHg)"))

library("beeswarm")
with(nhanessmall, beeswarm(BPXSAR, breaks=NA, vertical=FALSE, pch=1, method="center", xlab="Systolic BP (mmHg)"))

Figure 2

nhanesmedium <- read.csv("http://faculty.washington.edu/kenrice/heartgraphs/nhanesmedium.csv", na=".")
with(nhanesmedium, stripchart(BPXSAR, vertical=FALSE, pch=1, xlab="Systolic BP (mmHg)"))#, cex=1))

library("beeswarm")
with(nhanesmedium, beeswarm(BPXSAR, breaks=NA, vertical=FALSE, pch=1, method="center", xlab="Systolic BP (mmHg)"))

with(nhanesmedium, beeswarm(BPXSAR, breaks=seq(min(BPXSAR), max(BPXSAR), 1), vertical=FALSE, pch=1, method="center", xlab="Systolic BP (mmHg)"))
# adding superimposed points and legend:
points(x=mean(nhanesmedium$BPXSAR), y=0.6, pch=16, col="blue")
lines(x=confint(lm(BPXSAR~1, data=nhanesmedium)), y=c(0.6,0.6), pch=16, col="blue")
legend("topright", col="blue",pch=16, lty=1, "Sample mean & 95% conf interval", bty="n")

Figure 3

nhaneslarge <- read.csv("http://faculty.washington.edu/kenrice/heartgraphs/nhaneslarge.csv", na=".")
library("vioplot")
vioplot(nhaneslarge$BPXSAR, h=3, names="", horizontal=TRUE, col="grey90", drawRect=FALSE)
mtext(side=1, line=3, "Systolic BP (mmHg)")
points(x=mean(nhaneslarge$BPXSAR), y=1, pch=16, col="blue")
points(x=median(nhaneslarge$BPXSAR), y=1, pch=18, col="red")
legend("topright", pch=c(16,18), col=c("blue","red"), c("Sample mean","Sample median"), bty="n")

with(nhaneslarge, hist(BPXSAR, n=30, col="grey", main="", xlab="Systolic BP (mmHg)"))

Figure 4

nhaneslarge <- read.csv("http://faculty.washington.edu/kenrice/heartgraphs/nhaneslarge.csv", na=".")
barplot(as.matrix(table(nhaneslarge$BPXSAR>140)/1000), beside=FALSE, horiz=TRUE, xlim=c(0,1),
xlab="Proportion with Systolic BP <= 140 mmHg\n")

dotchart( 1-mean( nhaneslarge$BPXSAR>140), xlim=c(0.03,0.97), lcolor=NA,  
xlab="Proportion with Systolic BP <= 140 mmHg,\nwith corresponding 95% conf interval", pch=1)
lines(x = 1-binom.test( sum(nhaneslarge$BPXSAR>140), 1000)$conf.int, y=c(1,1))

par(mar=c(4,2,4,0.3)+0.3, xpd=TRUE)
barplot(as.matrix(table(nhaneslarge$race_ethc)/1000), beside=FALSE, horiz=TRUE, xlim=c(0,1),
xlab="Proportion\n")
legend("top", inset=c(0, -0.5), fill=gray.colors(4), horiz=TRUE, bty="n", levels(nhaneslarge$race_ethc))

par(mar=c(5,2,0,0.3)+0.3)
dotchart( as.matrix(rev(table(nhaneslarge$race_ethc)))/1000, 
xlim=c(0.03,0.97), lcolor=NA, gcolor="white", xlab="Proportion,\nwith corresponding 95% conf interval")
for(i in 1:4){
lines(x = binom.test( sum(as.numeric(nhaneslarge$race_ethc)==i), 1000)$conf.int,
  y=5-c(i,i))
}

Figure 5

nhaneslarge <- read.csv("http://faculty.washington.edu/kenrice/heartgraphs/nhaneslarge.csv", na=".")
par(mar=c(0,0,0,0))
pie(table(nhaneslarge$BPXSAR>140), labels=c("SBP<=140 mmHg ", "SBP>140 mmHg"), col=gray.colors(2))

pie(table(nhaneslarge$race_ethc), col=gray.colors(4))

Figure 6

nhanessmall <- read.csv("http://faculty.washington.edu/kenrice/heartgraphs/nhanessmall.csv", na=".")
nhanesmedium <- read.csv("http://faculty.washington.edu/kenrice/heartgraphs/nhanesmedium.csv", na=".")
nhaneslarge <- read.csv("http://faculty.washington.edu/kenrice/heartgraphs/nhaneslarge.csv", na=".")
#empty plot to set up axes, labels etc:
boxplot(DR1TFOLA ~gender, ylim=c(0,2200), type="n", border=0,
     data=nhanessmall,xlab="",ylab=quote("Folate intake"~(mu*g/day)) )
#code to add points:
library("beeswarm")
beeswarm(DR1TFOLA ~gender, 
 data=nhanessmall,method="center", breaks=NA, add=TRUE)

boxplot(DR1TFOLA ~gender, ylim=c(0,2200), type="n", border=0,
     data=nhanesmedium,xlab="",ylab=quote("Folate intake"~(mu*g/day)) )
beeswarm(DR1TFOLA ~gender, data=nhanesmedium,method="center", add=TRUE)

library("sm")
library("vioplot")
vioplot( subset(nhaneslarge, gender=="Male")$DR1TFOLA, 
         subset(nhaneslarge, gender=="Female")$DR1TFOLA, 
         h=20, ylim=c(0,2200), names=c("Male","Female"), col="grey90", drawRect=FALSE)
mtext(side=2, line=3, cex=0.75, quote("Folate intake"~(mu*g/day)))
# add lines for the RDA values, and a legend
segments(.8,400,1.2,400,lwd=2,lty=2,col="grey30")
segments(1.8,400,2.2,400,lwd=2,lty=2,col="grey30")
segments(1.8,600,2.2,600,lwd=2,lty=1,col="grey30")
legend("topright",legend=c("RDA: pregnant women", "RDA: other adults"),lty=1:2,lwd=2,bty="n",col="grey30")

Figure 7

nhanesmedium <- read.csv("http://faculty.washington.edu/kenrice/heartgraphs/nhanesmedium.csv", na=".")
#empty plot to set up axes - same as the larger dataset 
plot(BPXSAR~RIDAGEYR, data=nhaneslarge, xlab="Age (years)",ylab="Systolic BP (mmHg)",type="n")
points(BPXSAR~RIDAGEYR,data=nhanesmedium)
# fit the linear model, add the line and 95% CI
lmmodel <- lm(BPXSAR~RIDAGEYR, data=nhanesmedium)
mfit <- predict(lmmodel,se=TRUE) # assumes homoskedasticity
mii <- order(nhanesmedium$RIDAGEYR)
polygon(nhanesmedium$RIDAGEYR[c(mii,rev(mii))],
        mfit$fit[c(mii,rev(mii))]+mfit$se.fit[c(mii,rev(mii))]*rep(c(-1.96,1.96),each=200),
        col="#00000050",border=NA)
lines(nhanesmedium$RIDAGEYR[mii],mfit$fit[mii],type="l")

nhaneslarge <- read.csv("http://faculty.washington.edu/kenrice/heartgraphs/nhaneslarge.csv", na=".")
# plotting the larger dataset;
plot(BPXSAR~RIDAGEYR,data=nhaneslarge,xlab="Age (years)",ylab="Systolic BP (mmHg)",col="grey")
# spline fit plus prediction interval
library("splines")
splinemodel <- lm(BPXSAR~ns(RIDAGEYR,3),data=nhaneslarge,)
fit <- predict(splinemodel,se=TRUE)
ii <- order(nhaneslarge$RIDAGEYR)
polygon(nhaneslarge$RIDAGEYR[c(ii,rev(ii))],
        fit$fit[c(ii,rev(ii))]+fit$se.fit[c(ii,rev(ii))]*rep(c(-1.96,1.96), each=1000),
        col="#00000050",border=NA)
lines(nhaneslarge$RIDAGEYR[ii],fit$fit[ii],type="l")

Figure 8

nhaneslarge <- read.csv("http://faculty.washington.edu/kenrice/heartgraphs/nhaneslarge.csv", na=".")
par(las=3, xpd=TRUE, mar=c(8,4,0.5,4)+0.3)
with(nhaneslarge, 
  plot(y=cut(BPXSAR,c(0,110,140,250)), x=race_ethc, ylab="Category of Systolic BP (mmHg)", xlab=""))
mtext(side=4, "Proportion", line=2.5)

# tabulate the proportions
tabcounts <- with(nhaneslarge,  table( race_ethc, cutbp=cut(BPXSAR,c(0,110,140,250)) ))
tabprop <- apply(tabcounts, 1, function(x){x/sum(x)}) 
tabd <- data.frame(prop = as.vector(tabprop), 
                   race_ethc=rep(attr(tabcounts, "dimnames")[[1]], each=3),
                   cut_sbp=rep(attr(tabcounts, "dimnames")[[2]], 4))
par(las=3, xpd=TRUE, mar=c(8,4,3.5,4)+0.3)
# plot the proportions, label axes, and add legend
with(tabd, 
    plot(prop ~ rep(1:4, each=3), col=gray.colors(3)[as.numeric(cut_sbp)], pch=19, xlab="", ylab="Proportion", 
       axes=F, ylim=c(0,1))
    )
axis(side=2)
axis(side=1, at=1:4, labels=attr(tabcounts, "dimnames")[[1]], las=2)
box()
par(xpd=TRUE)
legend("top", inset=c(0, -0.30), col=grey.colors(3), pch=19, bty="n", 
paste(attr(tabcounts, "dimnames")[[2]],"mmHg"))

Figure 9

nhaneslarge <- read.csv("http://faculty.washington.edu/kenrice/heartgraphs/nhaneslarge.csv", na=".")
library("snippets")
tabbp <- with(nhaneslarge, 
              table(cut(BPXDI1,c(0,60,70,80,120)), cut(BPXDI2,c(0,60,70,80,120)))
              ) 
par(mar=c(6,7,0,0)+0.1)
fd.matrix(as.matrix(tabbp), las=2)
abline(0,1,lty=3) # 45-degree line
mtext(side=1, line=5, "Diastolic BP range (mmHg)\n1st measurement")
mtext(side=2, line=5, "Diastolic BP range (mmHg)\n2nd measurement")

Figure 10

par(las=1, mar=c(4,7,0,0)+0.3)
nhanesmedium <- read.csv("http://faculty.washington.edu/kenrice/heartgraphs/nhanesmedium.csv", na=".")
expit <- function(x){exp(x)/(1+exp(x))} 
library("beeswarm")
# plot the data
beeswarm(RIDAGEYR~I(BPXDAR>70),data=nhanesmedium, method="center", breaks=NA, horizontal=TRUE, xlab="Age (years)",
         ylab="", labels=c("<= 70mmHg","> 70mmHg"))
# fit the logistic model, and add fitted values with 95% CI
glmmodel <- glm(BPXDAR>70~RIDAGEYR,data=nhanesmedium, family=binomial)
mfit <- predict(glmmodel,se=TRUE)
mii <- order(nhanesmedium$RIDAGEYR)
polygon(x=nhanesmedium$RIDAGEYR[c(mii,rev(mii))],
        y=1+expit(mfit$fit[c(mii,rev(mii))]+mfit$se.fit[c(mii,rev(mii))]*rep(c(-1.96,1.96),each=200)),
        col="#00000050",border=NA)
lines(nhanesmedium$RIDAGEYR[mii],1+expit(mfit$fit[mii]),type="l")
mtext(side=2, line=6, "Diastolic BP (mmHg)", las=3)

#plot the data
beeswarm(RIDAGEYR~I(BPXDAR>70),data=nhanesmedium, method="center", breaks=NA, horizontal=TRUE, xlab="Age (years)", 
         ylab="", labels=c("<= 70mmHg","> 70mmHg"))
# fit the spline-based model, adding fitted values with 95% CI
splinemodel <- glm(BPXDAR>70~ns(RIDAGEYR,3),data=nhanesmedium, family=binomial)
mfit <- predict(splinemodel,se=TRUE)
mii <- order(nhanesmedium$RIDAGEYR)
polygon(x=nhanesmedium$RIDAGEYR[c(mii,rev(mii))],
        y=1+expit(mfit$fit[c(mii,rev(mii))]+mfit$se.fit[c(mii,rev(mii))]*rep(c(-1.96,1.96),each=200)),
        col="#00000050",border=NA)
lines(nhanesmedium$RIDAGEYR[mii],1+expit(mfit$fit[mii]),type="l")
mtext(side=2, line=6, "Diastolic BP (mmHg)", las=3)

Figure 11

nhanesmedium <- read.csv("http://faculty.washington.edu/kenrice/heartgraphs/nhanesmedium.csv", na=".")
par(las=1, mar=c(4,6,1,0.5)) # horizontal axis labels, margin sizes
expit <- function(x){exp(x)/(1+exp(x))} 
mycuts <- c(60,70,80)
cut_bp <- with(nhanesmedium, cut(BPXDAR, c(0,mycuts,150)))
plot(I(BPXDAR>70)~RIDAGEYR,data=nhanesmedium,xlab="Age (years)", ylab="Proportion,\nbased on logistic regression", type="n")
# add lines for multiple fitted models
for(i in 1:4){
  thiscat <- as.numeric(cut_bp)==i
    glmmodel <- glm(thiscat~RIDAGEYR,data=nhanesmedium, family=binomial)
    mfit <- predict(glmmodel, se=TRUE, newdata=nhanesmedium)
    mii <- order(nhanesmedium$RIDAGEYR)
    lines(x=(nhanesmedium$RIDAGEYR)[mii],y=expit((mfit$fit)[mii]),type="l", lty=i)
    }
# legend with right-justified labels
temp <- legend("top", lty=1:4, legend=rep(" ",4), bty="n", 
    text.width=strwidth(paste(levels(cut_bp)," mmHg", sep="")[4]),
    xjust=1, yjust=1)
text(temp$rect$left + temp$rect$w, temp$text$y,
     paste(levels(cut_bp)," mmHg", sep=""), pos = 2)

plot(I(BPXDAR>70)~RIDAGEYR,data=nhanesmedium,xlab="Age (years)", ylab="Proportion,\nbased on logistic regression", type="n")
# add lines for multiple fitted models;
for(i in 1:3){
    glmmodel <- glm(BPXDAR>mycuts[i]~RIDAGEYR, data=nhanesmedium, family=binomial)
    mfit <- predict(glmmodel,se=TRUE)
    mii <- order(nhanesmedium$RIDAGEYR)
    lines(x=nhanesmedium$RIDAGEYR[mii],y=expit(mfit$fit[mii]),type="l", lty=i)
    }
legend("topleft", lty=1:3, legend=paste(">", mycuts, " mmHg", sep=""), bty="n")

Figure 12

library("lattice", warn.conflicts=FALSE)
nhaneslarge <- read.csv("http://faculty.washington.edu/kenrice/heartgraphs/nhaneslarge.csv", na=".")
xyplot(BPXSAR~BMXBMI|cut(RIDAGEYR,c(0,30,55,100), labels=c("<= 30 years","31-55 years", ">55 years")),groups=gender,
  col.symbol="grey60",
        data=nhaneslarge,xlab=quote("Body Mass Index"~(kg/m^2)),ylab="Systolic BP (mmHg)",
    panel=panel.superpose, 
    panel.groups=function(x,y,group.number,...){
        panel.xyplot(x,y,...)
        panel.loess(x,y,col.line="black",lty=group.number,lwd=2, alpha=1)
        },
    layout=c(3,1),
    par.settings = list(
        superpose.line = list( col="black", lty=1:2, lwd=2 )
    ),
    auto.key=list(space="top", columns=2, 
                     title="Sex", cex.title=1,
                     lines=TRUE, points=FALSE)
    )