setwd("C:/Users/kenrice/Desktop/SISG-ADV") # # Q1 - reading in data from sisg.nc # #install.packages("ncdf") library("ncdf") nc <- open.ncdf("sisg.nc") nc # note 2 dimensions, 2 variables - the variables have different dimensions tensnps <- get.var.ncdf(nc, varid="genotype", start=c(1,1), count=c(10,-1)) dim(tensnps) #illustrating a loop - reading in one person's data at a time htz <- numeric(279) for (i in 1:279){ snps<-get.var.ncdf(nc, varid="genotype", start=c(1,i), count=c(-1,1)) htz[i]<-sum(snps==1)/sum(snps!=-1) } summary(htz) plot(density(htz)) close(nc) # # Q2 - reading in data from SEAflights.db # #install.packages("RSQLite") library("RSQLite") sqlite <- dbDriver("SQLite") flights <- dbConnect(sqlite,"SEAflights.db") dbListTables(conn=flights) dbListFields(conn=flights, name="SEA") # 2nd argument names table of interest sfo <- dbGetQuery(conn=flights, statement="select ArrDelay, DepDelay from SEA where Origin='SFO'") summary(sfo) dbDisconnect(flights) # # Q3 - read in from netCDF, write to SQLite # # setup for SQLite; recall that connecting to a new file makes a new file # system("rm hapmap.db", show=TRUE) # file.remove("hapmap.db") # get rid of old versions before starting dir(pattern=".db") sqlite <- dbDriver("SQLite") hapmap <- dbConnect(sqlite,"hapmap.db") dbListTables(hapmap) # empty file.info("hapmap.db") #system("ls -l hapmap.db", intern=T) nc <- open.ncdf("sisg.nc") nc # do for one person; mysnps <- get.var.ncdf(nc, "genotype",start=c(1,1), count=c(-1,1)) snptable <- data.frame(subject=rep(1,10000), snp=1:10000, genotype=as.vector(mysnps)) dbWriteTable(conn=hapmap, "hapmap", snptable, row.names=FALSE) #system("ls -l hapmap.db", show=T) file.info("hapmap.db") dbListTables(hapmap) dbListFields(hapmap, "hapmap") dbGetQuery(hapmap, "select count(*), genotype from hapmap group by genotype") # do for everyone else - note use of append=TRUE # (this takes ~30s) for(i in 2:279){ mysnps<-get.var.ncdf(nc, "genotype",start=c(1,i), count=c(-1,1)) snptable<-data.frame(subject=rep(i,10000), snp=1:10000, genotype=as.vector(mysnps)) dbWriteTable(hapmap, "hapmap",snptable,append=TRUE, row.names=F) } #system("ls -l hapmap.db", show=T) file.info("hapmap.db") # using the now-full hapmap.db SQLite database dbListTables(hapmap) dbListFields(hapmap,"hapmap") dbGetQuery(hapmap, "select count(*) from hapmap") # how long does it take to access genotypes for one person? system.time(replicate(10,{ dbGetQuery(hapmap,"select genotype from hapmap where subject=42") })) system.time(replicate(10,{ get.var.ncdf(nc,"genotype",c(1,42),c(-1,1)) })) dbDisconnect(hapmap) close(nc) # # Q4 # class structure: vector of start and count indices on each dimension, # # currently only works in two dimensions. # # make a new type of object make_ncmatrix <- function(filename, variablename){ nc <- open.ncdf(filename) lengths <- sapply(nc$dim, function(d) d$len) subsets <- lapply(lengths, function(d) 1:d) rval <- list(nc=nc,var=variablename, lengths=lengths, subsets=subsets,filename=filename) class(rval) <- "ncmatrix" rval } # print method print.ncmatrix<-function(x){ cat("ncmatrix: ") cat(x$var," from ",x$filename,"\n") invisible(x) } # dim method dim.ncmatrix<-function(x){ sapply(x$subsets, length) } # subsetting method (two dimensions) "[.ncmatrix" <- function(x, i,j){ if(!missing(i)) x$subsets[[1]] <- x$subsets[[1]][i] if (!missing(j)) x$subsets[[2]] <- x$subsets[[2]][j] x } # as.matrix method as.matrix.ncmatrix<-function(x){ starts <-sapply(x$subsets,min) ends <-sapply(x$subsets,max) counts <-ends-starts data <-get.var.ncdf(x$nc,x$var,start=starts,count=counts+1) data[x$subsets[[1]]-starts[1]+1, x$subsets[[2]]-starts[2]+1] } # what can we do to ncmatrix objects? methods(class="ncmatrix") aa <- make_ncmatrix(filename="sisg.nc",variablename="genotype") aa dim(aa) #pick out a non-contiguous subset bb<-aa[ seq(350, 400, 10) , 4:7] bb dim(bb) as.matrix(bb)