setwd("C:/Users/kenrice/Desktop/SISG-ADV")

#
# Q1 - reading in data from sisg.nc
#

#install.packages("ncdf")
library("ncdf")
nc <- open.ncdf("sisg.nc")
nc # note 2 dimensions, 2 variables - the variables have different dimensions

tensnps <- get.var.ncdf(nc, varid="genotype", start=c(1,1), count=c(10,-1))
dim(tensnps)

#illustrating a loop - reading in one person's data at a time
htz <- numeric(279)
for (i in 1:279){
	snps<-get.var.ncdf(nc, varid="genotype", start=c(1,i), count=c(-1,1))
	htz[i]<-sum(snps==1)/sum(snps!=-1)
	}
summary(htz)
plot(density(htz))

close(nc)

#
# Q2 - reading in data from SEAflights.db
#

#install.packages("RSQLite")
library("RSQLite")
sqlite  <- dbDriver("SQLite") 
flights <- dbConnect(sqlite,"SEAflights.db")	

dbListTables(conn=flights)
dbListFields(conn=flights, name="SEA") # 2nd argument names table of interest

sfo <- dbGetQuery(conn=flights, 
statement="select ArrDelay, DepDelay from SEA where Origin='SFO'")
summary(sfo)

dbDisconnect(flights)

#
# Q3 - read in from netCDF, write to SQLite
#

# setup for SQLite; recall that connecting to a new file makes a new file

# system("rm hapmap.db", show=TRUE)
# file.remove("hapmap.db")
# get rid of old versions before starting
dir(pattern=".db")

sqlite  <- dbDriver("SQLite") 
hapmap <- dbConnect(sqlite,"hapmap.db")
dbListTables(hapmap) # empty
file.info("hapmap.db")
#system("ls -l hapmap.db", intern=T)	

nc <- open.ncdf("sisg.nc")
nc

# do for one person;
mysnps   <- get.var.ncdf(nc, "genotype",start=c(1,1), count=c(-1,1))
snptable <- data.frame(subject=rep(1,10000), snp=1:10000,
                     genotype=as.vector(mysnps))
dbWriteTable(conn=hapmap, "hapmap", snptable, row.names=FALSE)
#system("ls -l hapmap.db", show=T)	

file.info("hapmap.db")

dbListTables(hapmap) 
dbListFields(hapmap, "hapmap")
dbGetQuery(hapmap, "select count(*), genotype from hapmap group by genotype")

# do for everyone else - note use of append=TRUE
# (this takes ~30s)
for(i in 2:279){
	mysnps<-get.var.ncdf(nc, "genotype",start=c(1,i), count=c(-1,1))
    snptable<-data.frame(subject=rep(i,10000),
                     snp=1:10000,
                     genotype=as.vector(mysnps))
    dbWriteTable(hapmap, "hapmap",snptable,append=TRUE, row.names=F)
	}                     

#system("ls -l hapmap.db", show=T)	
file.info("hapmap.db")

# using the now-full hapmap.db SQLite database
dbListTables(hapmap)
dbListFields(hapmap,"hapmap")
dbGetQuery(hapmap, "select count(*) from hapmap")

# how long does it take to access genotypes for one person?

system.time(replicate(10,{
	dbGetQuery(hapmap,"select genotype from hapmap where subject=42")
	}))

system.time(replicate(10,{
	get.var.ncdf(nc,"genotype",c(1,42),c(-1,1))
	}))

dbDisconnect(hapmap)
close(nc)

#
# Q4
# class structure: vector of start and count indices on each dimension, 
# 
# currently only works in two dimensions.
#

# make a new type of object
make_ncmatrix <- function(filename, variablename){
	   nc      <- open.ncdf(filename)
	   lengths <- sapply(nc$dim, function(d) d$len)
	   subsets <- lapply(lengths, function(d) 1:d)
	   rval    <- list(nc=nc,var=variablename, lengths=lengths,
	              subsets=subsets,filename=filename)
	   class(rval) <- "ncmatrix"
	   rval
	}

# print method
print.ncmatrix<-function(x){
	              cat("ncmatrix: ")
	              cat(x$var," from ",x$filename,"\n")
	              invisible(x)
	              }

# dim method	              
dim.ncmatrix<-function(x){
	     sapply(x$subsets, length)
	}
	

# subsetting method (two dimensions)	
"[.ncmatrix" <- function(x, i,j){
	  if(!missing(i))
	    x$subsets[[1]] <- x$subsets[[1]][i]
	  if (!missing(j))
   	  x$subsets[[2]]   <- x$subsets[[2]][j]
   	x  
	}
	
# as.matrix method
as.matrix.ncmatrix<-function(x){
	 starts  <-sapply(x$subsets,min)
	 ends    <-sapply(x$subsets,max)
	 counts  <-ends-starts
	 data    <-get.var.ncdf(x$nc,x$var,start=starts,count=counts+1)
	 data[x$subsets[[1]]-starts[1]+1, x$subsets[[2]]-starts[2]+1]
	}	
	
# what can we do to ncmatrix objects?
methods(class="ncmatrix")
	
aa <- make_ncmatrix(filename="sisg.nc",variablename="genotype")
aa
dim(aa)

#pick out a non-contiguous subset
bb<-aa[ seq(350, 400, 10) , 4:7]	
bb
dim(bb)
as.matrix(bb)