##################################################################################################################
##################################################################################################################
## PLINK script: recoding the data
plink --bfile Transferrin --chr 3 --from-kb 134840 --to-kb 135052 --recodeA --out TFsnps
plink --bfile Transferrin --chr 12 --from-kb 23450 --to-kb 25050 --recodeA --out randomSegment
##################################################################################################################
##################################################################################################################
### R Scripts: reading the data into R and getting rid of missing phenotypes
snps = read.table("TFsnps.raw", header = T)
phenotypes = read.table("Tr.pheno", header = F)
missing.phenotypes = which(is.na(phenotypes[,3])) ## NOTE: this assumes at least one phenotype is missing; if none were, x[-integer(0),] would drop every row
Z = as.matrix(snps[-missing.phenotypes, 7:30]) ## columns 7:30 hold the genotype dosages; columns 1:6 of a .raw file are the pedigree columns
y = as.vector(phenotypes[-missing.phenotypes, 3])
##################################################################################################################
##################################################################################################################
### R Scripts: find out how much missing data we have in the SNPs, and then omit the samples with missing SNPs
sum(is.na(Z)) ## this gives us the number of missing genotype calls
w.missing = which(apply(is.na(Z), 1, sum) > 0) ## this figures out which subjects have ANY missing values
Z = Z[-w.missing,] ## now we get rid of the subjects with any missing SNPs
y = y[-w.missing] ## BE CAREFUL: because we are over-writing the variable, don't run this line more than once by accident
sum(is.na(Z))
sum(is.na(y)) ## now we see that there is no missingness in either Z or y
##################################################################################################################
##################################################################################################################
### R Scripts: minP tests
pvals = rep(NA, ncol(Z))
for (j in 1:ncol(Z)) {
  pvals[j] = summary(lm(y ~ Z[,j]))$coef[2,4] ### this gets the p-value for the association between the j-th SNP and y
}
min.pvalue = min(pvals) ## the smallest single-SNP p-value in the region
min.pvalue ## this is the unadjusted minimum p-value
bonf.pvalue = min(min.pvalue*ncol(Z), 1) ## this is the Bonferroni-adjusted minimum p-value
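### Bonferroni treats the SNPs as independent, but neighboring SNPs are in LD, so it can be quite conservative
### here. A permutation-based minP test is one way to calibrate the minimum p-value directly. This is a minimal
### sketch, assuming Z and y as defined above; the name minP.perm and the default B = 1000 permutations are
### illustrative choices, not part of the original script.
minP.perm = function(Z, y, B = 1000) {
  get.min.p = function(yy) min(sapply(1:ncol(Z), function(j) summary(lm(yy ~ Z[,j]))$coef[2,4]))
  min.p.obs = get.min.p(y)                        ## observed minimum p-value
  min.p.null = replicate(B, get.min.p(sample(y))) ## permuting y breaks the SNP-phenotype link but keeps the LD among SNPs
  mean(min.p.null <= min.p.obs)                   ## permutation-adjusted minimum p-value
}
## e.g. minP.perm(Z, y, B = 200) ## a small B keeps this quick; increase B for a more precise p-value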
### since unadjusted is way too liberal and Bonferroni can be way too conservative, let's try getting some sense
### of the "effective number of tests". We'll try the PCA approach, though many others are possible
getEffectiveSampSize.PCA = function(Z, proportionVariability = 0.99) {
  ## this function estimates the number of PCs necessary to explain a given proportion of the variability,
  ## which we use as the effective number of tests. Z is the matrix of genotypes and proportionVariability
  ## is the proportion of variability we want to explain
  pca = prcomp(Z) ### does the principal component analysis
  evs = pca$sdev^2/sum(pca$sdev^2) ## the proportion of variability explained by each component
  return(min(which(cumsum(evs) >= proportionVariability)))
}
eff90pct = getEffectiveSampSize.PCA(Z, 0.90)
eff95pct = getEffectiveSampSize.PCA(Z, 0.95)
eff99pct = getEffectiveSampSize.PCA(Z, 0.99)
eff99.9pct = getEffectiveSampSize.PCA(Z, 0.999)
p90pct = min(eff90pct*min.pvalue, 1)
p95pct = min(eff95pct*min.pvalue, 1)
p99pct = min(eff99pct*min.pvalue, 1)
p99.9pct = min(eff99.9pct*min.pvalue, 1)
cbind(c(p90pct, p95pct, p99pct, p99.9pct), c(90, 95, 99, 99.9))
### Note that there is tremendous heterogeneity in the adjusted p-values. Although they are all significant,
### it's not clear which is best.
##################################################################################################################
##################################################################################################################
### R Scripts: averaging/collapsing tests
### let's first collapse the SNPs into a simple sum of the variants within the region
C = apply(Z, 1, sum) ## this sums each row of Z, i.e. the number of minor alleles carried by each subject
hist(C)
summary(lm(y ~ C)) ## this gives us the association between the collapsed SNP value and the outcome y
### now let's take the top principal component
pca = prcomp(Z) ### finds the eigen decomposition
pc1 = Z %*% pca$rotation[,1] ### computes the top principal component
summary(lm(y ~ pc1)) ## gives us the association
##################################################################################################################
##################################################################################################################
###---***---***---###---***---***---###---***---***---###---***---***---###---***---***---###---***---***---###
###---***---***---###---***---***---###---***---***---###---***---***---###---***---***---###---***---***---###
##################################################################################################################
##################################################################################################################
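### the entire pipeline above is repeated below on a random segment of the genome. A minimal sketch of a wrapper
### that bundles the repeated tests so each region takes one call; the name runRegionTests is illustrative and
### not part of the original script. It assumes a genotype matrix Z and phenotype y with missingness removed.
runRegionTests = function(Z, y) {
  pvals = sapply(1:ncol(Z), function(j) summary(lm(y ~ Z[,j]))$coef[2,4])
  C = apply(Z, 1, sum)               ## collapsed (burden) score per subject
  pc1 = Z %*% prcomp(Z)$rotation[,1] ## top principal component
  list(min.pvalue  = min(pvals),
       bonf.pvalue = min(min(pvals)*ncol(Z), 1),
       sum.pvalue  = summary(lm(y ~ C))$coef[2,4],
       pc1.pvalue  = summary(lm(y ~ pc1))$coef[2,4])
}
## e.g. runRegionTests(Z, y) here, and again after loading randomSegment.raw below, to compare the two regions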
##################################################################################################################
##################################################################################################################
### R Scripts: now let's repeat everything with just some random segment of the genome
snps = read.table("randomSegment.raw", header = T)
phenotypes = read.table("Tr.pheno", header = F)
missing.phenotypes = which(is.na(phenotypes[,3]))
Z = as.matrix(snps[-missing.phenotypes, 7:247]) ## this segment contains 241 SNPs
y = as.vector(phenotypes[-missing.phenotypes, 3])
##################################################################################################################
##################################################################################################################
### R Scripts: find out how much missing data we have in the SNPs, and then omit the samples with missing SNPs
sum(is.na(Z)) ## the number of missing genotype calls
w.missing = which(apply(is.na(Z), 1, sum) > 0) ## which subjects have ANY missing values
Z = Z[-w.missing,] ## get rid of the subjects with any missing SNPs
y = y[-w.missing] ## BE CAREFUL: because we are over-writing the variable, don't run this line more than once by accident
sum(is.na(Z))
sum(is.na(y)) ## now we see that there is no missingness in either Z or y
##################################################################################################################
##################################################################################################################
### R Scripts: minP tests
pvals = rep(NA, ncol(Z))
for (j in 1:ncol(Z)) {
  pvals[j] = summary(lm(y ~ Z[,j]))$coef[2,4] ### the p-value for the association between the j-th SNP and y
}
min.pvalue = min(pvals) ## the smallest single-SNP p-value in the region
min.pvalue ## this is the unadjusted minimum p-value
bonf.pvalue = min(min.pvalue*ncol(Z), 1) ## this is the Bonferroni-adjusted minimum p-value
### again get the "effective number of tests" via PCA; getEffectiveSampSize.PCA was defined above and is reused here
eff90pct = getEffectiveSampSize.PCA(Z, 0.90)
eff95pct = getEffectiveSampSize.PCA(Z, 0.95)
eff99pct = getEffectiveSampSize.PCA(Z, 0.99)
eff99.9pct = getEffectiveSampSize.PCA(Z, 0.999)
p90pct = min(eff90pct*min.pvalue, 1)
p95pct = min(eff95pct*min.pvalue, 1)
p99pct = min(eff99pct*min.pvalue, 1)
p99.9pct = min(eff99.9pct*min.pvalue, 1)
cbind(c(p90pct, p95pct, p99pct, p99.9pct), c(90, 95, 99, 99.9))
### again compare the adjusted p-values across thresholds, and contrast them with the Transferrin-region results above
##################################################################################################################
##################################################################################################################
### R Scripts: averaging/collapsing tests
### collapse the SNPs into a simple sum of the variants within the region
C = apply(Z, 1, sum) ## this sums each row of Z
hist(C)
summary(lm(y ~ C)) ## the association between the collapsed SNP value and the outcome y
### now the top principal component
pca = prcomp(Z) ### finds the eigen decomposition
pc1 = Z %*% pca$rotation[,1] ### computes the top principal component
summary(lm(y ~ pc1)) ## gives us the association
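##################################################################################################################
### a minimal follow-up sketch: given an effective number of tests m.eff, the Sidak correction 1 - (1 - p)^m.eff
### is a slightly less conservative alternative to multiplying by m.eff, and the two agree closely when p is
### small. The function name sidak.adjust is illustrative, not part of the original script.
sidak.adjust = function(p, m.eff) 1 - (1 - p)^m.eff
## e.g., for either region:
## min(eff95pct*min.pvalue, 1)        ## Bonferroni with the effective number of tests
## sidak.adjust(min.pvalue, eff95pct) ## Sidak with the effective number of tests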