##########
## WGCNA
##########

## Acknowledgement: 
## This code is based on an example from Mike Inouye (UQ), which was primarily 
## written by Scott Ritchie (UQ). 

## Install the package
# source("https://bioconductor.org/biocLite.R")
# biocLite(c("impute", "preprocessCore", "GO.db", "AnnotationDbi"))
# install.packages("WGCNA")

rm(list=ls())

## read the data
read.matrix <- function(file) {
  df <- read.table(file, header=TRUE, row.names=1, sep="\t", 
                   check.names=FALSE)
  mat <- as.matrix(df)
  # Avoid having numeric IDs
  rownames(mat) <- paste0("Probe_", rownames(mat))
  colnames(mat) <- paste0("Sample_", colnames(mat))
  return(mat)
}

## NOTE: Update this path and make sure that you have saved the data 
setwd("/Users/ashojaie/Dropbox/Teaching/shortcourse_NET/Ali/WGCNAexample")
liver_ge <- read.matrix("liver_expression_head2500.txt")


## Removes probes with >5% of observations missing
filter_probes <- function(x) {
  # How many samples are missing for each probe?
  nMissing <- apply(x, 2, function(probe) {
    sum(is.na(probe))
  })
  missingness <- nMissing/nrow(x)
  return(x[, missingness <= 0.05])
}


## Removes samples that failed > 5% of their assays
filter_samples <- function(x) {
  # How many samples are missing for each probe?
  nMissing <- apply(x, 1, function(sample) {
    sum(is.na(sample))
  })
  missingness <- nMissing/ncol(x)
  return(x[missingness <= 0.05,])
}

liver_ge_fil <- filter_samples(filter_probes(liver_ge))


## Impute the remaining missing values
library(impute) 

if (any(is.na(liver_ge_fil)))
  liver_ge_fil_imp <- impute.knn(liver_ge_fil)$data


anyNA(liver_ge_fil)
anyNA(liver_ge_fil_imp)


## Chose the top varying probes
most_varying <- function(ge, topN=1000) {
  standard_deviation <- apply(ge, 1, sd)
  sorted <- sort(standard_deviation, decreasing=TRUE)
  sorted_names <- names(sorted)
  topN_names <- sorted_names[seq_len(topN)]
  return(topN_names)
}

top_varying <- most_varying(liver_ge_fil_imp)
liver_ge_fil_imp_vary <- liver_ge_fil_imp[top_varying,]


## Network inference
library(WGCNA)
calculate_coexpression <- function(ge) {
  coexpression <- cor(t(ge), method="pearson")
}

infer_network <- function(coexpression) {
  # First pick the soft threshold to use to define the interaction network
  sft <- WGCNA::pickSoftThreshold(abs(coexpression), dataIsExpr=FALSE)
  if (is.na(sft$powerEstimate)) {
    sft$powerEstimate <- 1
    warning("Could not satisfy the scale-free topology criterion")
  }
  network <- abs(coexpression)^sft$powerEstimate
}

liver_coexpression <- calculate_coexpression(liver_ge_fil_imp_vary)
liver_network <- infer_network(liver_coexpression)


## Identify tightly coexpressed modules in the network 
detect_modules <- function(ge, network) {
  # Calculate the distance between probes based on their topological similarity:
  # i.e. the strength of their coexpression as well as the similarity of their
  # patterns of coexpression to all other probes
  probe_dist <- WGCNA::TOMdist(network)
  dimnames(probe_dist) <- dimnames(network)
  
  # Hierarchically cluster based on this distance metric
  dendro <- hclust(as.dist(probe_dist), method="average")
  
  # Detect modules. `cutreeDynamic` is a function in the `dynamicTreeCut`
  # package, which is loaded in by the `WGCNA` package.
  module_labels <- cutreeDynamic(dendro, distM = probe_dist)
  names(module_labels) <- colnames(network)
  
  # Merge similar modules 
  merged <- mergeCloseModules(t(ge), module_labels, relabel=TRUE)
  module_labels <- merged$colors
  
  return(module_labels)
}

liver_modules <- detect_modules(liver_ge_fil_imp_vary, liver_network)
## Each probe is now assigned a (numeric) module label:
table(liver_modules)


## Save the data for analysis using NetRep
#dir.create("temp_data")

write.matrix <- function(x, file) {
  write.csv(x, file, quote=FALSE)
}

write.vector <- function(x, file, col.names) {
  column_matrix <- t(t(x))
  colnames(column_matrix) <- col.names
  write.csv(column_matrix, file, quote=FALSE)
}


## NetRep requires the probes to be columns, so we will transpose when saving
write.matrix(t(liver_ge_fil_imp_vary), "temp_data/liver_expression.csv")
write.matrix(liver_coexpression, "temp_data/liver_coexpression.csv")
write.matrix(liver_network, "temp_data/liver_network.csv")
write.vector(liver_modules, "temp_data/liver_modules.csv", col.names="Module")


## Phenotype association analysis (using NetRep, based on eigengenes)
MEs <- moduleEigengenes(t(liver_ge_fil_imp_vary), liver_modules)

names(MEs$varExplained) <- colnames(MEs$eigengenes)
MEs$varExplained

## Add sample names to the eigengenes 
rownames(MEs$eigengenes) <- colnames(liver_ge_fil_imp_vary)

pheno<-read.table("pheno.txt", header=TRUE)
data<-cbind(pheno,MEs$eigengenes)

plot(data$ME6,data$pheno,xlab="module summary expression",ylab="phenotype levels",pch=20)
lmfit <- lm(pheno ~ ME6, data=data)
summary(lmfit)
abline(a=lmfit$coefficients[[1]], b=lmfit$coefficients[[2]], col="red", lwd=2)