First, make sure that you have installed the Bioconductor R packages gdsfmt and SNPRelate:

Load the pakcages gdsfmt and SNPRelate into your R session

library(gdsfmt)
library(SNPRelate)

Read in file with Information on Populations of Sample Individuals

POPINFO=read.table(file="../SISGAssocData/Population_Sample_Info.txt",header=TRUE)

Obtain a table with the number of sample individuals from each population

table(POPINFO$Population)
# 
# ASW CEU MXL NAM YRI 
#  87 165  86  63 203

Check to make sure the order of individuals is the same as the PLINK file

identical(POPINFO$IID,FAM$V2)
# [1] TRUE

NOW we will use functions in the gdsfmt and SNPRelate packages to perform a PCA. A tutorial on gdsfmt and SNPRelate can be found at: http://corearray.sourceforge.net/tutorials/SNPRelate/

Now perform a PCA using a function from the SNPRelate package

pca <- snpgdsPCA(genofile)
# Principal Component Analysis (PCA) on genotypes:
# Excluding 0 SNP on non-autosomes
# Excluding 0 SNP (monomorphic: TRUE, MAF: NaN, missing rate: NaN)
# Working space: 604 samples, 150,872 SNPs
#     using 1 (CPU) core
# PCA:    the sum of all selected genotypes (0,1,2) = 47894459
# CPU capabilities: Double-Precision SSE2
# Mon Jul 16 02:13:31 2018    (internal increment: 1240)
# 
[..................................................]  0%, ETC: ---    
[==================================================] 100%, completed in 8s
# Mon Jul 16 02:13:39 2018    Begin (eigenvalues and eigenvectors)
# Mon Jul 16 02:13:39 2018    Done.

Extract the first 2 PCs and make a data frame

tab <- data.frame(sample.id = pca$sample.id,
    EV1 = pca$eigenvect[,1],    # the first eigenvector
    EV2 = pca$eigenvect[,2],    # the second eigenvector
    stringsAsFactors = FALSE)
head(tab)
#   sample.id         EV1         EV2
# 1 HGDP00702 -0.04914341 -0.09600764
# 2 HGDP00703 -0.04288822 -0.07801091
# 3 HGDP00704 -0.04920633 -0.09414968
# 4 HGDP00706 -0.04910582 -0.09618059
# 5 HGDP00708 -0.04917482 -0.09728481
# 6 HGDP00710 -0.04953930 -0.09574544

Make a plot of the top 2 PCs

plot(tab$EV2, tab$EV1, xlab="eigenvector 2", ylab="eigenvector 1")

Now incorporate population membership for the PCA plot

Get sample id and create a new dataframe with the population membership

sample.id <- read.gdsn(index.gdsn(genofile, "sample.id"))
population=POPINFO$Population
tab <- data.frame(sample.id = pca$sample.id,
    pop = factor(population)[match(pca$sample.id, sample.id)],
    EV1 = pca$eigenvect[,1],    # the first eigenvector
    EV2 = pca$eigenvect[,2],    # the second eigenvector
    stringsAsFactors = FALSE)
head(tab)
#   sample.id pop         EV1         EV2
# 1 HGDP00702 NAM -0.04914341 -0.09600764
# 2 HGDP00703 NAM -0.04288822 -0.07801091
# 3 HGDP00704 NAM -0.04920633 -0.09414968
# 4 HGDP00706 NAM -0.04910582 -0.09618059
# 5 HGDP00708 NAM -0.04917482 -0.09728481
# 6 HGDP00710 NAM -0.04953930 -0.09574544

Make a plot of the top 2 PCs, with points color coded by population membership

plot(tab$EV2, tab$EV1, col=as.integer(tab$pop), xlab="eigenvector 2", ylab="eigenvector 1", main="PCA using all SNPs")
legend("topleft", legend=levels(tab$pop), pch="o", col=1:(nlevels(tab$pop)))

Now make scatterplots of the top 4 PCs with proportional variance explained included

pc.percent <- pca$varprop*100
head(round(pc.percent, 2))
# [1] 11.30  3.99  0.55  0.44  0.40  0.39
lbls <- paste("PC", 1:4, "\n", format(pc.percent[1:4], digits=2), "%", sep="")
pairs(pca$eigenvect[,1:4], col=tab$pop, labels=lbls)

Do we actually need 150K SNPs for population structure infererence in this sample?

Obtain a subset of SNPs based on linkage disequilibrium (LD threshold of 0.2

snpset <- snpgdsLDpruning(genofile, ld.threshold=0.2)
# SNP pruning based on LD:
# Excluding 0 SNP on non-autosomes
# Excluding 0 SNP (monomorphic: TRUE, MAF: NaN, missing rate: NaN)
# Working space: 604 samples, 150,872 SNPs
#     using 1 (CPU) core
#     sliding window: 500,000 basepairs, Inf SNPs
#     |LD| threshold: 0.2
#     method: composite
# Chromosome 1: 28.62%, 3,312/11,572
# Chromosome 2: 26.19%, 3,344/12,766
# Chromosome 3: 27.54%, 2,898/10,524
# Chromosome 4: 28.57%, 2,521/8,825
# Chromosome 5: 27.07%, 2,617/9,668
# Chromosome 6: 26.39%, 2,590/9,814
# Chromosome 7: 27.79%, 2,250/8,097
# Chromosome 8: 25.30%, 2,212/8,743
# Chromosome 9: 26.44%, 1,978/7,482
# Chromosome 10: 27.15%, 2,231/8,217
# Chromosome 11: 26.84%, 2,006/7,474
# Chromosome 12: 27.89%, 2,080/7,457
# Chromosome 13: 28.15%, 1,632/5,798
# Chromosome 14: 27.47%, 1,422/5,176
# Chromosome 15: 26.75%, 1,315/4,916
# Chromosome 16: 27.20%, 1,341/4,930
# Chromosome 17: 31.95%, 1,228/3,843
# Chromosome 18: 27.85%, 1,321/4,743
# Chromosome 19: 35.59%, 794/2,231
# Chromosome 20: 29.15%, 1,213/4,161
# Chromosome 21: 29.71%, 656/2,208
# Chromosome 22: 30.04%, 669/2,227
# 41,630 markers are selected in total.
snpset.id <- unlist(snpset)

Now perform a PCA with the subset of SNPs

pca2 <- snpgdsPCA(genofile, snp.id=snpset.id)
# Principal Component Analysis (PCA) on genotypes:
# Excluding 109,242 SNPs (non-autosomes or non-selection)
# Excluding 0 SNP (monomorphic: TRUE, MAF: NaN, missing rate: NaN)
# Working space: 604 samples, 41,630 SNPs
#     using 1 (CPU) core
# PCA:    the sum of all selected genotypes (0,1,2) = 11026888
# CPU capabilities: Double-Precision SSE2
# Mon Jul 16 02:13:42 2018    (internal increment: 1240)
# 
[..................................................]  0%, ETC: ---    
[==================================================] 100%, completed in 2s
# Mon Jul 16 02:13:44 2018    Begin (eigenvalues and eigenvectors)
# Mon Jul 16 02:13:44 2018    Done.
tab2 <- data.frame(sample.id = pca2$sample.id,
    pop = factor(population)[match(pca2$sample.id, sample.id)],
    EV1 = pca2$eigenvect[,1],    # the first eigenvector
    EV2 = pca2$eigenvect[,2],    # the second eigenvector
    stringsAsFactors = FALSE)

Make a plot of the top 2 PCs with the subset of SNP, with points color coded by population membership

plot(tab2$EV2, tab2$EV1, col=as.integer(tab$pop), xlab="eigenvector 2", ylab="eigenvector 1", main="PCA using a subset of SNPs")
legend("bottomright", legend=levels(tab$pop), pch="o", col=1:(nlevels(tab$pop)))

Estimate proportional ancestry from PCA

First Estimate Native American and European Ancestry for the MXL from the PCA. For simplicity assume that MXL have negligible African Ancestry.

Obtain average PC 2 value for CEU (European) and NAM (Native Americans), since PC2 is relecting European and Native American ancesty


avgCEU2=mean(pca2$eigenvect[population=="CEU",2])
avgNAM2=mean(pca2$eigenvect[population=="NAM",2])

Estimate proportional MXL ancestry, based on their relative value to the average CEU and NAM PC 2 values

MXLadmix=(pca2$eigenvect[population=="MXL",2]-avgNAM2)/(avgCEU2-avgNAM2)

Make a table of the estimated admixture proportions

tab2=cbind(MXLadmix,1-MXLadmix)
myorder=order(MXLadmix)
temp=t(as.matrix(tab2[myorder,]))

MAKE A BARPLOT OF MXL ESTIMATED ANCESTRY FROM THE PCA

barplot(temp, col=c("blue","green"),xlab="Individual ", ylab="Ancestry", border=NA,axisnames=FALSE,main="Ancestry of MXL",ylim=c(0,1))
legend("bottomright", c("European","Native American"), lwd=4, col=c("blue","green"), bg="white",cex=0.85)

How about predicting ancestry for 3 ancestral populations?

See documentation about this here: https://www.rdocumentation.org/packages/SNPRelate/versions/1.6.4/topics/snpgdsEIGMIX

First define groups

groups <- list(CEU = sample.id[population == "CEU"],
               YRI = sample.id[population == "YRI"],
               NAM = sample.id[population=="NAM"])

Can use the snpgdsAdmixProp function to estimate the proportional ancestry from the pca that we did. See

prop <- snpgdsAdmixProp(pca2, groups=groups,bound=TRUE)
plot(prop[, "YRI"], prop[, "CEU"], col=as.integer(tab$pop),
     xlab = "Admixture Proportion from YRI",
     ylab = "Admixture Proportion from CEU")
abline(v=0, col="gray25", lty=2)
abline(h=0, col="gray25", lty=2)
abline(a=1, b=-1, col="gray25", lty=2)
legend("topright", legend=levels(tab$pop), pch="o", col=1:5)

Can also obtain barplots of proportional ancestry of MXL that was estimated from PCA

MXLprop<-data.frame(prop[population=="MXL",])
myorder=order(MXLprop$CEU)
temp=t(as.matrix(MXLprop[myorder,]))


barplot(temp, col=c("blue","red","green"),xlab="Individual ", ylab="Ancestry", border=NA,axisnames=FALSE,
        main="Estimated Ancestry of MXL from PCA",ylim=c(0,1))
legend("bottomright", c("European","African","Native American"),lwd=4, col=c("blue","red","green"),bg="white",cex=0.85)