First, make sure that you have installed the Bioconductor R packages gdsfmt and SNPRelate:

source(“http://bioconductor.org/biocLite.R”)
biocLite(“gdsfmt”)
biocLite(“SNPRelate”)

Load the pakcages gdsfmt and SNPRelate into your R session

library(gdsfmt)
library(SNPRelate)

Investigate the contents of the PLINK files .fam and .bim files

Look at the .fam file and obtain the number of individual in the study

FAM<-read.table(file="../SISGAssocData/YRI_CEU_ASW_MEX_NAM.fam",sep=" ", header=FALSE,na="NA")
head(FAM)
#     V1        V2 V3 V4 V5 V6
# 1 1432 HGDP00702  0  0  2 -9
# 2 1433 HGDP00703  0  0  1 -9
# 3 1434 HGDP00704  0  0  2 -9
# 4 1436 HGDP00706  0  0  2 -9
# 5 1438 HGDP00708  0  0  2 -9
# 6 1440 HGDP00710  0  0  1 -9
dim(FAM)
# [1] 604   6
unique(FAM$V1)
#   [1] 1432  1433  1434  1436  1438  1440  1551  1555  1556  1561  1563 
#  [12] 1564  1567  1570  1571  1572  1573  1574  1575  1576  1577  1578 
#  [23] 1579  1580  1581  1582  1585  1586  1587  1588  1589  1590  1592 
#  [34] 1593  1594  1684  1704  1707  1708  1710  1711  1714  1718  1720 
#  [45] 1721  1722  1723  1726  1727  1740  1744  1746  1747  1750  1753 
#  [56] 1754  1756  1758  1759  1760  1761  1762  1763  2427  2431  2424 
#  [67] 2469  2368  2425  2430  2470  2484  2436  2426  2487  2475  2480 
#  [78] 2418  2471  2474  2490  2433  2495  2477  2492  2485  2494  2446 
#  [89] 2491  2489  2483  2466  2476  2488  2357  2367  2472  2467  2371 
# [100] 2481  2437  2496  2369  2493  2505  1328  1377  1349  1330  1444 
# [111] 1344  1463  1418  13291 1424  1346  13292 1354  13281 1451  1421 
# [122] 1353  1358  1458  1456  1423  1345  1355  1350  1334  1347  1340 
# [133] 1408  1447  1459  1362  1341  1420  1416  1454  1375  M012  M016 
# [144] M001  M002  M006  M007  M011  M015  M017  M023  M027  M033  M035 
# [155] M004  M005  M010  M026  M031  M034  M036  M039  M008  M032  M037 
# [166] M009  M014  M024  M028  2382  2394  2395  2414  2405  Y001  Y014 
# [177] Y039  Y075  Y041  Y007  Y073  Y010  Y092  Y006  Y033  Y038  Y030 
# [188] Y061  Y111  Y120  Y036  Y003  Y079  Y116  Y100  Y027  Y044  Y002 
# [199] Y052  Y110  Y057  Y035  Y019  Y025  Y005  Y045  Y117  Y004  Y043 
# [210] Y072  Y023  Y058  Y024  Y074  Y112  Y048  Y017  Y013  Y050  Y056 
# [221] Y042  Y047  Y018  Y071  Y028  Y009  Y077  Y060  Y016  Y012  Y040 
# [232] Y101  Y051  Y105  Y020  Y022  Y118  Y021  Y054  Y113  Y108  Y059 
# [243] Y119  Y062  Y109  Y114  Y032  Y080  Y078  Y102  Y103  Y055  Y049 
# [254] Y064  Y034  Y029  Y076 
# 257 Levels: 1328 13281 13291 13292 1330 1334 1340 1341 1344 1345 ... Y120

Now obtain information about the number SNPs in the .bim file

map<-read.table(file="../SISGAssocData/YRI_CEU_ASW_MEX_NAM.bim",sep="\t", header=FALSE,na="NA")
head(map)
#   V1        V2 V3      V4 V5 V6
# 1  1 rs9442372  0 1008567  1  2
# 2  1 rs2887286  0 1145994  1  2
# 3  1 rs3813199  0 1148140  1  2
# 4  1 rs6685064  0 1201155  1  2
# 5  1 rs9439462  0 1452629  1  2
# 6  1 rs3820011  0 1878053  1  2
dim(map)
# [1] 150872      6

Read in file with Information on Populations of Sample Individuals

POPINFO=read.table(file="../SISGAssocData/Population_Sample_Info.txt",header=TRUE)

Obtain a table with the number of sample individuals from each population

table(POPINFO$Population)
# 
# ASW CEU MXL NAM YRI 
#  87 165  86  63 203

Check to make sure the order of individuals is the same as the PLINK file

identical(POPINFO$IID,FAM$V2)
# [1] TRUE

NOW we will use functions in the gdsfmt and SNPRelate packages to perform a PCA. A tutorial on gdsfmt and SNPRelate can be found at: http://corearray.sourceforge.net/tutorials/SNPRelate/

Create a GDS file for R from the PLINK files

bedfile<-"../SISGAssocData/YRI_CEU_ASW_MEX_NAM.bed" 
famfile<-"../SISGAssocData/YRI_CEU_ASW_MEX_NAM.fam" 
bimfile<-"../SISGAssocData/YRI_CEU_ASW_MEX_NAM.bim"

Convert PLINK FILES INTO GDS FORMAT FOR R

snpgdsBED2GDS(bedfile, famfile, bimfile, "IPCgeno.gds")
# Start snpgdsBED2GDS ...
#   BED file: "../SISGAssocData/YRI_CEU_ASW_MEX_NAM.bed" in the SNP-major mode (Sample X SNP)
#   FAM file: "../SISGAssocData/YRI_CEU_ASW_MEX_NAM.fam", DONE.
#   BIM file: "../SISGAssocData/YRI_CEU_ASW_MEX_NAM.bim", DONE.
# Mon Jul 16 02:13:30 2018  store sample id, snp id, position, and chromosome.
#   start writing: 604 samples, 150872 SNPs ...
#   Mon Jul 16 02:13:30 2018    0%
#   Mon Jul 16 02:13:31 2018    100%
# Mon Jul 16 02:13:31 2018  Done.
# Optimize the access efficiency ...
# Clean up the fragments of GDS file:
#     open the file 'IPCgeno.gds' (22.8M)
#     # of fragments: 39
#     save to 'IPCgeno.gds.tmp'
#     rename 'IPCgeno.gds.tmp' (22.8M, reduced: 252B)
#     # of fragments: 18

Open the GDS file and

genofile <- snpgdsOpen("IPCgeno.gds")

Explore the GDS file and a few of the components

head(genofile)
# $filename
# [1] "/Users/tathornt/Documents/SISG_2018_ASSOCIATION_EXERCISES/Exercises/IPCgeno.gds"
# 
# $id
# [1] 0
# 
# $root
# +    [  ] *
# |--+ sample.id   { Str8 604 ZIP_ra(27.7%), 1.3K }
# |--+ snp.id   { Str8 150872 ZIP_ra(37.6%), 561.7K }
# |--+ snp.position   { Int32 150872 ZIP_ra(90.9%), 535.8K }
# |--+ snp.chromosome   { UInt8 150872 ZIP_ra(0.16%), 241B } *
# |--+ snp.allele   { Str8 150872 ZIP_ra(0.13%), 765B }
# |--+ genotype   { Bit2 604x150872, 21.7M } *
# \--+ sample.annot   [ data.frame ] *
#    |--+ sex   { Str8 604 ZIP_ra(14.2%), 178B }
#    \--+ phenotype   { Int32 604 ZIP_ra(1.61%), 46B }
# 
# $readonly
# [1] TRUE
head(read.gdsn(index.gdsn(genofile, "sample.id")))
# [1] "HGDP00702" "HGDP00703" "HGDP00704" "HGDP00706" "HGDP00708" "HGDP00710"
head(read.gdsn(index.gdsn(genofile, "snp.id")))
# [1] "rs9442372" "rs2887286" "rs3813199" "rs6685064" "rs9439462" "rs3820011"

Extract genotype for a specific SNP and plot a histrogram of the genotype values

g <- snpgdsGetGeno(genofile, snp.id="rs3813199")
# Genotype matrix: 604 samples X 1 SNPs
hist(g)

Now perform a PCA using a function from the SNPRelate package

pca <- snpgdsPCA(genofile)
# Principal Component Analysis (PCA) on genotypes:
# Excluding 0 SNP on non-autosomes
# Excluding 0 SNP (monomorphic: TRUE, MAF: NaN, missing rate: NaN)
# Working space: 604 samples, 150,872 SNPs
#     using 1 (CPU) core
# PCA:    the sum of all selected genotypes (0,1,2) = 47894459
# CPU capabilities: Double-Precision SSE2
# Mon Jul 16 02:13:31 2018    (internal increment: 1240)
# 
[..................................................]  0%, ETC: ---    
[==================================================] 100%, completed in 8s
# Mon Jul 16 02:13:39 2018    Begin (eigenvalues and eigenvectors)
# Mon Jul 16 02:13:39 2018    Done.

Extract the first 2 PCs and make a data frame

tab <- data.frame(sample.id = pca$sample.id,
    EV1 = pca$eigenvect[,1],    # the first eigenvector
    EV2 = pca$eigenvect[,2],    # the second eigenvector
    stringsAsFactors = FALSE)
head(tab)
#   sample.id         EV1         EV2
# 1 HGDP00702 -0.04914341 -0.09600764
# 2 HGDP00703 -0.04288822 -0.07801091
# 3 HGDP00704 -0.04920633 -0.09414968
# 4 HGDP00706 -0.04910582 -0.09618059
# 5 HGDP00708 -0.04917482 -0.09728481
# 6 HGDP00710 -0.04953930 -0.09574544

Make a plot of the top 2 PCs

plot(tab$EV2, tab$EV1, xlab="eigenvector 2", ylab="eigenvector 1")

Now incorporate population membership for the PCA plot

Get sample id and create a new dataframe with the population membership

sample.id <- read.gdsn(index.gdsn(genofile, "sample.id"))
population=POPINFO$Population
tab <- data.frame(sample.id = pca$sample.id,
    pop = factor(population)[match(pca$sample.id, sample.id)],
    EV1 = pca$eigenvect[,1],    # the first eigenvector
    EV2 = pca$eigenvect[,2],    # the second eigenvector
    stringsAsFactors = FALSE)
head(tab)
#   sample.id pop         EV1         EV2
# 1 HGDP00702 NAM -0.04914341 -0.09600764
# 2 HGDP00703 NAM -0.04288822 -0.07801091
# 3 HGDP00704 NAM -0.04920633 -0.09414968
# 4 HGDP00706 NAM -0.04910582 -0.09618059
# 5 HGDP00708 NAM -0.04917482 -0.09728481
# 6 HGDP00710 NAM -0.04953930 -0.09574544

Make a plot of the top 2 PCs, with points color coded by population membership

plot(tab$EV2, tab$EV1, col=as.integer(tab$pop), xlab="eigenvector 2", ylab="eigenvector 1", main="PCA using all SNPs")
legend("topleft", legend=levels(tab$pop), pch="o", col=1:(nlevels(tab$pop)))

Now make scatterplots of the top 4 PCs with proportional variance explained included

pc.percent <- pca$varprop*100
head(round(pc.percent, 2))
# [1] 11.30  3.99  0.55  0.44  0.40  0.39
lbls <- paste("PC", 1:4, "\n", format(pc.percent[1:4], digits=2), "%", sep="")
pairs(pca$eigenvect[,1:4], col=tab$pop, labels=lbls)

Do we actually need 150K SNPs for population structure infererence in this sample?

Obtain a subset of SNPs based on linkage disequilibrium (LD threshold of 0.2

snpset <- snpgdsLDpruning(genofile, ld.threshold=0.2)
# SNP pruning based on LD:
# Excluding 0 SNP on non-autosomes
# Excluding 0 SNP (monomorphic: TRUE, MAF: NaN, missing rate: NaN)
# Working space: 604 samples, 150,872 SNPs
#     using 1 (CPU) core
#     sliding window: 500,000 basepairs, Inf SNPs
#     |LD| threshold: 0.2
#     method: composite
# Chromosome 1: 28.62%, 3,312/11,572
# Chromosome 2: 26.19%, 3,344/12,766
# Chromosome 3: 27.54%, 2,898/10,524
# Chromosome 4: 28.57%, 2,521/8,825
# Chromosome 5: 27.07%, 2,617/9,668
# Chromosome 6: 26.39%, 2,590/9,814
# Chromosome 7: 27.79%, 2,250/8,097
# Chromosome 8: 25.30%, 2,212/8,743
# Chromosome 9: 26.44%, 1,978/7,482
# Chromosome 10: 27.15%, 2,231/8,217
# Chromosome 11: 26.84%, 2,006/7,474
# Chromosome 12: 27.89%, 2,080/7,457
# Chromosome 13: 28.15%, 1,632/5,798
# Chromosome 14: 27.47%, 1,422/5,176
# Chromosome 15: 26.75%, 1,315/4,916
# Chromosome 16: 27.20%, 1,341/4,930
# Chromosome 17: 31.95%, 1,228/3,843
# Chromosome 18: 27.85%, 1,321/4,743
# Chromosome 19: 35.59%, 794/2,231
# Chromosome 20: 29.15%, 1,213/4,161
# Chromosome 21: 29.71%, 656/2,208
# Chromosome 22: 30.04%, 669/2,227
# 41,630 markers are selected in total.
snpset.id <- unlist(snpset)

Now perform a PCA with the subset of SNPs

pca2 <- snpgdsPCA(genofile, snp.id=snpset.id)
# Principal Component Analysis (PCA) on genotypes:
# Excluding 109,242 SNPs (non-autosomes or non-selection)
# Excluding 0 SNP (monomorphic: TRUE, MAF: NaN, missing rate: NaN)
# Working space: 604 samples, 41,630 SNPs
#     using 1 (CPU) core
# PCA:    the sum of all selected genotypes (0,1,2) = 11026888
# CPU capabilities: Double-Precision SSE2
# Mon Jul 16 02:13:42 2018    (internal increment: 1240)
# 
[..................................................]  0%, ETC: ---    
[==================================================] 100%, completed in 2s
# Mon Jul 16 02:13:44 2018    Begin (eigenvalues and eigenvectors)
# Mon Jul 16 02:13:44 2018    Done.
tab2 <- data.frame(sample.id = pca2$sample.id,
    pop = factor(population)[match(pca2$sample.id, sample.id)],
    EV1 = pca2$eigenvect[,1],    # the first eigenvector
    EV2 = pca2$eigenvect[,2],    # the second eigenvector
    stringsAsFactors = FALSE)

Make a plot of the top 2 PCs with the subset of SNP, with points color coded by population membership

plot(tab2$EV2, tab2$EV1, col=as.integer(tab$pop), xlab="eigenvector 2", ylab="eigenvector 1", main="PCA using a subset of SNPs")
legend("bottomright", legend=levels(tab$pop), pch="o", col=1:(nlevels(tab$pop)))

Estimate proportional ancestry from PCA

First Estimate Native American and European Ancestry for the MXL from the PCA. For simplicity assume that MXL have negligible African Ancestry.

Obtain average PC 2 value for CEU (European) and NAM (Native Americans), since PC2 is relecting European and Native American ancesty


avgCEU2=mean(pca2$eigenvect[population=="CEU",2])
avgNAM2=mean(pca2$eigenvect[population=="NAM",2])

Estimate proportional MXL ancestry, based on their relative value to the average CEU and NAM PC 2 values

MXLadmix=(pca2$eigenvect[population=="MXL",2]-avgNAM2)/(avgCEU2-avgNAM2)

Make a table of the estimated admixture proportions

tab2=cbind(MXLadmix,1-MXLadmix)
myorder=order(MXLadmix)
temp=t(as.matrix(tab2[myorder,]))

MAKE A BARPLOT OF MXL ESTIMATED ANCESTRY FROM THE PCA

barplot(temp, col=c("blue","green"),xlab="Individual ", ylab="Ancestry", border=NA,axisnames=FALSE,main="Ancestry of MXL",ylim=c(0,1))
legend("bottomright", c("European","Native American"), lwd=4, col=c("blue","green"), bg="white",cex=0.85)

How about predicting ancestry for 3 ancestral populations?

See documentation about this here: https://www.rdocumentation.org/packages/SNPRelate/versions/1.6.4/topics/snpgdsEIGMIX

First define groups

groups <- list(CEU = sample.id[population == "CEU"],
               YRI = sample.id[population == "YRI"],
               NAM = sample.id[population=="NAM"])

Can use the snpgdsAdmixProp function to estimate the proportional ancestry from the pca that we did. See

prop <- snpgdsAdmixProp(pca2, groups=groups,bound=TRUE)
plot(prop[, "YRI"], prop[, "CEU"], col=as.integer(tab$pop),
     xlab = "Admixture Proportion from YRI",
     ylab = "Admixture Proportion from CEU")
abline(v=0, col="gray25", lty=2)
abline(h=0, col="gray25", lty=2)
abline(a=1, b=-1, col="gray25", lty=2)
legend("topright", legend=levels(tab$pop), pch="o", col=1:5)

Can also obtain barplots of proportional ancestry of MXL that was estimated from PCA

MXLprop<-data.frame(prop[population=="MXL",])
myorder=order(MXLprop$CEU)
temp=t(as.matrix(MXLprop[myorder,]))


barplot(temp, col=c("blue","red","green"),xlab="Individual ", ylab="Ancestry", border=NA,axisnames=FALSE,
        main="Estimated Ancestry of MXL from PCA",ylim=c(0,1))
legend("bottomright", c("European","African","Native American"),lwd=4, col=c("blue","red","green"),bg="white",cex=0.85)

Exercise 5 SISG Association Mapping