First, make sure that you have installed the Bioconductor R packages gdsfmt and SNPRelate:
Load the pakcages gdsfmt and SNPRelate into your R session
library(gdsfmt)
library(SNPRelate)
Look at the .fam file and obtain the number of individual in the study
FAM<-read.table(file="../SISGAssocData/YRI_CEU_ASW_MEX_NAM.fam",sep=" ", header=FALSE,na="NA")
head(FAM)
# V1 V2 V3 V4 V5 V6
# 1 1432 HGDP00702 0 0 2 -9
# 2 1433 HGDP00703 0 0 1 -9
# 3 1434 HGDP00704 0 0 2 -9
# 4 1436 HGDP00706 0 0 2 -9
# 5 1438 HGDP00708 0 0 2 -9
# 6 1440 HGDP00710 0 0 1 -9
dim(FAM)
# [1] 604 6
unique(FAM$V1)
# [1] 1432 1433 1434 1436 1438 1440 1551 1555 1556 1561 1563
# [12] 1564 1567 1570 1571 1572 1573 1574 1575 1576 1577 1578
# [23] 1579 1580 1581 1582 1585 1586 1587 1588 1589 1590 1592
# [34] 1593 1594 1684 1704 1707 1708 1710 1711 1714 1718 1720
# [45] 1721 1722 1723 1726 1727 1740 1744 1746 1747 1750 1753
# [56] 1754 1756 1758 1759 1760 1761 1762 1763 2427 2431 2424
# [67] 2469 2368 2425 2430 2470 2484 2436 2426 2487 2475 2480
# [78] 2418 2471 2474 2490 2433 2495 2477 2492 2485 2494 2446
# [89] 2491 2489 2483 2466 2476 2488 2357 2367 2472 2467 2371
# [100] 2481 2437 2496 2369 2493 2505 1328 1377 1349 1330 1444
# [111] 1344 1463 1418 13291 1424 1346 13292 1354 13281 1451 1421
# [122] 1353 1358 1458 1456 1423 1345 1355 1350 1334 1347 1340
# [133] 1408 1447 1459 1362 1341 1420 1416 1454 1375 M012 M016
# [144] M001 M002 M006 M007 M011 M015 M017 M023 M027 M033 M035
# [155] M004 M005 M010 M026 M031 M034 M036 M039 M008 M032 M037
# [166] M009 M014 M024 M028 2382 2394 2395 2414 2405 Y001 Y014
# [177] Y039 Y075 Y041 Y007 Y073 Y010 Y092 Y006 Y033 Y038 Y030
# [188] Y061 Y111 Y120 Y036 Y003 Y079 Y116 Y100 Y027 Y044 Y002
# [199] Y052 Y110 Y057 Y035 Y019 Y025 Y005 Y045 Y117 Y004 Y043
# [210] Y072 Y023 Y058 Y024 Y074 Y112 Y048 Y017 Y013 Y050 Y056
# [221] Y042 Y047 Y018 Y071 Y028 Y009 Y077 Y060 Y016 Y012 Y040
# [232] Y101 Y051 Y105 Y020 Y022 Y118 Y021 Y054 Y113 Y108 Y059
# [243] Y119 Y062 Y109 Y114 Y032 Y080 Y078 Y102 Y103 Y055 Y049
# [254] Y064 Y034 Y029 Y076
# 257 Levels: 1328 13281 13291 13292 1330 1334 1340 1341 1344 1345 ... Y120
Now obtain information about the number SNPs in the .bim file
map<-read.table(file="../SISGAssocData/YRI_CEU_ASW_MEX_NAM.bim",sep="\t", header=FALSE,na="NA")
head(map)
# V1 V2 V3 V4 V5 V6
# 1 1 rs9442372 0 1008567 1 2
# 2 1 rs2887286 0 1145994 1 2
# 3 1 rs3813199 0 1148140 1 2
# 4 1 rs6685064 0 1201155 1 2
# 5 1 rs9439462 0 1452629 1 2
# 6 1 rs3820011 0 1878053 1 2
dim(map)
# [1] 150872 6
POPINFO=read.table(file="../SISGAssocData/Population_Sample_Info.txt",header=TRUE)
Obtain a table with the number of sample individuals from each population
table(POPINFO$Population)
#
# ASW CEU MXL NAM YRI
# 87 165 86 63 203
Check to make sure the order of individuals is the same as the PLINK file
identical(POPINFO$IID,FAM$V2)
# [1] TRUE
bedfile<-"../SISGAssocData/YRI_CEU_ASW_MEX_NAM.bed"
famfile<-"../SISGAssocData/YRI_CEU_ASW_MEX_NAM.fam"
bimfile<-"../SISGAssocData/YRI_CEU_ASW_MEX_NAM.bim"
snpgdsBED2GDS(bedfile, famfile, bimfile, "IPCgeno.gds")
# Start snpgdsBED2GDS ...
# BED file: "../SISGAssocData/YRI_CEU_ASW_MEX_NAM.bed" in the SNP-major mode (Sample X SNP)
# FAM file: "../SISGAssocData/YRI_CEU_ASW_MEX_NAM.fam", DONE.
# BIM file: "../SISGAssocData/YRI_CEU_ASW_MEX_NAM.bim", DONE.
# Mon Jul 16 02:13:30 2018 store sample id, snp id, position, and chromosome.
# start writing: 604 samples, 150872 SNPs ...
# Mon Jul 16 02:13:30 2018 0%
# Mon Jul 16 02:13:31 2018 100%
# Mon Jul 16 02:13:31 2018 Done.
# Optimize the access efficiency ...
# Clean up the fragments of GDS file:
# open the file 'IPCgeno.gds' (22.8M)
# # of fragments: 39
# save to 'IPCgeno.gds.tmp'
# rename 'IPCgeno.gds.tmp' (22.8M, reduced: 252B)
# # of fragments: 18
Open the GDS file and
genofile <- snpgdsOpen("IPCgeno.gds")
Explore the GDS file and a few of the components
head(genofile)
# $filename
# [1] "/Users/tathornt/Documents/SISG_2018_ASSOCIATION_EXERCISES/Exercises/IPCgeno.gds"
#
# $id
# [1] 0
#
# $root
# + [ ] *
# |--+ sample.id { Str8 604 ZIP_ra(27.7%), 1.3K }
# |--+ snp.id { Str8 150872 ZIP_ra(37.6%), 561.7K }
# |--+ snp.position { Int32 150872 ZIP_ra(90.9%), 535.8K }
# |--+ snp.chromosome { UInt8 150872 ZIP_ra(0.16%), 241B } *
# |--+ snp.allele { Str8 150872 ZIP_ra(0.13%), 765B }
# |--+ genotype { Bit2 604x150872, 21.7M } *
# \--+ sample.annot [ data.frame ] *
# |--+ sex { Str8 604 ZIP_ra(14.2%), 178B }
# \--+ phenotype { Int32 604 ZIP_ra(1.61%), 46B }
#
# $readonly
# [1] TRUE
head(read.gdsn(index.gdsn(genofile, "sample.id")))
# [1] "HGDP00702" "HGDP00703" "HGDP00704" "HGDP00706" "HGDP00708" "HGDP00710"
head(read.gdsn(index.gdsn(genofile, "snp.id")))
# [1] "rs9442372" "rs2887286" "rs3813199" "rs6685064" "rs9439462" "rs3820011"
Extract genotype for a specific SNP and plot a histrogram of the genotype values
g <- snpgdsGetGeno(genofile, snp.id="rs3813199")
# Genotype matrix: 604 samples X 1 SNPs
hist(g)
pca <- snpgdsPCA(genofile)
# Principal Component Analysis (PCA) on genotypes:
# Excluding 0 SNP on non-autosomes
# Excluding 0 SNP (monomorphic: TRUE, MAF: NaN, missing rate: NaN)
# Working space: 604 samples, 150,872 SNPs
# using 1 (CPU) core
# PCA: the sum of all selected genotypes (0,1,2) = 47894459
# CPU capabilities: Double-Precision SSE2
# Mon Jul 16 02:13:31 2018 (internal increment: 1240)
#
[..................................................] 0%, ETC: ---
[==================================================] 100%, completed in 8s
# Mon Jul 16 02:13:39 2018 Begin (eigenvalues and eigenvectors)
# Mon Jul 16 02:13:39 2018 Done.
tab <- data.frame(sample.id = pca$sample.id,
EV1 = pca$eigenvect[,1], # the first eigenvector
EV2 = pca$eigenvect[,2], # the second eigenvector
stringsAsFactors = FALSE)
head(tab)
# sample.id EV1 EV2
# 1 HGDP00702 -0.04914341 -0.09600764
# 2 HGDP00703 -0.04288822 -0.07801091
# 3 HGDP00704 -0.04920633 -0.09414968
# 4 HGDP00706 -0.04910582 -0.09618059
# 5 HGDP00708 -0.04917482 -0.09728481
# 6 HGDP00710 -0.04953930 -0.09574544
plot(tab$EV2, tab$EV1, xlab="eigenvector 2", ylab="eigenvector 1")
Get sample id and create a new dataframe with the population membership
sample.id <- read.gdsn(index.gdsn(genofile, "sample.id"))
population=POPINFO$Population
tab <- data.frame(sample.id = pca$sample.id,
pop = factor(population)[match(pca$sample.id, sample.id)],
EV1 = pca$eigenvect[,1], # the first eigenvector
EV2 = pca$eigenvect[,2], # the second eigenvector
stringsAsFactors = FALSE)
head(tab)
# sample.id pop EV1 EV2
# 1 HGDP00702 NAM -0.04914341 -0.09600764
# 2 HGDP00703 NAM -0.04288822 -0.07801091
# 3 HGDP00704 NAM -0.04920633 -0.09414968
# 4 HGDP00706 NAM -0.04910582 -0.09618059
# 5 HGDP00708 NAM -0.04917482 -0.09728481
# 6 HGDP00710 NAM -0.04953930 -0.09574544
Make a plot of the top 2 PCs, with points color coded by population membership
plot(tab$EV2, tab$EV1, col=as.integer(tab$pop), xlab="eigenvector 2", ylab="eigenvector 1", main="PCA using all SNPs")
legend("topleft", legend=levels(tab$pop), pch="o", col=1:(nlevels(tab$pop)))
pc.percent <- pca$varprop*100
head(round(pc.percent, 2))
# [1] 11.30 3.99 0.55 0.44 0.40 0.39
lbls <- paste("PC", 1:4, "\n", format(pc.percent[1:4], digits=2), "%", sep="")
pairs(pca$eigenvect[,1:4], col=tab$pop, labels=lbls)
Obtain a subset of SNPs based on linkage disequilibrium (LD threshold of 0.2
snpset <- snpgdsLDpruning(genofile, ld.threshold=0.2)
# SNP pruning based on LD:
# Excluding 0 SNP on non-autosomes
# Excluding 0 SNP (monomorphic: TRUE, MAF: NaN, missing rate: NaN)
# Working space: 604 samples, 150,872 SNPs
# using 1 (CPU) core
# sliding window: 500,000 basepairs, Inf SNPs
# |LD| threshold: 0.2
# method: composite
# Chromosome 1: 28.62%, 3,312/11,572
# Chromosome 2: 26.19%, 3,344/12,766
# Chromosome 3: 27.54%, 2,898/10,524
# Chromosome 4: 28.57%, 2,521/8,825
# Chromosome 5: 27.07%, 2,617/9,668
# Chromosome 6: 26.39%, 2,590/9,814
# Chromosome 7: 27.79%, 2,250/8,097
# Chromosome 8: 25.30%, 2,212/8,743
# Chromosome 9: 26.44%, 1,978/7,482
# Chromosome 10: 27.15%, 2,231/8,217
# Chromosome 11: 26.84%, 2,006/7,474
# Chromosome 12: 27.89%, 2,080/7,457
# Chromosome 13: 28.15%, 1,632/5,798
# Chromosome 14: 27.47%, 1,422/5,176
# Chromosome 15: 26.75%, 1,315/4,916
# Chromosome 16: 27.20%, 1,341/4,930
# Chromosome 17: 31.95%, 1,228/3,843
# Chromosome 18: 27.85%, 1,321/4,743
# Chromosome 19: 35.59%, 794/2,231
# Chromosome 20: 29.15%, 1,213/4,161
# Chromosome 21: 29.71%, 656/2,208
# Chromosome 22: 30.04%, 669/2,227
# 41,630 markers are selected in total.
snpset.id <- unlist(snpset)
Now perform a PCA with the subset of SNPs
pca2 <- snpgdsPCA(genofile, snp.id=snpset.id)
# Principal Component Analysis (PCA) on genotypes:
# Excluding 109,242 SNPs (non-autosomes or non-selection)
# Excluding 0 SNP (monomorphic: TRUE, MAF: NaN, missing rate: NaN)
# Working space: 604 samples, 41,630 SNPs
# using 1 (CPU) core
# PCA: the sum of all selected genotypes (0,1,2) = 11026888
# CPU capabilities: Double-Precision SSE2
# Mon Jul 16 02:13:42 2018 (internal increment: 1240)
#
[..................................................] 0%, ETC: ---
[==================================================] 100%, completed in 2s
# Mon Jul 16 02:13:44 2018 Begin (eigenvalues and eigenvectors)
# Mon Jul 16 02:13:44 2018 Done.
tab2 <- data.frame(sample.id = pca2$sample.id,
pop = factor(population)[match(pca2$sample.id, sample.id)],
EV1 = pca2$eigenvect[,1], # the first eigenvector
EV2 = pca2$eigenvect[,2], # the second eigenvector
stringsAsFactors = FALSE)
Make a plot of the top 2 PCs with the subset of SNP, with points color coded by population membership
plot(tab2$EV2, tab2$EV1, col=as.integer(tab$pop), xlab="eigenvector 2", ylab="eigenvector 1", main="PCA using a subset of SNPs")
legend("bottomright", legend=levels(tab$pop), pch="o", col=1:(nlevels(tab$pop)))
Obtain average PC 2 value for CEU (European) and NAM (Native Americans), since PC2 is relecting European and Native American ancesty
avgCEU2=mean(pca2$eigenvect[population=="CEU",2])
avgNAM2=mean(pca2$eigenvect[population=="NAM",2])
Estimate proportional MXL ancestry, based on their relative value to the average CEU and NAM PC 2 values
MXLadmix=(pca2$eigenvect[population=="MXL",2]-avgNAM2)/(avgCEU2-avgNAM2)
Make a table of the estimated admixture proportions
tab2=cbind(MXLadmix,1-MXLadmix)
myorder=order(MXLadmix)
temp=t(as.matrix(tab2[myorder,]))
MAKE A BARPLOT OF MXL ESTIMATED ANCESTRY FROM THE PCA
barplot(temp, col=c("blue","green"),xlab="Individual ", ylab="Ancestry", border=NA,axisnames=FALSE,main="Ancestry of MXL",ylim=c(0,1))
legend("bottomright", c("European","Native American"), lwd=4, col=c("blue","green"), bg="white",cex=0.85)
See documentation about this here: https://www.rdocumentation.org/packages/SNPRelate/versions/1.6.4/topics/snpgdsEIGMIX
groups <- list(CEU = sample.id[population == "CEU"],
YRI = sample.id[population == "YRI"],
NAM = sample.id[population=="NAM"])
prop <- snpgdsAdmixProp(pca2, groups=groups,bound=TRUE)
plot(prop[, "YRI"], prop[, "CEU"], col=as.integer(tab$pop),
xlab = "Admixture Proportion from YRI",
ylab = "Admixture Proportion from CEU")
abline(v=0, col="gray25", lty=2)
abline(h=0, col="gray25", lty=2)
abline(a=1, b=-1, col="gray25", lty=2)
legend("topright", legend=levels(tab$pop), pch="o", col=1:5)
MXLprop<-data.frame(prop[population=="MXL",])
myorder=order(MXLprop$CEU)
temp=t(as.matrix(MXLprop[myorder,]))
barplot(temp, col=c("blue","red","green"),xlab="Individual ", ylab="Ancestry", border=NA,axisnames=FALSE,
main="Estimated Ancestry of MXL from PCA",ylim=c(0,1))
legend("bottomright", c("European","African","Native American"),lwd=4, col=c("blue","red","green"),bg="white",cex=0.85)