#!/bin/bash

# This series of scripts should allow you to replicate Sharon Browning's ancestry-specific effective population size estimation in the main simulated data set in the "Ancestry-Specific Recent Effective Population Size in the Americas", providing the results in Figure 1.

# The results will not be identical to those in the paper due to randomness. RFMix gives variable results due to the application of random forests, and does not allow for the setting of the seed. Assignment of admixed IBD segments to an ancestry is random, and the seed was not explicitly set in the analysis given in the paper. Finally, in earlier versions of Beagle, running the phasing multi-threaded meant that the same seed could give different phasings, and we ran it multi-threaded for the analysis given in the paper.

# Please let Sharon know if you find any issues with these scripts (sguy@uw.edu). 

#It is assumed that you are using the bash unix/linux shell, and that you have downloaded the simulated data files to a directory "data", and that the appropriate utility programs are in your working directory. You also need to have a "results" directory.

# You'll need the following Beagle Utility files downloadable from https://faculty.washington.edu/browning/beagle_utilities/utilities.html
# File manipulation utilities: filtercolumns.jar 
# File conversion utilities: base2genetic.jar

# You'll need the IBD gapfill utility, merge-ibd-segments.jar, available from http://faculty.washington.edu/browning/refined-ibd.html

# You'll need the revised IBDNE program, and the Beagle program
# IBDNE can be obtained from: http://faculty.washington.edu/browning/ibdne.html#download
# The particular version of Beagle 4.1 that we used here can be obtained from https://faculty.washington.edu/browning/beagle/beagle.09Feb16.2b7.jar
# If you use the newest version of Beagle instead, note that the phasing and IBD detection are now separated into two programs - you'll run Beagle for phasing, then refined-ibd for IBD detection. The results should be the same (up to random differences in phasing).

# You'll need the following scripts, available from: http://faculty.washington.edu/sguy/asibne/asibne_scripts/
# adjust_npairs.py filter_gapfilled_ibd_ancestry.py rephasevit.py run_rfmix

# The data files (30 vcf files, a map file, and the true Ne for making the plot) can be found at http://faculty.washington.edu/sguy/asibne/simulated_data/

# After generating the data with msprime, I filtered out all variants with frequency 0.05 or less, I removed 70% of the remaining variants, and I added genotype error as described in the paper. The data attached to the pipeline are the data after these steps were taken.

#############################################################################

# Run Beagle's phasing and detect IBD with refinedIBD.
# Note that these functions are performed in the same program with this older version of Beagle, whereas in the newer version two separate programs are used.
# You can change the nthreads parameter to fit the CPU you are using.
# Expect this command to take about 4 hours per chromosome with 1 thread.
for i in `seq 30`; do
  java -jar beagle.09Feb16.2b7.jar gt=data/chrom${i}_filtered.vcf.gz out=results/chrom${i}_phased nthreads=1 ibd=true
done

# Fill the gaps (this just takes a second or two per chromosome)
for i in `seq 30`; do
  cat results/chrom${i}_phased.ibd | java -jar merge-ibd-segments.23Apr18.249.jar results/chrom${i}_phased.vcf.gz data/constrecomb.map 0.6 1 > results/chrom${i}_filled.ibd
done

# Create RFMIX input (this just takes a few seconds per chromosome)
# This includes reordering the data to put the admixed chromosomes first in the data.
# In a real data set, if the vcf contains some extra information that prevents the first command in this set from working, pipe the data through simplify-vcf.jar, which can be found at http://bochet.gcc.biostat.washington.edu/beagle/1000_Genomes_phase3_v5a/utilities/
for i in `seq 30`; do
  zcat results/chrom${i}_phased.vcf.gz | grep -v '#' | cut -f10- | tr -d '\t|' > results/chrom${i}.alleles_all
  cut -c1-600 results/chrom${i}.alleles_all > results/chrom${i}.alleles_ref
  cut -c601-1600 results/chrom${i}.alleles_all > results/chrom${i}.alleles_adm 
  paste results/chrom${i}.alleles_adm results/chrom${i}.alleles_ref | tr -d '\t' > results/chrom${i}.alleles
  rm results/chrom${i}.alleles_*
  cat data/constrecomb.map | grep -i "^${i}[[:space:]]" | cut -d' ' -f4 > results/chrom${i}.mapcol1
  cat data/constrecomb.map | grep -i "^${i}[[:space:]]" | cut -d' ' -f3 > results/chrom${i}.mapcol2
  paste results/chrom${i}.mapcol1 results/chrom${i}.mapcol2 > results/chrom${i}.2colmap
  zcat results/chrom${i}_phased.vcf.gz | grep -v '#' | cut -f2 | java -jar base2genetic.jar 1 results/chrom${i}.2colmap > results/chrom${i}.snploc
  rm results/chrom${i}.*map*
done

# make the rfmix classes file (very quick)
for j in `seq 1000`; do echo "0" >> temp; done
for j in `seq 200`; do echo "1" >> temp; done
for j in `seq 200`; do echo "2" >> temp; done
for j in `seq 200`; do echo "3" >> temp; done
(cat temp | tr '\n' ' '; echo "") > results/classes 
rm temp

# Run RFMIX (this takes around one hour per chromosome)
# Due to the way rfmix works, need to cd to the directory where it is installed.
# The "run_rfmix" script does this, but you'll need to go into the script and update the directory for rfmix on your computer.
for i in `seq 30`; do
  ./run_rfmix results/chrom${i}.alleles results/classes results/chrom${i}.snploc results/chrom${i}_rfmix
done

# make list of admixed ids
rm results/admids
for j in `seq 300 799`; do echo msp_$j >> results/admids; done

# Rephase the RFMIX output to match the original phasing (< 1 min per chromosome)
for i in `seq 30`; do
  python rephasevit.py results/chrom${i}_phased.vcf.gz results/admids results/chrom${i}_rfmix.allelesRephased0.txt results/chrom${i}_rfmix.0.Viterbi.txt > results/chrom${i}_rfmix_Viterbi_refinedibdphase
  rm results/chrom${i}_rfmix.*
done

# filter on population and put in the ancestry (this takes about 10 minutes per chromosome
nanc=3
for i in `seq 30`; do
  python filter_gapfilled_ibd_ancestry.py results/chrom${i}_phased.ibd results/chrom${i}_filled.ibd results/chrom${i}${x}_rfmix_Viterbi_refinedibdphase $nanc > results/chrom${i}_allanc.gapfilled_ibd
 for anc in 1 2 3; do
   cat results/chrom${i}_allanc.gapfilled_ibd | grep -i "[[:space:]]${anc}"'$' | cut -f1-8 -d' ' > results/chrom${i}_anc${anc}.gapfilled_ibd
  done
done

# combine data from the different chromosomes (this takes a second)
for anc in 1 2 3; do
  cat results/chrom[1-9]_anc${anc}.gapfilled_ibd results/chrom[1-3][0-9]_anc${anc}.gapfilled_ibd > results/anc${anc}.gapfilled_ibd 
done

# calculate adjusted number of pairs of haplotypes (needed to run ancestry-specific IBDNE; takes about 5 minutes per ancestry)
for anc in 1 2 3; do
  cat results/chrom[1-9]_rfmix_Viterbi_refinedibdphase results/chrom[1-3][0-9]_rfmix_Viterbi_refinedibdphase | java -jar filtercolumns.jar 1 results/admids | python adjust_npairs.py $anc > results/anc${anc}_npairs  
done

# run IBDNe using 2 cM IBD length threshold (takes 10 minutes per ancestry with 12 CPU cores)
cm=2
for anc in 1 2 3; do
  cat results/anc${anc}.gapfilled_ibd | java -jar ibdne.05May18.1c3.jar map=data/constrecomb.map nthreads=12 mincm=$cm npairs=`cat results/anc${anc}_npairs` filtersamples=false out=results/anc${anc}_${cm}cM.ibdne
done

# plot the results in R
pdf("figure1reproduction.pdf",height=3,width=10)
par(mfrow=c(1,3),mar=c(4.5,4.5,3.5,1.5)+0.1)
truene = read.table("data/true.ne")
for(anc in 1:3){
  filename=paste("results/anc",anc,"_2cM.ibdne.ne",sep="")
  x=read.table(filename,header=T)
  col2="#00000080"
  plot(truene[,1],truene[,1+anc],type="l",log="y",xlim=c(0,100),ylim=c(1e3,1e6),xlab="g (generations before present)",ylab="Ne (effective population size)",xaxs="i",yaxt="n",las=1,lwd=2.5,lty=32,col="blue",main=paste("Simulated",c("African","European","Asian")[anc],"ancestry"))
  abline(h=c(1e2,1e3,1e4,1e5,1e6,1e7),v=seq(20,80,20),col="lightgray",lty="dotted")
  axis(2,at=c(1e3,1e4,1e5,1e6,1e7),labels=c(expression(10^3),expression(10^4),expression(10^5),expression(10^6),expression(10^7)),las=1)
  polygon(c(x[,1],x[nrow(x):1,1]),c(x[,3],x[nrow(x):1,4]),col=col2,density=-1,border=col2)
  lines(x[,1],x[,2],lwd=1)
}
dev.off()