##########################
#Install and load packages
##########################
#####
#RENV
#####
##Download renv package if necessary (our version = 0.17.3)
#Get currently installed packages
pkg <- installed.packages()[,1]
#Check if renv is installed, else download it
if (!("renv" %in% pkg)) {
if (!requireNamespace("remotes", quietly = TRUE)) {
install.packages("remotes")
}
remotes::install_version("renv", version = "0.17.3")
print('Installed package: renv (v0.17.3)')
} else {
print('renv is already installed')
}
## [1] "renv is already installed"
#load renv package
library(renv)
#Set directory to location renv lock file
setwd("/home/genomics/rolbrechts/00_Data/01_First_analysis/07_PCA/R_markdown/")
renv::activate()
#Download packages from lockfile (answer yes in console)
renv::restore(packages = c('vcfR','adegenet','readr','ggplot2'))
## * The library is already synchronized with the lockfile.
##############
#Load packages
##############
library(vcfR)
library(adegenet)
library(readr)
library(ggplot2)
################
#Import vcf file
################
vcf <- read.vcfR("GATK_filtered.vcf")
## Scanning file to determine attributes.
## File attributes:
## meta lines: 45
## header_line: 46
## variant count: 402
## column count: 373
##
Meta line 45 read in.
## All meta lines processed.
## gt matrix initialized.
## Character matrix gt created.
## Character matrix gt rows: 402
## Character matrix gt cols: 373
## skip: 0
## nrows: 402
## row_num: 0
##
Processed variant: 402
## All variants processed
#Check vcf file
head(vcf)
## [1] "***** Object of class 'vcfR' *****"
## [1] "***** Meta section *****"
## [1] "##fileformat=VCFv4.2"
## [1] "##FILTER=<ID=LowQual,Description=\"Low quality\">"
## [1] "##FORMAT=<ID=AD,Number=R,Type=Integer,Description=\"Allelic depths fo [Truncated]"
## [1] "##FORMAT=<ID=DP,Number=1,Type=Integer,Description=\"Approximate read [Truncated]"
## [1] "##FORMAT=<ID=GQ,Number=1,Type=Integer,Description=\"Genotype Quality\">"
## [1] "##FORMAT=<ID=GT,Number=1,Type=String,Description=\"Genotype\">"
## [1] "First 6 rows."
## [1]
## [1] "***** Fixed section *****"
## CHROM POS ID REF ALT QUAL FILTER
## [1,] "Cc_chr1" "4736327" NA "G" "A" "143501" NA
## [2,] "Cc_chr1" "4736344" NA "G" "A" "35316.90" NA
## [3,] "Cc_chr1" "4736368" NA "T" "C" "27557.30" NA
## [4,] "Cc_chr1" "4736369" NA "A" "G" "30361.80" NA
## [5,] "Cc_chr1" "4736373" NA "A" "G" "396623" NA
## [6,] "Cc_chr1" "25932640" NA "C" "T" "507530" NA
## [1]
## [1] "***** Genotype section *****"
## FORMAT Collection_G0260FOG_2345
## [1,] "GT:AD:DP:GQ:PL" "0/0:250,0:250:99:0,752,10414"
## [2,] "GT:AD:DP:GQ:PL" "0/0:250,0:250:99:0,753,10438"
## [3,] "GT:AD:DP:GQ:PL" "0/0:250,0:250:99:0,753,10440"
## [4,] "GT:AD:DP:GQ:PL" "0/0:249,0:250:99:0,749,10383"
## [5,] "GT:AD:DP:GQ:PL" "0/0:250,0:250:99:0,753,10433"
## [6,] "GT:AD:DP:GQ:PL" "0/1:128,122:250:99:4340,0,4593"
## Collection_G0261FOG_2346 Collection_G0259FOG_2350
## [1,] "0/0:77,0:77:99:0,229,3172" "0/0:250,0:250:99:0,752,10417"
## [2,] "0/0:77,0:77:99:0,232,3211" "0/0:250,0:250:99:0,753,10435"
## [3,] "0/0:77,0:77:99:0,232,3214" "0/0:250,0:250:99:0,753,10431"
## [4,] "0/0:77,0:77:99:0,232,3199" "0/0:250,0:250:99:0,749,10391"
## [5,] "0/0:76,1:77:99:0,192,3132" "1/1:6,244:250:99:9950,518,0"
## [6,] "1/1:1,132:133:99:5469,360,0" "0/0:247,3:250:99:0,627,10170"
## Collection_G0250FOG_1470 Collection_G0251FOG_1472
## [1,] "0/0:30,0:30:90:0,90,1253" "0/0:22,0:22:66:0,66,919"
## [2,] "0/0:30,0:30:90:0,90,1253" "0/0:22,0:22:66:0,66,919"
## [3,] "0/0:30,0:30:90:0,90,1253" "0/0:22,0:22:66:0,66,919"
## [4,] "0/0:30,0:30:90:0,90,1253" "0/0:22,0:22:66:0,66,919"
## [5,] "0/0:30,0:30:90:0,90,1253" "0/1:13,9:22:99:310,0,477"
## [6,] "0/0:40,0:40:99:0,120,1670" "0/0:24,0:24:72:0,72,1002"
## [1] "First 6 columns only."
## [1]
## [1] "Unique GT formats:"
## [1] "GT:AD:DP:GQ:PL"
## [1]
###############################
#Convert vcf to Genlight object
###############################
gen <- vcfR2genlight(vcf)
################
#Perform The PCA
################
#Perform a PCA (only 2 components needed for the plot)
no_components <- 2
pca <- glPca(gen, nf=no_components)
#Check if the PCA was successful by plotting the scores
plot(pca$scores)
######################
#Add sample annotations
######################
#To make more informative graphs, sample annotations are added to the dataframe containing the pca scores
#First, import the annotation file and assign it to a variable
annotations <- read_delim("Annotations.txt" ,delim = "\t")
## Rows: 364 Columns: 3
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: "\t"
## chr (3): Sample_ID, Collection, Species
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
#Create new dataframe containing a first column with Sample_IDs and other columns containing the PCA scores
pca_annotated <- cbind(Sample_ID = row.names(pca$scores), as.data.frame(pca$scores))
#New dataframe and sample annotation are merged by column "Sample_ID"
pca_annotated <- merge(pca_annotated, annotations, by = "Sample_ID")
head(pca_annotated,10)
## Sample_ID PC1 PC2 Collection
## 1 Anthonyi_wild_plot_BR_2255 3.5648253 2.1579548 Coffea spec.
## 2 Anthonyi_wild_plot_BR_2269 2.8939738 1.7854999 Coffea spec.
## 3 Arabica_wild_plot_Aerts_2256 2.1944611 0.7020742 Coffea spec.
## 4 Arabica_wild_plot_Aerts_2257 2.2710339 0.5288963 Coffea spec.
## 5 Brevipes_wild_plot_BR_2247 1.7361191 0.7595663 Coffea spec.
## 6 Brevipes_wild_plot_BR_2248 1.6395862 0.4302185 Coffea spec.
## 7 Canephora_wild_plot_INERA_2285 0.9503060 0.8456958 Unknown origin
## 8 Canephora_wild_plot_INERA_2288 0.4811003 -0.5887980 Unknown origin
## 9 Canephora_wild_plot_INERA_2289 -0.2860125 -0.9511718 Unknown origin
## 10 Canephora_wild_plot_INERA_2290 -0.2288216 -0.4981704 Unknown origin
## Species
## 1 anthonyi
## 2 anthonyi
## 3 arabica
## 4 arabica
## 5 brevipes
## 6 brevipes
## 7 canephora
## 8 canephora
## 9 canephora
## 10 canephora
##############
#Plot the PCA
##############
#Set colours for species
pca_annotated$Species <- as.factor(pca_annotated$Species)
col <- setNames(c("#660000", "#333399", "#ACAD6A", "#33FF00", "#23BC2A",
"#00FFFF", "#FF0099", "#FF9933", "#990066", "#660099",
"#406261", "#99CCCC", "#53634E", "#FFCC00", "#993300",
"#996633", "#FFCCCC", "#9EC93C", "#F76C7B", "#0066FF",
"#FF3300", "#666000", "#9093C8", "#3CBAC9", "#C99098"),
levels(pca_annotated$Species))
col
## anthonyi arabica brevipes
## "#660000" "#333399" "#ACAD6A"
## canephora charrieriana congensis
## "#33FF00" "#23BC2A" "#00FFFF"
## congensis? dubardi ebracteolata
## "#FF0099" "#FF9933" "#990066"
## eugenioides humilis kapakata
## "#660099" "#406261" "#99CCCC"
## kivuensis lebrunianus/Psilanthus? liberica
## "#53634E" "#FFCC00" "#993300"
## lulandoensis macrocarpa pocsii
## "#996633" "#FFCCCC" "#9EC93C"
## pseudozanguebariae Psilanthus Quilu
## "#F76C7B" "#0066FF" "#FF3300"
## racemosa salvatrix sessiliflora
## "#666000" "#9093C8" "#3CBAC9"
## stenophylla
## "#C99098"
#Set shapes for collection
pca_annotated$Collection <- as.factor(pca_annotated$Collection)
shapes <- setNames(c(3,17,15,16,4), levels(pca_annotated$Collection))
shapes
## Coffea spec. Unknown origin Vietnam collection Wild collection
## 3 17 15 16
## Yangambi collection
## 4
#Percentage of explained variance by PCA1 and 2 will be shown on the axes if he plot
#Value is rounded to 4 numbers and multiplied by 100
PC1_percent <- round(pca$eig[1]/sum(pca$eig),4) * 100
PC2_percent <- round(pca$eig[2]/sum(pca$eig),4) * 100
PC1_percent
## [1] 15.54
PC2_percent
## [1] 7.88
#Making the plot
#Set PC1 on x-axis and PC2 on y-axis.
#Color and shape are set to Species and Accession respectively
PCA_plot <- ggplot(pca_annotated,aes(x = PC1,y = PC2, color = Species, shape = Collection))+
geom_point(alpha = 0.4, size = 3.5)+
#limits to put graphs on the same scale as BCFtools (comment to disable)
xlim(-3,5)+
ylim(-2.01,4.5)+
scale_color_manual(values = col)+
scale_shape_manual(values = shapes)+
#Set explained variance ratio on axis labels
xlab(paste0("PC1 ", PC1_percent, "%"))+
ylab(paste0("PC2 ", PC2_percent, "%"))+
ggtitle('PCA using GATK')+
guides(size = 12)+
theme_bw()
PCA_plot
Create png files from the plots.
#Save plots as png files
#Set the dimensions of the PNG files
png_width <- 900
png_height <- 600
png("GATK.png", width = png_width, height = png_height)
PCA_plot
dev.off()
## png
## 2