PCA.knit

##########################
#Install and load packages
##########################

#####
#RENV
#####

##Download renv package if necessary (our version = 0.17.3)
#Get currently installed packages
pkg <- installed.packages()[,1] 

#Check if renv is installed, else download it
if (!("renv" %in% pkg)) {
  if (!requireNamespace("remotes", quietly = TRUE)) {
    install.packages("remotes")
  }
  remotes::install_version("renv", version = "0.17.3")
  print('Installed package: renv (v0.17.3)')
} else {
  print('renv is already installed')
}

## [1] "renv is already installed"

#load renv package
library(renv)
#Set directory to location renv lock file
setwd("/home/genomics/rolbrechts/00_Data/01_First_analysis/07_PCA/R_markdown/")
renv::activate()
#Download packages from lockfile (answer yes in console)
renv::restore(packages = c('vcfR','adegenet','readr','ggplot2'))

## * The library is already synchronized with the lockfile.

##############
#Load packages
##############

library(vcfR)
library(adegenet)
library(readr)
library(ggplot2)

################
#Import vcf file
################

vcf <- read.vcfR("GATK_filtered.vcf")

## Scanning file to determine attributes.
## File attributes:
##   meta lines: 45
##   header_line: 46
##   variant count: 402
##   column count: 373
## 
Meta line 45 read in.
## All meta lines processed.
## gt matrix initialized.
## Character matrix gt created.
##   Character matrix gt rows: 402
##   Character matrix gt cols: 373
##   skip: 0
##   nrows: 402
##   row_num: 0
## 
Processed variant: 402
## All variants processed

#Check vcf file
head(vcf)

## [1] "***** Object of class 'vcfR' *****"
## [1] "***** Meta section *****"
## [1] "##fileformat=VCFv4.2"
## [1] "##FILTER=<ID=LowQual,Description=\"Low quality\">"
## [1] "##FORMAT=<ID=AD,Number=R,Type=Integer,Description=\"Allelic depths fo [Truncated]"
## [1] "##FORMAT=<ID=DP,Number=1,Type=Integer,Description=\"Approximate read  [Truncated]"
## [1] "##FORMAT=<ID=GQ,Number=1,Type=Integer,Description=\"Genotype Quality\">"
## [1] "##FORMAT=<ID=GT,Number=1,Type=String,Description=\"Genotype\">"
## [1] "First 6 rows."
## [1] 
## [1] "***** Fixed section *****"
##      CHROM     POS        ID REF ALT QUAL       FILTER
## [1,] "Cc_chr1" "4736327"  NA "G" "A" "143501"   NA    
## [2,] "Cc_chr1" "4736344"  NA "G" "A" "35316.90" NA    
## [3,] "Cc_chr1" "4736368"  NA "T" "C" "27557.30" NA    
## [4,] "Cc_chr1" "4736369"  NA "A" "G" "30361.80" NA    
## [5,] "Cc_chr1" "4736373"  NA "A" "G" "396623"   NA    
## [6,] "Cc_chr1" "25932640" NA "C" "T" "507530"   NA    
## [1] 
## [1] "***** Genotype section *****"
##      FORMAT           Collection_G0260FOG_2345        
## [1,] "GT:AD:DP:GQ:PL" "0/0:250,0:250:99:0,752,10414"  
## [2,] "GT:AD:DP:GQ:PL" "0/0:250,0:250:99:0,753,10438"  
## [3,] "GT:AD:DP:GQ:PL" "0/0:250,0:250:99:0,753,10440"  
## [4,] "GT:AD:DP:GQ:PL" "0/0:249,0:250:99:0,749,10383"  
## [5,] "GT:AD:DP:GQ:PL" "0/0:250,0:250:99:0,753,10433"  
## [6,] "GT:AD:DP:GQ:PL" "0/1:128,122:250:99:4340,0,4593"
##      Collection_G0261FOG_2346      Collection_G0259FOG_2350      
## [1,] "0/0:77,0:77:99:0,229,3172"   "0/0:250,0:250:99:0,752,10417"
## [2,] "0/0:77,0:77:99:0,232,3211"   "0/0:250,0:250:99:0,753,10435"
## [3,] "0/0:77,0:77:99:0,232,3214"   "0/0:250,0:250:99:0,753,10431"
## [4,] "0/0:77,0:77:99:0,232,3199"   "0/0:250,0:250:99:0,749,10391"
## [5,] "0/0:76,1:77:99:0,192,3132"   "1/1:6,244:250:99:9950,518,0" 
## [6,] "1/1:1,132:133:99:5469,360,0" "0/0:247,3:250:99:0,627,10170"
##      Collection_G0250FOG_1470    Collection_G0251FOG_1472  
## [1,] "0/0:30,0:30:90:0,90,1253"  "0/0:22,0:22:66:0,66,919" 
## [2,] "0/0:30,0:30:90:0,90,1253"  "0/0:22,0:22:66:0,66,919" 
## [3,] "0/0:30,0:30:90:0,90,1253"  "0/0:22,0:22:66:0,66,919" 
## [4,] "0/0:30,0:30:90:0,90,1253"  "0/0:22,0:22:66:0,66,919" 
## [5,] "0/0:30,0:30:90:0,90,1253"  "0/1:13,9:22:99:310,0,477"
## [6,] "0/0:40,0:40:99:0,120,1670" "0/0:24,0:24:72:0,72,1002"
## [1] "First 6 columns only."
## [1] 
## [1] "Unique GT formats:"
## [1] "GT:AD:DP:GQ:PL"
## [1]

###############################
#Convert vcf to Genlight object
###############################
gen <- vcfR2genlight(vcf)

################
#Perform The PCA
################

#Perform a PCA (only 2 components needed for the plot)
no_components <- 2
pca <- glPca(gen, nf=no_components)

#Check if the PCA was successful by plotting the scores
plot(pca$scores)

######################
#Add sample annotations
######################

#To make more informative graphs, sample annotations are added to the dataframe containing the pca scores
#First, import the annotation file and assign it to a variable
annotations <- read_delim("Annotations.txt" ,delim = "\t")

## Rows: 364 Columns: 3
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: "\t"
## chr (3): Sample_ID, Collection, Species
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.

#Create new dataframe containing a first column with Sample_IDs and other columns containing the PCA scores
pca_annotated <- cbind(Sample_ID = row.names(pca$scores), as.data.frame(pca$scores))

#New dataframe and sample annotation are merged by column "Sample_ID"
pca_annotated <- merge(pca_annotated, annotations, by = "Sample_ID")

head(pca_annotated,10)

##                         Sample_ID        PC1        PC2     Collection
## 1      Anthonyi_wild_plot_BR_2255  3.5648253  2.1579548   Coffea spec.
## 2      Anthonyi_wild_plot_BR_2269  2.8939738  1.7854999   Coffea spec.
## 3    Arabica_wild_plot_Aerts_2256  2.1944611  0.7020742   Coffea spec.
## 4    Arabica_wild_plot_Aerts_2257  2.2710339  0.5288963   Coffea spec.
## 5      Brevipes_wild_plot_BR_2247  1.7361191  0.7595663   Coffea spec.
## 6      Brevipes_wild_plot_BR_2248  1.6395862  0.4302185   Coffea spec.
## 7  Canephora_wild_plot_INERA_2285  0.9503060  0.8456958 Unknown origin
## 8  Canephora_wild_plot_INERA_2288  0.4811003 -0.5887980 Unknown origin
## 9  Canephora_wild_plot_INERA_2289 -0.2860125 -0.9511718 Unknown origin
## 10 Canephora_wild_plot_INERA_2290 -0.2288216 -0.4981704 Unknown origin
##      Species
## 1   anthonyi
## 2   anthonyi
## 3    arabica
## 4    arabica
## 5   brevipes
## 6   brevipes
## 7  canephora
## 8  canephora
## 9  canephora
## 10 canephora

##############
#Plot the PCA
##############

#Set colours for species
pca_annotated$Species <- as.factor(pca_annotated$Species)

col <- setNames(c("#660000", "#333399", "#ACAD6A", "#33FF00", "#23BC2A", 
                  "#00FFFF", "#FF0099", "#FF9933", "#990066", "#660099", 
                  "#406261", "#99CCCC", "#53634E", "#FFCC00", "#993300",
                  "#996633", "#FFCCCC", "#9EC93C", "#F76C7B", "#0066FF", 
                  "#FF3300", "#666000", "#9093C8", "#3CBAC9", "#C99098"),
                levels(pca_annotated$Species))

col

##                anthonyi                 arabica                brevipes 
##               "#660000"               "#333399"               "#ACAD6A" 
##               canephora            charrieriana               congensis 
##               "#33FF00"               "#23BC2A"               "#00FFFF" 
##              congensis?                 dubardi            ebracteolata 
##               "#FF0099"               "#FF9933"               "#990066" 
##             eugenioides                 humilis                kapakata 
##               "#660099"               "#406261"               "#99CCCC" 
##               kivuensis lebrunianus/Psilanthus?                liberica 
##               "#53634E"               "#FFCC00"               "#993300" 
##            lulandoensis              macrocarpa                  pocsii 
##               "#996633"               "#FFCCCC"               "#9EC93C" 
##      pseudozanguebariae              Psilanthus                   Quilu 
##               "#F76C7B"               "#0066FF"               "#FF3300" 
##                racemosa               salvatrix            sessiliflora 
##               "#666000"               "#9093C8"               "#3CBAC9" 
##             stenophylla 
##               "#C99098"

#Set shapes for collection
pca_annotated$Collection <- as.factor(pca_annotated$Collection)

shapes <- setNames(c(3,17,15,16,4), levels(pca_annotated$Collection))

shapes

##        Coffea spec.      Unknown origin  Vietnam collection     Wild collection 
##                   3                  17                  15                  16 
## Yangambi collection 
##                   4

#Percentage of explained variance by PCA1 and 2 will be shown on the axes if he plot
#Value is rounded to 4 numbers and multiplied by 100
PC1_percent <- round(pca$eig[1]/sum(pca$eig),4) * 100
PC2_percent <- round(pca$eig[2]/sum(pca$eig),4) * 100

PC1_percent

## [1] 15.54

PC2_percent

## [1] 7.88

#Making the plot
#Set PC1 on x-axis and PC2 on y-axis. 
#Color and shape  are set to Species and Accession respectively
PCA_plot <- ggplot(pca_annotated,aes(x = PC1,y = PC2, color = Species, shape = Collection))+
  geom_point(alpha = 0.4, size = 3.5)+
  #limits to put graphs on the same scale as BCFtools (comment to disable)
  xlim(-3,5)+
  ylim(-2.01,4.5)+
  scale_color_manual(values = col)+
  scale_shape_manual(values = shapes)+
  #Set explained variance ratio on axis labels
  xlab(paste0("PC1 ", PC1_percent, "%"))+
  ylab(paste0("PC2 ", PC2_percent, "%"))+
  ggtitle('PCA using GATK')+
  guides(size = 12)+
  theme_bw()

PCA_plot

Create png files from the plots.

#Save plots as png files
#Set the dimensions of the PNG files
png_width <- 900
png_height <- 600

png("GATK.png", width = png_width, height = png_height)
PCA_plot
dev.off()

## png 
##   2