##########################
#Install and load packages
##########################
#####
#RENV
#####
##Download renv package if necessary (our version = 0.17.3)
#Get currently installed packages
pkg <- installed.packages()[,1]
#Check if renv is installed, else download it
if (!("renv" %in% pkg)) {
if (!requireNamespace("remotes", quietly = TRUE)) {
install.packages("remotes")
}
remotes::install_version("renv", version = "0.17.3")
print('Installed package: renv (v0.17.3)')
} else {
print('renv is already installed')
}
## [1] "renv is already installed"
#load renv package
library(renv)
#Set direcory to location renv lock file
setwd("/home/genomics/rolbrechts/00_Data/01_First_analysis/09_Core_collection/R markdown/")
renv::activate()
#Download packages from lockfile (answer yes in console)
renv::restore(packages = c('rJava','corehunter','ggplot2','ggrepel',
'beepr','patchwork','vcfR',
'adegenet','readr','tidyr','dplyr'))
## * The library is already synchronized with the lockfile.
##############
#Load packages
##############
library(rJava)
library(corehunter)
library(ggplot2)
library(ggrepel)
library(beepr)
library(patchwork)
library(vcfR)
library(adegenet)
library(readr)
library(tidyr)
library(dplyr)
##########
#Read data
##########
### genotype file
genoo <- genotypes(file = "Core_hunter_subset.txt",
format = "biparental")
Initialize Core Hunter data:
my.data <- coreHunterData(genoo)
#Check the object
my.data
## Core Hunter data
## ----------------
##
## Number of accessions = 275
## Ids: chr [1:275] "G0001" "G0002" "G0003" "G0004" "G0005" "G0006" "G0007" ...
##
## # Genotypes
##
## Format = biparental
##
## Number of markers = 304
## Marker names: chr [1:304] "mk0001" "mk0002" "mk0003" "mk0004" "mk0005" "mk0006" "mk0007" ...
## Number of alleles per marker = 2
##
## Read from file: "/home/genomics/rolbrechts/00_Data/01_First_analysis/09_Core_collection/R markdown/Core_hunter_subset.txt"
The first function requires the core hunter data, a vector of collection sizes and the core collection method, either ‘CC-I’ or ‘CC-X’. It will create a core collection for each size and evaluate their Shannon diversity, Expected heterozygosity, Allele coverage, and Average accession-to-nearest-entry distance (CC-I) or Average entry-to-nearest-entry distance (CC-X). The evaluation of all sizes is returned as a data frame.
eval_CC <- function(data,size_vect,method) {
#Fixate seed
set.seed(100)
#Create empty dataframe
scores <- data.frame()
#Loop over desired collection sizes
for (size in size_vect) {
#If the user specified CC-I:
if (method == 'CC-I') {
#Create core collection with desired size
core <- sampleCore(my.data, objective("AN", "MR"), size = size, steps = 500)
#Evaluate core collection
AN_MR <- evaluateCore(core, data, objective("AN", "MR"))
CV <- evaluateCore(core, data, objective("CV"))
SH <- evaluateCore(core, data, objective("SH"))
HE <- evaluateCore(core, data, objective("HE"))
#Put evaluation scores and collection size in a dataframe
score <- data.frame('Size' = length(core$sel), 'SH' = SH,
'HE' = HE, 'CV' = CV, 'AN_MR' = AN_MR)
#If the user specified CC-X:
} else if (method == 'CC-X') {
#Create core collection with desired size
core <- sampleCore(my.data, objective("EN", "MR"), size = size, steps = 500)
#Evaluate core collection
EN_MR <- evaluateCore(core, data, objective("EN", "MR"))
CV <- evaluateCore(core, data, objective("CV"))
SH <- evaluateCore(core, data, objective("SH"))
HE <- evaluateCore(core, data, objective("HE"))
#Put evaluation scores and collection size in a dataframe
score <- data.frame('Size' = length(core$sel), 'SH' = SH,
'HE' = HE, 'CV' = CV, 'EN_MR' = EN_MR)
#Error message should there be a wrong argument
} else {
print('Parse a valid method as function argument ("CC-I" or "CC-X")')
break
}
#Add dataframe of each iteration into global dataframe
scores <- rbind(scores,score)
}
#optionally add beep when function is finished
beep(sound = 5)
Sys.sleep(5)
#Return global dataframe with all scores for all collection sizes
return(scores)
}
The second function takes the evaluation data frame and plots the four evaluation parameters.
eval_plot <- function(scores_dataframe,
cols = c('skyblue',
'firebrick',
'forestgreen',
'orange'),
sub_title = c('Shannon diversity',
'Expected heterozygosity',
'Allele coverage',
'Average accession-to-nearest-entry distance')
) {
#First check if the cols and sub_title vectors have at least 4 values:
if (length(cols) < 4 || length(sub_title) < 4) {
stop('The inputted vectors are not of correct length (4)')
}
#Make an empty list to store the plots in
plots <- list()
#Make counter to pick different color and subtitle each iteration
count <- 1
#Create a plot for each of the evaluation parameters
#These are the column names of the scores_dataframe minus the first column
for (eval in colnames(scores_dataframe)[-1]) {
#Set the substitle of the graph
if (eval == 'EN_MR') {
#If CC-X => overwrite sub_title of AN_MR
subtitle <- "Average entry-to-nearest-entry distance"
} else {
#Set title of subgraph
subtitle <-sub_title[count]
}
#Create the scatter/line plot for each evaluation parameter
#Set collection size as y-axis
plot <- ggplot(scores_dataframe, aes(x = Size)) +
#Line plot
geom_line(aes(y = .data[[eval]], color = eval), linewidth = 0.5) +
#Also add scatterplot
geom_point(aes(y = .data[[eval]], color = eval), size = 1.2) +
#Set colours to the lines
scale_color_manual(values = cols[count]) +
#Set x-axis breaks
scale_x_continuous(breaks = seq(min(scores_dataframe$Size),
max(scores_dataframe$Size), by = 10)) +
#Set x-axis label
labs(x = "Collection size", color = "Variable") +
#Set subplot title
ggtitle(subtitle) +
theme_bw() +
#Remove legend
theme(legend.position = "none",
#Center subtitle
plot.title = element_text(color = cols[count], size=12, hjust = 0.5),
#Remove y axis title
axis.title.y = element_blank())
#Add the plot to the plot list
plots[[eval]] <- plot
#Increase the counter
count <- count +1
}
#Set main plot title
if (colnames(scores_dataframe)[5] == "AN_MR") {
plot_title = 'CC-I Evaluation'
} else {
plot_title = 'CC-X Evaluation'
}
#combine the four plots in a 2 by 2 layout
combined_plot <- wrap_plots(plots, ncol = 2) +
#Set main title for combined plot
plot_annotation(title = plot_title,
#Center main title
theme = theme(plot.title = element_text(hjust = 0.5)))
return(combined_plot)
}
Define the core collection sizes you want to test. Set the step to 1 to test all sizes. This will guarantee no local optima are missed but will of course take longer to calculate. Another option is to first set a high step and then zoom in to a smaller size range.
#Define the step
step <- 1
#Set desired sizes for the analysis
#2 is minimal size, total samples - 1 is maximum size
sizes <- seq(2,(length(genoo$ids)-1),step)
Execute the evaluation function and check the result.
global_scores_I <- eval_CC(my.data,sizes,'CC-I')
head(global_scores_I, 10)
## Size SH HE CV AN_MR
## 1 2 5.842472 0.08675987 0.6019737 0.2158707
## 2 3 5.878271 0.10782164 0.6463816 0.2107435
## 3 4 5.893299 0.11626234 0.6694079 0.2063297
## 4 5 5.897828 0.11763158 0.6842105 0.2025582
## 5 6 5.895070 0.11408077 0.6924342 0.1992520
## 6 7 5.997558 0.17313373 0.8404605 0.1964048
## 7 8 5.990417 0.16748047 0.8470395 0.1939297
## 8 9 5.989049 0.16630117 0.8536184 0.1915832
## 9 10 5.989966 0.16564145 0.8667763 0.1893906
## 10 11 5.990713 0.16528926 0.8782895 0.1874005
Create the plot. The default colors and subtitles from the function are used.
CCI_eval_plot <- eval_plot(global_scores_I)
We chose the collection size corresponding to the local Shannon diversity and expected heterozygosity peak around size 40 (see tab with evaluation plots). We determined this peak is present at size 42 by checking the data frame.
#Pick your preferred core collection size after reviewing plot and scores dataframe
size_I <- 42
#Create core collection with that size
CCI_best <- sampleCore(my.data, objective("AN", "MR"), size = size_I, steps = 500)
#Check which samples are present in this collection with cross reference file
cross_ref <- read.table('Core_hunter_subset_cross.txt', header = T)
#Merge cross ref with the samples of the collection
collection_samples_I <- merge(cross_ref, data.frame('Core_ID' = CCI_best$sel),
by = 'Core_ID')
#Samples present in the core collection are in following variable:
collection_samples_I$vcf_ID
## [1] "Psilanthus_wild_plot_INERA_2283"
## [2] "Psilanthus/lebrunianus_wid_plot_INERA_2284"
## [3] "Canephora_wild_plot_INERA_2285"
## [4] "Petitquilu_wild_plot_INERA_2287"
## [5] "Canephora_wild_plot_INERA_2291"
## [6] "Canephora_wild_plot_INERA_2293"
## [7] "Collection_G0002FOG_2608"
## [8] "Collection_G0003FOG_2612"
## [9] "Collection_G0014FOG_1979"
## [10] "Collection_G0019FOG_2654"
## [11] "Collection_G0028FOG_1994"
## [12] "Collection_G0029FOG_2257"
## [13] "Collection_G0036FOG_2072"
## [14] "Collection_G0037FOG_2260"
## [15] "Collection_G0040FOG_2007"
## [16] "Collection_G0053FOG_2021"
## [17] "Collection_G0067FOG_2036"
## [18] "Collection_G0073FOG_2267"
## [19] "Collection_G0074FOG_2268"
## [20] "Collection_G0082FOG_2271"
## [21] "Collection_G0099FOG_2479"
## [22] "Collection_G0101FOG_2077"
## [23] "Collection_G0103FOG_2604"
## [24] "Collection_G0105FOG_2676"
## [25] "Collection_G0106FOG_2617"
## [26] "Collection_G0109FOG_2279"
## [27] "Collection_G0114FOG_2285"
## [28] "Collection_G0117FOG_3010"
## [29] "Collection_G0122FOG_2630"
## [30] "Collection_G0127FOG_3075"
## [31] "Collection_G0135FOG_2830"
## [32] "Collection_G0164FOG_3177"
## [33] "Collection_G0177FOG_3201"
## [34] "Collection_G0210FOG_2841"
## [35] "Collection_G0214FOG_2832"
## [36] "Collection_G0222FOG_2834"
## [37] "Collection_G0225FOG_2930"
## [38] "Collection_G0233FOG_3179"
## [39] "Collection_G0245FOG_2940"
## [40] "Collection_G0246FOG_3220"
## [41] "Collection_G0252FOG_1473"
The same workflow is used for the CC-X core collection. Be sure to provide ‘CC-X’ instead of ‘CC-I’ to the ‘eval_CC’ function.
global_scores_X <- eval_CC(my.data,sizes,'CC-X')
head(global_scores_X,10)
## Size SH HE CV EN_MR
## 1 2 6.027199 0.2162829 0.7467105 0.5575440
## 2 3 6.057567 0.2282529 0.8059211 0.4392847
## 3 4 6.043966 0.2122738 0.8240132 0.4090379
## 4 5 6.041549 0.2073026 0.8421053 0.3775362
## 5 6 6.045328 0.2051352 0.8750000 0.3588874
## 6 7 6.029655 0.1943810 0.8684211 0.3394213
## 7 8 6.030871 0.1930253 0.8898026 0.3319609
## 8 9 6.031393 0.1906067 0.9095395 0.3244432
## 9 10 6.023291 0.1851151 0.9144737 0.3179467
## 10 11 6.019668 0.1826066 0.9194079 0.3123415
Create the plot:
CCX_eval_plot <- eval_plot(global_scores_X)
The CC-X collection size should be smaller than CC-I because CC-X contains collection extremes. We set a minimal allele coverage threshold of 95% for the collection because we also don’t want to lose too much genetic information. The smallest collection size that meets this criterion is 28.
#Pick your preferred core collection size after reviewing plot and scores dataframe
#Less obvious in this graph => choose some sort of cut-off
#E.g. smallest collection size with Allele coverage >= 0.95
CV_treshold_X <- 0.95
#Find the smallest collection size above the CV threshold
size_X <- min(global_scores_X[global_scores_X$CV >= CV_treshold_X,]$Size)
size_X
## [1] 28
#Create core collection with that size
CCX_best <- sampleCore(my.data, objective("EN", "MR"), size = size_X, steps = 500)
#Check which samples are present in this collection with cross reference file
#Merge cross ref with the samples of the collection
collection_samples_X <- merge(cross_ref, data.frame('Core_ID' = CCX_best$sel),
by = 'Core_ID')
#Samples present in the core collection are in following variable:
collection_samples_X$vcf_ID
## [1] "Psilanthus_wild_plot_INERA_2283" "Collection_G0004FOG_1968"
## [3] "Collection_G0006FOG_2607" "Collection_G0036FOG_2072"
## [5] "Collection_G0041FOG_2274" "Collection_G0049FOG_2017"
## [7] "Collection_G0058FOG_2027" "Collection_G0061FOG_2030"
## [9] "Collection_G0066FOG_2035" "Collection_G0081FOG_2052"
## [11] "Collection_G0083FOG_2272" "Collection_G0089FOG_2060"
## [13] "Collection_G0092FOG_2063" "Collection_G0113FOG_2284"
## [15] "Collection_G0149FOG_3160" "Collection_G0151FOG_3155"
## [17] "Collection_G0153FOG_3164" "Collection_G0166FOG_2657"
## [19] "Collection_G0175FOG_2645" "Collection_G0191FOG_2741"
## [21] "Collection_G0207FOG_2820" "Collection_G0209FOG_2831"
## [23] "Collection_G0223FOG_2845" "Collection_G0230FOG_2859"
## [25] "Collection_G0249FOG_2787" "Collection_G0251FOG_1472"
## [27] "Collection_G0255FOG_1494" "Collection_G0256FOG_1498"
The PCA is performed by reading in the original vcf file, which is then converted to a Genlight object.
#############
#PCA analysis
#############
#Set read vcf file
vcf <- read.vcfR("INERA_subset_no_psilanthus_outlier.vcf")
## Scanning file to determine attributes.
## File attributes:
## meta lines: 42
## header_line: 43
## variant count: 464
## column count: 284
##
Meta line 42 read in.
## All meta lines processed.
## gt matrix initialized.
## Character matrix gt created.
## Character matrix gt rows: 464
## Character matrix gt cols: 284
## skip: 0
## nrows: 464
## row_num: 0
##
Processed variant: 464
## All variants processed
#Convert vcf to Genlight object
gen <- vcfR2genlight(vcf)
#Perform a PCA (only 2 components needed for the plot)
no_components <- 2
pca <- glPca(gen, nf=no_components)
#Check the PCA
plot(pca$scores)
#Check the proportion of variance explained by:
#First axis
PC1 <- round(pca$eig[1]/sum(pca$eig),4) * 100
#Second axis
PC2 <- round(pca$eig[2]/sum(pca$eig),4) * 100
#############
#PCA plotting
#############
#Create dataframe stating presence or abscence in core collection per sample
Collection <- data.frame(Sample_ID = cross_ref[seq(1,genoo$size),]$vcf_ID)
#Then add 2nd columns with presence or absence by
#Check if sample_ID is present in the created core collection variables
Collection <- Collection %>%
mutate(Collection = case_when(
#Sample only present in CC-I?
Sample_ID %in% collection_samples_I$vcf_ID & !( Sample_ID %in% collection_samples_X$vcf_ID) ~ "CC-I",
#Sample only present in CC-X?
Sample_ID %in% collection_samples_X$vcf_ID & !(Sample_ID %in% collection_samples_I$vcf_ID) ~ "CC-X",
#Sample present in both collections?
Sample_ID %in% collection_samples_X$vcf_ID & Sample_ID %in% collection_samples_I$vcf_ID ~'CC-I and CC-X',
#Remaining samples
TRUE ~ 'Non-core'))
#Create new dataframe containing first column with Sample_IDs and other columns containing PCA scores
pca_final <- cbind(Sample_ID = row.names(pca$scores), as.data.frame(pca$scores))
#pca_final and collection dataframes are merged by column "Sample_ID"
pca_final <- merge(pca_final, Collection, by = "Sample_ID")
#Set colours for collection
#The order of the levels determines the order of the plot legend
pca_final$Collection <- factor(pca_final$Collection,
levels = c('Non-core', "CC-I", "CC-X", 'CC-I and CC-X'))
col <- setNames(c("#33FF00", "#FF0099", "#0066FF","#00FFFF"),
levels(pca_final$Collection))
#Create the PCA plot
#(uncomment the geom repel code to add labels to samples in a collection)
Plot_PCA_Collection <- ggplot(pca_final,
aes(x = PC1,y = PC2, color = Collection))+
geom_point(alpha = 0.4, size = 5)+
#Optionally add labels to samples in core collections
# geom_text_repel(data = pca_final[pca_final$Collection != "Non-core",],
# aes(x = PC1, y = PC2, color = Collection,
# label = Sample_ID),
# max.overlaps = 50,
# #Adjust label size
# size = 3
#) +
scale_color_manual(values = col,
#Change legend labels to also include collection sizes
labels = c(paste0(levels(pca_final$Collection)[1]," (n = ",
sum(Collection$Collection == 'Non-core'), ')'),
paste0(levels(pca_final$Collection)[2]," (n = ", sum(Collection$Collection == 'CC-I'), ')'),
paste0(levels(pca_final$Collection)[3]," (n = ", sum(Collection$Collection == 'CC-X'), ')'),
paste0(levels(pca_final$Collection)[4]," (n = ", sum(Collection$Collection == 'CC-I and CC-X'), ')')))+
#Add precalculated variance ratios to the axes
xlab(paste0("PC1 ", PC1, "%"))+
ylab(paste0("PC2 ", PC2, "%"))+
ggtitle('PCA INERA Core collection')+
guides()+
theme_bw()+
#Center title
theme(plot.title = element_text(hjust=0.5))
Create png files from the plots.
#Save plots as png files
#Set the dimensions of the PNG files
png_width <- 900
png_height <- 600
png("CC-I.png", width = png_width, height = png_height)
CCI_eval_plot
dev.off()
png("CC-X.png", width = png_width, height = png_height)
CCX_eval_plot
dev.off()
png("Core_PCA.png", width = png_width, height = png_height)
Plot_PCA_Collection
dev.off()