Core_collector.knit

##########################
#Install and load packages
##########################

#####
#RENV
#####

##Download renv package if necessary (our version = 0.17.3)
#Get currently installed packages
pkg <- installed.packages()[,1] 

#Check if renv is installed, else download it
if (!("renv" %in% pkg)) {
  if (!requireNamespace("remotes", quietly = TRUE)) {
    install.packages("remotes")
  }
  remotes::install_version("renv", version = "0.17.3")
  print('Installed package: renv (v0.17.3)')
} else {
  print('renv is already installed')
}

## [1] "renv is already installed"

#load renv package
library(renv)
#Set direcory to location renv lock file
setwd("/home/genomics/rolbrechts/00_Data/01_First_analysis/09_Core_collection/R markdown/")
renv::activate()
#Download packages from lockfile (answer yes in console)
renv::restore(packages = c('rJava','corehunter','ggplot2','ggrepel',
                           'beepr','patchwork','vcfR',
                           'adegenet','readr','tidyr','dplyr'))

## * The library is already synchronized with the lockfile.

##############
#Load packages
##############

library(rJava)
library(corehunter)
library(ggplot2)
library(ggrepel)
library(beepr)
library(patchwork)
library(vcfR)
library(adegenet)
library(readr)
library(tidyr)
library(dplyr)

##########
#Read data
##########

### genotype file
genoo <- genotypes(file = "Core_hunter_subset.txt", 
                   format = "biparental")

Initialize Core Hunter data:

my.data <- coreHunterData(genoo)

#Check the object
my.data

## Core Hunter data
## ----------------
## 
## Number of accessions = 275 
## Ids: chr [1:275] "G0001" "G0002" "G0003" "G0004" "G0005" "G0006" "G0007" ...
## 
## # Genotypes
## 
## Format = biparental
## 
## Number of markers = 304
## Marker names: chr [1:304] "mk0001" "mk0002" "mk0003" "mk0004" "mk0005" "mk0006" "mk0007" ...
## Number of alleles per marker = 2
## 
## Read from file: "/home/genomics/rolbrechts/00_Data/01_First_analysis/09_Core_collection/R markdown/Core_hunter_subset.txt"

Defining functions

The first function requires the core hunter data, a vector of collection sizes and the core collection method, either ‘CC-I’ or ‘CC-X’. It will create a core collection for each size and evaluate their Shannon diversity, Expected heterozygosity, Allele coverage, and Average accession-to-nearest-entry distance (CC-I) or Average entry-to-nearest-entry distance (CC-X). The evaluation of all sizes is returned as a data frame.

eval_CC <- function(data,size_vect,method) {
  
  #Fixate seed
  set.seed(100)
  
  #Create empty dataframe
  scores <- data.frame()
  
  #Loop over desired collection sizes
  for (size in size_vect) {
    
    #If the user specified CC-I:
    if (method == 'CC-I') {
      
      #Create core collection with desired size
      core <- sampleCore(my.data, objective("AN", "MR"), size = size, steps = 500)
      
      #Evaluate core collection
      AN_MR <- evaluateCore(core, data, objective("AN", "MR"))
      CV <- evaluateCore(core, data, objective("CV"))
      SH <- evaluateCore(core, data, objective("SH"))
      HE <- evaluateCore(core, data, objective("HE"))
      
      #Put evaluation scores and collection size in a dataframe
      score <- data.frame('Size' = length(core$sel), 'SH' = SH,
                          'HE' = HE, 'CV' = CV, 'AN_MR' = AN_MR)
      
      #If the user specified CC-X:  
    } else  if (method == 'CC-X') {
      
        #Create core collection with desired size
        core <- sampleCore(my.data, objective("EN", "MR"), size = size, steps = 500)
        
        #Evaluate core collection
        EN_MR <- evaluateCore(core, data, objective("EN", "MR"))
        CV <- evaluateCore(core, data, objective("CV"))
        SH <- evaluateCore(core, data, objective("SH"))
        HE <- evaluateCore(core, data, objective("HE"))
        
        #Put evaluation scores and collection size in a dataframe
        score <- data.frame('Size' = length(core$sel), 'SH' = SH,
                            'HE' = HE, 'CV' = CV, 'EN_MR' = EN_MR)
      
        #Error message should there be a wrong argument
    } else {
      
      print('Parse a valid method as function argument ("CC-I" or "CC-X")')
      break
      
    }
    
    #Add dataframe of each iteration into global dataframe
    scores <- rbind(scores,score)
    
  }
  
  #optionally add beep when function is finished
  beep(sound = 5)
  Sys.sleep(5)
  
  #Return global dataframe with all scores for all collection sizes
  return(scores)
  
}

The second function takes the evaluation data frame and plots the four evaluation parameters.

eval_plot <- function(scores_dataframe, 
                           cols = c('skyblue',
                                    'firebrick',
                                    'forestgreen',
                                    'orange'),
                           sub_title = c('Shannon diversity',
                                        'Expected heterozygosity',
                                        'Allele coverage',
                                        'Average accession-to-nearest-entry distance')
                           ) {
  
  #First check if the cols and sub_title vectors have at least 4 values:
  if (length(cols) < 4 || length(sub_title) < 4) {
    
    stop('The inputted vectors are not of correct length (4)')
    
  }
  
  #Make an empty list to store the plots in
  plots <- list()
  
  #Make counter to pick different color and subtitle each iteration
  count <- 1
  
  #Create a plot for each of the evaluation parameters
  #These are the column names of the scores_dataframe minus the first column
  for (eval in colnames(scores_dataframe)[-1]) {
    
    #Set the substitle of the graph
    if (eval == 'EN_MR') {
      #If CC-X => overwrite sub_title of AN_MR
      subtitle <- "Average entry-to-nearest-entry distance"
    } else {
      #Set title of subgraph
      subtitle <-sub_title[count]
    }
    
    #Create the scatter/line plot for each evaluation parameter
    #Set collection size as y-axis
    plot <- ggplot(scores_dataframe, aes(x = Size)) +
      #Line plot
      geom_line(aes(y = .data[[eval]], color = eval), linewidth = 0.5) +
      #Also add scatterplot 
      geom_point(aes(y = .data[[eval]], color = eval), size = 1.2) +
      #Set colours to the lines
      scale_color_manual(values = cols[count]) +
      #Set x-axis breaks
      scale_x_continuous(breaks = seq(min(scores_dataframe$Size), 
                                      max(scores_dataframe$Size), by = 10)) +
      #Set x-axis label
      labs(x = "Collection size", color = "Variable") +
      #Set subplot title
      ggtitle(subtitle) +
      theme_bw() +
      #Remove legend
      theme(legend.position = "none", 
            #Center subtitle
            plot.title = element_text(color = cols[count], size=12, hjust = 0.5),
            #Remove y axis title
            axis.title.y = element_blank())
    
    #Add the plot to the plot list
    plots[[eval]] <- plot
    
    #Increase the counter
    count <- count +1
  }
  
  #Set main plot title
  if (colnames(scores_dataframe)[5] == "AN_MR") {

    plot_title = 'CC-I Evaluation'
  
  } else {
    
    plot_title = 'CC-X Evaluation'
    
  }
 
  #combine the four plots in a 2 by 2 layout
  combined_plot <- wrap_plots(plots, ncol = 2) + 
    #Set main title for combined plot
    plot_annotation(title = plot_title, 
                    #Center main title
                    theme = theme(plot.title = element_text(hjust = 0.5)))
  
  return(combined_plot)
  
}

CC-I core collection

Define the core collection sizes you want to test. Set the step to 1 to test all sizes. This will guarantee no local optima are missed but will of course take longer to calculate. Another option is to first set a high step and then zoom in to a smaller size range.

#Define the step 
step <- 1 

#Set desired sizes for the analysis
#2 is minimal size, total samples - 1 is maximum size
sizes <- seq(2,(length(genoo$ids)-1),step)

Execute the evaluation function and check the result.

global_scores_I <- eval_CC(my.data,sizes,'CC-I')
head(global_scores_I, 10)

##    Size       SH         HE        CV     AN_MR
## 1     2 5.842472 0.08675987 0.6019737 0.2158707
## 2     3 5.878271 0.10782164 0.6463816 0.2107435
## 3     4 5.893299 0.11626234 0.6694079 0.2063297
## 4     5 5.897828 0.11763158 0.6842105 0.2025582
## 5     6 5.895070 0.11408077 0.6924342 0.1992520
## 6     7 5.997558 0.17313373 0.8404605 0.1964048
## 7     8 5.990417 0.16748047 0.8470395 0.1939297
## 8     9 5.989049 0.16630117 0.8536184 0.1915832
## 9    10 5.989966 0.16564145 0.8667763 0.1893906
## 10   11 5.990713 0.16528926 0.8782895 0.1874005

Create the plot. The default colors and subtitles from the function are used.

CCI_eval_plot <- eval_plot(global_scores_I)

We chose the collection size corresponding to the local Shannon diversity and expected heterozygosity peak around size 40 (see tab with evaluation plots). We determined this peak is present at size 42 by checking the data frame.

#Pick your preferred core collection size after reviewing plot and scores dataframe
size_I <- 42

#Create core collection with that size
CCI_best <- sampleCore(my.data, objective("AN", "MR"), size = size_I, steps = 500)

#Check which samples are present in this collection with cross reference file
cross_ref <- read.table('Core_hunter_subset_cross.txt', header = T)

#Merge cross ref with the samples of the collection
collection_samples_I <- merge(cross_ref, data.frame('Core_ID' = CCI_best$sel), 
                              by = 'Core_ID')

#Samples present in the core collection are in following variable:
collection_samples_I$vcf_ID

##  [1] "Psilanthus_wild_plot_INERA_2283"           
##  [2] "Psilanthus/lebrunianus_wid_plot_INERA_2284"
##  [3] "Canephora_wild_plot_INERA_2285"            
##  [4] "Petitquilu_wild_plot_INERA_2287"           
##  [5] "Canephora_wild_plot_INERA_2291"            
##  [6] "Canephora_wild_plot_INERA_2293"            
##  [7] "Collection_G0002FOG_2608"                  
##  [8] "Collection_G0003FOG_2612"                  
##  [9] "Collection_G0014FOG_1979"                  
## [10] "Collection_G0019FOG_2654"                  
## [11] "Collection_G0028FOG_1994"                  
## [12] "Collection_G0029FOG_2257"                  
## [13] "Collection_G0036FOG_2072"                  
## [14] "Collection_G0037FOG_2260"                  
## [15] "Collection_G0040FOG_2007"                  
## [16] "Collection_G0053FOG_2021"                  
## [17] "Collection_G0067FOG_2036"                  
## [18] "Collection_G0073FOG_2267"                  
## [19] "Collection_G0074FOG_2268"                  
## [20] "Collection_G0082FOG_2271"                  
## [21] "Collection_G0099FOG_2479"                  
## [22] "Collection_G0101FOG_2077"                  
## [23] "Collection_G0103FOG_2604"                  
## [24] "Collection_G0105FOG_2676"                  
## [25] "Collection_G0106FOG_2617"                  
## [26] "Collection_G0109FOG_2279"                  
## [27] "Collection_G0114FOG_2285"                  
## [28] "Collection_G0117FOG_3010"                  
## [29] "Collection_G0122FOG_2630"                  
## [30] "Collection_G0127FOG_3075"                  
## [31] "Collection_G0135FOG_2830"                  
## [32] "Collection_G0164FOG_3177"                  
## [33] "Collection_G0177FOG_3201"                  
## [34] "Collection_G0210FOG_2841"                  
## [35] "Collection_G0214FOG_2832"                  
## [36] "Collection_G0222FOG_2834"                  
## [37] "Collection_G0225FOG_2930"                  
## [38] "Collection_G0233FOG_3179"                  
## [39] "Collection_G0245FOG_2940"                  
## [40] "Collection_G0246FOG_3220"                  
## [41] "Collection_G0252FOG_1473"

CC-X core collection

The same workflow is used for the CC-X core collection. Be sure to provide ‘CC-X’ instead of ‘CC-I’ to the ‘eval_CC’ function.

global_scores_X <- eval_CC(my.data,sizes,'CC-X')
head(global_scores_X,10)

##    Size       SH        HE        CV     EN_MR
## 1     2 6.027199 0.2162829 0.7467105 0.5575440
## 2     3 6.057567 0.2282529 0.8059211 0.4392847
## 3     4 6.043966 0.2122738 0.8240132 0.4090379
## 4     5 6.041549 0.2073026 0.8421053 0.3775362
## 5     6 6.045328 0.2051352 0.8750000 0.3588874
## 6     7 6.029655 0.1943810 0.8684211 0.3394213
## 7     8 6.030871 0.1930253 0.8898026 0.3319609
## 8     9 6.031393 0.1906067 0.9095395 0.3244432
## 9    10 6.023291 0.1851151 0.9144737 0.3179467
## 10   11 6.019668 0.1826066 0.9194079 0.3123415

Create the plot:

CCX_eval_plot <- eval_plot(global_scores_X)

The CC-X collection size should be smaller than CC-I because CC-X contains collection extremes. We set a minimal allele coverage threshold of 95% for the collection because we also don’t want to lose too much genetic information. The smallest collection size that meets this criterion is 28.

#Pick your preferred core collection size after reviewing plot and scores dataframe
#Less obvious in this graph => choose some sort of cut-off
#E.g. smallest collection size with Allele coverage >= 0.95
CV_treshold_X <- 0.95

#Find the smallest collection size above the CV threshold 
size_X <- min(global_scores_X[global_scores_X$CV >= CV_treshold_X,]$Size)
size_X

## [1] 28

#Create core collection with that size
CCX_best <- sampleCore(my.data, objective("EN", "MR"), size = size_X, steps = 500)

#Check which samples are present in this collection with cross reference file
#Merge cross ref with the samples of the collection
collection_samples_X <- merge(cross_ref, data.frame('Core_ID' = CCX_best$sel),
                              by = 'Core_ID')

#Samples present in the core collection are in following variable:
collection_samples_X$vcf_ID

##  [1] "Psilanthus_wild_plot_INERA_2283" "Collection_G0004FOG_1968"       
##  [3] "Collection_G0006FOG_2607"        "Collection_G0036FOG_2072"       
##  [5] "Collection_G0041FOG_2274"        "Collection_G0049FOG_2017"       
##  [7] "Collection_G0058FOG_2027"        "Collection_G0061FOG_2030"       
##  [9] "Collection_G0066FOG_2035"        "Collection_G0081FOG_2052"       
## [11] "Collection_G0083FOG_2272"        "Collection_G0089FOG_2060"       
## [13] "Collection_G0092FOG_2063"        "Collection_G0113FOG_2284"       
## [15] "Collection_G0149FOG_3160"        "Collection_G0151FOG_3155"       
## [17] "Collection_G0153FOG_3164"        "Collection_G0166FOG_2657"       
## [19] "Collection_G0175FOG_2645"        "Collection_G0191FOG_2741"       
## [21] "Collection_G0207FOG_2820"        "Collection_G0209FOG_2831"       
## [23] "Collection_G0223FOG_2845"        "Collection_G0230FOG_2859"       
## [25] "Collection_G0249FOG_2787"        "Collection_G0251FOG_1472"       
## [27] "Collection_G0255FOG_1494"        "Collection_G0256FOG_1498"

Visualisation of core collection samples using PCA

The PCA is performed by reading in the original vcf file, which is then converted to a Genlight object.

#############
#PCA analysis
#############

#Set read vcf file
vcf <- read.vcfR("INERA_subset_no_psilanthus_outlier.vcf")

## Scanning file to determine attributes.
## File attributes:
##   meta lines: 42
##   header_line: 43
##   variant count: 464
##   column count: 284
## 
Meta line 42 read in.
## All meta lines processed.
## gt matrix initialized.
## Character matrix gt created.
##   Character matrix gt rows: 464
##   Character matrix gt cols: 284
##   skip: 0
##   nrows: 464
##   row_num: 0
## 
Processed variant: 464
## All variants processed

#Convert vcf to Genlight object
gen <- vcfR2genlight(vcf)

#Perform a PCA (only 2 components needed for the plot)
no_components <- 2
pca <- glPca(gen, nf=no_components)

#Check the PCA
plot(pca$scores)

#Check the proportion of variance explained by:
#First axis
PC1 <- round(pca$eig[1]/sum(pca$eig),4) * 100

#Second axis
PC2 <- round(pca$eig[2]/sum(pca$eig),4) * 100

#############
#PCA plotting
#############

#Create dataframe stating presence or abscence in core collection per sample
Collection <- data.frame(Sample_ID = cross_ref[seq(1,genoo$size),]$vcf_ID)

#Then add 2nd columns with presence or absence by
#Check if sample_ID is present in the created core collection variables 
Collection <- Collection %>%
mutate(Collection = case_when(
  #Sample only present in CC-I?
  Sample_ID %in% collection_samples_I$vcf_ID & !( Sample_ID %in% collection_samples_X$vcf_ID) ~ "CC-I",
  #Sample only present in CC-X?
  Sample_ID %in% collection_samples_X$vcf_ID & !(Sample_ID %in% collection_samples_I$vcf_ID) ~ "CC-X",
  #Sample present in both collections?
  Sample_ID %in% collection_samples_X$vcf_ID & Sample_ID %in% collection_samples_I$vcf_ID ~'CC-I and CC-X',
  #Remaining samples
  TRUE ~ 'Non-core'))

#Create new dataframe containing first column with Sample_IDs and other columns containing PCA scores
pca_final <- cbind(Sample_ID = row.names(pca$scores), as.data.frame(pca$scores))

#pca_final and collection dataframes are merged by column "Sample_ID"
pca_final <- merge(pca_final, Collection, by = "Sample_ID")

#Set colours for collection
#The order of the levels determines the order of the plot legend
pca_final$Collection <- factor(pca_final$Collection, 
                               levels = c('Non-core', "CC-I", "CC-X", 'CC-I and CC-X'))

col <- setNames(c("#33FF00", "#FF0099", "#0066FF","#00FFFF"), 
                levels(pca_final$Collection))

#Create the PCA plot
#(uncomment the geom repel code to add labels to samples in a collection)
Plot_PCA_Collection <- ggplot(pca_final,
                              aes(x = PC1,y = PC2, color = Collection))+
  geom_point(alpha = 0.4, size = 5)+
  #Optionally add labels to samples in core collections
  # geom_text_repel(data = pca_final[pca_final$Collection != "Non-core",],
  #                 aes(x = PC1, y = PC2, color = Collection,
  #                     label = Sample_ID),
  #                 max.overlaps = 50,
  #                 #Adjust label size
  #                 size = 3
  #) +
  scale_color_manual(values = col, 
                     #Change legend labels to also include collection sizes
                     labels = c(paste0(levels(pca_final$Collection)[1]," (n = ", 
                                       sum(Collection$Collection == 'Non-core'), ')'),
                                paste0(levels(pca_final$Collection)[2]," (n = ", sum(Collection$Collection == 'CC-I'), ')'),
                                paste0(levels(pca_final$Collection)[3]," (n = ", sum(Collection$Collection == 'CC-X'), ')'),
                                paste0(levels(pca_final$Collection)[4]," (n = ", sum(Collection$Collection == 'CC-I and CC-X'), ')')))+
  #Add precalculated variance ratios to the axes
  xlab(paste0("PC1 ", PC1, "%"))+
  ylab(paste0("PC2 ", PC2, "%"))+
  ggtitle('PCA INERA Core collection')+
  guides()+
  theme_bw()+
  #Center title
  theme(plot.title = element_text(hjust=0.5))

Create png files from the plots.

#Save plots as png files
#Set the dimensions of the PNG files
png_width <- 900
png_height <- 600

png("CC-I.png", width = png_width, height = png_height)
CCI_eval_plot
dev.off()

png("CC-X.png", width = png_width, height = png_height)
CCX_eval_plot
dev.off()

png("Core_PCA.png", width = png_width, height = png_height)
Plot_PCA_Collection
dev.off()