Chapitre 5 Statistiques

Importer les libraires que nous devons utiliser

library(vegan)
library(tidyverse)
library(devtools)
library(forcats) # To reorder our factors (function "fct_relevel")
library(dplyr) # Dataframe manipulation (notably function "pull")
library(tidyr) # Dataframe manipulation (function "separate")
library(phyloseq) # Very pratical library for the analysis of amplicon data 
library(randomcoloR) # Generate sets of random colors
library(ggplot2) # Generate plots 
library(stringr) # Makes working with strings as easy as possible (function "str_replace")
library(ggtext) # Allows the use of markdown text (used to make names italic)
library(ggpubr)

5.1 Thème pour les graphiques

Je génère mon propre thème pour les différents graphiques à générer dans lequel je spécifie par exemple la taille du texte, la taille des lignes des axes, etc…

text_size = 12 
custom_theme = function(){ 
  theme_classic() %+replace% 
    theme(
      #text elements
      plot.title = element_text(size = text_size),                #set font size
      plot.subtitle = element_text(size = text_size),               #font size
      plot.caption = element_text(size = text_size),               #right align
      axis.title = element_text(size = text_size),               #font size
      axis.text = element_text(size = text_size),                #font size
      axis.text.x = element_text(size = text_size),
      axis.text.y = element_text(size = text_size), 
      legend.text = element_text(size = text_size), 
      legend.title = element_text(size = text_size, hjust=0), 
      strip.text = element_text(size = text_size),
      strip.background = element_blank(), 
      panel.border = element_rect(colour = "black", fill = NA, linewidth = 1), 
      panel.grid.minor = element_blank(), 
      panel.grid.major = element_line(linewidth = 0.25, linetype = 'solid', colour = "grey")
      )
}

Importer les fichiers produit par le pipeline DADA2 (incluant la raréfaction)

asv = read.table(file = "~/rarefied_asv.csv", sep=",", row.names=1, header=TRUE, check.names=FALSE) 
taxa = read.table(file = "~/rarefied_taxa.csv", sep=",", row.names=1, header=TRUE) 
meta = read.table(file = "~/metadata.csv", sep=",", row.names=1, header=TRUE) 

# Insérer le symbole "\n" (signifie saut de line) afin que le texte s'affiche sur deux lignes dans la légende de notre figure   
meta$Region = gsub("Paume de la main dominante","Paume de la \nmain dominante", meta$Region)

# Combiner les tableaux dans un objet de type phyloseq 
ps = phyloseq(otu_table(asv, taxa_are_rows = TRUE), tax_table(as.matrix(taxa)), sample_data(meta))

5.2 Indice de Shannon

Calculer l’indice de diversité par échantillon

shannon = cbind(estimate_richness(ps, measures = 'shannon'), sample_data(ps))

Maintenant que nous avons les valeurs de diversité (indice de Shannon) nous aimerions comparer ces valeurs en fonction de la région échantilloné (langue et mains). Pour ce faire, nous pouvons utiliser un test de t de student, mais nous devons avant nous assurer que nos données respectent les postulats de ce test (distribution normale et variance similaire)

# Est ce que les données ont une distibution gausienne (normale) ? 
valeur_shapiro=list() # créer une liste vide 
# Réaliser le test de shapiro pour chacune des régions 
for (region in unique(shannon$Region)){
    sample = subset(shannon, Region == region)
    ok = shapiro.test(sample$Shannon)
    valeur_shapiro = append(valeur_shapiro,ok$p.value)
}
data.frame(valeur_shapiro) 
# Pour nos deux régions (langue et mains) la distribution des valeurs de Shannon suit une distribution normale 

# Est ce que la variance est similaire ? 
# Nous pouvons facilement tester la vairance avec la fonction var.test
var.test(Shannon ~ Region, shannon, 
         alternative = "two.sided")
t.test(Shannon ~ Region, data = shannon)

Nos données respectent les postulats du test de t de Student, nous pouvons donc procéder à l’analyse.

p = ggplot(shannon, aes(x = Region, y = Shannon, fill = Region)) + 
  custom_theme() +
  geom_boxplot(alpha = 0.3, outlier.shape = NA) +
  geom_jitter(aes(color = Region), size = 3, ) +
  scale_fill_manual(values = c("cornflowerblue","palevioletred")) + 
  scale_color_manual(values = c("cornflowerblue","palevioletred"))  + 
  labs(x = "Région", y = expression(paste("Indice de diversité de Shannon (", italic("H'"), ")"))) +  
  scale_y_continuous(limits = c(0,4), expand = c(0,0)) +
  annotate(geom ="text", x = 1.5, y = 3.8, label = expression(paste(italic("p")," < .001")), size = 5)

5.3 Ordination (PCoA)

# Transformer les données d'abondance relative avec l'indice d'Hellinger  
asv_hellinger = decostand(asv, method = "hellinger")
# On combine les tableaux de donnnées dans un object phyloseq 
ps = phyloseq(otu_table(asv_hellinger, taxa_are_rows = TRUE), tax_table(as.matrix(taxa)), sample_data(meta))
# Calculer l'indice de distance de Bray-Curtis entre chaque échantillon 
dist = vegdist(t(asv_hellinger), method = "bray")
# Générer l'ordination avec les fonction R de bases.
PCOA = cmdscale(dist, eig=TRUE, add=TRUE) 
# On extrait les coordonnées des points 
position = PCOA$points 
# Changer le nom des colonnes 
colnames(position) = c("Axe.1", "Axe.2") 
# get percentage of variation explained by each axis 
percent_explained = 100*PCOA$eig/sum(PCOA$eig) 
# reduce number of digits (arrondir)
reduced_percent = format(round(percent_explained[1:2], digits = 2), nsmall = 1, trim = TRUE) 
# Generate pretty labels for plot 
pretty_labs = c(glue("Axe 1 ({reduced_percent[1]}%)"), glue("Axe 2 ({reduced_percent[2]}%)")) 
# combine PCOA results with metadata 
df = merge(position, meta, by = 0) 

Générer le graphique

plot = ggplot(df, aes(x=Axe.1, y=Axe.2, color=Region)) + 
  custom_theme() +
  geom_point(size = 3) + 
  labs(x = pretty_labs[1], y = pretty_labs[2], color = "Région") + 
  scale_y_continuous(limits = c(-0.50,0.50), expand = c(0,0)) +
  scale_x_continuous(limits = c(-0.5,0.5), expand = c(0,0)) + 
  geom_hline(yintercept = 0, linetype = "dashed", linewidth = 0.2) +
  geom_vline(xintercept = 0, linetype = "dashed", linewidth = 0.2) + 
  scale_color_manual(values = c("cornflowerblue","palevioletred")) 

PERMANOVA et betadisper

# Calculer la dispersion (distance au centroïde)
dispr = vegan::betadisper(dist, group = meta$Region)
boxplot(dispr, main = "", xlab = "")
# ANOVA sur la dispersion (p = 0.001)
permutest(dispr) 
# ANOVA sur la distance en fonction des région (p = 0.001)
adns = adonis2(dist ~ meta$Region)

5.4 LEfSe

# Définir le rang taxonomique d'intérêt 
taxa = "Genus"
# On combine les tableaux de données dans un object phyloseq 
ps = phyloseq(otu_table(asv, taxa_are_rows=TRUE), tax_table(as.matrix(taxa)), sample_data(meta))
# Lancer l'analyse LEfSe
out_lefse = run_lefse(ps, group = "Region",taxa_rank = taxa)
# Extraire les résultats sous forme de tableau
df_lefse = marker_table(out_lefse) %>% data.frame()
# Keeping only the 9 taxa with the highest LDA score in each group 
number_of_taxa = 5 
lda_out = list()
i = 0 
for (each_region in unique(df_lefse$enrich_group)){
  i = i + 1 
  region = subset(df_lefse, enrich_group == each_region)
  top_taxa=head(region[order(region$ef_lda, decreasing= T),], n=number_of_taxa)
  lda_out[[i]] = top_taxa
}
df = do.call("rbind", lda_out)
# Multiplier par -1 les score LDA pour la région sous la langue 
df$ef_lda = with(df, ifelse(enrich_group == "Sous la langue", -1 * df$ef_lda, 1*df$ef_lda))

Générer le graphique

p = ggplot(df, aes(x = ef_lda, y = reorder(feature, ef_lda), fill = enrich_group)) + 
  geom_bar(stat = "identity", width = 0.7, size = 0.5) +
  custom_theme() + 
  facet_grid(rows = vars(enrich_group), scales = "free_y", space = "free_y") +
  theme(legend.position = "none", 
          strip.text.y = element_text(angle =)) +
  scale_x_continuous(limits=c(-6,6), expand = c(0,0)) +
  labs(x = "Score LDA", y = "Genre") + 
  scale_fill_manual(values=c("cornflowerblue","palevioletred")) + 
  geom_vline(xintercept = 0, linewidth = 0.25, colour="black") 

5.5 Abondances relatives

# Defining the number of most abundant taxa to keep
number_of_taxa = 3
ps_rel_abund=transform_sample_counts(ps, function(x) x/sum(x)) 
ps_glom = tax_glom(ps_rel_abund, taxrank = taxa_rank) 
melted_df = psmelt(ps_glom)

# Create an empty list that we will populated with the unique taxa of each sample
list_of_all_taxonomic_rank= list() 
i = 0 
# Beginning of the for loop
for (each_sample in unique(melted_df$Sample)){
    i = i + 1
    sample = subset(melted_df, Sample == each_sample) # Create a new dataframe from the iterator (sample). 
    total_abundance = aggregate(sample$Abundance, by = list(taxa_rank = sample[[taxa_rank]]), FUN = sum) # Combine together the same taxa and sum the abundances 
    top = head(total_abundance[order(total_abundance$x, decreasing= T),], n = number_of_taxa) # Sort by abundance and keep only the X number of taxa defined by variable number_of_taxa
    others_df = sample[!sample[[taxa_rank]] %in% top$taxa_rank,] # Extract in a new dataframe all taxa that are not present in the dataframe `top`
    others_list = pull(others_df, taxa_rank) # Create a list by pulling all the values from the column corresponding to the taxa_rank into a list
    sample[sample[[taxa_rank]]%in% others_list,][[taxa_rank]] = "Others" # In the dataframe `sample` rename all the taxa from the list `others_list` as `Others`
    list_of_all_taxonomic_rank[[i]] = sample #save this dataframe in our list 
}
df = do.call("rbind",list_of_all_taxonomic_rank) # combine all the dataframe from the list into one dataframe
unique_taxon = data.frame(unique(df[[taxa_rank]])) # create dataframe with the unique names of taxa 
name = colnames(unique_taxon) # extract the name of the column in order to rename the column with the following line 
names(unique_taxon)[names(unique_taxon) == name] = as.character(taxa_rank) # Rename the column to the taxa rank defined earlier 

# get the total number of unique most abundant taxa 
n = nrow(unique_taxon) 
# generate a set of X unique colors corresponding to the number of unique taxa
palette = distinctColorPalette(n)
unique_taxon[[taxa_rank]] = factor(unique_taxon[[taxa_rank]])
names(palette) = levels(unique_taxon[[taxa_rank]])
# assign gray to category "Others". The same nomenclature can be use to manually change certain colors.
palette[["Others"]] = "#E1E1E1"

#  recreate palette with markdown to italicize name and remove the underscore after Unclassified
all_names = data.frame(names(palette))
names_markdown = all_names %>%
  mutate(names.palette. = str_replace(names.palette., "(.*)","*\\1*"), # Adding asterisk at beginning and end of every taxa
         names.palette. = str_replace(names.palette., "\\*Unclassified_(.*)\\*","Unclassified *\\1*"), # Removing the asterisk for words that don't need to be italicize (Unclassified and Others)
         names.palette. = str_replace(names.palette., "\\*Others\\*", "Others"))
list_names=as.vector(names_markdown$names.palette.)
# Replace names of object
names(palette) = c(list_names)

# Making the same modification to the taxa name from the legend to the taxa names in the dataframe
df[[taxa_rank]] = str_replace(df[[taxa_rank]], "(.*)","*\\1*")
df[[taxa_rank]] = str_replace(df[[taxa_rank]], "\\*Unclassified_(.*)\\*","Unclassified *\\1*")
df[[taxa_rank]] = str_replace(df[[taxa_rank]], "\\*Others\\*", "Others")

# Ordering the legend in alphabetical order
legend_raw = unique(df[[taxa_rank]]) #Extract legend as text
ordered_legend = sort(legend_raw) # order alphabetically
reordered_legend = fct_relevel(ordered_legend, "Others") # move "Others" to the beginning
final_legend = levels(reordered_legend) # Extract the levels in a new object

my_scale = scale_fill_manual(name = as.character(taxa_rank), breaks = paste(final_legend), values = palette, na.translate=FALSE, drop=TRUE, limits = force)

p = ggplot(df, aes(x = Scientifque, weight = Abundance, fill = fct_reorder(.data[[taxa_rank]],Abundance,.desc=FALSE))) + # .data is very important to force the evaluation of the input variables (taxonomic_rank)
    geom_bar() +
    coord_flip() +
    labs(y ='Abondance relative', x="Scientifque") +
    theme_classic() +
    facet_wrap(~ Region, ncol=2, strip.position="top",scales='free_x') + 
    theme(text = element_text(size = text_size),
          panel.spacing = unit(2, "lines"),
          plot.title = element_text(hjust =0.5, size=text_size),
          axis.title=element_text(size=text_size),
          axis.text.x = element_text(size=text_size, hjust=0.6), 
          axis.text.y = element_text(size=text_size, vjust=0.5),
          legend.position = "bottom",
          legend.title = element_text(size = text_size),
          legend.text = element_markdown(size = 12),
          legend.key.size = unit(0.5, 'cm'),
          legend.margin = margin(), # pre-emptively set zero margins 
          strip.background = element_blank(), 
          strip.text = element_text(size = text_size)) + # remove facet_grid box background 
    my_scale + # Load our color palette 
    scale_y_continuous(limits = c(0, 1),breaks = seq(0, 1, by = 0.5), expand = c(0,0), labels = c("0", "0.5", "1")) + # Remove the white space 
  # Adjusting the legend, notably the number of rows and position
    guides(fill = guide_legend(nrow = 8, title = "Genre", title.position = "top", title.hjust = 0.5, reverse=FALSE))