Chapitre 5 Statistiques
Importer les libraires que nous devons utiliser
library(vegan)
library(tidyverse)
library(devtools)
library(forcats) # To reorder our factors (function "fct_relevel")
library(dplyr) # Dataframe manipulation (notably function "pull")
library(tidyr) # Dataframe manipulation (function "separate")
library(phyloseq) # Very pratical library for the analysis of amplicon data
library(randomcoloR) # Generate sets of random colors
library(ggplot2) # Generate plots
library(stringr) # Makes working with strings as easy as possible (function "str_replace")
library(ggtext) # Allows the use of markdown text (used to make names italic)
library(ggpubr)
5.1 Thème pour les graphiques
Je génère mon propre thème pour les différents graphiques à générer dans lequel je spécifie par exemple la taille du texte, la taille des lignes des axes, etc…
text_size = 12
custom_theme = function(){
theme_classic() %+replace%
theme(
#text elements
plot.title = element_text(size = text_size), #set font size
plot.subtitle = element_text(size = text_size), #font size
plot.caption = element_text(size = text_size), #right align
axis.title = element_text(size = text_size), #font size
axis.text = element_text(size = text_size), #font size
axis.text.x = element_text(size = text_size),
axis.text.y = element_text(size = text_size),
legend.text = element_text(size = text_size),
legend.title = element_text(size = text_size, hjust=0),
strip.text = element_text(size = text_size),
strip.background = element_blank(),
panel.border = element_rect(colour = "black", fill = NA, linewidth = 1),
panel.grid.minor = element_blank(),
panel.grid.major = element_line(linewidth = 0.25, linetype = 'solid', colour = "grey")
)
}
Importer les fichiers produit par le pipeline DADA2 (incluant la raréfaction)
asv = read.table(file = "~/rarefied_asv.csv", sep=",", row.names=1, header=TRUE, check.names=FALSE)
taxa = read.table(file = "~/rarefied_taxa.csv", sep=",", row.names=1, header=TRUE)
meta = read.table(file = "~/metadata.csv", sep=",", row.names=1, header=TRUE)
# Insérer le symbole "\n" (signifie saut de line) afin que le texte s'affiche sur deux lignes dans la légende de notre figure
meta$Region = gsub("Paume de la main dominante","Paume de la \nmain dominante", meta$Region)
# Combiner les tableaux dans un objet de type phyloseq
ps = phyloseq(otu_table(asv, taxa_are_rows = TRUE), tax_table(as.matrix(taxa)), sample_data(meta))
5.2 Indice de Shannon
Calculer l’indice de diversité par échantillon
Maintenant que nous avons les valeurs de diversité (indice de Shannon) nous aimerions comparer ces valeurs en fonction de la région échantilloné (langue et mains). Pour ce faire, nous pouvons utiliser un test de t de student, mais nous devons avant nous assurer que nos données respectent les postulats de ce test (distribution normale et variance similaire)
# Est ce que les données ont une distibution gausienne (normale) ?
valeur_shapiro=list() # créer une liste vide
# Réaliser le test de shapiro pour chacune des régions
for (region in unique(shannon$Region)){
sample = subset(shannon, Region == region)
ok = shapiro.test(sample$Shannon)
valeur_shapiro = append(valeur_shapiro,ok$p.value)
}
data.frame(valeur_shapiro)
# Pour nos deux régions (langue et mains) la distribution des valeurs de Shannon suit une distribution normale
# Est ce que la variance est similaire ?
# Nous pouvons facilement tester la vairance avec la fonction var.test
var.test(Shannon ~ Region, shannon,
alternative = "two.sided")
t.test(Shannon ~ Region, data = shannon)
Nos données respectent les postulats du test de t de Student, nous pouvons donc procéder à l’analyse.
p = ggplot(shannon, aes(x = Region, y = Shannon, fill = Region)) +
custom_theme() +
geom_boxplot(alpha = 0.3, outlier.shape = NA) +
geom_jitter(aes(color = Region), size = 3, ) +
scale_fill_manual(values = c("cornflowerblue","palevioletred")) +
scale_color_manual(values = c("cornflowerblue","palevioletred")) +
labs(x = "Région", y = expression(paste("Indice de diversité de Shannon (", italic("H'"), ")"))) +
scale_y_continuous(limits = c(0,4), expand = c(0,0)) +
annotate(geom ="text", x = 1.5, y = 3.8, label = expression(paste(italic("p")," < .001")), size = 5)
5.3 Ordination (PCoA)
# Transformer les données d'abondance relative avec l'indice d'Hellinger
asv_hellinger = decostand(asv, method = "hellinger")
# On combine les tableaux de donnnées dans un object phyloseq
ps = phyloseq(otu_table(asv_hellinger, taxa_are_rows = TRUE), tax_table(as.matrix(taxa)), sample_data(meta))
# Calculer l'indice de distance de Bray-Curtis entre chaque échantillon
dist = vegdist(t(asv_hellinger), method = "bray")
# Générer l'ordination avec les fonction R de bases.
PCOA = cmdscale(dist, eig=TRUE, add=TRUE)
# On extrait les coordonnées des points
position = PCOA$points
# Changer le nom des colonnes
colnames(position) = c("Axe.1", "Axe.2")
# get percentage of variation explained by each axis
percent_explained = 100*PCOA$eig/sum(PCOA$eig)
# reduce number of digits (arrondir)
reduced_percent = format(round(percent_explained[1:2], digits = 2), nsmall = 1, trim = TRUE)
# Generate pretty labels for plot
pretty_labs = c(glue("Axe 1 ({reduced_percent[1]}%)"), glue("Axe 2 ({reduced_percent[2]}%)"))
# combine PCOA results with metadata
df = merge(position, meta, by = 0)
Générer le graphique
plot = ggplot(df, aes(x=Axe.1, y=Axe.2, color=Region)) +
custom_theme() +
geom_point(size = 3) +
labs(x = pretty_labs[1], y = pretty_labs[2], color = "Région") +
scale_y_continuous(limits = c(-0.50,0.50), expand = c(0,0)) +
scale_x_continuous(limits = c(-0.5,0.5), expand = c(0,0)) +
geom_hline(yintercept = 0, linetype = "dashed", linewidth = 0.2) +
geom_vline(xintercept = 0, linetype = "dashed", linewidth = 0.2) +
scale_color_manual(values = c("cornflowerblue","palevioletred"))
PERMANOVA et betadisper
5.4 LEfSe
# Définir le rang taxonomique d'intérêt
taxa = "Genus"
# On combine les tableaux de données dans un object phyloseq
ps = phyloseq(otu_table(asv, taxa_are_rows=TRUE), tax_table(as.matrix(taxa)), sample_data(meta))
# Lancer l'analyse LEfSe
out_lefse = run_lefse(ps, group = "Region",taxa_rank = taxa)
# Extraire les résultats sous forme de tableau
df_lefse = marker_table(out_lefse) %>% data.frame()
# Keeping only the 9 taxa with the highest LDA score in each group
number_of_taxa = 5
lda_out = list()
i = 0
for (each_region in unique(df_lefse$enrich_group)){
i = i + 1
region = subset(df_lefse, enrich_group == each_region)
top_taxa=head(region[order(region$ef_lda, decreasing= T),], n=number_of_taxa)
lda_out[[i]] = top_taxa
}
df = do.call("rbind", lda_out)
# Multiplier par -1 les score LDA pour la région sous la langue
df$ef_lda = with(df, ifelse(enrich_group == "Sous la langue", -1 * df$ef_lda, 1*df$ef_lda))
Générer le graphique
p = ggplot(df, aes(x = ef_lda, y = reorder(feature, ef_lda), fill = enrich_group)) +
geom_bar(stat = "identity", width = 0.7, size = 0.5) +
custom_theme() +
facet_grid(rows = vars(enrich_group), scales = "free_y", space = "free_y") +
theme(legend.position = "none",
strip.text.y = element_text(angle =)) +
scale_x_continuous(limits=c(-6,6), expand = c(0,0)) +
labs(x = "Score LDA", y = "Genre") +
scale_fill_manual(values=c("cornflowerblue","palevioletred")) +
geom_vline(xintercept = 0, linewidth = 0.25, colour="black")
5.5 Abondances relatives
# Defining the number of most abundant taxa to keep
number_of_taxa = 3
ps_rel_abund=transform_sample_counts(ps, function(x) x/sum(x))
ps_glom = tax_glom(ps_rel_abund, taxrank = taxa_rank)
melted_df = psmelt(ps_glom)
# Create an empty list that we will populated with the unique taxa of each sample
list_of_all_taxonomic_rank= list()
i = 0
# Beginning of the for loop
for (each_sample in unique(melted_df$Sample)){
i = i + 1
sample = subset(melted_df, Sample == each_sample) # Create a new dataframe from the iterator (sample).
total_abundance = aggregate(sample$Abundance, by = list(taxa_rank = sample[[taxa_rank]]), FUN = sum) # Combine together the same taxa and sum the abundances
top = head(total_abundance[order(total_abundance$x, decreasing= T),], n = number_of_taxa) # Sort by abundance and keep only the X number of taxa defined by variable number_of_taxa
others_df = sample[!sample[[taxa_rank]] %in% top$taxa_rank,] # Extract in a new dataframe all taxa that are not present in the dataframe `top`
others_list = pull(others_df, taxa_rank) # Create a list by pulling all the values from the column corresponding to the taxa_rank into a list
sample[sample[[taxa_rank]]%in% others_list,][[taxa_rank]] = "Others" # In the dataframe `sample` rename all the taxa from the list `others_list` as `Others`
list_of_all_taxonomic_rank[[i]] = sample #save this dataframe in our list
}
df = do.call("rbind",list_of_all_taxonomic_rank) # combine all the dataframe from the list into one dataframe
unique_taxon = data.frame(unique(df[[taxa_rank]])) # create dataframe with the unique names of taxa
name = colnames(unique_taxon) # extract the name of the column in order to rename the column with the following line
names(unique_taxon)[names(unique_taxon) == name] = as.character(taxa_rank) # Rename the column to the taxa rank defined earlier
# get the total number of unique most abundant taxa
n = nrow(unique_taxon)
# generate a set of X unique colors corresponding to the number of unique taxa
palette = distinctColorPalette(n)
unique_taxon[[taxa_rank]] = factor(unique_taxon[[taxa_rank]])
names(palette) = levels(unique_taxon[[taxa_rank]])
# assign gray to category "Others". The same nomenclature can be use to manually change certain colors.
palette[["Others"]] = "#E1E1E1"
# recreate palette with markdown to italicize name and remove the underscore after Unclassified
all_names = data.frame(names(palette))
names_markdown = all_names %>%
mutate(names.palette. = str_replace(names.palette., "(.*)","*\\1*"), # Adding asterisk at beginning and end of every taxa
names.palette. = str_replace(names.palette., "\\*Unclassified_(.*)\\*","Unclassified *\\1*"), # Removing the asterisk for words that don't need to be italicize (Unclassified and Others)
names.palette. = str_replace(names.palette., "\\*Others\\*", "Others"))
list_names=as.vector(names_markdown$names.palette.)
# Replace names of object
names(palette) = c(list_names)
# Making the same modification to the taxa name from the legend to the taxa names in the dataframe
df[[taxa_rank]] = str_replace(df[[taxa_rank]], "(.*)","*\\1*")
df[[taxa_rank]] = str_replace(df[[taxa_rank]], "\\*Unclassified_(.*)\\*","Unclassified *\\1*")
df[[taxa_rank]] = str_replace(df[[taxa_rank]], "\\*Others\\*", "Others")
# Ordering the legend in alphabetical order
legend_raw = unique(df[[taxa_rank]]) #Extract legend as text
ordered_legend = sort(legend_raw) # order alphabetically
reordered_legend = fct_relevel(ordered_legend, "Others") # move "Others" to the beginning
final_legend = levels(reordered_legend) # Extract the levels in a new object
my_scale = scale_fill_manual(name = as.character(taxa_rank), breaks = paste(final_legend), values = palette, na.translate=FALSE, drop=TRUE, limits = force)
p = ggplot(df, aes(x = Scientifque, weight = Abundance, fill = fct_reorder(.data[[taxa_rank]],Abundance,.desc=FALSE))) + # .data is very important to force the evaluation of the input variables (taxonomic_rank)
geom_bar() +
coord_flip() +
labs(y ='Abondance relative', x="Scientifque") +
theme_classic() +
facet_wrap(~ Region, ncol=2, strip.position="top",scales='free_x') +
theme(text = element_text(size = text_size),
panel.spacing = unit(2, "lines"),
plot.title = element_text(hjust =0.5, size=text_size),
axis.title=element_text(size=text_size),
axis.text.x = element_text(size=text_size, hjust=0.6),
axis.text.y = element_text(size=text_size, vjust=0.5),
legend.position = "bottom",
legend.title = element_text(size = text_size),
legend.text = element_markdown(size = 12),
legend.key.size = unit(0.5, 'cm'),
legend.margin = margin(), # pre-emptively set zero margins
strip.background = element_blank(),
strip.text = element_text(size = text_size)) + # remove facet_grid box background
my_scale + # Load our color palette
scale_y_continuous(limits = c(0, 1),breaks = seq(0, 1, by = 0.5), expand = c(0,0), labels = c("0", "0.5", "1")) + # Remove the white space
# Adjusting the legend, notably the number of rows and position
guides(fill = guide_legend(nrow = 8, title = "Genre", title.position = "top", title.hjust = 0.5, reverse=FALSE))