介绍
phyloseq包是一款整合多种类型数据的综合性软件,并为这些数据提供统计分析和可视化方法。
微生物数据分析的主要挑战之一是如何整合不同类型的数据,从而对其进行生态学、遗传学、系统发育学、多元统计、可视化和检验等分析。同时,由于同行之间需要分享彼此的分析结果,如何才能重复(复现)各自的结果呢?这需要一款统一数据输入接口且包含多种分析方法的软件,而phyloseq就是为处理这类问题而诞生的R包。
phyloseq数据结构
phyloseq对象的输入数据:
- **otu_table:**也即是物种丰度表,以matrix方式输入,行名是物种名字;
- **sample_data:**表型数据,包含样本的分组信息和环境因素等,以data.frame方式输入,行名是样本名字;
- tax_table:物种分类学水平的信息,以matrix方式输入,行名或者第一列是otu_table的行名;
- **phy_tree:**OTU的进化树关系表,用于计算UniFrac距离;
- refseq: DNA,RNA和AA氨基酸的序列信息。
使用
输入数据
- 物种丰度表: otu_mat
- 物种分类水平表:tax_mat
- 样本表型:samples_df
library(dplyr)
library(ggplot2)
library(phyloseq)
library(readxl)
library(tibble)

# Read the three sheets of the CARBOM workbook. The identifier column of each
# sheet ("otu" or "sample") is moved into the row names so that the tables can
# be converted to matrices / sample data below.
otu_mat <- read_excel("../datset/CARBOM data.xlsx", sheet = "OTU matrix") %>%
  column_to_rownames("otu")
tax_mat <- read_excel("../datset/CARBOM data.xlsx", sheet = "Taxonomy table") %>%
  column_to_rownames("otu")
samples_df <- read_excel("../datset/CARBOM data.xlsx", sheet = "Samples") %>%
  column_to_rownames("sample")

# Wrap each table in its phyloseq component class. The OTU table is declared
# with taxa as rows.
OTU <- otu_table(as.matrix(otu_mat), taxa_are_rows = TRUE)
TAX <- tax_table(as.matrix(tax_mat))
samples <- sample_data(samples_df)

# Combine the components into a single phyloseq object.
carbom <- phyloseq(OTU, TAX, samples)
对phyloseq对象的处理
# Inspect the names stored in the object: samples, taxonomic ranks and
# sample variables.
sample_names(carbom)
rank_names(carbom)
sample_variables(carbom)

# Subsetting examples (the results are printed, not stored).
subset_samples(carbom, Select_18S_nifH == "Yes")
subset_taxa(
  carbom,
  Division %in% c(
    "Chlorophyta", "Dinophyta", "Cryptophyta",
    "Haptophyta", "Ochrophyta", "Cercozoa"
  )
)
subset_taxa(carbom, !(Class %in% c("Syndiniales", "Sarcomonadea")))

# Normalise the read counts of every sample to the median sequencing depth.
total <- carbom %>%
  sample_sums() %>%
  median()

# Rescale a vector of counts so that it sums (up to rounding) to `t`.
standf <- function(x, t = total) {
  proportions <- x / sum(x)
  round(t * proportions)
}

carbom <- transform_sample_counts(carbom, standf)
alpha diversity
# Alpha-diversity measures per size fraction, drawn as box plots with error
# bars and a Wilcoxon test comparing the "Nano" and "Pico" fractions.
plot_richness(
  carbom,
  x = "fraction",
  color = "fraction",
  measures = c("Observed", "Chao1", "ACE", "Shannon", "Simpson", "InvSimpson")
) +
  stat_boxplot(geom = "errorbar", linetype = 1, width = 0.3) +
  geom_boxplot(aes(color = fraction), alpha = 0.1) +
  ggpubr::stat_compare_means(
    comparisons = list(c("Nano", "Pico")),
    method = "wilcox.test"
  ) +
  # Hide the colour legend. "none" replaces the deprecated logical form and
  # avoids the reassignable shorthand `F`.
  guides(color = "none") +
  theme_bw()
barplot
# Stacked bar plot of abundances, filled by Division, with the x-axis labels
# rotated to vertical.
plot_bar(carbom, fill = "Division") +
  theme_bw() +
  # hjust / vjust: 0 -> left; .5 -> center; 1 -> right
  theme(axis.text.x = element_text(angle = 90, vjust = .5, hjust = 1))
tree
library(ape)

# Build a random rooted tree over the taxa of `carbom`. The topology is
# random, so this is for illustration only.
random_tree <- rtree(ntaxa(carbom), rooted = TRUE, tip.label = taxa_names(carbom))
carbom_tree <- phyloseq(OTU, TAX, samples, random_tree)

# Keep the taxa whose count exceeds 20% of `total` in at least one sample,
# pruning the others from the object.
carbom_abund <- filter_taxa(carbom_tree, function(x) {sum(x > total * 0.20) > 0}, TRUE)

plot_tree(
  carbom_abund,
  color = "fraction",
  shape = "level",
  label.tips = "Division",
  ladderize = "left",
  plot.margin = 0.3
) +
  labs(x = "", y = "") +
  scale_color_manual(values = c("red", "blue")) +
  theme_bw()
heatmap
# Retain the taxa whose count exceeds 20% of `total` in at least one sample.
carbom_abund <- carbom_tree %>%
  filter_taxa(function(x) sum(x > total * 0.20) > 0, prune = TRUE)

plot_heatmap(carbom_abund, method = "NMDS", distance = "bray")

# Alternative with a user-defined distance (disabled):
# plot_heatmap(carbom_abund, method = "MDS", distance = "(A+B-2*J)/(A+B-J)",
#              taxa.label = "Class", taxa.order = "Class",
#              trans = NULL, low = "beige", high = "red", na.value = "beige")
For vectors x and y the “quadratic” terms are J = sum(x*y), A = sum(x^2), B = sum(y^2) and “minimum” terms are J = sum(pmin(x,y)), A = sum(x) and B = sum(y), and “binary” terms are either of these after transforming data into binary form (shared number of species, and number of species for each row). Some examples:
- A+B-2*J “quadratic” squared Euclidean
- A+B-2*J “minimum” Manhattan
- (A+B-2*J)/(A+B) “minimum” Bray-Curtis
- (A+B-2*J)/(A+B) “binary” Sørensen
- (A+B-2*J)/(A+B-J) “binary” Jaccard
ordination
# method  : one of c("DCA", "CCA", "RDA", "CAP", "DPCoA", "NMDS", "MDS", "PCoA")
# distance: any of unlist(distanceMethodList)
carbom.ord <- ordinate(carbom, method = "PCoA", distance = "bray")

# Taxa-level alternative (disabled):
# plot_ordination(carbom, carbom.ord, type = "taxa", color = "Class", shape = "Class",
#                 title = "OTUs")

# Sample-level ordination, coloured by fraction and shaped by level.
plot_ordination(carbom, carbom.ord, type = "samples", color = "fraction", shape = "level") +
  geom_point(size = 3) +
  theme_bw()
network analysis
# Alternatives on the full data set (disabled):
# plot_net(carbom, distance = "(A+B-2*J)/(A+B)", type = "taxa",
#          maxdist = 0.7, color = "Class", point_label = "Genus")
# plot_net(carbom, distance = "(A+B-2*J)/(A+B)", type = "samples",
#          maxdist = 0.7, color = "fraction", point_label = "fraction")

# Taxa network on the abundance-filtered object, using the user-defined
# distance formula "(A+B-2*J)/(A+B)".
plot_net(
  carbom_abund,
  distance = "(A+B-2*J)/(A+B)",
  type = "taxa",
  maxdist = 0.8,
  color = "Class",
  point_label = "Genus"
)
Deseq2 with phyloseq
library(DESeq2)
library(ggplot2)

# Convert the filtered phyloseq object to a DESeq2 data set with `fraction`
# as the design variable, then fit the model with a Wald test.
diagdds <- phyloseq_to_deseq2(carbom_abund, ~ fraction)
diagdds <- DESeq(diagdds, test = "Wald", fitType = "parametric")

res <- results(diagdds, cooksCutoff = FALSE)

# Keep results with adjusted p-value below 0.01; which() also drops the rows
# whose padj is NA.
sigtab <- res[which(res$padj < 0.01), ]

# Append the taxonomy of the retained taxa to the result table.
sigtab <- cbind(
  as(sigtab, "data.frame"),
  as(tax_table(carbom_abund)[rownames(sigtab), ], "matrix")
)
head(sigtab)
rarefaction curves
## rarecurve2: base-graphics rarefaction curves, one curve per row of `x`.
## See the documentation of vegan's rarecurve; here `col` defines custom
## colours for the lines and is recycled when it is shorter than the number
## of rows.
##
## Args:
## - x: community matrix with one sample per row (row sums are the sample
##      totals)
## - step: step size for the subsample sizes along each curve
## - sample: optional subsample size; when supplied, a vertical line is drawn
##           at that size and horizontal lines at the interpolated richness of
##           each curve
## - xlab, ylab: axis labels
## - label: logical, should each curve be labelled at its end point
## - col: colours of the curves
## - ...: forwarded to plot(), lines() and vegan::ordilabel()
## Returns (invisibly) the list of rarefied richness vectors, one per row.
rarecurve2 <- function(x, step = 1, sample, xlab = "Sample Size",
                       ylab = "Species", label = TRUE, col = "black", ...) {
  tot <- rowSums(x)
  S <- vegan::specnumber(x)
  nr <- nrow(x)

  ## Rarefy every row on a grid of subsample sizes that always ends at the
  ## row total.
  out <- lapply(seq_len(nr), function(i) {
    n <- seq(1, tot[i], by = step)
    if (n[length(n)] != tot[i]) {
      n <- c(n, tot[i])
    }
    drop(vegan::rarefy(x[i, ], n))
  })
  Nmax <- vapply(out, function(x) max(attr(x, "Subsample")), numeric(1))
  Smax <- vapply(out, max, numeric(1))

  ## Empty plot spanning all curves
  plot(c(1, max(Nmax)), c(1, max(Smax)), xlab = xlab, ylab = ylab,
       type = "n", ...)

  ## Rarefied richness at the requested subsample size
  if (!missing(sample)) {
    abline(v = sample)
    rare <- sapply(out, function(z) approx(x = attr(z, "Subsample"),
                                           y = z, xout = sample, rule = 1)$y)
    abline(h = rare, lwd = 0.5)
  }

  ## Curves, cycling through `col`
  for (ln in seq_along(out)) {
    color <- col[((ln - 1) %% length(col)) + 1]
    N <- attr(out[[ln]], "Subsample")
    lines(N, out[[ln]], col = color, ...)
  }

  ## Label the curves at their end points. ordilabel is qualified because the
  ## vegan package is never attached in this file.
  if (label) {
    vegan::ordilabel(cbind(tot, S), labels = rownames(x), col = col, ...)
  }
  invisible(out)
}

## Rarefaction curve, ggplot style
## ggrare: rarefaction curves of a phyloseq object drawn with ggplot2.
## This function is adapted from the vegan `rarecurve` function and requires
## the vegan package to be installed.
##
## Args:
## - physeq: phyloseq class object, from which abundance data are extracted
## - step: step size for sample size in rarefaction curves
## - label: Default `NULL`. Character string. The name of the variable to map
##          to text labels on the plot.
## - color: Default `NULL`. Character string. The name of the variable to map
##          to colors in the plot. The color scheme is chosen by ggplot2 and
##          can be modified afterwards with an additional layer such as
##          `scale_color_manual`.
## - plot: logical, should the graphic be plotted
## - parallel: logical, should rarefaction be parallelized (using
##             parallel::mclapply)
## - se: Default `TRUE`. Logical. Should standard errors be computed.
## Returns (invisibly) the ggplot object.
ggrare <- function(physeq, step = 10, label = NULL, color = NULL, plot = TRUE,
                   parallel = FALSE, se = TRUE) {
  ## Abundance matrix with samples as rows
  x <- as(otu_table(physeq), "matrix")
  if (taxa_are_rows(physeq)) {
    x <- t(x)
  }

  tot <- rowSums(x)
  S <- rowSums(x > 0)
  nr <- nrow(x)

  ## Rarefy sample `i` on a grid of sizes that always ends at its total
  rarefun <- function(i) {
    cat(paste("rarefying sample", rownames(x)[i]), sep = "\n")
    n <- seq(1, tot[i], by = step)
    if (n[length(n)] != tot[i]) {
      n <- c(n, tot[i])
    }
    y <- vegan::rarefy(x[i, , drop = FALSE], n, se = se)
    if (nrow(y) != 1) {
      rownames(y) <- c(".S", ".se")
      return(data.frame(t(y), Size = n, Sample = rownames(x)[i]))
    } else {
      return(data.frame(.S = y[1, ], Size = n, Sample = rownames(x)[i]))
    }
  }
  if (parallel) {
    ## Qualified because the parallel package is never attached in this file
    out <- parallel::mclapply(seq_len(nr), rarefun, mc.preschedule = FALSE)
  } else {
    out <- lapply(seq_len(nr), rarefun)
  }
  df <- do.call(rbind, out)

  ## Defaults used when the object carries no sample data; without them
  ## `data` and `labels` would be undefined further down.
  data <- df
  labels <- data.frame(x = tot, y = S, Sample = rownames(x))

  ## Get sample data
  if (!is.null(sample_data(physeq, FALSE))) {
    sdf <- as(sample_data(physeq), "data.frame")
    sdf$Sample <- rownames(sdf)
    data <- merge(df, sdf, by = "Sample")
    labels <- merge(labels, sdf, by = "Sample")
  }

  ## Add any custom-supplied plot-mapped variables
  if (length(color) > 1) {
    data$color <- color
    names(data)[names(data) == "color"] <- deparse(substitute(color))
    color <- deparse(substitute(color))
  }
  if (length(label) > 1) {
    labels$label <- label
    names(labels)[names(labels) == "label"] <- deparse(substitute(label))
    label <- deparse(substitute(label))
  }

  p <- ggplot(data = data,
              aes_string(x = "Size", y = ".S", group = "Sample", color = color))
  p <- p + labs(x = "Sample Size", y = "Species Richness")
  if (!is.null(label)) {
    p <- p + geom_text(data = labels,
                       aes_string(x = "x", y = "y", label = label, color = color),
                       size = 4, hjust = 0)
  }
  p <- p + geom_line()
  if (se) { ## add standard error if available
    p <- p + geom_ribbon(aes_string(ymin = ".S - .se", ymax = ".S + .se",
                                    color = NULL, fill = color),
                         alpha = 0.2)
  }
  if (plot) {
    plot(p)
  }
  invisible(p)
}

ggrare(carbom, step = 100, color = "fraction", label = "fraction", se = FALSE)
ternary
## ternary_norm: per-OTU coordinates for a ternary (3 groups) or diamond
## (4 groups) plot.
##
## Args:
## - physeq: phyloseq class object, otus abundances are extracted from this
##           object
## - group: either a single character string matching a variable name in the
##          sample data of `physeq`, or a factor with the same length as the
##          number of samples in `physeq`
## - levelOrder: order along which to rearrange the levels of `group`. The
##               levels map, in order, to the corners (left, right, top) of a
##               ternary plot and (right, top, left, bottom) of a diamond plot.
## - raw: logical, should raw read counts be used to compute relative
##        abundances of an OTU among different conditions (defaults to FALSE)
## - normalizeGroups: logical, only used if raw = FALSE, should all levels be
##                    given equal weights (TRUE, default) or weights equal to
##                    their sizes (FALSE)
## Returns a data frame with columns x, y, abundance and otu (plus the
## taxonomy when available) and attributes "labels", "extreme" and "type".
ternary_norm <- function(physeq, group, levelOrder = NULL, raw = FALSE,
                         normalizeGroups = TRUE) {
  ## Get grouping factor
  if (!is.null(sam_data(physeq, FALSE))) {
    if (is.character(group) && length(group) == 1) {
      x1 <- data.frame(sam_data(physeq))
      if (!group %in% colnames(x1)) {
        stop("group not found among sample variable names.")
      }
      group <- x1[, group]
    }
  }
  if (!is.factor(group)) {
    group <- factor(group)
  }

  ## Reorder levels of factor
  if (nlevels(group) > 4) {
    ## warning() raises the condition; warnings() would only print old ones
    warning("There are 5 groups or more, the data frame will not be suitable for ternary plots.")
  }
  if (!is.null(levelOrder)) {
    if (any(!group %in% levelOrder)) {
      stop("Some levels of the factor are not included in `levelOrder`")
    } else {
      group <- factor(group, levels = levelOrder)
    }
  }

  ## Construct relative abundances matrix (taxa as rows)
  tdf <- as(otu_table(physeq), "matrix")
  if (!taxa_are_rows(physeq)) {
    tdf <- t(tdf)
  }

  if (raw) {
    ## If raw, no normalisation should be done
    tdf <- t(tdf)
    abundance <- rowSums(t(tdf)) / sum(tdf)
    meandf <- t(rowsum(tdf, group, reorder = TRUE)) / rowSums(t(tdf))
  } else {
    ## Construct relative abundances by sample
    tdf <- apply(tdf, 2, function(x) x / sum(x))
    if (normalizeGroups) {
      meandf <- t(rowsum(t(tdf), group, reorder = TRUE)) /
        matrix(rep(table(group), each = nrow(tdf)), nrow = nrow(tdf))
      abundance <- rowSums(meandf) / sum(meandf)
      meandf <- meandf / rowSums(meandf)
    } else {
      abundance <- rowSums(tdf) / sum(tdf)
      meandf <- t(rowsum(t(tdf), group, reorder = TRUE)) / rowSums(tdf)
    }
  }

  ## Coordinates exist only for 3 or 4 groups. Fail with a clear message
  ## instead of an obscure error on the undefined `df` / `extreme` below.
  if (!ncol(meandf) %in% c(3, 4)) {
    stop("Ternary coordinates are only defined for 3 or 4 groups, found ",
         ncol(meandf), ".")
  }

  ## Construct cartesian coordinates for de Finetti's diagram
  ## (taken from wikipedia, http://en.wikipedia.org/wiki/Ternary_plot)
  if (ncol(meandf) == 3) {
    ternary.coord <- function(a, b, c) { # a = left, b = right, c = top
      return(data.frame(x = 1/2 * (2*b + c) / (a + b + c),
                        y = sqrt(3) / 2 * c / (a + b + c)))
    }
    cat(paste0("(a, b, c) or (left, right, top) are (",
               paste(colnames(meandf), collapse = ", "),
               ")"), sep = "\n")
    ## Data points
    df <- data.frame(x = 1/2 * (2*meandf[, 2] + meandf[, 3]),
                     y = sqrt(3)/2 * meandf[, 3],
                     abundance = abundance, row.names = rownames(meandf))
    ## Extreme points
    extreme <- data.frame(ternary.coord(a = c(1, 0, 0),
                                        b = c(0, 1, 0),
                                        c = c(0, 0, 1)),
                          labels = colnames(meandf),
                          row.names = c("left", "right", "top"))
  }

  if (ncol(meandf) == 4) {
    diamond.coord <- function(a, b, c, d) {
      return(data.frame(x = (a - c) / (a + b + c + d),
                        y = (b - d) / (a + b + c + d)))
    }
    cat(paste0("(a, b, c, d) or (right, top, left, bottom) are (",
               paste(colnames(meandf), collapse = ", "),
               ")"), sep = "\n")
    ## Data points
    df <- data.frame(x = (meandf[, 1] - meandf[, 3]),
                     y = (meandf[, 2] - meandf[, 4]),
                     abundance = abundance, row.names = rownames(meandf))
    ## Extreme points
    extreme <- data.frame(diamond.coord(a = c(1, 0, 0, 0),
                                        b = c(0, 1, 0, 0),
                                        c = c(0, 0, 1, 0),
                                        d = c(0, 0, 0, 1)),
                          labels = colnames(meandf),
                          row.names = c("right", "top", "left", "bottom"))
  }

  ## Merge coordinates with taxonomic information
  df$otu <- rownames(df)
  if (!is.null(tax_table(physeq, FALSE))) {
    tax <- data.frame(otu = rownames(tax_table(physeq)),
                      tax_table(physeq))
    ## Join explicitly on "otu" on both sides
    df <- merge(df, tax, by = "otu")
  }

  ## Add attributes
  attr(df, "labels") <- colnames(meandf)
  attr(df, "extreme") <- extreme
  attr(df, "type") <- if (ncol(meandf) == 3) "ternary" else "diamond"
  return(df)
}

## ternary_plot: ternary (3 groups) or diamond (4 groups) plot of OTUs.
##
## Args:
## - physeq: phyloseq class object, otus abundances are extracted from this
##           object
## - group: either a single character string matching a variable name in the
##          sample data of `physeq`, or a factor with the same length as the
##          number of samples in `physeq`
## - grid: logical, should a grid be plotted
## - size: mapping for size aesthetics, defaults to `log2(abundance)`
## - color: mapping for color aesthetics
## - shape: mapping for shape aesthetics
## - label: Default `NULL`. Character string. The name of the variable to map
##          to text labels on the plot.
## - levelOrder: order along which to rearrange the levels of `group`, passed
##               to ternary_norm
## - plot: logical, should the figure be plotted
## - raw: logical, should raw read counts be used to compute relative
##        abundances of an OTU among different conditions (defaults to FALSE)
## - normalizeGroups: logical, only used if raw = FALSE, should all levels be
##                    given equal weights (TRUE, default) or weights equal to
##                    their sizes (FALSE)
## Returns (invisibly) the ggplot object.
ternary_plot <- function(physeq, group, grid = TRUE, size = "log2(abundance)",
                         color = NULL, shape = NULL, label = NULL,
                         levelOrder = NULL, plot = TRUE,
                         raw = FALSE, normalizeGroups = TRUE) {
  data <- ternary_norm(physeq, group, levelOrder, raw, normalizeGroups)
  labels <- attr(data, "labels")
  extreme <- attr(data, "extreme")
  type <- attr(data, "type")

  if (type == "other") {
    stop("Ternary plots are only available for 3 or 4 levels")
  }

  ## Borders: each extreme point is joined to the next one, cyclically
  borders <- data.frame(x = extreme$x,
                        y = extreme$y,
                        xend = extreme$x[c(2:nrow(extreme), 1)],
                        yend = extreme$y[c(2:nrow(extreme), 1)])

  ## Coordinate helpers for the grid
  ternary.coord <- function(a, b, c) { # a = left, b = right, c = top
    return(data.frame(x = 1/2 * (2*b + c) / (a + b + c),
                      y = sqrt(3) / 2 * c / (a + b + c)))
  }
  diamond.coord <- function(a, b, c, d) {
    return(data.frame(x = (a - c) / (a + b + c + d),
                      y = (b - d) / (a + b + c + d)))
  }
  x <- seq(1, 9, 1) / 10

  ## Create base plot with theme_bw
  p <- ggplot() + theme_bw()
  ## Remove normal grid, axes titles and axes ticks
  p <- p + theme(panel.grid.major = element_blank(),
                 panel.grid.minor = element_blank(),
                 panel.border = element_blank(),
                 axis.ticks = element_blank(),
                 axis.text.x = element_blank(),
                 axis.text.y = element_blank(),
                 axis.title.x = element_blank(),
                 axis.title.y = element_blank())

  if (type == "ternary") {
    ## Prepare levels' labels, pushed slightly outside the corners
    axes <- extreme
    axes$x <- axes$x + c(-1/2, 1/2, 0) * 0.1
    axes$y <- axes$y + c(-sqrt(3)/4, -sqrt(3)/4, sqrt(3)/4) * 0.1
    ## Prepare ternary grid
    bottom.ticks <- ternary.coord(a = x, b = 1 - x, c = 0)
    left.ticks <- ternary.coord(a = x, b = 0, c = 1 - x)
    right.ticks <- ternary.coord(a = 0, b = 1 - x, c = x)
    ticks <- data.frame(bottom.ticks, left.ticks, right.ticks)
    colnames(ticks) <- c("xb", "yb", "xl", "yl", "xr", "yr")
    ## Add grid (optional)
    if (grid == TRUE) {
      p <- p + geom_segment(data = ticks,
                            aes(x = xb, y = yb, xend = xl, yend = yl),
                            size = 0.25, color = "grey40")
      p <- p + geom_segment(data = ticks,
                            aes(x = xb, y = yb, xend = xr, yend = yr),
                            size = 0.25, color = "grey40")
      p <- p + geom_segment(data = ticks,
                            aes(x = rev(xl), y = rev(yl), xend = xr, yend = yr),
                            size = 0.25, color = "grey40")
    }
  }

  if (type == "diamond") {
    ## Prepare levels' labels, pushed slightly outside the corners
    axes <- extreme
    axes$x <- axes$x + c(1, 0, -1, 0) * 0.1
    axes$y <- axes$y + c(0, 1, 0, -1) * 0.1
    ## Prepare diamond grid
    nw.ticks <- diamond.coord(a = x, b = 1 - x, c = 0, d = 0)
    ne.ticks <- diamond.coord(a = 0, b = x, c = 1 - x, d = 0)
    sw.ticks <- diamond.coord(a = x, b = 0, c = 0, d = 1 - x)
    se.ticks <- diamond.coord(a = 0, b = 0, c = 1 - x, d = x)
    ticks <- data.frame(nw.ticks, ne.ticks, se.ticks, sw.ticks)
    colnames(ticks) <- c("xnw", "ynw", "xne", "yne",
                         "xse", "yse", "xsw", "ysw")
    ## Add grid (optional)
    if (grid == TRUE) {
      p <- p + geom_segment(data = ticks,
                            aes(x = xnw, y = ynw, xend = xse, yend = yse),
                            size = 0.25, color = "grey40")
      p <- p + geom_segment(data = ticks,
                            aes(x = xne, y = yne, xend = xsw, yend = ysw),
                            size = 0.25, color = "grey40")
      p <- p + geom_segment(aes(x = c(0, -1), y = c(-1, 0),
                                xend = c(0, 1), yend = c(1, 0)),
                            size = 0.25, color = "grey40")
    }
  }

  ## Add borders
  p <- p + geom_segment(data = borders,
                        aes(x = x, y = y, xend = xend, yend = yend))
  ## Add levels' labels (`labels` is a column of `axes`)
  p <- p + geom_text(data = axes, aes(x = x, y = y, label = labels))

  ## Add any custom-supplied plot-mapped variables
  if (length(color) > 1) {
    data$color <- color
    names(data)[names(data) == "color"] <- deparse(substitute(color))
    color <- deparse(substitute(color))
  }
  if (length(shape) > 1) {
    data$shape <- shape
    names(data)[names(data) == "shape"] <- deparse(substitute(shape))
    shape <- deparse(substitute(shape))
  }
  if (length(label) > 1) {
    data$label <- label
    names(data)[names(data) == "label"] <- deparse(substitute(label))
    label <- deparse(substitute(label))
  }
  if (length(size) > 1) {
    data$size <- size
    names(data)[names(data) == "size"] <- deparse(substitute(size))
    size <- deparse(substitute(size))
  }

  ## Add data points. na.rm is a layer parameter, not an aesthetic, so it is
  ## passed to the geom rather than to the mapping.
  ternary_map <- aes_string(x = "x", y = "y", color = color,
                            shape = shape, size = size)
  p <- p + geom_point(data = data, mapping = ternary_map, na.rm = TRUE)

  ## Add the text labels
  if (!is.null(label)) {
    label_map <- aes_string(x = "x", y = "y", label = label)
    p <- p + geom_text(data = data, mapping = label_map,
                       size = 3, vjust = 1.5, na.rm = TRUE)
  }

  if (plot) {
    plot(p)
  }
  invisible(p)
}

# Assign every sample at random to one of three groups, so the grouping
# differs between runs, then rebuild the phyloseq object with the new
# sample variable.
samples_df$New_group <- paste0(
  "group_",
  replicate(nrow(samples_df), sample(c("A", "B", "C"), 1, replace = FALSE))
)
samples <- sample_data(samples_df)
carbom <- phyloseq(OTU, TAX, samples)

# color or shape are taxonomy
ternary_plot(carbom, "New_group", color = "Division")
参考
- phyloseq tutorial
- phyloseq: An R Package for Reproducible Interactive Analysis and Graphics of Microbiome Census Data
- phyloseq extend
- phyloseq tutorial 2