# INSTALL CRAN PACKAGES
# Install missing CRAN packages
install.packages(setdiff(
c(
"stats", "dplyr", "ggplot2", "flextable", "ggpubr",
"randomForest", "ggridges", "ggalluvial", "tibble",
"matrixStats", "RColorBrewer", "ape", "rlang",
"scales", "magrittr", "phangorn", "igraph", "tidyr",
"xml2", "data.table", "reshape2", "vegan", "patchwork", "officer"
),
installed.packages()[, "Package"]
))
# Load CRAN packages
lapply(c(
"stats", "dplyr", "ggplot2", "flextable", "ggpubr", "randomForest",
"ggridges", "ggalluvial", "tibble", "matrixStats", "RColorBrewer",
"ape", "rlang", "scales", "magrittr", "phangorn", "igraph", "tidyr",
"xml2", "data.table", "reshape2", "vegan", "patchwork", "officer"
), library, character.only = TRUE)
# INSTALL BIOCONDUCTOR PACKAGES
# Install BiocManager if not installed
if (!requireNamespace("BiocManager", quietly = TRUE)) install.packages("BiocManager")
# Install missing Bioconductor packages
BiocManager::install(setdiff(
c(
"phyloseq", "msa", "DESeq2", "ggtree", "edgeR",
"Biostrings", "DECIPHER", "microbiome", "limma",
"S4Vectors", "SummarizedExperiment", "TreeSummarizedExperiment"
),
installed.packages()[, "Package"]
))
# Load Bioconductor packages
lapply(
c(
"phyloseq", "msa", "DESeq2", "edgeR", "Biostrings", "ggtree", "DECIPHER",
"microbiome", "limma", "S4Vectors", "SummarizedExperiment", "TreeSummarizedExperiment"
),
library,
character.only = TRUE
)
# INSTALL DspikeIn FROM GITHUB
# To access the DspikeIn vignette for a detailed tutorial, use vignette("DspikeIn"), or browse all available vignettes with browseVignettes("DspikeIn").
devtools::install_github("mghotbi/DspikeIn", build_vignettes = TRUE, dependencies = TRUE)
library(DspikeIn)
browseVignettes("DspikeIn")
vignette("DspikeIn")
## or
if (!requireNamespace("devtools", quietly = TRUE)) install.packages("devtools")
devtools::install_github("mghotbi/DspikeIn")
# Load DspikeIn only if installed
if ("DspikeIn" %in% installed.packages()[, "Package"]) {
library(DspikeIn)
} else {
stop("DspikeIn installation failed. Check errors above.")
}
The development of the DspikeIn package was made possible through the generous and pioneering efforts of the R and Bioconductor communities. We gratefully acknowledge the developers and maintainers of the following open-source packages, whose tools and infrastructure underpin our work: Core infrastructure & data manipulation: methods, stats, utils, graphics, grDevices, data.table, dplyr, tibble, tidyr, reshape2, matrixStats, rlang, S4Vectors, grid, officer, xml2 Statistical analysis & modeling: DESeq2, edgeR, limma, randomForest, microbiome Phylogenetics & microbiome structure: phyloseq, TreeSummarizedExperiment, SummarizedExperiment, phangorn, ape, DECIPHER, msa, Biostrings Network and graph analysis: igraph, ggraph Visualization & layout design: ggplot2, ggrepel, ggpubr, ggnewscale, ggalluvial, ggtree, ggtreeExtra, ggstar, ggridges, patchwork, scales, RColorBrewer, flextable
These tools collectively empowered us to build a reproducible, modular, and extensible platform for robust absolute abundance quantification in microbial community analysis. We further acknowledge the broader scientific community working on absolute microbial quantification, spike-in calibration, and compositional data analysis, whose foundational insights directly informed the design and conceptual framework of DspikeIn.
The DspikeIn package supports both phyloseq and TreeSummarizedExperiment formats to streamline microbial quantification across diverse experimental setups. It accommodates either a single spike-in taxon or synthetic community taxa with variable or equal spike-in volumes and copy numbers. The package offers a comprehensive suite of tools for AA quantification, addressing challenges through ten core functions: 1) validation of spiked species, 2) data preprocessing, 3) system-specific spiked species retrieval, 4) scaling factor calculation, 5) conversion to absolute abundance, 6) bias correction and normalization, 7) performance assessment, and 8) taxa exploration and filtering 9) network topology assessment 10) further analyses and visualization.
DspikeIn works with 7 taxonomic ranks
# To remove strain from the taxonomic ranks
# DspikeIn works with 7 taxonomic ranks
# colnames(taxmat) <- c("Kingdom", "Phylum", "Class", "Order", "Family", "Genus", "Species")
library(phyloseq)
# Function to remove strain information from taxonomy columns
remove_strain_info <- function(tax_table) {
# Define the regex pattern for common strain identifiers
pattern <- "Strain.*|strain.*|\\s*\\[.*\\]|\\s*\\(.*\\)" # Adjust the regex to match specific strain formats
# Apply the pattern to each column of the taxonomy table
for (col in colnames(tax_table)) {
tax_table[, col] <- gsub(pattern, "", tax_table[, col]) # Remove strain info
tax_table[, col] <- trimws(tax_table[, col]) # Trim trailing whitespace
}
return(tax_table)
}
# Step 1: Extract the taxonomy table
taxonomy <- tax_table(ps)
# Step 2: Remove strain information (including the `Strain` column)
cleaned_taxonomy <- remove_strain_info(taxonomy)
# Remove the `Strain` column if it exists
if ("Strain" %in% colnames(cleaned_taxonomy)) {
cleaned_taxonomy <- cleaned_taxonomy[, colnames(cleaned_taxonomy) != "Strain"]
}
# Step 3: Update the taxonomy table in the phyloseq object
tax_table(ps) <- cleaned_taxonomy
# Step 4: Verify the changes
print(head(tax_table(ps))) # Display the first few rows
# To add species rank to the taxonomic ranks
library(phyloseq)
# Step 1: Extract taxonomy table safely
taxonomy <- as.data.frame(as.matrix(tax_table(ps)))
if (!"Genus" %in% colnames(taxonomy)) {
stop("The 'Genus' column is missing in the taxonomy table.")
}
# Step 2: Handle potential missing genera (optional but recommended)
taxonomy$Genus[is.na(taxonomy$Genus) | taxonomy$Genus == ""] <- "Unknown"
# Step 3: Create a new 'species' column
taxonomy$species <- paste0(taxonomy$Genus, "_OTU", seq_len(nrow(taxonomy)))
# Step 4: Assign back to phyloseq object (must coerce back to matrix)
tax_table(ps) <- tax_table(as.matrix(taxonomy))
for more information please refer to https://github.com/joey711/phyloseq & https://github.com/markrobinsonuzh/TreeSummarizedExperiment
# Build phyloseq
otu <- read.csv("otu.csv", header = TRUE, sep = ",", row.names = 1)
# taxonomic rank need to be capilalized, only the first letter of each rank
tax <- read.csv("tax.csv", header = TRUE, sep = ",", row.names = 1)
# Ensure 'spiked.volume' column is present and correctly formatted in metadata
meta <- read.csv("metadata.csv", header = TRUE, sep = ",")
# Convert data to appropriate formats
meta <- as.data.frame(meta)
taxmat <- as.matrix(tax)
otumat <- as.matrix(otu)
colnames(taxmat) <- c("Kingdom", "Phylum", "Class", "Order", "Family", "Genus", "Species")
OTU <- otu_table(otumat, taxa_are_rows = TRUE)
TAX <- phyloseq::tax_table(taxmat)
# Check
row.names(meta) <- sample_names(OTU)
metadata <- sample_data(meta)
# Build phyloseq obj
physeq <- phyloseq(OTU, TAX, metadata)
# Follow the next steps if tree and reference files are included
MyTree <- read.tree("tree.nwk")
reference_seqs <- readDNAStringSet(file = "dna-sequences.fasta", format = "fasta")
physeq_16SOTU <- merge_phyloseq(physeq, reference_seqs, MyTree)
physeq_16SOTU <- tidy_phyloseq_tse(physeq_16SOTU)
saveRDS(physeq_16SOTU, file = "physeq_16SOTU.rds")
physeq_16SOTU <- readRDS("physeq_16SOTU.rds")
# Build TreeSummarizedExperiment (TSE)
otu <- read.csv("otu.csv", header = TRUE, sep = ",", row.names = 1)
otu_mat <- as.matrix(otu) # Convert to matrix
tax <- read.csv("tax.csv", header = TRUE, sep = ",", row.names = 1)
colnames(tax) <- c("Kingdom", "Phylum", "Class", "Order", "Family", "Genus", "Species")
tax_mat <- as.matrix(tax) # Convert to matrix
meta <- read.csv("metadata.csv", header = TRUE, sep = ",", row.names = 1)
reference_seqs <- readDNAStringSet("dna-sequences.fasta", format = "fasta")
tse <- TreeSummarizedExperiment(
assays = list(counts = otu_mat), # OTU table
rowData = tax_mat, # Taxonomy information
colData = meta, # Sample metadata
rowTree = MyTree, # Phylogenetic tree
rowSeqs = reference_seqs # Reference sequences
)
Do all detected sample spike-in sequences cluster with the reference, and are their branch lengths statistically similar, supporting a common ancestor?
All sample-derived sequences are forming a clade with the reference. We look for a monophyletic grouping of spike-in OTUs The clade is strongly supported (bootstrap around 100 percentage). The branch lengths and distances are in a biologically plausible range.
# Use the Neighbor-Joining method based on a Jukes-Cantor distance matrix and plot the tree with bootstrap values.
# we compare the Sanger read of Tetragenococcus halophilus with the FASTA sequence of Tetragenococcus halophilus from our phyloseq object.
library(Biostrings)
library(phyloseq)
library(DspikeIn)
# Get path to external data folder
extdata_path <- system.file("extdata", package = "DspikeIn")
list.files(extdata_path)
## [1] "Complete.graphml" "NoBasid.graphml" "NoHubs.graphml" "Ref.fasta" "Sample.fasta"
data("physeq_16SOTU", package = "DspikeIn")
physeq_16SOTU <- tidy_phyloseq_tse(physeq_16SOTU)
physeq_16SOTU <- phyloseq::prune_taxa(
get_tax_table(physeq_16SOTU)$Kingdom %in% c("Bacteria", "Archaea"),
physeq_16SOTU
)
# Subset the phyloseq object to include only Tetragenococcus species first
# Tetragenococcus <- subset_taxa(physeq_16SOTU, Genus=="Tetragenococcus")
# Tetragenococcus <- subset_taxa(Tetragenococcus, !is.na(taxa_names(Tetragenococcus)) & taxa_names(Tetragenococcus) != "")
# tree <- phy_tree(Tetragenococcus)
# ref_sequences_Tetragenococcus <- refseq(Tetragenococcus)
# library(Biostrings)
# writeXStringSet(ref_sequences_Tetragenococcus, "Sample.fasta.fasta")
ref_fasta <- system.file("extdata", "Ref.fasta", package = "DspikeIn")
sample_fasta <- system.file("extdata", "Sample.fasta", package = "DspikeIn")
result <- validate_spikein_clade(
reference_fasta = ref_fasta,
sample_fasta = sample_fasta,
bootstrap = 200,
output_prefix = NULL)
## use default substitution matrix
Tip labels= OTU/ASV names Branch length numbers= Actual evolutionary distances (small = very similar) Prevalence stars How frequently the OTU occurs across samples Blue bar ring= Log10 mean abundance Outer colored tiles= The metadata variable you choose (e.g., Animal.type)
data("physeq_16SOTU", package = "DspikeIn")
library(ggstar)
library(ggplot2)
# filter your object to only include spike-in taxa beforehand:
# change the OTU IDs for easy detection
# Big stars = detected in many samples
# Small stars = rarely detected
# log10(Mean Abundance) Bars= Color intensity reflects mean abundance.
# The log-transformed average abundance of each OTU across all samples where it appears.
# Extreme blue may signal unintended over-representation.
# Metadata Ring = factor of your interest e.g. Animal.type
# Each OTU is colored by where it was observed.
# Branch length numbers= Actual evolutionary distances (small = very similar)
spikein <- phyloseq::subset_taxa(physeq_16SOTU, Genus == "Tetragenococcus")
taxa_names(spikein) <- paste0("OTU", seq_len(ntaxa(spikein)))
# Visualize
ps <- plot_spikein_tree_diagnostic(
obj = spikein,
metadata_var = "Animal.type",
output_prefix = "tetragenococcus_diag"
)
merges monophyletic ASVs/OTUs
# merges monophyletic ASVs/OTUs
# The function Pre_processing_species() merges ASVs/OTUs
# of a species using "sum" or "max" methods, preserving taxonomic,
# phylogenetic, and sequencing data.
# Load the phyloseq objs/ TSE obj
library(phyloseq)
library(DspikeIn)
library(TreeSummarizedExperiment)
library(SummarizedExperiment)
data("physeq_16SOTU", package = "DspikeIn")
# TSE format
# tse_16SOTU <- convert_phyloseq_to_tse(physeq_16SOTU)
# physeq_16SOTU <- convert_tse_to_phyloseq(tse_16SOTU)
physeq_16SOTU <- DspikeIn::tidy_phyloseq_tse(physeq_16SOTU) # make it tidy
# Check if metadata contains spiked volumes with this format
colnames(sample_data(physeq_16SOTU))
## [1] "sample.id" "X16S.biosample"
## [3] "dna.biosample" "data.type"
## [5] "ampliconlibrary.quantification.ng.ul" "plate.ID"
## [7] "well.location" "Env.broad.scale"
## [9] "Host.taxon" "Host.genus"
## [11] "Host.species" "Animal.type"
## [13] "Animal.ecomode" "Clade.Order"
## [15] "Family" "Diet"
## [17] "Diet.Detailed" "Habitat"
## [19] "Metamorphosis" "Reproduction"
## [21] "Ecoregion.III" "Ecoregion.IV"
## [23] "Site" "sample.name"
## [25] "biosample.parent" "data.type.1"
## [27] "ampliconlibrary.quantification.ng.ul.1" "plate.ID.1"
## [29] "well.location.1" "sample.or.blank"
## [31] "sample.spiked.blank" "spiked.volume"
## [33] "swab.presence" "MK.spike"
#
# CALCULATE SCALING FACTORS
# Calculate spike-in scaling factors
result <- calculate_spikeIn_factors(
obj = Spiked_16S_sum_scaled,
spiked_cells = spiked_cells,
merged_spiked_species = merged_spiked_species
)
# View extracted outputs
result$spiked_species_merged # Merged spiked species name
## phyloseq-class experiment-level object
## otu_table() OTU Table: [ 1 taxa and 264 samples ]
## sample_data() Sample Data: [ 264 samples by 34 sample variables ]
## tax_table() Taxonomy Table: [ 1 taxa by 7 taxonomic ranks ]
## refseq() DNAStringSet: [ 1 reference sequences ]
## # A tibble: 264 × 2
## Sample Total_Reads
## <chr> <dbl>
## 1 spiked.blank.20433_S84 8
## 2 spiked.blank.20817_S84 47066
## 3 Std2uL.20625_S84 62433
## 4 StdSwab1uL.20624_S72 17639
## 5 STP1719.20422_S47 14549
## 6 STP213.20423_S59 83
## 7 STP268.20424_S71 17
## 8 STP544.20419_S11 2259
## 9 STP570.20420_S23 822
## 10 STP579.20421_S35 1759
## # ℹ 254 more rows
scaling_factors <- result$scaling_factors
head(scaling_factors) # Vector of scaling factors per sample
## spiked.blank.20433_S84 spiked.blank.20817_S84 Std2uL.20625_S84 StdSwab1uL.20624_S72 STP1719.20422_S47
## 230.87500000 0.03924277 0.02958371 0.05235558 0.12695031
## STP213.20423_S59
## 22.25301205
# Convert relative counts to absolute counts
# absolute counts=relative counts×sample-specific scaling factor
# Convert to absolute counts
absolute <- convert_to_absolute_counts(Spiked_16S_sum_scaled, scaling_factors)
# Extract processed data
absolute_counts <- absolute$absolute_counts
physeq_absolute <- absolute$obj_adj
physeq_absolute <- tidy_phyloseq_tse(physeq_absolute)
# View absolute count data
head(absolute_counts)
## # A tibble: 6 × 264
## spiked.blank.20433_S84 spiked.blank.20817_S84 Std2uL.20625_S84 StdSwab1uL.20624_S72 STP1719.20422_S47
## <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 0 0 0 0 0
## 2 0 0 0 0 0
## 3 0 0 0 0 0
## 4 0 0 0 0 0
## 5 0 0 0 0 0
## 6 0 0 0 0 0
## # ℹ 259 more variables: STP213.20423_S59 <dbl>, STP268.20424_S71 <dbl>, STP544.20419_S11 <dbl>,
## # STP570.20420_S23 <dbl>, STP579.20421_S35 <dbl>, STP614.20418_S94 <dbl>, UHM1000.20604_S22 <dbl>,
## # UHM1001.20609_S82 <dbl>, UHM1007.20622_S48 <dbl>, UHM1009.20614_S47 <dbl>, UHM1010.20621_S36 <dbl>,
## # UHM1011.20606_S46 <dbl>, UHM1024.20620_S24 <dbl>, UHM1026.20607_S58 <dbl>, UHM1028.20613_S35 <dbl>,
## # UHM1032.20605_S34 <dbl>, UHM1033.20619_S12 <dbl>, UHM1034.20616_S71 <dbl>, UHM1035.20611_S11 <dbl>,
## # UHM1036.20612_S23 <dbl>, UHM1052.20615_S59 <dbl>, UHM1060.20723_S1 <dbl>, UHM1065.20724_S13 <dbl>,
## # UHM1068.20732_S14 <dbl>, UHM1069.20742_S39 <dbl>, UHM1070.20725_S25 <dbl>, UHM1071.20733_S26 <dbl>, …
# CALCULATE SPIKE PERCENTAGE & summary stat
# ** Calculate spike percentage & Generate summary statistics for absolute counts**
# Generate summary statistics for absolute counts
post_eval_summary <- calculate_summary_stats_table(absolute_counts)
# You may want to Back normal to check calculation accuracy
# the scaling factor was computed based on spiked species reads and fixed cell count.
# Multiplying the spiked species read count by this scaling factor restores the exact spiked cell count.
# lets check it
# BackNormal <- calculate_spike_percentage(
# physeq_absolute,
# merged_spiked_species,
# passed_range = c(0.1, 20)
# )
# Optional, or you may do it at the end of the process
# ** Time to filter out unsuccessful spiked samples**
library(phyloseq)
library(dplyr)
library(tibble)
library(microbiome)
filtered_sample_data <- microbiome::meta(physeq_absolute) %>%
as.data.frame() %>%
tibble::rownames_to_column(var = "Sample") %>%
dplyr::mutate(Sample = as.character(Sample)) %>%
dplyr::left_join(Perc, by = "Sample")
filtered_sample_data <- tibble::column_to_rownames(filtered_sample_data, "Sample")
filtered_sample_data <- sample_data(as.data.frame(filtered_sample_data))
# Assign back to phyloseq obj
sample_data(physeq_absolute) <- filtered_sample_data
The goal is to identify the range where, for example, the evenness of your community remains independent of spiked species retrieval—meaning the p-value should not be significant, and the R² value should be low, indicating minimal influence.
# The acceptable range of spiked species retrieval is system-dependent
# Spiked species become centroid of the community (Distance to Centroid)
# Spiked species become dominant and imbalance the community (Evenness)
# What range of spiked species retrieval is appropriate for your system?
# Calculate Pielou's Evenness using Shannon index and species richness (Observed)
# Load required libraries
library(vegan)
# physeq_ab <- normalization_set(physeq_absolute, method = "rar")
# physeq_absolute <- physeq_ab$dat.normed
# Calculate Pielou's Evenness using Shannon index and species richness (Observed)
alphab <- phyloseq::estimate_richness(physeq_absolute, measures = c("Observed", "Shannon"))
alphab$Pielou_evenness <- alphab$Shannon / alphab$Observed
# Normalize values
alphab <- alphab %>%
mutate(across(c("Observed", "Shannon", "Pielou_evenness"), ~ as.numeric(scale(.))))
metadata <- as.data.frame(microbiome::meta(physeq_absolute))
metadata$Sample <- rownames(metadata)
alphab$Sample <- rownames(alphab)
# Merge alpha diversity metrics into metadata
metadata <- dplyr::left_join(metadata, alphab[, c("Sample", "Observed", "Shannon", "Pielou_evenness")], by = "Sample")
metadata <- metadata %>%
column_to_rownames(var = "Sample")
# Updated metadata back to the phyloseq obj
sample_data(physeq_absolute) <- sample_data(metadata)
if (!"Spiked_Reads" %in% colnames(metadata)) {
stop("Column 'Spiked_Reads' not found in metadata.")
}
# Generate regression plot
plot_object <- regression_plot(
data = metadata,
x_var = "Pielou_evenness",
y_var = "Spiked_Reads",
custom_range = c(0.1, 20, 30, 50, 100),
plot_title = NULL
)
# * Calculate the percentage of spiked species retrieval per sample*
absolute_abundance_16S_OTU_perc <- phyloseq::subset_samples(physeq_absolute, sample.or.blank != "blank")
# Adjust the threshold range as needed based on system specifications
result_perc <- calculate_spike_percentage(
absolute_abundance_16S_OTU_perc,
merged_spiked_species,
passed_range = c(0.1, 20)
)
# BackNormal
conc <- conclusion(absolute_abundance_16S_OTU_perc,
merged_spiked_species,
max_passed_range = 20,
output_path
)
conc$full_report
## # A tibble: 260 × 5
## Sample Total_Reads Spiked_Reads Percentage Result
## <chr> <dbl> <dbl> <dbl> <chr>
## 1 STP1719.20422_S47 2429 1847 76.0 failed
## 2 STP213.20423_S59 188314 1847 0.981 passed
## 3 STP268.20424_S71 757488 1847 0.244 passed
## 4 STP544.20419_S11 1913 1847 96.5 failed
## 5 STP570.20420_S23 5948 1847 31.1 failed
## 6 STP579.20421_S35 5452 1847 33.9 failed
## 7 STP614.20418_S94 5 0 0 failed
## 8 UHM1000.20604_S22 43347 924 2.13 passed
## 9 UHM1001.20609_S82 39309 924 2.35 passed
## 10 UHM1007.20622_S48 86418 924 1.07 passed
## # ℹ 250 more rows
mean_total_reads_spiked | sd_total_reads_spiked | median_total_reads_spiked | mean_percentage | sd_percentage | median_percentage | passed_count | failed_count |
---|---|---|---|---|---|---|---|
113,281.1 | 306,326.4 | 29,339.5 | 20.21071 | 32.43013 | 4.156814 | 178 | 82 |
# you may keep only passed data
# Filter to get only the samples that passed
passed_samples <- result_perc$Sample[result_perc$Result == "passed"]
# Subset the original phyloseq object to keep only the samples that passed
passed_physeq <- phyloseq::prune_samples(passed_samples, absolute_abundance_16S_OTU_perc)
physeq_absolute <- absolute$obj_adj
pps_Abs <- DspikeIn::get_long_format_data(physeq_absolute)
# calculation for relative abundance needs sum of total reads
# total_reads <- sum(pps_Abs$Abundance)
# Generate an alluvial plot
alluvial_plot_abs <- alluvial_plot(
data = pps_Abs,
axes = c("Host.genus", "Ecoregion.III", "Diet"),
abundance_threshold = 10000,
fill_variable = "Family",
silent = TRUE,
abundance_type = "absolute",
top_taxa = 15,
text_size = 4,
legend_ncol = 1,
custom_colors = DspikeIn::color_palette$light_MG # Use the color palette from DspikeIn
)
you may select to transform your data befor moving forward with Differential Abundance
# you may need to normalize/transform your data to reduce biases
ps <- physeq_16SOTU
# TC Normalization
result_TC <- normalization_set(ps, method = "TC", groups = "Host.species")
normalized_ps_TC <- result_TC$dat.normed
scaling_factors_TC <- result_TC$scaling.factor
# UQ Normalization
data("physeq_16SOTU", package = "DspikeIn")
ps <- physeq_16SOTU
result_UQ <- normalization_set(ps, method = "UQ", groups = "Host.species")
normalized_ps_UQ <- result_UQ$dat.normed
scaling_factors_UQ <- result_UQ$scaling.factor
# Median Normalization
data("physeq_16SOTU", package = "DspikeIn")
ps <- physeq_16SOTU
result_med <- normalization_set(ps, method = "med", groups = "Host.species")
normalized_ps_med <- result_med$dat.normed
scaling_factors_med <- result_med$scaling.factor
# DESeq Normalization
data("physeq_16SOTU", package = "DspikeIn")
ps <- physeq_16SOTU
ps_n <- remove_zero_negative_count_samples(ps)
result_DESeq <- normalization_set(ps_n, method = "DESeq", groups = "Animal.type")
normalized_ps_DESeq <- result_DESeq$dat.normed
scaling_factors_DESeq <- result_DESeq$scaling.factor
# Poisson Normalization
data("physeq_16SOTU", package = "DspikeIn")
ps <- physeq_16SOTU
result_Poisson <- normalization_set(ps, method = "Poisson", groups = "Host.genus")
normalized_ps_Poisson <- result_Poisson$dat.normed
scaling_factors_Poisson <- result_Poisson$scaling.factor
# Quantile Normalization
data("physeq_16SOTU", package = "DspikeIn")
ps <- physeq_16SOTU
result_QN <- normalization_set(ps, method = "QN")
normalized_ps_QN <- result_QN$dat.normed
scaling_factors_QN <- result_QN$scaling.factor
# TMM Normalization
data("physeq_16SOTU", package = "DspikeIn")
ps <- physeq_16SOTU
result_TMM <- normalization_set(ps, method = "TMM", groups = "Animal.type")
normalized_ps_TMM <- result_TMM$dat.normed
scaling_factors_TMM <- result_TMM$scaling.factor
# CLR Normalization
data("physeq_16SOTU", package = "DspikeIn")
ps <- physeq_16SOTU
result_clr <- normalization_set(ps, method = "clr")
normalized_ps_clr <- result_clr$dat.normed
scaling_factors_clr <- result_clr$scaling.factor
# Rarefying
data("physeq_16SOTU", package = "DspikeIn")
ps <- physeq_16SOTU
result_rar <- normalization_set(ps, method = "rar")
normalized_ps_rar <- result_rar$dat.normed
scaling_factors_rar <- result_rar$scaling.factor
# CSS Normalization
data("physeq_16SOTU", package = "DspikeIn")
ps <- physeq_16SOTU
result_css <- normalization_set(ps, method = "css")
normalized_ps_css <- result_css$dat.normed
scaling_factors_css <- result_css$scaling.factor
# TSS Normalization
data("physeq_16SOTU", package = "DspikeIn")
ps <- physeq_16SOTU
result_tss <- normalization_set(ps, method = "tss")
normalized_ps_tss <- result_tss$dat.normed
scaling_factors_tss <- result_tss$scaling.factor
# RLE Normalization
data("physeq_16SOTU", package = "DspikeIn")
ps <- physeq_16SOTU
result_rle <- normalization_set(ps, method = "rle")
normalized_ps_rle <- result_rle$dat.normed
scaling_factors_rle <- result_rle$scaling.factor
Ridge plot
rf_physeq <- RandomForest_selected(
physeq_16SOTU,
response_var = "Host.genus",
na_vars = c("Habitat", "Ecoregion.III", "Host.genus", "Diet")
)
ridge_physeq <- ridge_plot_it(rf_physeq, taxrank = "Family", top_n = 10) + scale_fill_manual(values = DspikeIn::color_palette$cool_MG)
# ridge_physeq
result_css <- normalization_set(rf_physeq, method = "css")
normalized_ps_css <- result_css$dat.normed
ridge_physeq <- ridge_plot_it(normalized_ps_css, taxrank = "Family", top_n = 10) + scale_fill_manual(values = DspikeIn::color_palette$cool_MG)
# ridge_physeq
remove the spike-in sp before further analysis
absolute <- phyloseq::subset_taxa(physeq_absolute, Genus != "Tetragenococcus")
absolute <- subset_taxa(absolute, Family != "Chloroplast" & Order != "Chloroplast")
Caudate_abs <- phyloseq::subset_samples(absolute, Clade.Order == "Caudate")
Three_Genara_abs <- phyloseq::subset_samples(Caudate_abs, Host.genus %in% c("Desmognathus", "Plethodon", "Eurycea"))
Three_Genara_abs_BlueRidge <- phyloseq::subset_samples(Three_Genara_abs, Ecoregion.III == "Blue Ridge")
Desmog_Blue_Ins_16_abs <- phyloseq::subset_samples(Three_Genara_abs_BlueRidge, Host.genus == "Desmognathus")
results_DESeq2 <- perform_and_visualize_DA(
obj = Desmog_Blue_Ins_16_abs,
method = "DESeq2",
group_var = "Host.taxon",
contrast = c("Desmognathus monticola", "Desmognathus imitator"),
output_csv_path = "DA_DESeq2.csv",
target_glom = "Genus",
significance_level = 0.05
)
head(results_DESeq2$results)
## # A tibble: 6 × 17
## baseMean logFC lfcSE stat pvalue padj FDR Significance group OTU Kingdom Phylum Class Order Family Genus
## <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <chr> <fct> <chr> <chr> <chr> <chr> <chr> <chr> <chr>
## 1 10 9.11e-8 0.241 3.77e-7 1.00 1 1 Not Signifi… Desm… b004… Bacter… Armat… uncu… uncu… uncul… uncu…
## 2 2 1.33e-7 0.533 2.51e-7 1.00 1 1 Not Signifi… Desm… f872… Bacter… Prote… Gamm… Chro… Chrom… Thio…
## 3 1.36 -1.06e-6 0.749 -1.41e-6 1.00 1 1 Not Signifi… Desm… df13… Bacter… Firmi… Synt… Synt… Syntr… Synt…
## 4 2 1.33e-7 0.533 2.51e-7 1.00 1 1 Not Signifi… Desm… ed28… Bacter… Firmi… Symb… Symb… Symbi… uncu…
## 5 12 8.26e-8 0.221 3.73e-7 1.00 1 1 Not Signifi… Desm… 63f5… Bacter… Gemma… Gemm… Gemm… Gemma… uncu…
## 6 13.8 -3.68e-1 0.245 -1.50e+0 0.132 1 1 Not Signifi… Desm… fea9… Bacter… Gemma… Gemm… Gemm… Gemma… Gemm…
## # ℹ 1 more variable: Species <chr>
## phyloseq-class experiment-level object
## otu_table() OTU Table: [ 27 taxa and 42 samples ]
## sample_data() Sample Data: [ 42 samples by 34 sample variables ]
## tax_table() Taxonomy Table: [ 27 taxa by 7 taxonomic ranks ]
## phy_tree() Phylogenetic Tree: [ 27 tips and 26 internal nodes ]
## refseq() DNAStringSet: [ 27 reference sequences ]
# Relative abundance
data("physeq_16SOTU", package = "DspikeIn")
physeq_16SOTU <- tidy_phyloseq_tse(physeq_16SOTU)
relative <- phyloseq::subset_taxa(physeq_16SOTU, Genus != "Tetragenococcus")
Caudate_rel <- phyloseq::subset_samples(relative, Clade.Order == "Caudate")
Three_Genara_rel <- phyloseq::subset_samples(Caudate_rel, Host.genus %in% c("Desmognathus", "Plethodon", "Eurycea"))
Three_Genara_rel_BlueRidge <- phyloseq::subset_samples(Three_Genara_rel, Ecoregion.III == "Blue Ridge")
Desmog_Blue_Ins_16_rel <- phyloseq::subset_samples(Three_Genara_rel_BlueRidge, Host.genus == "Desmognathus")
results_DESeq2_rel <- perform_and_visualize_DA(
obj = Desmog_Blue_Ins_16_rel,
method = "DESeq2",
group_var = "Host.taxon",
contrast = c("Desmognathus monticola", "Desmognathus imitator"),
output_csv_path = "DA_DESeq2.csv",
target_glom = "Genus",
significance_level = 0.05
)
# head(results_DESeq2_rel$results) # sig taxa
# results_DESeq2_rel$plot
# results_DESeq2_rel$bar_plot
# results_DESeq2_rel$obj_significant
# DspikeIn: Differential Abundance Examples
# Using perform_and_visualize_DA() with multiple contrast scenarios
# Load example dataset from the package
data("physeq_16SOTU", package = "DspikeIn")
# 1. Single Contrast
res_single <- perform_and_visualize_DA(
obj = physeq_16SOTU,
method = "DESeq2",
group_var = "Diet",
contrast = c("Insectivore", "Carnivore"),
target_glom = "Genus")
# 2. Single Factor — All Pairwise Contrasts
# level names for DESeq2 compatibility
sample_data(physeq_16SOTU)$Host.taxon <- factor(make.names(sample_data(physeq_16SOTU)$Host.taxon))
# Get unique levels
host_levels <- levels(sample_data(physeq_16SOTU)$Host.taxon)
# contrast list
contrast_named <- list(
Host.taxon = combn(host_levels, 2, simplify = FALSE))
# multiple pairwise contrasts
res_multi <- perform_and_visualize_DA(
obj = physeq_16SOTU,
method = "DESeq2",
group_var = "Host.taxon",
contrast = contrast_named,
target_glom = "Genus")
# 3. Single Factor — Selected Contrasts
contrast_list <- list(
c("Insectivore", "Carnivore"),
c("Omnivore", "Herbivore"))
res_selected <- perform_and_visualize_DA(
obj = physeq_16SOTU,
method = "DESeq2",
group_var = "Diet",
contrast = contrast_list,
global_fdr = TRUE)
# 4. Multiple Factors — Selected Contrasts
contrast_named <- list(
Diet = list(
c("Insectivore", "Carnivore"),
c("Omnivore", "Carnivore") ),
Animal.type = list(
c("Frog", "Salamander") ))
res_multi_factor <- perform_and_visualize_DA(
obj = physeq_16SOTU,
method = "DESeq2",
contrast = contrast_named,
target_glom = "Genus",
significance_level = 0.01,
global_fdr = TRUE)
# 5. Multiple Factors — all pairwise contrasts
# Ensure clean factor levels
sample_data(physeq_16SOTU)$Host.taxon <- droplevels(factor(sample_data(physeq_16SOTU)$Host.taxon))
sample_data(physeq_16SOTU)$Habitat <- droplevels(factor(sample_data(physeq_16SOTU)$Habitat))
# pairwise contrasts
host_levels <- levels(sample_data(physeq_16SOTU)$Host.taxon)
habitat_levels <- levels(sample_data(physeq_16SOTU)$Habitat)
contrast_named <- list(
Host.taxon = combn(host_levels, 2, simplify = FALSE),
Habitat = combn(habitat_levels, 2, simplify = FALSE))
results_multi_factor <- perform_and_visualize_DA(
obj = physeq_16SOTU,
method = "DESeq2",
contrast = contrast_named,
target_glom = "Genus",
significance_level = 0.01)
# 6. Multiple Factors - combined group contrasts (interactions)
# Create a combined grouping variable
sample_data(physeq_16SOTU)$ComboGroup <- factor(interaction(
sample_data(physeq_16SOTU)$Animal.type,
sample_data(physeq_16SOTU)$Diet,
drop = TRUE))
# selected contrasts
contrast_list <- list(
c("Salamander.Insectivore", "Lizard.Insectivore"),
c("Salamander.Carnivore", "Snake.Carnivore"),
c("Salamander.Carnivore", "Frog.Carnivore"))
# multi-contrast analysis
res_combo <- perform_and_visualize_DA(
obj = physeq_16SOTU,
method = "DESeq2",
group_var = "ComboGroup",
contrast = contrast_list,
target_glom = "Genus",
global_fdr = TRUE)
# Visualization of community composition
Rel <- phyloseq::subset_taxa(physeq_16SOTU, Genus != "Tetragenococcus")
Prok_OTU_spiked <- phyloseq::subset_samples(Rel, spiked.volume %in% c("2", "1"))
Prok_OTU_spiked <- phyloseq::subset_samples(Prok_OTU_spiked, sample.or.blank != "blank")
Prok_OTU_sal <- phyloseq::subset_samples(Prok_OTU_spiked, Animal.type == "Salamander")
Prok_OTU_sal <- tidy_phyloseq_tse(Prok_OTU_sal)
TB<-taxa_barplot(
Prok_OTU_sal,
target_glom = "Genus",
custom_tax_names = NULL,
normalize = TRUE,
treatment_variable = "Habitat",
abundance_type = "relative",
x_angle = 25,
fill_variable = "Family",
facet_variable = "Diet",
top_n_taxa = 20,
palette = DspikeIn::color_palette$mix_MG,
legend_size = 11,
legend_columns = 1,
x_scale = "free",
xlab = NULL)
# 1. Initialization and loading NetWorks for Comparision
# library(SpiecEasi)
# library(ggnet)
# library(igraph)
library(DspikeIn)
library(tidyr)
library(dplyr)
library(ggpubr)
library(igraph)
# To create a microbial co-occurrence network, you can refer to the SpiecEasi package available at:
# SpiecEasi GitHub Repository https://github.com/zdk123/SpiecEasi
# herp.Bas.rel.f is a merged phyloseq object for both bacterial and fungal domains
# spiec.easi(herp.Bas.rel.f, method='mb', lambda.min.ratio=1e-3, nlambda=250,pulsar.select=TRUE )
Complete <- load_graphml("Complete.graphml")
NoBasid <- load_graphml("NoBasid.graphml")
NoHubs <- load_graphml("NoHubs.graphml")
# result <- weight_Network(graph_path = "Complete.graphml")
# result
result_kk <- degree_network(
graph_path = load_graphml("Complete.graphml"),
save_metrics = TRUE,
layout_type = "stress"
)
# print(result_kk$plot)
# 2. Metrics Calculation
result_Complete <- node_level_metrics(Complete)
result_NoHubs <- node_level_metrics(NoHubs)
result_NoBasid <- node_level_metrics(NoBasid)
Complete_metrics <- result_Complete$metrics
Nohub_metrics <- result_NoHubs$metrics
Nobasid_metrics <- result_NoBasid$metrics
Complete_metrics <- data.frame(lapply(Complete_metrics, as.character), stringsAsFactors = FALSE)
Nohub_metrics <- data.frame(lapply(Nohub_metrics, as.character), stringsAsFactors = FALSE)
Nobasid_metrics <- data.frame(lapply(Nobasid_metrics, as.character), stringsAsFactors = FALSE)
print(igraph::vcount(Complete)) # Number of nodes
## [1] 308
## [1] 1144
## [1] 307
## [1] 1187
## [1] 286
## [1] 916
metrics_scaled <- bind_rows(
Complete_metrics %>% mutate(Network = "Complete"),
Nohub_metrics %>% mutate(Network = "NoHubs"),
Nobasid_metrics %>% mutate(Network = "NoBasid")
) %>%
dplyr::mutate(dplyr::across(where(is.numeric), scale))
metrics_long_scaled <- metrics_scaled %>%
tidyr::pivot_longer(cols = -c(Node, Network), names_to = "Metric", values_to = "Value")
bind the metrics to plot them
# 3. Visualization
# Remove missing values
metrics_long_scaled <- na.omit(metrics_long_scaled)
# We visualize only six metrics
selected_metrics <- c(
"Degree", "Closeness", "Betweenness",
"EigenvectorCentrality", "PageRank", "Transitivity"
)
metrics_long_filtered <- metrics_long_scaled %>%
filter(Metric %in% selected_metrics) %>%
mutate(
Value = as.numeric(as.character(Value)),
Network = recode(Network,
"Complete" = "Complete Network",
"NoHubs" = "Network & Module Hubs Removed",
"NoBasid" = "Basidiobolus Subnetwork Removed" ) ) %>%
na.omit() # Remove any NA if any
metrics_long_filtered$Network <- factor(metrics_long_filtered$Network,
levels = c(
"Complete Network",
"Network & Module Hubs Removed",
"Basidiobolus Subnetwork Removed" ))
# DspikeIn::color_palette$light_MG
network_colors <- c(
"Complete Network" = "#F1E0C5",
"Network & Module Hubs Removed" = "#D2A5A1",
"Basidiobolus Subnetwork Removed" = "#B2C3A8")
# statistical comparisons a vs b
comparisons <- list(
c("Complete Network", "Network & Module Hubs Removed"),
c("Complete Network", "Basidiobolus Subnetwork Removed"),
c("Network & Module Hubs Removed", "Basidiobolus Subnetwork Removed"))
networks_in_data <- unique(metrics_long_filtered$Network)
comparisons <- comparisons[sapply(comparisons, function(pair) all(pair %in% networks_in_data))]
met <- ggplot(metrics_long_filtered, aes(x = Network, y = Value, fill = Network)) +
geom_boxplot(outlier.shape = NA) +
geom_jitter(aes(color = Network),
position = position_jitter(0.2), alpha = 0.2, size = 1.5 ) +
scale_fill_manual(values = network_colors) +
scale_color_manual(values = network_colors) +
facet_wrap(~Metric, scales = "free_y", labeller = label_wrap_gen(width = 20)) +
ggpubr::stat_compare_means(method = "wilcox.test", label = "p.signif", comparisons = comparisons) +
theme_minimal() +
theme(
axis.title.x = element_blank(),
axis.title.y = element_blank(),
axis.text.x = element_text(size = 10, angle = 10, hjust = 0.8),
strip.text = element_text(size = 12, margin = margin(t = 15, b = 5)),
legend.position = "top",
legend.text = element_text(size = 13),
legend.title = element_text(size = 13, face = "bold"),
plot.title = element_text(size = 13, face = "bold")
) + labs(title = "Selected Node Metrics Across Networks", fill = "Network Type", color = "Network Type")
Complete <- load_graphml("Complete.graphml")
result2 <- extract_neighbors(
graph = Complete,
target_node = "OTU69:Basidiobolus_sp", mode = "all")
head(result2$summary)
## # A tibble: 6 × 2
## Type Node
## <chr> <chr>
## 1 First Neighbor OTU8:Mortierella_sp
## 2 First Neighbor OTU13:Mortierella_sp
## 3 First Neighbor OTU15:Mortierella_sp
## 4 First Neighbor OTU16:Ascomycota_sp
## 5 First Neighbor OTU18:Helotiales_sp
## 6 First Neighbor OTU19:Margaritispora_monticola
Metric | Description |
---|---|
‘Node’ | Node name (character format) |
‘Degree’ | Number of edges connected to the node |
‘Strength’ | Sum of edge weights connected to the node |
‘Closeness’ | Closeness centrality (normalized, based on shortest paths) |
‘Betweenness’ | Betweenness centrality (normalized, measures control over network flow) |
‘EigenvectorCentrality’ | Eigenvector centrality (importance based on connections to influential nodes) |
‘PageRank’ | PageRank score (importance based on incoming links) |
‘Transitivity’ | Local clustering coefficient (tendency of a node to form triangles) |
‘Coreness’ | Node’s coreness (from k-core decomposition) |
‘Constraint’ | Burt’s constraint (measures structural holes in a node’s ego network) |
‘EffectiveSize’ | Inverse of constraint (larger values = more non-redundant connections) |
‘Redundancy’ | Sum of constraint values of a node’s alters |
‘Community’ | Community assignment from Louvain clustering |
‘Efficiency’ | Global efficiency (average inverse shortest path length) |
‘Local_Efficiency’ | Local efficiency (subgraph efficiency for a node’s neighbors) |
‘Within_Module_Connectivity’ | Proportion of neighbors in the same community |
‘Among_Module_Connectivity’ | Proportion of neighbors in different communities |
# Load required libraries
library(igraph)
library(dplyr)
library(tidyr)
library(ggplot2)
library(ggrepel)
# Compute Node-Level Metrics
completeMetrics <- node_level_metrics(Complete)
NoHubsMetrics <- node_level_metrics(NoHubs)
NoBasidMetrics <- node_level_metrics(NoBasid)
# completeMetrics$facet_plot
# Ensure each dataset has a "Network" column before combining
completeMetrics$metrics <- completeMetrics$metrics %>% mutate(Network = "Complete Network")
NoHubsMetrics$metrics <- NoHubsMetrics$metrics %>% mutate(Network = "Network & Module Hubs Removed")
NoBasidMetrics$metrics <- NoBasidMetrics$metrics %>% mutate(Network = "Basidiobolus Subnetwork Removed")
# Combine All Data
combined_data <- bind_rows(
completeMetrics$metrics,
NoHubsMetrics$metrics,
NoBasidMetrics$metrics
)
# Add Node Identifier if missing
if (!"Node" %in% colnames(combined_data)) {
combined_data <- combined_data %>% mutate(Node = rownames(.))
}
# Convert `Network` into Factor
combined_data$Network <- factor(combined_data$Network, levels = c(
"Complete Network",
"Network & Module Hubs Removed",
"Basidiobolus Subnetwork Removed"))
# Convert Data to Long Format
metrics_long <- combined_data %>%
pivot_longer(
cols = c("Redundancy", "Efficiency", "Betweenness"),
names_to = "Metric", values_to = "Value")
# Define Custom Colors and Shapes
network_colors <- c(
"Complete Network" = "#F1E0C5",
"Network & Module Hubs Removed" = "#D2A5A1",
"Basidiobolus Subnetwork Removed" = "#B2C3A8")
network_shapes <- c(
"Complete Network" = 21,
"Network & Module Hubs Removed" = 22,
"Basidiobolus Subnetwork Removed" = 23)
# Determine Top 30% of Nodes to Label/Optional
metrics_long <- metrics_long %>%
group_by(Network, Metric) %>%
mutate(Label = ifelse(rank(-Value, ties.method = "random") / n() <= 0.3, Node, NA))
# ?quadrant_plot() can creat plot for indivisual network
# plot <- quadrant_plot(metrics, x_metric = "Degree", y_metric = "Efficiency")
# Create comparision Plots
create_metric_plot <- function(metric_name, data, title) {
data_filtered <- data %>% filter(Metric == metric_name)
median_degree <- median(data_filtered$Degree, na.rm = TRUE)
median_value <- median(data_filtered$Value, na.rm = TRUE)
ggplot(data_filtered, aes(x = Degree, y = Value, fill = Network)) +
geom_point(aes(shape = Network), size = 3, stroke = 1, color = "black") +
geom_text_repel(aes(label = Label), size = 3, max.overlaps = 50) +
scale_fill_manual(values = network_colors) +
scale_shape_manual(values = network_shapes) +
geom_vline(xintercept = median_degree, linetype = "dashed", color = "black", size = 1) +
geom_hline(yintercept = median_value, linetype = "dashed", color = "black", size = 1) +
labs(
title = title,
x = "Degree",
y = metric_name,
fill = "Network",
shape = "Network" ) +
theme_minimal() +
theme(
plot.title = element_text(
hjust = 0.5, size = 16, face = "bold",
margin = margin(t = 10, b = 20) # Moves the title downward
),
axis.title = element_text(size = 14, face = "bold"),
legend.position = "top",
legend.title = element_text(size = 14, face = "bold"),
legend.text = element_text(size = 12) )
}
# Generate Plots
plot_redundancy <- create_metric_plot("Redundancy", metrics_long, "Redundancy vs. Degree Across Networks")
plot_efficiency <- create_metric_plot("Efficiency", metrics_long, "Efficiency vs. Degree Across Networks")
plot_betweenness <- create_metric_plot("Betweenness", metrics_long, "Betweenness vs. Degree Across Networks")
# Save Plots
# ggsave("plot_redundancy_20percent.png", plot_redundancy, width = 8, height = 6)
# ggsave("plot_efficiency_20percent.png", plot_efficiency, width = 8, height = 6)
# ggsave("plot_betweenness_20percent.png", plot_betweenness, width = 8, height = 6)
# Print Plots
# print(plot_redundancy)
# print(plot_efficiency)
# print(plot_betweenness)
# Compute degree metrics and visualize the network
# Options: `"stress"` (default), `"graphopt"`, `"fr"`
result <- degree_network(graph_path = Complete, save_metrics = TRUE)
# print(result$metrics)
# print(result$plot)
# Compute network weights for different graph structures
NH <- weight_Network(graph_path = "NoHubs.graphml")
NB <- weight_Network(graph_path = "NoBasid.graphml")
C <- weight_Network(graph_path = "Complete.graphml")
# Extract metrics from the computed network weights
CompleteM <- C$metrics
NoHubsM <- NH$metrics
NoBasidM <- NB$metrics
# Combine metrics into a single dataframe for comparison
df <- bind_rows(
CompleteM %>% mutate(Group = "CompleteM"),
NoHubsM %>% mutate(Group = "NoHubsM"),
NoBasidM %>% mutate(Group = "NoBasidM")) %>%
pivot_longer(cols = -Group, names_to = "Metric", values_to = "Value")
# Aggregate the total values by metric and group
df_bar <- df %>%
group_by(Metric, Group) %>%
summarise(Total_Value = sum(Value), .groups = "drop")
# Plot the metrics comparison
pg<-ggplot(df_bar, aes(x = Metric, y = log1p(Total_Value), fill = Group)) +
geom_bar(stat = "identity", position = "dodge", alpha = 0.8) +
theme_minimal(base_size = 14) +
theme(axis.text.x = element_text(angle = 45, hjust = 1)) +
scale_fill_manual(values = c("#F1E0C5", "#D2A5A1", "#B2C3A8")) +
labs(
title = "Total Network Metrics Comparison",
x = "Metric",
y = "Total Value (log-scaled)",
fill = "Group" )
## R version 4.5.0 (2025-04-11)
## Platform: aarch64-apple-darwin20
## Running under: macOS Sequoia 15.5
##
## Matrix products: default
## BLAS: /System/Library/Frameworks/Accelerate.framework/Versions/A/Frameworks/vecLib.framework/Versions/A/libBLAS.dylib
## LAPACK: /Library/Frameworks/R.framework/Versions/4.5-arm64/Resources/lib/libRlapack.dylib; LAPACK version 3.12.1
##
## locale:
## [1] en_US.UTF-8/en_US.UTF-8/en_US.UTF-8/C/en_US.UTF-8/en_US.UTF-8
##
## time zone: Europe/Berlin
## tzcode source: internal
##
## attached base packages:
## [1] stats4 stats graphics grDevices utils datasets methods base
##
## other attached packages:
## [1] DspikeIn_0.99.10 ggrepel_0.9.6 igraph_2.1.4
## [4] ggpubr_0.6.0 tidyr_1.3.1 vegan_2.6-10
## [7] lattice_0.22-7 permute_0.9-7 microbiome_1.30.0
## [10] tibble_3.2.1 dplyr_1.1.4 flextable_0.9.8
## [13] ggplot2_3.5.2 ggstar_1.0.4 remotes_2.5.0
## [16] phyloseq_1.52.0 TreeSummarizedExperiment_2.16.1 Biostrings_2.76.0
## [19] XVector_0.48.0 SingleCellExperiment_1.30.1 SummarizedExperiment_1.38.1
## [22] Biobase_2.68.0 GenomicRanges_1.60.0 GenomeInfoDb_1.44.0
## [25] IRanges_2.42.0 S4Vectors_0.46.0 BiocGenerics_0.54.0
## [28] generics_0.1.4 MatrixGenerics_1.20.0 matrixStats_1.5.0
## [31] testthat_3.2.3
##
## loaded via a namespace (and not attached):
## [1] urlchecker_1.0.1 vctrs_0.6.5 digest_0.6.37 png_0.1-8
## [5] shape_1.4.6.1 BiocBaseUtils_1.10.0 msa_1.40.0 MASS_7.3-65
## [9] fontLiberation_0.1.0 reshape2_1.4.4 httpuv_1.6.16 foreach_1.5.2
## [13] withr_3.0.2 xfun_0.52 ggfun_0.1.8 ellipsis_0.3.2
## [17] survival_3.8-3 memoise_2.0.1 commonmark_1.9.5 rcmdcheck_1.4.0
## [21] profvis_0.4.0 systemfonts_1.2.3 ragg_1.4.0 tidytree_0.4.6
## [25] GlobalOptions_0.1.2 ggtreeExtra_1.18.0 Formula_1.2-5 sys_3.4.3
## [29] prettyunits_1.2.0 promises_1.3.2 httr_1.4.7 rstatix_0.7.2
## [33] rhdf5filters_1.20.0 ps_1.9.1 rhdf5_2.52.0 rstudioapi_0.17.1
## [37] UCSC.utils_1.4.0 miniUI_0.1.2 ggalluvial_0.12.5 processx_3.8.6
## [41] curl_6.2.2 ScaledMatrix_1.16.0 ggraph_2.2.1 polyclip_1.10-7
## [45] randomForest_4.7-1.2 xopen_1.0.1 GenomeInfoDbData_1.2.14 quadprog_1.5-8
## [49] SparseArray_1.8.0 RBGL_1.84.0 xtable_1.8-4 stringr_1.5.1
## [53] desc_1.4.3 ade4_1.7-23 doParallel_1.0.17 evaluate_1.0.3
## [57] S4Arrays_1.8.0 BiocFileCache_2.16.0 irlba_2.3.5.1 colorspace_2.1-1
## [61] filelock_1.0.3 magrittr_2.0.3 later_1.4.2 viridis_0.6.5
## [65] ggtree_3.16.0 DECIPHER_3.4.0 XML_3.99-0.18 pillar_1.10.2
## [69] nlme_3.1-168 iterators_1.0.14 compiler_4.5.0 beachmat_2.24.0
## [73] stringi_1.8.7 biomformat_1.36.0 devtools_2.4.5 plyr_1.8.9
## [77] crayon_1.5.3 abind_1.4-8 gridGraphics_0.5-1 locfit_1.5-9.12
## [81] graphlayouts_1.2.2 bit_4.6.0 fastmatch_1.1-6 codetools_0.2-20
## [85] textshaping_1.0.1 BiocSingular_1.24.0 openssl_2.3.2 bslib_0.9.0
## [89] GetoptLong_1.0.5 multtest_2.64.0 mime_0.13 splines_4.5.0
## [93] circlize_0.4.16 Rcpp_1.0.14 dbplyr_2.5.0 sparseMatrixStats_1.20.0
## [97] BiocCheck_1.44.2 knitr_1.50 blob_1.2.4 utf8_1.2.5
## [101] clue_0.3-66 fs_1.6.6 DelayedMatrixStats_1.30.0 pkgbuild_1.4.7
## [105] ggsignif_0.6.4 ggplotify_0.1.2 Matrix_1.7-3 callr_3.7.6
## [109] statmod_1.5.0 tweenr_2.0.3 pkgconfig_2.0.3 tools_4.5.0
## [113] cachem_1.1.0 RSQLite_2.3.11 viridisLite_0.4.2 DBI_1.2.3
## [117] fastmap_1.2.0 rmarkdown_2.29 scales_1.4.0 grid_4.5.0
## [121] credentials_2.0.2 usethis_3.1.0 sass_0.4.10 broom_1.0.8
## [125] officer_0.6.9 patchwork_1.3.0 BiocManager_1.30.25 graph_1.86.0
## [129] carData_3.0-5 farver_2.1.2 tidygraph_1.3.1 mgcv_1.9-3
## [133] biocViews_1.76.0 yaml_2.3.10 roxygen2_7.3.2 cli_3.6.5
## [137] purrr_1.0.4 lifecycle_1.0.4 rsconnect_1.4.1 askpass_1.2.1
## [141] sessioninfo_1.2.3 backports_1.5.0 BiocParallel_1.42.0 gtable_0.3.6
## [145] rjson_0.2.23 ggridges_0.5.6 parallel_4.5.0 ape_5.8-1
## [149] limma_3.64.0 jsonlite_2.0.0 edgeR_4.6.2 bitops_1.0-9
## [153] bit64_4.6.0-1 brio_1.1.5 Rtsne_0.17 yulab.utils_0.2.0
## [157] BiocNeighbors_2.2.0 zip_2.3.3 jquerylib_0.1.4 lazyeval_0.2.2
## [161] shiny_1.10.0 htmltools_0.5.8.1 rappdirs_0.3.3 glue_1.8.0
## [165] httr2_1.1.2 gdtools_0.4.2 RCurl_1.98-1.17 rprojroot_2.0.4
## [169] treeio_1.32.0 gridExtra_2.3 R6_2.6.1 DESeq2_1.48.1
## [173] labeling_0.4.3 cluster_2.1.8.1 pkgload_1.4.0 Rhdf5lib_1.30.0
## [177] stringdist_0.9.15 aplot_0.2.5 DirichletMultinomial_1.50.0 DelayedArray_0.34.1
## [181] tidyselect_1.2.1 ggforce_0.4.2 xml2_1.3.8 fontBitstreamVera_0.1.1
## [185] car_3.1-3 rsvd_1.0.5 fontquiver_0.2.1 data.table_1.17.2
## [189] htmlwidgets_1.6.4 ComplexHeatmap_2.24.0 RColorBrewer_1.1-3 rlang_1.1.6
## [193] gert_2.1.5 uuid_1.2-1 RUnit_0.4.33 ggnewscale_0.5.1
## [197] phangorn_2.12.1 Cairo_1.6-2
Spike-in volume Protocol;
The species Tetragenococcus halophilus (bacterial spike; ATCC33315) and Dekkera bruxellensis (fungal spike; WLP4642-White Labs) were selected as taxa to spike into gut microbiome samples as they were not found in an extensive collection of wildlife skin (GenBank BioProjects: PRJNA1114724, PRJNA 1114659) or gut microbiome samples. Stock cell suspensions of both microbes were grown in either static tryptic soy broth (T. halophilus) or potato dextrose broth (D. bruxellensis) for 72 hours then serially diluted and optical density (OD600) determined on a ClarioStar plate reader. Cell suspensions with an optical density of 1.0, 0.1, 0.01, 0.001 were DNA extracted using the Qiagen DNeasy Powersoil Pro Kit. These DNA isolations were used as standards to determine the proper spike in volume of cells to represent 0.1-10% of a sample (Rao et al., 2021b) Fecal pellets (3.1 ± 1.6 mg; range = 1 – 5.1 mg) from an ongoing live animal study using wood frogs (Lithobates sylvaticus) were used to standardize the input material for the development of this protocol. A total of (n=9) samples were used to validate the spike in protocol. Each fecal sample was homogenized in 1mL of sterile molecular grade water then 250uL of fecal slurry was DNA extracted as above with and without spiked cells. Two approaches were used to evaluate the target spike-in of 0.1-10%, the range of effective spike-in percentage described in (Rao et al., 2021b), including 1) an expected increase of qPCR cycle threshold (Ct) value that is proportional to the amount of spiked cells and 2) the expected increase in copy number of T. halophilus and D. bruxellensis in spiked vs. unspiked samples. A standard curve was generated using a synthetic fragment of DNA for the 16S-V4 rRNA and ITS1 rDNA regions of T. halophilus and D. bruxellensis, respectively. The standard curve was used to convert Ct values into log copy number for statistical analyses (detailed approach in[2, 3]) using the formula y = -0.2426x + 10.584 for T. halophilus and y = -0.3071x + 10.349 for D. bruxellensis, where x is the average Ct for each unknown sample. Quantitative PCR (qPCR) was used to compare known copy numbers from synthetic DNA sequences of T. halophilus and D. bruxellensis to DNA extractions of T. halophilus and D. bruxellensis independently, and wood frog fecal samples with and without spiked cells. SYBR qPCR assays were run at 20ul total volume including 10ul 2X Quantabio PerfeCTa SYBR Green Fastmix, 1ul of 10uM forward and reverse primers, 1ul of ArcticEnzymes dsDNAse master mix clean up kit, and either 1ul of DNA for D. bruxellensis or 3ul for T. halophilus. Different volumes of DNA were chosen for amplification of bacteria and fungi due to previous optimization of library preparation and sequencing steps [3]. The 515F [4] and 806R [5] primers were chosen to amplify bacteria and ITS1FI2 [6] and ITS2 for fungi, as these are the same primers used during amplicon library preparation and sequencing. Cycling conditions on an Agilent AriaMX consisted of 95 C for 3 mins followed by 40 cycles of 95 C for 15 sec, 60 C for 30 sec and 72 C for 30 sec. Following amplification, a melt curve was generated under the following conditions including 95 C for 30 sec, and a melt from 60 C to 90 C increasing in resolution of 0.5 C in increments of a 5 sec soak time. To validate the spike in protocol we selected two sets of fecal samples including 360 samples from a diverse species pool of frogs, lizards, salamanders and snakes and a more targeted approach of 122 fecal samples from three genera of salamanders from the Plethodontidae. (Supplemental Table #). FFecal samples were not weighed in the field, rather, a complete fecal pellet was diluted in an equal volume of sterile water and standardized volume of fecal slurry (250µL) extracted for independent samples.. A volume of 1ul T. halophilus (1847 copies) and 1ul D. bruxellensis (733 copies) were spiked into each fecal sample then DNA was extracted as above, libraries constructed, and amplicon sequenced on an Illumina MiSeq as in [7] .