Metabolomics Statistical Analysis

Univariate Analysis

library(tidyverse)

Load normalized data

data <- read.csv('normalized_data.csv', row.names = 1) groups <- factor(read.csv('sample_info.csv')$group)

T-test for each feature

ttest_results <- apply(data, 2, function(x) { test <- t.test(x ~ groups) c(pvalue = test$p.value, fc = mean(x[groups == levels(groups)[2]]) - mean(x[groups == levels(groups)[1]])) }) ttest_results <- as.data.frame(t(ttest_results)) ttest_results$fdr <- p.adjust(ttest_results$pvalue, method = 'BH')

Significant features

sig_features <- ttest_results[ttest_results$fdr < 0.05, ] cat('Significant features (FDR<0.05):', nrow(sig_features), '\n')

Fold Change Calculation

Calculate fold change between groups

calculate_fc <- function(data, groups) { group_means <- aggregate(data, by = list(groups), FUN = mean, na.rm = TRUE) rownames(group_means) <- group_means$Group.1 group_means <- group_means[, -1]

fc &#x3C;- as.numeric(group_means[2, ]) / as.numeric(group_means[1, ])
log2fc &#x3C;- log2(fc)

return(data.frame(feature = colnames(data), fold_change = fc, log2fc = log2fc))

}

fc_results <- calculate_fc(data, groups)

Volcano Plot

library(ggplot2)

Combine statistics

results <- merge(ttest_results, fc_results, by.x = 'row.names', by.y = 'feature') results$significant <- results$fdr < 0.05 & abs(results$log2fc) > 1

Plot

ggplot(results, aes(x = log2fc, y = -log10(pvalue), color = significant)) + geom_point(alpha = 0.6) + scale_color_manual(values = c('gray', 'red')) + geom_hline(yintercept = -log10(0.05), linetype = 'dashed') + geom_vline(xintercept = c(-1, 1), linetype = 'dashed') + labs(x = 'Log2 Fold Change', y = '-log10(p-value)', title = 'Metabolomics Volcano Plot') + theme_bw()

ggsave('volcano_plot.png', width = 8, height = 6)

PCA

library(pcaMethods)

PCA

pca_result <- pca(data, nPcs = 5, method = 'ppca')

Scores plot

scores <- as.data.frame(scores(pca_result)) scores$group <- groups

ggplot(scores, aes(x = PC1, y = PC2, color = group)) + geom_point(size = 3) + stat_ellipse(level = 0.95) + labs(x = paste0('PC1 (', round(pca_result@R2[1] * 100, 1), '%)'), y = paste0('PC2 (', round(pca_result@R2[2] * 100, 1), '%)')) + theme_bw()

ggsave('pca_plot.png', width = 8, height = 6)

Loadings

loadings <- as.data.frame(loadings(pca_result)) top_pc1 <- loadings[order(abs(loadings$PC1), decreasing = TRUE)[1:20], ]

PLS-DA

library(mixOmics)

PLS-DA

plsda_result <- plsda(as.matrix(data), groups, ncomp = 3)

Cross-validation

perf_plsda <- perf(plsda_result, validation = 'Mfold', folds = 5, nrepeat = 50) plot(perf_plsda, col = color.mixo(5:7))

Optimal components

ncomp_opt <- perf_plsda$choice.ncomp['BER', 'centroids.dist'] cat('Optimal components:', ncomp_opt, '\n')

Final model

final_plsda <- plsda(as.matrix(data), groups, ncomp = ncomp_opt)

Plot

plotIndiv(final_plsda, group = groups, ellipse = TRUE, legend = TRUE)

VIP scores

vip <- vip(final_plsda) top_vip <- sort(vip[, ncomp_opt], decreasing = TRUE)[1:20] print(top_vip)

sPLS-DA (Sparse)

Tune number of features to select

tune_splsda <- tune.splsda(as.matrix(data), groups, ncomp = 3, validation = 'Mfold', folds = 5, nrepeat = 50, test.keepX = c(5, 10, 20, 50, 100))

optimal_keepX <- tune_splsda$choice.keepX

Final sparse model

splsda_result <- splsda(as.matrix(data), groups, ncomp = ncomp_opt, keepX = optimal_keepX)

Selected features

selected_features <- selectVar(splsda_result, comp = 1)$name cat('Selected features (comp 1):', length(selected_features), '\n')

OPLS-DA (Orthogonal PLS-DA)

library(ropls)

OPLS-DA

oplsda <- opls(data, groups, predI = 1, orthoI = NA)

Summary

print(oplsda)

Scores plot

plot(oplsda, typeVc = 'x-score')

S-plot (loadings vs correlation)

plot(oplsda, typeVc = 'x-loading')

VIP

vip_scores <- getVipVn(oplsda) top_vip <- sort(vip_scores, decreasing = TRUE)[1:20]

Random Forest

library(randomForest)

Random Forest classification

rf_model <- randomForest(x = data, y = groups, importance = TRUE, ntree = 500)

Importance

importance <- importance(rf_model) top_features <- rownames(importance)[order(importance[, 'MeanDecreaseAccuracy'], decreasing = TRUE)[1:20]]

Plot importance

varImpPlot(rf_model, n.var = 20)

ROC Analysis

library(pROC)

ROC for top biomarker

top_feature <- 'feature_123' # Replace with actual feature name roc_result <- roc(groups, data[, top_feature])

Plot

plot(roc_result, main = paste('AUC =', round(auc(roc_result), 3)))

Multiple features

biomarkers <- c('feature_1', 'feature_2', 'feature_3') for (feat in biomarkers) { roc_i <- roc(groups, data[, feat]) cat(feat, ': AUC =', round(auc(roc_i), 3), '\n') }

Heatmap

library(pheatmap)

Top differential features

top_features <- rownames(sig_features)[1:50] data_top <- data[, top_features]

Annotation

annotation_row <- data.frame(Group = groups) rownames(annotation_row) <- rownames(data)

pheatmap(t(data_top), annotation_col = annotation_row, scale = 'row', clustering_method = 'ward.D2', filename = 'heatmap.png', width = 10, height = 12)

Related Skills

normalization-qc - Data preparation
pathway-mapping - Functional interpretation
multi-omics-integration/mixomics-analysis - Advanced multivariate methods

bio-metabolomics-statistical-analysis

Safety Notice

Copy this and send it to your AI assistant to learn

Load normalized data

T-test for each feature

Significant features

Calculate fold change between groups

Combine statistics

Plot

PCA

Scores plot

Loadings

PLS-DA

Cross-validation

Optimal components

Final model

Plot

VIP scores

Tune number of features to select

Final sparse model

Selected features

OPLS-DA

Summary

Scores plot

S-plot (loadings vs correlation)

VIP

Random Forest classification

Importance

Plot importance

ROC for top biomarker

Plot

Multiple features

Top differential features

Annotation

Source Transparency

Related Skills

bio-microbiome-diversity-analysis

bio-pdb-geometric-analysis

bio-spatial-transcriptomics-image-analysis

bio-proteomics-dia-analysis