Batch Effect Correction
ComBat-Seq (Count Data)
library(sva)
counts: raw count matrix (genes x samples)
batch: vector of batch labels
group: vector of biological condition (optional, to preserve)
corrected_counts <- ComBat_seq(counts = as.matrix(counts), batch = batch, group = condition, full_mod = TRUE)
Result is batch-corrected count matrix
Use for visualization, clustering, but NOT for DE (use design formula instead)
ComBat (Normalized Data)
library(sva)
For normalized expression (log-transformed, TPM, etc.)
NOT for raw counts
Create model matrix
mod <- model.matrix(~ condition, data = metadata) mod0 <- model.matrix(~ 1, data = metadata)
Run ComBat
corrected_expr <- ComBat(dat = as.matrix(normalized_expr), batch = metadata$batch, mod = mod, par.prior = TRUE)
limma removeBatchEffect
library(limma)
For visualization/clustering only
Preserves group differences while removing batch
design <- model.matrix(~ condition, data = metadata) corrected_expr <- removeBatchEffect(normalized_expr, batch = metadata$batch, design = design)
For PCA, heatmaps, etc.
DESeq2 Design Formula (Recommended for DE)
library(DESeq2)
Include batch in design formula - preferred for DE analysis
dds <- DESeqDataSetFromMatrix(countData = counts, colData = metadata, design = ~ batch + condition)
Batch is modeled, not removed
DE results are adjusted for batch
dds <- DESeq(dds) res <- results(dds, contrast = c('condition', 'treatment', 'control'))
Surrogate Variable Analysis (SVA)
library(sva)
When batch is unknown, estimate surrogate variables
mod <- model.matrix(~ condition, data = metadata) mod0 <- model.matrix(~ 1, data = metadata)
Estimate number of surrogate variables
n_sv <- num.sv(normalized_expr, mod, method = 'leek')
Estimate surrogate variables
svobj <- sva(normalized_expr, mod, mod0, n.sv = n_sv)
Add SVs to design for DE
design_with_sv <- cbind(mod, svobj$sv)
SVA with DESeq2
library(DESeq2) library(sva)
Normalize for SV estimation
dds <- DESeqDataSetFromMatrix(countData = counts, colData = metadata, design = ~ condition) dds <- estimateSizeFactors(dds) norm_counts <- counts(dds, normalized = TRUE)
Estimate SVs
mod <- model.matrix(~ condition, data = metadata) mod0 <- model.matrix(~ 1, data = metadata) svobj <- sva(norm_counts, mod, mod0)
Add SVs to colData
for (i in seq_len(ncol(svobj$sv))) { colData(dds)[[paste0('SV', i)]] <- svobj$sv[, i] }
Update design
sv_formula <- as.formula(paste('~', paste(paste0('SV', 1:ncol(svobj$sv)), collapse = ' + '), '+ condition')) design(dds) <- sv_formula
Run DESeq2
dds <- DESeq(dds)
Visualize Batch Effects
library(ggplot2)
PCA before correction
pca_before <- prcomp(t(normalized_expr), scale. = TRUE) pca_df <- data.frame(PC1 = pca_before$x[, 1], PC2 = pca_before$x[, 2], batch = metadata$batch, condition = metadata$condition)
p1 <- ggplot(pca_df, aes(PC1, PC2, color = batch, shape = condition)) + geom_point(size = 3) + ggtitle('Before Correction')
PCA after correction
pca_after <- prcomp(t(corrected_expr), scale. = TRUE) pca_df_after <- data.frame(PC1 = pca_after$x[, 1], PC2 = pca_after$x[, 2], batch = metadata$batch, condition = metadata$condition)
p2 <- ggplot(pca_df_after, aes(PC1, PC2, color = batch, shape = condition)) + geom_point(size = 3) + ggtitle('After Correction')
library(patchwork) p1 + p2
Quantify Batch Effect
PVCA - Principal Variance Component Analysis
library(pvca)
Proportion of variance explained by batch vs condition
pvcaObj <- pvcaBatchAssess(normalized_expr, metadata, threshold = 0.6, theInteractionTerms = c('batch', 'condition'))
Or manual approach
pca <- prcomp(t(normalized_expr), scale. = TRUE) variance_explained <- summary(pca)$importance[2, 1:5]
Correlation of PCs with batch
cor(pca$x[, 1], as.numeric(as.factor(metadata$batch)))
Harmony (Single-Cell Integration)
library(harmony) library(Seurat)
For single-cell data with multiple batches
seurat_obj <- RunHarmony(seurat_obj, group.by.vars = 'batch', reduction = 'pca', dims.use = 1:30)
Use harmony reduction for downstream
seurat_obj <- RunUMAP(seurat_obj, reduction = 'harmony', dims = 1:30) seurat_obj <- FindNeighbors(seurat_obj, reduction = 'harmony', dims = 1:30)
When NOT to Correct
DON'T use batch-corrected values for:
- Differential expression (use design formula instead)
- Count-based methods expecting raw/normalized counts
DO use batch-corrected values for:
- Visualization (PCA, UMAP, heatmaps)
- Clustering
- Machine learning features
- Cross-study comparisons
Related Skills
-
differential-expression/deseq2-basics - DE with batch in design
-
single-cell/clustering - Integration methods
-
expression-matrix/matrix-operations - Data transformation