Gene ID Mapping

Python: mygene

import mygene
import pandas as pd

# One client object is reused for every query below.
mg = mygene.MyGeneInfo()

# Ensembl to Symbol
ensembl_ids = ['ENSG00000141510', 'ENSG00000012048', 'ENSG00000141736']
results = mg.querymany(ensembl_ids, scopes='ensembl.gene', fields='symbol', species='human')
# .get() leaves None for any hit that lacks the requested field.
mapping = {r['query']: r.get('symbol', None) for r in results}
# {'ENSG00000141510': 'TP53', 'ENSG00000012048': 'BRCA1', 'ENSG00000141736': 'ERBB2'}

# Symbol to Entrez
symbols = ['TP53', 'BRCA1', 'ERBB2']
results = mg.querymany(symbols, scopes='symbol', fields='entrezgene', species='human')
mapping = {r['query']: r.get('entrezgene', None) for r in results}

# Ensembl to multiple fields
results = mg.querymany(ensembl_ids, scopes='ensembl.gene', fields=['symbol', 'entrezgene', 'uniprot'], species='human')

Python: pyensembl

from pyensembl import EnsemblRelease

# Load Ensembl release (downloads automatically first time)
ensembl = EnsemblRelease(110, species='human')  # or 'mouse'

# Gene ID to symbol
gene = ensembl.gene_by_id('ENSG00000141510')
print(gene.gene_name)  # TP53

# Symbol to gene ID
# genes_by_name() is indexed with [0], i.e. only the first match is used.
gene = ensembl.genes_by_name('TP53')[0]
print(gene.gene_id)  # ENSG00000141510

# Batch conversion
def ensembl_to_symbol(ensembl_ids, release=110):
    """Map Ensembl gene IDs to gene names using a local pyensembl release.

    Args:
        ensembl_ids: Iterable of Ensembl gene ID strings. A trailing
            ``.<version>`` suffix is stripped before the lookup.
        release: Ensembl release number passed to ``EnsemblRelease``.

    Returns:
        dict keyed by the IDs exactly as given (version suffix included),
        with the gene name as value, or ``None`` when the lookup raised
        ``ValueError``.
    """
    ens = EnsemblRelease(release, species='human')
    mapping = {}
    for eid in ensembl_ids:
        bare_id = eid.split('.')[0]  # Remove version
        # Only the lookup is guarded, so a ValueError raised anywhere else
        # is not mistaken for "gene not found".
        try:
            gene = ens.gene_by_id(bare_id)
        except ValueError:
            mapping[eid] = None
        else:
            mapping[eid] = gene.gene_name
    return mapping

Python: gseapy

import gseapy as gp

# Ensembl to Symbol via gseapy's biomart module (BioMart, not Enrichr)
gene_list = ['ENSG00000141510', 'ENSG00000012048']
# NOTE(review): confirm `gp.biomart.ensembl2name` exists in the installed
# gseapy version; verify against the gseapy Biomart API docs before relying on it.
converted = gp.biomart.ensembl2name(gene_list, organism='hsapiens')

R: biomaRt

library(biomaRt)

# Connect to Ensembl
ensembl <- useEnsembl(biomart = "genes", dataset = "hsapiens_gene_ensembl")

# Ensembl to Symbol
ensembl_ids <- c("ENSG00000141510", "ENSG00000012048", "ENSG00000141736")
results <- getBM(
  attributes = c("ensembl_gene_id", "hgnc_symbol", "entrezgene_id"),
  filters = "ensembl_gene_id",
  values = ensembl_ids,
  mart = ensembl
)

# Symbol to Ensembl
symbols <- c("TP53", "BRCA1", "ERBB2")
results <- getBM(
  attributes = c("hgnc_symbol", "ensembl_gene_id"),
  filters = "hgnc_symbol",
  values = symbols,
  mart = ensembl
)

# All available attributes
listAttributes(ensembl)

R: org.db Packages

library(org.Hs.eg.db) # Human
library(AnnotationDbi)

# Ensembl to Symbol
ensembl_ids <- c("ENSG00000141510", "ENSG00000012048")
symbols <- mapIds(
  org.Hs.eg.db,
  keys = ensembl_ids,
  keytype = "ENSEMBL",
  column = "SYMBOL"
)

# Symbol to Entrez
symbols <- c("TP53", "BRCA1")
entrez <- mapIds(
  org.Hs.eg.db,
  keys = symbols,
  keytype = "SYMBOL",
  column = "ENTREZID"
)

# Available keytypes
keytypes(org.Hs.eg.db)
# ENSEMBL, ENSEMBLPROT, ENSEMBLTRANS, ENTREZID, SYMBOL, UNIPROT, etc.

Apply Mapping to Count Matrix

import pandas as pd
import mygene

def map_count_matrix_ids(counts, from_type='ensembl.gene', to_type='symbol', species='human'):
    '''Map gene IDs in count matrix index.

    Args:
        counts: Table whose index holds gene IDs; it must support
            ``.index``, ``.copy()`` and ``.groupby(...).sum()``
            (e.g. a pandas DataFrame). The input is not modified.
        from_type: mygene ``scopes`` value describing the input IDs.
        to_type: mygene field to map to; also the key read from each hit.
        species: Species passed through to mygene.

    Returns:
        A copy of ``counts`` re-indexed by the mapped IDs. Rows whose ID
        has no hit carrying ``to_type`` keep their original index label,
        and rows that end up sharing a label are summed.
    '''
    mg = mygene.MyGeneInfo()

    # Remove version numbers from Ensembl IDs
    clean_ids = [g.split('.')[0] for g in counts.index]

    # Query mygene
    results = mg.querymany(clean_ids, scopes=from_type, fields=to_type, species=species)

    # Build mapping. A query can come back with several hits; keep the
    # first one that carries the field instead of letting later hits
    # silently overwrite it.
    mapping = {}
    for r in results:
        if to_type in r:
            mapping.setdefault(r['query'], r[to_type])

    # Apply mapping, falling back to the original (versioned) label
    new_index = [mapping.get(clean, original)
                 for clean, original in zip(clean_ids, counts.index)]
    counts_mapped = counts.copy()
    counts_mapped.index = new_index

    # Handle duplicates (sum)
    counts_mapped = counts_mapped.groupby(counts_mapped.index).sum()

    return counts_mapped

# Usage
counts_symbols = map_count_matrix_ids(counts, 'ensembl.gene', 'symbol')

R Equivalent

library(biomaRt)

#' Map the row IDs of a count table to another gene identifier type
#'
#' Strips everything from the first `.` onwards in each row name, looks the
#' resulting IDs up with `getBM()`, relabels the rows and sums rows that end
#' up with the same label.
#'
#' @param counts Numeric matrix or data frame with gene IDs as row names.
#' @param from_type BioMart attribute/filter naming the input ID type.
#' @param to_type BioMart attribute naming the output ID type.
#' @return A data frame with one row per output label. Rows whose ID has no
#'   non-empty `to_type` value keep their original row name.
map_count_matrix_ids <- function(counts,
                                 from_type = "ensembl_gene_id",
                                 to_type = "hgnc_symbol") {
  original_ids <- rownames(counts)
  stopifnot(!is.null(original_ids))

  ensembl <- useEnsembl(biomart = "genes", dataset = "hsapiens_gene_ensembl")

  # Remove version numbers
  clean_ids <- sub("\\..*$", "", original_ids)

  # Get mapping
  mapping <- getBM(
    attributes = c(from_type, to_type),
    filters = from_type,
    values = unique(clean_ids),
    mart = ensembl
  )

  # Treat NA or empty targets as "no mapping"
  targets <- as.character(mapping[[to_type]])
  usable <- !is.na(targets) & nzchar(targets)
  sources <- as.character(mapping[[from_type]])[usable]
  targets <- targets[usable]

  # match() returns the first hit, so a source ID with several targets
  # resolves to a single label; unmapped rows keep their original name.
  new_ids <- targets[match(clean_ids, sources)]
  unmapped <- is.na(new_ids)
  new_ids[unmapped] <- original_ids[unmapped]

  # Sum rows sharing a label. Labels are passed as a grouping vector rather
  # than assigned as row names, because duplicated or missing row names are
  # rejected by data frames.
  as.data.frame(rowsum(as.matrix(counts), group = new_ids))
}

Handle Unmapped IDs

def robust_id_mapping(gene_ids, from_type, to_type, species='human'):
    '''Map IDs with fallback for unmapped genes.

    Args:
        gene_ids: Iterable of gene ID strings. Everything from the first
            '.' onwards is stripped before querying.
        from_type: mygene ``scopes`` value describing the input IDs.
        to_type: mygene field to map to; also the key read from each hit.
        species: Species passed through to mygene.

    Returns:
        ``(mapping, unmapped)``. ``mapping`` has one entry per distinct
        input ID (as given, version included); the value is the mapped ID,
        or the input ID itself when no hit carried ``to_type``.
        ``unmapped`` lists those fallback IDs in input order.
    '''
    import mygene
    mg = mygene.MyGeneInfo()

    gene_ids = list(gene_ids)
    clean_ids = [g.split('.')[0] for g in gene_ids]
    # Skip the query for empty input.
    if clean_ids:
        results = mg.querymany(clean_ids, scopes=from_type, fields=to_type, species=species)
    else:
        results = []

    # First hit per query that actually carries the requested field. A
    # later hit without the field must not demote the ID to "unmapped".
    best_hit = {}
    for r in results:
        if to_type in r:
            best_hit.setdefault(r['query'], r[to_type])

    # Walk the inputs (not the results) so every input gets an entry, and
    # several versioned IDs sharing one base ID are all resolved.
    mapping = {}
    unmapped = []
    for original, clean in zip(gene_ids, clean_ids):
        if original in mapping:
            continue  # repeated input ID
        if clean in best_hit:
            mapping[original] = best_hit[clean]
        else:
            mapping[original] = original  # Keep original if unmapped
            unmapped.append(original)

    total = len(mapping)
    print(f'Mapped: {total - len(unmapped)}/{total}')
    print(f'Unmapped: {len(unmapped)}')

    return mapping, unmapped

Common ID Types

| Type | Example | Use Case |
|------|---------|----------|
| Ensembl Gene | ENSG00000141510 | RNA-seq, GTF files |
| Ensembl Transcript | ENST00000269305 | Transcript-level analysis |
| Entrez Gene | 7157 | NCBI databases, KEGG |
| HGNC Symbol | TP53 | Human readable |
| UniProt | P04637 | Protein databases |
| RefSeq | NM_000546 | NCBI RefSeq |

Related Skills

expression-matrix/counts-ingest - Load count data
expression-matrix/metadata-joins - Add annotations
pathway-analysis/go-enrichment - Requires Entrez IDs
pathway-analysis/kegg-pathways - Requires Entrez IDs

bio-expression-matrix-gene-id-mapping

Safety Notice

Copy this and send it to your AI assistant to learn

Ensembl to Symbol

{'ENSG00000141510': 'TP53', 'ENSG00000012048': 'BRCA1', 'ENSG00000141736': 'ERBB2'}

Symbol to Entrez

Ensembl to multiple fields

Load Ensembl release (downloads automatically first time)

Gene ID to symbol

Symbol to gene ID

Batch conversion

Ensembl to Symbol using Enrichr

Connect to Ensembl

Ensembl to Symbol

Symbol to Ensembl

All available attributes

Ensembl to Symbol

Symbol to Entrez

Available keytypes

ENSEMBL, ENSEMBLPROT, ENSEMBLTRANS, ENTREZID, SYMBOL, UNIPROT, etc.

Usage

Source Transparency

Related Skills

bioskills

bio-data-visualization-genome-tracks

bio-epitranscriptomics-merip-preprocessing

bio-data-visualization-multipanel-figures