Go Optimization Skill
This skill provides expert guidance on Go performance optimization, covering profiling, benchmarking, memory management, and runtime tuning for building high-performance applications.
When to Use
Activate this skill when:
- Profiling application performance
- Optimizing CPU-intensive operations
- Reducing memory allocations
- Tuning garbage collection
- Writing benchmarks
- Analyzing performance bottlenecks
- Optimizing hot paths
- Reducing lock contention
Profiling
CPU Profiling
import ( "os" "runtime/pprof" )
func main() { // Start CPU profiling f, err := os.Create("cpu.prof") if err != nil { log.Fatal(err) } defer f.Close()
if err := pprof.StartCPUProfile(f); err != nil {
log.Fatal(err)
}
defer pprof.StopCPUProfile()
// Your code here
runApplication()
}
// Analyze: // go tool pprof cpu.prof // (pprof) top10 // (pprof) list functionName // (pprof) web
Memory Profiling
import ( "os" "runtime" "runtime/pprof" )
// writeMemProfile writes a heap profile to the file named filename.
// Any failure to create the file or write the profile ends the program
// through log.Fatal.
func writeMemProfile(filename string) {
	out, err := os.Create(filename)
	if err != nil {
		log.Fatal(err)
	}
	defer out.Close()

	// Force a collection before taking the snapshot.
	runtime.GC()

	err = pprof.WriteHeapProfile(out)
	if err != nil {
		log.Fatal(err)
	}
}
// Analyze: // go tool pprof -alloc_space mem.prof // go tool pprof -inuse_space mem.prof
HTTP Profiling
import ( _ "net/http/pprof" "net/http" )
// main serves HTTP on localhost:6060 in the background and then runs the
// application server.
func main() {
	// The nil handler selects http.DefaultServeMux. ListenAndServe only
	// returns on failure, and that error is logged.
	servePprof := func() {
		err := http.ListenAndServe("localhost:6060", nil)
		log.Println(err)
	}
	go servePprof()

	// The application itself.
	runServer()
}
// Access profiles: // http://localhost:6060/debug/pprof/ // go tool pprof http://localhost:6060/debug/pprof/profile?seconds=30 // go tool pprof http://localhost:6060/debug/pprof/heap
Execution Tracing
import ( "os" "runtime/trace" )
func main() { f, err := os.Create("trace.out") if err != nil { log.Fatal(err) } defer f.Close()
if err := trace.Start(f); err != nil {
log.Fatal(err)
}
defer trace.Stop()
// Your code
runApplication()
}
// View trace: // go tool trace trace.out
Benchmarking
Basic Benchmarks
// concatParts holds the operands as package-level variables so the
// concatenation below cannot be evaluated at compile time.
var concatParts = [3]string{"hello", " ", "world"}

// concatSink receives the benchmark result so the work stays observable.
var concatSink string

// BenchmarkStringConcat measures joining three short strings with +.
//
// "hello" + " " + "world" written with literals is a constant expression
// that the compiler folds at build time, which would leave the timed loop
// empty. Using variables and storing the result keeps the concatenation in
// the measured path.
func BenchmarkStringConcat(b *testing.B) {
	var joined string
	for i := 0; i < b.N; i++ {
		joined = concatParts[0] + concatParts[1] + concatParts[2]
	}
	concatSink = joined
}

// Run: go test -bench=. -benchmem
// BenchmarkStringBuilder measures assembling "hello world" from three
// pieces with a fresh strings.Builder on every iteration.
func BenchmarkStringBuilder(b *testing.B) {
	for iter := 0; iter < b.N; iter++ {
		var builder strings.Builder
		builder.WriteString("hello")
		builder.WriteString(" ")
		builder.WriteString("world")
		// The result is deliberately discarded; only the building is timed.
		_ = builder.String()
	}
}
// Run: go test -bench=. -benchmem
Sub-benchmarks
// BenchmarkEncode compares encoders on the same payload, one sub-benchmark
// per format, each reporting allocations.
//
// Encoding errors abort the sub-benchmark: timing a call that fails on every
// iteration would produce a meaningless number.
func BenchmarkEncode(b *testing.B) {
	data := generateTestData()

	b.Run("JSON", func(b *testing.B) {
		b.ReportAllocs()
		for i := 0; i < b.N; i++ {
			if _, err := json.Marshal(data); err != nil {
				b.Fatal(err)
			}
		}
	})

	b.Run("MessagePack", func(b *testing.B) {
		b.ReportAllocs()
		for i := 0; i < b.N; i++ {
			// NOTE(review): assumes msgpack.Marshal returns ([]byte, error)
			// like json.Marshal — confirm against the msgpack package in use.
			if _, err := msgpack.Marshal(data); err != nil {
				b.Fatal(err)
			}
		}
	})
}
Parallel Benchmarks
// BenchmarkConcurrentAccess calls Get("key") on a single cache shared by
// all goroutines that RunParallel starts.
func BenchmarkConcurrentAccess(b *testing.B) {
	shared := NewCache()

	// Each goroutine keeps reading until pb.Next reports no work is left.
	reader := func(pb *testing.PB) {
		for pb.Next() {
			shared.Get("key")
		}
	}
	b.RunParallel(reader)
}
Benchmark Comparison
Run benchmarks and save results
go test -bench=. -benchmem > old.txt
Make optimizations
Run again and compare
go test -bench=. -benchmem > new.txt
benchstat old.txt new.txt
Memory Optimization
Escape Analysis
// Check what escapes to heap // go build -gcflags="-m" main.go
// ✅ GOOD: Stack allocation
// stackAlloc returns 42 by value. The address of the local is never taken,
// so nothing forces it to outlive the call.
func stackAlloc() int {
	var local = 42
	return local
}
// ❌ BAD: Heap escape
// heapEscape returns a pointer to a local holding 42. The local must
// outlive the call because its address is returned to the caller.
func heapEscape() *int {
	local := 42
	ptr := &local // local escapes to the heap
	return ptr
}
// ✅ GOOD: Interface without allocation
// noAlloc writes data to w. The byte count and error reported by Write are
// discarded.
func noAlloc(w io.Writer, data []byte) {
	_, _ = w.Write(data)
}
// ❌ BAD: Interface causes allocation
// withAlloc returns a new, empty bytes.Buffer as an io.Writer. The buffer is
// handed to the caller by pointer, so it has to outlive this call.
func withAlloc() io.Writer {
	buf := new(bytes.Buffer)
	return buf // buf escapes
}
Pre-allocation
// ❌ BAD: Growing slice
// badAppend returns the integers 0..n-1. The slice starts without any
// capacity, so append has to grow the backing array as elements arrive.
func badAppend(n int) []int {
	var out []int
	for len(out) < n {
		out = append(out, len(out)) // may reallocate
	}
	return out
}
// ✅ GOOD: Pre-allocate
// goodAppend returns the integers 0..n-1, reserving capacity for all n
// elements up front so the appends never need to grow the slice.
func goodAppend(n int) []int {
	out := make([]int, 0, n) // single allocation
	for len(out) < n {
		out = append(out, len(out))
	}
	return out
}
// ✅ GOOD: Known length
// knownLength returns a slice of length n in which every element equals its
// own index.
func knownLength(n int) []int {
	out := make([]int, n)
	for idx := range out {
		out[idx] = idx
	}
	return out
}
// ❌ BAD: String concatenation
// badConcat joins strs in order by repeated concatenation; each step builds
// a brand-new string.
func badConcat(strs []string) string {
	var joined string
	for i := 0; i < len(strs); i++ {
		joined = joined + strs[i] // new allocation each time
	}
	return joined
}
// ✅ GOOD: strings.Builder func goodConcat(strs []string) string { var sb strings.Builder sb.Grow(estimateSize(strs)) for _, s := range strs { sb.WriteString(s) } return sb.String() }
sync.Pool
// bufferPool recycles *bytes.Buffer values between processData calls.
var bufferPool = sync.Pool{
	New: func() interface{} { return new(bytes.Buffer) },
}

// processData stages data in a pooled buffer and returns the processed
// bytes.
//
// The result is a fresh copy. The buffer goes back to the pool when this
// function returns, and the next caller resets and overwrites it; returning
// buf.Bytes() directly would give the caller a slice whose contents can
// change underneath it.
func processData(data []byte) []byte {
	// Get buffer from pool
	buf := bufferPool.Get().(*bytes.Buffer)
	buf.Reset()
	defer bufferPool.Put(buf)

	// Use buffer
	buf.Write(data)

	// Process...

	// Detach the result from the pooled storage before the deferred Put.
	out := make([]byte, buf.Len())
	copy(out, buf.Bytes())
	return out
}
// sbPool is a pool of *strings.Builder values.
//
// Deprecated: pooling builders saves nothing, because strings.Builder.Reset
// drops the underlying buffer rather than keeping its capacity. The variable
// is kept only so existing references still compile; buildString no longer
// uses it.
var sbPool = sync.Pool{
	New: func() interface{} { return &strings.Builder{} },
}

// buildString concatenates parts in order and returns the result.
//
// The total length is computed first so the builder can reserve its buffer
// once instead of growing while it writes.
func buildString(parts []string) string {
	total := 0
	for _, part := range parts {
		total += len(part)
	}

	var sb strings.Builder
	sb.Grow(total)
	for _, part := range parts {
		sb.WriteString(part)
	}
	return sb.String()
}
Zero-Copy Techniques
// Use byte slices instead of strings
// parseHeader splits header at its first ':' and returns the bytes before
// and after it. Both results are sub-slices of header, not copies. When
// header contains no ':', both results are nil.
func parseHeader(header []byte) (key, value []byte) {
	sep := bytes.IndexByte(header, ':')
	if sep >= 0 {
		key, value = header[:sep], header[sep+1:]
	}
	return key, value
}
// Reuse buffers
// Parser keeps a scratch buffer that is reused across Parse calls.
type Parser struct {
	buf []byte
}

// Parse copies data into the parser's buffer, replacing its previous
// contents, and returns nil.
func (p *Parser) Parse(data []byte) error {
	// Truncating to length zero keeps the capacity, so append reuses the
	// existing backing array when data fits.
	p.buf = append(p.buf[:0], data...)

	// Process p.buf...
	return nil
}
// Direct writing
// writeResponse encodes data as JSON straight into w and returns the
// encoder's error, if any.
func writeResponse(w io.Writer, data interface{}) error {
	return json.NewEncoder(w).Encode(data)
}
Garbage Collection Tuning
GC Control
import "runtime/debug"
// Adjust GC target percentage debug.SetGCPercent(100) // Default // Higher = less frequent GC, more memory // Lower = more frequent GC, less memory
// Force GC (use sparingly!) runtime.GC()
// Monitor GC stats var stats runtime.MemStats runtime.ReadMemStats(&stats) fmt.Printf("Alloc = %v MB\n", stats.Alloc/1024/1024) fmt.Printf("TotalAlloc = %v MB\n", stats.TotalAlloc/1024/1024) fmt.Printf("Sys = %v MB\n", stats.Sys/1024/1024) fmt.Printf("NumGC = %v\n", stats.NumGC)
GOGC Environment Variable
Default (100%)
GOGC=100 ./myapp
More aggressive GC (uses less memory)
GOGC=50 ./myapp
Less frequent GC (uses more memory)
GOGC=200 ./myapp
Disable GC (for debugging)
GOGC=off ./myapp
Concurrency Optimization
Reduce Lock Contention
// ❌ BAD: Single lock
// BadCache pairs a map with one exclusive mutex.
type BadCache struct {
	// mu is a plain Mutex, so readers exclude each other as well as writers.
	mu sync.Mutex
	// items holds the cached values by key.
	items map[string]interface{}
}
// ✅ GOOD: RWMutex
// GoodCache pairs a map with a read/write mutex so lookups can share the
// lock.
type GoodCache struct {
	mu    sync.RWMutex
	items map[string]interface{}
}

// Get returns the value stored under key, or nil when the key is absent.
// Only the read lock is held during the lookup.
func (c *GoodCache) Get(key string) interface{} {
	c.mu.RLock()
	value := c.items[key]
	c.mu.RUnlock()
	return value
}
// ✅ BETTER: Sharded locks
// ShardedCache spreads its entries over 256 independently locked shards so
// that operations on different keys rarely wait on the same lock.
// Create one with NewShardedCache; the zero value has no shards and Get on
// it always returns nil.
type ShardedCache struct {
	shards [256]*shard
}

// shard is one lock-protected slice of the key space.
type shard struct {
	mu    sync.RWMutex
	items map[string]interface{}
}

// NewShardedCache returns a cache with every shard allocated and empty.
func NewShardedCache() *ShardedCache {
	c := &ShardedCache{}
	for i := range c.shards {
		c.shards[i] = &shard{items: make(map[string]interface{})}
	}
	return c
}

// getShard picks the shard responsible for key. The same key always maps to
// the same shard; the index is a 32-bit FNV-1a hash of the key reduced
// modulo the shard count.
func (c *ShardedCache) getShard(key string) *shard {
	const (
		fnvOffset32 = 2166136261
		fnvPrime32  = 16777619
	)
	h := uint32(fnvOffset32)
	for i := 0; i < len(key); i++ {
		h ^= uint32(key[i])
		h *= fnvPrime32
	}
	return c.shards[h%uint32(len(c.shards))]
}

// Get returns the value stored under key, or nil when the key is absent.
// Only the owning shard's read lock is taken.
func (c *ShardedCache) Get(key string) interface{} {
	s := c.getShard(key)
	if s == nil {
		// Cache was not built with NewShardedCache: nothing is stored.
		return nil
	}
	s.mu.RLock()
	defer s.mu.RUnlock()
	return s.items[key]
}
Channel Buffering
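// Unbuffered channel: every send blocks until a receiver is ready (a synchronous handoff).
// This is the default and is only a problem when profiling shows senders stalling on it.
ch := make(chan int)
// ✅ Buffered channel: sends block only when the buffer is full, which decouples producer and consumer.
ch := make(chan int, 100)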
// Optimal buffer size depends on: // - Producer/consumer rates // - Memory constraints // - Latency requirements
Atomic Operations
import "sync/atomic"
// Counter is an integer counter whose methods use atomic operations instead
// of a lock.
type Counter struct {
	value int64
}

// Increment atomically adds one to the counter.
func (c *Counter) Increment() {
	atomic.AddInt64(&c.value, 1)
}

// Value atomically reads the current count.
func (c *Counter) Value() int64 {
	current := atomic.LoadInt64(&c.value)
	return current
}
// ✅ Faster than mutex for simple operations // ❌ Limited to basic types and operations
Algorithmic Optimization
Map Pre-sizing
// ❌ BAD: Growing map func badMap(items []Item) map[string]Item { m := make(map[string]Item) for _, item := range items { m[item.ID] = item } return m }
// ✅ GOOD: Pre-sized map func goodMap(items []Item) map[string]Item { m := make(map[string]Item, len(items)) for _, item := range items { m[item.ID] = item } return m }
Avoid Unnecessary Work
// ❌ BAD: Repeated computation func process(items []Item) { for _, item := range items { if isValid(item) { result := expensiveComputation(item) if result > threshold { handleResult(result) } } } }
// ✅ GOOD: Early returns func process(items []Item) { for _, item := range items { if !isValid(item) { continue // Skip early } result := expensiveComputation(item) if result <= threshold { continue // Skip early } handleResult(result) } }
// ✅ BETTER: Fast path func process(items []Item) { for _, item := range items { // Fast path for common case if item.IsSimple() { handleSimple(item) continue } // Slow path for complex case handleComplex(item) } }
Runtime Tuning
GOMAXPROCS
import "runtime"
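// Limit how many OS threads may execute Go code simultaneously.
// The runtime already defaults to the number of available CPUs, so this call is normally unnecessary.
runtime.GOMAXPROCS(runtime.NumCPU())
// For CPU-bound work: the default (NumCPU) is usually right.
// For I/O-bound work: raising GOMAXPROCS rarely helps, because threads blocked in system calls do not count against the limit. Measure before changing it.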
Environment Variables
Max OS threads executing Go code simultaneously
GOMAXPROCS=8 ./myapp
GC aggressiveness
GOGC=100 ./myapp
Memory limit (Go 1.19+)
GOMEMLIMIT=4GiB ./myapp
Print a summary line for each garbage collection
GODEBUG=gctrace=1 ./myapp
Performance Patterns
Inline Functions
// Compiler inlines small functions automatically
// add returns the sum of a and b.
//
// Go has no //go:inline directive: the compiler alone decides what to
// inline, and the only related directive is //go:noinline, which forbids it.
// Run go build -gcflags=-m to see the compiler's decision for a function.
func add(a, b int) int {
	return a + b
}
// Keep hot-path functions small for inlining
Avoid Interface Allocations
// ❌ BAD: Interface allocation
// badPrint prints value followed by a newline on standard output. The
// argument already arrives as an interface value.
func badPrint(value interface{}) {
	fmt.Println(value) // value escapes
}
// ✅ GOOD: Type-specific functions
// printInt prints value followed by a newline on standard output.
//
// NOTE(review): fmt.Println takes ...interface{}, so value is still
// converted to an interface at this call. The concrete parameter type only
// moves the conversion from the caller into this function.
func printInt(value int) {
	fmt.Println(value)
}
// printString prints value followed by a newline on standard output.
//
// NOTE(review): fmt.Println takes ...interface{}, so value is still
// converted to an interface at this call.
func printString(value string) {
	fmt.Println(value)
}
Batch Operations
// ❌ BAD: Individual operations for _, item := range items { db.Insert(item) // N database calls }
// ✅ GOOD: Batch operations db.BatchInsert(items) // 1 database call
Best Practices
- Profile before optimizing - Measure, don't guess
- Focus on hot paths - Optimize the 20% that matters
- Reduce allocations - Reuse objects, pre-allocate
- Use appropriate data structures - Map vs slice vs array
- Minimize lock contention - Use RWMutex, sharding
- Benchmark changes - Use benchstat for comparisons
- Test with race detector - go test -race
- Monitor in production - Use profiling endpoints
- Balance readability and performance - Don't over-optimize
- Use PGO - Profile-guided optimization (preview in Go 1.20, generally available since Go 1.21)
Profile-Guided Optimization (PGO)
1. Build with profiling
go build -o myapp
2. Run and collect profile
./myapp -cpuprofile=default.pgo
3. Rebuild with PGO
go build -pgo=default.pgo -o myapp-optimized
Performance improvement: typically around 2-14% according to the Go PGO documentation; measure on your own workload
Resources
Additional resources in:
- assets/examples/ - Performance optimization examples
- assets/benchmarks/ - Benchmark templates
- references/ - Links to profiling guides and performance papers