Site Reliability Engineering Expert
Expert guidance for SRE practices, reliability engineering, SLOs/SLIs, incident management, and operational excellence.
Core Concepts
SRE Fundamentals
- Service Level Objectives (SLOs)
- Service Level Indicators (SLIs)
- Error budgets
- Toil reduction
- Monitoring and alerting
- Capacity planning
Reliability Practices
- Incident management
- Post-incident reviews (PIRs)
- On-call rotations
- Chaos engineering
- Disaster recovery
- Change management
Automation
- Infrastructure as Code
- Configuration management
- Deployment automation
- Self-healing systems
- Runbook automation
- Automated remediation
SLO/SLI Management
from dataclasses import dataclass from datetime import datetime, timedelta from typing import List, Dict import numpy as np
@dataclass
class SLI:
    """Service Level Indicator: a measurable signal of service health."""
    name: str
    description: str
    query: str  # monitoring-system query string that produces the indicator value
    unit: str   # 'percentage', 'milliseconds', etc.


@dataclass
class SLO:
    """Service Level Objective: a target for an SLI over a rolling window."""
    name: str
    sli: SLI
    target: float     # target value, in the SLI's unit
    window_days: int  # length of the rolling compliance window
class SLOTracker:
    """Track SLO definitions and SLI measurements, and report compliance.

    Measurements are held in memory per SLO name; compliance is the mean of
    all measurements that fall inside the SLO's rolling window.
    """

    def __init__(self):
        self.slos: Dict[str, SLO] = {}
        # slo_name -> list of {'value': float, 'timestamp': datetime}
        self.measurements: Dict[str, List[Dict]] = {}

    def define_slo(self, slo: SLO):
        """Register a new SLO and initialize its measurement store."""
        self.slos[slo.name] = slo
        self.measurements[slo.name] = []

    def record_measurement(self, slo_name: str, value: float, timestamp: datetime):
        """Record one SLI measurement; unknown SLO names are silently ignored."""
        if slo_name in self.slos:
            self.measurements[slo_name].append({
                'value': value,
                'timestamp': timestamp,
            })

    def calculate_slo_compliance(self, slo_name: str) -> Dict:
        """Compute the mean SLI over the SLO's window and compare to target.

        Returns {} for an unknown SLO name and {'status': 'no_data'} when no
        measurements fall inside the window.

        NOTE(review): `compliant` assumes a higher-is-better SLI (e.g.
        availability %). For lower-is-better SLOs such as latency, the
        comparison direction would need to come from the SLO definition.
        """
        slo = self.slos.get(slo_name)
        if not slo:
            return {}

        window_start = datetime.now() - timedelta(days=slo.window_days)
        recent = [
            m for m in self.measurements.get(slo_name, [])
            if m['timestamp'] > window_start
        ]
        if not recent:
            return {'status': 'no_data'}

        # float() so the result is a plain Python type (JSON-serializable),
        # not a numpy scalar.
        actual = float(np.mean([m['value'] for m in recent]))
        return {
            'slo_name': slo_name,
            'target': slo.target,
            'actual': actual,
            'compliant': actual >= slo.target,
            'window_days': slo.window_days,
            'sample_count': len(recent),
        }

    def calculate_error_budget(self, slo_name: str) -> Dict:
        """Compute the remaining error budget.

        Assumes the SLI is expressed as a percentage (budget = 100 - target);
        not meaningful for SLOs measured in other units (e.g. seconds).
        """
        compliance = self.calculate_slo_compliance(slo_name)
        # Bug fix: an unknown SLO yields {} from calculate_slo_compliance,
        # which previously slipped past the 'no_data' check and raised
        # KeyError on 'target'. Treat both empty and no-data as no_data.
        if not compliance or compliance.get('status') == 'no_data':
            return {'status': 'no_data'}

        target = compliance['target']
        actual = compliance['actual']
        error_budget_target = 100 - target
        errors_actual = 100 - actual
        remaining = error_budget_target - errors_actual
        remaining_pct = (
            (remaining / error_budget_target) * 100 if error_budget_target > 0 else 100
        )
        return {
            'slo_name': slo_name,
            'error_budget_target': error_budget_target,
            'errors_actual': errors_actual,
            'remaining': remaining,
            'remaining_percentage': remaining_pct,
            'exhausted': remaining < 0,
        }
Example SLOs
def define_standard_slos() -> List[SLO]:
    """Return the standard SLOs for a web service: availability and p95 latency."""
    availability = SLO(
        name="api_availability",
        sli=SLI(
            name="availability",
            description="Percentage of successful requests",
            query="sum(rate(http_requests_total{code!~'5..'}[5m])) / sum(rate(http_requests_total[5m])) * 100",
            unit="percentage",
        ),
        target=99.9,
        window_days=30,
    )
    latency = SLO(
        name="api_latency",
        sli=SLI(
            name="latency_p95",
            description="95th percentile latency",
            query="histogram_quantile(0.95, rate(http_request_duration_seconds_bucket[5m]))",
            unit="seconds",
        ),
        target=0.5,  # 500ms
        window_days=30,
    )
    return [availability, latency]
Incident Management
from enum import Enum from datetime import datetime from typing import List, Optional
class Severity(Enum):
    """Incident severity levels, highest impact first."""
    SEV1 = "sev1"  # Critical
    SEV2 = "sev2"  # High
    SEV3 = "sev3"  # Medium
    SEV4 = "sev4"  # Low


class IncidentStatus(Enum):
    """Lifecycle states an incident moves through."""
    INVESTIGATING = "investigating"
    IDENTIFIED = "identified"
    MONITORING = "monitoring"
    RESOLVED = "resolved"


@dataclass
class Incident:
    """A single production incident record, including its running timeline."""
    incident_id: str
    title: str
    severity: Severity
    status: IncidentStatus
    started_at: datetime           # when the impact began
    detected_at: datetime          # when the impact was noticed
    resolved_at: Optional[datetime]
    incident_commander: str
    responders: List[str]
    affected_services: List[str]
    timeline: List[Dict]           # {'timestamp': datetime, 'event': str} entries
    root_cause: Optional[str] = None
class IncidentManager:
    """Manage incidents following SRE best practices."""

    def __init__(self):
        # incident_id -> Incident
        self.incidents: Dict[str, Incident] = {}

    def create_incident(self, incident: Incident) -> str:
        """Register a new incident, page on-call, and open its timeline."""
        self.incidents[incident.incident_id] = incident
        # Page the on-call responder before anything else.
        self.notify_oncall(incident)
        self.add_timeline_event(
            incident.incident_id, "Incident created", datetime.now()
        )
        return incident.incident_id

    def update_status(self, incident_id: str, new_status: IncidentStatus,
                      note: str):
        """Transition an incident's status and log the change on the timeline."""
        incident = self.incidents.get(incident_id)
        if incident is None:
            return  # unknown id: ignore, matching the other recording methods
        incident.status = new_status
        self.add_timeline_event(
            incident_id,
            f"Status changed to {new_status.value}: {note}",
            datetime.now(),
        )
        if new_status == IncidentStatus.RESOLVED:
            incident.resolved_at = datetime.now()

    def add_timeline_event(self, incident_id: str, event: str,
                           timestamp: datetime):
        """Append one event to the incident's timeline (no-op for unknown ids)."""
        incident = self.incidents.get(incident_id)
        if incident is None:
            return
        incident.timeline.append({'timestamp': timestamp, 'event': event})

    def calculate_mttr(self, incident_id: str) -> Optional[float]:
        """Minutes from detection to resolution; None if unresolved or unknown."""
        incident = self.incidents.get(incident_id)
        if incident is None or not incident.resolved_at:
            return None
        elapsed = incident.resolved_at - incident.detected_at
        return elapsed.total_seconds() / 60  # minutes

    def generate_incident_report(self, incident_id: str) -> Dict:
        """Summarize an incident as a plain dict; {} when the id is unknown."""
        incident = self.incidents.get(incident_id)
        if incident is None:
            return {}
        return {
            'incident_id': incident.incident_id,
            'title': incident.title,
            'severity': incident.severity.value,
            'status': incident.status.value,
            'duration_minutes': self.calculate_mttr(incident_id),
            'affected_services': incident.affected_services,
            'incident_commander': incident.incident_commander,
            'responders': incident.responders,
            'timeline': incident.timeline,
            'root_cause': incident.root_cause,
        }

    def notify_oncall(self, incident: Incident):
        """Notify on-call engineer (integrate with PagerDuty, etc.)"""
        # Implementation would integrate with an alerting system.
        pass
Monitoring and Alerting
from prometheus_client import Counter, Histogram, Gauge import time
Metrics
# Prometheus metrics: request volume/outcome, request latency, connection gauge.
request_count = Counter('http_requests_total', 'Total HTTP requests', ['method', 'endpoint', 'status'])
request_duration = Histogram('http_request_duration_seconds', 'HTTP request duration')
active_connections = Gauge('active_connections', 'Number of active connections')
class MonitoringSystem:
    """Implement monitoring best practices."""

    def __init__(self):
        self.alerts = []  # alert-rule dicts created via define_alert

    def record_request(self, method: str, endpoint: str, status: int, duration: float):
        """Record one HTTP request in the Prometheus metrics."""
        request_count.labels(method=method, endpoint=endpoint, status=status).inc()
        request_duration.observe(duration)

    def define_alert(self, name: str, expression: str, threshold: float,
                     duration: str, severity: str) -> Dict:
        """Build an alerting rule, register it on this system, and return it."""
        rule = {
            'name': name,
            'expression': expression,
            'threshold': threshold,
            'duration': duration,
            'severity': severity,
            'annotations': {
                'summary': f'{name} alert triggered',
                'runbook_url': f'https://runbooks.example.com/{name}',
            },
        }
        self.alerts.append(rule)
        return rule

    def check_golden_signals(self, metrics: Dict) -> Dict:
        """Evaluate the four golden signals from a raw metrics dict."""
        latencies = metrics.get('latency', [])
        rps = metrics.get('traffic', 0)
        error_rate = metrics.get('error_rate', 0)
        cpu = metrics.get('cpu_usage', 0)
        return {
            'latency': self._check_latency(latencies),
            'traffic': self._check_traffic(rps),
            'errors': self._check_errors(error_rate),
            'saturation': self._check_saturation(cpu),
        }

    def _check_latency(self, latencies: List[float]) -> Dict:
        # NOTE(review): the 1000 threshold assumes latencies are in
        # milliseconds — confirm the unit used by callers.
        if not latencies:
            return {'status': 'unknown'}
        p95 = np.percentile(latencies, 95)
        status = 'critical' if p95 > 1000 else 'ok'
        return {'status': status, 'p95_ms': p95}

    def _check_traffic(self, requests_per_second: float) -> Dict:
        # Traffic is reported as informational only; no threshold applied.
        return {'status': 'ok', 'rps': requests_per_second}

    def _check_errors(self, error_rate: float) -> Dict:
        status = 'critical' if error_rate > 1.0 else 'ok'
        return {'status': status, 'error_rate': error_rate}

    def _check_saturation(self, cpu_usage: float) -> Dict:
        status = 'warning' if cpu_usage > 80 else 'ok'
        return {'status': status, 'cpu_usage': cpu_usage}
Chaos Engineering
import random from typing import Callable
class ChaosExperiment:
    """Run chaos engineering experiments."""

    def __init__(self, name: str, hypothesis: str):
        self.name = name
        self.hypothesis = hypothesis
        self.results = []  # accumulated experiment-result dicts

    def inject_latency(self, service_call: Callable, delay_ms: int):
        """Sleep for delay_ms milliseconds, then invoke and return the call."""
        time.sleep(delay_ms / 1000)
        return service_call()

    def inject_failure(self, service_call: Callable, failure_rate: float):
        """Fail with probability failure_rate; otherwise invoke the call."""
        if random.random() < failure_rate:
            raise Exception("Chaos: Simulated failure")
        return service_call()

    def kill_random_instance(self, instances: List[str]) -> str:
        """Pick a random instance to terminate and return its name."""
        victim = random.choice(instances)
        # Implementation would actually kill the instance
        return victim

    def run_experiment(self, experiment_func: Callable) -> Dict:
        """Execute experiment_func, record outcome and duration, return the record."""
        started = datetime.now()
        error = None
        try:
            outcome = experiment_func()
            status = "success"
        except Exception as exc:  # broad by design: any failure is an outcome
            outcome = None
            status = "failed"
            error = str(exc)
        finished = datetime.now()

        record = {
            'name': self.name,
            'hypothesis': self.hypothesis,
            'status': status,
            'result': outcome,
            'error': error,
            'duration': (finished - started).total_seconds(),
            'timestamp': started,
        }
        self.results.append(record)
        return record
Best Practices
SRE Principles
- Embrace risk management
- Set SLOs based on user experience
- Use error budgets for decision making
- Automate toil away
- Monitor the four golden signals
- Practice blameless post-mortems
- Gradual rollouts and canary deployments
Incident Management
- Clear incident severity definitions
- Defined incident commander role
- Communicate proactively
- Document timeline during incident
- Conduct post-incident reviews
- Track action items to completion
- Share learnings across teams
On-Call
- Reasonable on-call rotations
- Comprehensive runbooks
- Alert on symptoms, not causes
- Actionable alerts only
- Escalation policies
- Support on-call engineers
- Measure and reduce alert fatigue
Anti-Patterns
❌ No SLOs defined
❌ Alerts without runbooks
❌ Blame culture for incidents
❌ No post-incident reviews
❌ 100% uptime expectations
❌ Toil not tracked or reduced
❌ Manual processes for common tasks
Resources
- Google SRE Book: https://sre.google/sre-book/table-of-contents/
- Site Reliability Engineering: https://sre.google/
- SLO Workshop: https://github.com/google/slo-workshop
- Chaos Engineering: https://principlesofchaos.org/
- Prometheus: https://prometheus.io/