Site Reliability Engineering Expert
Expert guidance for SRE practices, reliability engineering, SLOs/SLIs, incident management, and operational excellence.
Core Concepts
SRE Fundamentals
- Service Level Objectives (SLOs)
- Service Level Indicators (SLIs)
- Error budgets
- Toil reduction
- Monitoring and alerting
- Capacity planning
Reliability Practices
- Incident management
- Post-incident reviews (PIRs)
- On-call rotations
- Chaos engineering
- Disaster recovery
- Change management
Automation
- Infrastructure as Code
- Configuration management
- Deployment automation
- Self-healing systems
- Runbook automation
- Automated remediation
SLO/SLI Management
from dataclasses import dataclass from datetime import datetime, timedelta from typing import List, Dict import numpy as np
@dataclass
class SLI:
    """Service Level Indicator: a measurable signal of service health."""
    name: str
    description: str
    query: str  # monitoring-system query string that produces the indicator value
    unit: str   # 'percentage', 'milliseconds', etc.


@dataclass
class SLO:
    """Service Level Objective: a target for an SLI over a rolling window."""
    name: str
    sli: SLI
    target: float     # target value, in the SLI's unit
    window_days: int  # length of the rolling compliance window
class SLOTracker:
    """Track SLO definitions and SLI measurements, and report compliance.

    Measurements are held in memory per SLO name; compliance is the mean of
    all measurements that fall inside the SLO's rolling window.
    """

    def __init__(self):
        self.slos: Dict[str, SLO] = {}
        # slo_name -> list of {'value': float, 'timestamp': datetime}
        self.measurements: Dict[str, List[Dict]] = {}

    def define_slo(self, slo: SLO):
        """Register a new SLO and initialize its measurement store."""
        self.slos[slo.name] = slo
        self.measurements[slo.name] = []

    def record_measurement(self, slo_name: str, value: float, timestamp: datetime):
        """Record one SLI measurement; unknown SLO names are silently ignored."""
        if slo_name in self.slos:
            self.measurements[slo_name].append({
                'value': value,
                'timestamp': timestamp,
            })

    def calculate_slo_compliance(self, slo_name: str) -> Dict:
        """Compute the mean SLI over the SLO's window and compare to target.

        Returns {} for an unknown SLO name and {'status': 'no_data'} when no
        measurements fall inside the window.

        NOTE(review): `compliant` assumes a higher-is-better SLI (e.g.
        availability %). For lower-is-better SLOs such as latency, the
        comparison direction would need to come from the SLO definition.
        """
        slo = self.slos.get(slo_name)
        if not slo:
            return {}

        window_start = datetime.now() - timedelta(days=slo.window_days)
        recent = [
            m for m in self.measurements.get(slo_name, [])
            if m['timestamp'] > window_start
        ]
        if not recent:
            return {'status': 'no_data'}

        # float() so the result is a plain Python type (JSON-serializable),
        # not a numpy scalar.
        actual = float(np.mean([m['value'] for m in recent]))
        return {
            'slo_name': slo_name,
            'target': slo.target,
            'actual': actual,
            'compliant': actual >= slo.target,
            'window_days': slo.window_days,
            'sample_count': len(recent),
        }

    def calculate_error_budget(self, slo_name: str) -> Dict:
        """Compute the remaining error budget.

        Assumes the SLI is expressed as a percentage (budget = 100 - target);
        not meaningful for SLOs measured in other units (e.g. seconds).
        """
        compliance = self.calculate_slo_compliance(slo_name)
        # Bug fix: an unknown SLO yields {} from calculate_slo_compliance,
        # which previously slipped past the 'no_data' check and raised
        # KeyError on 'target'. Treat both empty and no-data as no_data.
        if not compliance or compliance.get('status') == 'no_data':
            return {'status': 'no_data'}

        target = compliance['target']
        actual = compliance['actual']
        error_budget_target = 100 - target
        errors_actual = 100 - actual
        remaining = error_budget_target - errors_actual
        remaining_pct = (
            (remaining / error_budget_target) * 100 if error_budget_target > 0 else 100
        )
        return {
            'slo_name': slo_name,
            'error_budget_target': error_budget_target,
            'errors_actual': errors_actual,
            'remaining': remaining,
            'remaining_percentage': remaining_pct,
            'exhausted': remaining < 0,
        }
Example SLOs
def define_standard_slos() -> List[SLO]:
    """Return the standard SLOs for a web service: availability and p95 latency."""
    availability = SLO(
        name="api_availability",
        sli=SLI(
            name="availability",
            description="Percentage of successful requests",
            query="sum(rate(http_requests_total{code!~'5..'}[5m])) / sum(rate(http_requests_total[5m])) * 100",
            unit="percentage",
        ),
        target=99.9,
        window_days=30,
    )
    latency = SLO(
        name="api_latency",
        sli=SLI(
            name="latency_p95",
            description="95th percentile latency",
            query="histogram_quantile(0.95, rate(http_request_duration_seconds_bucket[5m]))",
            unit="seconds",
        ),
        target=0.5,  # 500ms
        window_days=30,
    )
    return [availability, latency]
Incident Management
from enum import Enum from datetime import datetime from typing import List, Optional
class Severity(Enum):
    """Incident severity levels, highest impact first."""
    SEV1 = "sev1"  # Critical
    SEV2 = "sev2"  # High
    SEV3 = "sev3"  # Medium
    SEV4 = "sev4"  # Low


class IncidentStatus(Enum):
    """Lifecycle states an incident moves through."""
    INVESTIGATING = "investigating"
    IDENTIFIED = "identified"
    MONITORING = "monitoring"
    RESOLVED = "resolved"


@dataclass
class Incident:
    """A single production incident record, including its running timeline."""
    incident_id: str
    title: str
    severity: Severity
    status: IncidentStatus
    started_at: datetime           # when the impact began
    detected_at: datetime          # when the impact was noticed
    resolved_at: Optional[datetime]
    incident_commander: str
    responders: List[str]
    affected_services: List[str]
    timeline: List[Dict]           # {'timestamp': datetime, 'event': str} entries
    root_cause: Optional[str] = None
class IncidentManager:
    """Manage incidents following SRE best practices."""

    def __init__(self):
        # incident_id -> Incident
        self.incidents: Dict[str, Incident] = {}

    def create_incident(self, incident: Incident) -> str:
        """Register a new incident, page on-call, and open its timeline."""
        self.incidents[incident.incident_id] = incident
        # Page the on-call responder before anything else.
        self.notify_oncall(incident)
        self.add_timeline_event(
            incident.incident_id, "Incident created", datetime.now()
        )
        return incident.incident_id

    def update_status(self, incident_id: str, new_status: IncidentStatus,
                      note: str):
        """Transition an incident's status and log the change on the timeline."""
        incident = self.incidents.get(incident_id)
        if incident is None:
            return  # unknown id: ignore, matching the other recording methods
        incident.status = new_status
        self.add_timeline_event(
            incident_id,
            f"Status changed to {new_status.value}: {note}",
            datetime.now(),
        )
        if new_status == IncidentStatus.RESOLVED:
            incident.resolved_at = datetime.now()

    def add_timeline_event(self, incident_id: str, event: str,
                           timestamp: datetime):
        """Append one event to the incident's timeline (no-op for unknown ids)."""
        incident = self.incidents.get(incident_id)
        if incident is None:
            return
        incident.timeline.append({'timestamp': timestamp, 'event': event})

    def calculate_mttr(self, incident_id: str) -> Optional[float]:
        """Minutes from detection to resolution; None if unresolved or unknown."""
        incident = self.incidents.get(incident_id)
        if incident is None or not incident.resolved_at:
            return None
        elapsed = incident.resolved_at - incident.detected_at
        return elapsed.total_seconds() / 60  # minutes

    def generate_incident_report(self, incident_id: str) -> Dict:
        """Summarize an incident as a plain dict; {} when the id is unknown."""
        incident = self.incidents.get(incident_id)
        if incident is None:
            return {}
        return {
            'incident_id': incident.incident_id,
            'title': incident.title,
            'severity': incident.severity.value,
            'status': incident.status.value,
            'duration_minutes': self.calculate_mttr(incident_id),
            'affected_services': incident.affected_services,
            'incident_commander': incident.incident_commander,
            'responders': incident.responders,
            'timeline': incident.timeline,
            'root_cause': incident.root_cause,
        }

    def notify_oncall(self, incident: Incident):
        """Notify on-call engineer (integrate with PagerDuty, etc.)"""
        # Implementation would integrate with an alerting system.
        pass
Monitoring and Alerting
from prometheus_client import Counter, Histogram, Gauge import time
Metrics
# Prometheus metrics: request volume/outcome, request latency, connection gauge.
request_count = Counter('http_requests_total', 'Total HTTP requests', ['method', 'endpoint', 'status'])
request_duration = Histogram('http_request_duration_seconds', 'HTTP request duration')
active_connections = Gauge('active_connections', 'Number of active connections')
class MonitoringSystem:
    """Implement monitoring best practices."""

    def __init__(self):
        self.alerts = []  # alert-rule dicts created via define_alert

    def record_request(self, method: str, endpoint: str, status: int, duration: float):
        """Record one HTTP request in the Prometheus metrics."""
        request_count.labels(method=method, endpoint=endpoint, status=status).inc()
        request_duration.observe(duration)

    def define_alert(self, name: str, expression: str, threshold: float,
                     duration: str, severity: str) -> Dict:
        """Build an alerting rule, register it on this system, and return it."""
        rule = {
            'name': name,
            'expression': expression,
            'threshold': threshold,
            'duration': duration,
            'severity': severity,
            'annotations': {
                'summary': f'{name} alert triggered',
                'runbook_url': f'https://runbooks.example.com/{name}',
            },
        }
        self.alerts.append(rule)
        return rule

    def check_golden_signals(self, metrics: Dict) -> Dict:
        """Evaluate the four golden signals from a raw metrics dict."""
        latencies = metrics.get('latency', [])
        rps = metrics.get('traffic', 0)
        error_rate = metrics.get('error_rate', 0)
        cpu = metrics.get('cpu_usage', 0)
        return {
            'latency': self._check_latency(latencies),
            'traffic': self._check_traffic(rps),
            'errors': self._check_errors(error_rate),
            'saturation': self._check_saturation(cpu),
        }

    def _check_latency(self, latencies: List[float]) -> Dict:
        # NOTE(review): the 1000 threshold assumes latencies are in
        # milliseconds — confirm the unit used by callers.
        if not latencies:
            return {'status': 'unknown'}
        p95 = np.percentile(latencies, 95)
        status = 'critical' if p95 > 1000 else 'ok'
        return {'status': status, 'p95_ms': p95}

    def _check_traffic(self, requests_per_second: float) -> Dict:
        # Traffic is reported as informational only; no threshold applied.
        return {'status': 'ok', 'rps': requests_per_second}

    def _check_errors(self, error_rate: float) -> Dict:
        status = 'critical' if error_rate > 1.0 else 'ok'
        return {'status': status, 'error_rate': error_rate}

    def _check_saturation(self, cpu_usage: float) -> Dict:
        status = 'warning' if cpu_usage > 80 else 'ok'
        return {'status': status, 'cpu_usage': cpu_usage}
Chaos Engineering
import random from typing import Callable
class ChaosExperiment:
    """Run chaos engineering experiments."""

    def __init__(self, name: str, hypothesis: str):
        self.name = name
        self.hypothesis = hypothesis
        self.results = []  # accumulated experiment-result dicts

    def inject_latency(self, service_call: Callable, delay_ms: int):
        """Sleep for delay_ms milliseconds, then invoke and return the call."""
        time.sleep(delay_ms / 1000)
        return service_call()

    def inject_failure(self, service_call: Callable, failure_rate: float):
        """Fail with probability failure_rate; otherwise invoke the call."""
        if random.random() < failure_rate:
            raise Exception("Chaos: Simulated failure")
        return service_call()

    def kill_random_instance(self, instances: List[str]) -> str:
        """Pick a random instance to terminate and return its name."""
        victim = random.choice(instances)
        # Implementation would actually kill the instance
        return victim

    def run_experiment(self, experiment_func: Callable) -> Dict:
        """Execute experiment_func, record outcome and duration, return the record."""
        started = datetime.now()
        error = None
        try:
            outcome = experiment_func()
            status = "success"
        except Exception as exc:  # broad by design: any failure is an outcome
            outcome = None
            status = "failed"
            error = str(exc)
        finished = datetime.now()

        record = {
            'name': self.name,
            'hypothesis': self.hypothesis,
            'status': status,
            'result': outcome,
            'error': error,
            'duration': (finished - started).total_seconds(),
            'timestamp': started,
        }
        self.results.append(record)
        return record
Best Practices
SRE Principles
- Embrace risk management
- Set SLOs based on user experience
- Use error budgets for decision making
- Automate toil away
- Monitor the four golden signals
- Practice blameless post-mortems
- Gradual rollouts and canary deployments
Incident Management
- Clear incident severity definitions
- Defined incident commander role
- Communicate proactively
- Document timeline during incident
- Conduct post-incident reviews
- Track action items to completion
- Share learnings across teams
On-Call
- Reasonable on-call rotations
- Comprehensive runbooks
- Alert on symptoms, not causes
- Actionable alerts only
- Escalation policies
- Support on-call engineers
- Measure and reduce alert fatigue
Anti-Patterns
❌ No SLOs defined
❌ Alerts without runbooks
❌ Blame culture for incidents
❌ No post-incident reviews
❌ 100% uptime expectations
❌ Toil not tracked or reduced
❌ Manual processes for common tasks
Resources
- Google SRE Book: https://sre.google/sre-book/table-of-contents/
- Site Reliability Engineering: https://sre.google/
- SLO Workshop: https://github.com/google/slo-workshop
- Chaos Engineering: https://principlesofchaos.org/
- Prometheus: https://prometheus.io/