sre-expert

Site Reliability Engineering Expert

Safety Notice

This listing is imported from skills.sh public index metadata. Review upstream SKILL.md and repository scripts before running.

Copy this and send it to your AI assistant to learn

Install skill "sre-expert" with this command: npx skills add personamanagmentlayer/pcl/personamanagmentlayer-pcl-sre-expert

Site Reliability Engineering Expert

Expert guidance for SRE practices, reliability engineering, SLOs/SLIs, incident management, and operational excellence.

Core Concepts

SRE Fundamentals

  • Service Level Objectives (SLOs)

  • Service Level Indicators (SLIs)

  • Error budgets

  • Toil reduction

  • Monitoring and alerting

  • Capacity planning

Reliability Practices

  • Incident management

  • Post-incident reviews (PIRs)

  • On-call rotations

  • Chaos engineering

  • Disaster recovery

  • Change management

Automation

  • Infrastructure as Code

  • Configuration management

  • Deployment automation

  • Self-healing systems

  • Runbook automation

  • Automated remediation

SLO/SLI Management

# Imports for the SLO/SLI tracking example.
from dataclasses import dataclass
from datetime import datetime, timedelta
from typing import Dict, List

import numpy as np

@dataclass
class SLI:
    """Service Level Indicator.

    Attributes:
        name: Short identifier of the indicator.
        description: Human-readable explanation of what is measured.
        query: Query expression that produces the indicator value (the
            examples in this file use PromQL strings).
        unit: Unit of the measured value.
    """

    name: str
    description: str
    query: str
    unit: str  # 'percentage', 'milliseconds', etc.

@dataclass
class SLO:
    """Service Level Objective.

    Attributes:
        name: Identifier of the objective.
        sli: The indicator this objective is evaluated against.
        target: Target value for the indicator, expressed in the SLI's unit.
        window_days: Length of the evaluation window, in days.
    """

    name: str
    sli: SLI
    target: float
    window_days: int

class SLOTracker:
    """Track and manage SLOs.

    Registered SLOs and their recorded SLI measurements are kept in memory,
    both keyed by SLO name.
    """

    def __init__(self):
        self.slos: Dict[str, "SLO"] = {}
        self.measurements: Dict[str, List[Dict]] = {}

    def define_slo(self, slo: "SLO"):
        """Define a new SLO, registered under ``slo.name``.

        Registering a name that already exists replaces the SLO and clears
        the measurements recorded for it.
        """
        self.slos[slo.name] = slo
        self.measurements[slo.name] = []

    def record_measurement(self, slo_name: str, value: float, timestamp: datetime):
        """Record one SLI measurement.

        Measurements for an SLO name that has not been defined are ignored.
        """
        if slo_name in self.slos:
            self.measurements[slo_name].append({
                'value': value,
                'timestamp': timestamp,
            })

    def calculate_slo_compliance(self, slo_name: str) -> Dict:
        """Calculate SLO compliance over the SLO's window.

        The "actual" value is the arithmetic mean of all measurements whose
        timestamp is newer than ``now - window_days``. The cutoff is built
        from the naive ``datetime.now()``, so recorded timestamps must be
        naive as well.

        NOTE: compliance is ``actual >= target``, i.e. higher values are
        treated as better. That fits percentage-style SLIs such as
        availability; it does not fit SLIs where lower is better.

        Returns:
            ``{}`` if the SLO is unknown, ``{'status': 'no_data'}`` if no
            measurement falls inside the window, otherwise a dict with
            ``slo_name``, ``target``, ``actual``, ``compliant``,
            ``window_days`` and ``sample_count``.
        """
        slo = self.slos.get(slo_name)
        if slo is None:
            return {}

        window_start = datetime.now() - timedelta(days=slo.window_days)
        values = [
            m['value']
            for m in self.measurements.get(slo_name, [])
            if m['timestamp'] > window_start
        ]
        if not values:
            return {'status': 'no_data'}

        # Convert the numpy scalar so the result holds plain Python types
        # (a native float, and therefore a native bool for 'compliant').
        actual = float(np.mean(values))

        return {
            'slo_name': slo_name,
            'target': slo.target,
            'actual': actual,
            'compliant': actual >= slo.target,
            'window_days': slo.window_days,
            'sample_count': len(values),
        }

    def calculate_error_budget(self, slo_name: str) -> Dict:
        """Calculate remaining error budget.

        Target and actual are interpreted as percentages: the budget is
        ``100 - target`` and the consumed part is ``100 - actual``.

        Returns:
            ``{}`` if the SLO is unknown, ``{'status': 'no_data'}`` if there
            are no measurements in the window, otherwise a dict with
            ``slo_name``, ``error_budget_target``, ``errors_actual``,
            ``remaining``, ``remaining_percentage`` and ``exhausted``.
        """
        compliance = self.calculate_slo_compliance(slo_name)

        # An unknown SLO yields an empty compliance dict; mirror that here
        # instead of failing with KeyError on 'target' below.
        if not compliance:
            return {}
        if compliance.get('status') == 'no_data':
            return {'status': 'no_data'}

        error_budget_target = 100 - compliance['target']
        errors_actual = 100 - compliance['actual']
        remaining = error_budget_target - errors_actual

        # A 100% target leaves no budget to divide by; report it as untouched.
        if error_budget_target > 0:
            remaining_pct = (remaining / error_budget_target) * 100
        else:
            remaining_pct = 100

        return {
            'slo_name': slo_name,
            'error_budget_target': error_budget_target,
            'errors_actual': errors_actual,
            'remaining': remaining,
            'remaining_percentage': remaining_pct,
            'exhausted': remaining < 0,
        }

Example SLOs

def define_standard_slos() -> List[SLO]:
    """Define standard SLOs for a web service.

    Returns:
        Two SLOs, each over a 30-day window: ``api_availability`` (target
        99.9, unit percentage) and ``api_latency`` (p95 target 0.5, unit
        seconds).

    NOTE(review): SLOTracker.calculate_slo_compliance in this file treats
    ``actual >= target`` as compliant, which is inverted for the latency
    objective (lower is better) — confirm how api_latency is evaluated.
    """
    return [
        SLO(
            name="api_availability",
            sli=SLI(
                name="availability",
                description="Percentage of successful requests",
                # The http_requests_total Counter declared in this file is
                # labelled 'status' (not 'code'); filtering on a label that
                # does not exist would match every series and always
                # report 100%.
                query="sum(rate(http_requests_total{status!~'5..'}[5m])) / sum(rate(http_requests_total[5m])) * 100",
                unit="percentage"
            ),
            target=99.9,
            window_days=30
        ),
        SLO(
            name="api_latency",
            sli=SLI(
                name="latency_p95",
                description="95th percentile latency",
                query="histogram_quantile(0.95, rate(http_request_duration_seconds_bucket[5m]))",
                unit="seconds"
            ),
            target=0.5,  # 500ms
            window_days=30
        )
    ]

Incident Management

# Imports for the incident-management example.
from datetime import datetime
from enum import Enum
from typing import List, Optional

class Severity(Enum):
    """Incident severity levels, from most (SEV1) to least (SEV4) severe."""

    SEV1 = "sev1"  # Critical
    SEV2 = "sev2"  # High
    SEV3 = "sev3"  # Medium
    SEV4 = "sev4"  # Low

class IncidentStatus(Enum):
    """Status values an incident can hold; the value is the lowercase name."""

    INVESTIGATING = "investigating"
    IDENTIFIED = "identified"
    MONITORING = "monitoring"
    RESOLVED = "resolved"

@dataclass
class Incident:
    """Record of a single incident.

    Attributes:
        incident_id: Unique identifier of the incident.
        title: Short description of the incident.
        severity: Severity level of the incident.
        status: Current status of the incident.
        started_at: When the incident started.
        detected_at: When the incident was detected.
        resolved_at: When the incident was resolved, or None while open.
            It has no default (the fields after it are required), so it
            must be passed explicitly.
        incident_commander: Person coordinating the response.
        responders: People working on the incident.
        affected_services: Names of the impacted services.
        timeline: Chronological list of event dicts.
        root_cause: Root cause once known; defaults to None.
    """

    incident_id: str
    title: str
    severity: Severity
    status: IncidentStatus
    started_at: datetime
    detected_at: datetime
    resolved_at: Optional[datetime]
    incident_commander: str
    responders: List[str]
    affected_services: List[str]
    timeline: List[Dict]
    root_cause: Optional[str] = None

class IncidentManager:
    """Manage incidents following SRE best practices.

    Incidents are held in memory, keyed by ``incident_id``.
    """

    def __init__(self):
        self.incidents: Dict[str, "Incident"] = {}

    def create_incident(self, incident: "Incident") -> str:
        """Create new incident.

        Stores the incident, notifies on-call, and appends an
        "Incident created" event to its timeline.

        Returns:
            The incident's ``incident_id``.
        """
        self.incidents[incident.incident_id] = incident

        # Notify on-call
        self.notify_oncall(incident)

        # Start incident timeline
        self.add_timeline_event(
            incident.incident_id,
            "Incident created",
            datetime.now()
        )

        return incident.incident_id

    def update_status(self, incident_id: str, new_status: "IncidentStatus",
                      note: str):
        """Update incident status and log the change on the timeline.

        When the new status is RESOLVED, ``resolved_at`` is set as well.
        Unknown incident ids are ignored.
        """
        incident = self.incidents.get(incident_id)
        if incident is None:
            return

        # One timestamp for both the timeline entry and resolved_at, so the
        # two records of the same transition cannot disagree.
        now = datetime.now()
        incident.status = new_status

        self.add_timeline_event(
            incident_id,
            f"Status changed to {new_status.value}: {note}",
            now
        )

        if new_status == IncidentStatus.RESOLVED:
            incident.resolved_at = now

    def add_timeline_event(self, incident_id: str, event: str,
                           timestamp: datetime):
        """Add event to incident timeline; unknown incident ids are ignored."""
        if incident_id in self.incidents:
            self.incidents[incident_id].timeline.append({
                'timestamp': timestamp,
                'event': event
            })

    def calculate_mttr(self, incident_id: str) -> Optional[float]:
        """Calculate the time to resolution of one incident.

        Despite the name this is not a mean across incidents: it is the
        span from ``detected_at`` to ``resolved_at`` of the given incident.

        Returns:
            Duration in minutes, or None if the incident is unknown or not
            yet resolved.
        """
        incident = self.incidents.get(incident_id)

        if incident and incident.resolved_at:
            duration = incident.resolved_at - incident.detected_at
            return duration.total_seconds() / 60  # minutes

        return None

    def generate_incident_report(self, incident_id: str) -> Dict:
        """Generate incident report.

        Returns:
            ``{}`` for an unknown incident id, otherwise a dict summarising
            the incident. ``duration_minutes`` is None while the incident
            is unresolved. List values are the incident's own lists, not
            copies.
        """
        incident = self.incidents.get(incident_id)

        if not incident:
            return {}

        return {
            'incident_id': incident.incident_id,
            'title': incident.title,
            'severity': incident.severity.value,
            'status': incident.status.value,
            'duration_minutes': self.calculate_mttr(incident_id),
            'affected_services': incident.affected_services,
            'incident_commander': incident.incident_commander,
            'responders': incident.responders,
            'timeline': incident.timeline,
            'root_cause': incident.root_cause
        }

    def notify_oncall(self, incident: "Incident"):
        """Notify on-call engineer (integrate with PagerDuty, etc.)"""
        # Implementation would integrate with alerting system
        pass

Monitoring and Alerting

# Imports for the monitoring example.
import time

from prometheus_client import Counter, Gauge, Histogram

Metrics

# Module-level Prometheus metrics. request_count is labelled by method,
# endpoint and status; the other two metrics carry no labels.
request_count = Counter(
    'http_requests_total',
    'Total HTTP requests',
    ['method', 'endpoint', 'status'],
)
request_duration = Histogram(
    'http_request_duration_seconds',
    'HTTP request duration',
)
active_connections = Gauge(
    'active_connections',
    'Number of active connections',
)

class MonitoringSystem:
    """Implement monitoring best practices.

    Records request metrics, stores alert rule definitions, and evaluates
    the four golden signals against configurable thresholds.
    """

    def __init__(self, latency_p95_threshold_ms: float = 1000,
                 error_rate_threshold: float = 1.0,
                 cpu_usage_threshold: float = 80):
        """Create the monitoring system.

        Args:
            latency_p95_threshold_ms: p95 latency above which latency is
                reported as 'critical'.
            error_rate_threshold: Error rate above which errors are
                reported as 'critical'.
            cpu_usage_threshold: CPU usage above which saturation is
                reported as 'warning'.
        """
        self.alerts = []
        self.latency_p95_threshold_ms = latency_p95_threshold_ms
        self.error_rate_threshold = error_rate_threshold
        self.cpu_usage_threshold = cpu_usage_threshold

    def record_request(self, method: str, endpoint: str, status: int, duration: float):
        """Record HTTP request metrics.

        Increments the module-level ``request_count`` counter for the given
        labels and observes ``duration`` on ``request_duration``.
        """
        request_count.labels(method=method, endpoint=endpoint, status=status).inc()
        request_duration.observe(duration)

    def define_alert(self, name: str, expression: str, threshold: float,
                     duration: str, severity: str) -> Dict:
        """Define alerting rule.

        The rule is appended to ``self.alerts``. Its annotations contain a
        summary and a runbook URL derived from ``name``.

        Returns:
            The alert definition dict that was stored.
        """
        alert = {
            'name': name,
            'expression': expression,
            'threshold': threshold,
            'duration': duration,
            'severity': severity,
            'annotations': {
                'summary': f'{name} alert triggered',
                'runbook_url': f'https://runbooks.example.com/{name}'
            }
        }

        self.alerts.append(alert)
        return alert

    def check_golden_signals(self, metrics: Dict) -> Dict:
        """Check the four golden signals.

        Args:
            metrics: Dict read by key: 'latency' (list of samples),
                'traffic', 'error_rate' and 'cpu_usage'. Missing keys fall
                back to an empty list or 0.

        Returns:
            Dict with 'latency', 'traffic', 'errors' and 'saturation'
            entries, each a dict containing at least a 'status'.
        """
        return {
            'latency': self._check_latency(metrics.get('latency', [])),
            'traffic': self._check_traffic(metrics.get('traffic', 0)),
            'errors': self._check_errors(metrics.get('error_rate', 0)),
            'saturation': self._check_saturation(metrics.get('cpu_usage', 0))
        }

    def _check_latency(self, latencies: List[float]) -> Dict:
        """Return the p95 of ``latencies`` and whether it exceeds the threshold."""
        # len() rather than truthiness so array inputs do not raise.
        if latencies is None or len(latencies) == 0:
            return {'status': 'unknown'}

        # Convert the numpy scalar to a plain float for the result dict.
        p95 = float(np.percentile(latencies, 95))
        return {
            'status': 'critical' if p95 > self.latency_p95_threshold_ms else 'ok',
            'p95_ms': p95
        }

    def _check_traffic(self, requests_per_second: float) -> Dict:
        """Report traffic; no threshold is applied, status is always 'ok'."""
        return {
            'status': 'ok',
            'rps': requests_per_second
        }

    def _check_errors(self, error_rate: float) -> Dict:
        """Report 'critical' when ``error_rate`` exceeds the threshold."""
        return {
            'status': 'critical' if error_rate > self.error_rate_threshold else 'ok',
            'error_rate': error_rate
        }

    def _check_saturation(self, cpu_usage: float) -> Dict:
        """Report 'warning' when ``cpu_usage`` exceeds the threshold."""
        return {
            'status': 'warning' if cpu_usage > self.cpu_usage_threshold else 'ok',
            'cpu_usage': cpu_usage
        }

Chaos Engineering

# Imports for the chaos-engineering example.
import random
from typing import Callable

class ChaosExperiment:
    """Run chaos engineering experiments.

    Holds the experiment's name and hypothesis and accumulates the outcome
    of every ``run_experiment`` call in ``self.results``.
    """

    def __init__(self, name: str, hypothesis: str):
        self.name = name
        self.hypothesis = hypothesis
        self.results = []

    def inject_latency(self, service_call: Callable, delay_ms: int):
        """Inject latency into service call.

        Sleeps for ``delay_ms`` milliseconds, then invokes ``service_call``
        and returns its result.
        """
        time.sleep(delay_ms / 1000)
        return service_call()

    def inject_failure(self, service_call: Callable, failure_rate: float):
        """Randomly fail service calls.

        Args:
            service_call: Zero-argument callable to invoke when no failure
                is injected.
            failure_rate: Probability of failing. A uniform draw from
                [0.0, 1.0) is compared against it, so 0.0 never fails and
                1.0 always fails.

        Raises:
            RuntimeError: When a failure is injected.
        """
        if random.random() < failure_rate:
            # RuntimeError rather than bare Exception; callers catching
            # Exception still see it.
            raise RuntimeError("Chaos: Simulated failure")
        return service_call()

    def kill_random_instance(self, instances: List[str]) -> str:
        """Pick a random instance to kill and return it.

        Raises:
            IndexError: If ``instances`` is empty.
        """
        victim = random.choice(instances)
        # Implementation would actually kill the instance
        return victim

    def run_experiment(self, experiment_func: Callable) -> Dict:
        """Run chaos experiment.

        Calls ``experiment_func``; any exception it raises is caught and
        recorded as a failed run rather than propagated.

        Returns:
            Dict with 'name', 'hypothesis', 'status' ("success" or
            "failed"), 'result', 'error', 'duration' (seconds) and
            'timestamp' (start time). The same dict is appended to
            ``self.results``.
        """
        start_time = datetime.now()
        # Time the run on a monotonic clock so wall-clock adjustments
        # cannot distort (or negate) the measured duration.
        started = time.perf_counter()

        try:
            result = experiment_func()
        except Exception as e:
            result = None
            status = "failed"
            error = str(e)
        else:
            status = "success"
            error = None

        duration = time.perf_counter() - started

        experiment_result = {
            'name': self.name,
            'hypothesis': self.hypothesis,
            'status': status,
            'result': result,
            'error': error,
            'duration': duration,
            'timestamp': start_time
        }

        self.results.append(experiment_result)
        return experiment_result

Best Practices

SRE Principles

  • Embrace risk management

  • Set SLOs based on user experience

  • Use error budgets for decision making

  • Automate toil away

  • Monitor the four golden signals

  • Practice blameless post-mortems

  • Gradual rollouts and canary deployments

Incident Management

  • Clear incident severity definitions

  • Defined incident commander role

  • Communicate proactively

  • Document timeline during incident

  • Conduct post-incident reviews

  • Track action items to completion

  • Share learnings across teams

On-Call

  • Reasonable on-call rotations

  • Comprehensive runbooks

  • Alert on symptoms, not causes

  • Actionable alerts only

  • Escalation policies

  • Support on-call engineers

  • Measure and reduce alert fatigue

Anti-Patterns

❌ No SLOs defined
❌ Alerts without runbooks
❌ Blame culture for incidents
❌ No post-incident reviews
❌ 100% uptime expectations
❌ Toil not tracked or reduced
❌ Manual processes for common tasks

Resources

Source Transparency

This detail page is rendered from real SKILL.md content. Trust labels are metadata-based hints, not a safety guarantee.

Related Skills

Related by shared tags or category signals.

General

finance-expert

No summary provided by upstream source.

Repository Source · Needs Review
General

trading-expert

No summary provided by upstream source.

Repository Source · Needs Review
General

dart-expert

No summary provided by upstream source.

Repository Source · Needs Review
General

postgresql-expert

No summary provided by upstream source.

Repository Source · Needs Review