Core Concepts · Additional notes

1 min read
Mid-level2 min read
Rapid overview

Additional notes

Prometheus

Architecture

┌─────────────┐     ┌─────────────┐     ┌─────────────┐
│   Target    │────▶│  Prometheus │────▶│  Alertmgr   │
│  (metrics)  │     │   Server    │     │             │
└─────────────┘     └──────┬──────┘     └─────────────┘
                           │
                    ┌──────▼──────┐
                    │   Grafana   │
                    │             │
                    └─────────────┘

Metric Types

Type | Description | Example
Counter | Monotonically increasing | Total requests
Gauge | Can go up or down | Current connections
Histogram | Distribution buckets | Request duration
Summary | Quantiles (client-side) | Response size

PromQL Basics

# Instant vector
http_requests_total{job="api", status="200"}

# Range vector (last 5 minutes)
http_requests_total[5m]

# Rate (per-second)
rate(http_requests_total[5m])

# Increase (total over period)
increase(http_requests_total[1h])

# Aggregation
sum(rate(http_requests_total[5m])) by (service)

# Histogram quantile (p95)
histogram_quantile(0.95, rate(http_request_duration_seconds_bucket[5m]))

Service Discovery

# Prometheus config for Kubernetes
# Discovers every pod through the Kubernetes API and scrapes only those that
# opt in via the prometheus.io/scrape annotation.
scrape_configs:
  - job_name: 'kubernetes-pods'
    kubernetes_sd_configs:
      - role: pod
    relabel_configs:
      # Only scrape pods annotated with prometheus.io/scrape: "true"
      - source_labels: [__meta_kubernetes_pod_annotation_prometheus_io_scrape]
        action: keep
        regex: true
      # Use the prometheus.io/port annotation for the scrape port.
      # __address__ must stay "<host>:<port>", so the rule joins the discovered
      # address and the annotation (separated by ";"), keeps the host part,
      # drops any discovered port, and appends the annotated port. Writing only
      # the annotation value into __address__ would discard the pod IP.
      - source_labels: [__address__, __meta_kubernetes_pod_annotation_prometheus_io_port]
        action: replace
        regex: ([^:]+)(?::\d+)?;(\d+)
        replacement: $1:$2
        target_label: __address__

Recording Rules

groups:
  - name: api_rules
    rules:
      - record: job:http_requests:rate5m
        expr: sum(rate(http_requests_total[5m])) by (job)

      - record: job:http_errors:rate5m
        expr: sum(rate(http_requests_total{status=~"5.."}[5m])) by (job)

      - record: job:http_error_rate:ratio
        expr: job:http_errors:rate5m / job:http_requests:rate5m

Alerting

Prometheus Alerting Rules

groups:
  - name: api_alerts
    rules:
      - alert: HighErrorRate
        expr: job:http_error_rate:ratio > 0.05
        for: 5m
        labels:
          severity: critical
        annotations:
          summary: "High error rate on {{ $labels.job }}"
          description: "Error rate is {{ $value | humanizePercentage }}"

      - alert: PodCrashLooping
        expr: rate(kube_pod_container_status_restarts_total[15m]) > 0
        for: 5m
        labels:
          severity: warning
        annotations:
          summary: "Pod {{ $labels.pod }} is crash looping"

      # Fires when a container has used more than 90% of its memory limit for
      # 5 minutes. Containers without a limit report
      # container_spec_memory_limit_bytes == 0; the `> 0` filter drops those
      # series so the division cannot produce +Inf and raise a false alert.
      - alert: HighMemoryUsage
        expr: container_memory_usage_bytes / (container_spec_memory_limit_bytes > 0) > 0.9
        for: 5m
        labels:
          severity: warning
        annotations:
          summary: "Container {{ $labels.container }} memory > 90%"

Alertmanager Configuration

global:
  resolve_timeout: 5m
  slack_api_url: 'https://hooks.slack.com/services/xxx'

# Root of the routing tree: alerts that match no child route below are sent to
# 'default-receiver'.
route:
  # Alerts sharing the same values for these labels are grouped together.
  group_by: ['alertname', 'cluster', 'service']
  group_wait: 30s
  group_interval: 5m
  repeat_interval: 4h
  receiver: 'default-receiver'
  # Child routes select a receiver based on the alert's `severity` label.
  # NOTE(review): `match` is deprecated in recent Alertmanager releases in
  # favour of `matchers` (e.g. `- severity="critical"`) — consider migrating
  # once the deployed version is confirmed to support it.
  routes:
    - match:
        severity: critical
      receiver: 'pagerduty-critical'
    - match:
        severity: warning
      receiver: 'slack-warnings'

receivers:
  - name: 'default-receiver'
    slack_configs:
      - channel: '#alerts'
        send_resolved: true

  - name: 'pagerduty-critical'
    pagerduty_configs:
      - service_key: '<key>'
        severity: critical

  - name: 'slack-warnings'
    slack_configs:
      - channel: '#warnings'

Grafana

Dashboard JSON Model

{
  "title": "API Dashboard",
  "panels": [
    {
      "title": "Request Rate",
      "type": "graph",
      "targets": [
        {
          "expr": "sum(rate(http_requests_total[5m])) by (service)",
          "legendFormat": "{{service}}"
        }
      ]
    },
    {
      "title": "Error Rate",
      "type": "stat",
      "targets": [
        {
          "expr": "sum(rate(http_requests_total{status=~\"5..\"}[5m])) / sum(rate(http_requests_total[5m]))",
          "legendFormat": "Error Rate"
        }
      ],
      "thresholds": {
        "steps": [
          {"color": "green", "value": null},
          {"color": "yellow", "value": 0.01},
          {"color": "red", "value": 0.05}
        ]
      }
    }
  ]
}

Dashboard as Code (Grafonnet)

local grafana = import 'grafonnet/grafana.libsonnet';
local dashboard = grafana.dashboard;
local row = grafana.row;
local prometheus = grafana.prometheus;
local graphPanel = grafana.graphPanel;

dashboard.new(
  'API Dashboard',
  time_from='now-1h',
)
.addPanel(
  graphPanel.new(
    'Request Rate',
    datasource='Prometheus',
  )
  .addTarget(
    prometheus.target(
      'sum(rate(http_requests_total[5m])) by (service)',
      legendFormat='{{service}}',
    )
  ),
  gridPos={x: 0, y: 0, w: 12, h: 8},
)

Logging

Structured Logging

{
  "timestamp": "2024-01-15T10:30:00Z",
  "level": "error",
  "message": "Failed to process order",
  "service": "order-service",
  "trace_id": "abc123",
  "span_id": "def456",
  "user_id": "user-789",
  "order_id": "order-101",
  "error": {
    "type": "ValidationError",
    "message": "Invalid payment method"
  }
}

Fluentd Configuration

# Kubernetes logs to Elasticsearch
<source>
  @type tail
  path /var/log/containers/*.log
  pos_file /var/log/fluentd-containers.log.pos
  tag kubernetes.*
  <parse>
    @type json
    time_key time
    time_format %Y-%m-%dT%H:%M:%S.%NZ
  </parse>
</source>

<filter kubernetes.**>
  @type kubernetes_metadata
  @id filter_kube_metadata
</filter>

<match kubernetes.**>
  @type elasticsearch
  host elasticsearch
  port 9200
  logstash_format true
  logstash_prefix kubernetes
  <buffer>
    @type file
    path /var/log/fluentd-buffers/kubernetes
    flush_mode interval
    flush_interval 5s
  </buffer>
</match>

Loki (Grafana Logging)

# Promtail config
# Ships pod logs to Loki, parsing each line as JSON to derive the `level`
# label and the entry timestamp.
clients:
  - url: http://loki:3100/loki/api/v1/push

scrape_configs:
  - job_name: kubernetes-pods
    kubernetes_sd_configs:
      - role: pod
    pipeline_stages:
      # Parse the log line as JSON and copy the listed fields into the
      # extracted map used by the stages that follow.
      - json:
          expressions:
            level: level
            message: message
            # Must be extracted here: the timestamp stage below looks up its
            # `source` by name in the extracted map and is a no-op otherwise.
            timestamp: timestamp
      # Promote the extracted `level` value to a Loki label.
      - labels:
          level:
      # Use the application's own timestamp instead of the time of ingestion.
      - timestamp:
          source: timestamp
          format: RFC3339Nano

LogQL Queries

# Filter by label
{namespace="production", app="api"}

# Text filter
{app="api"} |= "error"

# JSON parsing
{app="api"} | json | level="error"

# Aggregation (log lines per second, averaged over the last minute)
sum(rate({app="api"}[1m])) by (level)

# Pattern matching
{app="api"} |~ "user_id=\\d+"

Distributed Tracing

OpenTelemetry SDK

// Node.js instrumentation
// Registers a global tracer provider that exports spans over OTLP/gRPC to the
// collector and auto-instruments the HTTP and Express modules.
const { NodeTracerProvider } = require('@opentelemetry/sdk-trace-node');
const { BatchSpanProcessor } = require('@opentelemetry/sdk-trace-base');
const { OTLPTraceExporter } = require('@opentelemetry/exporter-trace-otlp-grpc');
const { registerInstrumentations } = require('@opentelemetry/instrumentation');
const { HttpInstrumentation } = require('@opentelemetry/instrumentation-http');
const { ExpressInstrumentation } = require('@opentelemetry/instrumentation-express');

const provider = new NodeTracerProvider();

// Batch finished spans and export them in the background. SimpleSpanProcessor
// issues one export call per span as it ends, which is intended for debugging
// rather than production traffic.
provider.addSpanProcessor(
  new BatchSpanProcessor(
    new OTLPTraceExporter({
      url: 'http://otel-collector:4317',
    })
  )
);

// Install this provider as the global tracer provider.
provider.register();

registerInstrumentations({
  instrumentations: [
    new HttpInstrumentation(),
    new ExpressInstrumentation(),
  ],
});

Jaeger Query Examples

# Find traces with errors
service=api AND status.code=ERROR

# Find slow traces
service=api AND duration>1s

# Find by tag
service=api AND user.id="123"

Trace Context Propagation

# Kubernetes deployment with trace context
spec:
  containers:
    - name: api
      env:
        - name: OTEL_EXPORTER_OTLP_ENDPOINT
          value: "http://otel-collector:4317"
        - name: OTEL_SERVICE_NAME
          value: "api-service"
        - name: OTEL_PROPAGATORS
          value: "tracecontext,baggage"

SLOs and SLIs

Defining SLIs

# SLI definitions
apiVersion: sloth.slok.dev/v1
kind: PrometheusServiceLevel
metadata:
  name: api-service
spec:
  service: "api"
  labels:
    team: platform
  slos:
    - name: "availability"
      objective: 99.9
      description: "API availability"
      sli:
        events:
          errorQuery: sum(rate(http_requests_total{status=~"5.."}[{{.window}}]))
          totalQuery: sum(rate(http_requests_total[{{.window}}]))
      alerting:
        pageAlert:
          disable: false
        ticketAlert:
          disable: false

    - name: "latency"
      objective: 99
      description: "API latency p99 < 200ms"
      sli:
        events:
          # Error events are requests SLOWER than 200ms: all requests minus
          # those that fell into the le="0.2" bucket. Using the le="0.2"
          # bucket on its own would count the fast (good) requests as errors.
          errorQuery: |
            sum(rate(http_request_duration_seconds_count[{{.window}}]))
            -
            sum(rate(http_request_duration_seconds_bucket{le="0.2"}[{{.window}}]))
          totalQuery: sum(rate(http_request_duration_seconds_count[{{.window}}]))

Error Budget

# Error budget remaining
1 - (
  (1 - (sum(rate(http_requests_total{status!~"5.."}[30d])) / sum(rate(http_requests_total[30d]))))
  /
  (1 - 0.999)  # SLO target
)

Golden Signals

Signal | Description | Metric Example
Latency | Request duration | histogram_quantile(0.99, sum(rate(http_request_duration_seconds_bucket[5m])) by (le))
Traffic | Request rate | sum(rate(http_requests_total[5m]))
Errors | Error rate | sum(rate(http_requests_total{status=~"5.."}[5m]))
Saturation | Resource utilization | container_memory_usage_bytes / container_spec_memory_limit_bytes

See also