mirror of
https://github.com/wshobson/agents.git
synced 2026-03-18 09:37:15 +00:00
style: format all files with prettier
This commit is contained in:
@@ -22,6 +22,7 @@ Track requests across distributed systems to understand latency, dependencies, a
|
||||
## Distributed Tracing Concepts
|
||||
|
||||
### Trace Structure
|
||||
|
||||
```
|
||||
Trace (Request ID: abc123)
|
||||
↓
|
||||
@@ -34,6 +35,7 @@ Span (api-gateway) [80ms]
|
||||
```
|
||||
|
||||
### Key Components
|
||||
|
||||
- **Trace** - End-to-end request journey
|
||||
- **Span** - Single operation within a trace
|
||||
- **Context** - Metadata propagated between services
|
||||
@@ -71,7 +73,7 @@ EOF
|
||||
### Docker Compose
|
||||
|
||||
```yaml
|
||||
version: '3.8'
|
||||
version: "3.8"
|
||||
services:
|
||||
jaeger:
|
||||
image: jaegertracing/all-in-one:latest
|
||||
@@ -80,10 +82,10 @@ services:
|
||||
- "6831:6831/udp"
|
||||
- "6832:6832/udp"
|
||||
- "5778:5778"
|
||||
- "16686:16686" # UI
|
||||
- "14268:14268" # Collector
|
||||
- "14250:14250" # gRPC
|
||||
- "9411:9411" # Zipkin
|
||||
- "16686:16686" # UI
|
||||
- "14268:14268" # Collector
|
||||
- "14250:14250" # gRPC
|
||||
- "9411:9411" # Zipkin
|
||||
environment:
|
||||
- COLLECTOR_ZIPKIN_HOST_PORT=:9411
|
||||
```
|
||||
@@ -95,6 +97,7 @@ services:
|
||||
### OpenTelemetry (Recommended)
|
||||
|
||||
#### Python (Flask)
|
||||
|
||||
```python
|
||||
from opentelemetry import trace
|
||||
from opentelemetry.exporter.jaeger.thrift import JaegerExporter
|
||||
@@ -139,21 +142,24 @@ def fetch_users_from_db():
|
||||
```
|
||||
|
||||
#### Node.js (Express)
|
||||
|
||||
```javascript
|
||||
const { NodeTracerProvider } = require('@opentelemetry/sdk-trace-node');
|
||||
const { JaegerExporter } = require('@opentelemetry/exporter-jaeger');
|
||||
const { BatchSpanProcessor } = require('@opentelemetry/sdk-trace-base');
|
||||
const { registerInstrumentations } = require('@opentelemetry/instrumentation');
|
||||
const { HttpInstrumentation } = require('@opentelemetry/instrumentation-http');
|
||||
const { ExpressInstrumentation } = require('@opentelemetry/instrumentation-express');
|
||||
const { NodeTracerProvider } = require("@opentelemetry/sdk-trace-node");
|
||||
const { JaegerExporter } = require("@opentelemetry/exporter-jaeger");
|
||||
const { BatchSpanProcessor } = require("@opentelemetry/sdk-trace-base");
|
||||
const { registerInstrumentations } = require("@opentelemetry/instrumentation");
|
||||
const { HttpInstrumentation } = require("@opentelemetry/instrumentation-http");
|
||||
const {
|
||||
ExpressInstrumentation,
|
||||
} = require("@opentelemetry/instrumentation-express");
|
||||
|
||||
// Initialize tracer
|
||||
const provider = new NodeTracerProvider({
|
||||
resource: { attributes: { 'service.name': 'my-service' } }
|
||||
resource: { attributes: { "service.name": "my-service" } },
|
||||
});
|
||||
|
||||
const exporter = new JaegerExporter({
|
||||
endpoint: 'http://jaeger:14268/api/traces'
|
||||
endpoint: "http://jaeger:14268/api/traces",
|
||||
});
|
||||
|
||||
provider.addSpanProcessor(new BatchSpanProcessor(exporter));
|
||||
@@ -161,22 +167,19 @@ provider.register();
|
||||
|
||||
// Instrument libraries
|
||||
registerInstrumentations({
|
||||
instrumentations: [
|
||||
new HttpInstrumentation(),
|
||||
new ExpressInstrumentation(),
|
||||
],
|
||||
instrumentations: [new HttpInstrumentation(), new ExpressInstrumentation()],
|
||||
});
|
||||
|
||||
const express = require('express');
|
||||
const express = require("express");
|
||||
const app = express();
|
||||
|
||||
app.get('/api/users', async (req, res) => {
|
||||
const tracer = trace.getTracer('my-service');
|
||||
const span = tracer.startSpan('get_users');
|
||||
app.get("/api/users", async (req, res) => {
|
||||
const tracer = trace.getTracer("my-service");
|
||||
const span = tracer.startSpan("get_users");
|
||||
|
||||
try {
|
||||
const users = await fetchUsers();
|
||||
span.setAttributes({ 'user.count': users.length });
|
||||
span.setAttributes({ "user.count": users.length });
|
||||
res.json({ users });
|
||||
} finally {
|
||||
span.end();
|
||||
@@ -185,6 +188,7 @@ app.get('/api/users', async (req, res) => {
|
||||
```
|
||||
|
||||
#### Go
|
||||
|
||||
```go
|
||||
package main
|
||||
|
||||
@@ -240,6 +244,7 @@ func getUsers(ctx context.Context) ([]User, error) {
|
||||
## Context Propagation
|
||||
|
||||
### HTTP Headers
|
||||
|
||||
```
|
||||
traceparent: 00-0af7651916cd43dd8448eb211c80319c-b7ad6b7169203331-01
|
||||
tracestate: congo=t61rcWkgMzE
|
||||
@@ -248,6 +253,7 @@ tracestate: congo=t61rcWkgMzE
|
||||
### Propagation in HTTP Requests
|
||||
|
||||
#### Python
|
||||
|
||||
```python
|
||||
from opentelemetry.propagate import inject
|
||||
|
||||
@@ -258,13 +264,14 @@ response = requests.get('http://downstream-service/api', headers=headers)
|
||||
```
|
||||
|
||||
#### Node.js
|
||||
|
||||
```javascript
|
||||
const { propagation } = require('@opentelemetry/api');
|
||||
const { propagation } = require("@opentelemetry/api");
|
||||
|
||||
const headers = {};
|
||||
propagation.inject(context.active(), headers);
|
||||
|
||||
axios.get('http://downstream-service/api', { headers });
|
||||
axios.get("http://downstream-service/api", { headers });
|
||||
```
|
||||
|
||||
## Tempo Setup (Grafana)
|
||||
@@ -312,17 +319,17 @@ spec:
|
||||
template:
|
||||
spec:
|
||||
containers:
|
||||
- name: tempo
|
||||
image: grafana/tempo:latest
|
||||
args:
|
||||
- -config.file=/etc/tempo/tempo.yaml
|
||||
volumeMounts:
|
||||
- name: config
|
||||
mountPath: /etc/tempo
|
||||
- name: tempo
|
||||
image: grafana/tempo:latest
|
||||
args:
|
||||
- -config.file=/etc/tempo/tempo.yaml
|
||||
volumeMounts:
|
||||
- name: config
|
||||
mountPath: /etc/tempo
|
||||
volumes:
|
||||
- name: config
|
||||
configMap:
|
||||
name: tempo-config
|
||||
- name: config
|
||||
configMap:
|
||||
name: tempo-config
|
||||
```
|
||||
|
||||
**Reference:** See `assets/jaeger-config.yaml.template`
|
||||
@@ -330,6 +337,7 @@ spec:
|
||||
## Sampling Strategies
|
||||
|
||||
### Probabilistic Sampling
|
||||
|
||||
```yaml
|
||||
# Sample 1% of traces
|
||||
sampler:
|
||||
@@ -338,6 +346,7 @@ sampler:
|
||||
```
|
||||
|
||||
### Rate Limiting Sampling
|
||||
|
||||
```yaml
|
||||
# Sample max 100 traces per second
|
||||
sampler:
|
||||
@@ -346,6 +355,7 @@ sampler:
|
||||
```
|
||||
|
||||
### Adaptive Sampling
|
||||
|
||||
```python
|
||||
from opentelemetry.sdk.trace.sampling import ParentBased, TraceIdRatioBased
|
||||
|
||||
@@ -358,6 +368,7 @@ sampler = ParentBased(root=TraceIdRatioBased(0.01))
|
||||
### Finding Slow Requests
|
||||
|
||||
**Jaeger Query:**
|
||||
|
||||
```
|
||||
service=my-service
|
||||
duration > 1s
|
||||
@@ -366,6 +377,7 @@ duration > 1s
|
||||
### Finding Errors
|
||||
|
||||
**Jaeger Query:**
|
||||
|
||||
```
|
||||
service=my-service
|
||||
error=true
|
||||
@@ -375,6 +387,7 @@ tags.http.status_code >= 500
|
||||
### Service Dependency Graph
|
||||
|
||||
Jaeger automatically generates service dependency graphs showing:
|
||||
|
||||
- Service relationships
|
||||
- Request rates
|
||||
- Error rates
|
||||
@@ -396,6 +409,7 @@ Jaeger automatically generates service dependency graphs showing:
|
||||
## Integration with Logging
|
||||
|
||||
### Correlated Logs
|
||||
|
||||
```python
|
||||
import logging
|
||||
from opentelemetry import trace
|
||||
@@ -415,12 +429,14 @@ def process_request():
|
||||
## Troubleshooting
|
||||
|
||||
**No traces appearing:**
|
||||
|
||||
- Check collector endpoint
|
||||
- Verify network connectivity
|
||||
- Check sampling configuration
|
||||
- Review application logs
|
||||
|
||||
**High latency overhead:**
|
||||
|
||||
- Reduce sampling rate
|
||||
- Use batch span processor
|
||||
- Check exporter configuration
|
||||
|
||||
@@ -22,6 +22,7 @@ Design effective Grafana dashboards for monitoring applications, infrastructure,
|
||||
## Dashboard Design Principles
|
||||
|
||||
### 1. Hierarchy of Information
|
||||
|
||||
```
|
||||
┌─────────────────────────────────────┐
|
||||
│ Critical Metrics (Big Numbers) │
|
||||
@@ -33,11 +34,13 @@ Design effective Grafana dashboards for monitoring applications, infrastructure,
|
||||
```
|
||||
|
||||
### 2. RED Method (Services)
|
||||
|
||||
- **Rate** - Requests per second
|
||||
- **Errors** - Error rate
|
||||
- **Duration** - Latency/response time
|
||||
|
||||
### 3. USE Method (Resources)
|
||||
|
||||
- **Utilization** - % time resource is busy
|
||||
- **Saturation** - Queue length/wait time
|
||||
- **Errors** - Error count
|
||||
@@ -63,7 +66,7 @@ Design effective Grafana dashboards for monitoring applications, infrastructure,
|
||||
"legendFormat": "{{service}}"
|
||||
}
|
||||
],
|
||||
"gridPos": {"x": 0, "y": 0, "w": 12, "h": 8}
|
||||
"gridPos": { "x": 0, "y": 0, "w": 12, "h": 8 }
|
||||
},
|
||||
{
|
||||
"title": "Error Rate %",
|
||||
@@ -77,14 +80,14 @@ Design effective Grafana dashboards for monitoring applications, infrastructure,
|
||||
"alert": {
|
||||
"conditions": [
|
||||
{
|
||||
"evaluator": {"params": [5], "type": "gt"},
|
||||
"operator": {"type": "and"},
|
||||
"query": {"params": ["A", "5m", "now"]},
|
||||
"evaluator": { "params": [5], "type": "gt" },
|
||||
"operator": { "type": "and" },
|
||||
"query": { "params": ["A", "5m", "now"] },
|
||||
"type": "query"
|
||||
}
|
||||
]
|
||||
},
|
||||
"gridPos": {"x": 12, "y": 0, "w": 12, "h": 8}
|
||||
"gridPos": { "x": 12, "y": 0, "w": 12, "h": 8 }
|
||||
},
|
||||
{
|
||||
"title": "P95 Latency",
|
||||
@@ -95,7 +98,7 @@ Design effective Grafana dashboards for monitoring applications, infrastructure,
|
||||
"legendFormat": "{{service}}"
|
||||
}
|
||||
],
|
||||
"gridPos": {"x": 0, "y": 8, "w": 24, "h": 8}
|
||||
"gridPos": { "x": 0, "y": 8, "w": 24, "h": 8 }
|
||||
}
|
||||
]
|
||||
}
|
||||
@@ -107,13 +110,16 @@ Design effective Grafana dashboards for monitoring applications, infrastructure,
|
||||
## Panel Types
|
||||
|
||||
### 1. Stat Panel (Single Value)
|
||||
|
||||
```json
|
||||
{
|
||||
"type": "stat",
|
||||
"title": "Total Requests",
|
||||
"targets": [{
|
||||
"expr": "sum(http_requests_total)"
|
||||
}],
|
||||
"targets": [
|
||||
{
|
||||
"expr": "sum(http_requests_total)"
|
||||
}
|
||||
],
|
||||
"options": {
|
||||
"reduceOptions": {
|
||||
"values": false,
|
||||
@@ -128,9 +134,9 @@ Design effective Grafana dashboards for monitoring applications, infrastructure,
|
||||
"thresholds": {
|
||||
"mode": "absolute",
|
||||
"steps": [
|
||||
{"value": 0, "color": "green"},
|
||||
{"value": 80, "color": "yellow"},
|
||||
{"value": 90, "color": "red"}
|
||||
{ "value": 0, "color": "green" },
|
||||
{ "value": 80, "color": "yellow" },
|
||||
{ "value": 90, "color": "red" }
|
||||
]
|
||||
}
|
||||
}
|
||||
@@ -139,35 +145,41 @@ Design effective Grafana dashboards for monitoring applications, infrastructure,
|
||||
```
|
||||
|
||||
### 2. Time Series Graph
|
||||
|
||||
```json
|
||||
{
|
||||
"type": "graph",
|
||||
"title": "CPU Usage",
|
||||
"targets": [{
|
||||
"expr": "100 - (avg by (instance) (rate(node_cpu_seconds_total{mode=\"idle\"}[5m])) * 100)"
|
||||
}],
|
||||
"targets": [
|
||||
{
|
||||
"expr": "100 - (avg by (instance) (rate(node_cpu_seconds_total{mode=\"idle\"}[5m])) * 100)"
|
||||
}
|
||||
],
|
||||
"yaxes": [
|
||||
{"format": "percent", "max": 100, "min": 0},
|
||||
{"format": "short"}
|
||||
{ "format": "percent", "max": 100, "min": 0 },
|
||||
{ "format": "short" }
|
||||
]
|
||||
}
|
||||
```
|
||||
|
||||
### 3. Table Panel
|
||||
|
||||
```json
|
||||
{
|
||||
"type": "table",
|
||||
"title": "Service Status",
|
||||
"targets": [{
|
||||
"expr": "up",
|
||||
"format": "table",
|
||||
"instant": true
|
||||
}],
|
||||
"targets": [
|
||||
{
|
||||
"expr": "up",
|
||||
"format": "table",
|
||||
"instant": true
|
||||
}
|
||||
],
|
||||
"transformations": [
|
||||
{
|
||||
"id": "organize",
|
||||
"options": {
|
||||
"excludeByName": {"Time": true},
|
||||
"excludeByName": { "Time": true },
|
||||
"indexByName": {},
|
||||
"renameByName": {
|
||||
"instance": "Instance",
|
||||
@@ -181,14 +193,17 @@ Design effective Grafana dashboards for monitoring applications, infrastructure,
|
||||
```
|
||||
|
||||
### 4. Heatmap
|
||||
|
||||
```json
|
||||
{
|
||||
"type": "heatmap",
|
||||
"title": "Latency Heatmap",
|
||||
"targets": [{
|
||||
"expr": "sum(rate(http_request_duration_seconds_bucket[5m])) by (le)",
|
||||
"format": "heatmap"
|
||||
}],
|
||||
"targets": [
|
||||
{
|
||||
"expr": "sum(rate(http_request_duration_seconds_bucket[5m])) by (le)",
|
||||
"format": "heatmap"
|
||||
}
|
||||
],
|
||||
"dataFormat": "tsbuckets",
|
||||
"yAxis": {
|
||||
"format": "s"
|
||||
@@ -199,6 +214,7 @@ Design effective Grafana dashboards for monitoring applications, infrastructure,
|
||||
## Variables
|
||||
|
||||
### Query Variables
|
||||
|
||||
```json
|
||||
{
|
||||
"templating": {
|
||||
@@ -225,6 +241,7 @@ Design effective Grafana dashboards for monitoring applications, infrastructure,
|
||||
```
|
||||
|
||||
### Use Variables in Queries
|
||||
|
||||
```
|
||||
sum(rate(http_requests_total{namespace="$namespace", service=~"$service"}[5m]))
|
||||
```
|
||||
@@ -241,11 +258,11 @@ sum(rate(http_requests_total{namespace="$namespace", service=~"$service"}[5m]))
|
||||
"params": [5],
|
||||
"type": "gt"
|
||||
},
|
||||
"operator": {"type": "and"},
|
||||
"operator": { "type": "and" },
|
||||
"query": {
|
||||
"params": ["A", "5m", "now"]
|
||||
},
|
||||
"reducer": {"type": "avg"},
|
||||
"reducer": { "type": "avg" },
|
||||
"type": "query"
|
||||
}
|
||||
],
|
||||
@@ -254,9 +271,7 @@ sum(rate(http_requests_total{namespace="$namespace", service=~"$service"}[5m]))
|
||||
"frequency": "1m",
|
||||
"message": "Error rate is above 5%",
|
||||
"noDataState": "no_data",
|
||||
"notifications": [
|
||||
{"uid": "slack-channel"}
|
||||
]
|
||||
"notifications": [{ "uid": "slack-channel" }]
|
||||
}
|
||||
}
|
||||
```
|
||||
@@ -264,13 +279,14 @@ sum(rate(http_requests_total{namespace="$namespace", service=~"$service"}[5m]))
|
||||
## Dashboard Provisioning
|
||||
|
||||
**dashboards.yml:**
|
||||
|
||||
```yaml
|
||||
apiVersion: 1
|
||||
|
||||
providers:
|
||||
- name: 'default'
|
||||
- name: "default"
|
||||
orgId: 1
|
||||
folder: 'General'
|
||||
folder: "General"
|
||||
type: file
|
||||
disableDeletion: false
|
||||
updateIntervalSeconds: 10
|
||||
@@ -284,6 +300,7 @@ providers:
|
||||
### Infrastructure Dashboard
|
||||
|
||||
**Key Panels:**
|
||||
|
||||
- CPU utilization per node
|
||||
- Memory usage per node
|
||||
- Disk I/O
|
||||
@@ -296,6 +313,7 @@ providers:
|
||||
### Database Dashboard
|
||||
|
||||
**Key Panels:**
|
||||
|
||||
- Queries per second
|
||||
- Connection pool usage
|
||||
- Query latency (P50, P95, P99)
|
||||
@@ -309,6 +327,7 @@ providers:
|
||||
### Application Dashboard
|
||||
|
||||
**Key Panels:**
|
||||
|
||||
- Request rate
|
||||
- Error rate
|
||||
- Response time (percentiles)
|
||||
|
||||
@@ -55,7 +55,7 @@ helm install prometheus prometheus-community/kube-prometheus-stack \
|
||||
### Docker Compose
|
||||
|
||||
```yaml
|
||||
version: '3.8'
|
||||
version: "3.8"
|
||||
services:
|
||||
prometheus:
|
||||
image: prom/prometheus:latest
|
||||
@@ -65,9 +65,9 @@ services:
|
||||
- ./prometheus.yml:/etc/prometheus/prometheus.yml
|
||||
- prometheus-data:/prometheus
|
||||
command:
|
||||
- '--config.file=/etc/prometheus/prometheus.yml'
|
||||
- '--storage.tsdb.path=/prometheus'
|
||||
- '--storage.tsdb.retention.time=30d'
|
||||
- "--config.file=/etc/prometheus/prometheus.yml"
|
||||
- "--storage.tsdb.path=/prometheus"
|
||||
- "--storage.tsdb.retention.time=30d"
|
||||
|
||||
volumes:
|
||||
prometheus-data:
|
||||
@@ -76,20 +76,21 @@ volumes:
|
||||
## Configuration File
|
||||
|
||||
**prometheus.yml:**
|
||||
|
||||
```yaml
|
||||
global:
|
||||
scrape_interval: 15s
|
||||
evaluation_interval: 15s
|
||||
external_labels:
|
||||
cluster: 'production'
|
||||
region: 'us-west-2'
|
||||
cluster: "production"
|
||||
region: "us-west-2"
|
||||
|
||||
# Alertmanager configuration
|
||||
alerting:
|
||||
alertmanagers:
|
||||
- static_configs:
|
||||
- targets:
|
||||
- alertmanager:9093
|
||||
- alertmanager:9093
|
||||
|
||||
# Load rules files
|
||||
rule_files:
|
||||
@@ -98,25 +99,25 @@ rule_files:
|
||||
# Scrape configurations
|
||||
scrape_configs:
|
||||
# Prometheus itself
|
||||
- job_name: 'prometheus'
|
||||
- job_name: "prometheus"
|
||||
static_configs:
|
||||
- targets: ['localhost:9090']
|
||||
- targets: ["localhost:9090"]
|
||||
|
||||
# Node exporters
|
||||
- job_name: 'node-exporter'
|
||||
- job_name: "node-exporter"
|
||||
static_configs:
|
||||
- targets:
|
||||
- 'node1:9100'
|
||||
- 'node2:9100'
|
||||
- 'node3:9100'
|
||||
- "node1:9100"
|
||||
- "node2:9100"
|
||||
- "node3:9100"
|
||||
relabel_configs:
|
||||
- source_labels: [__address__]
|
||||
target_label: instance
|
||||
regex: '([^:]+)(:[0-9]+)?'
|
||||
replacement: '${1}'
|
||||
regex: "([^:]+)(:[0-9]+)?"
|
||||
replacement: "${1}"
|
||||
|
||||
# Kubernetes pods with annotations
|
||||
- job_name: 'kubernetes-pods'
|
||||
- job_name: "kubernetes-pods"
|
||||
kubernetes_sd_configs:
|
||||
- role: pod
|
||||
relabel_configs:
|
||||
@@ -127,7 +128,8 @@ scrape_configs:
|
||||
action: replace
|
||||
target_label: __metrics_path__
|
||||
regex: (.+)
|
||||
- source_labels: [__address__, __meta_kubernetes_pod_annotation_prometheus_io_port]
|
||||
- source_labels:
|
||||
[__address__, __meta_kubernetes_pod_annotation_prometheus_io_port]
|
||||
action: replace
|
||||
regex: ([^:]+)(?::\d+)?;(\d+)
|
||||
replacement: $1:$2
|
||||
@@ -140,13 +142,13 @@ scrape_configs:
|
||||
target_label: pod
|
||||
|
||||
# Application metrics
|
||||
- job_name: 'my-app'
|
||||
- job_name: "my-app"
|
||||
static_configs:
|
||||
- targets:
|
||||
- 'app1.example.com:9090'
|
||||
- 'app2.example.com:9090'
|
||||
metrics_path: '/metrics'
|
||||
scheme: 'https'
|
||||
- "app1.example.com:9090"
|
||||
- "app2.example.com:9090"
|
||||
metrics_path: "/metrics"
|
||||
scheme: "https"
|
||||
tls_config:
|
||||
ca_file: /etc/prometheus/ca.crt
|
||||
cert_file: /etc/prometheus/client.crt
|
||||
@@ -161,27 +163,28 @@ scrape_configs:
|
||||
|
||||
```yaml
|
||||
scrape_configs:
|
||||
- job_name: 'static-targets'
|
||||
- job_name: "static-targets"
|
||||
static_configs:
|
||||
- targets: ['host1:9100', 'host2:9100']
|
||||
- targets: ["host1:9100", "host2:9100"]
|
||||
labels:
|
||||
env: 'production'
|
||||
region: 'us-west-2'
|
||||
env: "production"
|
||||
region: "us-west-2"
|
||||
```
|
||||
|
||||
### File-based Service Discovery
|
||||
|
||||
```yaml
|
||||
scrape_configs:
|
||||
- job_name: 'file-sd'
|
||||
- job_name: "file-sd"
|
||||
file_sd_configs:
|
||||
- files:
|
||||
- /etc/prometheus/targets/*.json
|
||||
- /etc/prometheus/targets/*.yml
|
||||
- /etc/prometheus/targets/*.json
|
||||
- /etc/prometheus/targets/*.yml
|
||||
refresh_interval: 5m
|
||||
```
|
||||
|
||||
**targets/production.json:**
|
||||
|
||||
```json
|
||||
[
|
||||
{
|
||||
@@ -198,14 +201,16 @@ scrape_configs:
|
||||
|
||||
```yaml
|
||||
scrape_configs:
|
||||
- job_name: 'kubernetes-services'
|
||||
- job_name: "kubernetes-services"
|
||||
kubernetes_sd_configs:
|
||||
- role: service
|
||||
relabel_configs:
|
||||
- source_labels: [__meta_kubernetes_service_annotation_prometheus_io_scrape]
|
||||
- source_labels:
|
||||
[__meta_kubernetes_service_annotation_prometheus_io_scrape]
|
||||
action: keep
|
||||
regex: true
|
||||
- source_labels: [__meta_kubernetes_service_annotation_prometheus_io_scheme]
|
||||
- source_labels:
|
||||
[__meta_kubernetes_service_annotation_prometheus_io_scheme]
|
||||
action: replace
|
||||
target_label: __scheme__
|
||||
regex: (https?)
|
||||
@@ -364,16 +369,19 @@ promtool query instant http://localhost:9090 'up'
|
||||
## Troubleshooting
|
||||
|
||||
**Check scrape targets:**
|
||||
|
||||
```bash
|
||||
curl http://localhost:9090/api/v1/targets
|
||||
```
|
||||
|
||||
**Check configuration:**
|
||||
|
||||
```bash
|
||||
curl http://localhost:9090/api/v1/status/config
|
||||
```
|
||||
|
||||
**Test query:**
|
||||
|
||||
```bash
|
||||
curl 'http://localhost:9090/api/v1/query?query=up'
|
||||
```
|
||||
|
||||
@@ -35,6 +35,7 @@ SLI (Service Level Indicator)
|
||||
### Common SLI Types
|
||||
|
||||
#### 1. Availability SLI
|
||||
|
||||
```promql
|
||||
# Successful requests / Total requests
|
||||
sum(rate(http_requests_total{status!~"5.."}[28d]))
|
||||
@@ -43,6 +44,7 @@ sum(rate(http_requests_total[28d]))
|
||||
```
|
||||
|
||||
#### 2. Latency SLI
|
||||
|
||||
```promql
|
||||
# Requests below latency threshold / Total requests
|
||||
sum(rate(http_request_duration_seconds_bucket{le="0.5"}[28d]))
|
||||
@@ -51,6 +53,7 @@ sum(rate(http_request_duration_seconds_count[28d]))
|
||||
```
|
||||
|
||||
#### 3. Durability SLI
|
||||
|
||||
```promql
|
||||
# Successful writes / Total writes
|
||||
sum(storage_writes_successful_total)
|
||||
@@ -64,16 +67,17 @@ sum(storage_writes_total)
|
||||
|
||||
### Availability SLO Examples
|
||||
|
||||
| SLO % | Downtime/Month | Downtime/Year |
|
||||
|-------|----------------|---------------|
|
||||
| 99% | 7.2 hours | 3.65 days |
|
||||
| 99.9% | 43.2 minutes | 8.76 hours |
|
||||
| 99.95%| 21.6 minutes | 4.38 hours |
|
||||
| 99.99%| 4.32 minutes | 52.56 minutes |
|
||||
| SLO % | Downtime/Month | Downtime/Year |
|
||||
| ------ | -------------- | ------------- |
|
||||
| 99% | 7.2 hours | 3.65 days |
|
||||
| 99.9% | 43.2 minutes | 8.76 hours |
|
||||
| 99.95% | 21.6 minutes | 4.38 hours |
|
||||
| 99.99% | 4.32 minutes | 52.56 minutes |
|
||||
|
||||
### Choose Appropriate SLOs
|
||||
|
||||
**Consider:**
|
||||
|
||||
- User expectations
|
||||
- Business requirements
|
||||
- Current performance
|
||||
@@ -81,6 +85,7 @@ sum(storage_writes_total)
|
||||
- Competitor benchmarks
|
||||
|
||||
**Example SLOs:**
|
||||
|
||||
```yaml
|
||||
slos:
|
||||
- name: api_availability
|
||||
@@ -109,6 +114,7 @@ Error Budget = 1 - SLO Target
|
||||
```
|
||||
|
||||
**Example:**
|
||||
|
||||
- SLO: 99.9% availability
|
||||
- Error Budget: 0.1% = 43.2 minutes/month
|
||||
- Current Error: 0.05% = 21.6 minutes/month
|
||||
@@ -287,18 +293,21 @@ rules:
|
||||
## SLO Review Process
|
||||
|
||||
### Weekly Review
|
||||
|
||||
- Current SLO compliance
|
||||
- Error budget status
|
||||
- Trend analysis
|
||||
- Incident impact
|
||||
|
||||
### Monthly Review
|
||||
|
||||
- SLO achievement
|
||||
- Error budget usage
|
||||
- Incident postmortems
|
||||
- SLO adjustments
|
||||
|
||||
### Quarterly Review
|
||||
|
||||
- SLO relevance
|
||||
- Target adjustments
|
||||
- Process improvements
|
||||
|
||||
Reference in New Issue
Block a user