feat: add 5 new specialized agents with 20 skills

Add domain expert agents with comprehensive skill sets:
- service-mesh-expert (cloud-infrastructure): Istio/Linkerd patterns, mTLS, observability
- event-sourcing-architect (backend-development): CQRS, event stores, projections, sagas
- vector-database-engineer (llm-application-dev): embeddings, similarity search, hybrid search
- monorepo-architect (developer-essentials): Nx, Turborepo, Bazel, pnpm workspaces
- threat-modeling-expert (security-scanning): STRIDE, attack trees, security requirements

Update all documentation to reflect correct counts:
- 67 plugins, 99 agents, 107 skills, 71 commands
This commit is contained in:
Seth Hobson
2025-12-16 16:00:58 -05:00
parent c7ad381360
commit 01d93fc227
58 changed files with 24830 additions and 50 deletions

View File

@@ -6,7 +6,7 @@
"url": "https://github.com/wshobson"
},
"metadata": {
"description": "Production-ready workflow orchestration with 65 focused plugins, 91 specialized agents, and 45 tools - optimized for granular installation and minimal token usage",
"description": "Production-ready workflow orchestration with 67 focused plugins, 99 specialized agents, and 107 skills - optimized for granular installation and minimal token usage",
"version": "1.3.0"
},
"plugins": [
@@ -129,14 +129,19 @@
"./agents/backend-architect.md",
"./agents/graphql-architect.md",
"./agents/tdd-orchestrator.md",
"./agents/temporal-python-pro.md"
"./agents/temporal-python-pro.md",
"./agents/event-sourcing-architect.md"
],
"skills": [
"./skills/api-design-principles",
"./skills/architecture-patterns",
"./skills/microservices-patterns",
"./skills/workflow-orchestration-patterns",
"./skills/temporal-python-testing"
"./skills/temporal-python-testing",
"./skills/event-store-design",
"./skills/projection-patterns",
"./skills/saga-orchestration",
"./skills/cqrs-implementation"
]
},
{
@@ -166,6 +171,12 @@
"agents": [
"./agents/frontend-developer.md",
"./agents/mobile-developer.md"
],
"skills": [
"./skills/react-state-management",
"./skills/nextjs-app-router-patterns",
"./skills/react-native-architecture",
"./skills/tailwind-design-system"
]
},
{
@@ -431,13 +442,18 @@
],
"agents": [
"./agents/ai-engineer.md",
"./agents/prompt-engineer.md"
"./agents/prompt-engineer.md",
"./agents/vector-database-engineer.md"
],
"skills": [
"./skills/langchain-architecture",
"./skills/llm-evaluation",
"./skills/prompt-engineering-patterns",
"./skills/rag-implementation"
"./skills/rag-implementation",
"./skills/embedding-strategies",
"./skills/similarity-search-patterns",
"./skills/vector-index-tuning",
"./skills/hybrid-search-implementation"
]
},
{
@@ -558,6 +574,12 @@
"agents": [
"./agents/data-engineer.md",
"./agents/backend-architect.md"
],
"skills": [
"./skills/dbt-transformation-patterns",
"./skills/airflow-dag-patterns",
"./skills/spark-optimization",
"./skills/data-quality-frameworks"
]
},
{
@@ -587,6 +609,11 @@
"agents": [
"./agents/incident-responder.md",
"./agents/devops-troubleshooter.md"
],
"skills": [
"./skills/incident-runbook-templates",
"./skills/postmortem-writing",
"./skills/on-call-handoff-patterns"
]
},
{
@@ -805,13 +832,18 @@
"./agents/hybrid-cloud-architect.md",
"./agents/terraform-specialist.md",
"./agents/network-engineer.md",
"./agents/deployment-engineer.md"
"./agents/deployment-engineer.md",
"./agents/service-mesh-expert.md"
],
"skills": [
"./skills/cost-optimization",
"./skills/hybrid-cloud-networking",
"./skills/multi-cloud-architecture",
"./skills/terraform-module-library"
"./skills/terraform-module-library",
"./skills/istio-traffic-management",
"./skills/linkerd-patterns",
"./skills/mtls-configuration",
"./skills/service-mesh-observability"
]
},
{
@@ -1123,10 +1155,15 @@
"./commands/security-dependencies.md"
],
"agents": [
"./agents/security-auditor.md"
"./agents/security-auditor.md",
"./agents/threat-modeling-expert.md"
],
"skills": [
"./skills/sast-configuration"
"./skills/sast-configuration",
"./skills/stride-analysis-patterns",
"./skills/attack-tree-construction",
"./skills/security-requirement-extraction",
"./skills/threat-mitigation-mapping"
]
},
{
@@ -1417,6 +1454,11 @@
"./agents/mermaid-expert.md",
"./agents/tutorial-engineer.md",
"./agents/reference-builder.md"
],
"skills": [
"./skills/openapi-spec-generation",
"./skills/architecture-decision-records",
"./skills/changelog-automation"
]
},
{
@@ -1512,6 +1554,10 @@
"commands": [],
"agents": [
"./agents/business-analyst.md"
],
"skills": [
"./skills/kpi-dashboard-design",
"./skills/data-storytelling"
]
},
{
@@ -1541,6 +1587,10 @@
"agents": [
"./agents/hr-pro.md",
"./agents/legal-advisor.md"
],
"skills": [
"./skills/gdpr-data-handling",
"./skills/employment-contract-templates"
]
},
{
@@ -1655,6 +1705,10 @@
"agents": [
"./agents/quant-analyst.md",
"./agents/risk-manager.md"
],
"skills": [
"./skills/backtesting-frameworks",
"./skills/risk-metrics-calculation"
]
},
{
@@ -1717,6 +1771,10 @@
"agents": [
"./agents/unity-developer.md",
"./agents/minecraft-bukkit-pro.md"
],
"skills": [
"./skills/unity-ecs-patterns",
"./skills/godot-gdscript-patterns"
]
},
{
@@ -1745,6 +1803,10 @@
],
"agents": [
"./agents/ui-visual-validator.md"
],
"skills": [
"./skills/wcag-audit-patterns",
"./skills/screen-reader-testing"
]
},
{
@@ -1849,6 +1911,11 @@
"./agents/golang-pro.md",
"./agents/c-pro.md",
"./agents/cpp-pro.md"
],
"skills": [
"./skills/rust-async-patterns",
"./skills/go-concurrency-patterns",
"./skills/memory-safety-patterns"
]
},
{
@@ -2048,7 +2115,9 @@
"category": "development",
"strict": false,
"commands": [],
"agents": [],
"agents": [
"./agents/monorepo-architect.md"
],
"skills": [
"./skills/git-advanced-workflows",
"./skills/sql-optimization-patterns",
@@ -2057,7 +2126,11 @@
"./skills/e2e-testing-patterns",
"./skills/auth-implementation-patterns",
"./skills/debugging-strategies",
"./skills/monorepo-management"
"./skills/monorepo-management",
"./skills/nx-workspace-patterns",
"./skills/turborepo-caching",
"./skills/bazel-build-optimization",
"./skills/monorepo-dependency-management"
]
}
]

View File

@@ -4,26 +4,26 @@
[![Run in Smithery](https://smithery.ai/badge/skills/wshobson)](https://smithery.ai/skills?ns=wshobson&utm_source=github&utm_medium=badge)
> **🎯 Agent Skills Enabled** — 47 specialized skills extend Claude's capabilities across plugins with progressive disclosure
> **🎯 Agent Skills Enabled** — 107 specialized skills extend Claude's capabilities across plugins with progressive disclosure
A comprehensive production-ready system combining **91 specialized AI agents**, **15 multi-agent workflow orchestrators**, **47 agent skills**, and **45 development tools** organized into **65 focused, single-purpose plugins** for [Claude Code](https://docs.claude.com/en/docs/claude-code/overview).
A comprehensive production-ready system combining **99 specialized AI agents**, **15 multi-agent workflow orchestrators**, **107 agent skills**, and **71 development tools** organized into **67 focused, single-purpose plugins** for [Claude Code](https://docs.claude.com/en/docs/claude-code/overview).
## Overview
This unified repository provides everything needed for intelligent automation and multi-agent orchestration across modern software development:
- **65 Focused Plugins** - Granular, single-purpose plugins optimized for minimal token usage and composability
- **91 Specialized Agents** - Domain experts with deep knowledge across architecture, languages, infrastructure, quality, data/AI, documentation, business operations, and SEO
- **47 Agent Skills** - Modular knowledge packages with progressive disclosure for specialized expertise
- **67 Focused Plugins** - Granular, single-purpose plugins optimized for minimal token usage and composability
- **99 Specialized Agents** - Domain experts with deep knowledge across architecture, languages, infrastructure, quality, data/AI, documentation, business operations, and SEO
- **107 Agent Skills** - Modular knowledge packages with progressive disclosure for specialized expertise
- **15 Workflow Orchestrators** - Multi-agent coordination systems for complex operations like full-stack development, security hardening, ML pipelines, and incident response
- **45 Development Tools** - Optimized utilities including project scaffolding, security scanning, test automation, and infrastructure setup
- **71 Development Tools** - Optimized utilities including project scaffolding, security scanning, test automation, and infrastructure setup
### Key Features
- **Granular Plugin Architecture**: 65 focused plugins optimized for minimal token usage
- **Comprehensive Tooling**: 45 development tools including test generation, scaffolding, and security scanning
- **Granular Plugin Architecture**: 67 focused plugins optimized for minimal token usage
- **Comprehensive Tooling**: 71 development tools including test generation, scaffolding, and security scanning
- **100% Agent Coverage**: All plugins include specialized agents
- **Agent Skills**: 47 specialized skills following for progressive disclosure and token efficiency
- **Agent Skills**: 107 specialized skills designed for progressive disclosure and token efficiency
- **Clear Organization**: 23 categories with 1-6 plugins each for easy discovery
- **Efficient Design**: Average 3.4 components per plugin (follows Anthropic's 2-8 pattern)
@@ -49,7 +49,7 @@ Add this marketplace to Claude Code:
/plugin marketplace add wshobson/agents
```
This makes all 65 plugins available for installation, but **does not load any agents or tools** into your context.
This makes all 67 plugins available for installation, but **does not load any agents or tools** into your context.
### Step 2: Install Plugins
@@ -85,9 +85,9 @@ Each installed plugin loads **only its specific agents, commands, and skills** i
### Core Guides
- **[Plugin Reference](docs/plugins.md)** - Complete catalog of all 65 plugins
- **[Agent Reference](docs/agents.md)** - All 91 agents organized by category
- **[Agent Skills](docs/agent-skills.md)** - 47 specialized skills with progressive disclosure
- **[Plugin Reference](docs/plugins.md)** - Complete catalog of all 67 plugins
- **[Agent Reference](docs/agents.md)** - All 99 agents organized by category
- **[Agent Skills](docs/agent-skills.md)** - 107 specialized skills with progressive disclosure
- **[Usage Guide](docs/usage.md)** - Commands, workflows, and best practices
- **[Architecture](docs/architecture.md)** - Design principles and patterns
@@ -101,7 +101,7 @@ Each installed plugin loads **only its specific agents, commands, and skills** i
## What's New
### Agent Skills (47 skills across 14 plugins)
### Agent Skills (107 skills across 18 plugins)
Specialized knowledge packages following Anthropic's progressive disclosure architecture:
@@ -205,7 +205,7 @@ Uses kubernetes-architect agent with 4 specialized skills for production-grade c
## Plugin Categories
**23 categories, 65 plugins:**
**23 categories, 67 plugins:**
- 🎨 **Development** (4) - debugging, backend, frontend, multi-platform
- 📚 **Documentation** (3) - code docs, API specs, diagrams, C4 architecture
@@ -237,7 +237,7 @@ Uses kubernetes-architect agent with 4 specialized skills for production-grade c
- **Single responsibility** - Each plugin does one thing well
- **Minimal token usage** - Average 3.4 components per plugin
- **Composable** - Mix and match for complex workflows
- **100% coverage** - All 91 agents accessible across plugins
- **100% coverage** - All 99 agents accessible across plugins
### Progressive Disclosure (Skills)
@@ -251,7 +251,7 @@ Three-tier architecture for token efficiency:
```
claude-agents/
├── .claude-plugin/
│ └── marketplace.json # 65 plugins
│ └── marketplace.json # 67 plugins
├── plugins/
│ ├── python-development/
│ │ ├── agents/ # 3 Python experts
@@ -261,7 +261,7 @@ claude-agents/
│ │ ├── agents/ # K8s architect
│ │ ├── commands/ # Deployment tools
│ │ └── skills/ # 4 K8s skills
│ └── ... (63 more plugins)
│ └── ... (65 more plugins)
├── docs/ # Comprehensive documentation
└── README.md # This file
```

View File

@@ -1,6 +1,6 @@
# Agent Skills
Agent Skills are modular packages that extend Claude's capabilities with specialized domain knowledge, following Anthropic's [Agent Skills Specification](https://github.com/anthropics/skills/blob/main/agent_skills_spec.md). This plugin ecosystem includes **57 specialized skills** across 15 plugins, enabling progressive disclosure and efficient token usage.
Agent Skills are modular packages that extend Claude's capabilities with specialized domain knowledge, following Anthropic's [Agent Skills Specification](https://github.com/anthropics/skills/blob/main/agent_skills_spec.md). This plugin ecosystem includes **107 specialized skills** across 18 plugins, enabling progressive disclosure and efficient token usage.
## Overview
@@ -187,7 +187,7 @@ fastapi-templates skill → Supplies production-ready templates
## Specification Compliance
All 55 skills follow the [Agent Skills Specification](https://github.com/anthropics/skills/blob/main/agent_skills_spec.md):
All 107 skills follow the [Agent Skills Specification](https://github.com/anthropics/skills/blob/main/agent_skills_spec.md):
- ✓ Required `name` field (hyphen-case)
- ✓ Required `description` field with "Use when" clause

View File

@@ -1,6 +1,6 @@
# Agent Reference
Complete reference for all **91 specialized AI agents** organized by category with model assignments.
Complete reference for all **99 specialized AI agents** organized by category with model assignments.
## Agent Categories

View File

@@ -36,7 +36,7 @@ This marketplace follows industry best practices with a focus on granularity, co
### Plugin Distribution
- **65 focused plugins** optimized for specific use cases
- **67 focused plugins** optimized for specific use cases
- **23 clear categories** with 1-6 plugins each for easy discovery
- Organized by domain:
- **Development**: 4 plugins (debugging, backend, frontend, multi-platform)
@@ -48,17 +48,17 @@ This marketplace follows industry best practices with a focus on granularity, co
### Component Breakdown
**91 Specialized Agents**
**99 Specialized Agents**
- Domain experts with deep knowledge
- Organized across architecture, languages, infrastructure, quality, data/AI, documentation, business, and SEO
- Model-optimized (48 Haiku, 100 Sonnet) for performance and cost
- Model-optimized with three-tier strategy (Opus, Sonnet, Haiku) for performance and cost
**15 Workflow Orchestrators**
- Multi-agent coordination systems
- Complex operations like full-stack development, security hardening, ML pipelines, incident response
- Pre-configured agent workflows
**44 Development Tools**
**71 Development Tools**
- Optimized utilities including:
- Project scaffolding (Python, TypeScript, Rust)
- Security scanning (SAST, dependency audit, XSS)
@@ -66,10 +66,10 @@ This marketplace follows industry best practices with a focus on granularity, co
- Component scaffolding (React, React Native)
- Infrastructure setup (Terraform, Kubernetes)
**47 Agent Skills**
**107 Agent Skills**
- Modular knowledge packages
- Progressive disclosure architecture
- Domain-specific expertise across 14 plugins
- Domain-specific expertise across 18 plugins
- Spec-compliant (Anthropic Agent Skills Specification)
## Repository Structure
@@ -77,7 +77,7 @@ This marketplace follows industry best practices with a focus on granularity, co
```
claude-agents/
├── .claude-plugin/
│ └── marketplace.json # Marketplace catalog (65 plugins)
│ └── marketplace.json # Marketplace catalog (67 plugins)
├── plugins/ # Isolated plugin directories
│ ├── python-development/
│ │ ├── agents/ # Python language agents
@@ -120,7 +120,7 @@ claude-agents/
│ │ │ └── c4-context.md
│ │ └── commands/
│ │ └── c4-architecture.md
│ └── ... (60 more isolated plugins)
│ └── ... (62 more isolated plugins)
├── docs/ # Documentation
│ ├── agent-skills.md # Agent Skills guide
│ ├── agents.md # Agent reference
@@ -191,7 +191,7 @@ description: What the skill does. Use when [trigger]. # Required: < 1024 chars
- **Composability**: Mix and match skills across workflows
- **Maintainability**: Isolated updates don't affect other skills
See [Agent Skills](./agent-skills.md) for complete details on the 47 skills.
See [Agent Skills](./agent-skills.md) for complete details on the 107 skills.
## Model Configuration Strategy
@@ -201,8 +201,9 @@ The system uses Claude Opus and Sonnet models strategically:
| Model | Count | Use Case |
|-------|-------|----------|
| Haiku | 48 agents | Fast execution, deterministic tasks |
| Sonnet | 100 agents | Complex reasoning, architecture decisions |
| Opus | 42 agents | Critical architecture, security, code review |
| Sonnet | 39 agents | Complex tasks, support with intelligence |
| Haiku | 18 agents | Fast operational tasks |
### Selection Criteria
@@ -255,7 +256,7 @@ code-reviewer (Sonnet) validates architecture
### Component Coverage
- **100% agent coverage** - all plugins include at least one agent
- **100% component availability** - all 91 agents accessible across plugins
- **100% component availability** - all 99 agents accessible across plugins
- **Efficient distribution** - 3.4 components per plugin average
### Discoverability
@@ -383,5 +384,5 @@ Feature Development Workflow:
- [Agent Skills](./agent-skills.md) - Modular knowledge packages
- [Agent Reference](./agents.md) - Complete agent catalog
- [Plugin Reference](./plugins.md) - All 65 plugins
- [Plugin Reference](./plugins.md) - All 67 plugins
- [Usage Guide](./usage.md) - Commands and workflows

View File

@@ -1,6 +1,6 @@
# Complete Plugin Reference
Browse all **65 focused, single-purpose plugins** organized by category.
Browse all **67 focused, single-purpose plugins** organized by category.
## Quick Start - Essential Plugins
@@ -329,7 +329,7 @@ plugins/python-development/
/plugin marketplace add wshobson/agents
```
This makes all 65 plugins available for installation, but **does not load any agents or tools** into your context.
This makes all 67 plugins available for installation, but **does not load any agents or tools** into your context.
### Step 2: Install Specific Plugins
@@ -369,7 +369,7 @@ Each installed plugin loads **only its specific agents and commands** into Claud
## See Also
- [Agent Skills](./agent-skills.md) - 47 specialized skills across plugins
- [Agent Skills](./agent-skills.md) - 107 specialized skills across plugins
- [Agent Reference](./agents.md) - Complete agent catalog
- [Usage Guide](./usage.md) - Commands and workflows
- [Architecture](./architecture.md) - Design principles

View File

@@ -386,11 +386,11 @@ User: "Implement Kubernetes deployment with Helm"
→ Result: Production-grade K8s manifests with Helm charts
```
See [Agent Skills](./agent-skills.md) for details on the 47 specialized skills.
See [Agent Skills](./agent-skills.md) for details on the 107 specialized skills.
## See Also
- [Agent Skills](./agent-skills.md) - Specialized knowledge packages
- [Agent Reference](./agents.md) - Complete agent catalog
- [Plugin Reference](./plugins.md) - All 65 plugins
- [Plugin Reference](./plugins.md) - All 67 plugins
- [Architecture](./architecture.md) - Design principles

View File

@@ -0,0 +1,533 @@
---
name: screen-reader-testing
description: Test web applications with screen readers including VoiceOver, NVDA, and JAWS. Use when validating screen reader compatibility, debugging accessibility issues, or ensuring assistive technology support.
---
# Screen Reader Testing
Practical guide to testing web applications with screen readers for comprehensive accessibility validation.
## When to Use This Skill
- Validating screen reader compatibility
- Testing ARIA implementations
- Debugging assistive technology issues
- Verifying form accessibility
- Testing dynamic content announcements
- Ensuring navigation accessibility
## Core Concepts
### 1. Major Screen Readers
| Screen Reader | Platform | Browser | Usage |
|---------------|----------|---------|-------|
| **VoiceOver** | macOS/iOS | Safari | ~15% |
| **NVDA** | Windows | Firefox/Chrome | ~31% |
| **JAWS** | Windows | Chrome/Edge | ~40% |
| **TalkBack** | Android | Chrome | ~10% |
| **Narrator** | Windows | Edge | ~4% |
### 2. Testing Priority
```
Minimum Coverage:
1. NVDA + Firefox (Windows)
2. VoiceOver + Safari (macOS)
3. VoiceOver + Safari (iOS)
Comprehensive Coverage:
+ JAWS + Chrome (Windows)
+ TalkBack + Chrome (Android)
+ Narrator + Edge (Windows)
```
### 3. Screen Reader Modes
| Mode | Purpose | When Used |
|------|---------|-----------|
| **Browse/Virtual** | Read content | Default reading |
| **Focus/Forms** | Interact with controls | Filling forms |
| **Application** | Custom widgets | ARIA applications |
## VoiceOver (macOS)
### Setup
```
Enable: System Preferences → Accessibility → VoiceOver
Toggle: Cmd + F5
Quick Toggle: Triple-press Touch ID
```
### Essential Commands
```
Navigation:
VO = Ctrl + Option (VoiceOver modifier)
VO + Right Arrow Next element
VO + Left Arrow Previous element
VO + Shift + Down Enter group
VO + Shift + Up Exit group
Reading:
VO + A Read all from cursor
Ctrl Stop speaking
VO + B Read current paragraph
Interaction:
VO + Space Activate element
VO + Shift + M Open menu
Tab Next focusable element
Shift + Tab Previous focusable element
Rotor (VO + U):
Navigate by: Headings, Links, Forms, Landmarks
Left/Right Arrow Change rotor category
Up/Down Arrow Navigate within category
Enter Go to item
Web Specific:
VO + Cmd + H Next heading
VO + Cmd + J Next form control
VO + Cmd + L Next link
VO + Cmd + T Next table
```
### Testing Checklist
```markdown
## VoiceOver Testing Checklist
### Page Load
- [ ] Page title announced
- [ ] Main landmark found
- [ ] Skip link works
### Navigation
- [ ] All headings discoverable via rotor
- [ ] Heading levels logical (H1 → H2 → H3)
- [ ] Landmarks properly labeled
- [ ] Skip links functional
### Links & Buttons
- [ ] Link purpose clear
- [ ] Button actions described
- [ ] New window/tab announced
### Forms
- [ ] All labels read with inputs
- [ ] Required fields announced
- [ ] Error messages read
- [ ] Instructions available
- [ ] Focus moves to errors
### Dynamic Content
- [ ] Alerts announced immediately
- [ ] Loading states communicated
- [ ] Content updates announced
- [ ] Modals trap focus correctly
### Tables
- [ ] Headers associated with cells
- [ ] Table navigation works
- [ ] Complex tables have captions
```
### Common Issues & Fixes
```html
<!-- Issue: Button not announcing purpose -->
<button><svg>...</svg></button>
<!-- Fix -->
<button aria-label="Close dialog"><svg aria-hidden="true">...</svg></button>
<!-- Issue: Dynamic content not announced -->
<div id="results">New results loaded</div>
<!-- Fix -->
<div id="results" role="status" aria-live="polite">New results loaded</div>
<!-- Issue: Form error not read -->
<input type="email">
<span class="error">Invalid email</span>
<!-- Fix -->
<input type="email" aria-invalid="true" aria-describedby="email-error">
<span id="email-error" role="alert">Invalid email</span>
```
## NVDA (Windows)
### Setup
```
Download: nvaccess.org
Start: Ctrl + Alt + N
Stop: Insert + Q
```
### Essential Commands
```
Navigation:
Insert = NVDA modifier
Down Arrow Next line
Up Arrow Previous line
Tab Next focusable
Shift + Tab Previous focusable
Reading:
NVDA + Down Arrow Say all
Ctrl Stop speech
NVDA + Up Arrow Current line
Headings:
H Next heading
Shift + H Previous heading
1-6 Heading level 1-6
Forms:
F Next form field
B Next button
E Next edit field
X Next checkbox
C Next combo box
Links:
K Next link
U Next unvisited link
V Next visited link
Landmarks:
D Next landmark
Shift + D Previous landmark
Tables:
T Next table
Ctrl + Alt + Arrows Navigate cells
Elements List (NVDA + F7):
Shows all links, headings, form fields, landmarks
```
### Browse vs Focus Mode
```
NVDA automatically switches modes:
- Browse Mode: Arrow keys navigate content
- Focus Mode: Arrow keys control interactive elements
Manual switch: NVDA + Space
Watch for:
- "Browse mode" announcement when navigating
- "Focus mode" when entering form fields
- Application role forces forms mode
```
### Testing Script
```markdown
## NVDA Test Script
### Initial Load
1. Navigate to page
2. Let page finish loading
3. Press Insert + Down to read all
4. Note: Page title, main content identified?
### Landmark Navigation
1. Press D repeatedly
2. Check: All main areas reachable?
3. Check: Landmarks properly labeled?
### Heading Navigation
1. Press Insert + F7 → Headings
2. Check: Logical heading structure?
3. Press H to navigate headings
4. Check: All sections discoverable?
### Form Testing
1. Press F to find first form field
2. Check: Label read?
3. Fill in invalid data
4. Submit form
5. Check: Errors announced?
6. Check: Focus moved to error?
### Interactive Elements
1. Tab through all interactive elements
2. Check: Each announces role and state
3. Activate buttons with Enter/Space
4. Check: Result announced?
### Dynamic Content
1. Trigger content update
2. Check: Change announced?
3. Open modal
4. Check: Focus trapped?
5. Close modal
6. Check: Focus returns?
```
## JAWS (Windows)
### Essential Commands
```
Start: Desktop shortcut or Ctrl + Alt + J
Virtual Cursor: Auto-enabled in browsers
Navigation:
Arrow keys Navigate content
Tab Next focusable
Insert + Down Read all
Ctrl Stop speech
Quick Keys:
H Next heading
T Next table
F Next form field
B Next button
G Next graphic
L Next list
; Next landmark
Forms Mode:
Enter Enter forms mode
Numpad + Exit forms mode
F5 List form fields
Lists:
Insert + F7 Link list
Insert + F6 Heading list
Insert + F5 Form field list
Tables:
Ctrl + Alt + Arrows Table navigation
```
## TalkBack (Android)
### Setup
```
Enable: Settings → Accessibility → TalkBack
Toggle: Hold both volume buttons 3 seconds
```
### Gestures
```
Explore: Drag finger across screen
Next: Swipe right
Previous: Swipe left
Activate: Double tap
Scroll: Two finger swipe
Reading Controls (swipe up then right):
- Headings
- Links
- Controls
- Characters
- Words
- Lines
- Paragraphs
```
## Common Test Scenarios
### 1. Modal Dialog
```html
<!-- Accessible modal structure -->
<div role="dialog"
aria-modal="true"
aria-labelledby="dialog-title"
aria-describedby="dialog-desc">
<h2 id="dialog-title">Confirm Delete</h2>
<p id="dialog-desc">This action cannot be undone.</p>
<button>Cancel</button>
<button>Delete</button>
</div>
```
```javascript
// Focus management
let lastFocus; // element to restore focus to when the modal closes
function openModal(modal) {
// Store last focused element
lastFocus = document.activeElement;
// Move focus to modal
modal.querySelector('h2').focus();
// Trap focus
modal.addEventListener('keydown', trapFocus);
}
function closeModal(modal) {
// Return focus
lastFocus.focus();
}
function trapFocus(e) {
const modal = e.currentTarget; // listener is attached to the modal element
if (e.key === 'Tab') {
const focusable = modal.querySelectorAll(
'button, [href], input, select, textarea, [tabindex]:not([tabindex="-1"])'
);
const first = focusable[0];
const last = focusable[focusable.length - 1];
if (e.shiftKey && document.activeElement === first) {
last.focus();
e.preventDefault();
} else if (!e.shiftKey && document.activeElement === last) {
first.focus();
e.preventDefault();
}
}
if (e.key === 'Escape') {
closeModal(modal);
}
}
```
### 2. Live Regions
```html
<!-- Status messages (polite) -->
<div role="status" aria-live="polite" aria-atomic="true">
<!-- Content updates will be announced after current speech -->
</div>
<!-- Alerts (assertive) -->
<div role="alert" aria-live="assertive">
<!-- Content updates interrupt current speech -->
</div>
<!-- Progress updates -->
<div role="progressbar"
aria-valuenow="75"
aria-valuemin="0"
aria-valuemax="100"
aria-label="Upload progress">
</div>
<!-- Log (additions only) -->
<div role="log" aria-live="polite" aria-relevant="additions">
<!-- New messages announced, removals not -->
</div>
```
### 3. Tab Interface
```html
<div role="tablist" aria-label="Product information">
<button role="tab"
id="tab-1"
aria-selected="true"
aria-controls="panel-1">
Description
</button>
<button role="tab"
id="tab-2"
aria-selected="false"
aria-controls="panel-2"
tabindex="-1">
Reviews
</button>
</div>
<div role="tabpanel"
id="panel-1"
aria-labelledby="tab-1">
Product description content...
</div>
<div role="tabpanel"
id="panel-2"
aria-labelledby="tab-2"
hidden>
Reviews content...
</div>
```
```javascript
// Tab keyboard navigation
tablist.addEventListener('keydown', (e) => {
const tabs = [...tablist.querySelectorAll('[role="tab"]')];
const index = tabs.indexOf(document.activeElement);
let newIndex;
switch (e.key) {
case 'ArrowRight':
newIndex = (index + 1) % tabs.length;
break;
case 'ArrowLeft':
newIndex = (index - 1 + tabs.length) % tabs.length;
break;
case 'Home':
newIndex = 0;
break;
case 'End':
newIndex = tabs.length - 1;
break;
default:
return;
}
tabs[newIndex].focus();
activateTab(tabs[newIndex]);
e.preventDefault();
});
```
## Debugging Tips
```javascript
// Log what screen reader sees
function logAccessibleName(element) {
const computed = window.getComputedStyle(element);
console.log({
role: element.getAttribute('role') || element.tagName,
name: element.getAttribute('aria-label') ||
element.getAttribute('aria-labelledby') ||
element.textContent,
state: {
expanded: element.getAttribute('aria-expanded'),
selected: element.getAttribute('aria-selected'),
checked: element.getAttribute('aria-checked'),
disabled: element.disabled
},
visible: computed.display !== 'none' && computed.visibility !== 'hidden'
});
}
```
## Best Practices
### Do's
- **Test with actual screen readers** - Not just simulators
- **Use semantic HTML first** - ARIA is supplemental
- **Test in browse and focus modes** - Different experiences
- **Verify focus management** - Especially for SPAs
- **Test keyboard only first** - Foundation for SR testing
### Don'ts
- **Don't assume one SR is enough** - Test multiple
- **Don't ignore mobile** - Growing user base
- **Don't test only happy path** - Test error states
- **Don't skip dynamic content** - Most common issues
- **Don't rely on visual testing** - Different experience
## Resources
- [VoiceOver User Guide](https://support.apple.com/guide/voiceover/welcome/mac)
- [NVDA User Guide](https://www.nvaccess.org/files/nvda/documentation/userGuide.html)
- [JAWS Documentation](https://support.freedomscientific.com/Products/Blindness/JAWS)
- [WebAIM Screen Reader Survey](https://webaim.org/projects/screenreadersurvey/)

View File

@@ -0,0 +1,508 @@
---
name: wcag-audit-patterns
description: Conduct WCAG 2.2 accessibility audits with automated testing, manual verification, and remediation guidance. Use when auditing websites for accessibility, fixing WCAG violations, or implementing accessible design patterns.
---
# WCAG Audit Patterns
Comprehensive guide to auditing web content against WCAG 2.2 guidelines with actionable remediation strategies.
## When to Use This Skill
- Conducting accessibility audits
- Fixing WCAG violations
- Implementing accessible components
- Preparing for accessibility lawsuits
- Meeting ADA/Section 508 requirements
- Achieving VPAT compliance
## Core Concepts
### 1. WCAG Conformance Levels
| Level | Description | Required For |
|-------|-------------|--------------|
| **A** | Minimum accessibility | Legal baseline |
| **AA** | Standard conformance | Most regulations |
| **AAA** | Enhanced accessibility | Specialized needs |
### 2. POUR Principles
```
Perceivable: Can users perceive the content?
Operable: Can users operate the interface?
Understandable: Can users understand the content?
Robust: Does it work with assistive tech?
```
### 3. Common Violations by Impact
```
Critical (Blockers):
├── Missing alt text for functional images
├── No keyboard access to interactive elements
├── Missing form labels
└── Auto-playing media without controls
Serious:
├── Insufficient color contrast
├── Missing skip links
├── Inaccessible custom widgets
└── Missing page titles
Moderate:
├── Missing language attribute
├── Unclear link text
├── Missing landmarks
└── Improper heading hierarchy
```
## Audit Checklist
### Perceivable (Principle 1)
```markdown
## 1.1 Text Alternatives
### 1.1.1 Non-text Content (Level A)
- [ ] All images have alt text
- [ ] Decorative images have alt=""
- [ ] Complex images have long descriptions
- [ ] Icons with meaning have accessible names
- [ ] CAPTCHAs have alternatives
Check:
```html
<!-- Good -->
<img src="chart.png" alt="Sales increased 25% from Q1 to Q2">
<img src="decorative-line.png" alt="">
<!-- Bad -->
<img src="chart.png">
<img src="decorative-line.png" alt="decorative line">
```
## 1.2 Time-based Media
### 1.2.1 Audio-only and Video-only (Level A)
- [ ] Audio has text transcript
- [ ] Video has audio description or transcript
### 1.2.2 Captions (Level A)
- [ ] All video has synchronized captions
- [ ] Captions are accurate and complete
- [ ] Speaker identification included
### 1.2.3 Audio Description (Level A)
- [ ] Video has audio description for visual content
## 1.3 Adaptable
### 1.3.1 Info and Relationships (Level A)
- [ ] Headings use proper tags (h1-h6)
- [ ] Lists use ul/ol/dl
- [ ] Tables have headers
- [ ] Form inputs have labels
- [ ] ARIA landmarks present
Check:
```html
<!-- Heading hierarchy -->
<h1>Page Title</h1>
<h2>Section</h2>
<h3>Subsection</h3>
<h2>Another Section</h2>
<!-- Table headers -->
<table>
<thead>
<tr><th scope="col">Name</th><th scope="col">Price</th></tr>
</thead>
</table>
```
### 1.3.2 Meaningful Sequence (Level A)
- [ ] Reading order is logical
- [ ] CSS positioning doesn't break order
- [ ] Focus order matches visual order
### 1.3.3 Sensory Characteristics (Level A)
- [ ] Instructions don't rely on shape/color alone
- [ ] "Click the red button" → "Click Submit (red button)"
## 1.4 Distinguishable
### 1.4.1 Use of Color (Level A)
- [ ] Color is not only means of conveying info
- [ ] Links distinguishable without color
- [ ] Error states not color-only
### 1.4.3 Contrast (Minimum) (Level AA)
- [ ] Text: 4.5:1 contrast ratio
- [ ] Large text (18pt+): 3:1 ratio
- [ ] UI components: 3:1 ratio
Tools: WebAIM Contrast Checker, axe DevTools
### 1.4.4 Resize Text (Level AA)
- [ ] Text resizes to 200% without loss
- [ ] No horizontal scrolling at 320px
- [ ] Content reflows properly
### 1.4.10 Reflow (Level AA)
- [ ] Content reflows at 400% zoom
- [ ] No two-dimensional scrolling
- [ ] All content accessible at 320px width
### 1.4.11 Non-text Contrast (Level AA)
- [ ] UI components have 3:1 contrast
- [ ] Focus indicators visible
- [ ] Graphical objects distinguishable
### 1.4.12 Text Spacing (Level AA)
- [ ] No content loss with increased spacing
- [ ] Line height 1.5x font size
- [ ] Paragraph spacing 2x font size
- [ ] Letter spacing 0.12x font size
- [ ] Word spacing 0.16x font size
```
### Operable (Principle 2)
```markdown
## 2.1 Keyboard Accessible
### 2.1.1 Keyboard (Level A)
- [ ] All functionality keyboard accessible
- [ ] No keyboard traps
- [ ] Tab order is logical
- [ ] Custom widgets are keyboard operable
Check:
```javascript
// Custom button must be keyboard accessible
<div role="button" tabindex="0"
onkeydown="if(event.key === 'Enter' || event.key === ' ') activate()">
```
### 2.1.2 No Keyboard Trap (Level A)
- [ ] Focus can move away from all components
- [ ] Modal dialogs trap focus correctly
- [ ] Focus returns after modal closes
## 2.2 Enough Time
### 2.2.1 Timing Adjustable (Level A)
- [ ] Session timeouts can be extended
- [ ] User warned before timeout
- [ ] Option to disable auto-refresh
### 2.2.2 Pause, Stop, Hide (Level A)
- [ ] Moving content can be paused
- [ ] Auto-updating content can be paused
- [ ] Animations respect prefers-reduced-motion
```css
@media (prefers-reduced-motion: reduce) {
* {
animation: none !important;
transition: none !important;
}
}
```
## 2.3 Seizures and Physical Reactions
### 2.3.1 Three Flashes (Level A)
- [ ] No content flashes more than 3 times/second
- [ ] Flashing area is small (<25% viewport)
## 2.4 Navigable
### 2.4.1 Bypass Blocks (Level A)
- [ ] Skip to main content link present
- [ ] Landmark regions defined
- [ ] Proper heading structure
```html
<a href="#main" class="skip-link">Skip to main content</a>
<main id="main">...</main>
```
### 2.4.2 Page Titled (Level A)
- [ ] Unique, descriptive page titles
- [ ] Title reflects page content
### 2.4.3 Focus Order (Level A)
- [ ] Focus order matches visual order
- [ ] tabindex used correctly
### 2.4.4 Link Purpose (In Context) (Level A)
- [ ] Links make sense out of context
- [ ] No "click here" or "read more" alone
```html
<!-- Bad -->
<a href="report.pdf">Click here</a>
<!-- Good -->
<a href="report.pdf">Download Q4 Sales Report (PDF)</a>
```
### 2.4.6 Headings and Labels (Level AA)
- [ ] Headings describe content
- [ ] Labels describe purpose
### 2.4.7 Focus Visible (Level AA)
- [ ] Focus indicator visible on all elements
- [ ] Custom focus styles meet contrast
```css
:focus {
outline: 3px solid #005fcc;
outline-offset: 2px;
}
```
### 2.4.11 Focus Not Obscured (Level AA) - WCAG 2.2
- [ ] Focused element not fully hidden
- [ ] Sticky headers don't obscure focus
```
### Understandable (Principle 3)
```markdown
## 3.1 Readable
### 3.1.1 Language of Page (Level A)
- [ ] HTML lang attribute set
- [ ] Language correct for content
```html
<html lang="en">
```
### 3.1.2 Language of Parts (Level AA)
- [ ] Language changes marked
```html
<p>The French word <span lang="fr">bonjour</span> means hello.</p>
```
## 3.2 Predictable
### 3.2.1 On Focus (Level A)
- [ ] No context change on focus alone
- [ ] No unexpected popups on focus
### 3.2.2 On Input (Level A)
- [ ] No automatic form submission
- [ ] User warned before context change
### 3.2.3 Consistent Navigation (Level AA)
- [ ] Navigation consistent across pages
- [ ] Repeated components same order
### 3.2.4 Consistent Identification (Level AA)
- [ ] Same functionality = same label
- [ ] Icons used consistently
## 3.3 Input Assistance
### 3.3.1 Error Identification (Level A)
- [ ] Errors clearly identified
- [ ] Error message describes problem
- [ ] Error linked to field
```html
<input aria-describedby="email-error" aria-invalid="true">
<span id="email-error" role="alert">Please enter valid email</span>
```
### 3.3.2 Labels or Instructions (Level A)
- [ ] All inputs have visible labels
- [ ] Required fields indicated
- [ ] Format hints provided
### 3.3.3 Error Suggestion (Level AA)
- [ ] Errors include correction suggestion
- [ ] Suggestions are specific
### 3.3.4 Error Prevention (Level AA)
- [ ] Legal/financial forms reversible
- [ ] Data checked before submission
- [ ] User can review before submit
```
### Robust (Principle 4)
```markdown
## 4.1 Compatible
### 4.1.1 Parsing (Level A) - Obsolete in WCAG 2.2
- [ ] Valid HTML (good practice)
- [ ] No duplicate IDs
- [ ] Complete start/end tags
### 4.1.2 Name, Role, Value (Level A)
- [ ] Custom widgets have accessible names
- [ ] ARIA roles correct
- [ ] State changes announced
```html
<!-- Accessible custom checkbox -->
<div role="checkbox"
aria-checked="false"
tabindex="0"
aria-labelledby="label">
</div>
<span id="label">Accept terms</span>
```
### 4.1.3 Status Messages (Level AA)
- [ ] Status updates announced
- [ ] Live regions used correctly
```html
<div role="status" aria-live="polite">
3 items added to cart
</div>
<div role="alert" aria-live="assertive">
Error: Form submission failed
</div>
```
```
## Automated Testing
```javascript
// axe-core integration
// NOTE(review): this top-level `axe` is never used below — `axe.run` inside
// page.evaluate refers to the axe global injected into the page by
// addScriptTag, not this Node-side require. Confirm the require is needed.
const axe = require('axe-core');

// Inject axe-core into the page under test and run a scan limited to
// WCAG 2.0/2.1/2.2 A and AA rules. `page` is a Playwright/Puppeteer page.
async function runAccessibilityAudit(page) {
  await page.addScriptTag({ path: require.resolve('axe-core') });
  const results = await page.evaluate(async () => {
    return await axe.run(document, {
      runOnly: {
        type: 'tag',
        values: ['wcag2a', 'wcag2aa', 'wcag21aa', 'wcag22aa']
      }
    });
  });
  // violations: definite failures; passes: confirmed OK;
  // incomplete: rules axe could not decide automatically (needs manual review).
  return {
    violations: results.violations,
    passes: results.passes,
    incomplete: results.incomplete
  };
}

// Playwright test example — fails the build on any detected violation.
test('should have no accessibility violations', async ({ page }) => {
  await page.goto('/');
  const results = await runAccessibilityAudit(page);
  expect(results.violations).toHaveLength(0);
});
```
```bash
# CLI tools
npx @axe-core/cli https://example.com
npx pa11y https://example.com
lighthouse https://example.com --only-categories=accessibility
```
## Remediation Patterns
### Fix: Missing Form Labels
```html
<!-- Before -->
<input type="email" placeholder="Email">
<!-- After: Option 1 - Visible label -->
<label for="email">Email address</label>
<input id="email" type="email">
<!-- After: Option 2 - aria-label -->
<input type="email" aria-label="Email address">
<!-- After: Option 3 - aria-labelledby -->
<span id="email-label">Email</span>
<input type="email" aria-labelledby="email-label">
```
### Fix: Insufficient Color Contrast
```css
/* Before: 2.5:1 contrast */
.text { color: #767676; }
/* After: 4.5:1 contrast */
.text { color: #595959; }
/* Or add background */
.text {
color: #767676;
background: #000;
}
```
### Fix: Keyboard Navigation
```javascript
// Make custom element keyboard accessible
// NOTE(review): toggle(), close(), focusNext() and focusPrevious() are
// assumed to be implemented elsewhere on this class — confirm before use.
class AccessibleDropdown extends HTMLElement {
  connectedCallback() {
    // Put the element in the tab order and describe it to assistive tech.
    this.setAttribute('tabindex', '0');
    this.setAttribute('role', 'combobox');
    this.setAttribute('aria-expanded', 'false');
    this.addEventListener('keydown', (e) => {
      switch (e.key) {
        case 'Enter':
        case ' ':
          // Enter/Space toggle the dropdown, mirroring native controls;
          // preventDefault stops Space from scrolling the page.
          this.toggle();
          e.preventDefault();
          break;
        case 'Escape':
          this.close();
          break;
        case 'ArrowDown':
          this.focusNext();
          e.preventDefault();
          break;
        case 'ArrowUp':
          this.focusPrevious();
          e.preventDefault();
          break;
      }
    });
  }
}
```
## Best Practices
### Do's
- **Start early** - Accessibility from design phase
- **Test with real users** - Disabled users provide best feedback
- **Automate what you can** - Automated tools typically catch only 30-50% of issues
- **Use semantic HTML** - Reduces ARIA needs
- **Document patterns** - Build accessible component library
### Don'ts
- **Don't rely only on automated testing** - Manual testing required
- **Don't use ARIA as first solution** - Native HTML first
- **Don't hide focus outlines** - Keyboard users need them
- **Don't disable zoom** - Users need to resize
- **Don't use color alone** - Multiple indicators needed
## Resources
- [WCAG 2.2 Guidelines](https://www.w3.org/TR/WCAG22/)
- [WebAIM](https://webaim.org/)
- [A11y Project Checklist](https://www.a11yproject.com/checklist/)
- [axe DevTools](https://www.deque.com/axe/)

View File

@@ -0,0 +1,42 @@
# Event Sourcing Architect
Expert in event sourcing, CQRS, and event-driven architecture patterns. Masters event store design, projection building, saga orchestration, and eventual consistency patterns. Use PROACTIVELY for event-sourced systems, audit trail requirements, or complex domain modeling with temporal queries.
## Capabilities
- Event store design and implementation
- CQRS (Command Query Responsibility Segregation) patterns
- Projection building and read model optimization
- Saga and process manager orchestration
- Event versioning and schema evolution
- Snapshotting strategies for performance
- Eventual consistency handling
## When to Use
- Building systems requiring complete audit trails
- Implementing complex business workflows with compensating actions
- Designing systems needing temporal queries ("what was state at time X")
- Separating read and write models for performance
- Building event-driven microservices architectures
- Implementing undo/redo or time-travel debugging
## Workflow
1. Identify aggregate boundaries and event streams
2. Design events as immutable facts
3. Implement command handlers and event application
4. Build projections for query requirements
5. Design saga/process managers for cross-aggregate workflows
6. Implement snapshotting for long-lived aggregates
7. Set up event versioning strategy
## Best Practices
- Events are facts - never delete or modify them
- Keep events small and focused
- Version events from day one
- Design for eventual consistency
- Use correlation IDs for tracing
- Implement idempotent event handlers
- Plan for projection rebuilding

View File

@@ -0,0 +1,552 @@
---
name: cqrs-implementation
description: Implement Command Query Responsibility Segregation for scalable architectures. Use when separating read and write models, optimizing query performance, or building event-sourced systems.
---
# CQRS Implementation
Comprehensive guide to implementing CQRS (Command Query Responsibility Segregation) patterns.
## When to Use This Skill
- Separating read and write concerns
- Scaling reads independently from writes
- Building event-sourced systems
- Optimizing complex query scenarios
- Different read/write data models needed
- High-performance reporting requirements
## Core Concepts
### 1. CQRS Architecture
```
┌─────────────┐
│ Client │
└──────┬──────┘
┌────────────┴────────────┐
│ │
▼ ▼
┌─────────────┐ ┌─────────────┐
│ Commands │ │ Queries │
│ API │ │ API │
└──────┬──────┘ └──────┬──────┘
│ │
▼ ▼
┌─────────────┐ ┌─────────────┐
│ Command │ │ Query │
│ Handlers │ │ Handlers │
└──────┬──────┘ └──────┬──────┘
│ │
▼ ▼
┌─────────────┐ ┌─────────────┐
│ Write │─────────►│ Read │
│ Model │ Events │ Model │
└─────────────┘ └─────────────┘
```
### 2. Key Components
| Component | Responsibility |
|-----------|---------------|
| **Command** | Intent to change state |
| **Command Handler** | Validates and executes commands |
| **Event** | Record of state change |
| **Query** | Request for data |
| **Query Handler** | Retrieves data from read model |
| **Projector** | Updates read model from events |
## Templates
### Template 1: Command Infrastructure
```python
from abc import ABC, abstractmethod
from dataclasses import dataclass
from typing import TypeVar, Generic, Dict, Any, Type
from datetime import datetime
import uuid
# Command base
@dataclass
class Command:
command_id: str = None
timestamp: datetime = None
def __post_init__(self):
self.command_id = self.command_id or str(uuid.uuid4())
self.timestamp = self.timestamp or datetime.utcnow()
# Concrete commands
@dataclass
class CreateOrder(Command):
customer_id: str
items: list
shipping_address: dict
@dataclass
class AddOrderItem(Command):
order_id: str
product_id: str
quantity: int
price: float
@dataclass
class CancelOrder(Command):
order_id: str
reason: str
# Command handler base
T = TypeVar('T', bound=Command)
class CommandHandler(ABC, Generic[T]):
@abstractmethod
async def handle(self, command: T) -> Any:
pass
# Command bus
class CommandBus:
def __init__(self):
self._handlers: Dict[Type[Command], CommandHandler] = {}
def register(self, command_type: Type[Command], handler: CommandHandler):
self._handlers[command_type] = handler
async def dispatch(self, command: Command) -> Any:
handler = self._handlers.get(type(command))
if not handler:
raise ValueError(f"No handler for {type(command).__name__}")
return await handler.handle(command)
# Command handler implementation
class CreateOrderHandler(CommandHandler[CreateOrder]):
def __init__(self, order_repository, event_store):
self.order_repository = order_repository
self.event_store = event_store
async def handle(self, command: CreateOrder) -> str:
# Validate
if not command.items:
raise ValueError("Order must have at least one item")
# Create aggregate
order = Order.create(
customer_id=command.customer_id,
items=command.items,
shipping_address=command.shipping_address
)
# Persist events
await self.event_store.append_events(
stream_id=f"Order-{order.id}",
stream_type="Order",
events=order.uncommitted_events
)
return order.id
```
### Template 2: Query Infrastructure
```python
from abc import ABC, abstractmethod
from dataclasses import dataclass
from datetime import datetime
from typing import Any, Dict, Generic, List, Optional, Type, TypeVar

# Generic item type for paginated results. Must be declared BEFORE
# PaginatedResult uses it — the original placed the TypeVar after the
# class, which raises NameError at import time. (datetime / Dict / Type /
# Any were also used below without being imported.)
T = TypeVar('T')

# Query base
@dataclass
class Query:
    pass

# Concrete queries
@dataclass
class GetOrderById(Query):
    order_id: str

@dataclass
class GetCustomerOrders(Query):
    customer_id: str
    status: Optional[str] = None
    page: int = 1
    page_size: int = 20

@dataclass
class SearchOrders(Query):
    query: str
    filters: dict = None
    sort_by: str = "created_at"
    sort_order: str = "desc"

# Query result types (denormalized read-model views)
@dataclass
class OrderView:
    order_id: str
    customer_id: str
    status: str
    total_amount: float
    item_count: int
    created_at: datetime
    shipped_at: Optional[datetime] = None

@dataclass
class PaginatedResult(Generic[T]):
    items: List[T]
    total: int
    page: int
    page_size: int

    @property
    def total_pages(self) -> int:
        # Ceiling division without importing math.
        return (self.total + self.page_size - 1) // self.page_size

# Query handler base
Q = TypeVar('Q', bound=Query)
R = TypeVar('R')

class QueryHandler(ABC, Generic[Q, R]):
    """Retrieves data from the read model for exactly one query type."""

    @abstractmethod
    async def handle(self, query: Q) -> R:
        pass

# Query bus: routes each query to its single registered handler.
class QueryBus:
    def __init__(self):
        self._handlers: Dict[Type[Query], QueryHandler] = {}

    def register(self, query_type: Type[Query], handler: QueryHandler):
        self._handlers[query_type] = handler

    async def dispatch(self, query: Query) -> Any:
        handler = self._handlers.get(type(query))
        if not handler:
            raise ValueError(f"No handler for {type(query).__name__}")
        return await handler.handle(query)

# Query handler implementations (read_db is an asyncpg-style pool).
class GetOrderByIdHandler(QueryHandler[GetOrderById, Optional[OrderView]]):
    def __init__(self, read_db):
        self.read_db = read_db

    async def handle(self, query: GetOrderById) -> Optional[OrderView]:
        async with self.read_db.acquire() as conn:
            row = await conn.fetchrow(
                """
                SELECT order_id, customer_id, status, total_amount,
                       item_count, created_at, shipped_at
                FROM order_views
                WHERE order_id = $1
                """,
                query.order_id
            )
            if row:
                return OrderView(**dict(row))
            return None

class GetCustomerOrdersHandler(QueryHandler[GetCustomerOrders, PaginatedResult[OrderView]]):
    def __init__(self, read_db):
        self.read_db = read_db

    async def handle(self, query: GetCustomerOrders) -> PaginatedResult[OrderView]:
        async with self.read_db.acquire() as conn:
            # Build query with optional status filter.
            where_clause = "customer_id = $1"
            params = [query.customer_id]
            if query.status:
                where_clause += " AND status = $2"
                params.append(query.status)
            # Get total count for pagination metadata.
            total = await conn.fetchval(
                f"SELECT COUNT(*) FROM order_views WHERE {where_clause}",
                *params
            )
            # Get paginated results; LIMIT/OFFSET placeholders are numbered
            # after the filter params already collected.
            offset = (query.page - 1) * query.page_size
            rows = await conn.fetch(
                f"""
                SELECT order_id, customer_id, status, total_amount,
                       item_count, created_at, shipped_at
                FROM order_views
                WHERE {where_clause}
                ORDER BY created_at DESC
                LIMIT ${len(params) + 1} OFFSET ${len(params) + 2}
                """,
                *params, query.page_size, offset
            )
            return PaginatedResult(
                items=[OrderView(**dict(row)) for row in rows],
                total=total,
                page=query.page,
                page_size=query.page_size
            )
```
### Template 3: FastAPI CQRS Application
```python
from datetime import datetime
from typing import List, Optional

from fastapi import FastAPI, HTTPException, Depends
from pydantic import BaseModel

app = FastAPI()

# Request/Response models
class CreateOrderRequest(BaseModel):
    customer_id: str
    items: List[dict]
    shipping_address: dict

class OrderResponse(BaseModel):
    order_id: str
    customer_id: str
    status: str
    total_amount: float
    item_count: int
    created_at: datetime

# Dependency injection — buses are attached to app.state at startup.
def get_command_bus() -> CommandBus:
    return app.state.command_bus

def get_query_bus() -> QueryBus:
    return app.state.query_bus

# Command endpoints (POST, PUT, DELETE)
@app.post("/orders", response_model=dict)
async def create_order(
    request: CreateOrderRequest,
    command_bus: CommandBus = Depends(get_command_bus)
):
    """Dispatch a CreateOrder command; returns the new aggregate id."""
    command = CreateOrder(
        customer_id=request.customer_id,
        items=request.items,
        shipping_address=request.shipping_address
    )
    order_id = await command_bus.dispatch(command)
    return {"order_id": order_id}

@app.post("/orders/{order_id}/items")
async def add_item(
    order_id: str,
    product_id: str,
    quantity: int,
    price: float,
    command_bus: CommandBus = Depends(get_command_bus)
):
    command = AddOrderItem(
        order_id=order_id,
        product_id=product_id,
        quantity=quantity,
        price=price
    )
    await command_bus.dispatch(command)
    return {"status": "item_added"}

@app.delete("/orders/{order_id}")
async def cancel_order(
    order_id: str,
    reason: str,
    command_bus: CommandBus = Depends(get_command_bus)
):
    command = CancelOrder(order_id=order_id, reason=reason)
    await command_bus.dispatch(command)
    return {"status": "cancelled"}

# Query endpoints (GET)
# NOTE: the fixed path /orders/search MUST be registered before the
# parameterized /orders/{order_id}. FastAPI matches routes in declaration
# order, so with the reverse order "search" is captured as an order_id and
# the search endpoint becomes unreachable.
@app.get("/orders/search")
async def search_orders(
    q: str,
    sort_by: str = "created_at",
    query_bus: QueryBus = Depends(get_query_bus)
):
    query = SearchOrders(query=q, sort_by=sort_by)
    return await query_bus.dispatch(query)

@app.get("/orders/{order_id}", response_model=OrderResponse)
async def get_order(
    order_id: str,
    query_bus: QueryBus = Depends(get_query_bus)
):
    query = GetOrderById(order_id=order_id)
    result = await query_bus.dispatch(query)
    if not result:
        raise HTTPException(status_code=404, detail="Order not found")
    return result

@app.get("/customers/{customer_id}/orders")
async def get_customer_orders(
    customer_id: str,
    status: Optional[str] = None,
    page: int = 1,
    page_size: int = 20,
    query_bus: QueryBus = Depends(get_query_bus)
):
    query = GetCustomerOrders(
        customer_id=customer_id,
        status=status,
        page=page,
        page_size=page_size
    )
    return await query_bus.dispatch(query)
```
### Template 4: Read Model Synchronization
```python
class ReadModelSynchronizer:
    """Keeps read models in sync with events."""

    # NOTE(review): relies on asyncio, logger, and the Projection type being
    # provided by the surrounding module — confirm imports before reuse.
    def __init__(self, event_store, read_db, projections: List[Projection]):
        self.event_store = event_store
        self.read_db = read_db
        # Index projections by name for checkpoint lookups and rebuilds.
        self.projections = {p.name: p for p in projections}

    async def run(self):
        """Continuously sync read models."""
        # Poll loop: advance every projection, then sleep briefly to avoid
        # hammering the event store when there is nothing new.
        while True:
            for name, projection in self.projections.items():
                await self._sync_projection(projection)
            await asyncio.sleep(0.1)

    async def _sync_projection(self, projection: Projection):
        # Resume from the projection's last saved global position and pull
        # the next batch of events (at most 100 per pass).
        checkpoint = await self._get_checkpoint(projection.name)
        events = await self.event_store.read_all(
            from_position=checkpoint,
            limit=100
        )
        for event in events:
            if event.event_type in projection.handles():
                try:
                    await projection.apply(event)
                except Exception as e:
                    # Log error, possibly retry or skip
                    # NOTE(review): `continue` skips the checkpoint save below,
                    # so a persistently failing event is retried on every pass —
                    # confirm this retry-forever behavior is intended.
                    logger.error(f"Projection error: {e}")
                    continue
            # Checkpoint after every event (handled or not) so event types
            # this projection ignores are not re-read on the next pass.
            await self._save_checkpoint(projection.name, event.global_position)

    async def rebuild_projection(self, projection_name: str):
        """Rebuild a projection from scratch."""
        projection = self.projections[projection_name]
        # Clear existing data
        await projection.clear()
        # Reset checkpoint
        await self._save_checkpoint(projection_name, 0)
        # Rebuild: replay the full event history in batches of 1000,
        # checkpointing once per batch.
        while True:
            checkpoint = await self._get_checkpoint(projection_name)
            events = await self.event_store.read_all(checkpoint, 1000)
            if not events:
                break
            for event in events:
                if event.event_type in projection.handles():
                    await projection.apply(event)
            await self._save_checkpoint(
                projection_name,
                events[-1].global_position
            )
```
### Template 5: Eventual Consistency Handling
```python
class ConsistentQueryHandler:
    """Query handler that can wait for consistency."""

    # NOTE(review): relies on `time` and `asyncio` being imported, and on an
    # execute_query() implementation provided elsewhere — confirm before use.
    def __init__(self, read_db, event_store):
        self.read_db = read_db
        self.event_store = event_store

    async def query_after_command(
        self,
        query: Query,
        expected_version: int,
        stream_id: str,
        timeout: float = 5.0
    ):
        """
        Execute query, ensuring read model is at expected version.
        Used for read-your-writes consistency.
        """
        start_time = time.time()
        # Poll until the projection catches up to the version produced by
        # the caller's command, or until the timeout elapses.
        while time.time() - start_time < timeout:
            # Check if read model is caught up
            projection_version = await self._get_projection_version(stream_id)
            if projection_version >= expected_version:
                return await self.execute_query(query)
            # Wait a bit and retry
            await asyncio.sleep(0.1)
        # Timeout - return stale data with warning
        # NOTE(review): the timeout path returns a wrapped dict while the
        # fresh path returns the raw query result — callers must handle both.
        return {
            "data": await self.execute_query(query),
            "_warning": "Data may be stale"
        }

    async def _get_projection_version(self, stream_id: str) -> int:
        """Get the last processed event version for a stream."""
        async with self.read_db.acquire() as conn:
            # `or 0` treats an unknown stream as version 0 (nothing projected).
            return await conn.fetchval(
                "SELECT last_event_version FROM projection_state WHERE stream_id = $1",
                stream_id
            ) or 0
```
## Best Practices
### Do's
- **Separate command and query models** - Different needs
- **Use eventual consistency** - Accept propagation delay
- **Validate in command handlers** - Before state change
- **Denormalize read models** - Optimize for queries
- **Version your events** - For schema evolution
### Don'ts
- **Don't query in commands** - Use only for writes
- **Don't couple read/write schemas** - Independent evolution
- **Don't over-engineer** - Start simple
- **Don't ignore consistency SLAs** - Define acceptable lag
## Resources
- [CQRS Pattern](https://martinfowler.com/bliki/CQRS.html)
- [Microsoft CQRS Guidance](https://docs.microsoft.com/en-us/azure/architecture/patterns/cqrs)

View File

@@ -0,0 +1,435 @@
---
name: event-store-design
description: Design and implement event stores for event-sourced systems. Use when building event sourcing infrastructure, choosing event store technologies, or implementing event persistence patterns.
---
# Event Store Design
Comprehensive guide to designing event stores for event-sourced applications.
## When to Use This Skill
- Designing event sourcing infrastructure
- Choosing between event store technologies
- Implementing custom event stores
- Optimizing event storage and retrieval
- Setting up event store schemas
- Planning for event store scaling
## Core Concepts
### 1. Event Store Architecture
```
┌─────────────────────────────────────────────────────┐
│ Event Store │
├─────────────────────────────────────────────────────┤
│ ┌─────────────┐ ┌─────────────┐ ┌─────────────┐ │
│ │ Stream 1 │ │ Stream 2 │ │ Stream 3 │ │
│ │ (Aggregate) │ │ (Aggregate) │ │ (Aggregate) │ │
│ ├─────────────┤ ├─────────────┤ ├─────────────┤ │
│ │ Event 1 │ │ Event 1 │ │ Event 1 │ │
│ │ Event 2 │ │ Event 2 │ │ Event 2 │ │
│ │ Event 3 │ │ ... │ │ Event 3 │ │
│ │ ... │ │ │ │ Event 4 │ │
│ └─────────────┘ └─────────────┘ └─────────────┘ │
├─────────────────────────────────────────────────────┤
│ Global Position: 1 → 2 → 3 → 4 → 5 → 6 → ... │
└─────────────────────────────────────────────────────┘
```
### 2. Event Store Requirements
| Requirement | Description |
|-------------|-------------|
| **Append-only** | Events are immutable, only appends |
| **Ordered** | Per-stream and global ordering |
| **Versioned** | Optimistic concurrency control |
| **Subscriptions** | Real-time event notifications |
| **Idempotent** | Handle duplicate writes safely |
## Technology Comparison
| Technology | Best For | Limitations |
|------------|----------|-------------|
| **EventStoreDB** | Pure event sourcing | Single-purpose |
| **PostgreSQL** | Existing Postgres stack | Manual implementation |
| **Kafka** | High-throughput streaming | Not ideal for per-stream queries |
| **DynamoDB** | Serverless, AWS-native | Query limitations |
| **Marten** | .NET ecosystems | .NET specific |
## Templates
### Template 1: PostgreSQL Event Store Schema
```sql
-- Events table: append-only log of domain events.
-- (stream_id, version) gives per-aggregate ordering with optimistic
-- concurrency; global_position (BIGSERIAL) gives a total order across
-- all streams for subscriptions.
CREATE TABLE events (
    id UUID PRIMARY KEY DEFAULT gen_random_uuid(),
    stream_id VARCHAR(255) NOT NULL,      -- aggregate identity, e.g. "Order-123"
    stream_type VARCHAR(255) NOT NULL,    -- aggregate type, e.g. "Order"
    event_type VARCHAR(255) NOT NULL,
    event_data JSONB NOT NULL,
    metadata JSONB DEFAULT '{}',
    version BIGINT NOT NULL,              -- 1-based position within the stream
    global_position BIGSERIAL,
    created_at TIMESTAMPTZ DEFAULT NOW(),
    -- Rejects concurrent writers appending the same version (optimistic lock).
    CONSTRAINT unique_stream_version UNIQUE (stream_id, version)
);
-- Index for stream queries (replay one aggregate in order)
CREATE INDEX idx_events_stream_id ON events(stream_id, version);
-- Index for global subscription (catch-up readers)
CREATE INDEX idx_events_global_position ON events(global_position);
-- Index for event type queries
CREATE INDEX idx_events_event_type ON events(event_type);
-- Index for time-based queries
CREATE INDEX idx_events_created_at ON events(created_at);
-- Snapshots table: latest materialized state per stream, so long-lived
-- aggregates can be rehydrated without replaying every event.
CREATE TABLE snapshots (
    stream_id VARCHAR(255) PRIMARY KEY,
    stream_type VARCHAR(255) NOT NULL,
    snapshot_data JSONB NOT NULL,
    version BIGINT NOT NULL,              -- last event version folded into the snapshot
    created_at TIMESTAMPTZ DEFAULT NOW()
);
-- Subscriptions checkpoint table: last global_position each subscriber
-- has processed, enabling resume-after-restart.
CREATE TABLE subscription_checkpoints (
    subscription_id VARCHAR(255) PRIMARY KEY,
    last_position BIGINT NOT NULL DEFAULT 0,
    updated_at TIMESTAMPTZ DEFAULT NOW()
);
```
### Template 2: Python Event Store Implementation
```python
from dataclasses import dataclass, field
from datetime import datetime
from typing import Any, Optional, List
from uuid import UUID, uuid4
import json
import asyncpg
@dataclass
class Event:
stream_id: str
event_type: str
data: dict
metadata: dict = field(default_factory=dict)
event_id: UUID = field(default_factory=uuid4)
version: Optional[int] = None
global_position: Optional[int] = None
created_at: datetime = field(default_factory=datetime.utcnow)
class EventStore:
def __init__(self, pool: asyncpg.Pool):
self.pool = pool
async def append_events(
self,
stream_id: str,
stream_type: str,
events: List[Event],
expected_version: Optional[int] = None
) -> List[Event]:
"""Append events to a stream with optimistic concurrency."""
async with self.pool.acquire() as conn:
async with conn.transaction():
# Check expected version
if expected_version is not None:
current = await conn.fetchval(
"SELECT MAX(version) FROM events WHERE stream_id = $1",
stream_id
)
current = current or 0
if current != expected_version:
raise ConcurrencyError(
f"Expected version {expected_version}, got {current}"
)
# Get starting version
start_version = await conn.fetchval(
"SELECT COALESCE(MAX(version), 0) + 1 FROM events WHERE stream_id = $1",
stream_id
)
# Insert events
saved_events = []
for i, event in enumerate(events):
event.version = start_version + i
row = await conn.fetchrow(
"""
INSERT INTO events (id, stream_id, stream_type, event_type,
event_data, metadata, version, created_at)
VALUES ($1, $2, $3, $4, $5, $6, $7, $8)
RETURNING global_position
""",
event.event_id,
stream_id,
stream_type,
event.event_type,
json.dumps(event.data),
json.dumps(event.metadata),
event.version,
event.created_at
)
event.global_position = row['global_position']
saved_events.append(event)
return saved_events
async def read_stream(
self,
stream_id: str,
from_version: int = 0,
limit: int = 1000
) -> List[Event]:
"""Read events from a stream."""
async with self.pool.acquire() as conn:
rows = await conn.fetch(
"""
SELECT id, stream_id, event_type, event_data, metadata,
version, global_position, created_at
FROM events
WHERE stream_id = $1 AND version >= $2
ORDER BY version
LIMIT $3
""",
stream_id, from_version, limit
)
return [self._row_to_event(row) for row in rows]
async def read_all(
self,
from_position: int = 0,
limit: int = 1000
) -> List[Event]:
"""Read all events globally."""
async with self.pool.acquire() as conn:
rows = await conn.fetch(
"""
SELECT id, stream_id, event_type, event_data, metadata,
version, global_position, created_at
FROM events
WHERE global_position > $1
ORDER BY global_position
LIMIT $2
""",
from_position, limit
)
return [self._row_to_event(row) for row in rows]
async def subscribe(
    self,
    subscription_id: str,
    handler,
    from_position: int = 0,
    batch_size: int = 100
):
    """Subscribe to the global event log from a checkpointed position.

    Continuously polls for new events, invokes ``handler`` for each event
    in order, and persists the last processed position once per batch so
    the subscription resumes where it left off after a restart.

    Args:
        subscription_id: Unique name under which the checkpoint is stored.
        handler: Async callable invoked with each Event.
        from_position: Starting position used when no checkpoint exists.
        batch_size: Maximum number of events fetched per poll.
    """
    # Load the persisted checkpoint, if any.
    async with self.pool.acquire() as conn:
        checkpoint = await conn.fetchval(
            """
            SELECT last_position FROM subscription_checkpoints
            WHERE subscription_id = $1
            """,
            subscription_id
        )
    # Bug fix: `checkpoint or from_position` would discard a legitimate
    # stored checkpoint of 0, so compare against None explicitly.
    position = checkpoint if checkpoint is not None else from_position
    while True:
        events = await self.read_all(position, batch_size)
        if not events:
            await asyncio.sleep(1)  # Poll interval
            continue
        for event in events:
            await handler(event)
            position = event.global_position
        # Save checkpoint after the batch has been handled.
        async with self.pool.acquire() as conn:
            await conn.execute(
                """
                INSERT INTO subscription_checkpoints (subscription_id, last_position)
                VALUES ($1, $2)
                ON CONFLICT (subscription_id)
                DO UPDATE SET last_position = $2, updated_at = NOW()
                """,
                subscription_id, position
            )
def _row_to_event(self, row) -> Event:
    """Hydrate an Event value object from a database record."""
    payload = json.loads(row['event_data'])
    meta = json.loads(row['metadata'])
    return Event(
        event_id=row['id'],
        stream_id=row['stream_id'],
        event_type=row['event_type'],
        data=payload,
        metadata=meta,
        version=row['version'],
        global_position=row['global_position'],
        created_at=row['created_at']
    )
class ConcurrencyError(Exception):
    """Raised when an optimistic concurrency check fails during append."""
```
### Template 3: EventStoreDB Usage
```python
from esdbclient import EventStoreDBClient, NewEvent, StreamState
import json
# Connect
client = EventStoreDBClient(uri="esdb://localhost:2113?tls=false")
# Append events
def append_events(stream_name: str, events: list, expected_revision=None):
    """Append events to an EventStoreDB stream with an optional revision check."""
    payload = []
    for event in events:
        payload.append(
            NewEvent(
                type=event['type'],
                data=json.dumps(event['data']).encode(),
                metadata=json.dumps(event.get('metadata', {})).encode()
            )
        )
    # Translate the caller's expected revision into esdbclient's sentinel
    # values: None means "any state", -1 means "stream must not exist yet",
    # anything else is a concrete expected revision number.
    if expected_revision is None:
        state = StreamState.ANY
    elif expected_revision == -1:
        state = StreamState.NO_STREAM
    else:
        state = expected_revision
    return client.append_to_stream(
        stream_name=stream_name,
        events=payload,
        current_version=state
    )
# Read stream
def read_stream(stream_name: str, from_revision: int = 0):
    """Read a stream's events as plain dicts, starting at from_revision."""
    result = []
    recorded = client.get_stream(
        stream_name=stream_name,
        stream_position=from_revision
    )
    for event in recorded:
        meta = json.loads(event.metadata) if event.metadata else {}
        result.append({
            'type': event.type,
            'data': json.loads(event.data),
            'metadata': meta,
            'stream_position': event.stream_position,
            'commit_position': event.commit_position
        })
    return result
# Subscribe to all
async def subscribe_to_all(handler, from_position: int = 0):
    """Follow the $all stream from a commit position, invoking handler per event.

    Bug fix: esdbclient's ``subscribe_to_all()`` returns a *synchronous*
    catch-up subscription iterator, so iterating it with ``async for``
    raises TypeError. Consume it with a plain for-loop and await the async
    handler per event.
    """
    subscription = client.subscribe_to_all(commit_position=from_position)
    for event in subscription:  # sync iterator; `async for` would fail
        await handler({
            'type': event.type,
            'data': json.loads(event.data),
            'stream_id': event.stream_name,
            'position': event.commit_position
        })
# Category projection ($ce-Category)
def read_category(category: str):
    """Read every event for a category via the $ce-<category> system projection."""
    stream_name = f"$ce-{category}"
    return read_stream(stream_name)
```
### Template 4: DynamoDB Event Store
```python
import boto3
from boto3.dynamodb.conditions import Key
from datetime import datetime
import json
import uuid
class DynamoEventStore:
    """Event store backed by a single DynamoDB table.

    Streams are modeled as item collections: PK = "STREAM#{stream_id}",
    SK = zero-padded "VERSION#{n}" so a Query returns events in version
    order. GSI1 gives a coarse global ordering by insert timestamp.
    """

    def __init__(self, table_name: str):
        self.dynamodb = boto3.resource('dynamodb')
        self.table = self.dynamodb.Table(table_name)

    def append_events(self, stream_id: str, events: list, expected_version: int = None):
        """Append numbered events to a stream.

        NOTE(review): despite the original "conditional write" comment,
        ``batch_writer()`` does not support ConditionExpressions, so
        expected_version only seeds the version numbering -- there is NO
        optimistic-concurrency check here. Concurrent writers can collide
        on the same SK; confirm whether a conditional put_item per event
        (attribute_not_exists(SK)) is required.
        """
        with self.table.batch_writer() as batch:
            for i, event in enumerate(events):
                version = (expected_version or 0) + i + 1
                item = {
                    'PK': f"STREAM#{stream_id}",
                    # Zero-pad to 20 digits so lexicographic SK order == numeric order.
                    'SK': f"VERSION#{version:020d}",
                    'GSI1PK': 'EVENTS',
                    'GSI1SK': datetime.utcnow().isoformat(),
                    'event_id': str(uuid.uuid4()),
                    'stream_id': stream_id,
                    'event_type': event['type'],
                    'event_data': json.dumps(event['data']),
                    'version': version,
                    'created_at': datetime.utcnow().isoformat()
                }
                batch.put_item(Item=item)
        return events

    def read_stream(self, stream_id: str, from_version: int = 0):
        """Read a stream's events from from_version onward.

        NOTE(review): only the first Query page is returned; large streams
        (>1 MB of items) need LastEvaluatedKey pagination.
        """
        response = self.table.query(
            KeyConditionExpression=Key('PK').eq(f"STREAM#{stream_id}") &
                                   Key('SK').gte(f"VERSION#{from_version:020d}")
        )
        return [
            {
                'event_type': item['event_type'],
                'data': json.loads(item['event_data']),
                'version': item['version']
            }
            for item in response['Items']
        ]
# Table definition (CloudFormation/Terraform)
"""
DynamoDB Table:
- PK (Partition Key): String
- SK (Sort Key): String
- GSI1PK, GSI1SK for global ordering
Capacity: On-demand or provisioned based on throughput needs
"""
```
## Best Practices
### Do's
- **Use stream IDs that include aggregate type** - `Order-{uuid}`
- **Include correlation/causation IDs** - For tracing
- **Version events from day one** - Plan for schema evolution
- **Implement idempotency** - Use event IDs for deduplication
- **Index appropriately** - For your query patterns
### Don'ts
- **Don't update or delete events** - They're immutable facts
- **Don't store large payloads** - Keep events small
- **Don't skip optimistic concurrency** - Prevents data corruption
- **Don't ignore backpressure** - Handle slow consumers
## Resources
- [EventStoreDB](https://www.eventstore.com/)
- [Marten Events](https://martendb.io/events/)
- [Event Sourcing Pattern](https://docs.microsoft.com/en-us/azure/architecture/patterns/event-sourcing)

View File

@@ -0,0 +1,488 @@
---
name: projection-patterns
description: Build read models and projections from event streams. Use when implementing CQRS read sides, building materialized views, or optimizing query performance in event-sourced systems.
---
# Projection Patterns
Comprehensive guide to building projections and read models for event-sourced systems.
## When to Use This Skill
- Building CQRS read models
- Creating materialized views from events
- Optimizing query performance
- Implementing real-time dashboards
- Building search indexes from events
- Aggregating data across streams
## Core Concepts
### 1. Projection Architecture
```
┌─────────────┐ ┌─────────────┐ ┌─────────────┐
│ Event Store │────►│ Projector │────►│ Read Model │
│ │ │ │ │ (Database) │
│ ┌─────────┐ │ │ ┌─────────┐ │ │ ┌─────────┐ │
│ │ Events │ │ │ │ Handler │ │ │ │ Tables │ │
│ └─────────┘ │ │ │ Logic │ │ │ │ Views │ │
│ │ │ └─────────┘ │ │ │ Cache │ │
└─────────────┘ └─────────────┘ └─────────────┘
```
### 2. Projection Types
| Type | Description | Use Case |
|------|-------------|----------|
| **Live** | Real-time from subscription | Current state queries |
| **Catchup** | Process historical events | Rebuilding read models |
| **Persistent** | Stores checkpoint | Resume after restart |
| **Inline** | Same transaction as write | Strong consistency |
## Templates
### Template 1: Basic Projector
```python
from abc import ABC, abstractmethod
from dataclasses import dataclass
from typing import Dict, Any, Callable, List
import asyncpg
@dataclass
class Event:
    # Minimal event envelope consumed by projections.
    stream_id: str
    event_type: str
    data: dict
    version: int          # per-stream sequence number
    global_position: int  # position in the global log; used as the checkpoint key
class Projection(ABC):
    """Base class for projections.

    Implementations declare which event types they consume (``handles``)
    and apply each one to a read model; a Projector drives them and
    checkpoints progress under ``name``.
    """

    @property
    @abstractmethod
    def name(self) -> str:
        """Unique projection name for checkpointing."""
        pass

    @abstractmethod
    def handles(self) -> List[str]:
        """List of event types this projection handles."""
        pass

    @abstractmethod
    async def apply(self, event: Event) -> None:
        """Apply event to the read model (idempotent, so replays are safe)."""
        pass
class Projector:
    """Runs registered projections against the event store.

    Each projection tracks its own checkpoint (keyed by its name) so it
    can resume independently after a restart or be rebuilt from scratch.
    """

    def __init__(self, event_store, checkpoint_store):
        self.event_store = event_store
        self.checkpoint_store = checkpoint_store
        self.projections: List[Projection] = []

    def register(self, projection: Projection):
        """Register a projection to be driven by run()."""
        self.projections.append(projection)

    async def run(self, batch_size: int = 100):
        """Run all projections continuously, polling in small batches."""
        # Local import: the snippet's module header imports only
        # abc/dataclasses/typing/asyncpg, not asyncio.
        import asyncio
        while True:
            for projection in self.projections:
                await self._run_projection(projection, batch_size)
            await asyncio.sleep(0.1)

    async def _run_projection(self, projection: Projection, batch_size: int):
        """Feed one batch of events to a single projection."""
        checkpoint = await self.checkpoint_store.get(projection.name)
        position = checkpoint or 0
        events = await self.event_store.read_all(position, batch_size)
        last_position = None
        for event in events:
            if event.event_type in projection.handles():
                await projection.apply(event)
            # Bug fix: advance past EVERY event, handled or not. The
            # original only checkpointed handled events, so a batch with
            # no matching event types never moved the checkpoint and was
            # re-read forever.
            last_position = event.global_position
        if last_position is not None:
            await self.checkpoint_store.save(projection.name, last_position)

    async def rebuild(self, projection: Projection):
        """Rebuild a projection from the start of the event log."""
        await self.checkpoint_store.delete(projection.name)
        # Optionally clear read model tables before replaying.
        await self._run_projection(projection, batch_size=1000)
```
### Template 2: Order Summary Projection
```python
class OrderSummaryProjection(Projection):
    """Projects order events to a denormalized per-order summary row.

    One row per order in ``order_summaries``; each event mutates status,
    totals, or timestamps in place.
    """

    def __init__(self, db_pool: asyncpg.Pool):
        self.pool = db_pool

    @property
    def name(self) -> str:
        return "order_summary"

    def handles(self) -> List[str]:
        return [
            "OrderCreated",
            "OrderItemAdded",
            "OrderItemRemoved",
            "OrderShipped",
            "OrderCompleted",
            "OrderCancelled"
        ]

    async def apply(self, event: Event) -> None:
        # Dispatch table: one private handler per event type in handles().
        handlers = {
            "OrderCreated": self._handle_created,
            "OrderItemAdded": self._handle_item_added,
            "OrderItemRemoved": self._handle_item_removed,
            "OrderShipped": self._handle_shipped,
            "OrderCompleted": self._handle_completed,
            "OrderCancelled": self._handle_cancelled,
        }
        handler = handlers.get(event.event_type)
        if handler:
            await handler(event)

    async def _handle_created(self, event: Event):
        # Seed the summary row with zeroed totals and 'pending' status.
        async with self.pool.acquire() as conn:
            await conn.execute(
                """
                INSERT INTO order_summaries
                (order_id, customer_id, status, total_amount, item_count, created_at)
                VALUES ($1, $2, $3, $4, $5, $6)
                """,
                event.data['order_id'],
                event.data['customer_id'],
                'pending',
                0,
                0,
                event.data['created_at']
            )

    async def _handle_item_added(self, event: Event):
        # Line total = price * quantity; item_count tracks lines, not units.
        async with self.pool.acquire() as conn:
            await conn.execute(
                """
                UPDATE order_summaries
                SET total_amount = total_amount + $2,
                    item_count = item_count + 1,
                    updated_at = NOW()
                WHERE order_id = $1
                """,
                event.data['order_id'],
                event.data['price'] * event.data['quantity']
            )

    async def _handle_item_removed(self, event: Event):
        # Mirror of _handle_item_added: subtract the same line total.
        async with self.pool.acquire() as conn:
            await conn.execute(
                """
                UPDATE order_summaries
                SET total_amount = total_amount - $2,
                    item_count = item_count - 1,
                    updated_at = NOW()
                WHERE order_id = $1
                """,
                event.data['order_id'],
                event.data['price'] * event.data['quantity']
            )

    async def _handle_shipped(self, event: Event):
        async with self.pool.acquire() as conn:
            await conn.execute(
                """
                UPDATE order_summaries
                SET status = 'shipped',
                    shipped_at = $2,
                    updated_at = NOW()
                WHERE order_id = $1
                """,
                event.data['order_id'],
                event.data['shipped_at']
            )

    async def _handle_completed(self, event: Event):
        async with self.pool.acquire() as conn:
            await conn.execute(
                """
                UPDATE order_summaries
                SET status = 'completed',
                    completed_at = $2,
                    updated_at = NOW()
                WHERE order_id = $1
                """,
                event.data['order_id'],
                event.data['completed_at']
            )

    async def _handle_cancelled(self, event: Event):
        # 'reason' is optional in the event payload.
        async with self.pool.acquire() as conn:
            await conn.execute(
                """
                UPDATE order_summaries
                SET status = 'cancelled',
                    cancelled_at = $2,
                    cancellation_reason = $3,
                    updated_at = NOW()
                WHERE order_id = $1
                """,
                event.data['order_id'],
                event.data['cancelled_at'],
                event.data.get('reason')
            )
```
### Template 3: Elasticsearch Search Projection
```python
from elasticsearch import AsyncElasticsearch
class ProductSearchProjection(Projection):
    """Projects product events into an Elasticsearch index for full-text search."""

    def __init__(self, es_client: AsyncElasticsearch):
        self.es = es_client
        self.index = "products"

    @property
    def name(self) -> str:
        return "product_search"

    def handles(self) -> List[str]:
        return [
            "ProductCreated",
            "ProductUpdated",
            "ProductPriceChanged",
            "ProductDeleted"
        ]

    async def apply(self, event: Event) -> None:
        # Dispatch to one private handler per event type.
        dispatch = {
            "ProductCreated": self._index_document,
            "ProductUpdated": self._update_document,
            "ProductPriceChanged": self._update_price,
            "ProductDeleted": self._delete_document,
        }
        handler = dispatch.get(event.event_type)
        if handler is not None:
            await handler(event)

    async def _index_document(self, event: Event) -> None:
        """Create the search document for a new product."""
        await self.es.index(
            index=self.index,
            id=event.data['product_id'],
            document={
                'name': event.data['name'],
                'description': event.data['description'],
                'category': event.data['category'],
                'price': event.data['price'],
                'tags': event.data.get('tags', []),
                'created_at': event.data['created_at']
            }
        )

    async def _update_document(self, event: Event) -> None:
        """Partially update descriptive fields (price handled separately)."""
        await self.es.update(
            index=self.index,
            id=event.data['product_id'],
            doc={
                'name': event.data['name'],
                'description': event.data['description'],
                'category': event.data['category'],
                'tags': event.data.get('tags', []),
                'updated_at': event.data['updated_at']
            }
        )

    async def _update_price(self, event: Event) -> None:
        """Apply a price change without touching other fields."""
        await self.es.update(
            index=self.index,
            id=event.data['product_id'],
            doc={
                'price': event.data['new_price'],
                'price_updated_at': event.data['changed_at']
            }
        )

    async def _delete_document(self, event: Event) -> None:
        """Remove the product from the search index."""
        await self.es.delete(
            index=self.index,
            id=event.data['product_id']
        )
```
### Template 4: Aggregating Projection
```python
class DailySalesProjection(Projection):
    """Aggregates sales data by day for reporting.

    Maintains one upserted row per calendar day in ``daily_sales``.
    """

    def __init__(self, db_pool: asyncpg.Pool):
        self.pool = db_pool

    @property
    def name(self) -> str:
        return "daily_sales"

    def handles(self) -> List[str]:
        return ["OrderCompleted", "OrderRefunded"]

    async def apply(self, event: Event) -> None:
        if event.event_type == "OrderCompleted":
            await self._increment_sales(event)
        elif event.event_type == "OrderRefunded":
            await self._decrement_sales(event)

    async def _increment_sales(self, event: Event):
        # Upsert the day's row, adding this order's totals.
        date = event.data['completed_at'][:10]  # YYYY-MM-DD
        async with self.pool.acquire() as conn:
            await conn.execute(
                """
                INSERT INTO daily_sales (date, total_orders, total_revenue, total_items)
                VALUES ($1, 1, $2, $3)
                ON CONFLICT (date) DO UPDATE SET
                    total_orders = daily_sales.total_orders + 1,
                    total_revenue = daily_sales.total_revenue + $2,
                    total_items = daily_sales.total_items + $3,
                    updated_at = NOW()
                """,
                date,
                event.data['total_amount'],
                event.data['item_count']
            )

    async def _decrement_sales(self, event: Event):
        # Refunds are attributed to the order's ORIGINAL completion date,
        # not the refund date, so the day's figures are corrected in place.
        # NOTE(review): silently no-ops if that day's row does not exist.
        date = event.data['original_completed_at'][:10]
        async with self.pool.acquire() as conn:
            await conn.execute(
                """
                UPDATE daily_sales SET
                    total_orders = total_orders - 1,
                    total_revenue = total_revenue - $2,
                    total_refunds = total_refunds + $2,
                    updated_at = NOW()
                WHERE date = $1
                """,
                date,
                event.data['refund_amount']
            )
```
### Template 5: Multi-Table Projection
```python
class CustomerActivityProjection(Projection):
    """Projects customer activity across multiple tables.

    All writes for one event share a single transaction so the tables
    stay mutually consistent even if a statement fails mid-way.
    """

    def __init__(self, db_pool: asyncpg.Pool):
        self.pool = db_pool

    @property
    def name(self) -> str:
        return "customer_activity"

    def handles(self) -> List[str]:
        return [
            "CustomerCreated",
            "OrderCompleted",
            "ReviewSubmitted",
            "CustomerTierChanged"
        ]

    async def apply(self, event: Event) -> None:
        async with self.pool.acquire() as conn:
            async with conn.transaction():
                if event.event_type == "CustomerCreated":
                    # Insert into customers table (everyone starts at 'bronze').
                    await conn.execute(
                        """
                        INSERT INTO customers (customer_id, email, name, tier, created_at)
                        VALUES ($1, $2, $3, 'bronze', $4)
                        """,
                        event.data['customer_id'],
                        event.data['email'],
                        event.data['name'],
                        event.data['created_at']
                    )
                    # Initialize activity summary with zeroed counters.
                    await conn.execute(
                        """
                        INSERT INTO customer_activity_summary
                        (customer_id, total_orders, total_spent, total_reviews)
                        VALUES ($1, 0, 0, 0)
                        """,
                        event.data['customer_id']
                    )
                elif event.event_type == "OrderCompleted":
                    # Update rolling totals on the summary row.
                    await conn.execute(
                        """
                        UPDATE customer_activity_summary SET
                            total_orders = total_orders + 1,
                            total_spent = total_spent + $2,
                            last_order_at = $3
                        WHERE customer_id = $1
                        """,
                        event.data['customer_id'],
                        event.data['total_amount'],
                        event.data['completed_at']
                    )
                    # Append to the per-customer order history.
                    await conn.execute(
                        """
                        INSERT INTO customer_order_history
                        (customer_id, order_id, amount, completed_at)
                        VALUES ($1, $2, $3, $4)
                        """,
                        event.data['customer_id'],
                        event.data['order_id'],
                        event.data['total_amount'],
                        event.data['completed_at']
                    )
                elif event.event_type == "ReviewSubmitted":
                    await conn.execute(
                        """
                        UPDATE customer_activity_summary SET
                            total_reviews = total_reviews + 1,
                            last_review_at = $2
                        WHERE customer_id = $1
                        """,
                        event.data['customer_id'],
                        event.data['submitted_at']
                    )
                elif event.event_type == "CustomerTierChanged":
                    await conn.execute(
                        """
                        UPDATE customers SET tier = $2, updated_at = NOW()
                        WHERE customer_id = $1
                        """,
                        event.data['customer_id'],
                        event.data['new_tier']
                    )
```
## Best Practices
### Do's
- **Make projections idempotent** - Safe to replay
- **Use transactions** - For multi-table updates
- **Store checkpoints** - Resume after failures
- **Monitor lag** - Alert on projection delays
- **Plan for rebuilds** - Design for reconstruction
### Don'ts
- **Don't couple projections** - Each is independent
- **Don't skip error handling** - Log and alert on failures
- **Don't ignore ordering** - Events must be processed in order
- **Don't over-normalize** - Denormalize for query patterns
## Resources
- [CQRS Pattern](https://docs.microsoft.com/en-us/azure/architecture/patterns/cqrs)
- [Projection Building Blocks](https://zimarev.com/blog/event-sourcing/projections/)

View File

@@ -0,0 +1,482 @@
---
name: saga-orchestration
description: Implement saga patterns for distributed transactions and cross-aggregate workflows. Use when coordinating multi-step business processes, handling compensating transactions, or managing long-running workflows.
---
# Saga Orchestration
Patterns for managing distributed transactions and long-running business processes.
## When to Use This Skill
- Coordinating multi-service transactions
- Implementing compensating transactions
- Managing long-running business workflows
- Handling failures in distributed systems
- Building order fulfillment processes
- Implementing approval workflows
## Core Concepts
### 1. Saga Types
```
Choreography Orchestration
┌─────┐ ┌─────┐ ┌─────┐ ┌─────────────┐
│Svc A│─►│Svc B│─►│Svc C│ │ Orchestrator│
└─────┘ └─────┘ └─────┘ └──────┬──────┘
│ │ │ │
▼ ▼ ▼ ┌─────┼─────┐
Event Event Event ▼ ▼ ▼
┌────┐┌────┐┌────┐
│Svc1││Svc2││Svc3│
└────┘└────┘└────┘
```
### 2. Saga Execution States
| State | Description |
|-------|-------------|
| **Started** | Saga initiated |
| **Pending** | Waiting for step completion |
| **Compensating** | Rolling back due to failure |
| **Completed** | All steps succeeded |
| **Failed** | Saga failed after compensation |
## Templates
### Template 1: Saga Orchestrator Base
```python
from abc import ABC, abstractmethod
from dataclasses import dataclass, field
from enum import Enum
from typing import List, Dict, Any, Optional
from datetime import datetime
import uuid
class SagaState(Enum):
    # Lifecycle of a saga instance; string values are what the saga store persists.
    STARTED = "started"
    PENDING = "pending"            # waiting on the current step to report back
    COMPENSATING = "compensating"  # a step failed; undoing completed steps
    COMPLETED = "completed"
    FAILED = "failed"              # compensation finished; saga gave up
@dataclass
class SagaStep:
    """One step in a saga: a forward action plus its compensating action."""
    name: str
    action: str        # command/event published to execute the step
    compensation: str  # command/event published to undo a completed step
    status: str = "pending"
    result: Optional[Dict] = None
    error: Optional[str] = None
    executed_at: Optional[datetime] = None
    compensated_at: Optional[datetime] = None
    # Consistency fix: TimeoutSagaOrchestrator assigns step.timeout_at, but the
    # original dataclass had no such field; declare it (default None) so the
    # attribute is part of the step's schema and serializes with the rest.
    timeout_at: Optional[datetime] = None
@dataclass
class Saga:
    """Persistent state of a single saga instance."""
    saga_id: str
    saga_type: str
    state: SagaState
    data: Dict[str, Any]   # business payload forwarded to every step
    steps: List[SagaStep]
    current_step: int = 0  # index of the next step to execute
    created_at: datetime = field(default_factory=datetime.utcnow)
    updated_at: datetime = field(default_factory=datetime.utcnow)
class SagaOrchestrator(ABC):
    """Base class for saga orchestrators.

    Subclasses declare an ordered list of steps; this class drives them
    one at a time by publishing each step's action, and on failure
    publishes the compensations of already-completed steps in reverse
    order. Participating services report back via handle_step_completed /
    handle_step_failed / handle_compensation_completed.
    """

    def __init__(self, saga_store, event_publisher):
        self.saga_store = saga_store
        self.event_publisher = event_publisher

    @abstractmethod
    def define_steps(self, data: Dict) -> List[SagaStep]:
        """Define the saga steps."""
        pass

    @property
    @abstractmethod
    def saga_type(self) -> str:
        """Unique saga type identifier."""
        pass

    async def start(self, data: Dict) -> Saga:
        """Start a new saga and kick off its first step."""
        saga = Saga(
            saga_id=str(uuid.uuid4()),
            saga_type=self.saga_type,
            state=SagaState.STARTED,
            data=data,
            steps=self.define_steps(data)
        )
        await self.saga_store.save(saga)
        await self._execute_next_step(saga)
        return saga

    async def handle_step_completed(self, saga_id: str, step_name: str, result: Dict):
        """Handle successful step completion: record result, advance or finish."""
        saga = await self.saga_store.get(saga_id)
        # Update step
        for step in saga.steps:
            if step.name == step_name:
                step.status = "completed"
                step.result = result
                step.executed_at = datetime.utcnow()
                break
        saga.current_step += 1
        saga.updated_at = datetime.utcnow()
        # Check if saga is complete
        if saga.current_step >= len(saga.steps):
            saga.state = SagaState.COMPLETED
            await self.saga_store.save(saga)
            await self._on_saga_completed(saga)
        else:
            saga.state = SagaState.PENDING
            await self.saga_store.save(saga)
            await self._execute_next_step(saga)

    async def handle_step_failed(self, saga_id: str, step_name: str, error: str):
        """Handle step failure - record the error and start compensation."""
        saga = await self.saga_store.get(saga_id)
        # Mark step as failed
        for step in saga.steps:
            if step.name == step_name:
                step.status = "failed"
                step.error = error
                break
        saga.state = SagaState.COMPENSATING
        saga.updated_at = datetime.utcnow()
        await self.saga_store.save(saga)
        # Start compensation from current step backwards
        await self._compensate(saga)

    async def _execute_next_step(self, saga: Saga):
        """Publish the command for the next pending step (no-op if done)."""
        if saga.current_step >= len(saga.steps):
            return
        step = saga.steps[saga.current_step]
        step.status = "executing"
        await self.saga_store.save(saga)
        # Publish command to execute step; the owning service replies via
        # handle_step_completed / handle_step_failed.
        await self.event_publisher.publish(
            step.action,
            {
                "saga_id": saga.saga_id,
                "step_name": step.name,
                **saga.data
            }
        )

    async def _compensate(self, saga: Saga):
        """Publish compensating commands for all completed steps."""
        # Compensate in reverse order.
        # NOTE(review): compensations are published without awaiting each
        # one's completion, so they may execute concurrently across
        # services — confirm that is acceptable for the participants.
        for i in range(saga.current_step - 1, -1, -1):
            step = saga.steps[i]
            if step.status == "completed":
                step.status = "compensating"
                await self.saga_store.save(saga)
                await self.event_publisher.publish(
                    step.compensation,
                    {
                        "saga_id": saga.saga_id,
                        "step_name": step.name,
                        "original_result": step.result,
                        **saga.data
                    }
                )

    async def handle_compensation_completed(self, saga_id: str, step_name: str):
        """Record one compensation; mark the saga failed once all are done."""
        saga = await self.saga_store.get(saga_id)
        for step in saga.steps:
            if step.name == step_name:
                step.status = "compensated"
                step.compensated_at = datetime.utcnow()
                break
        # Check if all compensations complete:
        # "pending" = step never executed, "failed" = the step that
        # triggered rollback; neither needs compensating.
        all_compensated = all(
            s.status in ("compensated", "pending", "failed")
            for s in saga.steps
        )
        if all_compensated:
            saga.state = SagaState.FAILED
            await self._on_saga_failed(saga)
        await self.saga_store.save(saga)

    async def _on_saga_completed(self, saga: Saga):
        """Called when saga completes successfully."""
        await self.event_publisher.publish(
            f"{self.saga_type}Completed",
            {"saga_id": saga.saga_id, **saga.data}
        )

    async def _on_saga_failed(self, saga: Saga):
        """Called when saga fails after compensation."""
        await self.event_publisher.publish(
            f"{self.saga_type}Failed",
            {"saga_id": saga.saga_id, "error": "Saga failed", **saga.data}
        )
```
### Template 2: Order Fulfillment Saga
```python
class OrderFulfillmentSaga(SagaOrchestrator):
    """Orchestrates order fulfillment across services."""

    # (step name, forward action, compensating action)
    _STEPS = [
        ("reserve_inventory",
         "InventoryService.ReserveItems",
         "InventoryService.ReleaseReservation"),
        ("process_payment",
         "PaymentService.ProcessPayment",
         "PaymentService.RefundPayment"),
        ("create_shipment",
         "ShippingService.CreateShipment",
         "ShippingService.CancelShipment"),
        ("send_confirmation",
         "NotificationService.SendOrderConfirmation",
         "NotificationService.SendCancellationNotice"),
    ]

    @property
    def saga_type(self) -> str:
        return "OrderFulfillment"

    def define_steps(self, data: Dict) -> List[SagaStep]:
        """Build the fixed fulfillment pipeline from the step table."""
        return [
            SagaStep(name=step_name, action=action, compensation=compensation)
            for step_name, action, compensation in self._STEPS
        ]
# Usage
async def create_order(order_data: Dict):
    """Kick off an OrderFulfillmentSaga for a newly submitted order."""
    saga = OrderFulfillmentSaga(saga_store, event_publisher)
    keys = ("order_id", "customer_id", "items", "payment_method", "shipping_address")
    payload = {key: order_data[key] for key in keys}
    return await saga.start(payload)
# Event handlers in each service
class InventoryService:
    """Saga participant: executes the reserve_inventory step and its compensation.

    NOTE(review): self.reserve, self.release_reservation, self.event_publisher
    and InsufficientInventoryError are assumed to be defined elsewhere in the
    service -- this snippet shows only the saga-facing handlers.
    """

    async def handle_reserve_items(self, command: Dict):
        """Forward action: reserve stock, then report step success or failure."""
        try:
            # Reserve inventory
            reservation = await self.reserve(
                command["items"],
                command["order_id"]
            )
            # Report success
            await self.event_publisher.publish(
                "SagaStepCompleted",
                {
                    "saga_id": command["saga_id"],
                    "step_name": "reserve_inventory",
                    "result": {"reservation_id": reservation.id}
                }
            )
        except InsufficientInventoryError as e:
            # Report failure so the orchestrator starts compensation.
            await self.event_publisher.publish(
                "SagaStepFailed",
                {
                    "saga_id": command["saga_id"],
                    "step_name": "reserve_inventory",
                    "error": str(e)
                }
            )

    async def handle_release_reservation(self, command: Dict):
        """Compensating action: undo a reservation recorded in the step result."""
        # Compensating action
        await self.release_reservation(
            command["original_result"]["reservation_id"]
        )
        await self.event_publisher.publish(
            "SagaCompensationCompleted",
            {
                "saga_id": command["saga_id"],
                "step_name": "reserve_inventory"
            }
        )
```
### Template 3: Choreography-Based Saga
```python
from dataclasses import dataclass
from typing import Dict, Any
import asyncio
@dataclass
class SagaContext:
    """Passed through choreographed saga events.

    NOTE(review): declared for illustration; the handlers below exchange
    plain dicts rather than this object -- confirm the intended usage.
    """
    saga_id: str
    step: int
    data: Dict[str, Any]
    completed_steps: list
class OrderChoreographySaga:
    """Choreography-based saga using events.

    No central orchestrator: each handler reacts to the previous service's
    event and publishes the command for the next step; compensation
    handlers react to failure events and unwind completed work.
    """

    def __init__(self, event_bus):
        self.event_bus = event_bus
        self._register_handlers()

    def _register_handlers(self):
        # Happy-path chain: OrderCreated -> ... -> OrderFulfilled.
        self.event_bus.subscribe("OrderCreated", self._on_order_created)
        self.event_bus.subscribe("InventoryReserved", self._on_inventory_reserved)
        self.event_bus.subscribe("PaymentProcessed", self._on_payment_processed)
        self.event_bus.subscribe("ShipmentCreated", self._on_shipment_created)
        # Compensation handlers
        self.event_bus.subscribe("PaymentFailed", self._on_payment_failed)
        self.event_bus.subscribe("ShipmentFailed", self._on_shipment_failed)

    async def _on_order_created(self, event: Dict):
        """Step 1: Order created, reserve inventory."""
        # The order id doubles as the saga correlation id.
        await self.event_bus.publish("ReserveInventory", {
            "saga_id": event["order_id"],
            "order_id": event["order_id"],
            "items": event["items"]
        })

    async def _on_inventory_reserved(self, event: Dict):
        """Step 2: Inventory reserved, process payment."""
        await self.event_bus.publish("ProcessPayment", {
            "saga_id": event["saga_id"],
            "order_id": event["order_id"],
            "amount": event["total_amount"],
            "reservation_id": event["reservation_id"]
        })

    async def _on_payment_processed(self, event: Dict):
        """Step 3: Payment done, create shipment."""
        await self.event_bus.publish("CreateShipment", {
            "saga_id": event["saga_id"],
            "order_id": event["order_id"],
            "payment_id": event["payment_id"]
        })

    async def _on_shipment_created(self, event: Dict):
        """Step 4: Complete - send confirmation."""
        await self.event_bus.publish("OrderFulfilled", {
            "saga_id": event["saga_id"],
            "order_id": event["order_id"],
            "tracking_number": event["tracking_number"]
        })

    # Compensation handlers
    async def _on_payment_failed(self, event: Dict):
        """Payment failed - release inventory and fail the order."""
        await self.event_bus.publish("ReleaseInventory", {
            "saga_id": event["saga_id"],
            "reservation_id": event["reservation_id"]
        })
        await self.event_bus.publish("OrderFailed", {
            "order_id": event["order_id"],
            "reason": "Payment failed"
        })

    async def _on_shipment_failed(self, event: Dict):
        """Shipment failed - refund payment and release inventory."""
        await self.event_bus.publish("RefundPayment", {
            "saga_id": event["saga_id"],
            "payment_id": event["payment_id"]
        })
        await self.event_bus.publish("ReleaseInventory", {
            "saga_id": event["saga_id"],
            "reservation_id": event["reservation_id"]
        })
```
### Template 4: Saga with Timeouts
```python
class TimeoutSagaOrchestrator(SagaOrchestrator):
    """Saga orchestrator with per-step timeouts.

    Wraps step execution with a scheduled deadline check: a step still in
    the "executing" state when its deadline fires is failed, which triggers
    the base class's compensation flow.
    """

    def __init__(self, saga_store, event_publisher, scheduler):
        super().__init__(saga_store, event_publisher)
        self.scheduler = scheduler

    async def _execute_next_step(self, saga: Saga):
        """Publish the next step's command and schedule its timeout check."""
        # Bug fix: the template's module header only imports `datetime`,
        # not `timedelta`; bring it into scope locally.
        from datetime import timedelta

        if saga.current_step >= len(saga.steps):
            return
        step = saga.steps[saga.current_step]
        step.status = "executing"
        # Deadline after which the step is considered timed out.
        step.timeout_at = datetime.utcnow() + timedelta(minutes=5)
        await self.saga_store.save(saga)
        # Schedule timeout check to fire at the deadline.
        await self.scheduler.schedule(
            f"saga_timeout_{saga.saga_id}_{step.name}",
            self._check_timeout,
            {"saga_id": saga.saga_id, "step_name": step.name},
            run_at=step.timeout_at
        )
        await self.event_publisher.publish(
            step.action,
            {"saga_id": saga.saga_id, "step_name": step.name, **saga.data}
        )

    async def _check_timeout(self, data: Dict):
        """Fail the step if it is still executing when the deadline fires."""
        saga = await self.saga_store.get(data["saga_id"])
        step = next(s for s in saga.steps if s.name == data["step_name"])
        if step.status == "executing":
            # Step timed out - fail it and let the base class compensate.
            await self.handle_step_failed(
                data["saga_id"],
                data["step_name"],
                "Step timed out"
            )
```
## Best Practices
### Do's
- **Make steps idempotent** - Safe to retry
- **Design compensations carefully** - They must work
- **Use correlation IDs** - For tracing across services
- **Implement timeouts** - Don't wait forever
- **Log everything** - For debugging failures
### Don'ts
- **Don't assume instant completion** - Sagas take time
- **Don't skip compensation testing** - Most critical part
- **Don't couple services** - Use async messaging
- **Don't ignore partial failures** - Handle gracefully
## Resources
- [Saga Pattern](https://microservices.io/patterns/data/saga.html)
- [Designing Data-Intensive Applications](https://dataintensive.net/)

View File

@@ -0,0 +1,423 @@
---
name: data-storytelling
description: Transform data into compelling narratives using visualization, context, and persuasive structure. Use when presenting analytics to stakeholders, creating data reports, or building executive presentations.
---
# Data Storytelling
Transform raw data into compelling narratives that drive decisions and inspire action.
## When to Use This Skill
- Presenting analytics to executives
- Creating quarterly business reviews
- Building investor presentations
- Writing data-driven reports
- Communicating insights to non-technical audiences
- Making recommendations based on data
## Core Concepts
### 1. Story Structure
```
Setup → Conflict → Resolution
Setup: Context and baseline
Conflict: The problem or opportunity
Resolution: Insights and recommendations
```
### 2. Narrative Arc
```
1. Hook: Grab attention with surprising insight
2. Context: Establish the baseline
3. Rising Action: Build through data points
4. Climax: The key insight
5. Resolution: Recommendations
6. Call to Action: Next steps
```
### 3. Three Pillars
| Pillar | Purpose | Components |
|--------|---------|------------|
| **Data** | Evidence | Numbers, trends, comparisons |
| **Narrative** | Meaning | Context, causation, implications |
| **Visuals** | Clarity | Charts, diagrams, highlights |
## Story Frameworks
### Framework 1: The Problem-Solution Story
```markdown
# Customer Churn Analysis
## The Hook
"We're losing $2.4M annually to preventable churn."
## The Context
- Current churn rate: 8.5% (industry average: 5%)
- Average customer lifetime value: $4,800
- 500 customers churned last quarter
## The Problem
Analysis of churned customers reveals a pattern:
- 73% churned within first 90 days
- Common factor: < 3 support interactions
- Low feature adoption in first month
## The Insight
[Show engagement curve visualization]
Customers who don't engage in the first 14 days
are 4x more likely to churn.
## The Solution
1. Implement 14-day onboarding sequence
2. Proactive outreach at day 7
3. Feature adoption tracking
## Expected Impact
- Reduce early churn by 40%
- Save $960K annually
- Payback period: 3 months
## Call to Action
Approve $50K budget for onboarding automation.
```
### Framework 2: The Trend Story
```markdown
# Q4 Performance Analysis
## Where We Started
Q3 ended with $1.2M MRR, 15% below target.
Team morale was low after missed goals.
## What Changed
[Timeline visualization]
- Oct: Launched self-serve pricing
- Nov: Reduced friction in signup
- Dec: Added customer success calls
## The Transformation
[Before/after comparison chart]
| Metric | Q3 | Q4 | Change |
|----------------|--------|--------|--------|
| Trial → Paid | 8% | 15% | +87% |
| Time to Value | 14 days| 5 days | -64% |
| Expansion Rate | 2% | 8% | +300% |
## Key Insight
Self-serve + high-touch creates compound growth.
Customers who self-serve AND get a success call
have 3x higher expansion rate.
## Going Forward
Double down on hybrid model.
Target: $1.8M MRR by Q2.
```
### Framework 3: The Comparison Story
```markdown
# Market Opportunity Analysis
## The Question
Should we expand into EMEA or APAC first?
## The Comparison
[Side-by-side market analysis]
### EMEA
- Market size: $4.2B
- Growth rate: 8%
- Competition: High
- Regulatory: Complex (GDPR)
- Language: Multiple
### APAC
- Market size: $3.8B
- Growth rate: 15%
- Competition: Moderate
- Regulatory: Varied
- Language: Multiple
## The Analysis
[Weighted scoring matrix visualization]
| Factor | Weight | EMEA Score | APAC Score |
|-------------|--------|------------|------------|
| Market Size | 25% | 5 | 4 |
| Growth | 30% | 3 | 5 |
| Competition | 20% | 2 | 4 |
| Ease | 25% | 2 | 3 |
| **Total** | | **2.9** | **4.1** |
## The Recommendation
APAC first. Higher growth, less competition.
Start with Singapore hub (English, business-friendly).
Enter EMEA in Year 2 with localization ready.
## Risk Mitigation
- Timezone coverage: Hire 24/7 support
- Cultural fit: Local partnerships
- Payment: Multi-currency from day 1
```
## Visualization Techniques
### Technique 1: Progressive Reveal
```markdown
Start simple, add layers:
Slide 1: "Revenue is growing" [single line chart]
Slide 2: "But growth is slowing" [add growth rate overlay]
Slide 3: "Driven by one segment" [add segment breakdown]
Slide 4: "Which is saturating" [add market share]
Slide 5: "We need new segments" [add opportunity zones]
```
### Technique 2: Contrast and Compare
```markdown
Before/After:
┌─────────────────┬─────────────────┐
│ BEFORE │ AFTER │
│ │ │
│ Process: 5 days│ Process: 1 day │
│ Errors: 15% │ Errors: 2% │
│ Cost: $50/unit │ Cost: $20/unit │
└─────────────────┴─────────────────┘
This/That (emphasize difference):
┌─────────────────────────────────────┐
│ CUSTOMER A vs B │
│ ┌──────────┐ ┌──────────┐ │
│ │ ████████ │ │ ██ │ │
│ │ $45,000 │ │ $8,000 │ │
│ │ LTV │ │ LTV │ │
│ └──────────┘ └──────────┘ │
│ Onboarded No onboarding │
└─────────────────────────────────────┘
```
### Technique 3: Annotation and Highlight
```python
import matplotlib.pyplot as plt
import pandas as pd
fig, ax = plt.subplots(figsize=(12, 6))
# Plot the main data
ax.plot(dates, revenue, linewidth=2, color='#2E86AB')
# Add annotation for key events
ax.annotate(
'Product Launch\n+32% spike',
xy=(launch_date, launch_revenue),
xytext=(launch_date, launch_revenue * 1.2),
fontsize=10,
arrowprops=dict(arrowstyle='->', color='#E63946'),
color='#E63946'
)
# Highlight a region
ax.axvspan(growth_start, growth_end, alpha=0.2, color='green',
label='Growth Period')
# Add threshold line
ax.axhline(y=target, color='gray', linestyle='--',
label=f'Target: ${target:,.0f}')
ax.set_title('Revenue Growth Story', fontsize=14, fontweight='bold')
ax.legend()
```
## Presentation Templates
### Template 1: Executive Summary Slide
```
┌─────────────────────────────────────────────────────────────┐
│ KEY INSIGHT │
│ ══════════════════════════════════════════════════════════│
│ │
│ "Customers who complete onboarding in week 1 │
│ have 3x higher lifetime value" │
│ │
├──────────────────────┬──────────────────────────────────────┤
│ │ │
│ THE DATA │ THE IMPLICATION │
│ │ │
│ Week 1 completers: │ ✓ Prioritize onboarding UX │
│ • LTV: $4,500 │ ✓ Add day-1 success milestones │
│ • Retention: 85% │ ✓ Proactive week-1 outreach │
│ • NPS: 72 │ │
│ │ Investment: $75K │
│ Others: │ Expected ROI: 8x │
│ • LTV: $1,500 │ │
│ • Retention: 45% │ │
│ • NPS: 34 │ │
│ │ │
└──────────────────────┴──────────────────────────────────────┘
```
### Template 2: Data Story Flow
```
Slide 1: THE HEADLINE
"We can grow 40% faster by fixing onboarding"
Slide 2: THE CONTEXT
Current state metrics
Industry benchmarks
Gap analysis
Slide 3: THE DISCOVERY
What the data revealed
Surprising finding
Pattern identification
Slide 4: THE DEEP DIVE
Root cause analysis
Segment breakdowns
Statistical significance
Slide 5: THE RECOMMENDATION
Proposed actions
Resource requirements
Timeline
Slide 6: THE IMPACT
Expected outcomes
ROI calculation
Risk assessment
Slide 7: THE ASK
Specific request
Decision needed
Next steps
```
### Template 3: One-Page Dashboard Story
```markdown
# Monthly Business Review: January 2024
## THE HEADLINE
Revenue up 15% but CAC increasing faster than LTV
## KEY METRICS AT A GLANCE
┌────────┬────────┬────────┬────────┐
│ MRR │ NRR │ CAC │ LTV │
│ $125K │ 108% │ $450 │ $2,200 │
│ ▲15% │ ▲3% │ ▲22% │ ▲8% │
└────────┴────────┴────────┴────────┘
## WHAT'S WORKING
✓ Enterprise segment growing 25% MoM
✓ Referral program driving 30% of new logos
✓ Support satisfaction at all-time high (94%)
## WHAT NEEDS ATTENTION
✗ SMB acquisition cost up 40%
✗ Trial conversion down 5 points
✗ Time-to-value increased by 3 days
## ROOT CAUSE
[Mini chart showing SMB vs Enterprise CAC trend]
SMB paid ads becoming less efficient.
CPC up 35% while conversion flat.
## RECOMMENDATION
1. Shift $20K/mo from paid to content
2. Launch SMB self-serve trial
3. A/B test shorter onboarding
## NEXT MONTH'S FOCUS
- Launch content marketing pilot
- Complete self-serve MVP
- Reduce time-to-value to < 7 days
```
## Writing Techniques
### Headlines That Work
```markdown
BAD: "Q4 Sales Analysis"
GOOD: "Q4 Sales Beat Target by 23% - Here's Why"
BAD: "Customer Churn Report"
GOOD: "We're Losing $2.4M to Preventable Churn"
BAD: "Marketing Performance"
GOOD: "Content Marketing Delivers 4x ROI vs. Paid"
Formula:
[Specific Number] + [Business Impact] + [Actionable Context]
```
### Transition Phrases
```markdown
Building the narrative:
• "This leads us to ask..."
• "When we dig deeper..."
• "The pattern becomes clear when..."
• "Contrast this with..."
Introducing insights:
• "The data reveals..."
• "What surprised us was..."
• "The inflection point came when..."
• "The key finding is..."
Moving to action:
• "This insight suggests..."
• "Based on this analysis..."
• "The implication is clear..."
• "Our recommendation is..."
```
### Handling Uncertainty
```markdown
Acknowledge limitations:
• "With 95% confidence, we can say..."
• "The sample size of 500 shows..."
• "While correlation is strong, causation requires..."
• "This trend holds for [segment], though [caveat]..."
Present ranges:
• "Impact estimate: $400K-$600K"
• "Confidence interval: 15-20% improvement"
• "Best case: X, Conservative: Y"
```
## Best Practices
### Do's
- **Start with the "so what"** - Lead with insight
- **Use the rule of three** - Three points, three comparisons
- **Show, don't tell** - Let data speak
- **Make it personal** - Connect to audience goals
- **End with action** - Clear next steps
### Don'ts
- **Don't data dump** - Curate ruthlessly
- **Don't bury the insight** - Front-load key findings
- **Don't use jargon** - Match audience vocabulary
- **Don't show methodology first** - Context, then method
- **Don't forget the narrative** - Numbers need meaning
## Resources
- [Storytelling with Data (Cole Nussbaumer)](https://www.storytellingwithdata.com/)
- [The Pyramid Principle (Barbara Minto)](https://www.amazon.com/Pyramid-Principle-Logic-Writing-Thinking/dp/0273710516)
- [Resonate (Nancy Duarte)](https://www.duarte.com/resonate/)

View File

@@ -0,0 +1,426 @@
---
name: kpi-dashboard-design
description: Design effective KPI dashboards with metrics selection, visualization best practices, and real-time monitoring patterns. Use when building business dashboards, selecting metrics, or designing data visualization layouts.
---
# KPI Dashboard Design
Comprehensive patterns for designing effective Key Performance Indicator (KPI) dashboards that drive business decisions.
## When to Use This Skill
- Designing executive dashboards
- Selecting meaningful KPIs
- Building real-time monitoring displays
- Creating department-specific metrics views
- Improving existing dashboard layouts
- Establishing metric governance
## Core Concepts
### 1. KPI Framework
| Level | Focus | Update Frequency | Audience |
|-------|-------|------------------|----------|
| **Strategic** | Long-term goals | Monthly/Quarterly | Executives |
| **Tactical** | Department goals | Weekly/Monthly | Managers |
| **Operational** | Day-to-day | Real-time/Daily | Teams |
### 2. SMART KPIs
```
Specific: Clear definition
Measurable: Quantifiable
Achievable: Realistic targets
Relevant: Aligned to goals
Time-bound: Defined period
```
### 3. Dashboard Hierarchy
```
├── Executive Summary (1 page)
│ ├── 4-6 headline KPIs
│ ├── Trend indicators
│ └── Key alerts
├── Department Views
│ ├── Sales Dashboard
│ ├── Marketing Dashboard
│ ├── Operations Dashboard
│ └── Finance Dashboard
└── Detailed Drilldowns
├── Individual metrics
└── Root cause analysis
```
## Common KPIs by Department
### Sales KPIs
```yaml
Revenue Metrics:
- Monthly Recurring Revenue (MRR)
- Annual Recurring Revenue (ARR)
- Average Revenue Per User (ARPU)
- Revenue Growth Rate
Pipeline Metrics:
- Sales Pipeline Value
- Win Rate
- Average Deal Size
- Sales Cycle Length
Activity Metrics:
- Calls/Emails per Rep
- Demos Scheduled
- Proposals Sent
- Close Rate
```
### Marketing KPIs
```yaml
Acquisition:
- Cost Per Acquisition (CPA)
- Customer Acquisition Cost (CAC)
- Lead Volume
- Marketing Qualified Leads (MQL)
Engagement:
- Website Traffic
- Conversion Rate
- Email Open/Click Rate
- Social Engagement
ROI:
- Marketing ROI
- Campaign Performance
- Channel Attribution
- CAC Payback Period
```
### Product KPIs
```yaml
Usage:
- Daily/Monthly Active Users (DAU/MAU)
- Session Duration
- Feature Adoption Rate
- Stickiness (DAU/MAU)
Quality:
- Net Promoter Score (NPS)
- Customer Satisfaction (CSAT)
- Bug/Issue Count
- Time to Resolution
Growth:
- User Growth Rate
- Activation Rate
- Retention Rate
- Churn Rate
```
### Finance KPIs
```yaml
Profitability:
- Gross Margin
- Net Profit Margin
- EBITDA
- Operating Margin
Liquidity:
- Current Ratio
- Quick Ratio
- Cash Flow
- Working Capital
Efficiency:
- Revenue per Employee
- Operating Expense Ratio
- Days Sales Outstanding
- Inventory Turnover
```
## Dashboard Layout Patterns
### Pattern 1: Executive Summary
```
┌─────────────────────────────────────────────────────────────┐
│ EXECUTIVE DASHBOARD [Date Range ▼] │
├─────────────┬─────────────┬─────────────┬─────────────────┤
│ REVENUE │ PROFIT │ CUSTOMERS │ NPS SCORE │
│ $2.4M │ $450K │ 12,450 │ 72 │
│ ▲ 12% │ ▲ 8% │ ▲ 15% │ ▲ 5pts │
├─────────────┴─────────────┴─────────────┴─────────────────┤
│ │
│ Revenue Trend │ Revenue by Product │
│ ┌───────────────────────┐ │ ┌──────────────────┐ │
│ │ /\ /\ │ │ │ ████████ 45% │ │
│ │ / \ / \ /\ │ │ │ ██████ 32% │ │
│ │ / \/ \ / \ │ │ │ ████ 18% │ │
│ │ / \/ \ │ │ │ ██ 5% │ │
│ └───────────────────────┘ │ └──────────────────┘ │
│ │
├─────────────────────────────────────────────────────────────┤
│ 🔴 Alert: Churn rate exceeded threshold (>5%) │
│ 🟡 Warning: Support ticket volume 20% above average │
└─────────────────────────────────────────────────────────────┘
```
### Pattern 2: SaaS Metrics Dashboard
```
┌─────────────────────────────────────────────────────────────┐
│ SAAS METRICS Jan 2024 [Monthly ▼] │
├──────────────────────┬──────────────────────────────────────┤
│ ┌────────────────┐ │ MRR GROWTH │
│ │ MRR │ │ ┌────────────────────────────────┐ │
│ │ $125,000 │ │ │ /── │ │
│ │ ▲ 8% │ │ │ /────/ │ │
│ └────────────────┘ │ │ /────/ │ │
│ ┌────────────────┐ │ │ /────/ │ │
│ │ ARR │ │ │ /────/ │ │
│ │ $1,500,000 │ │ └────────────────────────────────┘ │
│ │ ▲ 15% │ │ J F M A M J J A S O N D │
│ └────────────────┘ │ │
├──────────────────────┼──────────────────────────────────────┤
│ UNIT ECONOMICS │ COHORT RETENTION │
│ │ │
│ CAC: $450 │ Month 1: ████████████████████ 100% │
│ LTV: $2,700 │ Month 3: █████████████████ 85% │
│ LTV/CAC: 6.0x │ Month 6: ████████████████ 80% │
│ │ Month 12: ██████████████ 72% │
│ Payback: 4 months │ │
├──────────────────────┴──────────────────────────────────────┤
│ CHURN ANALYSIS │
│ ┌──────────┬──────────┬──────────┬──────────────────────┐ │
│ │ Gross │ Net │ Logo │ Expansion │ │
│ │ 4.2% │ 1.8% │ 3.1% │ 2.4% │ │
│ └──────────┴──────────┴──────────┴──────────────────────┘ │
└─────────────────────────────────────────────────────────────┘
```
### Pattern 3: Real-time Operations
```
┌─────────────────────────────────────────────────────────────┐
│ OPERATIONS CENTER Live ● Last: 10:42:15 │
├────────────────────────────┬────────────────────────────────┤
│ SYSTEM HEALTH │ SERVICE STATUS │
│ ┌──────────────────────┐ │ │
│ │ CPU MEM DISK │ │ ● API Gateway Healthy │
│ │ 45% 72% 58% │ │ ● User Service Healthy │
│ │ ███ ████ ███ │ │ ● Payment Service Degraded │
│ │ ███ ████ ███ │ │ ● Database Healthy │
│ │ ███ ████ ███ │ │ ● Cache Healthy │
│ └──────────────────────┘ │ │
├────────────────────────────┼────────────────────────────────┤
│ REQUEST THROUGHPUT │ ERROR RATE │
│ ┌──────────────────────┐ │ ┌──────────────────────────┐ │
│ │ ▁▂▃▄▅▆▇█▇▆▅▄▃▂▁▂▃▄▅ │ │ │ ▁▁▁▁▁▂▁▁▁▁▁▁▁▁▁▁▁▁▁▁ │ │
│ └──────────────────────┘ │ └──────────────────────────┘ │
│ Current: 12,450 req/s │ Current: 0.02% │
│ Peak: 18,200 req/s │ Threshold: 1.0% │
├────────────────────────────┴────────────────────────────────┤
│ RECENT ALERTS │
│ 10:40 🟡 High latency on payment-service (p99 > 500ms) │
│ 10:35 🟢 Resolved: Database connection pool recovered │
│ 10:22 🔴 Payment service circuit breaker tripped │
└─────────────────────────────────────────────────────────────┘
```
## Implementation Patterns
### SQL for KPI Calculations
```sql
-- Monthly Recurring Revenue (MRR)
WITH mrr_calculation AS (
SELECT
DATE_TRUNC('month', billing_date) AS month,
SUM(
CASE subscription_interval
WHEN 'monthly' THEN amount
WHEN 'yearly' THEN amount / 12
WHEN 'quarterly' THEN amount / 3
END
) AS mrr
FROM subscriptions
WHERE status = 'active'
GROUP BY DATE_TRUNC('month', billing_date)
)
SELECT
month,
mrr,
LAG(mrr) OVER (ORDER BY month) AS prev_mrr,
(mrr - LAG(mrr) OVER (ORDER BY month)) / LAG(mrr) OVER (ORDER BY month) * 100 AS growth_pct
FROM mrr_calculation;
-- Cohort Retention
WITH cohorts AS (
SELECT
user_id,
DATE_TRUNC('month', created_at) AS cohort_month
FROM users
),
activity AS (
SELECT
user_id,
DATE_TRUNC('month', event_date) AS activity_month
FROM user_events
WHERE event_type = 'active_session'
)
SELECT
c.cohort_month,
EXTRACT(MONTH FROM age(a.activity_month, c.cohort_month)) AS months_since_signup,
COUNT(DISTINCT a.user_id) AS active_users,
COUNT(DISTINCT a.user_id)::FLOAT / COUNT(DISTINCT c.user_id) * 100 AS retention_rate
FROM cohorts c
LEFT JOIN activity a ON c.user_id = a.user_id
AND a.activity_month >= c.cohort_month
GROUP BY c.cohort_month, EXTRACT(MONTH FROM age(a.activity_month, c.cohort_month))
ORDER BY c.cohort_month, months_since_signup;
-- Customer Acquisition Cost (CAC)
SELECT
DATE_TRUNC('month', acquired_date) AS month,
SUM(marketing_spend) / NULLIF(COUNT(new_customers), 0) AS cac,
SUM(marketing_spend) AS total_spend,
COUNT(new_customers) AS customers_acquired
FROM (
SELECT
DATE_TRUNC('month', u.created_at) AS acquired_date,
u.id AS new_customers,
m.spend AS marketing_spend
FROM users u
JOIN marketing_spend m ON DATE_TRUNC('month', u.created_at) = m.month
WHERE u.source = 'marketing'
) acquisition
GROUP BY DATE_TRUNC('month', acquired_date);
```
### Python Dashboard Code (Streamlit)
```python
import streamlit as st
import pandas as pd
import plotly.express as px
import plotly.graph_objects as go
st.set_page_config(page_title="KPI Dashboard", layout="wide")
# Header with date filter
col1, col2 = st.columns([3, 1])
with col1:
st.title("Executive Dashboard")
with col2:
date_range = st.selectbox(
"Period",
["Last 7 Days", "Last 30 Days", "Last Quarter", "YTD"]
)
# KPI Cards
def metric_card(label, value, delta, prefix="", suffix=""):
delta_color = "green" if delta >= 0 else "red"
    delta_arrow = "▲" if delta >= 0 else "▼"
st.metric(
label=label,
value=f"{prefix}{value:,.0f}{suffix}",
delta=f"{delta_arrow} {abs(delta):.1f}%"
)
col1, col2, col3, col4 = st.columns(4)
with col1:
metric_card("Revenue", 2400000, 12.5, prefix="$")
with col2:
metric_card("Customers", 12450, 15.2)
with col3:
metric_card("NPS Score", 72, 5.0)
with col4:
metric_card("Churn Rate", 4.2, -0.8, suffix="%")
# Charts
col1, col2 = st.columns(2)
with col1:
st.subheader("Revenue Trend")
revenue_data = pd.DataFrame({
'Month': pd.date_range('2024-01-01', periods=12, freq='M'),
'Revenue': [180000, 195000, 210000, 225000, 240000, 255000,
270000, 285000, 300000, 315000, 330000, 345000]
})
fig = px.line(revenue_data, x='Month', y='Revenue',
line_shape='spline', markers=True)
fig.update_layout(height=300)
st.plotly_chart(fig, use_container_width=True)
with col2:
st.subheader("Revenue by Product")
product_data = pd.DataFrame({
'Product': ['Enterprise', 'Professional', 'Starter', 'Other'],
'Revenue': [45, 32, 18, 5]
})
fig = px.pie(product_data, values='Revenue', names='Product',
hole=0.4)
fig.update_layout(height=300)
st.plotly_chart(fig, use_container_width=True)
# Cohort Heatmap
st.subheader("Cohort Retention")
cohort_data = pd.DataFrame({
'Cohort': ['Jan', 'Feb', 'Mar', 'Apr', 'May'],
'M0': [100, 100, 100, 100, 100],
'M1': [85, 87, 84, 86, 88],
'M2': [78, 80, 76, 79, None],
'M3': [72, 74, 70, None, None],
'M4': [68, 70, None, None, None],
})
fig = go.Figure(data=go.Heatmap(
z=cohort_data.iloc[:, 1:].values,
x=['M0', 'M1', 'M2', 'M3', 'M4'],
y=cohort_data['Cohort'],
colorscale='Blues',
text=cohort_data.iloc[:, 1:].values,
texttemplate='%{text}%',
textfont={"size": 12},
))
fig.update_layout(height=250)
st.plotly_chart(fig, use_container_width=True)
# Alerts Section
st.subheader("Alerts")
alerts = [
{"level": "error", "message": "Churn rate exceeded threshold (>5%)"},
{"level": "warning", "message": "Support ticket volume 20% above average"},
]
for alert in alerts:
if alert["level"] == "error":
st.error(f"🔴 {alert['message']}")
elif alert["level"] == "warning":
st.warning(f"🟡 {alert['message']}")
```
## Best Practices
### Do's
- **Limit to 5-7 KPIs** - Focus on what matters
- **Show context** - Comparisons, trends, targets
- **Use consistent colors** - Red=bad, green=good
- **Enable drilldown** - From summary to detail
- **Update appropriately** - Match metric frequency
### Don'ts
- **Don't show vanity metrics** - Focus on actionable data
- **Don't overcrowd** - White space aids comprehension
- **Don't use 3D charts** - They distort perception
- **Don't hide methodology** - Document calculations
- **Don't ignore mobile** - Ensure responsive design
## Resources
- [Stephen Few's Dashboard Design](https://www.perceptualedge.com/articles/visual_business_intelligence/rules_for_using_color.pdf)
- [Edward Tufte's Principles](https://www.edwardtufte.com/tufte/)
- [Google Data Studio Gallery](https://datastudio.google.com/gallery)

View File

@@ -0,0 +1,41 @@
# Service Mesh Expert
Expert service mesh architect specializing in Istio, Linkerd, and cloud-native networking patterns. Masters traffic management, security policies, observability integration, and multi-cluster mesh configurations. Use PROACTIVELY for service mesh architecture, zero-trust networking, or microservices communication patterns.
## Capabilities
- Istio and Linkerd installation, configuration, and optimization
- Traffic management: routing, load balancing, circuit breaking, retries
- mTLS configuration and certificate management
- Service mesh observability with distributed tracing
- Multi-cluster and multi-cloud mesh federation
- Progressive delivery with canary and blue-green deployments
- Security policies and authorization rules
## When to Use
- Implementing service-to-service communication in Kubernetes
- Setting up zero-trust networking with mTLS
- Configuring traffic splitting for canary deployments
- Debugging service mesh connectivity issues
- Implementing rate limiting and circuit breakers
- Setting up cross-cluster service discovery
## Workflow
1. Assess current infrastructure and requirements
2. Design mesh topology and traffic policies
3. Implement security policies (mTLS, AuthorizationPolicy)
4. Configure observability (metrics, traces, logs)
5. Set up traffic management rules
6. Test failover and resilience patterns
7. Document operational runbooks
## Best Practices
- Start with permissive mode, gradually enforce strict mTLS
- Use namespaces for policy isolation
- Implement circuit breakers before they're needed
- Monitor mesh overhead (latency, resource usage)
- Keep sidecar resources appropriately sized
- Use destination rules for consistent load balancing

View File

@@ -0,0 +1,325 @@
---
name: istio-traffic-management
description: Configure Istio traffic management including routing, load balancing, circuit breakers, and canary deployments. Use when implementing service mesh traffic policies, progressive delivery, or resilience patterns.
---
# Istio Traffic Management
Comprehensive guide to Istio traffic management for production service mesh deployments.
## When to Use This Skill
- Configuring service-to-service routing
- Implementing canary or blue-green deployments
- Setting up circuit breakers and retries
- Load balancing configuration
- Traffic mirroring for testing
- Fault injection for chaos engineering
## Core Concepts
### 1. Traffic Management Resources
| Resource | Purpose | Scope |
|----------|---------|-------|
| **VirtualService** | Route traffic to destinations | Host-based |
| **DestinationRule** | Define policies after routing | Service-based |
| **Gateway** | Configure ingress/egress | Cluster edge |
| **ServiceEntry** | Add external services | Mesh-wide |
### 2. Traffic Flow
```
Client → Gateway → VirtualService → DestinationRule → Service
(routing) (policies) (pods)
```
## Templates
### Template 1: Basic Routing
```yaml
apiVersion: networking.istio.io/v1beta1
kind: VirtualService
metadata:
name: reviews-route
namespace: bookinfo
spec:
hosts:
- reviews
http:
- match:
- headers:
end-user:
exact: jason
route:
- destination:
host: reviews
subset: v2
- route:
- destination:
host: reviews
subset: v1
---
apiVersion: networking.istio.io/v1beta1
kind: DestinationRule
metadata:
name: reviews-destination
namespace: bookinfo
spec:
host: reviews
subsets:
- name: v1
labels:
version: v1
- name: v2
labels:
version: v2
- name: v3
labels:
version: v3
```
### Template 2: Canary Deployment
```yaml
apiVersion: networking.istio.io/v1beta1
kind: VirtualService
metadata:
name: my-service-canary
spec:
hosts:
- my-service
http:
- route:
- destination:
host: my-service
subset: stable
weight: 90
- destination:
host: my-service
subset: canary
weight: 10
---
apiVersion: networking.istio.io/v1beta1
kind: DestinationRule
metadata:
name: my-service-dr
spec:
host: my-service
trafficPolicy:
connectionPool:
tcp:
maxConnections: 100
http:
h2UpgradePolicy: UPGRADE
http1MaxPendingRequests: 100
http2MaxRequests: 1000
subsets:
- name: stable
labels:
version: stable
- name: canary
labels:
version: canary
```
### Template 3: Circuit Breaker
```yaml
apiVersion: networking.istio.io/v1beta1
kind: DestinationRule
metadata:
name: circuit-breaker
spec:
host: my-service
trafficPolicy:
connectionPool:
tcp:
maxConnections: 100
http:
http1MaxPendingRequests: 100
http2MaxRequests: 1000
maxRequestsPerConnection: 10
maxRetries: 3
outlierDetection:
consecutive5xxErrors: 5
interval: 30s
baseEjectionTime: 30s
maxEjectionPercent: 50
minHealthPercent: 30
```
### Template 4: Retry and Timeout
```yaml
apiVersion: networking.istio.io/v1beta1
kind: VirtualService
metadata:
name: ratings-retry
spec:
hosts:
- ratings
http:
- route:
- destination:
host: ratings
timeout: 10s
retries:
attempts: 3
perTryTimeout: 3s
retryOn: connect-failure,refused-stream,unavailable,cancelled,retriable-4xx,503
retryRemoteLocalities: true
```
### Template 5: Traffic Mirroring
```yaml
apiVersion: networking.istio.io/v1beta1
kind: VirtualService
metadata:
name: mirror-traffic
spec:
hosts:
- my-service
http:
- route:
- destination:
host: my-service
subset: v1
mirror:
host: my-service
subset: v2
mirrorPercentage:
value: 100.0
```
### Template 6: Fault Injection
```yaml
apiVersion: networking.istio.io/v1beta1
kind: VirtualService
metadata:
name: fault-injection
spec:
hosts:
- ratings
http:
- fault:
delay:
percentage:
value: 10
fixedDelay: 5s
abort:
percentage:
value: 5
httpStatus: 503
route:
- destination:
host: ratings
```
### Template 7: Ingress Gateway
```yaml
apiVersion: networking.istio.io/v1beta1
kind: Gateway
metadata:
name: my-gateway
spec:
selector:
istio: ingressgateway
servers:
- port:
number: 443
name: https
protocol: HTTPS
tls:
mode: SIMPLE
credentialName: my-tls-secret
hosts:
- "*.example.com"
---
apiVersion: networking.istio.io/v1beta1
kind: VirtualService
metadata:
name: my-vs
spec:
hosts:
- "api.example.com"
gateways:
- my-gateway
http:
- match:
- uri:
prefix: /api/v1
route:
- destination:
host: api-service
port:
number: 8080
```
## Load Balancing Strategies
```yaml
apiVersion: networking.istio.io/v1beta1
kind: DestinationRule
metadata:
name: load-balancing
spec:
host: my-service
trafficPolicy:
loadBalancer:
simple: ROUND_ROBIN # or LEAST_CONN, RANDOM, PASSTHROUGH
---
# Consistent hashing for sticky sessions
apiVersion: networking.istio.io/v1beta1
kind: DestinationRule
metadata:
name: sticky-sessions
spec:
host: my-service
trafficPolicy:
loadBalancer:
consistentHash:
httpHeaderName: x-user-id
# or: httpCookie, useSourceIp, httpQueryParameterName
```
## Best Practices
### Do's
- **Start simple** - Add complexity incrementally
- **Use subsets** - Version your services clearly
- **Set timeouts** - Always configure reasonable timeouts
- **Enable retries** - But with backoff and limits
- **Monitor** - Use Kiali and Jaeger for visibility
### Don'ts
- **Don't over-retry** - Can cause cascading failures
- **Don't ignore outlier detection** - Enable circuit breakers
- **Don't mirror to production** - Mirror to test environments
- **Don't skip canary** - Test with small traffic percentage first
## Debugging Commands
```bash
# Check VirtualService configuration
istioctl analyze
# View effective routes
istioctl proxy-config routes deploy/my-app -o json
# Check endpoint discovery
istioctl proxy-config endpoints deploy/my-app
# Debug traffic
istioctl proxy-config log deploy/my-app --level debug
```
## Resources
- [Istio Traffic Management](https://istio.io/latest/docs/concepts/traffic-management/)
- [Virtual Service Reference](https://istio.io/latest/docs/reference/config/networking/virtual-service/)
- [Destination Rule Reference](https://istio.io/latest/docs/reference/config/networking/destination-rule/)

View File

@@ -0,0 +1,309 @@
---
name: linkerd-patterns
description: Implement Linkerd service mesh patterns for lightweight, security-focused service mesh deployments. Use when setting up Linkerd, configuring traffic policies, or implementing zero-trust networking with minimal overhead.
---
# Linkerd Patterns
Production patterns for Linkerd service mesh - the lightweight, security-first service mesh for Kubernetes.
## When to Use This Skill
- Setting up a lightweight service mesh
- Implementing automatic mTLS
- Configuring traffic splits for canary deployments
- Setting up service profiles for per-route metrics
- Implementing retries and timeouts
- Multi-cluster service mesh
## Core Concepts
### 1. Linkerd Architecture
```
┌─────────────────────────────────────────────┐
│ Control Plane │
│ ┌─────────────┐ ┌────────┐ ┌────────────┐ │
│ │ destination │ │identity│ │proxy-inject│ │
│ └─────────────┘ └────────┘ └────────────┘ │
└─────────────────────────────────────────────┘
┌─────────────────────────────────────────────┐
│ Data Plane │
│ ┌─────┐ ┌─────┐ ┌─────┐ │
│ │proxy│────│proxy│────│proxy│ │
│ └─────┘ └─────┘ └─────┘ │
│ │ │ │ │
│ ┌──┴──┐ ┌──┴──┐ ┌──┴──┐ │
│ │ app │ │ app │ │ app │ │
│ └─────┘ └─────┘ └─────┘ │
└─────────────────────────────────────────────┘
```
### 2. Key Resources
| Resource | Purpose |
|----------|---------|
| **ServiceProfile** | Per-route metrics, retries, timeouts |
| **TrafficSplit** | Canary deployments, A/B testing |
| **Server** | Define server-side policies |
| **ServerAuthorization** | Access control policies |
## Templates
### Template 1: Mesh Installation
```bash
# Install CLI
curl --proto '=https' --tlsv1.2 -sSfL https://run.linkerd.io/install | sh
# Validate cluster
linkerd check --pre
# Install CRDs
linkerd install --crds | kubectl apply -f -
# Install control plane
linkerd install | kubectl apply -f -
# Verify installation
linkerd check
# Install viz extension (optional)
linkerd viz install | kubectl apply -f -
```
### Template 2: Inject Namespace
```yaml
# Automatic injection for namespace
apiVersion: v1
kind: Namespace
metadata:
name: my-app
annotations:
linkerd.io/inject: enabled
---
# Or inject specific deployment
apiVersion: apps/v1
kind: Deployment
metadata:
name: my-app
annotations:
linkerd.io/inject: enabled
spec:
template:
metadata:
annotations:
linkerd.io/inject: enabled
```
### Template 3: Service Profile with Retries
```yaml
apiVersion: linkerd.io/v1alpha2
kind: ServiceProfile
metadata:
name: my-service.my-namespace.svc.cluster.local
namespace: my-namespace
spec:
routes:
- name: GET /api/users
condition:
method: GET
pathRegex: /api/users
responseClasses:
- condition:
status:
min: 500
max: 599
isFailure: true
isRetryable: true
- name: POST /api/users
condition:
method: POST
pathRegex: /api/users
# POST not retryable by default
isRetryable: false
- name: GET /api/users/{id}
condition:
method: GET
pathRegex: /api/users/[^/]+
timeout: 5s
isRetryable: true
retryBudget:
retryRatio: 0.2
minRetriesPerSecond: 10
ttl: 10s
```
### Template 4: Traffic Split (Canary)
```yaml
apiVersion: split.smi-spec.io/v1alpha1
kind: TrafficSplit
metadata:
name: my-service-canary
namespace: my-namespace
spec:
service: my-service
backends:
- service: my-service-stable
weight: 900m # 90%
- service: my-service-canary
weight: 100m # 10%
```
### Template 5: Server Authorization Policy
```yaml
# Define the server
apiVersion: policy.linkerd.io/v1beta1
kind: Server
metadata:
name: my-service-http
namespace: my-namespace
spec:
podSelector:
matchLabels:
app: my-service
port: http
proxyProtocol: HTTP/1
---
# Allow traffic from specific clients
apiVersion: policy.linkerd.io/v1beta1
kind: ServerAuthorization
metadata:
name: allow-frontend
namespace: my-namespace
spec:
server:
name: my-service-http
client:
meshTLS:
serviceAccounts:
- name: frontend
namespace: my-namespace
---
# Allow unauthenticated traffic (e.g., from ingress)
apiVersion: policy.linkerd.io/v1beta1
kind: ServerAuthorization
metadata:
name: allow-ingress
namespace: my-namespace
spec:
server:
name: my-service-http
client:
unauthenticated: true
networks:
- cidr: 10.0.0.0/8
```
### Template 6: HTTPRoute for Advanced Routing
```yaml
apiVersion: policy.linkerd.io/v1beta2
kind: HTTPRoute
metadata:
name: my-route
namespace: my-namespace
spec:
parentRefs:
- name: my-service
kind: Service
group: core
port: 8080
rules:
- matches:
- path:
type: PathPrefix
value: /api/v2
- headers:
- name: x-api-version
value: v2
backendRefs:
- name: my-service-v2
port: 8080
- matches:
- path:
type: PathPrefix
value: /api
backendRefs:
- name: my-service-v1
port: 8080
```
### Template 7: Multi-cluster Setup
```bash
# On each cluster, install with cluster credentials
linkerd multicluster install | kubectl apply -f -
# Link clusters
linkerd multicluster link --cluster-name west \
--api-server-address https://west.example.com:6443 \
| kubectl apply -f -
# Export a service to other clusters
kubectl label svc/my-service mirror.linkerd.io/exported=true
# Verify cross-cluster connectivity
linkerd multicluster check
linkerd multicluster gateways
```
## Monitoring Commands
```bash
# Live traffic view
linkerd viz top deploy/my-app
# Per-route metrics
linkerd viz routes deploy/my-app
# Check proxy status
linkerd viz stat deploy -n my-namespace
# View service dependencies
linkerd viz edges deploy -n my-namespace
# Dashboard
linkerd viz dashboard
```
## Debugging
```bash
# Check injection status
linkerd check --proxy -n my-namespace
# View proxy logs
kubectl logs deploy/my-app -c linkerd-proxy
# Debug identity/TLS
linkerd identity -n my-namespace
# Tap traffic (live)
linkerd viz tap deploy/my-app --to deploy/my-backend
```
## Best Practices
### Do's
- **Enable mTLS everywhere** - It's automatic with Linkerd
- **Use ServiceProfiles** - Get per-route metrics and retries
- **Set retry budgets** - Prevent retry storms
- **Monitor golden metrics** - Success rate, latency, throughput
### Don'ts
- **Don't skip check** - Always run `linkerd check` after changes
- **Don't over-configure** - Linkerd defaults are sensible
- **Don't ignore ServiceProfiles** - They unlock advanced features
- **Don't forget timeouts** - Set appropriate values per route
## Resources
- [Linkerd Documentation](https://linkerd.io/2.14/overview/)
- [Service Profiles](https://linkerd.io/2.14/features/service-profiles/)
- [Authorization Policy](https://linkerd.io/2.14/features/server-policy/)

View File

@@ -0,0 +1,347 @@
---
name: mtls-configuration
description: Configure mutual TLS (mTLS) for zero-trust service-to-service communication. Use when implementing zero-trust networking, certificate management, or securing internal service communication.
---
# mTLS Configuration
Comprehensive guide to implementing mutual TLS for zero-trust service mesh communication.
## When to Use This Skill
- Implementing zero-trust networking
- Securing service-to-service communication
- Certificate rotation and management
- Debugging TLS handshake issues
- Compliance requirements (PCI-DSS, HIPAA)
- Multi-cluster secure communication
## Core Concepts
### 1. mTLS Flow
```
┌─────────┐ ┌─────────┐
│ Service │ │ Service │
│ A │ │ B │
└────┬────┘ └────┬────┘
│ │
┌────┴────┐ TLS Handshake ┌────┴────┐
│ Proxy │◄───────────────────────────►│ Proxy │
│(Sidecar)│ 1. ClientHello │(Sidecar)│
│ │ 2. ServerHello + Cert │ │
│ │ 3. Client Cert │ │
│ │ 4. Verify Both Certs │ │
│ │ 5. Encrypted Channel │ │
└─────────┘ └─────────┘
```
### 2. Certificate Hierarchy
```
Root CA (Self-signed, long-lived)
├── Intermediate CA (Cluster-level)
│ │
│ ├── Workload Cert (Service A)
│ └── Workload Cert (Service B)
└── Intermediate CA (Multi-cluster)
└── Cross-cluster certs
```
## Templates
### Template 1: Istio mTLS (Strict Mode)
```yaml
# Enable strict mTLS mesh-wide
apiVersion: security.istio.io/v1beta1
kind: PeerAuthentication
metadata:
name: default
namespace: istio-system
spec:
mtls:
mode: STRICT
---
# Namespace-level override (permissive for migration)
apiVersion: security.istio.io/v1beta1
kind: PeerAuthentication
metadata:
name: default
namespace: legacy-namespace
spec:
mtls:
mode: PERMISSIVE
---
# Workload-specific policy
apiVersion: security.istio.io/v1beta1
kind: PeerAuthentication
metadata:
name: payment-service
namespace: production
spec:
selector:
matchLabels:
app: payment-service
mtls:
mode: STRICT
portLevelMtls:
8080:
mode: STRICT
9090:
mode: DISABLE # Metrics port, no mTLS
```
### Template 2: Istio Destination Rule for mTLS
```yaml
apiVersion: networking.istio.io/v1beta1
kind: DestinationRule
metadata:
name: default
namespace: istio-system
spec:
host: "*.local"
trafficPolicy:
tls:
mode: ISTIO_MUTUAL
---
# TLS to external service
apiVersion: networking.istio.io/v1beta1
kind: DestinationRule
metadata:
name: external-api
spec:
host: api.external.com
trafficPolicy:
tls:
mode: SIMPLE
caCertificates: /etc/certs/external-ca.pem
---
# Mutual TLS to external service
apiVersion: networking.istio.io/v1beta1
kind: DestinationRule
metadata:
name: partner-api
spec:
host: api.partner.com
trafficPolicy:
tls:
mode: MUTUAL
clientCertificate: /etc/certs/client.pem
privateKey: /etc/certs/client-key.pem
caCertificates: /etc/certs/partner-ca.pem
```
### Template 3: Cert-Manager with Istio
```yaml
# Install cert-manager issuer for Istio
apiVersion: cert-manager.io/v1
kind: ClusterIssuer
metadata:
name: istio-ca
spec:
ca:
secretName: istio-ca-secret
---
# Create Istio CA secret
apiVersion: v1
kind: Secret
metadata:
name: istio-ca-secret
namespace: cert-manager
type: kubernetes.io/tls
data:
tls.crt: <base64-encoded-ca-cert>
tls.key: <base64-encoded-ca-key>
---
# Certificate for workload
apiVersion: cert-manager.io/v1
kind: Certificate
metadata:
name: my-service-cert
namespace: my-namespace
spec:
secretName: my-service-tls
duration: 24h
renewBefore: 8h
issuerRef:
name: istio-ca
kind: ClusterIssuer
commonName: my-service.my-namespace.svc.cluster.local
dnsNames:
- my-service
- my-service.my-namespace
- my-service.my-namespace.svc
- my-service.my-namespace.svc.cluster.local
usages:
- server auth
- client auth
```
### Template 4: SPIFFE/SPIRE Integration
```yaml
# SPIRE Server configuration
apiVersion: v1
kind: ConfigMap
metadata:
name: spire-server
namespace: spire
data:
server.conf: |
server {
bind_address = "0.0.0.0"
bind_port = "8081"
trust_domain = "example.org"
data_dir = "/run/spire/data"
log_level = "INFO"
ca_ttl = "168h"
default_x509_svid_ttl = "1h"
}
plugins {
DataStore "sql" {
plugin_data {
database_type = "sqlite3"
connection_string = "/run/spire/data/datastore.sqlite3"
}
}
NodeAttestor "k8s_psat" {
plugin_data {
clusters = {
"demo-cluster" = {
service_account_allow_list = ["spire:spire-agent"]
}
}
}
}
KeyManager "memory" {
plugin_data {}
}
UpstreamAuthority "disk" {
plugin_data {
key_file_path = "/run/spire/secrets/bootstrap.key"
cert_file_path = "/run/spire/secrets/bootstrap.crt"
}
}
}
---
# SPIRE Agent DaemonSet (abbreviated)
apiVersion: apps/v1
kind: DaemonSet
metadata:
name: spire-agent
namespace: spire
spec:
selector:
matchLabels:
app: spire-agent
template:
spec:
containers:
- name: spire-agent
image: ghcr.io/spiffe/spire-agent:1.8.0
volumeMounts:
- name: spire-agent-socket
mountPath: /run/spire/sockets
volumes:
- name: spire-agent-socket
hostPath:
path: /run/spire/sockets
type: DirectoryOrCreate
```
### Template 5: Linkerd mTLS (Automatic)
```yaml
# Linkerd enables mTLS automatically
# Verify with:
# linkerd viz edges deployment -n my-namespace
# For external services without mTLS
apiVersion: policy.linkerd.io/v1beta1
kind: Server
metadata:
name: external-api
namespace: my-namespace
spec:
podSelector:
matchLabels:
app: my-app
port: external-api
proxyProtocol: HTTP/1 # or TLS for passthrough
---
# Skip TLS for specific port
apiVersion: v1
kind: Service
metadata:
name: my-service
annotations:
config.linkerd.io/skip-outbound-ports: "3306" # MySQL
```
## Certificate Rotation
```bash
# Istio - Check certificate expiry
istioctl proxy-config secret deploy/my-app -o json | \
jq '.dynamicActiveSecrets[0].secret.tlsCertificate.certificateChain.inlineBytes' | \
tr -d '"' | base64 -d | openssl x509 -text -noout
# Force certificate rotation
kubectl rollout restart deployment/my-app
# Check Linkerd identity
linkerd identity -n my-namespace
```
## Debugging mTLS Issues
```bash
# Istio - Check mTLS status for a workload
# NOTE: 'istioctl authn tls-check' was removed in Istio 1.5; use the
# describe subcommand, which reports mTLS mode per destination.
istioctl experimental describe service my-service -n my-namespace
# Verify peer authentication
kubectl get peerauthentication --all-namespaces
# Check destination rules
kubectl get destinationrule --all-namespaces
# Debug TLS handshake
istioctl proxy-config log deploy/my-app --level debug
kubectl logs deploy/my-app -c istio-proxy | grep -i tls
# Linkerd - Check mTLS status
linkerd viz edges deployment -n my-namespace
linkerd viz tap deploy/my-app --to deploy/my-backend
```
## Best Practices
### Do's
- **Start with PERMISSIVE** - Migrate gradually to STRICT
- **Monitor certificate expiry** - Set up alerts
- **Use short-lived certs** - 24h or less for workloads
- **Rotate CA periodically** - Plan for CA rotation
- **Log TLS errors** - For debugging and audit
### Don'ts
- **Don't disable mTLS** - For convenience in production
- **Don't ignore cert expiry** - Automate rotation
- **Don't use ad-hoc self-signed workload certs** - Issue workload certificates from a proper CA hierarchy (the root CA itself is self-signed by design)
- **Don't skip verification** - Verify the full chain
## Resources
- [Istio Security](https://istio.io/latest/docs/concepts/security/)
- [SPIFFE/SPIRE](https://spiffe.io/)
- [cert-manager](https://cert-manager.io/)
- [Zero Trust Architecture (NIST)](https://www.nist.gov/publications/zero-trust-architecture)

View File

@@ -0,0 +1,383 @@
---
name: service-mesh-observability
description: Implement comprehensive observability for service meshes including distributed tracing, metrics, and visualization. Use when setting up mesh monitoring, debugging latency issues, or implementing SLOs for service communication.
---
# Service Mesh Observability
Complete guide to observability patterns for Istio, Linkerd, and service mesh deployments.
## When to Use This Skill
- Setting up distributed tracing across services
- Implementing service mesh metrics and dashboards
- Debugging latency and error issues
- Defining SLOs for service communication
- Visualizing service dependencies
- Troubleshooting mesh connectivity
## Core Concepts
### 1. Three Pillars of Observability
```
┌─────────────────────────────────────────────────────┐
│ Observability │
├─────────────────┬─────────────────┬─────────────────┤
│ Metrics │ Traces │ Logs │
│ │ │ │
│ • Request rate │ • Span context │ • Access logs │
│ • Error rate │ • Latency │ • Error details │
│ • Latency P50 │ • Dependencies │ • Debug info │
│ • Saturation │ • Bottlenecks │ • Audit trail │
└─────────────────┴─────────────────┴─────────────────┘
```
### 2. Golden Signals for Mesh
| Signal | Description | Alert Threshold |
|--------|-------------|-----------------|
| **Latency** | Request duration P50, P99 | P99 > 500ms |
| **Traffic** | Requests per second | Anomaly detection |
| **Errors** | 5xx error rate | > 1% |
| **Saturation** | Resource utilization | > 80% |
## Templates
### Template 1: Istio with Prometheus & Grafana
```yaml
# Install Prometheus
apiVersion: v1
kind: ConfigMap
metadata:
name: prometheus
namespace: istio-system
data:
prometheus.yml: |
global:
scrape_interval: 15s
scrape_configs:
- job_name: 'istio-mesh'
kubernetes_sd_configs:
- role: endpoints
namespaces:
names:
- istio-system
relabel_configs:
- source_labels: [__meta_kubernetes_service_name]
action: keep
regex: istio-telemetry
---
# ServiceMonitor for Prometheus Operator
apiVersion: monitoring.coreos.com/v1
kind: ServiceMonitor
metadata:
name: istio-mesh
namespace: istio-system
spec:
selector:
matchLabels:
app: istiod
endpoints:
- port: http-monitoring
interval: 15s
```
### Template 2: Key Istio Metrics Queries
```promql
# Request rate by service
sum(rate(istio_requests_total{reporter="destination"}[5m])) by (destination_service_name)
# Error rate (5xx)
sum(rate(istio_requests_total{reporter="destination", response_code=~"5.."}[5m]))
/ sum(rate(istio_requests_total{reporter="destination"}[5m])) * 100
# P99 latency
histogram_quantile(0.99,
sum(rate(istio_request_duration_milliseconds_bucket{reporter="destination"}[5m]))
by (le, destination_service_name))
# TCP connections
sum(istio_tcp_connections_opened_total{reporter="destination"}) by (destination_service_name)
# Request size
histogram_quantile(0.99,
sum(rate(istio_request_bytes_bucket{reporter="destination"}[5m]))
by (le, destination_service_name))
```
### Template 3: Jaeger Distributed Tracing
```yaml
# Jaeger installation for Istio
apiVersion: install.istio.io/v1alpha1
kind: IstioOperator
spec:
meshConfig:
enableTracing: true
defaultConfig:
tracing:
sampling: 100.0 # 100% in dev, lower in prod
zipkin:
address: jaeger-collector.istio-system:9411
---
# Jaeger deployment
apiVersion: apps/v1
kind: Deployment
metadata:
name: jaeger
namespace: istio-system
spec:
selector:
matchLabels:
app: jaeger
template:
metadata:
labels:
app: jaeger
spec:
containers:
- name: jaeger
image: jaegertracing/all-in-one:1.50
ports:
- containerPort: 5775 # UDP
- containerPort: 6831 # Thrift
- containerPort: 6832 # Thrift
- containerPort: 5778 # Config
- containerPort: 16686 # UI
- containerPort: 14268 # HTTP
- containerPort: 14250 # gRPC
- containerPort: 9411 # Zipkin
env:
- name: COLLECTOR_ZIPKIN_HOST_PORT
value: ":9411"
```
### Template 4: Linkerd Viz Dashboard
```bash
# Install Linkerd viz extension
linkerd viz install | kubectl apply -f -
# Access dashboard
linkerd viz dashboard
# CLI commands for observability
# Top requests
linkerd viz top deploy/my-app
# Per-route metrics
linkerd viz routes deploy/my-app --to deploy/backend
# Live traffic inspection
linkerd viz tap deploy/my-app --to deploy/backend
# Service edges (dependencies)
linkerd viz edges deployment -n my-namespace
```
### Template 5: Grafana Dashboard JSON
```json
{
"dashboard": {
"title": "Service Mesh Overview",
"panels": [
{
"title": "Request Rate",
"type": "graph",
"targets": [
{
"expr": "sum(rate(istio_requests_total{reporter=\"destination\"}[5m])) by (destination_service_name)",
"legendFormat": "{{destination_service_name}}"
}
]
},
{
"title": "Error Rate",
"type": "gauge",
"targets": [
{
"expr": "sum(rate(istio_requests_total{response_code=~\"5..\"}[5m])) / sum(rate(istio_requests_total[5m])) * 100"
}
],
"fieldConfig": {
"defaults": {
"thresholds": {
"steps": [
{"value": 0, "color": "green"},
{"value": 1, "color": "yellow"},
{"value": 5, "color": "red"}
]
}
}
}
},
{
"title": "P99 Latency",
"type": "graph",
"targets": [
{
"expr": "histogram_quantile(0.99, sum(rate(istio_request_duration_milliseconds_bucket{reporter=\"destination\"}[5m])) by (le, destination_service_name))",
"legendFormat": "{{destination_service_name}}"
}
]
},
{
"title": "Service Topology",
"type": "nodeGraph",
"targets": [
{
"expr": "sum(rate(istio_requests_total{reporter=\"destination\"}[5m])) by (source_workload, destination_service_name)"
}
]
}
]
}
}
```
### Template 6: Kiali Service Mesh Visualization
```yaml
# Kiali installation
apiVersion: kiali.io/v1alpha1
kind: Kiali
metadata:
name: kiali
namespace: istio-system
spec:
auth:
strategy: anonymous # or openid, token
deployment:
accessible_namespaces:
- "**"
external_services:
prometheus:
url: http://prometheus.istio-system:9090
tracing:
url: http://jaeger-query.istio-system:16686
grafana:
url: http://grafana.istio-system:3000
```
### Template 7: OpenTelemetry Integration
```yaml
# OpenTelemetry Collector for mesh
apiVersion: v1
kind: ConfigMap
metadata:
name: otel-collector-config
data:
config.yaml: |
receivers:
otlp:
protocols:
grpc:
endpoint: 0.0.0.0:4317
http:
endpoint: 0.0.0.0:4318
zipkin:
endpoint: 0.0.0.0:9411
processors:
batch:
timeout: 10s
exporters:
jaeger:
endpoint: jaeger-collector:14250
tls:
insecure: true
prometheus:
endpoint: 0.0.0.0:8889
service:
pipelines:
traces:
receivers: [otlp, zipkin]
processors: [batch]
exporters: [jaeger]
metrics:
receivers: [otlp]
processors: [batch]
exporters: [prometheus]
---
# Istio Telemetry v2 with OTel
apiVersion: telemetry.istio.io/v1alpha1
kind: Telemetry
metadata:
name: mesh-default
namespace: istio-system
spec:
tracing:
- providers:
- name: otel
randomSamplingPercentage: 10
```
## Alerting Rules
```yaml
apiVersion: monitoring.coreos.com/v1
kind: PrometheusRule
metadata:
name: mesh-alerts
namespace: istio-system
spec:
groups:
- name: mesh.rules
rules:
- alert: HighErrorRate
expr: |
sum(rate(istio_requests_total{response_code=~"5.."}[5m])) by (destination_service_name)
/ sum(rate(istio_requests_total[5m])) by (destination_service_name) > 0.05
for: 5m
labels:
severity: critical
annotations:
summary: "High error rate for {{ $labels.destination_service_name }}"
- alert: HighLatency
expr: |
histogram_quantile(0.99, sum(rate(istio_request_duration_milliseconds_bucket[5m]))
by (le, destination_service_name)) > 1000
for: 5m
labels:
severity: warning
annotations:
summary: "High P99 latency for {{ $labels.destination_service_name }}"
- alert: MeshCertExpiring
expr: |
(certmanager_certificate_expiration_timestamp_seconds - time()) / 86400 < 7
labels:
severity: warning
annotations:
summary: "Mesh certificate expiring in less than 7 days"
```
## Best Practices
### Do's
- **Sample appropriately** - 100% in dev, 1-10% in prod
- **Use trace context** - Propagate headers consistently
- **Set up alerts** - For golden signals
- **Correlate metrics/traces** - Use exemplars
- **Retain strategically** - Hot/cold storage tiers
### Don'ts
- **Don't over-sample** - Storage costs add up
- **Don't ignore cardinality** - Limit label values
- **Don't skip dashboards** - Visualize dependencies
- **Don't forget costs** - Monitor observability costs
## Resources
- [Istio Observability](https://istio.io/latest/docs/tasks/observability/)
- [Linkerd Observability](https://linkerd.io/2.14/features/dashboard/)
- [OpenTelemetry](https://opentelemetry.io/)
- [Kiali](https://kiali.io/)

View File

@@ -0,0 +1,523 @@
---
name: airflow-dag-patterns
description: Build production Apache Airflow DAGs with best practices for operators, sensors, testing, and deployment. Use when creating data pipelines, orchestrating workflows, or scheduling batch jobs.
---
# Apache Airflow DAG Patterns
Production-ready patterns for Apache Airflow including DAG design, operators, sensors, testing, and deployment strategies.
## When to Use This Skill
- Creating data pipeline orchestration with Airflow
- Designing DAG structures and dependencies
- Implementing custom operators and sensors
- Testing Airflow DAGs locally
- Setting up Airflow in production
- Debugging failed DAG runs
## Core Concepts
### 1. DAG Design Principles
| Principle | Description |
|-----------|-------------|
| **Idempotent** | Running twice produces same result |
| **Atomic** | Tasks succeed or fail completely |
| **Incremental** | Process only new/changed data |
| **Observable** | Logs, metrics, alerts at every step |
### 2. Task Dependencies
```python
# Linear
task1 >> task2 >> task3
# Fan-out
task1 >> [task2, task3, task4]
# Fan-in
[task1, task2, task3] >> task4
# Complex
task1 >> task2 >> task4
task1 >> task3 >> task4
```
## Quick Start
```python
# dags/example_dag.py
from datetime import datetime, timedelta
from airflow import DAG
from airflow.operators.python import PythonOperator
from airflow.operators.empty import EmptyOperator

# Retry and alerting defaults shared by every task in this DAG.
default_args = {
    'owner': 'data-team',
    'depends_on_past': False,
    'email_on_failure': True,
    'email_on_retry': False,
    'retries': 3,
    'retry_delay': timedelta(minutes=5),
    'retry_exponential_backoff': True,
    'max_retry_delay': timedelta(hours=1),
}


def extract_data(**context):
    """Pull the source data for this run's logical date (``ds``)."""
    execution_date = context['ds']
    # Extract logic here
    return {'records': 1000}


dag = DAG(
    dag_id='example_etl',
    default_args=default_args,
    description='Example ETL pipeline',
    schedule='0 6 * * *',  # Daily at 6 AM
    start_date=datetime(2024, 1, 1),
    catchup=False,
    tags=['etl', 'example'],
    max_active_runs=1,
)

# Explicit dag= wiring (equivalent to the `with DAG(...)` context manager).
start = EmptyOperator(task_id='start', dag=dag)
extract = PythonOperator(
    task_id='extract',
    python_callable=extract_data,
    dag=dag,
)
end = EmptyOperator(task_id='end', dag=dag)

start >> extract >> end
```
## Patterns
### Pattern 1: TaskFlow API (Airflow 2.0+)
```python
# dags/taskflow_example.py
from datetime import datetime
from airflow.decorators import dag, task
from airflow.models import Variable


@dag(
    dag_id='taskflow_etl',
    schedule='@daily',
    start_date=datetime(2024, 1, 1),
    catchup=False,
    tags=['etl', 'taskflow'],
)
def taskflow_etl():
    """ETL pipeline using TaskFlow API.

    Each @task returns a plain dict; Airflow passes it between tasks via
    XCom automatically. Keep payloads small in real pipelines.
    """

    @task()
    def extract(source: str, ds=None) -> dict:
        """Extract data from source for the logical date.

        ``ds`` is injected by TaskFlow: context keys declared as
        parameters are filled from the Airflow context. Note that
        f-strings are NOT Jinja-templated, so the date must be
        interpolated explicitly -- a literal ``{{ ds }}`` inside an
        f-string would render as ``{ ds }``.
        """
        import pandas as pd
        df = pd.read_csv(f's3://bucket/{source}/{ds}.csv')
        return {'data': df.to_dict(), 'rows': len(df)}

    @task()
    def transform(extracted: dict) -> dict:
        """Transform extracted data: stamp processing time, drop nulls."""
        import pandas as pd
        df = pd.DataFrame(extracted['data'])
        df['processed_at'] = datetime.now()
        df = df.dropna()
        return {'data': df.to_dict(), 'rows': len(df)}

    @task()
    def load(transformed: dict, target: str, ds=None):
        """Load data to target, partitioned by logical date."""
        import pandas as pd
        df = pd.DataFrame(transformed['data'])
        df.to_parquet(f's3://bucket/{target}/{ds}.parquet')
        return transformed['rows']

    @task()
    def notify(rows_loaded: int):
        """Send notification with the number of rows loaded."""
        print(f'Loaded {rows_loaded} rows')

    # Define dependencies with XCom passing
    extracted = extract(source='raw_data')
    transformed = transform(extracted)
    loaded = load(transformed, target='processed_data')
    notify(loaded)


# Instantiate the DAG
taskflow_etl()
```
### Pattern 2: Dynamic DAG Generation
```python
# dags/dynamic_dag_factory.py
from datetime import datetime, timedelta
from airflow import DAG
from airflow.operators.python import PythonOperator
from airflow.models import Variable
import json

# Configuration for multiple similar pipelines
PIPELINE_CONFIGS = [
    {'name': 'customers', 'schedule': '@daily', 'source': 's3://raw/customers'},
    {'name': 'orders', 'schedule': '@hourly', 'source': 's3://raw/orders'},
    {'name': 'products', 'schedule': '@weekly', 'source': 's3://raw/products'},
]


def create_dag(config: dict) -> DAG:
    """Factory: build one extract -> transform -> load DAG from a config entry."""

    # Task callables close over nothing; per-run values arrive via op_kwargs
    # and the Airflow context.
    def extract_fn(source, **context):
        print(f"Extracting from {source} for {context['ds']}")

    def transform_fn(**context):
        print(f"Transforming data for {context['ds']}")

    def load_fn(table_name, **context):
        print(f"Loading to {table_name} for {context['ds']}")

    dag = DAG(
        dag_id=f"etl_{config['name']}",
        default_args={
            'owner': 'data-team',
            'retries': 3,
            'retry_delay': timedelta(minutes=5),
        },
        schedule=config['schedule'],
        start_date=datetime(2024, 1, 1),
        catchup=False,
        tags=['etl', 'dynamic', config['name']],
    )

    extract = PythonOperator(
        task_id='extract',
        python_callable=extract_fn,
        op_kwargs={'source': config['source']},
        dag=dag,
    )
    transform = PythonOperator(
        task_id='transform',
        python_callable=transform_fn,
        dag=dag,
    )
    load = PythonOperator(
        task_id='load',
        python_callable=load_fn,
        op_kwargs={'table_name': config['name']},
        dag=dag,
    )

    extract >> transform >> load
    return dag


# Register one module-level DAG object per config so the scheduler discovers them.
for config in PIPELINE_CONFIGS:
    globals()[f"dag_{config['name']}"] = create_dag(config)
```
### Pattern 3: Branching and Conditional Logic
```python
# dags/branching_example.py
from datetime import datetime  # was missing: datetime(2024, 1, 1) is used below
from airflow.decorators import dag, task
from airflow.operators.python import BranchPythonOperator
from airflow.operators.empty import EmptyOperator
from airflow.utils.trigger_rule import TriggerRule


@dag(
    dag_id='branching_pipeline',
    schedule='@daily',
    start_date=datetime(2024, 1, 1),
    catchup=False,
)
def branching_pipeline():
    """Route each run down one of three paths based on a data-quality score."""

    @task()
    def check_data_quality() -> dict:
        """Check data quality and return metrics"""
        quality_score = 0.95  # Simulated
        return {'score': quality_score, 'rows': 10000}

    def choose_branch(**context) -> str:
        """Return the task_id of the branch to execute (read from XCom)."""
        ti = context['ti']
        metrics = ti.xcom_pull(task_ids='check_data_quality')
        if metrics['score'] >= 0.9:
            return 'high_quality_path'
        elif metrics['score'] >= 0.7:
            return 'medium_quality_path'
        else:
            return 'low_quality_path'

    quality_check = check_data_quality()
    branch = BranchPythonOperator(
        task_id='branch',
        python_callable=choose_branch,
    )
    high_quality = EmptyOperator(task_id='high_quality_path')
    medium_quality = EmptyOperator(task_id='medium_quality_path')
    low_quality = EmptyOperator(task_id='low_quality_path')

    # Join point - runs after any branch completes. Skipped branches must not
    # fail the join, hence NONE_FAILED_MIN_ONE_SUCCESS instead of ALL_SUCCESS.
    join = EmptyOperator(
        task_id='join',
        trigger_rule=TriggerRule.NONE_FAILED_MIN_ONE_SUCCESS,
    )

    quality_check >> branch >> [high_quality, medium_quality, low_quality] >> join


branching_pipeline()
```
### Pattern 4: Sensors and External Dependencies
```python
# dags/sensor_patterns.py
from datetime import datetime, timedelta
from airflow import DAG
# `task` and `PokeReturnValue` are required by the @task.sensor example
# below; both were missing from the original import list.
from airflow.decorators import task
from airflow.sensors.base import PokeReturnValue
from airflow.sensors.filesystem import FileSensor
from airflow.providers.amazon.aws.sensors.s3 import S3KeySensor
from airflow.sensors.external_task import ExternalTaskSensor
from airflow.operators.python import PythonOperator

with DAG(
    dag_id='sensor_example',
    schedule='@daily',
    start_date=datetime(2024, 1, 1),
    catchup=False,
) as dag:
    # Wait for file on S3
    wait_for_file = S3KeySensor(
        task_id='wait_for_s3_file',
        bucket_name='data-lake',
        bucket_key='raw/{{ ds }}/data.parquet',
        aws_conn_id='aws_default',
        timeout=60 * 60 * 2,  # 2 hours
        poke_interval=60 * 5,  # Check every 5 minutes
        mode='reschedule',  # Free up worker slot while waiting
    )

    # Wait for another DAG to complete
    wait_for_upstream = ExternalTaskSensor(
        task_id='wait_for_upstream_dag',
        external_dag_id='upstream_etl',
        external_task_id='final_task',
        execution_date_fn=lambda dt: dt,  # Same execution date
        timeout=60 * 60 * 3,
        mode='reschedule',
    )

    # Custom sensor using @task.sensor decorator
    @task.sensor(poke_interval=60, timeout=3600, mode='reschedule')
    def wait_for_api() -> PokeReturnValue:
        """Custom sensor for API availability"""
        import requests
        response = requests.get('https://api.example.com/health')
        is_done = response.status_code == 200
        # xcom_value is pushed to XCom once the sensor succeeds.
        return PokeReturnValue(is_done=is_done, xcom_value=response.json())

    api_ready = wait_for_api()

    def process_data(**context):
        # The sensor's XCom carries the health-check JSON payload.
        api_result = context['ti'].xcom_pull(task_ids='wait_for_api')
        print(f"API returned: {api_result}")

    process = PythonOperator(
        task_id='process',
        python_callable=process_data,
    )

    [wait_for_file, wait_for_upstream, api_ready] >> process
```
### Pattern 5: Error Handling and Alerts
```python
# dags/error_handling.py
from datetime import datetime, timedelta
from airflow import DAG
from airflow.operators.python import PythonOperator
from airflow.utils.trigger_rule import TriggerRule
from airflow.models import Variable

def task_failure_callback(context):
    """Callback on task failure"""
    # Airflow passes the full task context dict to failure callbacks.
    task_instance = context['task_instance']
    exception = context.get('exception')
    # Send to Slack/PagerDuty/etc
    message = f"""
    Task Failed!
    DAG: {task_instance.dag_id}
    Task: {task_instance.task_id}
    Execution Date: {context['ds']}
    Error: {exception}
    Log URL: {task_instance.log_url}
    """
    # send_slack_alert(message)
    print(message)

def dag_failure_callback(context):
    """Callback on DAG failure"""
    # Aggregate failures, send summary
    pass

with DAG(
    dag_id='error_handling_example',
    schedule='@daily',
    start_date=datetime(2024, 1, 1),
    catchup=False,
    # DAG-level callback fires once per failed DAG run; the task-level
    # callback in default_args fires per failed task instance.
    on_failure_callback=dag_failure_callback,
    default_args={
        'on_failure_callback': task_failure_callback,
        'retries': 3,
        'retry_delay': timedelta(minutes=5),
    },
) as dag:
    def might_fail(**context):
        # Simulated flaky task (~30% failure rate) to exercise retries
        # and the failure callbacks above.
        import random
        if random.random() < 0.3:
            raise ValueError("Random failure!")
        return "Success"

    risky_task = PythonOperator(
        task_id='risky_task',
        python_callable=might_fail,
    )

    def cleanup(**context):
        """Cleanup runs regardless of upstream failures"""
        print("Cleaning up...")

    cleanup_task = PythonOperator(
        task_id='cleanup',
        python_callable=cleanup,
        trigger_rule=TriggerRule.ALL_DONE,  # Run even if upstream fails
    )

    def notify_success(**context):
        """Only runs if all upstream succeeded"""
        print("All tasks succeeded!")

    success_notification = PythonOperator(
        task_id='notify_success',
        python_callable=notify_success,
        trigger_rule=TriggerRule.ALL_SUCCESS,
    )

    # Both downstream tasks depend on risky_task; their trigger rules
    # decide whether they run on failure (cleanup) or success (notify).
    risky_task >> [cleanup_task, success_notification]
```
### Pattern 6: Testing DAGs
```python
# tests/test_dags.py
import pytest
from datetime import datetime
from airflow.models import DagBag
# DAG.test_cycle() was removed in Airflow 2.0; cycle detection lives in
# airflow.utils.dag_cycle_tester.check_cycle.
from airflow.utils.dag_cycle_tester import check_cycle

@pytest.fixture
def dagbag():
    """Load all project DAGs once (Airflow's bundled examples excluded)."""
    return DagBag(dag_folder='dags/', include_examples=False)

def test_dag_loaded(dagbag):
    """Test that all DAGs load without errors"""
    assert len(dagbag.import_errors) == 0, f"DAG import errors: {dagbag.import_errors}"

def test_dag_structure(dagbag):
    """Test specific DAG structure"""
    dag = dagbag.get_dag('example_etl')
    assert dag is not None
    assert len(dag.tasks) == 3
    assert dag.schedule_interval == '0 6 * * *'

def test_task_dependencies(dagbag):
    """Test task dependencies are correct"""
    dag = dagbag.get_dag('example_etl')
    extract_task = dag.get_task('extract')
    assert 'start' in [t.task_id for t in extract_task.upstream_list]
    assert 'end' in [t.task_id for t in extract_task.downstream_list]

def test_dag_integrity(dagbag):
    """Test DAG has no cycles and is valid"""
    for dag_id, dag in dagbag.dags.items():
        # check_cycle raises AirflowDagCycleException if a cycle exists,
        # failing the test with a precise message.
        check_cycle(dag)

# Test individual task logic
def test_extract_function():
    """Unit test for extract function"""
    from dags.example_dag import extract_data
    result = extract_data(ds='2024-01-01')
    assert 'records' in result
    assert isinstance(result['records'], int)
```
## Project Structure
```
airflow/
├── dags/
│ ├── __init__.py
│ ├── common/
│ │ ├── __init__.py
│ │ ├── operators.py # Custom operators
│ │ ├── sensors.py # Custom sensors
│ │ └── callbacks.py # Alert callbacks
│ ├── etl/
│ │ ├── customers.py
│ │ └── orders.py
│ └── ml/
│ └── training.py
├── plugins/
│ └── custom_plugin.py
├── tests/
│ ├── __init__.py
│ ├── test_dags.py
│ └── test_operators.py
├── docker-compose.yml
└── requirements.txt
```
## Best Practices
### Do's
- **Use TaskFlow API** - Cleaner code, automatic XCom
- **Set timeouts** - Prevent zombie tasks
- **Use `mode='reschedule'`** - For sensors, free up workers
- **Test DAGs** - Unit tests and integration tests
- **Idempotent tasks** - Safe to retry
### Don'ts
- **Don't use `depends_on_past=True`** - Creates bottlenecks
- **Don't hardcode dates** - Use `{{ ds }}` macros
- **Don't use global state** - Tasks should be stateless
- **Don't skip catchup blindly** - Understand implications
- **Don't put heavy logic in DAG file** - Import from modules
## Resources
- [Airflow Documentation](https://airflow.apache.org/docs/)
- [Astronomer Guides](https://docs.astronomer.io/learn)
- [TaskFlow API](https://airflow.apache.org/docs/apache-airflow/stable/tutorial/taskflow.html)

View File

@@ -0,0 +1,587 @@
---
name: data-quality-frameworks
description: Implement data quality validation with Great Expectations, dbt tests, and data contracts. Use when building data quality pipelines, implementing validation rules, or establishing data contracts.
---
# Data Quality Frameworks
Production patterns for implementing data quality with Great Expectations, dbt tests, and data contracts to ensure reliable data pipelines.
## When to Use This Skill
- Implementing data quality checks in pipelines
- Setting up Great Expectations validation
- Building comprehensive dbt test suites
- Establishing data contracts between teams
- Monitoring data quality metrics
- Automating data validation in CI/CD
## Core Concepts
### 1. Data Quality Dimensions
| Dimension | Description | Example Check |
|-----------|-------------|---------------|
| **Completeness** | No missing values | `expect_column_values_to_not_be_null` |
| **Uniqueness** | No duplicates | `expect_column_values_to_be_unique` |
| **Validity** | Values in expected range | `expect_column_values_to_be_in_set` |
| **Accuracy** | Data matches reality | Cross-reference validation |
| **Consistency** | No contradictions | `expect_column_pair_values_A_to_be_greater_than_B` |
| **Timeliness** | Data is recent | `expect_column_max_to_be_between` |
### 2. Testing Pyramid for Data
```
/\
/ \ Integration Tests (cross-table)
/────\
/ \ Unit Tests (single column)
/────────\
/ \ Schema Tests (structure)
/────────────\
```
## Quick Start
### Great Expectations Setup
```bash
# Install
pip install great_expectations
# Initialize project
great_expectations init
# Create datasource
great_expectations datasource new
```
```python
# scripts/build_orders_suite.py  (Python, not a checkpoint YAML -- the
# original comment pointed at great_expectations/checkpoints/daily_validation.yml)
import great_expectations as gx
# Create context
context = gx.get_context()
# Create expectation suite
# NOTE(review): API shown mixes GX fluent-style calls; verify the method
# names against the installed great_expectations version -- this surface
# changed significantly between 0.x and 1.0.
suite = context.add_expectation_suite("orders_suite")
# Add expectations
suite.add_expectation(
    gx.expectations.ExpectColumnValuesToNotBeNull(column="order_id")
)
suite.add_expectation(
    gx.expectations.ExpectColumnValuesToBeUnique(column="order_id")
)
# Validate
# NOTE(review): assumes a checkpoint named "daily_orders" has been
# configured separately -- it is not defined in this snippet.
results = context.run_checkpoint(checkpoint_name="daily_orders")
```
## Patterns
### Pattern 1: Great Expectations Suite
```python
# expectations/orders_suite.py
import great_expectations as gx
from great_expectations.core import ExpectationSuite
from great_expectations.core.expectation_configuration import ExpectationConfiguration
def build_orders_suite() -> ExpectationSuite:
    """Build a comprehensive expectation suite for the orders table.

    Covers schema shape, key integrity, categorical domains, numeric
    ranges, timestamp validity, freshness, row-count sanity, and a
    statistical guardrail on the mean order amount.

    Returns:
        ExpectationSuite: suite named "orders_suite", ready to be saved
        to a Data Context or referenced from a checkpoint.
    """
    suite = ExpectationSuite(expectation_suite_name="orders_suite")

    # Schema expectations: required columns must exist (extras allowed).
    suite.add_expectation(ExpectationConfiguration(
        expectation_type="expect_table_columns_to_match_set",
        kwargs={
            "column_set": ["order_id", "customer_id", "amount", "status", "created_at"],
            "exact_match": False  # Allow additional columns
        }
    ))

    # Primary key: present and unique.
    suite.add_expectation(ExpectationConfiguration(
        expectation_type="expect_column_values_to_not_be_null",
        kwargs={"column": "order_id"}
    ))
    suite.add_expectation(ExpectationConfiguration(
        expectation_type="expect_column_values_to_be_unique",
        kwargs={"column": "order_id"}
    ))

    # Foreign key: must be populated (referential integrity checked elsewhere).
    suite.add_expectation(ExpectationConfiguration(
        expectation_type="expect_column_values_to_not_be_null",
        kwargs={"column": "customer_id"}
    ))

    # Categorical values: status restricted to the known order lifecycle.
    suite.add_expectation(ExpectationConfiguration(
        expectation_type="expect_column_values_to_be_in_set",
        kwargs={
            "column": "status",
            "value_set": ["pending", "processing", "shipped", "delivered", "cancelled"]
        }
    ))

    # Numeric ranges: strictly positive amounts with a sanity ceiling.
    suite.add_expectation(ExpectationConfiguration(
        expectation_type="expect_column_values_to_be_between",
        kwargs={
            "column": "amount",
            "min_value": 0,
            "max_value": 100000,
            "strict_min": True  # amount > 0
        }
    ))

    # Date validity: created_at must be a parseable timestamp.
    suite.add_expectation(ExpectationConfiguration(
        expectation_type="expect_column_values_to_be_dateutil_parseable",
        kwargs={"column": "created_at"}
    ))

    # Freshness - newest row must fall within the last 24 hours.
    # FIX: GX evaluation parameters use the callable form "now()";
    # the bare token "now" is not recognized by the parser.
    suite.add_expectation(ExpectationConfiguration(
        expectation_type="expect_column_max_to_be_between",
        kwargs={
            "column": "created_at",
            "min_value": {"$PARAMETER": "now() - timedelta(days=1)"},
            "max_value": {"$PARAMETER": "now()"}
        }
    ))

    # Row count sanity: guard against empty loads and runaway duplication.
    suite.add_expectation(ExpectationConfiguration(
        expectation_type="expect_table_row_count_to_be_between",
        kwargs={
            "min_value": 1000,  # Expect at least 1000 rows
            "max_value": 10000000
        }
    ))

    # Statistical guardrail: mean order amount within the expected band.
    suite.add_expectation(ExpectationConfiguration(
        expectation_type="expect_column_mean_to_be_between",
        kwargs={
            "column": "amount",
            "min_value": 50,
            "max_value": 500
        }
    ))

    return suite
```
### Pattern 2: Great Expectations Checkpoint
```yaml
# great_expectations/checkpoints/orders_checkpoint.yml
name: orders_checkpoint
config_version: 1.0
class_name: Checkpoint
run_name_template: "%Y%m%d-%H%M%S-orders-validation"
validations:
- batch_request:
datasource_name: warehouse
data_connector_name: default_inferred_data_connector_name
data_asset_name: orders
data_connector_query:
index: -1 # Latest batch
expectation_suite_name: orders_suite
action_list:
- name: store_validation_result
action:
class_name: StoreValidationResultAction
- name: store_evaluation_parameters
action:
class_name: StoreEvaluationParametersAction
- name: update_data_docs
action:
class_name: UpdateDataDocsAction
# Slack notification on failure
- name: send_slack_notification
action:
class_name: SlackNotificationAction
slack_webhook: ${SLACK_WEBHOOK}
notify_on: failure
renderer:
module_name: great_expectations.render.renderer.slack_renderer
class_name: SlackRenderer
```
```python
# Run checkpoint
import great_expectations as gx

context = gx.get_context()

# Execute the checkpoint declared in orders_checkpoint.yml above
result = context.run_checkpoint(checkpoint_name="orders_checkpoint")

# Fail loudly so CI / the orchestrator marks the run as failed.
if not result.success:
    # Keep only the validation results that did not pass.
    # NOTE(review): assumes run_results values expose a .success attribute --
    # verify against the installed Great Expectations version.
    failed_expectations = [
        r for r in result.run_results.values()
        if not r.success
    ]
    raise ValueError(f"Data quality check failed: {failed_expectations}")
```
### Pattern 3: dbt Data Tests
```yaml
# models/marts/core/_core__models.yml
version: 2
models:
- name: fct_orders
description: Order fact table
tests:
# Table-level tests
- dbt_utils.recency:
datepart: day
field: created_at
interval: 1
- dbt_utils.at_least_one
- dbt_utils.expression_is_true:
expression: "total_amount >= 0"
columns:
- name: order_id
description: Primary key
tests:
- unique
- not_null
- name: customer_id
description: Foreign key to dim_customers
tests:
- not_null
- relationships:
to: ref('dim_customers')
field: customer_id
- name: order_status
tests:
- accepted_values:
values: ['pending', 'processing', 'shipped', 'delivered', 'cancelled']
- name: total_amount
tests:
- not_null
- dbt_utils.expression_is_true:
expression: ">= 0"
- name: created_at
tests:
- not_null
- dbt_utils.expression_is_true:
expression: "<= current_timestamp"
- name: dim_customers
columns:
- name: customer_id
tests:
- unique
- not_null
- name: email
tests:
- unique
- not_null
# Custom regex test
- dbt_utils.expression_is_true:
expression: "email ~ '^[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\\.[A-Za-z]{2,}$'"
```
### Pattern 4: Custom dbt Tests
```sql
-- tests/generic/test_row_count_in_range.sql
{% test row_count_in_range(model, min_count, max_count) %}
with row_count as (
select count(*) as cnt from {{ model }}
)
select cnt
from row_count
where cnt < {{ min_count }} or cnt > {{ max_count }}
{% endtest %}
-- Usage in schema.yml:
-- tests:
-- - row_count_in_range:
-- min_count: 1000
-- max_count: 10000000
```
```sql
-- tests/generic/test_sequential_values.sql
{% test sequential_values(model, column_name, interval=1) %}
with lagged as (
select
{{ column_name }},
lag({{ column_name }}) over (order by {{ column_name }}) as prev_value
from {{ model }}
)
select *
from lagged
where {{ column_name }} - prev_value != {{ interval }}
and prev_value is not null
{% endtest %}
```
```sql
-- tests/singular/assert_orders_customers_match.sql
-- Singular test: specific business rule
with orders_customers as (
select distinct customer_id from {{ ref('fct_orders') }}
),
dim_customers as (
select customer_id from {{ ref('dim_customers') }}
),
orphaned_orders as (
select o.customer_id
from orders_customers o
left join dim_customers c using (customer_id)
where c.customer_id is null
)
select * from orphaned_orders
-- Test passes if this returns 0 rows
```
### Pattern 5: Data Contracts
```yaml
# contracts/orders_contract.yaml
apiVersion: datacontract.com/v1.0.0
kind: DataContract
metadata:
name: orders
version: 1.0.0
owner: data-platform-team
contact: data-team@company.com
info:
title: Orders Data Contract
description: Contract for order event data from the ecommerce platform
purpose: Analytics, reporting, and ML features
servers:
production:
type: snowflake
account: company.us-east-1
database: ANALYTICS
schema: CORE
terms:
usage: Internal analytics only
limitations: PII must not be exposed in downstream marts
billing: Charged per query TB scanned
schema:
type: object
properties:
order_id:
type: string
format: uuid
description: Unique order identifier
required: true
unique: true
pii: false
customer_id:
type: string
format: uuid
description: Customer identifier
required: true
pii: true
piiClassification: indirect
total_amount:
type: number
minimum: 0
maximum: 100000
description: Order total in USD
created_at:
type: string
format: date-time
description: Order creation timestamp
required: true
status:
type: string
enum: [pending, processing, shipped, delivered, cancelled]
description: Current order status
quality:
type: SodaCL
specification:
checks for orders:
- row_count > 0
- missing_count(order_id) = 0
- duplicate_count(order_id) = 0
- invalid_count(status) = 0:
valid values: [pending, processing, shipped, delivered, cancelled]
- freshness(created_at) < 24h
sla:
availability: 99.9%
freshness: 1 hour
latency: 5 minutes
```
### Pattern 6: Automated Quality Pipeline
```python
# quality_pipeline.py
from dataclasses import dataclass
from typing import List, Dict, Any
import great_expectations as gx
from datetime import datetime
@dataclass
class QualityResult:
    """Outcome of validating one table against an expectation suite."""
    table: str                      # name of the table that was validated
    passed: bool                    # True when every expectation succeeded
    total_expectations: int         # number of expectations evaluated
    failed_expectations: int        # how many of them failed
    details: List[Dict[str, Any]]   # per-expectation outcome records
    timestamp: datetime             # when the validation completed
class DataQualityPipeline:
    """Orchestrate data quality checks across tables.

    Wraps a Great Expectations Data Context to validate multiple tables,
    collect structured results, and render a markdown summary report.
    """

    def __init__(self, context: gx.DataContext):
        # Shared GX context used to build and run checkpoints.
        self.context = context
        self.results: List[QualityResult] = []

    def validate_table(self, table: str, suite: str) -> QualityResult:
        """Validate a single table against an expectation suite.

        Args:
            table: data asset name in the "warehouse" datasource.
            suite: name of the expectation suite to evaluate.

        Returns:
            QualityResult summarizing pass/fail counts and per-expectation
            details for the table.
        """
        checkpoint_config = {
            "name": f"{table}_validation",
            "config_version": 1.0,
            "class_name": "Checkpoint",
            "validations": [{
                "batch_request": {
                    "datasource_name": "warehouse",
                    "data_asset_name": table,
                },
                "expectation_suite_name": suite,
            }],
        }
        result = self.context.run_checkpoint(**checkpoint_config)

        # Parse results: this checkpoint runs exactly one validation,
        # so take the first (only) run result.
        validation_result = list(result.run_results.values())[0]
        results = validation_result.results
        failed = [r for r in results if not r.success]

        return QualityResult(
            table=table,
            passed=result.success,
            total_expectations=len(results),
            failed_expectations=len(failed),
            details=[{
                "expectation": r.expectation_config.expectation_type,
                "success": r.success,
                "observed_value": r.result.get("observed_value"),
            } for r in results],
            timestamp=datetime.now(),
        )

    def run_all(self, tables: Dict[str, str]) -> Dict[str, QualityResult]:
        """Run validation for all tables; maps table name -> suite name."""
        results = {}
        for table, suite in tables.items():
            print(f"Validating {table}...")
            results[table] = self.validate_table(table, suite)
        return results

    def generate_report(self, results: Dict[str, QualityResult]) -> str:
        """Render a markdown quality report from validation results."""
        report = ["# Data Quality Report", f"Generated: {datetime.now()}", ""]
        total_passed = sum(1 for r in results.values() if r.passed)
        total_tables = len(results)
        report.append(f"## Summary: {total_passed}/{total_tables} tables passed")
        report.append("")
        for table, result in results.items():
            # FIX: both branches previously assigned the empty string, so
            # passing and failing tables were indistinguishable in the report.
            status = "PASS" if result.passed else "FAIL"
            report.append(f"### {status} {table}")
            report.append(f"- Expectations: {result.total_expectations}")
            report.append(f"- Failed: {result.failed_expectations}")
            if not result.passed:
                report.append("- Failed checks:")
                for detail in result.details:
                    if not detail["success"]:
                        report.append(f"  - {detail['expectation']}: {detail['observed_value']}")
            report.append("")
        return "\n".join(report)
# Usage
context = gx.get_context()
pipeline = DataQualityPipeline(context)

# Map of table name -> expectation suite to validate it against.
tables_to_validate = {
    "orders": "orders_suite",
    "customers": "customers_suite",
    "products": "products_suite",
}

results = pipeline.run_all(tables_to_validate)
report = pipeline.generate_report(results)

# Fail pipeline if any table failed, printing the report for diagnostics.
if not all(r.passed for r in results.values()):
    print(report)
    raise ValueError("Data quality checks failed!")
```
## Best Practices
### Do's
- **Test early** - Validate source data before transformations
- **Test incrementally** - Add tests as you find issues
- **Document expectations** - Clear descriptions for each test
- **Alert on failures** - Integrate with monitoring
- **Version contracts** - Track schema changes
### Don'ts
- **Don't test everything** - Focus on critical columns
- **Don't ignore warnings** - They often precede failures
- **Don't skip freshness** - Stale data is bad data
- **Don't hardcode thresholds** - Use dynamic baselines
- **Don't test in isolation** - Test relationships too
## Resources
- [Great Expectations Documentation](https://docs.greatexpectations.io/)
- [dbt Testing Documentation](https://docs.getdbt.com/docs/build/tests)
- [Data Contract Specification](https://datacontract.com/)
- [Soda Core](https://docs.soda.io/soda-core/overview.html)

View File

@@ -0,0 +1,561 @@
---
name: dbt-transformation-patterns
description: Master dbt (data build tool) for analytics engineering with model organization, testing, documentation, and incremental strategies. Use when building data transformations, creating data models, or implementing analytics engineering best practices.
---
# dbt Transformation Patterns
Production-ready patterns for dbt (data build tool) including model organization, testing strategies, documentation, and incremental processing.
## When to Use This Skill
- Building data transformation pipelines with dbt
- Organizing models into staging, intermediate, and marts layers
- Implementing data quality tests
- Creating incremental models for large datasets
- Documenting data models and lineage
- Setting up dbt project structure
## Core Concepts
### 1. Model Layers (Medallion Architecture)
```
sources/        Raw data definitions
    ↓
staging/        1:1 with source, light cleaning
    ↓
intermediate/   Business logic, joins, aggregations
    ↓
marts/          Final analytics tables
```
### 2. Naming Conventions
| Layer | Prefix | Example |
|-------|--------|---------|
| Staging | `stg_` | `stg_stripe__payments` |
| Intermediate | `int_` | `int_payments_pivoted` |
| Marts | `dim_`, `fct_` | `dim_customers`, `fct_orders` |
## Quick Start
```yaml
# dbt_project.yml
name: 'analytics'
version: '1.0.0'
profile: 'analytics'
model-paths: ["models"]
analysis-paths: ["analyses"]
test-paths: ["tests"]
seed-paths: ["seeds"]
macro-paths: ["macros"]
vars:
start_date: '2020-01-01'
models:
analytics:
staging:
+materialized: view
+schema: staging
intermediate:
+materialized: ephemeral
marts:
+materialized: table
+schema: analytics
```
```
# Project structure
models/
├── staging/
│ ├── stripe/
│ │ ├── _stripe__sources.yml
│ │ ├── _stripe__models.yml
│ │ ├── stg_stripe__customers.sql
│ │ └── stg_stripe__payments.sql
│ └── shopify/
│ ├── _shopify__sources.yml
│ └── stg_shopify__orders.sql
├── intermediate/
│ └── finance/
│ └── int_payments_pivoted_to_customer.sql
└── marts/
├── core/
│ ├── _core__models.yml
│ ├── dim_customers.sql
│ └── fct_orders.sql
└── finance/
└── fct_revenue.sql
```
## Patterns
### Pattern 1: Source Definitions
```yaml
# models/staging/stripe/_stripe__sources.yml
version: 2
sources:
- name: stripe
description: Raw Stripe data loaded via Fivetran
database: raw
schema: stripe
loader: fivetran
loaded_at_field: _fivetran_synced
freshness:
warn_after: {count: 12, period: hour}
error_after: {count: 24, period: hour}
tables:
- name: customers
description: Stripe customer records
columns:
- name: id
description: Primary key
tests:
- unique
- not_null
- name: email
description: Customer email
- name: created
description: Account creation timestamp
- name: payments
description: Stripe payment transactions
columns:
- name: id
tests:
- unique
- not_null
- name: customer_id
tests:
- not_null
- relationships:
to: source('stripe', 'customers')
field: id
```
### Pattern 2: Staging Models
```sql
-- models/staging/stripe/stg_stripe__customers.sql
with source as (
select * from {{ source('stripe', 'customers') }}
),
renamed as (
select
-- ids
id as customer_id,
-- strings
lower(email) as email,
name as customer_name,
-- timestamps
created as created_at,
-- metadata
_fivetran_synced as _loaded_at
from source
)
select * from renamed
```
```sql
-- models/staging/stripe/stg_stripe__payments.sql
{{
config(
materialized='incremental',
unique_key='payment_id',
on_schema_change='append_new_columns'
)
}}
with source as (
select * from {{ source('stripe', 'payments') }}
{% if is_incremental() %}
where _fivetran_synced > (select max(_loaded_at) from {{ this }})
{% endif %}
),
renamed as (
select
-- ids
id as payment_id,
customer_id,
invoice_id,
-- amounts (convert cents to dollars)
amount / 100.0 as amount,
amount_refunded / 100.0 as amount_refunded,
-- status
status as payment_status,
-- timestamps
created as created_at,
-- metadata
_fivetran_synced as _loaded_at
from source
)
select * from renamed
```
### Pattern 3: Intermediate Models
```sql
-- models/intermediate/finance/int_payments_pivoted_to_customer.sql
with payments as (
select * from {{ ref('stg_stripe__payments') }}
),
customers as (
select * from {{ ref('stg_stripe__customers') }}
),
payment_summary as (
select
customer_id,
count(*) as total_payments,
count(case when payment_status = 'succeeded' then 1 end) as successful_payments,
sum(case when payment_status = 'succeeded' then amount else 0 end) as total_amount_paid,
min(created_at) as first_payment_at,
max(created_at) as last_payment_at
from payments
group by customer_id
)
select
customers.customer_id,
customers.email,
customers.created_at as customer_created_at,
coalesce(payment_summary.total_payments, 0) as total_payments,
coalesce(payment_summary.successful_payments, 0) as successful_payments,
coalesce(payment_summary.total_amount_paid, 0) as lifetime_value,
payment_summary.first_payment_at,
payment_summary.last_payment_at
from customers
left join payment_summary using (customer_id)
```
### Pattern 4: Mart Models (Dimensions and Facts)
```sql
-- models/marts/core/dim_customers.sql
{{
config(
materialized='table',
unique_key='customer_id'
)
}}
with customers as (
select * from {{ ref('int_payments_pivoted_to_customer') }}
),
orders as (
select * from {{ ref('stg_shopify__orders') }}
),
order_summary as (
select
customer_id,
count(*) as total_orders,
sum(total_price) as total_order_value,
min(created_at) as first_order_at,
max(created_at) as last_order_at
from orders
group by customer_id
),
final as (
select
-- surrogate key
{{ dbt_utils.generate_surrogate_key(['customers.customer_id']) }} as customer_key,
-- natural key
customers.customer_id,
-- attributes
customers.email,
customers.customer_created_at,
-- payment metrics
customers.total_payments,
customers.successful_payments,
customers.lifetime_value,
customers.first_payment_at,
customers.last_payment_at,
-- order metrics
coalesce(order_summary.total_orders, 0) as total_orders,
coalesce(order_summary.total_order_value, 0) as total_order_value,
order_summary.first_order_at,
order_summary.last_order_at,
-- calculated fields
case
when customers.lifetime_value >= 1000 then 'high'
when customers.lifetime_value >= 100 then 'medium'
else 'low'
end as customer_tier,
-- timestamps
current_timestamp as _loaded_at
from customers
left join order_summary using (customer_id)
)
select * from final
```
```sql
-- models/marts/core/fct_orders.sql
{{
config(
materialized='incremental',
unique_key='order_id',
incremental_strategy='merge'
)
}}
with orders as (
select * from {{ ref('stg_shopify__orders') }}
{% if is_incremental() %}
where updated_at > (select max(updated_at) from {{ this }})
{% endif %}
),
customers as (
select * from {{ ref('dim_customers') }}
),
final as (
select
-- keys
orders.order_id,
customers.customer_key,
orders.customer_id,
-- dimensions
orders.order_status,
orders.fulfillment_status,
orders.payment_status,
-- measures
orders.subtotal,
orders.tax,
orders.shipping,
orders.total_price,
orders.total_discount,
orders.item_count,
-- timestamps
orders.created_at,
orders.updated_at,
orders.fulfilled_at,
-- metadata
current_timestamp as _loaded_at
from orders
left join customers on orders.customer_id = customers.customer_id
)
select * from final
```
### Pattern 5: Testing and Documentation
```yaml
# models/marts/core/_core__models.yml
version: 2
models:
- name: dim_customers
description: Customer dimension with payment and order metrics
columns:
- name: customer_key
description: Surrogate key for the customer dimension
tests:
- unique
- not_null
- name: customer_id
description: Natural key from source system
tests:
- unique
- not_null
- name: email
description: Customer email address
tests:
- not_null
- name: customer_tier
description: Customer value tier based on lifetime value
tests:
- accepted_values:
values: ['high', 'medium', 'low']
- name: lifetime_value
description: Total amount paid by customer
tests:
- dbt_utils.expression_is_true:
expression: ">= 0"
- name: fct_orders
description: Order fact table with all order transactions
tests:
- dbt_utils.recency:
datepart: day
field: created_at
interval: 1
columns:
- name: order_id
tests:
- unique
- not_null
- name: customer_key
tests:
- not_null
- relationships:
to: ref('dim_customers')
field: customer_key
```
### Pattern 6: Macros and DRY Code
```sql
-- macros/cents_to_dollars.sql
{% macro cents_to_dollars(column_name, precision=2) %}
round({{ column_name }} / 100.0, {{ precision }})
{% endmacro %}
-- macros/generate_schema_name.sql
{% macro generate_schema_name(custom_schema_name, node) %}
{%- set default_schema = target.schema -%}
{%- if custom_schema_name is none -%}
{{ default_schema }}
{%- else -%}
{{ default_schema }}_{{ custom_schema_name }}
{%- endif -%}
{% endmacro %}
-- macros/limit_data_in_dev.sql
{% macro limit_data_in_dev(column_name, days=3) %}
{% if target.name == 'dev' %}
where {{ column_name }} >= dateadd(day, -{{ days }}, current_date)
{% endif %}
{% endmacro %}
-- Usage in model
select * from {{ ref('stg_orders') }}
{{ limit_data_in_dev('created_at') }}
```
### Pattern 7: Incremental Strategies
```sql
-- Delete+Insert (default for most warehouses)
{{
config(
materialized='incremental',
unique_key='id',
incremental_strategy='delete+insert'
)
}}
-- Merge (best for late-arriving data)
{{
config(
materialized='incremental',
unique_key='id',
incremental_strategy='merge',
merge_update_columns=['status', 'amount', 'updated_at']
)
}}
-- Insert Overwrite (partition-based)
{{
config(
materialized='incremental',
incremental_strategy='insert_overwrite',
partition_by={
"field": "created_date",
"data_type": "date",
"granularity": "day"
}
)
}}
select
*,
date(created_at) as created_date
from {{ ref('stg_events') }}
{% if is_incremental() %}
where created_date >= dateadd(day, -3, current_date)
{% endif %}
```
## dbt Commands
```bash
# Development
dbt run # Run all models
dbt run --select staging # Run staging models only
dbt run --select +fct_orders # Run fct_orders and its upstream
dbt run --select fct_orders+ # Run fct_orders and its downstream
dbt run --full-refresh # Rebuild incremental models
# Testing
dbt test # Run all tests
dbt test --select stg_stripe # Test specific models
dbt build # Run + test in DAG order
# Documentation
dbt docs generate # Generate docs
dbt docs serve # Serve docs locally
# Debugging
dbt compile # Compile SQL without running
dbt debug # Test connection
dbt ls --select tag:critical # List models by tag
```
## Best Practices
### Do's
- **Use staging layer** - Clean data once, use everywhere
- **Test aggressively** - Not null, unique, relationships
- **Document everything** - Column descriptions, model descriptions
- **Use incremental** - For tables > 1M rows
- **Version control** - dbt project in Git
### Don'ts
- **Don't skip staging** - Raw → mart is tech debt
- **Don't hardcode dates** - Use `{{ var('start_date') }}`
- **Don't repeat logic** - Extract to macros
- **Don't test in prod** - Use dev target
- **Don't ignore freshness** - Monitor source data
## Resources
- [dbt Documentation](https://docs.getdbt.com/)
- [dbt Best Practices](https://docs.getdbt.com/guides/best-practices)
- [dbt-utils Package](https://hub.getdbt.com/dbt-labs/dbt_utils/latest/)
- [dbt Discourse](https://discourse.getdbt.com/)

View File

@@ -0,0 +1,415 @@
---
name: spark-optimization
description: Optimize Apache Spark jobs with partitioning, caching, shuffle optimization, and memory tuning. Use when improving Spark performance, debugging slow jobs, or scaling data processing pipelines.
---
# Apache Spark Optimization
Production patterns for optimizing Apache Spark jobs including partitioning strategies, memory management, shuffle optimization, and performance tuning.
## When to Use This Skill
- Optimizing slow Spark jobs
- Tuning memory and executor configuration
- Implementing efficient partitioning strategies
- Debugging Spark performance issues
- Scaling Spark pipelines for large datasets
- Reducing shuffle and data skew
## Core Concepts
### 1. Spark Execution Model
```
Driver Program
  └─ Job (triggered by action)
       └─ Stages (separated by shuffles)
            └─ Tasks (one per partition)
```
### 2. Key Performance Factors
| Factor | Impact | Solution |
|--------|--------|----------|
| **Shuffle** | Network I/O, disk I/O | Minimize wide transformations |
| **Data Skew** | Uneven task duration | Salting, broadcast joins |
| **Serialization** | CPU overhead | Use Kryo, columnar formats |
| **Memory** | GC pressure, spills | Tune executor memory |
| **Partitions** | Parallelism | Right-size partitions |
## Quick Start
```python
from pyspark.sql import SparkSession
from pyspark.sql import functions as F
# Create optimized Spark session
spark = (SparkSession.builder
.appName("OptimizedJob")
.config("spark.sql.adaptive.enabled", "true")
.config("spark.sql.adaptive.coalescePartitions.enabled", "true")
.config("spark.sql.adaptive.skewJoin.enabled", "true")
.config("spark.serializer", "org.apache.spark.serializer.KryoSerializer")
.config("spark.sql.shuffle.partitions", "200")
.getOrCreate())
# Read with optimized settings
df = (spark.read
.format("parquet")
.option("mergeSchema", "false")
.load("s3://bucket/data/"))
# Efficient transformations
result = (df
.filter(F.col("date") >= "2024-01-01")
.select("id", "amount", "category")
.groupBy("category")
.agg(F.sum("amount").alias("total")))
result.write.mode("overwrite").parquet("s3://bucket/output/")
```
## Patterns
### Pattern 1: Optimal Partitioning
```python
# Calculate optimal partition count
def calculate_partitions(data_size_gb: float, partition_size_mb: int = 128) -> int:
    """Return a partition count targeting ``partition_size_mb`` per partition.

    Optimal partition size: 128MB - 256MB.
    Too few partitions: under-utilization, memory pressure.
    Too many partitions: task-scheduling overhead.
    """
    total_mb = data_size_gb * 1024
    count = int(total_mb / partition_size_mb)
    # Never return fewer than one partition, even for tiny datasets.
    return count if count >= 1 else 1
# Repartition for even distribution
df_repartitioned = df.repartition(200, "partition_key")
# Coalesce to reduce partitions (no shuffle)
df_coalesced = df.coalesce(100)
# Partition pruning with predicate pushdown
df = (spark.read.parquet("s3://bucket/data/")
.filter(F.col("date") == "2024-01-01")) # Spark pushes this down
# Write with partitioning for future queries
(df.write
.partitionBy("year", "month", "day")
.mode("overwrite")
.parquet("s3://bucket/partitioned_output/"))
```
### Pattern 2: Join Optimization
```python
from pyspark.sql import functions as F
from pyspark.sql.types import *
# 1. Broadcast Join - Small table joins
# Best when: One side < 10MB (configurable)
small_df = spark.read.parquet("s3://bucket/small_table/") # < 10MB
large_df = spark.read.parquet("s3://bucket/large_table/") # TBs
# Explicit broadcast hint
result = large_df.join(
F.broadcast(small_df),
on="key",
how="left"
)
# 2. Sort-Merge Join - Default for large tables
# Requires shuffle, but handles any size
result = large_df1.join(large_df2, on="key", how="inner")
# 3. Bucket Join - Pre-sorted, no shuffle at join time
# Write bucketed tables
(df.write
.bucketBy(200, "customer_id")
.sortBy("customer_id")
.mode("overwrite")
.saveAsTable("bucketed_orders"))
# Join bucketed tables (no shuffle!)
orders = spark.table("bucketed_orders")
customers = spark.table("bucketed_customers") # Same bucket count
result = orders.join(customers, on="customer_id")
# 4. Skew Join Handling
# Enable AQE skew join optimization
spark.conf.set("spark.sql.adaptive.skewJoin.enabled", "true")
spark.conf.set("spark.sql.adaptive.skewJoin.skewedPartitionFactor", "5")
spark.conf.set("spark.sql.adaptive.skewJoin.skewedPartitionThresholdInBytes", "256MB")
# Manual salting for severe skew
def salt_join(df_skewed, df_other, key_col, num_salts=10):
    """Join a skewed DataFrame to another by salting the skewed keys.

    Hot keys on the skewed side are split across ``num_salts`` pseudo-keys
    so the join work for one hot key is spread over many partitions. The
    other side is replicated once per salt value so every salted key finds
    its match.

    Args:
        df_skewed: DataFrame whose join-key distribution is skewed.
        df_other: DataFrame to join against (replicated num_salts times).
        key_col: name of the join key column present in both inputs.
        num_salts: number of buckets to spread each hot key across.

    Returns:
        Inner-joined DataFrame with the helper salt columns removed.
    """
    # FIX: derive the session from an input DataFrame instead of relying on
    # a global `spark` that this function never received.
    # (DataFrame.sparkSession requires PySpark >= 3.3.)
    spark = df_other.sparkSession

    # Add a random salt to the skewed side: key -> "key_<salt>"
    df_salted = df_skewed.withColumn(
        "salt",
        (F.rand() * num_salts).cast("int")
    ).withColumn(
        "salted_key",
        F.concat(F.col(key_col), F.lit("_"), F.col("salt"))
    )

    # Replicate the other side once per salt value.
    # FIX: drop the duplicated key/salt columns before joining so the
    # result has no ambiguous column references.
    df_exploded = df_other.crossJoin(
        spark.range(num_salts).withColumnRenamed("id", "salt")
    ).withColumn(
        "salted_key",
        F.concat(F.col(key_col), F.lit("_"), F.col("salt"))
    ).drop(key_col, "salt")

    # Join on the salted key, then strip the remaining helper columns.
    return (df_salted
            .join(df_exploded, on="salted_key", how="inner")
            .drop("salted_key", "salt"))
```
### Pattern 3: Caching and Persistence
```python
from pyspark import StorageLevel
# Cache when reusing DataFrame multiple times
df = spark.read.parquet("s3://bucket/data/")
df_filtered = df.filter(F.col("status") == "active")
# Cache in memory (MEMORY_AND_DISK is default)
df_filtered.cache()
# Or with specific storage level
df_filtered.persist(StorageLevel.MEMORY_AND_DISK_SER)
# Force materialization
df_filtered.count()
# Use in multiple actions
agg1 = df_filtered.groupBy("category").count()
agg2 = df_filtered.groupBy("region").sum("amount")
# Unpersist when done
df_filtered.unpersist()
# Storage levels explained:
# MEMORY_ONLY - Fast, but may not fit
# MEMORY_AND_DISK - Spills to disk if needed (recommended)
# MEMORY_ONLY_SER - Serialized, less memory, more CPU
# DISK_ONLY - When memory is tight
# OFF_HEAP - Tungsten off-heap memory
# Checkpoint for complex lineage
spark.sparkContext.setCheckpointDir("s3://bucket/checkpoints/")
df_complex = (df
.join(other_df, "key")
.groupBy("category")
.agg(F.sum("amount")))
df_complex.checkpoint() # Breaks lineage, materializes
```
### Pattern 4: Memory Tuning
```python
# Executor memory configuration
# spark-submit --executor-memory 8g --executor-cores 4
# Memory breakdown (8GB executor):
# - spark.memory.fraction = 0.6 (60% = 4.8GB for execution + storage)
# - spark.memory.storageFraction = 0.5 (50% of 4.8GB = 2.4GB for cache)
# - Remaining 2.4GB for execution (shuffles, joins, sorts)
# - 40% = 3.2GB for user data structures and internal metadata
spark = (SparkSession.builder
.config("spark.executor.memory", "8g")
.config("spark.executor.memoryOverhead", "2g") # For non-JVM memory
.config("spark.memory.fraction", "0.6")
.config("spark.memory.storageFraction", "0.5")
.config("spark.sql.shuffle.partitions", "200")
# For memory-intensive operations
.config("spark.sql.autoBroadcastJoinThreshold", "50MB")
# Prevent OOM on large shuffles
.config("spark.sql.files.maxPartitionBytes", "128MB")
.getOrCreate())
# Monitor memory usage
def print_memory_usage(spark):
    """Print total and free memory per executor, in GB.

    NOTE(review): this reaches through private py4j handles (``sc._jsc``)
    into Spark's internal ExecutorMemoryStatus map; the API is not public
    and may change between Spark versions -- verify on upgrade.
    """
    sc = spark.sparkContext
    for executor in sc._jsc.sc().getExecutorMemoryStatus().keySet().toArray():
        mem_status = sc._jsc.sc().getExecutorMemoryStatus().get(executor)
        # mem_status fields: _1 = total bytes, _2 = free bytes (Scala tuple).
        # NOTE(review): Scala Map.get returns an Option -- confirm whether
        # ._1()/._2() work directly or an extra .get() is required.
        total = mem_status._1() / (1024**3)
        free = mem_status._2() / (1024**3)
        print(f"{executor}: {total:.2f}GB total, {free:.2f}GB free")
```
### Pattern 5: Shuffle Optimization
```python
# Reduce shuffle data size
spark.conf.set("spark.sql.shuffle.partitions", "auto") # With AQE
spark.conf.set("spark.shuffle.compress", "true")
spark.conf.set("spark.shuffle.spill.compress", "true")
# Pre-aggregate before shuffle
df_optimized = (df
# Local aggregation first (combiner)
.groupBy("key", "partition_col")
.agg(F.sum("value").alias("partial_sum"))
# Then global aggregation
.groupBy("key")
.agg(F.sum("partial_sum").alias("total")))
# Avoid shuffle with map-side operations
# BAD: Shuffle for each distinct
distinct_count = df.select("category").distinct().count()
# GOOD: Approximate distinct (no shuffle)
approx_count = df.select(F.approx_count_distinct("category")).collect()[0][0]
# Use coalesce instead of repartition when reducing partitions
df_reduced = df.coalesce(10) # No shuffle
# Optimize shuffle with compression
spark.conf.set("spark.io.compression.codec", "lz4") # Fast compression
```
### Pattern 6: Data Format Optimization
```python
# Parquet optimizations
(df.write
.option("compression", "snappy") # Fast compression
.option("parquet.block.size", 128 * 1024 * 1024) # 128MB row groups
.parquet("s3://bucket/output/"))
# Column pruning - only read needed columns
df = (spark.read.parquet("s3://bucket/data/")
.select("id", "amount", "date")) # Spark only reads these columns
# Predicate pushdown - filter at storage level
df = (spark.read.parquet("s3://bucket/partitioned/year=2024/")
.filter(F.col("status") == "active")) # Pushed to Parquet reader
# Delta Lake optimizations
(df.write
.format("delta")
.option("optimizeWrite", "true") # Bin-packing
.option("autoCompact", "true") # Compact small files
.mode("overwrite")
.save("s3://bucket/delta_table/"))
# Z-ordering for multi-dimensional queries
spark.sql("""
OPTIMIZE delta.`s3://bucket/delta_table/`
ZORDER BY (customer_id, date)
""")
```
### Pattern 7: Monitoring and Debugging
```python
# Enable detailed metrics
spark.conf.set("spark.sql.codegen.wholeStage", "true")
spark.conf.set("spark.sql.execution.arrow.pyspark.enabled", "true")
# Explain query plan
df.explain(mode="extended")
# Modes: simple, extended, codegen, cost, formatted
# Get physical plan statistics
df.explain(mode="cost")
# Monitor task metrics
def analyze_stage_metrics(spark):
    """Print task progress counters for every currently active Spark stage."""
    tracker = spark.sparkContext.statusTracker()
    for sid in tracker.getActiveStageIds():
        info = tracker.getStageInfo(sid)
        report = [
            f"Stage {sid}:",
            f"  Tasks: {info.numTasks}",
            f"  Completed: {info.numCompletedTasks}",
            f"  Failed: {info.numFailedTasks}",
        ]
        print("\n".join(report))
# Identify data skew
def check_partition_skew(df):
    """Show per-partition row counts for df and report a max/avg skew ratio."""
    with_pid = df.withColumn("partition_id", F.spark_partition_id())
    counts = with_pid.groupBy("partition_id").count().orderBy(F.desc("count"))
    counts.show(20)

    # Collapse the per-partition counts into summary statistics (single row).
    summary = counts.select(
        F.min("count").alias("min"),
        F.max("count").alias("max"),
        F.avg("count").alias("avg"),
        F.stddev("count").alias("stddev"),
    ).collect()[0]

    ratio = summary["max"] / summary["avg"]
    print(f"Skew ratio: {ratio:.2f}x (>2x indicates skew)")
```
## Configuration Cheat Sheet
```python
# Production configuration template
spark_configs = {
# Adaptive Query Execution (AQE)
"spark.sql.adaptive.enabled": "true",
"spark.sql.adaptive.coalescePartitions.enabled": "true",
"spark.sql.adaptive.skewJoin.enabled": "true",
# Memory
"spark.executor.memory": "8g",
"spark.executor.memoryOverhead": "2g",
"spark.memory.fraction": "0.6",
"spark.memory.storageFraction": "0.5",
# Parallelism
"spark.sql.shuffle.partitions": "200",
"spark.default.parallelism": "200",
# Serialization
"spark.serializer": "org.apache.spark.serializer.KryoSerializer",
"spark.sql.execution.arrow.pyspark.enabled": "true",
# Compression
"spark.io.compression.codec": "lz4",
"spark.shuffle.compress": "true",
# Broadcast
"spark.sql.autoBroadcastJoinThreshold": "50MB",
# File handling
"spark.sql.files.maxPartitionBytes": "128MB",
"spark.sql.files.openCostInBytes": "4MB",
}
```
## Best Practices
### Do's
- **Enable AQE** - Adaptive query execution handles many issues
- **Use Parquet/Delta** - Columnar formats with compression
- **Broadcast small tables** - Avoid shuffle for small joins
- **Monitor Spark UI** - Check for skew, spills, GC
- **Right-size partitions** - 128MB - 256MB per partition
### Don'ts
- **Don't collect large data** - Keep data distributed
- **Don't use UDFs unnecessarily** - Use built-in functions
- **Don't over-cache** - Memory is limited
- **Don't ignore data skew** - It dominates job time
- **Don't use `.count()` for existence** - Use `.take(1)` or `.isEmpty()`
## Resources
- [Spark Performance Tuning](https://spark.apache.org/docs/latest/sql-performance-tuning.html)
- [Spark Configuration](https://spark.apache.org/docs/latest/configuration.html)
- [Databricks Optimization Guide](https://docs.databricks.com/en/optimizations/index.html)

View File

@@ -0,0 +1,44 @@
# Monorepo Architect
Expert in monorepo architecture, build systems, and dependency management at scale. Masters Nx, Turborepo, Bazel, and Lerna for efficient multi-project development. Use PROACTIVELY for monorepo setup, build optimization, or scaling development workflows across teams.
## Capabilities
- Monorepo tool selection (Nx, Turborepo, Bazel, Lerna)
- Workspace configuration and project structure
- Build caching (local and remote)
- Dependency graph management
- Affected/changed detection for CI optimization
- Code sharing and library extraction
- Task orchestration and parallelization
## When to Use
- Setting up a new monorepo from scratch
- Migrating from polyrepo to monorepo
- Optimizing slow CI/CD pipelines
- Sharing code between multiple applications
- Managing dependencies across projects
- Implementing consistent tooling across teams
## Workflow
1. Assess codebase size and team structure
2. Select appropriate monorepo tooling
3. Design workspace and project structure
4. Configure build caching strategy
5. Set up affected/changed detection
6. Implement task pipelines
7. Configure remote caching for CI
8. Document conventions and workflows
## Best Practices
- Start with clear project boundaries
- Use consistent naming conventions
- Implement remote caching early
- Keep shared libraries focused
- Use tags for dependency constraints
- Automate dependency updates
- Document the dependency graph
- Set up code ownership rules

View File

@@ -0,0 +1,385 @@
---
name: bazel-build-optimization
description: Optimize Bazel builds for large-scale monorepos. Use when configuring Bazel, implementing remote execution, or optimizing build performance for enterprise codebases.
---
# Bazel Build Optimization
Production patterns for Bazel in large-scale monorepos.
## When to Use This Skill
- Setting up Bazel for monorepos
- Configuring remote caching/execution
- Optimizing build times
- Writing custom Bazel rules
- Debugging build issues
- Migrating to Bazel
## Core Concepts
### 1. Bazel Architecture
```
workspace/
├── WORKSPACE.bazel # External dependencies
├── .bazelrc # Build configurations
├── .bazelversion # Bazel version
├── BUILD.bazel # Root build file
├── apps/
│ └── web/
│ └── BUILD.bazel
├── libs/
│ └── utils/
│ └── BUILD.bazel
└── tools/
└── bazel/
└── rules/
```
### 2. Key Concepts
| Concept | Description |
|---------|-------------|
| **Target** | Buildable unit (library, binary, test) |
| **Package** | Directory with BUILD file |
| **Label** | Target identifier `//path/to:target` |
| **Rule** | Defines how to build a target |
| **Aspect** | Cross-cutting build behavior |
## Templates
### Template 1: WORKSPACE Configuration
```python
# WORKSPACE.bazel
workspace(name = "myproject")
load("@bazel_tools//tools/build_defs/repo:http.bzl", "http_archive")
# Rules for JavaScript/TypeScript
http_archive(
name = "aspect_rules_js",
sha256 = "...",
strip_prefix = "rules_js-1.34.0",
url = "https://github.com/aspect-build/rules_js/releases/download/v1.34.0/rules_js-v1.34.0.tar.gz",
)
load("@aspect_rules_js//js:repositories.bzl", "rules_js_dependencies")
rules_js_dependencies()
load("@rules_nodejs//nodejs:repositories.bzl", "nodejs_register_toolchains")
nodejs_register_toolchains(
name = "nodejs",
node_version = "20.9.0",
)
load("@aspect_rules_js//npm:repositories.bzl", "npm_translate_lock")
npm_translate_lock(
name = "npm",
pnpm_lock = "//:pnpm-lock.yaml",
verify_node_modules_ignored = "//:.bazelignore",
)
load("@npm//:repositories.bzl", "npm_repositories")
npm_repositories()
# Rules for Python
http_archive(
name = "rules_python",
sha256 = "...",
strip_prefix = "rules_python-0.27.0",
url = "https://github.com/bazelbuild/rules_python/releases/download/0.27.0/rules_python-0.27.0.tar.gz",
)
load("@rules_python//python:repositories.bzl", "py_repositories")
py_repositories()
```
### Template 2: .bazelrc Configuration
```bash
# .bazelrc
# Build settings
build --enable_platform_specific_config
build --incompatible_enable_cc_toolchain_resolution
build --experimental_strict_conflict_checks
# Performance
build --jobs=auto
build --local_cpu_resources=HOST_CPUS*.75
build --local_ram_resources=HOST_RAM*.75
# Caching
build --disk_cache=~/.cache/bazel-disk
build --repository_cache=~/.cache/bazel-repo
# Remote caching (optional)
build:remote-cache --remote_cache=grpcs://cache.example.com
build:remote-cache --remote_upload_local_results=true
build:remote-cache --remote_timeout=3600
# Remote execution (optional)
build:remote-exec --remote_executor=grpcs://remote.example.com
build:remote-exec --remote_instance_name=projects/myproject/instances/default
build:remote-exec --jobs=500
# Platform configurations
build:linux --platforms=//platforms:linux_x86_64
build:macos --platforms=//platforms:macos_arm64
# CI configuration
build:ci --config=remote-cache
build:ci --build_metadata=ROLE=CI
build:ci --bes_results_url=https://results.example.com/invocation/
build:ci --bes_backend=grpcs://bes.example.com
# Test settings
test --test_output=errors
test --test_summary=detailed
# Coverage
coverage --combined_report=lcov
coverage --instrumentation_filter="//..."
# Convenience aliases
build:opt --compilation_mode=opt
build:dbg --compilation_mode=dbg
# Import user settings
try-import %workspace%/user.bazelrc
```
### Template 3: TypeScript Library BUILD
```python
# libs/utils/BUILD.bazel
load("@aspect_rules_ts//ts:defs.bzl", "ts_project")
load("@aspect_rules_js//js:defs.bzl", "js_library")
load("@npm//:defs.bzl", "npm_link_all_packages")
npm_link_all_packages(name = "node_modules")
ts_project(
name = "utils_ts",
srcs = glob(["src/**/*.ts"]),
declaration = True,
source_map = True,
tsconfig = "//:tsconfig.json",
deps = [
":node_modules/@types/node",
],
)
js_library(
name = "utils",
srcs = [":utils_ts"],
visibility = ["//visibility:public"],
)
# Tests
load("@aspect_rules_jest//jest:defs.bzl", "jest_test")
jest_test(
name = "utils_test",
config = "//:jest.config.js",
data = [
":utils",
"//:node_modules/jest",
],
node_modules = "//:node_modules",
)
```
### Template 4: Python Library BUILD
```python
# libs/ml/BUILD.bazel
load("@rules_python//python:defs.bzl", "py_library", "py_test", "py_binary")
load("@pip//:requirements.bzl", "requirement")
py_library(
name = "ml",
srcs = glob(["src/**/*.py"]),
deps = [
requirement("numpy"),
requirement("pandas"),
requirement("scikit-learn"),
"//libs/utils:utils_py",
],
visibility = ["//visibility:public"],
)
py_test(
name = "ml_test",
srcs = glob(["tests/**/*.py"]),
deps = [
":ml",
requirement("pytest"),
],
size = "medium",
timeout = "moderate",
)
py_binary(
name = "train",
srcs = ["train.py"],
deps = [":ml"],
data = ["//data:training_data"],
)
```
### Template 5: Custom Rule for Docker
```python
# tools/bazel/rules/docker.bzl
# Implementation for docker_image: invokes the //tools/docker:builder tool to
# assemble an image tarball from a Dockerfile, a base image name, and layer files.
def _docker_image_impl(ctx):
    dockerfile = ctx.file.dockerfile
    base_image = ctx.attr.base_image
    layers = ctx.files.layers

    # Output tarball is named after the target (<name>.tar).
    output = ctx.actions.declare_file(ctx.attr.name + ".tar")

    # Assemble the builder's command line; add_all repeats --layer per file.
    args = ctx.actions.args()
    args.add("--dockerfile", dockerfile)
    args.add("--output", output)
    args.add("--base", base_image)
    args.add_all("--layer", layers)

    # Run the builder; inputs must list every file the action reads so Bazel
    # can sandbox and cache it correctly.
    ctx.actions.run(
        inputs = [dockerfile] + layers,
        outputs = [output],
        executable = ctx.executable._builder,
        arguments = [args],
        mnemonic = "DockerBuild",
        progress_message = "Building Docker image %s" % ctx.label,
    )

    # Expose the tarball as the rule's default output.
    return [DefaultInfo(files = depset([output]))]

# Public rule: builds a Docker image tarball via the custom builder binary.
docker_image = rule(
    implementation = _docker_image_impl,
    attrs = {
        "dockerfile": attr.label(
            allow_single_file = [".dockerfile", "Dockerfile"],
            mandatory = True,
        ),
        "base_image": attr.string(mandatory = True),
        "layers": attr.label_list(allow_files = True),
        # Private attribute pinning the builder tool; cfg = "exec" builds it
        # for the execution platform, not the target platform.
        "_builder": attr.label(
            default = "//tools/docker:builder",
            executable = True,
            cfg = "exec",
        ),
    },
)
```
### Template 6: Query and Dependency Analysis
```bash
# Find all dependencies of a target
bazel query "deps(//apps/web:web)"
# Find reverse dependencies (what depends on this)
bazel query "rdeps(//..., //libs/utils:utils)"
# Find all targets in a package
bazel query "//libs/..."
# Find changed targets since commit
bazel query "rdeps(//..., set($(git diff --name-only HEAD~1 | sed 's/.*/"&"/' | tr '\n' ' ')))"
# Generate dependency graph
bazel query "deps(//apps/web:web)" --output=graph | dot -Tpng > deps.png
# Find all test targets
bazel query "kind('.*_test', //...)"
# Find targets with specific tag
bazel query "attr(tags, 'integration', //...)"
# Compute build graph size
bazel query "deps(//...)" --output=package | wc -l
```
### Template 7: Remote Execution Setup
```python
# platforms/BUILD.bazel
platform(
name = "linux_x86_64",
constraint_values = [
"@platforms//os:linux",
"@platforms//cpu:x86_64",
],
exec_properties = {
"container-image": "docker://gcr.io/myproject/bazel-worker:latest",
"OSFamily": "Linux",
},
)
platform(
name = "remote_linux",
parents = [":linux_x86_64"],
exec_properties = {
"Pool": "default",
"dockerNetwork": "standard",
},
)
# toolchains/BUILD.bazel
toolchain(
name = "cc_toolchain_linux",
exec_compatible_with = [
"@platforms//os:linux",
"@platforms//cpu:x86_64",
],
target_compatible_with = [
"@platforms//os:linux",
"@platforms//cpu:x86_64",
],
toolchain = "@remotejdk11_linux//:jdk",
toolchain_type = "@bazel_tools//tools/jdk:runtime_toolchain_type",
)
```
## Performance Optimization
```bash
# Profile build
bazel build //... --profile=profile.json
bazel analyze-profile profile.json
# Identify slow actions
bazel build //... --execution_log_json_file=exec_log.json
# Memory profiling
bazel build //... --memory_profile=memory.json
# Skip analysis cache
bazel build //... --notrack_incremental_state
```
## Best Practices
### Do's
- **Use fine-grained targets** - Better caching
- **Pin dependencies** - Reproducible builds
- **Enable remote caching** - Share build artifacts
- **Use visibility wisely** - Enforce architecture
- **Write BUILD files per directory** - Standard convention
### Don'ts
- **Don't use glob for deps** - Explicit is better
- **Don't commit bazel-* dirs** - Add to .gitignore
- **Don't skip WORKSPACE setup** - Foundation of build
- **Don't ignore build warnings** - Technical debt
## Resources
- [Bazel Documentation](https://bazel.build/docs)
- [Bazel Remote Execution](https://bazel.build/docs/remote-execution)
- [rules_js](https://github.com/aspect-build/rules_js)

View File

@@ -0,0 +1,452 @@
---
name: nx-workspace-patterns
description: Configure and optimize Nx monorepo workspaces. Use when setting up Nx, configuring project boundaries, optimizing build caching, or implementing affected commands.
---
# Nx Workspace Patterns
Production patterns for Nx monorepo management.
## When to Use This Skill
- Setting up new Nx workspaces
- Configuring project boundaries
- Optimizing CI with affected commands
- Implementing remote caching
- Managing dependencies between projects
- Migrating to Nx
## Core Concepts
### 1. Nx Architecture
```
workspace/
├── apps/ # Deployable applications
│ ├── web/
│ └── api/
├── libs/ # Shared libraries
│ ├── shared/
│ │ ├── ui/
│ │ └── utils/
│ └── feature/
│ ├── auth/
│ └── dashboard/
├── tools/ # Custom executors/generators
├── nx.json # Nx configuration
└── workspace.json # Project configuration (legacy; newer Nx versions use a per-project project.json, as shown in Template 2)
```
### 2. Library Types
| Type | Purpose | Example |
|------|---------|---------|
| **feature** | Smart components, business logic | `feature-auth` |
| **ui** | Presentational components | `ui-buttons` |
| **data-access** | API calls, state management | `data-access-users` |
| **util** | Pure functions, helpers | `util-formatting` |
| **shell** | App bootstrapping | `shell-web` |
## Templates
### Template 1: nx.json Configuration
```json
{
"$schema": "./node_modules/nx/schemas/nx-schema.json",
"npmScope": "myorg",
"affected": {
"defaultBase": "main"
},
"tasksRunnerOptions": {
"default": {
"runner": "nx/tasks-runners/default",
"options": {
"cacheableOperations": [
"build",
"lint",
"test",
"e2e",
"build-storybook"
],
"parallel": 3
}
}
},
"targetDefaults": {
"build": {
"dependsOn": ["^build"],
"inputs": ["production", "^production"],
"cache": true
},
"test": {
"inputs": ["default", "^production", "{workspaceRoot}/jest.preset.js"],
"cache": true
},
"lint": {
"inputs": ["default", "{workspaceRoot}/.eslintrc.json"],
"cache": true
},
"e2e": {
"inputs": ["default", "^production"],
"cache": true
}
},
"namedInputs": {
"default": ["{projectRoot}/**/*", "sharedGlobals"],
"production": [
"default",
"!{projectRoot}/**/?(*.)+(spec|test).[jt]s?(x)?(.snap)",
"!{projectRoot}/tsconfig.spec.json",
"!{projectRoot}/jest.config.[jt]s",
"!{projectRoot}/.eslintrc.json"
],
"sharedGlobals": [
"{workspaceRoot}/babel.config.json",
"{workspaceRoot}/tsconfig.base.json"
]
},
"generators": {
"@nx/react": {
"application": {
"style": "css",
"linter": "eslint",
"bundler": "webpack"
},
"library": {
"style": "css",
"linter": "eslint"
},
"component": {
"style": "css"
}
}
}
}
```
### Template 2: Project Configuration
```json
// apps/web/project.json
{
"name": "web",
"$schema": "../../node_modules/nx/schemas/project-schema.json",
"sourceRoot": "apps/web/src",
"projectType": "application",
"tags": ["type:app", "scope:web"],
"targets": {
"build": {
"executor": "@nx/webpack:webpack",
"outputs": ["{options.outputPath}"],
"defaultConfiguration": "production",
"options": {
"compiler": "babel",
"outputPath": "dist/apps/web",
"index": "apps/web/src/index.html",
"main": "apps/web/src/main.tsx",
"tsConfig": "apps/web/tsconfig.app.json",
"assets": ["apps/web/src/assets"],
"styles": ["apps/web/src/styles.css"]
},
"configurations": {
"development": {
"extractLicenses": false,
"optimization": false,
"sourceMap": true
},
"production": {
"optimization": true,
"outputHashing": "all",
"sourceMap": false,
"extractLicenses": true
}
}
},
"serve": {
"executor": "@nx/webpack:dev-server",
"defaultConfiguration": "development",
"options": {
"buildTarget": "web:build"
},
"configurations": {
"development": {
"buildTarget": "web:build:development"
},
"production": {
"buildTarget": "web:build:production"
}
}
},
"test": {
"executor": "@nx/jest:jest",
"outputs": ["{workspaceRoot}/coverage/{projectRoot}"],
"options": {
"jestConfig": "apps/web/jest.config.ts",
"passWithNoTests": true
}
},
"lint": {
"executor": "@nx/eslint:lint",
"outputs": ["{options.outputFile}"],
"options": {
"lintFilePatterns": ["apps/web/**/*.{ts,tsx,js,jsx}"]
}
}
}
}
```
### Template 3: Module Boundary Rules
```json
// .eslintrc.json
{
"root": true,
"ignorePatterns": ["**/*"],
"plugins": ["@nx"],
"overrides": [
{
"files": ["*.ts", "*.tsx", "*.js", "*.jsx"],
"rules": {
"@nx/enforce-module-boundaries": [
"error",
{
"enforceBuildableLibDependency": true,
"allow": [],
"depConstraints": [
{
"sourceTag": "type:app",
"onlyDependOnLibsWithTags": [
"type:feature",
"type:ui",
"type:data-access",
"type:util"
]
},
{
"sourceTag": "type:feature",
"onlyDependOnLibsWithTags": [
"type:ui",
"type:data-access",
"type:util"
]
},
{
"sourceTag": "type:ui",
"onlyDependOnLibsWithTags": ["type:ui", "type:util"]
},
{
"sourceTag": "type:data-access",
"onlyDependOnLibsWithTags": ["type:data-access", "type:util"]
},
{
"sourceTag": "type:util",
"onlyDependOnLibsWithTags": ["type:util"]
},
{
"sourceTag": "scope:web",
"onlyDependOnLibsWithTags": ["scope:web", "scope:shared"]
},
{
"sourceTag": "scope:api",
"onlyDependOnLibsWithTags": ["scope:api", "scope:shared"]
},
{
"sourceTag": "scope:shared",
"onlyDependOnLibsWithTags": ["scope:shared"]
}
]
}
]
}
}
]
}
```
### Template 4: Custom Generator
```typescript
// tools/generators/feature-lib/index.ts
import {
Tree,
formatFiles,
generateFiles,
joinPathFragments,
names,
readProjectConfiguration,
} from '@nx/devkit';
import { libraryGenerator } from '@nx/react';
// Options accepted by the feature-library generator.
interface FeatureLibraryGeneratorSchema {
  name: string; // feature name, e.g. "auth" -> library "feature-auth"
  scope: string; // workspace scope tag, e.g. "web"
  directory?: string; // optional override for the target directory
}

/**
 * Generates a React feature library tagged `type:feature,scope:<scope>`,
 * then layers custom template files from ./files into its source root and
 * formats the workspace once at the end.
 */
export default async function featureLibraryGenerator(
  tree: Tree,
  options: FeatureLibraryGeneratorSchema
) {
  const { name, scope, directory } = options;
  const projectDirectory = directory
    ? `${directory}/${name}`
    : `libs/${scope}/feature-${name}`;

  // Generate base library (skipFormat: true — formatFiles runs once below)
  await libraryGenerator(tree, {
    name: `feature-${name}`,
    directory: projectDirectory,
    tags: `type:feature,scope:${scope}`,
    style: 'css',
    skipTsConfig: false,
    skipFormat: true,
    unitTestRunner: 'jest',
    linter: 'eslint',
  });

  // Add custom files
  // NOTE(review): assumes Nx derives the project name as
  // `${scope}-feature-${name}` from the directory — confirm this matches the
  // name actually produced by libraryGenerator above (`feature-${name}`).
  const projectConfig = readProjectConfiguration(tree, `${scope}-feature-${name}`);
  const projectNames = names(name);

  generateFiles(
    tree,
    joinPathFragments(__dirname, 'files'),
    projectConfig.sourceRoot,
    {
      ...projectNames,
      scope,
      tmpl: '', // substitutes the __tmpl__ token out of template filenames
    }
  );

  await formatFiles(tree);
}
```
### Template 5: CI Configuration with Affected
```yaml
# .github/workflows/ci.yml
name: CI
on:
push:
branches: [main]
pull_request:
branches: [main]
env:
NX_CLOUD_ACCESS_TOKEN: ${{ secrets.NX_CLOUD_ACCESS_TOKEN }}
jobs:
main:
runs-on: ubuntu-latest
steps:
- uses: actions/checkout@v4
with:
fetch-depth: 0
- uses: actions/setup-node@v4
with:
node-version: 20
cache: 'npm'
- name: Install dependencies
run: npm ci
- name: Derive SHAs for affected commands
uses: nrwl/nx-set-shas@v4
- name: Run affected lint
run: npx nx affected -t lint --parallel=3
- name: Run affected test
run: npx nx affected -t test --parallel=3 --configuration=ci
- name: Run affected build
run: npx nx affected -t build --parallel=3
- name: Run affected e2e
run: npx nx affected -t e2e --parallel=1
```
### Template 6: Remote Caching Setup
```typescript
// nx.json with Nx Cloud
{
"tasksRunnerOptions": {
"default": {
"runner": "nx-cloud",
"options": {
"cacheableOperations": ["build", "lint", "test", "e2e"],
"accessToken": "your-nx-cloud-token",
"parallel": 3,
"cacheDirectory": ".nx/cache"
}
}
},
"nxCloudAccessToken": "your-nx-cloud-token"
}
// Self-hosted cache with S3
{
"tasksRunnerOptions": {
"default": {
"runner": "@nx-aws-cache/nx-aws-cache",
"options": {
"cacheableOperations": ["build", "lint", "test"],
"awsRegion": "us-east-1",
"awsBucket": "my-nx-cache-bucket",
"awsProfile": "default"
}
}
}
}
```
## Common Commands
```bash
# Generate new library
nx g @nx/react:lib feature-auth --directory=libs/web --tags=type:feature,scope:web
# Run affected tests
nx affected -t test --base=main
# View dependency graph
nx graph
# Run specific project
nx build web --configuration=production
# Reset cache
nx reset
# Run migrations
nx migrate latest
nx migrate --run-migrations
```
## Best Practices
### Do's
- **Use tags consistently** - Enforce with module boundaries
- **Enable caching early** - Significant CI savings
- **Keep libs focused** - Single responsibility
- **Use generators** - Ensure consistency
- **Document boundaries** - Help new developers
### Don'ts
- **Don't create circular deps** - Graph should be acyclic
- **Don't skip affected** - Test only what changed
- **Don't ignore boundaries** - Tech debt accumulates
- **Don't over-granularize** - Balance lib count
## Resources
- [Nx Documentation](https://nx.dev/getting-started/intro)
- [Module Boundaries](https://nx.dev/core-features/enforce-module-boundaries)
- [Nx Cloud](https://nx.app/)

View File

@@ -0,0 +1,407 @@
---
name: turborepo-caching
description: Configure Turborepo for efficient monorepo builds with local and remote caching. Use when setting up Turborepo, optimizing build pipelines, or implementing distributed caching.
---
# Turborepo Caching
Production patterns for Turborepo build optimization.
## When to Use This Skill
- Setting up new Turborepo projects
- Configuring build pipelines
- Implementing remote caching
- Optimizing CI/CD performance
- Migrating from other monorepo tools
- Debugging cache misses
## Core Concepts
### 1. Turborepo Architecture
```
Workspace Root/
├── apps/
│ ├── web/
│ │ └── package.json
│ └── docs/
│ └── package.json
├── packages/
│ ├── ui/
│ │ └── package.json
│ └── config/
│ └── package.json
├── turbo.json
└── package.json
```
### 2. Pipeline Concepts
| Concept | Description |
|---------|-------------|
| **dependsOn** | Tasks that must complete first |
| **cache** | Whether to cache outputs |
| **outputs** | Files to cache |
| **inputs** | Files that affect cache key |
| **persistent** | Long-running tasks (dev servers) |
## Templates
### Template 1: turbo.json Configuration
```json
{
"$schema": "https://turbo.build/schema.json",
"globalDependencies": [
".env",
".env.local"
],
"globalEnv": [
"NODE_ENV",
"VERCEL_URL"
],
"pipeline": {
"build": {
"dependsOn": ["^build"],
"outputs": [
"dist/**",
".next/**",
"!.next/cache/**"
],
"env": [
"API_URL",
"NEXT_PUBLIC_*"
]
},
"test": {
"dependsOn": ["build"],
"outputs": ["coverage/**"],
"inputs": [
"src/**/*.tsx",
"src/**/*.ts",
"test/**/*.ts"
]
},
"lint": {
"outputs": [],
"cache": true
},
"typecheck": {
"dependsOn": ["^build"],
"outputs": []
},
"dev": {
"cache": false,
"persistent": true
},
"clean": {
"cache": false
}
}
}
```
### Template 2: Package-Specific Pipeline
```json
// apps/web/turbo.json
{
"$schema": "https://turbo.build/schema.json",
"extends": ["//"],
"pipeline": {
"build": {
"outputs": [".next/**", "!.next/cache/**"],
"env": [
"NEXT_PUBLIC_API_URL",
"NEXT_PUBLIC_ANALYTICS_ID"
]
},
"test": {
"outputs": ["coverage/**"],
"inputs": [
"src/**",
"tests/**",
"jest.config.js"
]
}
}
}
```
### Template 3: Remote Caching with Vercel
```bash
# Login to Vercel
npx turbo login
# Link to Vercel project
npx turbo link
# Run with remote cache
turbo build --remote-only
# CI environment variables
TURBO_TOKEN=your-token
TURBO_TEAM=your-team
```
```yaml
# .github/workflows/ci.yml
name: CI
on:
push:
branches: [main]
pull_request:
env:
TURBO_TOKEN: ${{ secrets.TURBO_TOKEN }}
TURBO_TEAM: ${{ vars.TURBO_TEAM }}
jobs:
build:
runs-on: ubuntu-latest
steps:
- uses: actions/checkout@v4
- uses: actions/setup-node@v4
with:
node-version: 20
cache: 'npm'
- name: Install dependencies
run: npm ci
- name: Build
run: npx turbo build --filter='...[origin/main]'
- name: Test
run: npx turbo test --filter='...[origin/main]'
```
### Template 4: Self-Hosted Remote Cache
```typescript
// Custom remote cache server (Express)
import express from 'express';
import { createReadStream, createWriteStream } from 'fs';
import { mkdir } from 'fs/promises';
import { join } from 'path';
const app = express();
const CACHE_DIR = './cache';
// Get artifact
app.get('/v8/artifacts/:hash', async (req, res) => {
const { hash } = req.params;
const team = req.query.teamId || 'default';
const filePath = join(CACHE_DIR, team, hash);
try {
const stream = createReadStream(filePath);
stream.pipe(res);
} catch {
res.status(404).send('Not found');
}
});
// Put artifact
app.put('/v8/artifacts/:hash', async (req, res) => {
const { hash } = req.params;
const team = req.query.teamId || 'default';
const dir = join(CACHE_DIR, team);
const filePath = join(dir, hash);
await mkdir(dir, { recursive: true });
const stream = createWriteStream(filePath);
req.pipe(stream);
stream.on('finish', () => {
res.json({ urls: [`${req.protocol}://${req.get('host')}/v8/artifacts/${hash}`] });
});
});
// Check artifact exists
app.head('/v8/artifacts/:hash', async (req, res) => {
const { hash } = req.params;
const team = req.query.teamId || 'default';
const filePath = join(CACHE_DIR, team, hash);
try {
await fs.access(filePath);
res.status(200).end();
} catch {
res.status(404).end();
}
});
app.listen(3000);
```
```json
// turbo.json for self-hosted cache
{
"remoteCache": {
"signature": false
}
}
```
```bash
# Use self-hosted cache
turbo build --api="http://localhost:3000" --token="my-token" --team="my-team"
```
### Template 5: Filtering and Scoping
```bash
# Build specific package
turbo build --filter=@myorg/web
# Build package and its dependencies
turbo build --filter=@myorg/web...
# Build package and its dependents
turbo build --filter=...@myorg/ui
# Build changed packages since main
turbo build --filter='...[origin/main]'
# Build packages in directory
turbo build --filter='./apps/*'
# Combine filters
turbo build --filter=@myorg/web --filter=@myorg/docs
# Exclude package
turbo build --filter='!@myorg/docs'
# Include dependencies of changed
turbo build --filter='...[HEAD^1]...'
```
### Template 6: Advanced Pipeline Configuration
```json
{
"$schema": "https://turbo.build/schema.json",
"pipeline": {
"build": {
"dependsOn": ["^build"],
"outputs": ["dist/**"],
"inputs": [
"$TURBO_DEFAULT$",
"!**/*.md",
"!**/*.test.*"
]
},
"test": {
"dependsOn": ["^build"],
"outputs": ["coverage/**"],
"inputs": [
"src/**",
"tests/**",
"*.config.*"
],
"env": ["CI", "NODE_ENV"]
},
"test:e2e": {
"dependsOn": ["build"],
"outputs": [],
"cache": false
},
"deploy": {
"dependsOn": ["build", "test", "lint"],
"outputs": [],
"cache": false
},
"db:generate": {
"cache": false
},
"db:push": {
"cache": false,
"dependsOn": ["db:generate"]
},
"@myorg/web#build": {
"dependsOn": ["^build", "@myorg/db#db:generate"],
"outputs": [".next/**"],
"env": ["NEXT_PUBLIC_*"]
}
}
}
```
### Template 7: Root package.json Setup
```json
{
"name": "my-turborepo",
"private": true,
"workspaces": [
"apps/*",
"packages/*"
],
"scripts": {
"build": "turbo build",
"dev": "turbo dev",
"lint": "turbo lint",
"test": "turbo test",
"clean": "turbo clean && rm -rf node_modules",
"format": "prettier --write \"**/*.{ts,tsx,md}\"",
"changeset": "changeset",
"version-packages": "changeset version",
"release": "turbo build --filter=./packages/* && changeset publish"
},
"devDependencies": {
"turbo": "^1.10.0",
"prettier": "^3.0.0",
"@changesets/cli": "^2.26.0"
},
"packageManager": "npm@10.0.0"
}
```
## Debugging Cache
```bash
# Dry run to see what would run
turbo build --dry-run
# Verbose output with hashes
turbo build --verbosity=2
# Show task graph
turbo build --graph
# Force no cache
turbo build --force
# Show cache status
turbo build --summarize
# Debug specific task
TURBO_LOG_VERBOSITY=debug turbo build --filter=@myorg/web
```
## Best Practices
### Do's
- **Define explicit inputs** - Avoid cache invalidation
- **Use workspace protocol** - `"@myorg/ui": "workspace:*"`
- **Enable remote caching** - Share across CI and local
- **Filter in CI** - Build only affected packages
- **Cache build outputs** - Not source files
### Don'ts
- **Don't cache dev servers** - Use `persistent: true`
- **Don't include secrets in env** - Use runtime env vars
- **Don't ignore dependsOn** - Causes race conditions
- **Don't over-filter** - May miss dependencies
## Resources
- [Turborepo Documentation](https://turbo.build/repo/docs)
- [Caching Guide](https://turbo.build/repo/docs/core-concepts/caching)
- [Remote Caching](https://turbo.build/repo/docs/core-concepts/remote-caching)

View File

@@ -0,0 +1,428 @@
---
name: architecture-decision-records
description: Write and maintain Architecture Decision Records (ADRs) following best practices for technical decision documentation. Use when documenting significant technical decisions, reviewing past architectural choices, or establishing decision processes.
---
# Architecture Decision Records
Comprehensive patterns for creating, maintaining, and managing Architecture Decision Records (ADRs) that capture the context and rationale behind significant technical decisions.
## When to Use This Skill
- Making significant architectural decisions
- Documenting technology choices
- Recording design trade-offs
- Onboarding new team members
- Reviewing historical decisions
- Establishing decision-making processes
## Core Concepts
### 1. What is an ADR?
An Architecture Decision Record captures:
- **Context**: Why we needed to make a decision
- **Decision**: What we decided
- **Consequences**: What happens as a result
### 2. When to Write an ADR
| Write ADR | Skip ADR |
|-----------|----------|
| New framework adoption | Minor version upgrades |
| Database technology choice | Bug fixes |
| API design patterns | Implementation details |
| Security architecture | Routine maintenance |
| Integration patterns | Configuration changes |
### 3. ADR Lifecycle
```
Proposed → Accepted → Deprecated → Superseded
Proposed → Rejected
```
## Templates
### Template 1: Standard ADR (MADR Format)
```markdown
# ADR-0001: Use PostgreSQL as Primary Database
## Status
Accepted
## Context
We need to select a primary database for our new e-commerce platform. The system
will handle:
- ~10,000 concurrent users
- Complex product catalog with hierarchical categories
- Transaction processing for orders and payments
- Full-text search for products
- Geospatial queries for store locator
The team has experience with MySQL, PostgreSQL, and MongoDB. We need ACID
compliance for financial transactions.
## Decision Drivers
* **Must have ACID compliance** for payment processing
* **Must support complex queries** for reporting
* **Should support full-text search** to reduce infrastructure complexity
* **Should have good JSON support** for flexible product attributes
* **Team familiarity** reduces onboarding time
## Considered Options
### Option 1: PostgreSQL
- **Pros**: ACID compliant, excellent JSON support (JSONB), built-in full-text
search, PostGIS for geospatial, team has experience
- **Cons**: Slightly more complex replication setup than MySQL
### Option 2: MySQL
- **Pros**: Very familiar to team, simple replication, large community
- **Cons**: Weaker JSON support, no built-in full-text search (need
Elasticsearch), no geospatial without extensions
### Option 3: MongoDB
- **Pros**: Flexible schema, native JSON, horizontal scaling
- **Cons**: No ACID for multi-document transactions (at decision time),
team has limited experience, requires schema design discipline
## Decision
We will use **PostgreSQL 15** as our primary database.
## Rationale
PostgreSQL provides the best balance of:
1. **ACID compliance** essential for e-commerce transactions
2. **Built-in capabilities** (full-text search, JSONB, PostGIS) reduce
infrastructure complexity
3. **Team familiarity** with SQL databases reduces learning curve
4. **Mature ecosystem** with excellent tooling and community support
The slight complexity in replication is outweighed by the reduction in
additional services (no separate Elasticsearch needed).
## Consequences
### Positive
- Single database handles transactions, search, and geospatial queries
- Reduced operational complexity (fewer services to manage)
- Strong consistency guarantees for financial data
- Team can leverage existing SQL expertise
### Negative
- Need to learn PostgreSQL-specific features (JSONB, full-text search syntax)
- Vertical scaling limits may require read replicas sooner
- Some team members need PostgreSQL-specific training
### Risks
- Full-text search may not scale as well as dedicated search engines
- Mitigation: Design for potential Elasticsearch addition if needed
## Implementation Notes
- Use JSONB for flexible product attributes
- Implement connection pooling with PgBouncer
- Set up streaming replication for read replicas
- Use pg_trgm extension for fuzzy search
## Related Decisions
- ADR-0002: Caching Strategy (Redis) - complements database choice
- ADR-0005: Search Architecture - may supersede if Elasticsearch needed
## References
- [PostgreSQL JSON Documentation](https://www.postgresql.org/docs/current/datatype-json.html)
- [PostgreSQL Full Text Search](https://www.postgresql.org/docs/current/textsearch.html)
- Internal: Performance benchmarks in `/docs/benchmarks/database-comparison.md`
```
### Template 2: Lightweight ADR
```markdown
# ADR-0012: Adopt TypeScript for Frontend Development
**Status**: Accepted
**Date**: 2024-01-15
**Deciders**: @alice, @bob, @charlie
## Context
Our React codebase has grown to 50+ components with increasing bug reports
related to prop type mismatches and undefined errors. PropTypes provide
runtime-only checking.
## Decision
Adopt TypeScript for all new frontend code. Migrate existing code incrementally.
## Consequences
**Good**: Catch type errors at compile time, better IDE support, self-documenting
code.
**Bad**: Learning curve for team, initial slowdown, build complexity increase.
**Mitigations**: TypeScript training sessions, allow gradual adoption with
`allowJs: true`.
```
### Template 3: Y-Statement Format
```markdown
# ADR-0015: API Gateway Selection
In the context of **building a microservices architecture**,
facing **the need for centralized API management, authentication, and rate limiting**,
we decided for **Kong Gateway**
and against **AWS API Gateway and custom Nginx solution**,
to achieve **vendor independence, plugin extensibility, and team familiarity with Lua**,
accepting that **we need to manage Kong infrastructure ourselves**.
```
### Template 4: ADR for Deprecation
```markdown
# ADR-0020: Deprecate MongoDB in Favor of PostgreSQL
## Status
Accepted (Supersedes ADR-0003)
## Context
ADR-0003 (2021) chose MongoDB for user profile storage due to schema flexibility
needs. Since then:
- MongoDB's multi-document transactions remain problematic for our use case
- Our schema has stabilized and rarely changes
- We now have PostgreSQL expertise from other services
- Maintaining two databases increases operational burden
## Decision
Deprecate MongoDB and migrate user profiles to PostgreSQL.
## Migration Plan
1. **Phase 1** (Week 1-2): Create PostgreSQL schema, dual-write enabled
2. **Phase 2** (Week 3-4): Backfill historical data, validate consistency
3. **Phase 3** (Week 5): Switch reads to PostgreSQL, monitor
4. **Phase 4** (Week 6): Remove MongoDB writes, decommission
## Consequences
### Positive
- Single database technology reduces operational complexity
- ACID transactions for user data
- Team can focus PostgreSQL expertise
### Negative
- Migration effort (~4 weeks)
- Risk of data issues during migration
- Lose some schema flexibility
## Lessons Learned
Document from ADR-0003 experience:
- Schema flexibility benefits were overestimated
- Operational cost of multiple databases was underestimated
- Consider long-term maintenance in technology decisions
```
### Template 5: Request for Comments (RFC) Style
```markdown
# RFC-0025: Adopt Event Sourcing for Order Management
## Summary
Propose adopting event sourcing pattern for the order management domain to
improve auditability, enable temporal queries, and support business analytics.
## Motivation
Current challenges:
1. Audit requirements need complete order history
2. "What was the order state at time X?" queries are impossible
3. Analytics team needs event stream for real-time dashboards
4. Order state reconstruction for customer support is manual
## Detailed Design
### Event Store
```
OrderCreated { orderId, customerId, items[], timestamp }
OrderItemAdded { orderId, item, timestamp }
OrderItemRemoved { orderId, itemId, timestamp }
PaymentReceived { orderId, amount, paymentId, timestamp }
OrderShipped { orderId, trackingNumber, timestamp }
```
### Projections
- **CurrentOrderState**: Materialized view for queries
- **OrderHistory**: Complete timeline for audit
- **DailyOrderMetrics**: Analytics aggregation
### Technology
- Event Store: EventStoreDB (purpose-built, handles projections)
- Alternative considered: Kafka + custom projection service
## Drawbacks
- Learning curve for team
- Increased complexity vs. CRUD
- Need to design events carefully (immutable once stored)
- Storage growth (events never deleted)
## Alternatives
1. **Audit tables**: Simpler but doesn't enable temporal queries
2. **CDC from existing DB**: Complex, doesn't change data model
3. **Hybrid**: Event source only for order state changes
## Unresolved Questions
- [ ] Event schema versioning strategy
- [ ] Retention policy for events
- [ ] Snapshot frequency for performance
## Implementation Plan
1. Prototype with single order type (2 weeks)
2. Team training on event sourcing (1 week)
3. Full implementation and migration (4 weeks)
4. Monitoring and optimization (ongoing)
## References
- [Event Sourcing by Martin Fowler](https://martinfowler.com/eaaDev/EventSourcing.html)
- [EventStoreDB Documentation](https://www.eventstore.com/docs)
```
## ADR Management
### Directory Structure
```
docs/
├── adr/
│ ├── README.md # Index and guidelines
│ ├── template.md # Team's ADR template
│ ├── 0001-use-postgresql.md
│ ├── 0002-caching-strategy.md
│ ├── 0003-mongodb-user-profiles.md # [DEPRECATED]
│ └── 0020-deprecate-mongodb.md # Supersedes 0003
```
### ADR Index (README.md)
```markdown
# Architecture Decision Records
This directory contains Architecture Decision Records (ADRs) for [Project Name].
## Index
| ADR | Title | Status | Date |
|-----|-------|--------|------|
| [0001](0001-use-postgresql.md) | Use PostgreSQL as Primary Database | Accepted | 2024-01-10 |
| [0002](0002-caching-strategy.md) | Caching Strategy with Redis | Accepted | 2024-01-12 |
| [0003](0003-mongodb-user-profiles.md) | MongoDB for User Profiles | Deprecated | 2023-06-15 |
| [0020](0020-deprecate-mongodb.md) | Deprecate MongoDB | Accepted | 2024-01-15 |
## Creating a New ADR
1. Copy `template.md` to `NNNN-title-with-dashes.md`
2. Fill in the template
3. Submit PR for review
4. Update this index after approval
## ADR Status
- **Proposed**: Under discussion
- **Accepted**: Decision made, implementing
- **Deprecated**: No longer relevant
- **Superseded**: Replaced by another ADR
- **Rejected**: Considered but not adopted
```
### Automation (adr-tools)
```bash
# Install adr-tools
brew install adr-tools
# Initialize ADR directory
adr init docs/adr
# Create new ADR
adr new "Use PostgreSQL as Primary Database"
# Supersede an ADR
adr new -s 3 "Deprecate MongoDB in Favor of PostgreSQL"
# Generate table of contents
adr generate toc > docs/adr/README.md
# Link related ADRs
adr link 2 "Complements" 1 "Is complemented by"
```
## Review Process
```markdown
## ADR Review Checklist
### Before Submission
- [ ] Context clearly explains the problem
- [ ] All viable options considered
- [ ] Pros/cons balanced and honest
- [ ] Consequences (positive and negative) documented
- [ ] Related ADRs linked
### During Review
- [ ] At least 2 senior engineers reviewed
- [ ] Affected teams consulted
- [ ] Security implications considered
- [ ] Cost implications documented
- [ ] Reversibility assessed
### After Acceptance
- [ ] ADR index updated
- [ ] Team notified
- [ ] Implementation tickets created
- [ ] Related documentation updated
```
## Best Practices
### Do's
- **Write ADRs early** - Before implementation starts
- **Keep them short** - 1-2 pages maximum
- **Be honest about trade-offs** - Include real cons
- **Link related decisions** - Build decision graph
- **Update status** - Deprecate when superseded
### Don'ts
- **Don't change accepted ADRs** - Write new ones to supersede
- **Don't skip context** - Future readers need background
- **Don't hide failures** - Rejected decisions are valuable
- **Don't be vague** - Specific decisions, specific consequences
- **Don't forget implementation** - ADR without action is waste
## Resources
- [Documenting Architecture Decisions (Michael Nygard)](https://cognitect.com/blog/2011/11/15/documenting-architecture-decisions)
- [MADR Template](https://adr.github.io/madr/)
- [ADR GitHub Organization](https://adr.github.io/)
- [adr-tools](https://github.com/npryce/adr-tools)

View File

@@ -0,0 +1,552 @@
---
name: changelog-automation
description: Automate changelog generation from commits, PRs, and releases following Keep a Changelog format. Use when setting up release workflows, generating release notes, or standardizing commit conventions.
---
# Changelog Automation
Patterns and tools for automating changelog generation, release notes, and version management following industry standards.
## When to Use This Skill
- Setting up automated changelog generation
- Implementing Conventional Commits
- Creating release note workflows
- Standardizing commit message formats
- Generating GitHub/GitLab release notes
- Managing semantic versioning
## Core Concepts
### 1. Keep a Changelog Format
```markdown
# Changelog
All notable changes to this project will be documented in this file.
The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.1.0/),
and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html).
## [Unreleased]
### Added
- New feature X
## [1.2.0] - 2024-01-15
### Added
- User profile avatars
- Dark mode support
### Changed
- Improved loading performance by 40%
### Deprecated
- Old authentication API (use v2)
### Removed
- Legacy payment gateway
### Fixed
- Login timeout issue (#123)
### Security
- Updated dependencies for CVE-2024-1234
[Unreleased]: https://github.com/user/repo/compare/v1.2.0...HEAD
[1.2.0]: https://github.com/user/repo/compare/v1.1.0...v1.2.0
```
### 2. Conventional Commits
```
<type>[optional scope]: <description>
[optional body]
[optional footer(s)]
```
| Type | Description | Changelog Section |
|------|-------------|-------------------|
| `feat` | New feature | Added |
| `fix` | Bug fix | Fixed |
| `docs` | Documentation | (usually excluded) |
| `style` | Formatting | (usually excluded) |
| `refactor` | Code restructure | Changed |
| `perf` | Performance | Changed |
| `test` | Tests | (usually excluded) |
| `chore` | Maintenance | (usually excluded) |
| `ci` | CI changes | (usually excluded) |
| `build` | Build system | (usually excluded) |
| `revert` | Revert commit | Removed |
### 3. Semantic Versioning
```
MAJOR.MINOR.PATCH
MAJOR: Breaking changes (feat! or BREAKING CHANGE)
MINOR: New features (feat)
PATCH: Bug fixes (fix)
```
## Implementation
### Method 1: Conventional Changelog (Node.js)
```bash
# Install tools
npm install -D @commitlint/cli @commitlint/config-conventional
npm install -D husky
npm install -D standard-version
# or
npm install -D semantic-release
# Setup commitlint
cat > commitlint.config.js << 'EOF'
module.exports = {
extends: ['@commitlint/config-conventional'],
rules: {
'type-enum': [
2,
'always',
[
'feat',
'fix',
'docs',
'style',
'refactor',
'perf',
'test',
'chore',
'ci',
'build',
'revert',
],
],
'subject-case': [2, 'never', ['start-case', 'pascal-case', 'upper-case']],
'subject-max-length': [2, 'always', 72],
},
};
EOF
# Setup husky
npx husky init
echo "npx --no -- commitlint --edit \$1" > .husky/commit-msg
```
### Method 2: standard-version Configuration
```javascript
// .versionrc.js
module.exports = {
types: [
{ type: 'feat', section: 'Features' },
{ type: 'fix', section: 'Bug Fixes' },
{ type: 'perf', section: 'Performance Improvements' },
{ type: 'revert', section: 'Reverts' },
{ type: 'docs', section: 'Documentation', hidden: true },
{ type: 'style', section: 'Styles', hidden: true },
{ type: 'chore', section: 'Miscellaneous', hidden: true },
{ type: 'refactor', section: 'Code Refactoring', hidden: true },
{ type: 'test', section: 'Tests', hidden: true },
{ type: 'build', section: 'Build System', hidden: true },
{ type: 'ci', section: 'CI/CD', hidden: true },
],
commitUrlFormat: '{{host}}/{{owner}}/{{repository}}/commit/{{hash}}',
compareUrlFormat: '{{host}}/{{owner}}/{{repository}}/compare/{{previousTag}}...{{currentTag}}',
issueUrlFormat: '{{host}}/{{owner}}/{{repository}}/issues/{{id}}',
userUrlFormat: '{{host}}/{{user}}',
releaseCommitMessageFormat: 'chore(release): {{currentTag}}',
scripts: {
prebump: 'echo "Running prebump"',
postbump: 'echo "Running postbump"',
prechangelog: 'echo "Running prechangelog"',
postchangelog: 'echo "Running postchangelog"',
},
};
```
```json
// package.json scripts
{
"scripts": {
"release": "standard-version",
"release:minor": "standard-version --release-as minor",
"release:major": "standard-version --release-as major",
"release:patch": "standard-version --release-as patch",
"release:dry": "standard-version --dry-run"
}
}
```
### Method 3: semantic-release (Full Automation)
```javascript
// release.config.js
module.exports = {
branches: [
'main',
{ name: 'beta', prerelease: true },
{ name: 'alpha', prerelease: true },
],
plugins: [
'@semantic-release/commit-analyzer',
'@semantic-release/release-notes-generator',
[
'@semantic-release/changelog',
{
changelogFile: 'CHANGELOG.md',
},
],
[
'@semantic-release/npm',
{
npmPublish: true,
},
],
[
'@semantic-release/github',
{
assets: ['dist/**/*.js', 'dist/**/*.css'],
},
],
[
'@semantic-release/git',
{
assets: ['CHANGELOG.md', 'package.json'],
message: 'chore(release): ${nextRelease.version} [skip ci]\n\n${nextRelease.notes}',
},
],
],
};
```
### Method 4: GitHub Actions Workflow
```yaml
# .github/workflows/release.yml
name: Release
on:
push:
branches: [main]
workflow_dispatch:
inputs:
release_type:
description: 'Release type'
required: true
default: 'patch'
type: choice
options:
- patch
- minor
- major
permissions:
contents: write
pull-requests: write
jobs:
release:
runs-on: ubuntu-latest
steps:
- uses: actions/checkout@v4
with:
fetch-depth: 0
token: ${{ secrets.GITHUB_TOKEN }}
- uses: actions/setup-node@v4
with:
node-version: '20'
cache: 'npm'
- run: npm ci
- name: Configure Git
run: |
git config user.name "github-actions[bot]"
git config user.email "github-actions[bot]@users.noreply.github.com"
- name: Run semantic-release
env:
GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
NPM_TOKEN: ${{ secrets.NPM_TOKEN }}
run: npx semantic-release
# Alternative: manual release with standard-version
manual-release:
if: github.event_name == 'workflow_dispatch'
runs-on: ubuntu-latest
steps:
- uses: actions/checkout@v4
with:
fetch-depth: 0
- uses: actions/setup-node@v4
with:
node-version: '20'
- run: npm ci
- name: Configure Git
run: |
git config user.name "github-actions[bot]"
git config user.email "github-actions[bot]@users.noreply.github.com"
      - name: Bump version and generate changelog
        id: version
        run: |
          npx standard-version --release-as ${{ inputs.release_type }}
          echo "tag=$(git describe --tags --abbrev=0)" >> "$GITHUB_OUTPUT"
      - name: Push changes
        run: git push --follow-tags origin main
      - name: Create GitHub Release
        uses: softprops/action-gh-release@v1
        with:
          tag_name: ${{ steps.version.outputs.tag }}
          generate_release_notes: true
```
### Method 5: git-cliff (Rust-based, Fast)
```toml
# cliff.toml
[changelog]
header = """
# Changelog
All notable changes to this project will be documented in this file.
"""
body = """
{% if version %}\
## [{{ version | trim_start_matches(pat="v") }}] - {{ timestamp | date(format="%Y-%m-%d") }}
{% else %}\
## [Unreleased]
{% endif %}\
{% for group, commits in commits | group_by(attribute="group") %}
### {{ group | upper_first }}
{% for commit in commits %}
- {% if commit.scope %}**{{ commit.scope }}:** {% endif %}\
{{ commit.message | upper_first }}\
{% if commit.github.pr_number %} ([#{{ commit.github.pr_number }}](https://github.com/owner/repo/pull/{{ commit.github.pr_number }})){% endif %}\
{% endfor %}
{% endfor %}
"""
footer = """
{% for release in releases -%}
{% if release.version -%}
{% if release.previous.version -%}
[{{ release.version | trim_start_matches(pat="v") }}]: \
https://github.com/owner/repo/compare/{{ release.previous.version }}...{{ release.version }}
{% endif -%}
{% else -%}
[Unreleased]: https://github.com/owner/repo/compare/{{ release.previous.version }}...HEAD
{% endif -%}
{% endfor %}
"""
trim = true
[git]
conventional_commits = true
filter_unconventional = true
split_commits = false
commit_parsers = [
{ message = "^feat", group = "Features" },
{ message = "^fix", group = "Bug Fixes" },
{ message = "^doc", group = "Documentation" },
{ message = "^perf", group = "Performance" },
{ message = "^refactor", group = "Refactoring" },
{ message = "^style", group = "Styling" },
{ message = "^test", group = "Testing" },
{ message = "^chore\\(release\\)", skip = true },
{ message = "^chore", group = "Miscellaneous" },
]
filter_commits = false
tag_pattern = "v[0-9]*"
skip_tags = ""
ignore_tags = ""
topo_order = false
sort_commits = "oldest"
[github]
owner = "owner"
repo = "repo"
```
```bash
# Generate changelog
git cliff -o CHANGELOG.md
# Generate for specific range
git cliff v1.0.0..v2.0.0 -o RELEASE_NOTES.md
# Preview without writing
git cliff --unreleased --dry-run
```
### Method 6: Python (commitizen)
```toml
# pyproject.toml
[tool.commitizen]
name = "cz_conventional_commits"
version = "1.0.0"
version_files = [
"pyproject.toml:version",
"src/__init__.py:__version__",
]
tag_format = "v$version"
update_changelog_on_bump = true
changelog_incremental = true
changelog_start_rev = "v0.1.0"
[tool.commitizen.customize]
message_template = "{{change_type}}{% if scope %}({{scope}}){% endif %}: {{message}}"
schema = "<type>(<scope>): <subject>"
schema_pattern = "^(feat|fix|docs|style|refactor|perf|test|chore)(\\(\\w+\\))?:\\s.*"
bump_pattern = "^(feat|fix|perf|refactor)"
bump_map = {"feat" = "MINOR", "fix" = "PATCH", "perf" = "PATCH", "refactor" = "PATCH"}
```
```bash
# Install
pip install commitizen
# Create commit interactively
cz commit
# Bump version and update changelog
cz bump --changelog
# Check commits
cz check --rev-range HEAD~5..HEAD
```
## Release Notes Templates
### GitHub Release Template
```markdown
## What's Changed
### 🚀 Features
{{ range .Features }}
- {{ .Title }} by @{{ .Author }} in #{{ .PR }}
{{ end }}
### 🐛 Bug Fixes
{{ range .Fixes }}
- {{ .Title }} by @{{ .Author }} in #{{ .PR }}
{{ end }}
### 📚 Documentation
{{ range .Docs }}
- {{ .Title }} by @{{ .Author }} in #{{ .PR }}
{{ end }}
### 🔧 Maintenance
{{ range .Chores }}
- {{ .Title }} by @{{ .Author }} in #{{ .PR }}
{{ end }}
## New Contributors
{{ range .NewContributors }}
- @{{ .Username }} made their first contribution in #{{ .PR }}
{{ end }}
**Full Changelog**: https://github.com/owner/repo/compare/v{{ .Previous }}...v{{ .Current }}
```
### Internal Release Notes
```markdown
# Release v2.1.0 - January 15, 2024
## Summary
This release introduces dark mode support and improves checkout performance
by 40%. It also includes important security updates.
## Highlights
### 🌙 Dark Mode
Users can now switch to dark mode from settings. The preference is
automatically saved and synced across devices.
### ⚡ Performance
- Checkout flow is 40% faster
- Reduced bundle size by 15%
## Breaking Changes
None in this release.
## Upgrade Guide
No special steps required. Standard deployment process applies.
## Known Issues
- Dark mode may flicker on initial load (fix scheduled for v2.1.1)
## Dependencies Updated
| Package | From | To | Reason |
|---------|------|-----|--------|
| react | 18.2.0 | 18.3.0 | Performance improvements |
| lodash | 4.17.20 | 4.17.21 | Security patch |
```
## Commit Message Examples
```bash
# Feature with scope
feat(auth): add OAuth2 support for Google login
# Bug fix with issue reference
fix(checkout): resolve race condition in payment processing
Closes #123
# Breaking change
feat(api)!: change user endpoint response format
BREAKING CHANGE: The user endpoint now returns `userId` instead of `id`.
Migration guide: Update all API consumers to use the new field name.
# Multiple paragraphs
fix(database): handle connection timeouts gracefully
Previously, connection timeouts would cause the entire request to fail
without retry. This change implements exponential backoff with up to
3 retries before failing.
The timeout threshold has been increased from 5s to 10s based on p99
latency analysis.
Fixes #456
Reviewed-by: @alice
```
## Best Practices
### Do's
- **Follow Conventional Commits** - Enables automation
- **Write clear messages** - Future you will thank you
- **Reference issues** - Link commits to tickets
- **Use scopes consistently** - Define team conventions
- **Automate releases** - Reduce manual errors
### Don'ts
- **Don't mix changes** - One logical change per commit
- **Don't skip validation** - Use commitlint
- **Don't manual edit** - Generated changelogs only
- **Don't forget breaking changes** - Mark with `!` or footer
- **Don't ignore CI** - Validate commits in pipeline
## Resources
- [Keep a Changelog](https://keepachangelog.com/)
- [Conventional Commits](https://www.conventionalcommits.org/)
- [Semantic Versioning](https://semver.org/)
- [semantic-release](https://semantic-release.gitbook.io/)
- [git-cliff](https://git-cliff.org/)

File diff suppressed because it is too large Load Diff

View File

@@ -0,0 +1,544 @@
---
name: nextjs-app-router-patterns
description: Master Next.js 14+ App Router with Server Components, streaming, parallel routes, and advanced data fetching. Use when building Next.js applications, implementing SSR/SSG, or optimizing React Server Components.
---
# Next.js App Router Patterns
Comprehensive patterns for Next.js App Router architecture, Server Components, and modern full-stack React development. Note: the examples use the async `params`/`searchParams`/`cookies()` request APIs, which require Next.js 15+ (in Next.js 14 these are plain synchronous values).
## When to Use This Skill
- Building new Next.js applications with App Router
- Migrating from Pages Router to App Router
- Implementing Server Components and streaming
- Setting up parallel and intercepting routes
- Optimizing data fetching and caching
- Building full-stack features with Server Actions
## Core Concepts
### 1. Rendering Modes
| Mode | Where | When to Use |
|------|-------|-------------|
| **Server Components** | Server only | Data fetching, heavy computation, secrets |
| **Client Components** | Browser | Interactivity, hooks, browser APIs |
| **Static** | Build time | Content that rarely changes |
| **Dynamic** | Request time | Personalized or real-time data |
| **Streaming** | Progressive | Large pages, slow data sources |
### 2. File Conventions
```
app/
├── layout.tsx # Shared UI wrapper
├── page.tsx # Route UI
├── loading.tsx # Loading UI (Suspense)
├── error.tsx # Error boundary
├── not-found.tsx # 404 UI
├── route.ts # API endpoint
├── template.tsx # Re-mounted layout
├── default.tsx # Parallel route fallback
└── opengraph-image.tsx # OG image generation
```
## Quick Start
```typescript
// app/layout.tsx
import { Inter } from 'next/font/google'
import { Providers } from './providers'
const inter = Inter({ subsets: ['latin'] })
export const metadata = {
title: { default: 'My App', template: '%s | My App' },
description: 'Built with Next.js App Router',
}
export default function RootLayout({
children,
}: {
children: React.ReactNode
}) {
return (
<html lang="en" suppressHydrationWarning>
<body className={inter.className}>
<Providers>{children}</Providers>
</body>
</html>
)
}
// app/page.tsx - Server Component by default
async function getProducts() {
const res = await fetch('https://api.example.com/products', {
next: { revalidate: 3600 }, // ISR: revalidate every hour
})
return res.json()
}
export default async function HomePage() {
const products = await getProducts()
return (
<main>
<h1>Products</h1>
<ProductGrid products={products} />
</main>
)
}
```
## Patterns
### Pattern 1: Server Components with Data Fetching
```typescript
// app/products/page.tsx
import { Suspense } from 'react'
import { ProductList, ProductListSkeleton } from '@/components/products'
import { FilterSidebar } from '@/components/filters'
interface SearchParams {
category?: string
sort?: 'price' | 'name' | 'date'
page?: string
}
export default async function ProductsPage({
searchParams,
}: {
searchParams: Promise<SearchParams>
}) {
const params = await searchParams
return (
<div className="flex gap-8">
<FilterSidebar />
<Suspense
key={JSON.stringify(params)}
fallback={<ProductListSkeleton />}
>
<ProductList
category={params.category}
sort={params.sort}
page={Number(params.page) || 1}
/>
</Suspense>
</div>
)
}
// components/products/ProductList.tsx - Server Component
async function getProducts(filters: ProductFilters) {
const res = await fetch(
`${process.env.API_URL}/products?${new URLSearchParams(filters)}`,
{ next: { tags: ['products'] } }
)
if (!res.ok) throw new Error('Failed to fetch products')
return res.json()
}
export async function ProductList({ category, sort, page }: ProductFilters) {
const { products, totalPages } = await getProducts({ category, sort, page })
return (
<div>
<div className="grid grid-cols-3 gap-4">
{products.map((product) => (
<ProductCard key={product.id} product={product} />
))}
</div>
<Pagination currentPage={page} totalPages={totalPages} />
</div>
)
}
```
### Pattern 2: Client Components with 'use client'
```typescript
// components/products/AddToCartButton.tsx
'use client'
import { useState, useTransition } from 'react'
import { addToCart } from '@/app/actions/cart'
export function AddToCartButton({ productId }: { productId: string }) {
const [isPending, startTransition] = useTransition()
const [error, setError] = useState<string | null>(null)
const handleClick = () => {
setError(null)
startTransition(async () => {
const result = await addToCart(productId)
if (result.error) {
setError(result.error)
}
})
}
return (
<div>
<button
onClick={handleClick}
disabled={isPending}
className="btn-primary"
>
{isPending ? 'Adding...' : 'Add to Cart'}
</button>
{error && <p className="text-red-500 text-sm">{error}</p>}
</div>
)
}
```
### Pattern 3: Server Actions
```typescript
// app/actions/cart.ts
'use server'
import { revalidateTag } from 'next/cache'
import { cookies } from 'next/headers'
import { redirect } from 'next/navigation'
export async function addToCart(productId: string) {
const cookieStore = await cookies()
const sessionId = cookieStore.get('session')?.value
if (!sessionId) {
redirect('/login')
}
try {
await db.cart.upsert({
where: { sessionId_productId: { sessionId, productId } },
update: { quantity: { increment: 1 } },
create: { sessionId, productId, quantity: 1 },
})
revalidateTag('cart')
return { success: true }
} catch (error) {
return { error: 'Failed to add item to cart' }
}
}
export async function checkout(formData: FormData) {
const address = formData.get('address') as string
const payment = formData.get('payment') as string
// Validate
if (!address || !payment) {
return { error: 'Missing required fields' }
}
// Process order
const order = await processOrder({ address, payment })
// Redirect to confirmation
redirect(`/orders/${order.id}/confirmation`)
}
```
### Pattern 4: Parallel Routes
```typescript
// app/dashboard/layout.tsx
export default function DashboardLayout({
children,
analytics,
team,
}: {
children: React.ReactNode
analytics: React.ReactNode
team: React.ReactNode
}) {
return (
<div className="dashboard-grid">
<main>{children}</main>
<aside className="analytics-panel">{analytics}</aside>
<aside className="team-panel">{team}</aside>
</div>
)
}
// app/dashboard/@analytics/page.tsx
export default async function AnalyticsSlot() {
const stats = await getAnalytics()
return <AnalyticsChart data={stats} />
}
// app/dashboard/@analytics/loading.tsx
export default function AnalyticsLoading() {
return <ChartSkeleton />
}
// app/dashboard/@team/page.tsx
export default async function TeamSlot() {
const members = await getTeamMembers()
return <TeamList members={members} />
}
```
### Pattern 5: Intercepting Routes (Modal Pattern)
```typescript
// File structure for photo modal
// app/
// ├── @modal/
// │ ├── (.)photos/[id]/page.tsx # Intercept
// │ └── default.tsx
// ├── photos/
// │ └── [id]/page.tsx # Full page
// └── layout.tsx
// app/@modal/(.)photos/[id]/page.tsx
import { Modal } from '@/components/Modal'
import { PhotoDetail } from '@/components/PhotoDetail'
export default async function PhotoModal({
params,
}: {
params: Promise<{ id: string }>
}) {
const { id } = await params
const photo = await getPhoto(id)
return (
<Modal>
<PhotoDetail photo={photo} />
</Modal>
)
}
// app/photos/[id]/page.tsx - Full page version
export default async function PhotoPage({
params,
}: {
params: Promise<{ id: string }>
}) {
const { id } = await params
const photo = await getPhoto(id)
return (
<div className="photo-page">
<PhotoDetail photo={photo} />
<RelatedPhotos photoId={id} />
</div>
)
}
// app/layout.tsx
export default function RootLayout({
children,
modal,
}: {
children: React.ReactNode
modal: React.ReactNode
}) {
return (
<html>
<body>
{children}
{modal}
</body>
</html>
)
}
```
### Pattern 6: Streaming with Suspense
```typescript
// app/product/[id]/page.tsx
import { Suspense } from 'react'
export default async function ProductPage({
params,
}: {
params: Promise<{ id: string }>
}) {
const { id } = await params
// This data loads first (blocking)
const product = await getProduct(id)
return (
<div>
{/* Immediate render */}
<ProductHeader product={product} />
{/* Stream in reviews */}
<Suspense fallback={<ReviewsSkeleton />}>
<Reviews productId={id} />
</Suspense>
{/* Stream in recommendations */}
<Suspense fallback={<RecommendationsSkeleton />}>
<Recommendations productId={id} />
</Suspense>
</div>
)
}
// These components fetch their own data
async function Reviews({ productId }: { productId: string }) {
const reviews = await getReviews(productId) // Slow API
return <ReviewList reviews={reviews} />
}
async function Recommendations({ productId }: { productId: string }) {
const products = await getRecommendations(productId) // ML-based, slow
return <ProductCarousel products={products} />
}
```
### Pattern 7: Route Handlers (API Routes)
```typescript
// app/api/products/route.ts
import { NextRequest, NextResponse } from 'next/server'
export async function GET(request: NextRequest) {
const searchParams = request.nextUrl.searchParams
const category = searchParams.get('category')
const products = await db.product.findMany({
where: category ? { category } : undefined,
take: 20,
})
return NextResponse.json(products)
}
export async function POST(request: NextRequest) {
const body = await request.json()
const product = await db.product.create({
data: body,
})
return NextResponse.json(product, { status: 201 })
}
// app/api/products/[id]/route.ts
export async function GET(
request: NextRequest,
{ params }: { params: Promise<{ id: string }> }
) {
const { id } = await params
const product = await db.product.findUnique({ where: { id } })
if (!product) {
return NextResponse.json(
{ error: 'Product not found' },
{ status: 404 }
)
}
return NextResponse.json(product)
}
```
### Pattern 8: Metadata and SEO
```typescript
// app/products/[slug]/page.tsx
import { Metadata } from 'next'
import { notFound } from 'next/navigation'
type Props = {
params: Promise<{ slug: string }>
}
export async function generateMetadata({ params }: Props): Promise<Metadata> {
const { slug } = await params
const product = await getProduct(slug)
if (!product) return {}
return {
title: product.name,
description: product.description,
openGraph: {
title: product.name,
description: product.description,
images: [{ url: product.image, width: 1200, height: 630 }],
},
twitter: {
card: 'summary_large_image',
title: product.name,
description: product.description,
images: [product.image],
},
}
}
export async function generateStaticParams() {
const products = await db.product.findMany({ select: { slug: true } })
return products.map((p) => ({ slug: p.slug }))
}
export default async function ProductPage({ params }: Props) {
const { slug } = await params
const product = await getProduct(slug)
if (!product) notFound()
return <ProductDetail product={product} />
}
```
## Caching Strategies
### Data Cache
```typescript
// No cache (always fresh)
fetch(url, { cache: 'no-store' })
// Cache forever (static)
fetch(url, { cache: 'force-cache' })
// ISR - revalidate after 60 seconds
fetch(url, { next: { revalidate: 60 } })
// Tag-based invalidation
fetch(url, { next: { tags: ['products'] } })
// Invalidate via Server Action
'use server'
import { revalidateTag, revalidatePath } from 'next/cache'
export async function updateProduct(id: string, data: ProductData) {
await db.product.update({ where: { id }, data })
revalidateTag('products')
revalidatePath('/products')
}
```
## Best Practices
### Do's
- **Start with Server Components** - Add 'use client' only when needed
- **Colocate data fetching** - Fetch data where it's used
- **Use Suspense boundaries** - Enable streaming for slow data
- **Leverage parallel routes** - Independent loading states
- **Use Server Actions** - For mutations with progressive enhancement
### Don'ts
- **Don't pass non-serializable data** - Props crossing the Server → Client boundary must be serializable (no functions, class instances, etc.)
- **Don't use hooks in Server Components** - No useState, useEffect
- **Don't fetch in Client Components** - Use Server Components or React Query
- **Don't over-nest layouts** - Each layout adds to the component tree
- **Don't ignore loading states** - Always provide loading.tsx or Suspense
## Resources
- [Next.js App Router Documentation](https://nextjs.org/docs/app)
- [Server Components RFC](https://github.com/reactjs/rfcs/blob/main/text/0188-server-components.md)
- [Vercel Templates](https://vercel.com/templates/next.js)

View File

@@ -0,0 +1,671 @@
---
name: react-native-architecture
description: Build production React Native apps with Expo, navigation, native modules, offline sync, and cross-platform patterns. Use when developing mobile apps, implementing native integrations, or architecting React Native projects.
---
# React Native Architecture
Production-ready patterns for React Native development with Expo, including navigation, state management, native modules, and offline-first architecture.
## When to Use This Skill
- Starting a new React Native or Expo project
- Implementing complex navigation patterns
- Integrating native modules and platform APIs
- Building offline-first mobile applications
- Optimizing React Native performance
- Setting up CI/CD for mobile releases
## Core Concepts
### 1. Project Structure
```
src/
├── app/ # Expo Router screens
│ ├── (auth)/ # Auth group
│ ├── (tabs)/ # Tab navigation
│ └── _layout.tsx # Root layout
├── components/
│ ├── ui/ # Reusable UI components
│ └── features/ # Feature-specific components
├── hooks/ # Custom hooks
├── services/ # API and native services
├── stores/ # State management
├── utils/ # Utilities
└── types/ # TypeScript types
```
### 2. Expo vs Bare React Native
| Feature | Expo | Bare RN |
|---------|------|---------|
| Setup complexity | Low | High |
| Native modules | EAS Build | Manual linking |
| OTA updates | Built-in | Manual setup |
| Build service | EAS | Custom CI |
| Custom native code | Config plugins | Direct access |
## Quick Start
```bash
# Create new Expo project
npx create-expo-app@latest my-app -t expo-template-blank-typescript
# Install essential dependencies
npx expo install expo-router expo-status-bar react-native-safe-area-context
npx expo install @react-native-async-storage/async-storage
npx expo install expo-secure-store expo-haptics
```
```typescript
// app/_layout.tsx
import { Stack } from 'expo-router'
import { ThemeProvider } from '@/providers/ThemeProvider'
import { QueryProvider } from '@/providers/QueryProvider'
export default function RootLayout() {
return (
<QueryProvider>
<ThemeProvider>
<Stack screenOptions={{ headerShown: false }}>
<Stack.Screen name="(tabs)" />
<Stack.Screen name="(auth)" />
<Stack.Screen name="modal" options={{ presentation: 'modal' }} />
</Stack>
</ThemeProvider>
</QueryProvider>
)
}
```
## Patterns
### Pattern 1: Expo Router Navigation
```typescript
// app/(tabs)/_layout.tsx
import { Tabs } from 'expo-router'
import { Home, Search, User, Settings } from 'lucide-react-native'
import { useTheme } from '@/hooks/useTheme'
export default function TabLayout() {
const { colors } = useTheme()
return (
<Tabs
screenOptions={{
tabBarActiveTintColor: colors.primary,
tabBarInactiveTintColor: colors.textMuted,
tabBarStyle: { backgroundColor: colors.background },
headerShown: false,
}}
>
<Tabs.Screen
name="index"
options={{
title: 'Home',
tabBarIcon: ({ color, size }) => <Home size={size} color={color} />,
}}
/>
<Tabs.Screen
name="search"
options={{
title: 'Search',
tabBarIcon: ({ color, size }) => <Search size={size} color={color} />,
}}
/>
<Tabs.Screen
name="profile"
options={{
title: 'Profile',
tabBarIcon: ({ color, size }) => <User size={size} color={color} />,
}}
/>
<Tabs.Screen
name="settings"
options={{
title: 'Settings',
tabBarIcon: ({ color, size }) => <Settings size={size} color={color} />,
}}
/>
</Tabs>
)
}
// app/(tabs)/profile/[id].tsx - Dynamic route
import { useLocalSearchParams } from 'expo-router'
export default function ProfileScreen() {
const { id } = useLocalSearchParams<{ id: string }>()
return <UserProfile userId={id} />
}
// Navigation from anywhere
import { router } from 'expo-router'
// Programmatic navigation
router.push('/profile/123')
router.replace('/login')
router.back()
// With params
router.push({
pathname: '/product/[id]',
params: { id: '123', referrer: 'home' },
})
```
### Pattern 2: Authentication Flow
```typescript
// providers/AuthProvider.tsx
import { createContext, useContext, useEffect, useState } from 'react'
import { useRouter, useSegments } from 'expo-router'
import * as SecureStore from 'expo-secure-store'
interface AuthContextType {
user: User | null
isLoading: boolean
signIn: (credentials: Credentials) => Promise<void>
signOut: () => Promise<void>
}
const AuthContext = createContext<AuthContextType | null>(null)
export function AuthProvider({ children }: { children: React.ReactNode }) {
const [user, setUser] = useState<User | null>(null)
const [isLoading, setIsLoading] = useState(true)
const segments = useSegments()
const router = useRouter()
// Check authentication on mount
useEffect(() => {
checkAuth()
}, [])
// Protect routes
useEffect(() => {
if (isLoading) return
const inAuthGroup = segments[0] === '(auth)'
if (!user && !inAuthGroup) {
router.replace('/login')
} else if (user && inAuthGroup) {
router.replace('/(tabs)')
}
}, [user, segments, isLoading])
async function checkAuth() {
try {
const token = await SecureStore.getItemAsync('authToken')
if (token) {
const userData = await api.getUser(token)
setUser(userData)
}
} catch (error) {
await SecureStore.deleteItemAsync('authToken')
} finally {
setIsLoading(false)
}
}
async function signIn(credentials: Credentials) {
const { token, user } = await api.login(credentials)
await SecureStore.setItemAsync('authToken', token)
setUser(user)
}
async function signOut() {
await SecureStore.deleteItemAsync('authToken')
setUser(null)
}
if (isLoading) {
return <SplashScreen />
}
return (
<AuthContext.Provider value={{ user, isLoading, signIn, signOut }}>
{children}
</AuthContext.Provider>
)
}
export const useAuth = () => {
const context = useContext(AuthContext)
if (!context) throw new Error('useAuth must be used within AuthProvider')
return context
}
```
### Pattern 3: Offline-First with React Query
```typescript
// providers/QueryProvider.tsx
import { QueryClient, QueryClientProvider } from '@tanstack/react-query'
import { createAsyncStoragePersister } from '@tanstack/query-async-storage-persister'
import { PersistQueryClientProvider } from '@tanstack/react-query-persist-client'
import AsyncStorage from '@react-native-async-storage/async-storage'
import NetInfo from '@react-native-community/netinfo'
import { onlineManager } from '@tanstack/react-query'
// Sync online status
onlineManager.setEventListener((setOnline) => {
return NetInfo.addEventListener((state) => {
setOnline(!!state.isConnected)
})
})
const queryClient = new QueryClient({
defaultOptions: {
queries: {
gcTime: 1000 * 60 * 60 * 24, // 24 hours
staleTime: 1000 * 60 * 5, // 5 minutes
retry: 2,
networkMode: 'offlineFirst',
},
mutations: {
networkMode: 'offlineFirst',
},
},
})
const asyncStoragePersister = createAsyncStoragePersister({
storage: AsyncStorage,
key: 'REACT_QUERY_OFFLINE_CACHE',
})
export function QueryProvider({ children }: { children: React.ReactNode }) {
return (
<PersistQueryClientProvider
client={queryClient}
persistOptions={{ persister: asyncStoragePersister }}
>
{children}
</PersistQueryClientProvider>
)
}
// hooks/useProducts.ts
import { useQuery, useMutation, useQueryClient } from '@tanstack/react-query'
export function useProducts() {
return useQuery({
queryKey: ['products'],
queryFn: api.getProducts,
// Use stale data while revalidating
placeholderData: (previousData) => previousData,
})
}
export function useCreateProduct() {
const queryClient = useQueryClient()
return useMutation({
mutationFn: api.createProduct,
// Optimistic update
onMutate: async (newProduct) => {
await queryClient.cancelQueries({ queryKey: ['products'] })
const previous = queryClient.getQueryData(['products'])
queryClient.setQueryData(['products'], (old: Product[]) => [
...old,
{ ...newProduct, id: 'temp-' + Date.now() },
])
return { previous }
},
onError: (err, newProduct, context) => {
queryClient.setQueryData(['products'], context?.previous)
},
onSettled: () => {
queryClient.invalidateQueries({ queryKey: ['products'] })
},
})
}
```
### Pattern 4: Native Module Integration
```typescript
// services/haptics.ts
import * as Haptics from 'expo-haptics'
import { Platform } from 'react-native'
export const haptics = {
light: () => {
if (Platform.OS !== 'web') {
Haptics.impactAsync(Haptics.ImpactFeedbackStyle.Light)
}
},
medium: () => {
if (Platform.OS !== 'web') {
Haptics.impactAsync(Haptics.ImpactFeedbackStyle.Medium)
}
},
heavy: () => {
if (Platform.OS !== 'web') {
Haptics.impactAsync(Haptics.ImpactFeedbackStyle.Heavy)
}
},
success: () => {
if (Platform.OS !== 'web') {
Haptics.notificationAsync(Haptics.NotificationFeedbackType.Success)
}
},
error: () => {
if (Platform.OS !== 'web') {
Haptics.notificationAsync(Haptics.NotificationFeedbackType.Error)
}
},
}
// services/biometrics.ts
import * as LocalAuthentication from 'expo-local-authentication'
export async function authenticateWithBiometrics(): Promise<boolean> {
const hasHardware = await LocalAuthentication.hasHardwareAsync()
if (!hasHardware) return false
const isEnrolled = await LocalAuthentication.isEnrolledAsync()
if (!isEnrolled) return false
const result = await LocalAuthentication.authenticateAsync({
promptMessage: 'Authenticate to continue',
fallbackLabel: 'Use passcode',
disableDeviceFallback: false,
})
return result.success
}
// services/notifications.ts
import * as Notifications from 'expo-notifications'
import { Platform } from 'react-native'
import Constants from 'expo-constants'
Notifications.setNotificationHandler({
handleNotification: async () => ({
shouldShowAlert: true,
shouldPlaySound: true,
shouldSetBadge: true,
}),
})
export async function registerForPushNotifications() {
let token: string | undefined
if (Platform.OS === 'android') {
await Notifications.setNotificationChannelAsync('default', {
name: 'default',
importance: Notifications.AndroidImportance.MAX,
vibrationPattern: [0, 250, 250, 250],
})
}
const { status: existingStatus } = await Notifications.getPermissionsAsync()
let finalStatus = existingStatus
if (existingStatus !== 'granted') {
const { status } = await Notifications.requestPermissionsAsync()
finalStatus = status
}
if (finalStatus !== 'granted') {
return null
}
const projectId = Constants.expoConfig?.extra?.eas?.projectId
token = (await Notifications.getExpoPushTokenAsync({ projectId })).data
return token
}
```
### Pattern 5: Platform-Specific Code
```typescript
// components/ui/Button.tsx
import { Platform, Pressable, StyleSheet, Text, ViewStyle } from 'react-native'
import * as Haptics from 'expo-haptics'
import Animated, {
useAnimatedStyle,
useSharedValue,
withSpring,
} from 'react-native-reanimated'
const AnimatedPressable = Animated.createAnimatedComponent(Pressable)
interface ButtonProps {
title: string
onPress: () => void
variant?: 'primary' | 'secondary' | 'outline'
disabled?: boolean
}
export function Button({
title,
onPress,
variant = 'primary',
disabled = false,
}: ButtonProps) {
const scale = useSharedValue(1)
const animatedStyle = useAnimatedStyle(() => ({
transform: [{ scale: scale.value }],
}))
const handlePressIn = () => {
scale.value = withSpring(0.95)
if (Platform.OS !== 'web') {
Haptics.impactAsync(Haptics.ImpactFeedbackStyle.Light)
}
}
const handlePressOut = () => {
scale.value = withSpring(1)
}
return (
<AnimatedPressable
onPress={onPress}
onPressIn={handlePressIn}
onPressOut={handlePressOut}
disabled={disabled}
style={[
styles.button,
styles[variant],
disabled && styles.disabled,
animatedStyle,
]}
>
<Text style={[styles.text, styles[`${variant}Text`]]}>{title}</Text>
</AnimatedPressable>
)
}
// Platform-specific files
// Button.ios.tsx - iOS-specific implementation
// Button.android.tsx - Android-specific implementation
// Button.web.tsx - Web-specific implementation
// Or use Platform.select
const styles = StyleSheet.create({
button: {
paddingVertical: 12,
paddingHorizontal: 24,
borderRadius: 8,
alignItems: 'center',
...Platform.select({
ios: {
shadowColor: '#000',
shadowOffset: { width: 0, height: 2 },
shadowOpacity: 0.1,
shadowRadius: 4,
},
android: {
elevation: 4,
},
}),
},
primary: {
backgroundColor: '#007AFF',
},
secondary: {
backgroundColor: '#5856D6',
},
outline: {
backgroundColor: 'transparent',
borderWidth: 1,
borderColor: '#007AFF',
},
disabled: {
opacity: 0.5,
},
text: {
fontSize: 16,
fontWeight: '600',
},
primaryText: {
color: '#FFFFFF',
},
secondaryText: {
color: '#FFFFFF',
},
outlineText: {
color: '#007AFF',
},
})
```
### Pattern 6: Performance Optimization
```typescript
// components/ProductList.tsx
import { FlashList } from '@shopify/flash-list'
import { memo, useCallback } from 'react'
interface ProductListProps {
products: Product[]
onProductPress: (id: string) => void
}
// Memoize list item
const ProductItem = memo(function ProductItem({
item,
onPress,
}: {
item: Product
onPress: (id: string) => void
}) {
const handlePress = useCallback(() => onPress(item.id), [item.id, onPress])
return (
<Pressable onPress={handlePress} style={styles.item}>
<FastImage
source={{ uri: item.image }}
style={styles.image}
resizeMode="cover"
/>
<Text style={styles.title}>{item.name}</Text>
<Text style={styles.price}>${item.price}</Text>
</Pressable>
)
})
export function ProductList({ products, onProductPress }: ProductListProps) {
const renderItem = useCallback(
({ item }: { item: Product }) => (
<ProductItem item={item} onPress={onProductPress} />
),
[onProductPress]
)
const keyExtractor = useCallback((item: Product) => item.id, [])
return (
<FlashList
data={products}
renderItem={renderItem}
keyExtractor={keyExtractor}
estimatedItemSize={100}
// Performance optimizations
removeClippedSubviews={true}
maxToRenderPerBatch={10}
windowSize={5}
// Pull to refresh
onRefresh={onRefresh}
refreshing={isRefreshing}
/>
)
}
```
## EAS Build & Submit
```json
// eas.json
{
"cli": { "version": ">= 5.0.0" },
"build": {
"development": {
"developmentClient": true,
"distribution": "internal",
"ios": { "simulator": true }
},
"preview": {
"distribution": "internal",
"android": { "buildType": "apk" }
},
"production": {
"autoIncrement": true
}
},
"submit": {
"production": {
"ios": { "appleId": "your@email.com", "ascAppId": "123456789" },
"android": { "serviceAccountKeyPath": "./google-services.json" }
}
}
}
```
```bash
# Build commands
eas build --platform ios --profile development
eas build --platform android --profile preview
eas build --platform all --profile production
# Submit to stores
eas submit --platform ios
eas submit --platform android
# OTA updates
eas update --branch production --message "Bug fixes"
```
## Best Practices
### Do's
- **Use Expo** - Faster development, OTA updates, managed native code
- **FlashList over FlatList** - Better performance for long lists
- **Memoize components** - Prevent unnecessary re-renders
- **Use Reanimated** - 60fps animations on native thread
- **Test on real devices** - Simulators miss real-world issues
### Don'ts
- **Don't inline styles** - Use StyleSheet.create for performance
- **Don't fetch in render** - Use useEffect or React Query
- **Don't ignore platform differences** - Test on both iOS and Android
- **Don't store secrets in code** - Use environment variables
- **Don't skip error boundaries** - Mobile crashes are unforgiving
## Resources
- [Expo Documentation](https://docs.expo.dev/)
- [Expo Router](https://docs.expo.dev/router/introduction/)
- [React Native Performance](https://reactnative.dev/docs/performance)
- [FlashList](https://shopify.github.io/flash-list/)

View File

@@ -0,0 +1,429 @@
---
name: react-state-management
description: Master modern React state management with Redux Toolkit, Zustand, Jotai, and React Query. Use when setting up global state, managing server state, or choosing between state management solutions.
---
# React State Management
Comprehensive guide to modern React state management patterns, from local component state to global stores and server state synchronization.
## When to Use This Skill
- Setting up global state management in a React app
- Choosing between Redux Toolkit, Zustand, or Jotai
- Managing server state with React Query or SWR
- Implementing optimistic updates
- Debugging state-related issues
- Migrating from legacy Redux to modern patterns
## Core Concepts
### 1. State Categories
| Type | Description | Solutions |
|------|-------------|-----------|
| **Local State** | Component-specific, UI state | useState, useReducer |
| **Global State** | Shared across components | Redux Toolkit, Zustand, Jotai |
| **Server State** | Remote data, caching | React Query, SWR, RTK Query |
| **URL State** | Route parameters, search | React Router, nuqs |
| **Form State** | Input values, validation | React Hook Form, Formik |
### 2. Selection Criteria
```
Small app, simple state → Zustand or Jotai
Large app, complex state → Redux Toolkit
Heavy server interaction → React Query + light client state
Atomic/granular updates → Jotai
```
## Quick Start
### Zustand (Simplest)
```typescript
// store/useStore.ts
import { create } from 'zustand'
import { devtools, persist } from 'zustand/middleware'
interface AppState {
user: User | null
theme: 'light' | 'dark'
setUser: (user: User | null) => void
toggleTheme: () => void
}
export const useStore = create<AppState>()(
devtools(
persist(
(set) => ({
user: null,
theme: 'light',
setUser: (user) => set({ user }),
toggleTheme: () => set((state) => ({
theme: state.theme === 'light' ? 'dark' : 'light'
})),
}),
{ name: 'app-storage' }
)
)
)
// Usage in component
function Header() {
const { user, theme, toggleTheme } = useStore()
return (
<header className={theme}>
{user?.name}
<button onClick={toggleTheme}>Toggle Theme</button>
</header>
)
}
```
## Patterns
### Pattern 1: Redux Toolkit with TypeScript
```typescript
// store/index.ts
import { configureStore } from '@reduxjs/toolkit'
import { TypedUseSelectorHook, useDispatch, useSelector } from 'react-redux'
import userReducer from './slices/userSlice'
import cartReducer from './slices/cartSlice'
export const store = configureStore({
reducer: {
user: userReducer,
cart: cartReducer,
},
middleware: (getDefaultMiddleware) =>
getDefaultMiddleware({
serializableCheck: {
ignoredActions: ['persist/PERSIST'],
},
}),
})
export type RootState = ReturnType<typeof store.getState>
export type AppDispatch = typeof store.dispatch
// Typed hooks
export const useAppDispatch: () => AppDispatch = useDispatch
export const useAppSelector: TypedUseSelectorHook<RootState> = useSelector
```
```typescript
// store/slices/userSlice.ts
import { createSlice, createAsyncThunk, PayloadAction } from '@reduxjs/toolkit'
interface User {
id: string
email: string
name: string
}
interface UserState {
current: User | null
status: 'idle' | 'loading' | 'succeeded' | 'failed'
error: string | null
}
const initialState: UserState = {
current: null,
status: 'idle',
error: null,
}
export const fetchUser = createAsyncThunk(
'user/fetchUser',
async (userId: string, { rejectWithValue }) => {
try {
const response = await fetch(`/api/users/${userId}`)
if (!response.ok) throw new Error('Failed to fetch user')
return await response.json()
} catch (error) {
return rejectWithValue((error as Error).message)
}
}
)
const userSlice = createSlice({
name: 'user',
initialState,
reducers: {
setUser: (state, action: PayloadAction<User>) => {
state.current = action.payload
state.status = 'succeeded'
},
clearUser: (state) => {
state.current = null
state.status = 'idle'
},
},
extraReducers: (builder) => {
builder
.addCase(fetchUser.pending, (state) => {
state.status = 'loading'
state.error = null
})
.addCase(fetchUser.fulfilled, (state, action) => {
state.status = 'succeeded'
state.current = action.payload
})
.addCase(fetchUser.rejected, (state, action) => {
state.status = 'failed'
state.error = action.payload as string
})
},
})
export const { setUser, clearUser } = userSlice.actions
export default userSlice.reducer
```
### Pattern 2: Zustand with Slices (Scalable)
```typescript
// store/slices/createUserSlice.ts
import { StateCreator } from 'zustand'
export interface UserSlice {
user: User | null
isAuthenticated: boolean
login: (credentials: Credentials) => Promise<void>
logout: () => void
}
export const createUserSlice: StateCreator<
UserSlice & CartSlice, // Combined store type
[],
[],
UserSlice
> = (set, get) => ({
user: null,
isAuthenticated: false,
login: async (credentials) => {
const user = await authApi.login(credentials)
set({ user, isAuthenticated: true })
},
logout: () => {
set({ user: null, isAuthenticated: false })
// Can access other slices
// get().clearCart()
},
})
// store/index.ts
import { create } from 'zustand'
import { createUserSlice, UserSlice } from './slices/createUserSlice'
import { createCartSlice, CartSlice } from './slices/createCartSlice'
type StoreState = UserSlice & CartSlice
export const useStore = create<StoreState>()((...args) => ({
...createUserSlice(...args),
...createCartSlice(...args),
}))
// Selective subscriptions (prevents unnecessary re-renders)
export const useUser = () => useStore((state) => state.user)
export const useCart = () => useStore((state) => state.cart)
```
### Pattern 3: Jotai for Atomic State
```typescript
// atoms/userAtoms.ts
import { atom } from 'jotai'
import { atomWithStorage } from 'jotai/utils'
// Basic atom
export const userAtom = atom<User | null>(null)
// Derived atom (computed)
export const isAuthenticatedAtom = atom((get) => get(userAtom) !== null)
// Atom with localStorage persistence
export const themeAtom = atomWithStorage<'light' | 'dark'>('theme', 'light')
// Async atom
export const userProfileAtom = atom(async (get) => {
const user = get(userAtom)
if (!user) return null
const response = await fetch(`/api/users/${user.id}/profile`)
return response.json()
})
// Write-only atom (action)
export const logoutAtom = atom(null, (get, set) => {
set(userAtom, null)
set(cartAtom, [])
localStorage.removeItem('token')
})
// Usage
function Profile() {
const [user] = useAtom(userAtom)
const [, logout] = useAtom(logoutAtom)
const [profile] = useAtom(userProfileAtom) // Suspense-enabled
return (
<Suspense fallback={<Skeleton />}>
<ProfileContent profile={profile} onLogout={logout} />
</Suspense>
)
}
```
### Pattern 4: React Query for Server State
```typescript
// hooks/useUsers.ts
import { useQuery, useMutation, useQueryClient } from '@tanstack/react-query'
// Query keys factory
export const userKeys = {
all: ['users'] as const,
lists: () => [...userKeys.all, 'list'] as const,
list: (filters: UserFilters) => [...userKeys.lists(), filters] as const,
details: () => [...userKeys.all, 'detail'] as const,
detail: (id: string) => [...userKeys.details(), id] as const,
}
// Fetch hook
export function useUsers(filters: UserFilters) {
return useQuery({
queryKey: userKeys.list(filters),
queryFn: () => fetchUsers(filters),
staleTime: 5 * 60 * 1000, // 5 minutes
gcTime: 30 * 60 * 1000, // 30 minutes (formerly cacheTime)
})
}
// Single user hook
export function useUser(id: string) {
return useQuery({
queryKey: userKeys.detail(id),
queryFn: () => fetchUser(id),
enabled: !!id, // Don't fetch if no id
})
}
// Mutation with optimistic update
export function useUpdateUser() {
const queryClient = useQueryClient()
return useMutation({
mutationFn: updateUser,
onMutate: async (newUser) => {
// Cancel outgoing refetches
await queryClient.cancelQueries({ queryKey: userKeys.detail(newUser.id) })
// Snapshot previous value
const previousUser = queryClient.getQueryData(userKeys.detail(newUser.id))
// Optimistically update
queryClient.setQueryData(userKeys.detail(newUser.id), newUser)
return { previousUser }
},
onError: (err, newUser, context) => {
// Rollback on error
queryClient.setQueryData(
userKeys.detail(newUser.id),
context?.previousUser
)
},
onSettled: (data, error, variables) => {
// Refetch after mutation
queryClient.invalidateQueries({ queryKey: userKeys.detail(variables.id) })
},
})
}
```
### Pattern 5: Combining Client + Server State
```typescript
// Zustand for client state
const useUIStore = create<UIState>((set) => ({
sidebarOpen: true,
modal: null,
toggleSidebar: () => set((s) => ({ sidebarOpen: !s.sidebarOpen })),
openModal: (modal) => set({ modal }),
closeModal: () => set({ modal: null }),
}))
// React Query for server state
function Dashboard() {
const { sidebarOpen, toggleSidebar } = useUIStore()
const { data: users, isLoading } = useUsers({ active: true })
const { data: stats } = useStats()
if (isLoading) return <DashboardSkeleton />
return (
<div className={sidebarOpen ? 'with-sidebar' : ''}>
<Sidebar open={sidebarOpen} onToggle={toggleSidebar} />
<main>
<StatsCards stats={stats} />
<UserTable users={users} />
</main>
</div>
)
}
```
## Best Practices
### Do's
- **Colocate state** - Keep state as close to where it's used as possible
- **Use selectors** - Prevent unnecessary re-renders with selective subscriptions
- **Normalize data** - Flatten nested structures for easier updates
- **Type everything** - Full TypeScript coverage prevents runtime errors
- **Separate concerns** - Server state (React Query) vs client state (Zustand)
### Don'ts
- **Don't over-globalize** - Not everything needs to be in global state
- **Don't duplicate server state** - Let React Query manage it
- **Don't mutate directly** - Always use immutable updates
- **Don't store derived data** - Compute it instead
- **Don't mix paradigms** - Pick one primary solution per category
## Migration Guides
### From Legacy Redux to RTK
```typescript
// Before (legacy Redux)
const ADD_TODO = 'ADD_TODO'
const addTodo = (text) => ({ type: ADD_TODO, payload: text })
function todosReducer(state = [], action) {
switch (action.type) {
case ADD_TODO:
return [...state, { text: action.payload, completed: false }]
default:
return state
}
}
// After (Redux Toolkit)
const todosSlice = createSlice({
name: 'todos',
initialState: [],
reducers: {
addTodo: (state, action: PayloadAction<string>) => {
// Immer allows "mutations"
state.push({ text: action.payload, completed: false })
},
},
})
```
## Resources
- [Redux Toolkit Documentation](https://redux-toolkit.js.org/)
- [Zustand GitHub](https://github.com/pmndrs/zustand)
- [Jotai Documentation](https://jotai.org/)
- [TanStack Query](https://tanstack.com/query)

View File

@@ -0,0 +1,666 @@
---
name: tailwind-design-system
description: Build scalable design systems with Tailwind CSS, design tokens, component libraries, and responsive patterns. Use when creating component libraries, implementing design systems, or standardizing UI patterns.
---
# Tailwind Design System
Build production-ready design systems with Tailwind CSS, including design tokens, component variants, responsive patterns, and accessibility.
## When to Use This Skill
- Creating a component library with Tailwind
- Implementing design tokens and theming
- Building responsive and accessible components
- Standardizing UI patterns across a codebase
- Migrating to or extending Tailwind CSS
- Setting up dark mode and color schemes
## Core Concepts
### 1. Design Token Hierarchy
```
Brand Tokens (abstract)
└── Semantic Tokens (purpose)
└── Component Tokens (specific)
Example:
blue-500 → primary → button-bg
```
### 2. Component Architecture
```
Base styles → Variants → Sizes → States → Overrides
```
## Quick Start
```typescript
// tailwind.config.ts
import type { Config } from 'tailwindcss'
const config: Config = {
content: ['./src/**/*.{js,ts,jsx,tsx,mdx}'],
darkMode: 'class',
theme: {
extend: {
colors: {
// Semantic color tokens
primary: {
DEFAULT: 'hsl(var(--primary))',
foreground: 'hsl(var(--primary-foreground))',
},
secondary: {
DEFAULT: 'hsl(var(--secondary))',
foreground: 'hsl(var(--secondary-foreground))',
},
destructive: {
DEFAULT: 'hsl(var(--destructive))',
foreground: 'hsl(var(--destructive-foreground))',
},
muted: {
DEFAULT: 'hsl(var(--muted))',
foreground: 'hsl(var(--muted-foreground))',
},
accent: {
DEFAULT: 'hsl(var(--accent))',
foreground: 'hsl(var(--accent-foreground))',
},
background: 'hsl(var(--background))',
foreground: 'hsl(var(--foreground))',
border: 'hsl(var(--border))',
ring: 'hsl(var(--ring))',
},
borderRadius: {
lg: 'var(--radius)',
md: 'calc(var(--radius) - 2px)',
sm: 'calc(var(--radius) - 4px)',
},
},
},
plugins: [require('tailwindcss-animate')],
}
export default config
```
```css
/* globals.css */
@tailwind base;
@tailwind components;
@tailwind utilities;
@layer base {
:root {
--background: 0 0% 100%;
--foreground: 222.2 84% 4.9%;
--primary: 222.2 47.4% 11.2%;
--primary-foreground: 210 40% 98%;
--secondary: 210 40% 96.1%;
--secondary-foreground: 222.2 47.4% 11.2%;
--muted: 210 40% 96.1%;
--muted-foreground: 215.4 16.3% 46.9%;
--accent: 210 40% 96.1%;
--accent-foreground: 222.2 47.4% 11.2%;
--destructive: 0 84.2% 60.2%;
--destructive-foreground: 210 40% 98%;
--border: 214.3 31.8% 91.4%;
--ring: 222.2 84% 4.9%;
--radius: 0.5rem;
}
.dark {
--background: 222.2 84% 4.9%;
--foreground: 210 40% 98%;
--primary: 210 40% 98%;
--primary-foreground: 222.2 47.4% 11.2%;
--secondary: 217.2 32.6% 17.5%;
--secondary-foreground: 210 40% 98%;
--muted: 217.2 32.6% 17.5%;
--muted-foreground: 215 20.2% 65.1%;
--accent: 217.2 32.6% 17.5%;
--accent-foreground: 210 40% 98%;
--destructive: 0 62.8% 30.6%;
--destructive-foreground: 210 40% 98%;
--border: 217.2 32.6% 17.5%;
--ring: 212.7 26.8% 83.9%;
}
}
```
## Patterns
### Pattern 1: CVA (Class Variance Authority) Components
```typescript
// components/ui/button.tsx
import { cva, type VariantProps } from 'class-variance-authority'
import { forwardRef } from 'react'
import { Slot } from '@radix-ui/react-slot'
import { cn } from '@/lib/utils'
const buttonVariants = cva(
// Base styles
'inline-flex items-center justify-center whitespace-nowrap rounded-md text-sm font-medium ring-offset-background transition-colors focus-visible:outline-none focus-visible:ring-2 focus-visible:ring-ring focus-visible:ring-offset-2 disabled:pointer-events-none disabled:opacity-50',
{
variants: {
variant: {
default: 'bg-primary text-primary-foreground hover:bg-primary/90',
destructive: 'bg-destructive text-destructive-foreground hover:bg-destructive/90',
outline: 'border border-input bg-background hover:bg-accent hover:text-accent-foreground',
secondary: 'bg-secondary text-secondary-foreground hover:bg-secondary/80',
ghost: 'hover:bg-accent hover:text-accent-foreground',
link: 'text-primary underline-offset-4 hover:underline',
},
size: {
default: 'h-10 px-4 py-2',
sm: 'h-9 rounded-md px-3',
lg: 'h-11 rounded-md px-8',
icon: 'h-10 w-10',
},
},
defaultVariants: {
variant: 'default',
size: 'default',
},
}
)
export interface ButtonProps
extends React.ButtonHTMLAttributes<HTMLButtonElement>,
VariantProps<typeof buttonVariants> {
asChild?: boolean
}
const Button = forwardRef<HTMLButtonElement, ButtonProps>(
({ className, variant, size, asChild = false, ...props }, ref) => {
const Comp = asChild ? Slot : 'button'
return (
<Comp
className={cn(buttonVariants({ variant, size, className }))}
ref={ref}
{...props}
/>
)
}
)
Button.displayName = 'Button'
export { Button, buttonVariants }
// Usage
<Button variant="destructive" size="lg">Delete</Button>
<Button variant="outline">Cancel</Button>
<Button asChild><Link href="/home">Home</Link></Button>
```
### Pattern 2: Compound Components
```typescript
// components/ui/card.tsx
import { cn } from '@/lib/utils'
import { forwardRef } from 'react'
const Card = forwardRef<HTMLDivElement, React.HTMLAttributes<HTMLDivElement>>(
({ className, ...props }, ref) => (
<div
ref={ref}
className={cn(
'rounded-lg border bg-card text-card-foreground shadow-sm',
className
)}
{...props}
/>
)
)
Card.displayName = 'Card'
const CardHeader = forwardRef<HTMLDivElement, React.HTMLAttributes<HTMLDivElement>>(
({ className, ...props }, ref) => (
<div
ref={ref}
className={cn('flex flex-col space-y-1.5 p-6', className)}
{...props}
/>
)
)
CardHeader.displayName = 'CardHeader'
const CardTitle = forwardRef<HTMLHeadingElement, React.HTMLAttributes<HTMLHeadingElement>>(
({ className, ...props }, ref) => (
<h3
ref={ref}
className={cn('text-2xl font-semibold leading-none tracking-tight', className)}
{...props}
/>
)
)
CardTitle.displayName = 'CardTitle'
const CardDescription = forwardRef<HTMLParagraphElement, React.HTMLAttributes<HTMLParagraphElement>>(
({ className, ...props }, ref) => (
<p
ref={ref}
className={cn('text-sm text-muted-foreground', className)}
{...props}
/>
)
)
CardDescription.displayName = 'CardDescription'
const CardContent = forwardRef<HTMLDivElement, React.HTMLAttributes<HTMLDivElement>>(
({ className, ...props }, ref) => (
<div ref={ref} className={cn('p-6 pt-0', className)} {...props} />
)
)
CardContent.displayName = 'CardContent'
const CardFooter = forwardRef<HTMLDivElement, React.HTMLAttributes<HTMLDivElement>>(
({ className, ...props }, ref) => (
<div
ref={ref}
className={cn('flex items-center p-6 pt-0', className)}
{...props}
/>
)
)
CardFooter.displayName = 'CardFooter'
export { Card, CardHeader, CardTitle, CardDescription, CardContent, CardFooter }
// Usage
<Card>
<CardHeader>
<CardTitle>Account</CardTitle>
<CardDescription>Manage your account settings</CardDescription>
</CardHeader>
<CardContent>
<form>...</form>
</CardContent>
<CardFooter>
<Button>Save</Button>
</CardFooter>
</Card>
```
### Pattern 3: Form Components
```typescript
// components/ui/input.tsx
import { forwardRef } from 'react'
import { cn } from '@/lib/utils'
export interface InputProps extends React.InputHTMLAttributes<HTMLInputElement> {
error?: string
}
const Input = forwardRef<HTMLInputElement, InputProps>(
({ className, type, error, ...props }, ref) => {
return (
<div className="relative">
<input
type={type}
className={cn(
'flex h-10 w-full rounded-md border border-input bg-background px-3 py-2 text-sm ring-offset-background file:border-0 file:bg-transparent file:text-sm file:font-medium placeholder:text-muted-foreground focus-visible:outline-none focus-visible:ring-2 focus-visible:ring-ring focus-visible:ring-offset-2 disabled:cursor-not-allowed disabled:opacity-50',
error && 'border-destructive focus-visible:ring-destructive',
className
)}
ref={ref}
aria-invalid={!!error}
aria-describedby={error ? `${props.id}-error` : undefined}
{...props}
/>
{error && (
<p
id={`${props.id}-error`}
className="mt-1 text-sm text-destructive"
role="alert"
>
{error}
</p>
)}
</div>
)
}
)
Input.displayName = 'Input'
// components/ui/label.tsx
import { forwardRef } from 'react'
import { cva } from 'class-variance-authority'
import { cn } from '@/lib/utils'
const labelVariants = cva(
'text-sm font-medium leading-none peer-disabled:cursor-not-allowed peer-disabled:opacity-70'
)
const Label = forwardRef<HTMLLabelElement, React.LabelHTMLAttributes<HTMLLabelElement>>(
({ className, ...props }, ref) => (
<label ref={ref} className={cn(labelVariants(), className)} {...props} />
)
)
Label.displayName = 'Label'
// Usage with React Hook Form
import { useForm } from 'react-hook-form'
import { zodResolver } from '@hookform/resolvers/zod'
import * as z from 'zod'
const schema = z.object({
email: z.string().email('Invalid email address'),
password: z.string().min(8, 'Password must be at least 8 characters'),
})
function LoginForm() {
const { register, handleSubmit, formState: { errors } } = useForm({
resolver: zodResolver(schema),
})
const onSubmit = (data: z.infer<typeof schema>) => {
// Submit credentials to your auth endpoint here
}
return (
<form onSubmit={handleSubmit(onSubmit)} className="space-y-4">
<div className="space-y-2">
<Label htmlFor="email">Email</Label>
<Input
id="email"
type="email"
{...register('email')}
error={errors.email?.message}
/>
</div>
<div className="space-y-2">
<Label htmlFor="password">Password</Label>
<Input
id="password"
type="password"
{...register('password')}
error={errors.password?.message}
/>
</div>
<Button type="submit" className="w-full">Sign In</Button>
</form>
)
}
```
### Pattern 4: Responsive Grid System
```typescript
// components/ui/grid.tsx
import { cn } from '@/lib/utils'
import { cva, type VariantProps } from 'class-variance-authority'
const gridVariants = cva('grid', {
variants: {
cols: {
1: 'grid-cols-1',
2: 'grid-cols-1 sm:grid-cols-2',
3: 'grid-cols-1 sm:grid-cols-2 lg:grid-cols-3',
4: 'grid-cols-1 sm:grid-cols-2 lg:grid-cols-4',
5: 'grid-cols-2 sm:grid-cols-3 lg:grid-cols-5',
6: 'grid-cols-2 sm:grid-cols-3 lg:grid-cols-6',
},
gap: {
none: 'gap-0',
sm: 'gap-2',
md: 'gap-4',
lg: 'gap-6',
xl: 'gap-8',
},
},
defaultVariants: {
cols: 3,
gap: 'md',
},
})
interface GridProps
extends React.HTMLAttributes<HTMLDivElement>,
VariantProps<typeof gridVariants> {}
export function Grid({ className, cols, gap, ...props }: GridProps) {
return (
<div className={cn(gridVariants({ cols, gap, className }))} {...props} />
)
}
// Container component
const containerVariants = cva('mx-auto w-full px-4 sm:px-6 lg:px-8', {
variants: {
size: {
sm: 'max-w-screen-sm',
md: 'max-w-screen-md',
lg: 'max-w-screen-lg',
xl: 'max-w-screen-xl',
'2xl': 'max-w-screen-2xl',
full: 'max-w-full',
},
},
defaultVariants: {
size: 'xl',
},
})
interface ContainerProps
extends React.HTMLAttributes<HTMLDivElement>,
VariantProps<typeof containerVariants> {}
export function Container({ className, size, ...props }: ContainerProps) {
return (
<div className={cn(containerVariants({ size, className }))} {...props} />
)
}
// Usage
<Container>
<Grid cols={4} gap="lg">
{products.map((product) => (
<ProductCard key={product.id} product={product} />
))}
</Grid>
</Container>
```
### Pattern 5: Animation Utilities
```typescript
// lib/animations.ts - Tailwind CSS Animate utilities
import { cn } from './utils'
export const fadeIn = 'animate-in fade-in duration-300'
export const fadeOut = 'animate-out fade-out duration-300'
export const slideInFromTop = 'animate-in slide-in-from-top duration-300'
export const slideInFromBottom = 'animate-in slide-in-from-bottom duration-300'
export const slideInFromLeft = 'animate-in slide-in-from-left duration-300'
export const slideInFromRight = 'animate-in slide-in-from-right duration-300'
export const zoomIn = 'animate-in zoom-in-95 duration-300'
export const zoomOut = 'animate-out zoom-out-95 duration-300'
// Compound animations
export const modalEnter = cn(fadeIn, zoomIn, 'duration-200')
export const modalExit = cn(fadeOut, zoomOut, 'duration-200')
export const dropdownEnter = cn(fadeIn, slideInFromTop, 'duration-150')
export const dropdownExit = cn(fadeOut, 'slide-out-to-top', 'duration-150')
// components/ui/dialog.tsx
import { forwardRef } from 'react'
import * as DialogPrimitive from '@radix-ui/react-dialog'
const DialogPortal = DialogPrimitive.Portal
const DialogOverlay = forwardRef<
React.ElementRef<typeof DialogPrimitive.Overlay>,
React.ComponentPropsWithoutRef<typeof DialogPrimitive.Overlay>
>(({ className, ...props }, ref) => (
<DialogPrimitive.Overlay
ref={ref}
className={cn(
'fixed inset-0 z-50 bg-black/80',
'data-[state=open]:animate-in data-[state=closed]:animate-out',
'data-[state=closed]:fade-out-0 data-[state=open]:fade-in-0',
className
)}
{...props}
/>
))
const DialogContent = forwardRef<
React.ElementRef<typeof DialogPrimitive.Content>,
React.ComponentPropsWithoutRef<typeof DialogPrimitive.Content>
>(({ className, children, ...props }, ref) => (
<DialogPortal>
<DialogOverlay />
<DialogPrimitive.Content
ref={ref}
className={cn(
'fixed left-[50%] top-[50%] z-50 grid w-full max-w-lg translate-x-[-50%] translate-y-[-50%] gap-4 border bg-background p-6 shadow-lg',
'data-[state=open]:animate-in data-[state=closed]:animate-out',
'data-[state=closed]:fade-out-0 data-[state=open]:fade-in-0',
'data-[state=closed]:zoom-out-95 data-[state=open]:zoom-in-95',
'data-[state=closed]:slide-out-to-left-1/2 data-[state=closed]:slide-out-to-top-[48%]',
'data-[state=open]:slide-in-from-left-1/2 data-[state=open]:slide-in-from-top-[48%]',
'sm:rounded-lg',
className
)}
{...props}
>
{children}
</DialogPrimitive.Content>
</DialogPortal>
))
```
### Pattern 6: Dark Mode Implementation
```typescript
// providers/ThemeProvider.tsx
'use client'
import { createContext, useContext, useEffect, useState } from 'react'
type Theme = 'dark' | 'light' | 'system'
interface ThemeProviderProps {
children: React.ReactNode
defaultTheme?: Theme
storageKey?: string
}
interface ThemeContextType {
theme: Theme
setTheme: (theme: Theme) => void
resolvedTheme: 'dark' | 'light'
}
const ThemeContext = createContext<ThemeContextType | undefined>(undefined)
export function ThemeProvider({
children,
defaultTheme = 'system',
storageKey = 'theme',
}: ThemeProviderProps) {
const [theme, setTheme] = useState<Theme>(defaultTheme)
const [resolvedTheme, setResolvedTheme] = useState<'dark' | 'light'>('light')
useEffect(() => {
const stored = localStorage.getItem(storageKey) as Theme | null
if (stored) setTheme(stored)
}, [storageKey])
useEffect(() => {
const root = window.document.documentElement
root.classList.remove('light', 'dark')
let resolved: 'dark' | 'light'
if (theme === 'system') {
resolved = window.matchMedia('(prefers-color-scheme: dark)').matches
? 'dark'
: 'light'
} else {
resolved = theme
}
root.classList.add(resolved)
setResolvedTheme(resolved)
}, [theme])
const value = {
theme,
setTheme: (newTheme: Theme) => {
localStorage.setItem(storageKey, newTheme)
setTheme(newTheme)
},
resolvedTheme,
}
return (
<ThemeContext.Provider value={value}>{children}</ThemeContext.Provider>
)
}
export const useTheme = () => {
const context = useContext(ThemeContext)
if (!context) throw new Error('useTheme must be used within ThemeProvider')
return context
}
// components/ThemeToggle.tsx
import { Moon, Sun } from 'lucide-react'
import { useTheme } from '@/providers/ThemeProvider'
export function ThemeToggle() {
const { resolvedTheme, setTheme } = useTheme()
return (
<Button
variant="ghost"
size="icon"
onClick={() => setTheme(resolvedTheme === 'dark' ? 'light' : 'dark')}
>
<Sun className="h-5 w-5 rotate-0 scale-100 transition-all dark:-rotate-90 dark:scale-0" />
<Moon className="absolute h-5 w-5 rotate-90 scale-0 transition-all dark:rotate-0 dark:scale-100" />
<span className="sr-only">Toggle theme</span>
</Button>
)
}
```
## Utility Functions
```typescript
// lib/utils.ts
import { type ClassValue, clsx } from 'clsx'
import { twMerge } from 'tailwind-merge'
export function cn(...inputs: ClassValue[]) {
return twMerge(clsx(inputs))
}
// Focus ring utility
export const focusRing = cn(
'focus-visible:outline-none focus-visible:ring-2',
'focus-visible:ring-ring focus-visible:ring-offset-2'
)
// Disabled utility
export const disabled = 'disabled:pointer-events-none disabled:opacity-50'
```
## Best Practices
### Do's
- **Use CSS variables** - Enable runtime theming
- **Compose with CVA** - Type-safe variants
- **Use semantic colors** - `primary` not `blue-500`
- **Forward refs** - Enable composition
- **Add accessibility** - ARIA attributes, focus states
### Don'ts
- **Don't use arbitrary values** - Extend theme instead
- **Don't nest @apply** - Hurts readability
- **Don't skip focus states** - Keyboard users need them
- **Don't hardcode colors** - Use semantic tokens
- **Don't forget dark mode** - Test both themes
## Resources
- [Tailwind CSS Documentation](https://tailwindcss.com/docs)
- [CVA Documentation](https://cva.style/docs)
- [shadcn/ui](https://ui.shadcn.com/)
- [Radix Primitives](https://www.radix-ui.com/primitives)

View File

@@ -0,0 +1,805 @@
---
name: godot-gdscript-patterns
description: Master Godot 4 GDScript patterns including signals, scenes, state machines, and optimization. Use when building Godot games, implementing game systems, or learning GDScript best practices.
---
# Godot GDScript Patterns
Production patterns for Godot 4.x game development with GDScript, covering architecture, signals, scenes, and optimization.
## When to Use This Skill
- Building games with Godot 4
- Implementing game systems in GDScript
- Designing scene architecture
- Managing game state
- Optimizing GDScript performance
- Learning Godot best practices
## Core Concepts
### 1. Godot Architecture
```
Node: Base building block
├── Scene: Reusable node tree (saved as .tscn)
├── Resource: Data container (saved as .tres)
├── Signal: Event communication
└── Group: Node categorization
```
### 2. GDScript Basics
```gdscript
class_name Player
extends CharacterBody2D
# Signals
signal health_changed(new_health: int)
signal died
# Exports (Inspector-editable)
@export var speed: float = 200.0
@export var max_health: int = 100
@export_range(0, 1) var damage_reduction: float = 0.0
@export_group("Combat")
@export var attack_damage: int = 10
@export var attack_cooldown: float = 0.5
# Onready (initialized when ready)
@onready var sprite: Sprite2D = $Sprite2D
@onready var animation: AnimationPlayer = $AnimationPlayer
@onready var hitbox: Area2D = $Hitbox
# Private variables (convention: underscore prefix)
var _health: int
var _can_attack: bool = true
func _ready() -> void:
_health = max_health
func _physics_process(delta: float) -> void:
var direction := Input.get_vector("left", "right", "up", "down")
velocity = direction * speed
move_and_slide()
func take_damage(amount: int) -> void:
var actual_damage := int(amount * (1.0 - damage_reduction))
_health = max(_health - actual_damage, 0)
health_changed.emit(_health)
if _health <= 0:
died.emit()
```
## Patterns
### Pattern 1: State Machine
```gdscript
# state_machine.gd
class_name StateMachine
extends Node
signal state_changed(from_state: StringName, to_state: StringName)
@export var initial_state: State
var current_state: State
var states: Dictionary = {}
func _ready() -> void:
# Register all State children
for child in get_children():
if child is State:
states[child.name] = child
child.state_machine = self
child.process_mode = Node.PROCESS_MODE_DISABLED
# Start initial state
if initial_state:
current_state = initial_state
current_state.process_mode = Node.PROCESS_MODE_INHERIT
current_state.enter()
func _process(delta: float) -> void:
if current_state:
current_state.update(delta)
func _physics_process(delta: float) -> void:
if current_state:
current_state.physics_update(delta)
func _unhandled_input(event: InputEvent) -> void:
if current_state:
current_state.handle_input(event)
func transition_to(state_name: StringName, msg: Dictionary = {}) -> void:
if not states.has(state_name):
push_error("State '%s' not found" % state_name)
return
var previous_state := current_state
previous_state.exit()
previous_state.process_mode = Node.PROCESS_MODE_DISABLED
current_state = states[state_name]
current_state.process_mode = Node.PROCESS_MODE_INHERIT
current_state.enter(msg)
state_changed.emit(previous_state.name, current_state.name)
```
```gdscript
# state.gd
class_name State
extends Node
var state_machine: StateMachine
func enter(_msg: Dictionary = {}) -> void:
pass
func exit() -> void:
pass
func update(_delta: float) -> void:
pass
func physics_update(_delta: float) -> void:
pass
func handle_input(_event: InputEvent) -> void:
pass
```
```gdscript
# player_idle.gd
class_name PlayerIdle
extends State
@export var player: Player
func enter(_msg: Dictionary = {}) -> void:
player.animation.play("idle")
func physics_update(_delta: float) -> void:
var direction := Input.get_vector("left", "right", "up", "down")
if direction != Vector2.ZERO:
state_machine.transition_to("Move")
func handle_input(event: InputEvent) -> void:
if event.is_action_pressed("attack"):
state_machine.transition_to("Attack")
elif event.is_action_pressed("jump"):
state_machine.transition_to("Jump")
```
### Pattern 2: Autoload Singletons
```gdscript
# game_manager.gd (Add to Project Settings > Autoload)
extends Node
signal game_started
signal game_paused(is_paused: bool)
signal game_over(won: bool)
signal score_changed(new_score: int)
enum GameState { MENU, PLAYING, PAUSED, GAME_OVER }
var state: GameState = GameState.MENU
var score: int = 0:
set(value):
score = value
score_changed.emit(score)
var high_score: int = 0
func _ready() -> void:
process_mode = Node.PROCESS_MODE_ALWAYS
_load_high_score()
func _input(event: InputEvent) -> void:
if event.is_action_pressed("pause") and state == GameState.PLAYING:
toggle_pause()
func start_game() -> void:
score = 0
state = GameState.PLAYING
game_started.emit()
func toggle_pause() -> void:
var is_paused := state != GameState.PAUSED
if is_paused:
state = GameState.PAUSED
get_tree().paused = true
else:
state = GameState.PLAYING
get_tree().paused = false
game_paused.emit(is_paused)
func end_game(won: bool) -> void:
state = GameState.GAME_OVER
if score > high_score:
high_score = score
_save_high_score()
game_over.emit(won)
func add_score(points: int) -> void:
score += points
func _load_high_score() -> void:
if FileAccess.file_exists("user://high_score.save"):
var file := FileAccess.open("user://high_score.save", FileAccess.READ)
high_score = file.get_32()
func _save_high_score() -> void:
var file := FileAccess.open("user://high_score.save", FileAccess.WRITE)
file.store_32(high_score)
```
```gdscript
# event_bus.gd (Global signal bus)
extends Node
# Player events
signal player_spawned(player: Node2D)
signal player_died(player: Node2D)
signal player_health_changed(health: int, max_health: int)
# Enemy events
signal enemy_spawned(enemy: Node2D)
signal enemy_died(enemy: Node2D, position: Vector2)
# Item events
signal item_collected(item_type: StringName, value: int)
signal powerup_activated(powerup_type: StringName)
# Level events
signal level_started(level_number: int)
signal level_completed(level_number: int, time: float)
signal checkpoint_reached(checkpoint_id: int)
```
### Pattern 3: Resource-based Data
```gdscript
# weapon_data.gd
class_name WeaponData
extends Resource
@export var name: StringName
@export var damage: int
@export var attack_speed: float
@export var attack_range: float  # "range" would shadow GDScript's built-in range() and trigger a warning
@export_multiline var description: String
@export var icon: Texture2D
@export var projectile_scene: PackedScene
@export var sound_attack: AudioStream
```
```gdscript
# character_stats.gd
class_name CharacterStats
extends Resource
signal stat_changed(stat_name: StringName, new_value: float)
@export var max_health: float = 100.0
@export var attack: float = 10.0
@export var defense: float = 5.0
@export var speed: float = 200.0
# Runtime values (not saved)
var _current_health: float
func _init() -> void:
_current_health = max_health
func get_current_health() -> float:
return _current_health
func take_damage(amount: float) -> float:
var actual_damage := maxf(amount - defense, 1.0)
_current_health = maxf(_current_health - actual_damage, 0.0)
stat_changed.emit("health", _current_health)
return actual_damage
func heal(amount: float) -> void:
_current_health = minf(_current_health + amount, max_health)
stat_changed.emit("health", _current_health)
func duplicate_for_runtime() -> CharacterStats:
var copy := duplicate() as CharacterStats
copy._current_health = copy.max_health
return copy
```
```gdscript
# Using resources
class_name Character
extends CharacterBody2D
@export var base_stats: CharacterStats
@export var weapon: WeaponData
var stats: CharacterStats
func _ready() -> void:
# Create runtime copy to avoid modifying the resource
stats = base_stats.duplicate_for_runtime()
stats.stat_changed.connect(_on_stat_changed)
func attack() -> void:
if weapon:
print("Attacking with %s for %d damage" % [weapon.name, weapon.damage])
func _on_stat_changed(stat_name: StringName, value: float) -> void:
if stat_name == "health" and value <= 0:
die()
```
### Pattern 4: Object Pooling
```gdscript
# object_pool.gd
class_name ObjectPool
extends Node
@export var pooled_scene: PackedScene
@export var initial_size: int = 10
@export var can_grow: bool = true
var _available: Array[Node] = []
var _in_use: Array[Node] = []
func _ready() -> void:
_initialize_pool()
func _initialize_pool() -> void:
for i in initial_size:
_create_instance()
func _create_instance() -> Node:
var instance := pooled_scene.instantiate()
instance.process_mode = Node.PROCESS_MODE_DISABLED
instance.visible = false
add_child(instance)
_available.append(instance)
# Connect return signal if exists
if instance.has_signal("returned_to_pool"):
instance.returned_to_pool.connect(_return_to_pool.bind(instance))
return instance
func get_instance() -> Node:
var instance: Node
if _available.is_empty():
if can_grow:
instance = _create_instance()
_available.erase(instance)
else:
push_warning("Pool exhausted and cannot grow")
return null
else:
instance = _available.pop_back()
instance.process_mode = Node.PROCESS_MODE_INHERIT
instance.visible = true
_in_use.append(instance)
if instance.has_method("on_spawn"):
instance.on_spawn()
return instance
func _return_to_pool(instance: Node) -> void:
if not instance in _in_use:
return
_in_use.erase(instance)
if instance.has_method("on_despawn"):
instance.on_despawn()
instance.process_mode = Node.PROCESS_MODE_DISABLED
instance.visible = false
_available.append(instance)
func return_all() -> void:
for instance in _in_use.duplicate():
_return_to_pool(instance)
```
```gdscript
# pooled_bullet.gd
class_name PooledBullet
extends Area2D
signal returned_to_pool
@export var speed: float = 500.0
@export var lifetime: float = 5.0
var direction: Vector2
var _timer: float
func on_spawn() -> void:
_timer = lifetime
func on_despawn() -> void:
direction = Vector2.ZERO
func initialize(pos: Vector2, dir: Vector2) -> void:
global_position = pos
direction = dir.normalized()
rotation = direction.angle()
func _physics_process(delta: float) -> void:
position += direction * speed * delta
_timer -= delta
if _timer <= 0:
returned_to_pool.emit()
func _on_body_entered(body: Node2D) -> void:
if body.has_method("take_damage"):
body.take_damage(10)
returned_to_pool.emit()
```
### Pattern 5: Component System
```gdscript
# health_component.gd
class_name HealthComponent
extends Node
signal health_changed(current: int, maximum: int)
signal damaged(amount: int, source: Node)
signal healed(amount: int)
signal died
@export var max_health: int = 100
@export var invincibility_time: float = 0.0
var current_health: int:
set(value):
var old := current_health
current_health = clampi(value, 0, max_health)
if current_health != old:
health_changed.emit(current_health, max_health)
var _invincible: bool = false
func _ready() -> void:
current_health = max_health
func take_damage(amount: int, source: Node = null) -> int:
if _invincible or current_health <= 0:
return 0
var actual := mini(amount, current_health)
current_health -= actual
damaged.emit(actual, source)
if current_health <= 0:
died.emit()
elif invincibility_time > 0:
_start_invincibility()
return actual
func heal(amount: int) -> int:
var actual := mini(amount, max_health - current_health)
current_health += actual
if actual > 0:
healed.emit(actual)
return actual
func _start_invincibility() -> void:
_invincible = true
await get_tree().create_timer(invincibility_time).timeout
_invincible = false
```
```gdscript
# hitbox_component.gd
class_name HitboxComponent
extends Area2D
signal hit(hurtbox: HurtboxComponent)
@export var damage: int = 10
@export var knockback_force: float = 200.0
var owner_node: Node
func _ready() -> void:
owner_node = get_parent()
area_entered.connect(_on_area_entered)
func _on_area_entered(area: Area2D) -> void:
if area is HurtboxComponent:
var hurtbox := area as HurtboxComponent
if hurtbox.owner_node != owner_node:
hit.emit(hurtbox)
hurtbox.receive_hit(self)
```
```gdscript
# hurtbox_component.gd
class_name HurtboxComponent
extends Area2D
signal hurt(hitbox: HitboxComponent)
@export var health_component: HealthComponent
var owner_node: Node
func _ready() -> void:
owner_node = get_parent()
func receive_hit(hitbox: HitboxComponent) -> void:
hurt.emit(hitbox)
if health_component:
health_component.take_damage(hitbox.damage, hitbox.owner_node)
```
### Pattern 6: Scene Management
```gdscript
# scene_manager.gd (Autoload)
extends Node
signal scene_loading_started(scene_path: String)
signal scene_loading_progress(progress: float)
signal scene_loaded(scene: Node)
signal transition_started
signal transition_finished
@export var transition_scene: PackedScene
@export var loading_scene: PackedScene
var _current_scene: Node
var _transition: CanvasLayer
var _loader: ResourceLoader
func _ready() -> void:
_current_scene = get_tree().current_scene
if transition_scene:
_transition = transition_scene.instantiate()
add_child(_transition)
_transition.visible = false
func change_scene(scene_path: String, with_transition: bool = true) -> void:
if with_transition:
await _play_transition_out()
_load_scene(scene_path)
func change_scene_packed(scene: PackedScene, with_transition: bool = true) -> void:
if with_transition:
await _play_transition_out()
_swap_scene(scene.instantiate())
func _load_scene(path: String) -> void:
scene_loading_started.emit(path)
# Check if already loaded
if ResourceLoader.has_cached(path):
var scene := load(path) as PackedScene
_swap_scene(scene.instantiate())
return
# Async loading
ResourceLoader.load_threaded_request(path)
while true:
var progress := []
var status := ResourceLoader.load_threaded_get_status(path, progress)
match status:
ResourceLoader.THREAD_LOAD_IN_PROGRESS:
scene_loading_progress.emit(progress[0])
await get_tree().process_frame
ResourceLoader.THREAD_LOAD_LOADED:
var scene := ResourceLoader.load_threaded_get(path) as PackedScene
_swap_scene(scene.instantiate())
return
_:
push_error("Failed to load scene: %s" % path)
return
func _swap_scene(new_scene: Node) -> void:
if _current_scene:
_current_scene.queue_free()
_current_scene = new_scene
get_tree().root.add_child(_current_scene)
get_tree().current_scene = _current_scene
scene_loaded.emit(_current_scene)
await _play_transition_in()
func _play_transition_out() -> void:
if not _transition:
return
transition_started.emit()
_transition.visible = true
if _transition.has_method("transition_out"):
await _transition.transition_out()
else:
await get_tree().create_timer(0.3).timeout
func _play_transition_in() -> void:
if not _transition:
transition_finished.emit()
return
if _transition.has_method("transition_in"):
await _transition.transition_in()
else:
await get_tree().create_timer(0.3).timeout
_transition.visible = false
transition_finished.emit()
```
### Pattern 7: Save System
```gdscript
# save_manager.gd (Autoload)
extends Node
const SAVE_PATH := "user://savegame.save"
const ENCRYPTION_KEY := "your_secret_key_here" # Example only: never ship a hardcoded key; load or derive it at runtime
signal save_completed
signal load_completed
signal save_error(message: String)
func save_game(data: Dictionary) -> void:
var file := FileAccess.open_encrypted_with_pass(
SAVE_PATH,
FileAccess.WRITE,
ENCRYPTION_KEY
)
if file == null:
save_error.emit("Could not open save file")
return
var json := JSON.stringify(data)
file.store_string(json)
file.close()
save_completed.emit()
func load_game() -> Dictionary:
if not FileAccess.file_exists(SAVE_PATH):
return {}
var file := FileAccess.open_encrypted_with_pass(
SAVE_PATH,
FileAccess.READ,
ENCRYPTION_KEY
)
if file == null:
save_error.emit("Could not open save file")
return {}
var json := file.get_as_text()
file.close()
var parsed := JSON.parse_string(json)
if parsed == null:
save_error.emit("Could not parse save data")
return {}
load_completed.emit()
return parsed
func delete_save() -> void:
if FileAccess.file_exists(SAVE_PATH):
DirAccess.remove_absolute(SAVE_PATH)
func has_save() -> bool:
return FileAccess.file_exists(SAVE_PATH)
```
```gdscript
# saveable.gd (Attach to saveable nodes)
# Marker component: attach as a child of any node that should be persisted.
# Captures the parent's 2D position plus whatever extra fields the parent
# exposes via get_custom_save_data() / load_custom_save_data().
class_name Saveable
extends Node

# Stable identifier used to match saved data back to this node on load.
@export var save_id: String

func _ready() -> void:
    # Default to the node path when no explicit id was set in the editor.
    if save_id.is_empty():
        save_id = str(get_path())

# Builds the serializable snapshot for this node's parent.
func get_save_data() -> Dictionary:
    var parent := get_parent()
    var data := {"id": save_id}
    if parent is Node2D:
        data["position"] = {"x": parent.position.x, "y": parent.position.y}
    # Let the parent append game-specific fields on top of the defaults.
    if parent.has_method("get_custom_save_data"):
        data.merge(parent.get_custom_save_data())
    return data

# Restores the parent's state from a snapshot produced by get_save_data().
func load_save_data(data: Dictionary) -> void:
    var parent := get_parent()
    if data.has("position") and parent is Node2D:
        parent.position = Vector2(data.position.x, data.position.y)
    if parent.has_method("load_custom_save_data"):
        parent.load_custom_save_data(data)
```
## Performance Tips
```gdscript
# 1. Cache node references
@onready var sprite := $Sprite2D # Good
# $Sprite2D in _process() # Bad - repeated lookup
# 2. Use object pooling for frequent spawning
# See Pattern 4
# 3. Avoid allocations in hot paths
var _reusable_array: Array = []
func _process(_delta: float) -> void:
_reusable_array.clear() # Reuse instead of creating new
# 4. Use static typing
func calculate(value: float) -> float: # Good
return value * 2.0
# 5. Disable processing when not needed
func _on_off_screen() -> void:
set_process(false)
set_physics_process(false)
```
## Best Practices
### Do's
- **Use signals for decoupling** - Avoid direct references
- **Type everything** - Static typing catches errors
- **Use resources for data** - Separate data from logic
- **Pool frequently spawned objects** - Avoid GC hitches
- **Use Autoloads sparingly** - Only for truly global systems
### Don'ts
- **Don't use `get_node()` in loops** - Cache references
- **Don't couple scenes tightly** - Use signals
- **Don't put logic in resources** - Keep them data-only
- **Don't ignore the Profiler** - Monitor performance
- **Don't fight the scene tree** - Work with Godot's design
## Resources
- [Godot Documentation](https://docs.godotengine.org/en/stable/)
- [GDQuest Tutorials](https://www.gdquest.com/)
- [Godot Recipes](https://kidscancode.org/godot_recipes/)

View File

@@ -0,0 +1,626 @@
---
name: unity-ecs-patterns
description: Master Unity ECS (Entity Component System) with DOTS, Jobs, and Burst for high-performance game development. Use when building data-oriented games, optimizing performance, or working with large entity counts.
---
# Unity ECS Patterns
Production patterns for Unity's Data-Oriented Technology Stack (DOTS) including Entity Component System, Job System, and Burst Compiler.
## When to Use This Skill
- Building high-performance Unity games
- Managing thousands of entities efficiently
- Implementing data-oriented game systems
- Optimizing CPU-bound game logic
- Converting OOP game code to ECS
- Using Jobs and Burst for parallelization
## Core Concepts
### 1. ECS vs OOP
| Aspect | Traditional OOP | ECS/DOTS |
|--------|-----------------|----------|
| Data layout | Object-oriented | Data-oriented |
| Memory | Scattered | Contiguous |
| Processing | Per-object | Batched |
| Scaling | Poor with count | Linear scaling |
| Best for | Complex behaviors | Mass simulation |
### 2. DOTS Components
```
Entity: Lightweight ID (no data)
Component: Pure data (no behavior)
System: Logic that processes components
World: Container for entities
Archetype: Unique combination of components
Chunk: Memory block for same-archetype entities
```
## Patterns
### Pattern 1: Basic ECS Setup
```csharp
using Unity.Entities;
using Unity.Mathematics;
using Unity.Transforms;
using Unity.Burst;
using Unity.Collections;

// Component: Pure data, no methods
public struct Speed : IComponentData
{
    public float Value;
}

// Current is mutated at runtime; Max is the spawn-time cap.
public struct Health : IComponentData
{
    public float Current;
    public float Max;
}

// Reference to another entity (e.g. the current attack target).
public struct Target : IComponentData
{
    public Entity Value;
}

// Tag component (zero-size marker) — carries no data, used purely for
// query filtering (WithAll / WithNone).
public struct EnemyTag : IComponentData { }
public struct PlayerTag : IComponentData { }

// Buffer component (variable-size array); InternalBufferCapacity is how
// many elements live inline in the chunk before spilling to the heap.
[InternalBufferCapacity(8)]
public struct InventoryItem : IBufferElementData
{
    public int ItemId;
    public int Quantity;
}

// Shared component (grouped entities): entities with the same TeamId value
// are stored together in the same chunks.
public struct TeamId : ISharedComponentData
{
    public int Value;
}
```
### Pattern 2: Systems with ISystem (Recommended)
```csharp
using Unity.Entities;
using Unity.Transforms;
using Unity.Mathematics;
using Unity.Burst;

// ISystem: Unmanaged, Burst-compatible, highest performance
[BurstCompile]
public partial struct MovementSystem : ISystem
{
    [BurstCompile]
    public void OnCreate(ref SystemState state)
    {
        // Require components before system runs
        state.RequireForUpdate<Speed>();
    }

    [BurstCompile]
    public void OnUpdate(ref SystemState state)
    {
        float deltaTime = SystemAPI.Time.DeltaTime;
        // Simple foreach - auto-generates job
        // RefRW = read/write access, RefRO = read-only access; the source
        // generator turns this query into chunk iteration.
        foreach (var (transform, speed) in
            SystemAPI.Query<RefRW<LocalTransform>, RefRO<Speed>>())
        {
            // Move along +Z, scaled by the per-entity Speed value.
            transform.ValueRW.Position +=
                new float3(0, 0, speed.ValueRO.Value * deltaTime);
        }
    }

    [BurstCompile]
    public void OnDestroy(ref SystemState state) { }
}

// With explicit job for more control
[BurstCompile]
public partial struct MovementJobSystem : ISystem
{
    [BurstCompile]
    public void OnUpdate(ref SystemState state)
    {
        var job = new MoveJob
        {
            DeltaTime = SystemAPI.Time.DeltaTime
        };
        // ScheduleParallel spreads chunks across worker threads; chaining
        // through state.Dependency preserves ordering with other systems.
        state.Dependency = job.ScheduleParallel(state.Dependency);
    }
}

// IJobEntity: Execute() runs once per entity matching its parameter signature.
[BurstCompile]
public partial struct MoveJob : IJobEntity
{
    public float DeltaTime;

    // `ref` = writable component, `in` = read-only component.
    void Execute(ref LocalTransform transform, in Speed speed)
    {
        transform.Position += new float3(0, 0, speed.Value * DeltaTime);
    }
}
```
### Pattern 3: Entity Queries
```csharp
// Demonstrates the main ways to build and consume entity queries.
[BurstCompile]
public partial struct QueryExamplesSystem : ISystem
{
    private EntityQuery _enemyQuery;

    public void OnCreate(ref SystemState state)
    {
        // Build query manually for complex cases
        // (`Dead` is assumed to be a tag component defined elsewhere.)
        _enemyQuery = new EntityQueryBuilder(Allocator.Temp)
            .WithAll<EnemyTag, Health, LocalTransform>()
            .WithNone<Dead>()
            .WithOptions(EntityQueryOptions.FilterWriteGroup)
            .Build(ref state);
    }

    [BurstCompile]
    public void OnUpdate(ref SystemState state)
    {
        // SystemAPI.Query - simplest approach
        // WithEntityAccess() appends the Entity id as the last tuple element.
        foreach (var (health, entity) in
            SystemAPI.Query<RefRW<Health>>()
                .WithAll<EnemyTag>()
                .WithEntityAccess())
        {
            if (health.ValueRO.Current <= 0)
            {
                // Mark for destruction
                // Structural changes are deferred through the end-of-simulation
                // ECB; destroying directly here would invalidate the iteration.
                SystemAPI.GetSingleton<EndSimulationEntityCommandBufferSystem.Singleton>()
                    .CreateCommandBuffer(state.WorldUnmanaged)
                    .DestroyEntity(entity);
            }
        }
        // Get count
        int enemyCount = _enemyQuery.CalculateEntityCount();
        // Get all entities
        // (Allocator.Temp allocations are released automatically; no Dispose needed.)
        var enemies = _enemyQuery.ToEntityArray(Allocator.Temp);
        // Get component arrays
        var healths = _enemyQuery.ToComponentDataArray<Health>(Allocator.Temp);
    }
}
```
### Pattern 4: Entity Command Buffers (Structural Changes)
```csharp
// Structural changes (create/destroy/add/remove) require command buffers
[BurstCompile]
[UpdateInGroup(typeof(SimulationSystemGroup))]
public partial struct SpawnSystem : ISystem
{
    [BurstCompile]
    public void OnUpdate(ref SystemState state)
    {
        // Commands recorded here are played back at the BeginSimulation sync point.
        var ecbSingleton = SystemAPI.GetSingleton<BeginSimulationEntityCommandBufferSystem.Singleton>();
        var ecb = ecbSingleton.CreateCommandBuffer(state.WorldUnmanaged);

        foreach (var (spawner, transform) in
            SystemAPI.Query<RefRW<Spawner>, RefRO<LocalTransform>>())
        {
            // Count the timer down; spawn and reset when it elapses.
            spawner.ValueRW.Timer -= SystemAPI.Time.DeltaTime;
            if (spawner.ValueRO.Timer <= 0)
            {
                spawner.ValueRW.Timer = spawner.ValueRO.Interval;
                // Create entity (deferred until sync point)
                Entity newEntity = ecb.Instantiate(spawner.ValueRO.Prefab);
                // Set component values
                ecb.SetComponent(newEntity, new LocalTransform
                {
                    Position = transform.ValueRO.Position,
                    Rotation = quaternion.identity,
                    Scale = 1f
                });
                // Add component
                ecb.AddComponent(newEntity, new Speed { Value = 5f });
            }
        }
    }
}

// Parallel ECB usage
// The sort key passed to each command makes playback order deterministic
// regardless of which worker thread recorded the command first.
[BurstCompile]
public partial struct ParallelSpawnJob : IJobEntity
{
    public EntityCommandBuffer.ParallelWriter ECB;

    void Execute([EntityIndexInQuery] int index, in Spawner spawner)
    {
        Entity e = ECB.Instantiate(index, spawner.Prefab);
        ECB.AddComponent(index, e, new Speed { Value = 5f });
    }
}
```
### Pattern 5: Aspect (Grouping Components)
```csharp
using Unity.Entities;
using Unity.Transforms;
using Unity.Mathematics;

// Aspect: Groups related components for cleaner code
// An aspect is a typed view over one entity's components; its fields declare
// what the aspect reads (RefRO) and writes (RefRW).
public readonly partial struct CharacterAspect : IAspect
{
    public readonly Entity Entity;

    private readonly RefRW<LocalTransform> _transform;
    private readonly RefRO<Speed> _speed;
    private readonly RefRW<Health> _health;

    // Optional component
    // Entities without Shield still match the aspect; check IsValid before use.
    [Optional]
    private readonly RefRO<Shield> _shield;

    // Buffer
    private readonly DynamicBuffer<InventoryItem> _inventory;

    public float3 Position
    {
        get => _transform.ValueRO.Position;
        set => _transform.ValueRW.Position = value;
    }

    public float CurrentHealth => _health.ValueRO.Current;
    public float MaxHealth => _health.ValueRO.Max;
    public float MoveSpeed => _speed.ValueRO.Value;
    public bool HasShield => _shield.IsValid;
    public float ShieldAmount => HasShield ? _shield.ValueRO.Amount : 0f;

    // Applies damage, letting the (optional) shield absorb its share first;
    // health is clamped at zero.
    public void TakeDamage(float amount)
    {
        float remaining = amount;
        if (HasShield && _shield.ValueRO.Amount > 0)
        {
            // Shield absorbs damage first
            remaining = math.max(0, amount - _shield.ValueRO.Amount);
        }
        _health.ValueRW.Current = math.max(0, _health.ValueRO.Current - remaining);
    }

    // Moves along `direction` scaled by the entity's Speed component.
    public void Move(float3 direction, float deltaTime)
    {
        _transform.ValueRW.Position += direction * _speed.ValueRO.Value * deltaTime;
    }

    // Appends an item to the entity's inventory buffer.
    public void AddItem(int itemId, int quantity)
    {
        _inventory.Add(new InventoryItem { ItemId = itemId, Quantity = quantity });
    }
}

// Using aspect in system
[BurstCompile]
public partial struct CharacterSystem : ISystem
{
    [BurstCompile]
    public void OnUpdate(ref SystemState state)
    {
        float dt = SystemAPI.Time.DeltaTime;
        // Querying by aspect matches every entity that has all of the
        // aspect's required components.
        foreach (var character in SystemAPI.Query<CharacterAspect>())
        {
            character.Move(new float3(1, 0, 0), dt);
            if (character.CurrentHealth < character.MaxHealth * 0.5f)
            {
                // Low health logic
            }
        }
    }
}
```
### Pattern 6: Singleton Components
```csharp
// Singleton: Exactly one entity with this component
public struct GameConfig : IComponentData
{
    public float DifficultyMultiplier;
    public int MaxEnemies;
    public float SpawnRate;
}

public struct GameState : IComponentData
{
    public int Score;
    public int Wave;
    public float TimeRemaining;
}

// Create singleton on world creation
public partial struct GameInitSystem : ISystem
{
    public void OnCreate(ref SystemState state)
    {
        // A single entity carries both singleton components.
        var entity = state.EntityManager.CreateEntity();
        state.EntityManager.AddComponentData(entity, new GameConfig
        {
            DifficultyMultiplier = 1.0f,
            MaxEnemies = 100,
            SpawnRate = 2.0f
        });
        state.EntityManager.AddComponentData(entity, new GameState
        {
            Score = 0,
            Wave = 1,
            TimeRemaining = 120f
        });
    }
}

// Access singleton in system
[BurstCompile]
public partial struct ScoreSystem : ISystem
{
    [BurstCompile]
    public void OnUpdate(ref SystemState state)
    {
        // Read singleton
        var config = SystemAPI.GetSingleton<GameConfig>();
        // Write singleton
        ref var gameState = ref SystemAPI.GetSingletonRW<GameState>().ValueRW;
        gameState.TimeRemaining -= SystemAPI.Time.DeltaTime;
        // Check exists
        // (guard with HasSingleton when creation order is not guaranteed)
        if (SystemAPI.HasSingleton<GameConfig>())
        {
            // ...
        }
    }
}
```
### Pattern 7: Baking (Converting GameObjects)
```csharp
using Unity.Entities;
using UnityEngine;

// Authoring component (MonoBehaviour in Editor)
// Baking runs in the Editor and converts authoring GameObjects into entities.
public class EnemyAuthoring : MonoBehaviour
{
    public float Speed = 5f;
    public float Health = 100f;
    public GameObject ProjectilePrefab;

    class Baker : Baker<EnemyAuthoring>
    {
        public override void Bake(EnemyAuthoring authoring)
        {
            // TransformUsageFlags.Dynamic: the entity gets a runtime transform.
            var entity = GetEntity(TransformUsageFlags.Dynamic);
            AddComponent(entity, new Speed { Value = authoring.Speed });
            // Start at full health.
            AddComponent(entity, new Health
            {
                Current = authoring.Health,
                Max = authoring.Health
            });
            AddComponent(entity, new EnemyTag());
            if (authoring.ProjectilePrefab != null)
            {
                // GetEntity on a prefab GameObject bakes it into an entity prefab.
                AddComponent(entity, new ProjectilePrefab
                {
                    Value = GetEntity(authoring.ProjectilePrefab, TransformUsageFlags.Dynamic)
                });
            }
        }
    }
}

// Complex baking with dependencies
public class SpawnerAuthoring : MonoBehaviour
{
    public GameObject[] Prefabs;
    public float Interval = 1f;

    class Baker : Baker<SpawnerAuthoring>
    {
        public override void Bake(SpawnerAuthoring authoring)
        {
            var entity = GetEntity(TransformUsageFlags.Dynamic);
            // Timer = 0 so the first spawn happens immediately.
            AddComponent(entity, new Spawner
            {
                Interval = authoring.Interval,
                Timer = 0f
            });
            // Bake buffer of prefabs
            var buffer = AddBuffer<SpawnPrefabElement>(entity);
            foreach (var prefab in authoring.Prefabs)
            {
                buffer.Add(new SpawnPrefabElement
                {
                    Prefab = GetEntity(prefab, TransformUsageFlags.Dynamic)
                });
            }
            // Declare dependencies
            // so edits to the prefab assets trigger a re-bake of this authoring.
            DependsOn(authoring.Prefabs);
        }
    }
}
```
### Pattern 8: Jobs with Native Collections
```csharp
using Unity.Jobs;
using Unity.Collections;
using Unity.Burst;
using Unity.Mathematics;

// Bins entity positions into a uniform grid so neighbor lookups only need
// to inspect nearby cells instead of scanning every entity.
[BurstCompile]
public struct SpatialHashJob : IJobParallelFor
{
    [ReadOnly]
    public NativeArray<float3> Positions;

    // Thread-safe write to hash map
    public NativeParallelMultiHashMap<int, int>.ParallelWriter HashMap;
    public float CellSize;

    public void Execute(int index)
    {
        float3 pos = Positions[index];
        int hash = GetHash(pos);
        // Multi-hash map: several entities may map to the same cell hash.
        HashMap.Add(hash, index);
    }

    int GetHash(float3 pos)
    {
        // Quantize to integer cell coordinates, then mix with large primes
        // (a common spatial-hashing scheme) into a single int key.
        int x = (int)math.floor(pos.x / CellSize);
        int y = (int)math.floor(pos.y / CellSize);
        int z = (int)math.floor(pos.z / CellSize);
        return x * 73856093 ^ y * 19349663 ^ z * 83492791;
    }
}

[BurstCompile]
public partial struct SpatialHashSystem : ISystem
{
    private NativeParallelMultiHashMap<int, int> _hashMap;

    public void OnCreate(ref SystemState state)
    {
        // Persistent allocation: lives until OnDestroy and is reused each frame.
        _hashMap = new NativeParallelMultiHashMap<int, int>(10000, Allocator.Persistent);
    }

    public void OnDestroy(ref SystemState state)
    {
        // NOTE(review): if a scheduled job could still be writing to the map
        // here, state.Dependency should be completed before Dispose — confirm
        // teardown ordering.
        _hashMap.Dispose();
    }

    [BurstCompile]
    public void OnUpdate(ref SystemState state)
    {
        var query = SystemAPI.QueryBuilder()
            .WithAll<LocalTransform>()
            .Build();
        int count = query.CalculateEntityCount();

        // Resize if needed
        // Grow with headroom (2x) to avoid resizing every frame as counts climb.
        if (_hashMap.Capacity < count)
        {
            _hashMap.Capacity = count * 2;
        }
        _hashMap.Clear();

        // Get positions
        var positions = query.ToComponentDataArray<LocalTransform>(Allocator.TempJob);
        var posFloat3 = new NativeArray<float3>(count, Allocator.TempJob);
        for (int i = 0; i < count; i++)
        {
            posFloat3[i] = positions[i].Position;
        }

        // Build hash map
        // Batch size 64: each worker thread grabs 64 indices at a time.
        var hashJob = new SpatialHashJob
        {
            Positions = posFloat3,
            HashMap = _hashMap.AsParallelWriter(),
            CellSize = 10f
        };
        state.Dependency = hashJob.Schedule(count, 64, state.Dependency);

        // Cleanup
        // Dispose(JobHandle) frees the arrays only after the job completes.
        positions.Dispose(state.Dependency);
        posFloat3.Dispose(state.Dependency);
    }
}
```
## Performance Tips
```csharp
// 1. Use Burst everywhere
[BurstCompile]
public partial struct MySystem : ISystem { }
// 2. Prefer IJobEntity over manual iteration
[BurstCompile]
partial struct OptimizedJob : IJobEntity
{
void Execute(ref LocalTransform transform) { }
}
// 3. Schedule parallel when possible
state.Dependency = job.ScheduleParallel(state.Dependency);
// 4. Use ScheduleParallel with chunk iteration
[BurstCompile]
partial struct ChunkJob : IJobChunk
{
public ComponentTypeHandle<Health> HealthHandle;
public void Execute(in ArchetypeChunk chunk, int unfilteredChunkIndex,
bool useEnabledMask, in v128 chunkEnabledMask)
{
var healths = chunk.GetNativeArray(ref HealthHandle);
for (int i = 0; i < chunk.Count; i++)
{
// Process
}
}
}
// 5. Avoid structural changes in hot paths
// Use enableable components instead of add/remove
public struct Disabled : IComponentData, IEnableableComponent { }
```
## Best Practices
### Do's
- **Use ISystem over SystemBase** - Better performance
- **Burst compile everything** - Massive speedup
- **Batch structural changes** - Use ECB
- **Profile with Profiler** - Identify bottlenecks
- **Use Aspects** - Clean component grouping
### Don'ts
- **Don't use managed types** - Breaks Burst
- **Don't structural change in jobs** - Use ECB
- **Don't over-architect** - Start simple
- **Don't ignore chunk utilization** - Group similar entities
- **Don't forget disposal** - Native collections leak
## Resources
- [Unity DOTS Documentation](https://docs.unity3d.com/Packages/com.unity.entities@latest)
- [Unity DOTS Samples](https://github.com/Unity-Technologies/EntityComponentSystemSamples)
- [Burst User Guide](https://docs.unity3d.com/Packages/com.unity.burst@latest)

View File

@@ -0,0 +1,507 @@
---
name: employment-contract-templates
description: Create employment contracts, offer letters, and HR policy documents following legal best practices. Use when drafting employment agreements, creating HR policies, or standardizing employment documentation.
---
# Employment Contract Templates
Templates and patterns for creating legally sound employment documentation including contracts, offer letters, and HR policies.
## When to Use This Skill
- Drafting employment contracts
- Creating offer letters
- Writing employee handbooks
- Developing HR policies
- Standardizing employment documentation
- Onboarding documentation
## Core Concepts
### 1. Employment Document Types
| Document | Purpose | When Used |
|----------|---------|-----------|
| **Offer Letter** | Initial job offer | Pre-hire |
| **Employment Contract** | Formal agreement | Hire |
| **Employee Handbook** | Policies & procedures | Onboarding |
| **NDA** | Confidentiality | Before access |
| **Non-Compete** | Competition restriction | Hire/Exit |
### 2. Key Legal Considerations
```
Employment Relationship:
├── At-Will vs. Contract
├── Employee vs. Contractor
├── Full-Time vs. Part-Time
├── Exempt vs. Non-Exempt
└── Jurisdiction-Specific Requirements
```
**DISCLAIMER: These templates are for informational purposes only and do not constitute legal advice. Consult with qualified legal counsel before using any employment documents.**
## Templates
### Template 1: Offer Letter
```markdown
# EMPLOYMENT OFFER LETTER
[Company Letterhead]
Date: [DATE]
[Candidate Name]
[Address]
[City, State ZIP]
Dear [Candidate Name],
We are pleased to extend an offer of employment for the position of [JOB TITLE]
at [COMPANY NAME]. We believe your skills and experience will be valuable
additions to our team.
## Position Details
**Title:** [Job Title]
**Department:** [Department]
**Reports To:** [Manager Name/Title]
**Location:** [Office Location / Remote]
**Start Date:** [Proposed Start Date]
**Employment Type:** [Full-Time/Part-Time], [Exempt/Non-Exempt]
## Compensation
**Base Salary:** $[AMOUNT] per [year/hour], paid [bi-weekly/semi-monthly/monthly]
**Bonus:** [Eligible for annual bonus of up to X% based on company and individual
performance / Not applicable]
**Equity:** [X shares of stock options vesting over 4 years with 1-year cliff /
Not applicable]
## Benefits
You will be eligible for our standard benefits package, including:
- Health insurance (medical, dental, vision) effective [date]
- 401(k) with [X]% company match
- [X] days paid time off per year
- [X] paid holidays
- [Other benefits]
Full details will be provided during onboarding.
## Contingencies
This offer is contingent upon:
- Successful completion of background check
- Verification of your right to work in [Country]
- Execution of required employment documents including:
- Confidentiality Agreement
- [Non-Compete Agreement, if applicable]
- [IP Assignment Agreement]
## At-Will Employment
Please note that employment with [Company Name] is at-will. This means that
either you or the Company may terminate the employment relationship at any time,
with or without cause or notice. This offer letter does not constitute a
contract of employment for any specific period.
## Acceptance
To accept this offer, please sign below and return by [DEADLINE DATE]. This
offer will expire if not accepted by that date.
We are excited about the possibility of you joining our team. If you have any
questions, please contact [HR Contact] at [email/phone].
Sincerely,
_________________________
[Hiring Manager Name]
[Title]
[Company Name]
---
## ACCEPTANCE
I accept this offer of employment and agree to the terms stated above.
Signature: _________________________
Printed Name: _________________________
Date: _________________________
Anticipated Start Date: _________________________
```
### Template 2: Employment Agreement (Contract Position)
```markdown
# EMPLOYMENT AGREEMENT
This Employment Agreement ("Agreement") is entered into as of [DATE]
("Effective Date") by and between:
**Employer:** [COMPANY LEGAL NAME], a [State] [corporation/LLC]
with principal offices at [Address] ("Company")
**Employee:** [EMPLOYEE NAME], an individual residing at [Address] ("Employee")
## 1. EMPLOYMENT
1.1 **Position.** The Company agrees to employ Employee as [JOB TITLE],
reporting to [Manager Title]. Employee accepts such employment subject to
the terms of this Agreement.
1.2 **Duties.** Employee shall perform duties consistent with their position,
including but not limited to:
- [Primary duty 1]
- [Primary duty 2]
- [Primary duty 3]
- Other duties as reasonably assigned
1.3 **Best Efforts.** Employee agrees to devote their full business time,
attention, and best efforts to the Company's business during employment.
1.4 **Location.** Employee's primary work location shall be [Location/Remote].
[Travel requirements, if any.]
## 2. TERM
2.1 **Employment Period.** This Agreement shall commence on [START DATE] and
continue until terminated as provided herein.
2.2 **At-Will Employment.** [FOR AT-WILL STATES] Notwithstanding anything
herein, employment is at-will and may be terminated by either party at any
time, with or without cause or notice.
[OR FOR FIXED TERM:]
2.2 **Fixed Term.** This Agreement is for a fixed term of [X] months/years,
ending on [END DATE], unless terminated earlier as provided herein or extended
by mutual written agreement.
## 3. COMPENSATION
3.1 **Base Salary.** Employee shall receive a base salary of $[AMOUNT] per year,
payable in accordance with the Company's standard payroll practices, subject to
applicable withholdings.
3.2 **Bonus.** Employee may be eligible for an annual discretionary bonus of up
to [X]% of base salary, based on [criteria]. Bonus payments are at Company's
sole discretion and require active employment at payment date.
3.3 **Equity.** [If applicable] Subject to Board approval and the Company's
equity incentive plan, Employee shall be granted [X shares/options] under the
terms of a separate Stock Option Agreement.
3.4 **Benefits.** Employee shall be entitled to participate in benefit plans
offered to similarly situated employees, subject to plan terms and eligibility
requirements.
3.5 **Expenses.** Company shall reimburse Employee for reasonable business
expenses incurred in accordance with Company policy.
## 4. CONFIDENTIALITY
4.1 **Confidential Information.** Employee acknowledges access to confidential
and proprietary information including: trade secrets, business plans, customer
lists, financial data, technical information, and other non-public information
("Confidential Information").
4.2 **Non-Disclosure.** During and after employment, Employee shall not
disclose, use, or permit use of any Confidential Information except as required
for their duties or with prior written consent.
4.3 **Return of Materials.** Upon termination, Employee shall immediately return
all Company property and Confidential Information in any form.
4.4 **Survival.** Confidentiality obligations survive termination indefinitely
for trade secrets and for [3] years for other Confidential Information.
## 5. INTELLECTUAL PROPERTY
5.1 **Work Product.** All inventions, discoveries, works, and developments
created by Employee during employment, relating to Company's business, or using
Company resources ("Work Product") shall be Company's sole property.
5.2 **Assignment.** Employee hereby assigns to Company all rights in Work
Product, including all intellectual property rights.
5.3 **Assistance.** Employee agrees to execute documents and take actions
necessary to perfect Company's rights in Work Product.
5.4 **Prior Inventions.** Attached as Exhibit A is a list of any prior
inventions that Employee wishes to exclude from this Agreement.
## 6. NON-COMPETITION AND NON-SOLICITATION
[NOTE: Enforceability varies by jurisdiction. Consult local counsel.]
6.1 **Non-Competition.** During employment and for [12] months after
termination, Employee shall not, directly or indirectly, engage in any business
competitive with Company's business within [Geographic Area].
6.2 **Non-Solicitation of Customers.** During employment and for [12] months
after termination, Employee shall not solicit any customer of the Company for
competing products or services.
6.3 **Non-Solicitation of Employees.** During employment and for [12] months
after termination, Employee shall not recruit or solicit any Company employee
to leave Company employment.
## 7. TERMINATION
7.1 **By Company for Cause.** Company may terminate immediately for Cause,
defined as:
(a) Material breach of this Agreement
(b) Conviction of a felony
(c) Fraud, dishonesty, or gross misconduct
(d) Failure to perform duties after written notice and cure period
7.2 **By Company Without Cause.** Company may terminate without Cause upon
[30] days written notice.
7.3 **By Employee.** Employee may terminate upon [30] days written notice.
7.4 **Severance.** [If applicable] Upon termination without Cause, Employee
shall receive [X] weeks base salary as severance, contingent upon execution
of a release agreement.
7.5 **Effect of Termination.** Upon termination:
- All compensation earned through termination date shall be paid
- Unvested equity shall be forfeited
- Benefits terminate per plan terms
- Sections 4, 5, 6, 8, and 9 survive termination
## 8. GENERAL PROVISIONS
8.1 **Entire Agreement.** This Agreement constitutes the entire agreement and
supersedes all prior negotiations, representations, and agreements.
8.2 **Amendments.** This Agreement may be amended only by written agreement
signed by both parties.
8.3 **Governing Law.** This Agreement shall be governed by the laws of [State],
without regard to conflicts of law principles.
8.4 **Dispute Resolution.** [Arbitration clause or jurisdiction selection]
8.5 **Severability.** If any provision is unenforceable, it shall be modified
to the minimum extent necessary, and remaining provisions shall remain in effect.
8.6 **Notices.** Notices shall be in writing and delivered to addresses above.
8.7 **Assignment.** Employee may not assign this Agreement. Company may assign
to a successor.
8.8 **Waiver.** Failure to enforce any provision shall not constitute waiver.
## 9. ACKNOWLEDGMENTS
Employee acknowledges:
- Having read and understood this Agreement
- Having opportunity to consult with counsel
- Agreeing to all terms voluntarily
---
IN WITNESS WHEREOF, the parties have executed this Agreement as of the
Effective Date.
**[COMPANY NAME]**
By: _________________________
Name: [Authorized Signatory]
Title: [Title]
Date: _________________________
**EMPLOYEE**
Signature: _________________________
Name: [Employee Name]
Date: _________________________
---
## EXHIBIT A: PRIOR INVENTIONS
[Employee to list any prior inventions, if any, or write "None"]
_________________________
```
### Template 3: Employee Handbook Policy Section
```markdown
# EMPLOYEE HANDBOOK - POLICY SECTION
## EMPLOYMENT POLICIES
### Equal Employment Opportunity
[Company Name] is an equal opportunity employer. We do not discriminate based on
race, color, religion, sex, sexual orientation, gender identity, national
origin, age, disability, veteran status, or any other protected characteristic.
This policy applies to all employment practices including:
- Recruitment and hiring
- Compensation and benefits
- Training and development
- Promotions and transfers
- Termination
### Anti-Harassment Policy
[Company Name] is committed to providing a workplace free from harassment.
Harassment based on any protected characteristic is strictly prohibited.
**Prohibited Conduct Includes:**
- Unwelcome sexual advances or requests for sexual favors
- Offensive comments, jokes, or slurs
- Physical conduct such as assault or unwanted touching
- Visual conduct such as displaying offensive images
- Threatening, intimidating, or hostile acts
**Reporting Procedure:**
1. Report to your manager, HR, or any member of leadership
2. Reports may be made verbally or in writing
3. Anonymous reports are accepted via [hotline/email]
**Investigation:**
All reports will be promptly investigated. Retaliation against anyone who
reports harassment is strictly prohibited and will result in disciplinary
action up to termination.
### Work Hours and Attendance
**Standard Hours:** [8:00 AM - 5:00 PM, Monday through Friday]
**Core Hours:** [10:00 AM - 3:00 PM] - Employees expected to be available
**Flexible Work:** [Policy on remote work, flexible scheduling]
**Attendance Expectations:**
- Notify your manager as soon as possible if you will be absent
- Excessive unexcused absences may result in disciplinary action
- [X] unexcused absences in [Y] days considered excessive
### Paid Time Off (PTO)
**PTO Accrual:**
| Years of Service | Annual PTO Days |
|------------------|-----------------|
| 0-2 years | 15 days |
| 3-5 years | 20 days |
| 6+ years | 25 days |
**PTO Guidelines:**
- PTO accrues per pay period
- Maximum accrual: [X] days (accrual pauses once the cap is reached; note that some states restrict or prohibit "use-it-or-lose-it" forfeiture policies)
- Request PTO at least [2] weeks in advance
- Manager approval required
- PTO may not be taken during [blackout periods]
### Sick Leave
- [X] days sick leave per year
- May be used for personal illness or family member care
- Doctor's note required for absences exceeding [3] days
### Holidays
The following paid holidays are observed:
- New Year's Day
- Martin Luther King Jr. Day
- Presidents Day
- Memorial Day
- Independence Day
- Labor Day
- Thanksgiving Day
- Day after Thanksgiving
- Christmas Day
- [Floating holiday]
### Code of Conduct
All employees are expected to:
- Act with integrity and honesty
- Treat colleagues, customers, and partners with respect
- Protect company confidential information
- Avoid conflicts of interest
- Comply with all laws and regulations
- Report any violations of this code
**Violations may result in disciplinary action up to and including termination.**
### Technology and Communication
**Acceptable Use:**
- Company technology is for business purposes
- Limited personal use is permitted if it doesn't interfere with work
- No illegal activities or viewing inappropriate content
**Monitoring:**
- Company reserves the right to monitor company systems
- Employees should have no expectation of privacy on company devices
**Security:**
- Use strong passwords and enable 2FA
- Report security incidents immediately
- Lock devices when unattended
### Social Media Policy
**Personal Social Media:**
- Clearly state opinions are your own, not the company's
- Do not share confidential company information
- Be respectful and professional
**Company Social Media:**
- Only authorized personnel may post on behalf of the company
- Follow brand guidelines
- Escalate negative comments to [Marketing/PR]
---
## ACKNOWLEDGMENT
I acknowledge that I have received a copy of the Employee Handbook and
understand that:
1. I am responsible for reading and understanding its contents
2. The handbook does not create a contract of employment
3. Policies may be changed at any time at the company's discretion
4. Employment is at-will [if applicable]
I agree to abide by the policies and procedures outlined in this handbook.
Employee Signature: _________________________
Employee Name (Print): _________________________
Date: _________________________
```
## Best Practices
### Do's
- **Consult legal counsel** - Employment law varies by jurisdiction
- **Keep copies signed** - Document all agreements
- **Update regularly** - Laws and policies change
- **Be clear and specific** - Avoid ambiguity
- **Train managers** - On policies and procedures
### Don'ts
- **Don't use generic templates** - Customize for your jurisdiction
- **Don't make promises** - That could create implied contracts
- **Don't discriminate** - In language or application
- **Don't forget at-will language** - Where applicable
- **Don't skip review** - Have legal counsel review all documents
## Resources
- [SHRM Employment Templates](https://www.shrm.org/)
- [Department of Labor](https://www.dol.gov/)
- [EEOC Guidance](https://www.eeoc.gov/)
- State-specific labor departments

View File

@@ -0,0 +1,616 @@
---
name: gdpr-data-handling
description: Implement GDPR-compliant data handling with consent management, data subject rights, and privacy by design. Use when building systems that process EU personal data, implementing privacy controls, or conducting GDPR compliance reviews.
---
# GDPR Data Handling
Practical implementation guide for GDPR-compliant data processing, consent management, and privacy controls.
## When to Use This Skill
- Building systems that process EU personal data
- Implementing consent management
- Handling data subject requests (DSRs)
- Conducting GDPR compliance reviews
- Designing privacy-first architectures
- Creating data processing agreements
## Core Concepts
### 1. Personal Data Categories
| Category | Examples | Protection Level |
|----------|----------|------------------|
| **Basic** | Name, email, phone | Standard |
| **Sensitive (Art. 9)** | Health, religion, ethnicity | Explicit consent |
| **Criminal (Art. 10)** | Convictions, offenses | Official authority |
| **Children's** | Under 16 data | Parental consent |
### 2. Legal Bases for Processing
```
Article 6 - Lawful Bases:
├── Consent: Freely given, specific, informed
├── Contract: Necessary for contract performance
├── Legal Obligation: Required by law
├── Vital Interests: Protecting someone's life
├── Public Interest: Official functions
└── Legitimate Interest: Balanced against rights
```
### 3. Data Subject Rights
```
Right to Access (Art. 15) ─┐
Right to Rectification (Art. 16) │
Right to Erasure (Art. 17) │ Must respond
Right to Restrict (Art. 18) │ within 1 month
Right to Portability (Art. 20) │
Right to Object (Art. 21) ─┘
```
## Implementation Patterns
### Pattern 1: Consent Management
```javascript
// Consent data model
const consentSchema = {
  userId: String,
  // Every consent decision ever taken, appended (never overwritten) so the
  // consent state at any past moment can be reconstructed for GDPR proof.
  consents: [{
    purpose: String, // 'marketing', 'analytics', etc.
    granted: Boolean,
    timestamp: Date,
    source: String, // 'web_form', 'api', etc.
    version: String, // Privacy policy version
    ipAddress: String, // For proof
    userAgent: String // For proof
  }],
  // Append-only trail of grant/withdraw/update actions (accountability principle).
  auditLog: [{
    action: String, // 'granted', 'withdrawn', 'updated'
    purpose: String,
    timestamp: Date,
    source: String
  }]
};
// Consent service
class ConsentManager {
  /**
   * Record a consent decision (grant or withdrawal) for one purpose.
   *
   * Stores the decision with proof metadata (policy version, IP, user agent),
   * appends a matching audit-log entry in the same DB write, then notifies
   * downstream systems over the event bus.
   *
   * @param {string} userId
   * @param {string} purpose - e.g. 'marketing', 'analytics'
   * @param {boolean} granted - true = consent given, false = withdrawn
   * @param {object} metadata - expects { source, ipAddress, userAgent }
   */
  async recordConsent(userId, purpose, granted, metadata) {
    const consent = {
      purpose,
      granted,
      timestamp: new Date(),
      source: metadata.source,
      // Capture the policy version in force so this consent can be proven later.
      version: await this.getCurrentPolicyVersion(),
      ipAddress: metadata.ipAddress,
      userAgent: metadata.userAgent
    };
    // Store consent; upsert creates the per-user document on first consent.
    await this.db.consents.updateOne(
      { userId },
      {
        $push: {
          consents: consent,
          auditLog: {
            action: granted ? 'granted' : 'withdrawn',
            purpose,
            timestamp: consent.timestamp,
            source: metadata.source
          }
        }
      },
      { upsert: true }
    );
    // Emit event for downstream systems (only after the consent is persisted).
    await this.eventBus.emit('consent.changed', {
      userId,
      purpose,
      granted,
      timestamp: consent.timestamp
    });
  }
  /**
   * Whether the user currently consents to the given purpose.
   * Only the most recent decision for that purpose counts; no record at all
   * means no consent (opt-in by default).
   */
  async hasConsent(userId, purpose) {
    const record = await this.db.consents.findOne({ userId });
    if (!record) return false;
    const latestConsent = record.consents
      .filter(c => c.purpose === purpose)
      .sort((a, b) => b.timestamp - a.timestamp)[0];
    return latestConsent?.granted === true;
  }
  /** Full audit trail of consent actions for the user (empty array if none). */
  async getConsentHistory(userId) {
    const record = await this.db.consents.findOne({ userId });
    return record?.auditLog || [];
  }
}
```
```html
<!-- GDPR-compliant consent UI -->
<div class="consent-banner" role="dialog" aria-labelledby="consent-title">
<h2 id="consent-title">Cookie Preferences</h2>
<p>We use cookies to improve your experience. Select your preferences below.</p>
<form id="consent-form">
<!-- Necessary - always on, no consent needed -->
<div class="consent-category">
<input type="checkbox" id="necessary" checked disabled>
<label for="necessary">
<strong>Necessary</strong>
<span>Required for the website to function. Cannot be disabled.</span>
</label>
</div>
<!-- Analytics - requires consent -->
<div class="consent-category">
<input type="checkbox" id="analytics" name="analytics">
<label for="analytics">
<strong>Analytics</strong>
<span>Help us understand how you use our site.</span>
</label>
</div>
<!-- Marketing - requires consent -->
<div class="consent-category">
<input type="checkbox" id="marketing" name="marketing">
<label for="marketing">
<strong>Marketing</strong>
<span>Personalized ads based on your interests.</span>
</label>
</div>
<div class="consent-actions">
<button type="button" id="accept-all">Accept All</button>
<button type="button" id="reject-all">Reject All</button>
<button type="submit">Save Preferences</button>
</div>
<p class="consent-links">
<a href="/privacy-policy">Privacy Policy</a> |
<a href="/cookie-policy">Cookie Policy</a>
</p>
</form>
</div>
```
### Pattern 2: Data Subject Access Request (DSAR)
```python
from datetime import datetime, timedelta
from typing import Dict, List, Optional
import json
class DSARHandler:
    """Handle Data Subject Access Requests (GDPR Art. 15-20).

    Dispatches access / erasure / portability requests across all registered
    data sources and tracks each request's lifecycle and statutory deadline.

    Fix vs. original: ``process_portability_request`` delegated to
    ``process_access_request``, which raises ``ValueError`` for any request
    whose type is not 'access' -- so a genuine portability request could
    never be exported. Data collection is now shared via
    ``_collect_user_data`` and the portability path validates its own type.
    """

    # GDPR Art. 12(3): respond within one month of receipt.
    RESPONSE_DEADLINE_DAYS = 30
    EXTENSION_ALLOWED_DAYS = 60  # For complex requests (two further months)

    def __init__(self, data_sources: List['DataSource']):
        self.data_sources = data_sources

    async def submit_request(
        self,
        request_type: str,  # 'access', 'erasure', 'rectification', 'portability'
        user_id: str,
        verified: bool,
        details: Optional[Dict] = None
    ) -> str:
        """Register a new DSAR, notify the DPO, and return the request id."""
        now = datetime.utcnow()
        request = {
            'id': self.generate_request_id(),
            'type': request_type,
            'user_id': user_id,
            # Unverified requests must pass identity verification first.
            'status': 'pending_verification' if not verified else 'processing',
            'submitted_at': now,
            'deadline': now + timedelta(days=self.RESPONSE_DEADLINE_DAYS),
            'details': details or {},
            'audit_log': [{
                'action': 'submitted',
                'timestamp': now,
                'details': 'Request received'
            }]
        }
        await self.db.dsar_requests.insert_one(request)
        await self.notify_dpo(request)
        return request['id']

    async def _collect_user_data(self, user_id: str) -> Dict:
        """Gather the user's data from every source, keyed by source name.

        A failing source is reported inline as {'error': ...} rather than
        aborting the whole request.
        """
        user_data = {}
        for source in self.data_sources:
            try:
                user_data[source.name] = await source.get_user_data(user_id)
            except Exception as e:
                user_data[source.name] = {'error': str(e)}
        return user_data

    async def process_access_request(self, request_id: str) -> Dict:
        """Process an Art. 15 access request and mark it completed."""
        request = await self.get_request(request_id)
        if request['type'] != 'access':
            raise ValueError("Not an access request")
        user_data = await self._collect_user_data(request['user_id'])
        # The response must also disclose retention, purposes, and recipients.
        response = {
            'request_id': request_id,
            'generated_at': datetime.utcnow().isoformat(),
            'data_categories': list(user_data.keys()),
            'data': user_data,
            'retention_info': await self.get_retention_info(),
            'processing_purposes': await self.get_processing_purposes(),
            'third_party_recipients': await self.get_recipients()
        }
        await self.update_request(request_id, 'completed', response)
        return response

    async def process_erasure_request(self, request_id: str) -> Dict:
        """Process an Art. 17 erasure request, honouring legal retention exceptions."""
        request = await self.get_request(request_id)
        if request['type'] != 'erasure':
            raise ValueError("Not an erasure request")
        results = {}
        exceptions = []
        for source in self.data_sources:
            try:
                # Sources may refuse deletion (e.g. legal retention duties).
                can_delete, reason = await source.can_delete(request['user_id'])
                if can_delete:
                    await source.delete_user_data(request['user_id'])
                    results[source.name] = 'deleted'
                else:
                    exceptions.append({
                        'source': source.name,
                        'reason': reason  # e.g., 'legal retention requirement'
                    })
                    results[source.name] = f'retained: {reason}'
            except Exception as e:
                results[source.name] = f'error: {str(e)}'
        response = {
            'request_id': request_id,
            'completed_at': datetime.utcnow().isoformat(),
            'results': results,
            'exceptions': exceptions
        }
        await self.update_request(request_id, 'completed', response)
        return response

    async def process_portability_request(self, request_id: str) -> bytes:
        """Generate an Art. 20 machine-readable (JSON) export of the user's data."""
        request = await self.get_request(request_id)
        if request['type'] != 'portability':
            raise ValueError("Not a portability request")
        user_data = await self._collect_user_data(request['user_id'])
        portable_data = {
            'export_date': datetime.utcnow().isoformat(),
            'format_version': '1.0',
            'data': user_data
        }
        export = json.dumps(portable_data, indent=2, default=str).encode()
        await self.update_request(request_id, 'completed',
                                  {'export_size_bytes': len(export)})
        return export
```
### Pattern 3: Data Retention
```python
from datetime import datetime, timedelta
from enum import Enum
class RetentionBasis(Enum):
    """Lawful basis (GDPR Art. 6) under which a data category is retained."""
    CONSENT = "consent"
    CONTRACT = "contract"
    LEGAL_OBLIGATION = "legal_obligation"
    LEGITIMATE_INTEREST = "legitimate_interest"
class DataRetentionPolicy:
    """Define and enforce data retention policies.

    Each POLICIES entry names the date field the retention clock starts from
    (`trigger`), records the GDPR lawful basis for keeping the data, and
    chooses what happens at expiry: archive-then-delete, plain delete, or
    anonymization in place.
    """

    POLICIES = {
        'user_account': {
            'retention_period_days': 365 * 3,  # three years after last activity
            'basis': RetentionBasis.CONTRACT,
            'trigger': 'last_activity_date',
            'archive_before_delete': True
        },
        'transaction_records': {
            'retention_period_days': 365 * 7,  # tax law: seven years
            'basis': RetentionBasis.LEGAL_OBLIGATION,
            'trigger': 'transaction_date',
            'archive_before_delete': True,
            'legal_reference': 'Tax regulations require 7 year retention'
        },
        'marketing_consent': {
            'retention_period_days': 365 * 2,  # two years
            'basis': RetentionBasis.CONSENT,
            'trigger': 'consent_date',
            'archive_before_delete': False
        },
        'support_tickets': {
            'retention_period_days': 365 * 2,
            'basis': RetentionBasis.LEGITIMATE_INTEREST,
            'trigger': 'ticket_closed_date',
            'archive_before_delete': True
        },
        'analytics_data': {
            'retention_period_days': 365,  # one year
            'basis': RetentionBasis.CONSENT,
            'trigger': 'collection_date',
            'archive_before_delete': False,
            'anonymize_instead': True
        }
    }

    async def apply_retention_policies(self):
        """Enforce every policy: anonymize, or (optionally archive then) delete expired data."""
        for data_type, rule in self.POLICIES.items():
            cutoff = datetime.utcnow() - timedelta(
                days=rule['retention_period_days']
            )
            if rule.get('anonymize_instead'):
                await self.anonymize_old_data(data_type, cutoff)
            else:
                if rule.get('archive_before_delete'):
                    await self.archive_data(data_type, cutoff)
                await self.delete_old_data(data_type, cutoff)
            # Every enforcement run is logged for the Art. 30 records.
            await self.log_retention_action(data_type, cutoff)

    async def anonymize_old_data(self, data_type: str, before_date: datetime):
        """Strip identifying fields from expired records instead of deleting them."""
        # Only analytics data is anonymized today; other categories are deleted.
        if data_type != 'analytics_data':
            return
        await self.db.analytics.update_many(
            {'collection_date': {'$lt': before_date}},
            {'$set': {
                'user_id': None,
                'ip_address': None,
                'device_id': None,
                'anonymized': True,
                'anonymized_date': datetime.utcnow()
            }}
        )
```
### Pattern 4: Privacy by Design
```python
class PrivacyFirstDataModel:
    """Example of privacy-by-design data model.

    The three schemas split one logical user record by sensitivity: a minimal
    profile, an encrypted PII store, and behavioral data that is never linked
    back to the user id.
    """
    # Separate PII from behavioral data
    user_profile_schema = {
        'user_id': str, # UUID, not sequential
        'email_hash': str, # Hashed for lookups
        'created_at': datetime,
        # Minimal data collection
        'preferences': {
            'language': str,
            'timezone': str
        }
    }
    # Encrypted at rest; key id allows per-record key rotation/revocation
    user_pii_schema = {
        'user_id': str,
        'email': str, # Encrypted
        'name': str, # Encrypted
        'phone': str, # Encrypted (optional)
        'address': dict, # Encrypted (optional)
        'encryption_key_id': str
    }
    # Pseudonymized behavioral data
    analytics_schema = {
        'session_id': str, # Not linked to user_id
        'pseudonym_id': str, # Rotating pseudonym
        'events': list,
        'device_category': str, # Generalized, not specific
        'country': str, # Not city-level
    }
class DataMinimization:
    """Implement data minimization principles (GDPR Art. 5(1)(c))."""

    @staticmethod
    def collect_only_needed(form_data: dict, purpose: str) -> dict:
        """Filter form data down to only the fields needed for the purpose.

        An unknown purpose yields an empty dict, so nothing is collected
        by default.
        """
        REQUIRED_FIELDS = {
            'account_creation': ['email', 'password'],
            'newsletter': ['email'],
            'purchase': ['email', 'name', 'address', 'payment'],
            'support': ['email', 'message']
        }
        allowed = REQUIRED_FIELDS.get(purpose, [])
        return {k: v for k, v in form_data.items() if k in allowed}

    @staticmethod
    def generalize_location(ip_address: str) -> str:
        """Generalize an IP address to a country ISO code only.

        Returns 'UNKNOWN' when the address cannot be resolved. The original
        used a bare ``except`` (which also swallows KeyboardInterrupt) and
        never closed the GeoIP reader; use a context manager and catch only
        lookup/IO failures. ``iso_code`` may be None for some records, so
        default that to 'UNKNOWN' as well.
        """
        import geoip2.database
        import geoip2.errors
        try:
            with geoip2.database.Reader('GeoLite2-Country.mmdb') as reader:
                return reader.country(ip_address).country.iso_code or 'UNKNOWN'
        except (geoip2.errors.AddressNotFoundError, ValueError, OSError):
            return 'UNKNOWN'
```
### Pattern 5: Breach Notification
```python
from datetime import datetime, timedelta
from enum import Enum
from typing import List


class BreachSeverity(Enum):
    """Severity ladder that drives GDPR notification obligations."""
    LOW = "low"
    MEDIUM = "medium"
    HIGH = "high"
    CRITICAL = "critical"


class BreachNotificationHandler:
    """Handle GDPR breach notification requirements (Art. 33/34).

    Fixes vs. original: ``timedelta`` and ``List`` were used without being
    imported (the ``List[str]`` annotation made the class fail to define),
    and the authority-notification deadline was added to the local dict only
    AFTER ``insert_one``, so it was never persisted.
    """

    # Art. 33: supervisory authority must be notified within 72 hours.
    AUTHORITY_NOTIFICATION_HOURS = 72
    # Art. 34: affected individuals are notified from this severity upward.
    AFFECTED_NOTIFICATION_REQUIRED_SEVERITY = BreachSeverity.HIGH

    async def report_breach(
        self,
        description: str,
        data_types: List[str],
        affected_count: int,
        severity: BreachSeverity
    ) -> dict:
        """Record a breach, trigger notifications, and return the breach record.

        Raises whatever the underlying store/notifiers raise; nothing is
        swallowed, since breach handling must not fail silently.
        """
        now = datetime.utcnow()
        breach = {
            'id': self.generate_breach_id(),
            'reported_at': now,
            'description': description,
            'data_types_affected': data_types,
            'affected_individuals_count': affected_count,
            'severity': severity.value,
            'status': 'investigating',
            'timeline': [{
                'event': 'breach_reported',
                'timestamp': now,
                'details': description
            }]
        }
        # Compute the 72-hour deadline BEFORE persisting so it is stored
        # with the record (the original set it only after insert_one).
        notify_authority = self.requires_authority_notification(severity, data_types)
        if notify_authority:
            breach['authority_notification_deadline'] = (
                now + timedelta(hours=self.AUTHORITY_NOTIFICATION_HOURS)
            )
        await self.db.breaches.insert_one(breach)
        # Immediate internal notifications
        await self.notify_dpo(breach)
        await self.notify_security_team(breach)
        if notify_authority:
            await self.schedule_authority_notification(breach)
        # Affected individuals (Art. 34) for HIGH and CRITICAL breaches.
        if severity in (BreachSeverity.HIGH, BreachSeverity.CRITICAL):
            await self.schedule_individual_notifications(breach)
        return breach

    def requires_authority_notification(
        self,
        severity: BreachSeverity,
        data_types: List[str]
    ) -> bool:
        """Determine if the supervisory authority must be notified."""
        # Always notify when special/sensitive categories are involved.
        sensitive_types = ['health', 'financial', 'credentials', 'biometric']
        if any(t in sensitive_types for t in data_types):
            return True
        # Otherwise notify for medium severity and above.
        return severity in (BreachSeverity.MEDIUM, BreachSeverity.HIGH,
                            BreachSeverity.CRITICAL)

    async def generate_authority_report(self, breach_id: str) -> dict:
        """Generate the Art. 33 report payload for the supervisory authority."""
        breach = await self.get_breach(breach_id)
        return {
            'organization': {
                'name': self.config.org_name,
                'contact': self.config.dpo_contact,
                'registration': self.config.registration_number
            },
            'breach': {
                'nature': breach['description'],
                'categories_affected': breach['data_types_affected'],
                'approximate_number_affected': breach['affected_individuals_count'],
                'likely_consequences': self.assess_consequences(breach),
                'measures_taken': await self.get_remediation_measures(breach_id),
                'measures_proposed': await self.get_proposed_measures(breach_id)
            },
            'timeline': breach['timeline'],
            'submitted_at': datetime.utcnow().isoformat()
        }
```
## Compliance Checklist
```markdown
## GDPR Implementation Checklist
### Legal Basis
- [ ] Documented legal basis for each processing activity
- [ ] Consent mechanisms meet GDPR requirements
- [ ] Legitimate interest assessments completed
### Transparency
- [ ] Privacy policy is clear and accessible
- [ ] Processing purposes clearly stated
- [ ] Data retention periods documented
### Data Subject Rights
- [ ] Access request process implemented
- [ ] Erasure request process implemented
- [ ] Portability export available
- [ ] Rectification process available
- [ ] Response within 30-day deadline
### Security
- [ ] Encryption at rest implemented
- [ ] Encryption in transit (TLS)
- [ ] Access controls in place
- [ ] Audit logging enabled
### Breach Response
- [ ] Breach detection mechanisms
- [ ] 72-hour notification process
- [ ] Breach documentation system
### Documentation
- [ ] Records of processing activities (Art. 30)
- [ ] Data protection impact assessments
- [ ] Data processing agreements with vendors
```
## Best Practices
### Do's
- **Minimize data collection** - Only collect what's needed
- **Document everything** - Processing activities, legal bases
- **Encrypt PII** - At rest and in transit
- **Implement access controls** - Need-to-know basis
- **Regular audits** - Verify compliance continuously
### Don'ts
- **Don't pre-check consent boxes** - Must be opt-in
- **Don't bundle consent** - Request a separate opt-in for each processing purpose
- **Don't retain indefinitely** - Define and enforce retention
- **Don't ignore DSARs** - 30-day response required
- **Don't transfer without safeguards** - SCCs or adequacy decisions
## Resources
- [GDPR Full Text](https://gdpr-info.eu/)
- [ICO Guidance](https://ico.org.uk/for-organisations/guide-to-data-protection/guide-to-the-general-data-protection-regulation-gdpr/)
- [EDPB Guidelines](https://edpb.europa.eu/our-work-tools/general-guidance/gdpr-guidelines-recommendations-best-practices_en)

View File

@@ -0,0 +1,383 @@
---
name: incident-runbook-templates
description: Create structured incident response runbooks with step-by-step procedures, escalation paths, and recovery actions. Use when building runbooks, responding to incidents, or establishing incident response procedures.
---
# Incident Runbook Templates
Production-ready templates for incident response runbooks covering detection, triage, mitigation, resolution, and communication.
## When to Use This Skill
- Creating incident response procedures
- Building service-specific runbooks
- Establishing escalation paths
- Documenting recovery procedures
- Responding to active incidents
- Onboarding on-call engineers
## Core Concepts
### 1. Incident Severity Levels
| Severity | Impact | Response Time | Example |
|----------|--------|---------------|---------|
| **SEV1** | Complete outage, data loss | 15 min | Production down |
| **SEV2** | Major degradation | 30 min | Critical feature broken |
| **SEV3** | Minor impact | 2 hours | Non-critical bug |
| **SEV4** | Minimal impact | Next business day | Cosmetic issue |
### 2. Runbook Structure
```
1. Overview & Impact
2. Detection & Alerts
3. Initial Triage
4. Mitigation Steps
5. Root Cause Investigation
6. Resolution Procedures
7. Verification & Rollback
8. Communication Templates
9. Escalation Matrix
```
## Runbook Templates
### Template 1: Service Outage Runbook
```markdown
# [Service Name] Outage Runbook
## Overview
**Service**: Payment Processing Service
**Owner**: Platform Team
**Slack**: #payments-incidents
**PagerDuty**: payments-oncall
## Impact Assessment
- [ ] Which customers are affected?
- [ ] What percentage of traffic is impacted?
- [ ] Are there financial implications?
- [ ] What's the blast radius?
## Detection
### Alerts
- `payment_error_rate > 5%` (PagerDuty)
- `payment_latency_p99 > 2s` (Slack)
- `payment_success_rate < 95%` (PagerDuty)
### Dashboards
- [Payment Service Dashboard](https://grafana/d/payments)
- [Error Tracking](https://sentry.io/payments)
- [Dependency Status](https://status.stripe.com)
## Initial Triage (First 5 Minutes)
### 1. Assess Scope
```bash
# Check service health
kubectl get pods -n payments -l app=payment-service
# Check recent deployments
kubectl rollout history deployment/payment-service -n payments
# Check error rates
curl -s "http://prometheus:9090/api/v1/query?query=sum(rate(http_requests_total{status=~'5..'}[5m]))"
```
### 2. Quick Health Checks
- [ ] Can you reach the service? `curl -I https://api.company.com/payments/health`
- [ ] Database connectivity? Check connection pool metrics
- [ ] External dependencies? Check Stripe, bank API status
- [ ] Recent changes? Check deploy history
### 3. Initial Classification
| Symptom | Likely Cause | Go To Section |
|---------|--------------|---------------|
| All requests failing | Service down | Section 4.1 |
| High latency | Database/dependency | Section 4.2 |
| Partial failures | Code bug | Section 4.3 |
| Spike in errors | Traffic surge | Section 4.4 |
## Mitigation Procedures
### 4.1 Service Completely Down
```bash
# Step 1: Check pod status
kubectl get pods -n payments
# Step 2: If pods are crash-looping, check logs
kubectl logs -n payments -l app=payment-service --tail=100
# Step 3: Check recent deployments
kubectl rollout history deployment/payment-service -n payments
# Step 4: ROLLBACK if recent deploy is suspect
kubectl rollout undo deployment/payment-service -n payments
# Step 5: Scale up if resource constrained
kubectl scale deployment/payment-service -n payments --replicas=10
# Step 6: Verify recovery
kubectl rollout status deployment/payment-service -n payments
```
### 4.2 High Latency
```bash
# Step 1: Check database connections
kubectl exec -n payments deploy/payment-service -- \
  curl localhost:8080/metrics | grep db_pool

# Step 2: Check slow queries (if DB issue)
# Note: a SELECT alias cannot be referenced in WHERE in Postgres, so the
# filter repeats the expression; the alias is fine in ORDER BY.
psql -h $DB_HOST -U $DB_USER -c "
SELECT pid, now() - query_start AS duration, query
FROM pg_stat_activity
WHERE state = 'active' AND now() - query_start > interval '5 seconds'
ORDER BY duration DESC;"

# Step 3: Kill long-running queries if needed (active > 5 minutes)
psql -h $DB_HOST -U $DB_USER -c "
SELECT pg_terminate_backend(pid)
FROM pg_stat_activity
WHERE state = 'active' AND now() - query_start > interval '5 minutes';"

# Step 4: Check external dependency latency
curl -w "@curl-format.txt" -o /dev/null -s https://api.stripe.com/v1/health

# Step 5: Enable circuit breaker if dependency is slow
kubectl set env deployment/payment-service \
  STRIPE_CIRCUIT_BREAKER_ENABLED=true -n payments
```
### 4.3 Partial Failures (Specific Errors)
```bash
# Step 1: Identify error pattern
kubectl logs -n payments -l app=payment-service --tail=500 | \
grep -i error | sort | uniq -c | sort -rn | head -20
# Step 2: Check error tracking
# Go to Sentry: https://sentry.io/payments
# Step 3: If specific endpoint, enable feature flag to disable
curl -X POST https://api.company.com/internal/feature-flags \
-d '{"flag": "DISABLE_PROBLEMATIC_FEATURE", "enabled": true}'
# Step 4: If data issue, check recent data changes
psql -h $DB_HOST -c "
SELECT * FROM audit_log
WHERE table_name = 'payment_methods'
AND created_at > now() - interval '1 hour';"
```
### 4.4 Traffic Surge
```bash
# Step 1: Check current request rate
kubectl top pods -n payments
# Step 2: Scale horizontally
kubectl scale deployment/payment-service -n payments --replicas=20
# Step 3: Enable rate limiting
kubectl set env deployment/payment-service \
RATE_LIMIT_ENABLED=true \
RATE_LIMIT_RPS=1000 -n payments
# Step 4: If attack, block suspicious IPs
kubectl apply -f - <<EOF
apiVersion: networking.k8s.io/v1
kind: NetworkPolicy
metadata:
name: block-suspicious
namespace: payments
spec:
podSelector:
matchLabels:
app: payment-service
ingress:
- from:
- ipBlock:
cidr: 0.0.0.0/0
except:
- 192.168.1.0/24 # Suspicious range
EOF
```
## Verification Steps
```bash
# Verify service is healthy
curl -s https://api.company.com/payments/health | jq
# Verify error rate is back to normal
curl -s "http://prometheus:9090/api/v1/query?query=sum(rate(http_requests_total{status=~'5..'}[5m]))" | jq '.data.result[0].value[1]'
# Verify latency is acceptable
curl -s "http://prometheus:9090/api/v1/query?query=histogram_quantile(0.99,sum(rate(http_request_duration_seconds_bucket[5m]))by(le))" | jq
# Smoke test critical flows
./scripts/smoke-test-payments.sh
```
## Rollback Procedures
```bash
# Rollback Kubernetes deployment
kubectl rollout undo deployment/payment-service -n payments
# Rollback database migration (if applicable)
./scripts/db-rollback.sh $MIGRATION_VERSION
# Rollback feature flag
curl -X POST https://api.company.com/internal/feature-flags \
-d '{"flag": "NEW_PAYMENT_FLOW", "enabled": false}'
```
## Escalation Matrix
| Condition | Escalate To | Contact |
|-----------|-------------|---------|
| > 15 min unresolved SEV1 | Engineering Manager | @manager (Slack) |
| Data breach suspected | Security Team | #security-incidents |
| Financial impact > $10k | Finance + Legal | @finance-oncall |
| Customer communication needed | Support Lead | @support-lead |
## Communication Templates
### Initial Notification (Internal)
```
🚨 INCIDENT: Payment Service Degradation
Severity: SEV2
Status: Investigating
Impact: ~20% of payment requests failing
Start Time: [TIME]
Incident Commander: [NAME]
Current Actions:
- Investigating root cause
- Scaling up service
- Monitoring dashboards
Updates in #payments-incidents
```
### Status Update
```
📊 UPDATE: Payment Service Incident
Status: Mitigating
Impact: Reduced to ~5% failure rate
Duration: 25 minutes
Actions Taken:
- Rolled back deployment v2.3.4 → v2.3.3
- Scaled service from 5 → 10 replicas
Next Steps:
- Continuing to monitor
- Root cause analysis in progress
ETA to Resolution: ~15 minutes
```
### Resolution Notification
```
✅ RESOLVED: Payment Service Incident
Duration: 45 minutes
Impact: ~5,000 affected transactions
Root Cause: Memory leak in v2.3.4
Resolution:
- Rolled back to v2.3.3
- Transactions auto-retried successfully
Follow-up:
- Postmortem scheduled for [DATE]
- Bug fix in progress
```
```
### Template 2: Database Incident Runbook
```markdown
# Database Incident Runbook
## Quick Reference
| Issue | Command |
|-------|---------|
| Check connections | `SELECT count(*) FROM pg_stat_activity;` |
| Kill query | `SELECT pg_terminate_backend(pid);` |
| Check replication lag | `SELECT extract(epoch from (now() - pg_last_xact_replay_timestamp()));` |
| Check locks | `SELECT * FROM pg_locks WHERE NOT granted;` |
## Connection Pool Exhaustion
```sql
-- Check current connections
SELECT datname, usename, state, count(*)
FROM pg_stat_activity
GROUP BY datname, usename, state
ORDER BY count(*) DESC;
-- Identify long-running connections
SELECT pid, usename, datname, state, query_start, query
FROM pg_stat_activity
WHERE state != 'idle'
ORDER BY query_start;
-- Terminate idle connections
SELECT pg_terminate_backend(pid)
FROM pg_stat_activity
WHERE state = 'idle'
AND query_start < now() - interval '10 minutes';
```
## Replication Lag
```sql
-- Check lag on replica
SELECT
CASE
WHEN pg_last_wal_receive_lsn() = pg_last_wal_replay_lsn() THEN 0
ELSE extract(epoch from now() - pg_last_xact_replay_timestamp())
END AS lag_seconds;
-- If lag > 60s, consider:
-- 1. Check network between primary/replica
-- 2. Check replica disk I/O
-- 3. Consider failover if unrecoverable
```
## Disk Space Critical
```bash
# Check disk usage
df -h /var/lib/postgresql/data
# Find large tables
psql -c "SELECT relname, pg_size_pretty(pg_total_relation_size(relid))
FROM pg_catalog.pg_statio_user_tables
ORDER BY pg_total_relation_size(relid) DESC
LIMIT 10;"
# VACUUM to reclaim space
psql -c "VACUUM FULL large_table;"
# If emergency, delete old data or expand disk
```
```
## Best Practices
### Do's
- **Keep runbooks updated** - Review after every incident
- **Test runbooks regularly** - Game days, chaos engineering
- **Include rollback steps** - Always have an escape hatch
- **Document assumptions** - What must be true for steps to work
- **Link to dashboards** - Quick access during stress
### Don'ts
- **Don't assume knowledge** - Write for 3 AM brain
- **Don't skip verification** - Confirm each step worked
- **Don't forget communication** - Keep stakeholders informed
- **Don't work alone** - Escalate early
- **Don't skip postmortems** - Learn from every incident
## Resources
- [Google SRE Book - Incident Management](https://sre.google/sre-book/managing-incidents/)
- [PagerDuty Incident Response](https://response.pagerduty.com/)
- [Atlassian Incident Management](https://www.atlassian.com/incident-management)

View File

@@ -0,0 +1,441 @@
---
name: on-call-handoff-patterns
description: Master on-call shift handoffs with context transfer, escalation procedures, and documentation. Use when transitioning on-call responsibilities, documenting shift summaries, or improving on-call processes.
---
# On-Call Handoff Patterns
Effective patterns for on-call shift transitions, ensuring continuity, context transfer, and reliable incident response across shifts.
## When to Use This Skill
- Transitioning on-call responsibilities
- Writing shift handoff summaries
- Documenting ongoing investigations
- Establishing on-call rotation procedures
- Improving handoff quality
- Onboarding new on-call engineers
## Core Concepts
### 1. Handoff Components
| Component | Purpose |
|-----------|---------|
| **Active Incidents** | What's currently broken |
| **Ongoing Investigations** | Issues being debugged |
| **Recent Changes** | Deployments, configs |
| **Known Issues** | Workarounds in place |
| **Upcoming Events** | Maintenance, releases |
### 2. Handoff Timing
```
Recommended: 30 min overlap between shifts
Outgoing:
├── 15 min: Write handoff document
└── 15 min: Sync call with incoming
Incoming:
├── 15 min: Review handoff document
├── 15 min: Sync call with outgoing
└── 5 min: Verify alerting setup
```
## Templates
### Template 1: Shift Handoff Document
```markdown
# On-Call Handoff: Platform Team
**Outgoing**: @alice (2024-01-15 to 2024-01-22)
**Incoming**: @bob (2024-01-22 to 2024-01-29)
**Handoff Time**: 2024-01-22 09:00 UTC
---
## 🔴 Active Incidents
### None currently active
No active incidents at handoff time.
---
## 🟡 Ongoing Investigations
### 1. Intermittent API Timeouts (ENG-1234)
**Status**: Investigating
**Started**: 2024-01-20
**Impact**: ~0.1% of requests timing out
**Context**:
- Timeouts correlate with database backup window (02:00-03:00 UTC)
- Suspect backup process causing lock contention
- Added extra logging in PR #567 (deployed 01/21)
**Next Steps**:
- [ ] Review new logs after tonight's backup
- [ ] Consider moving backup window if confirmed
**Resources**:
- Dashboard: [API Latency](https://grafana/d/api-latency)
- Thread: #platform-eng (01/20, 14:32)
---
### 2. Memory Growth in Auth Service (ENG-1235)
**Status**: Monitoring
**Started**: 2024-01-18
**Impact**: None yet (proactive)
**Context**:
- Memory usage growing ~5% per day
- No memory leak found in profiling
- Suspect connection pool not releasing properly
**Next Steps**:
- [ ] Review heap dump from 01/21
- [ ] Consider restart if usage > 80%
**Resources**:
- Dashboard: [Auth Service Memory](https://grafana/d/auth-memory)
- Analysis doc: [Memory Investigation](https://docs/eng-1235)
---
## 🟢 Resolved This Shift
### Payment Service Outage (2024-01-19)
- **Duration**: 23 minutes
- **Root Cause**: Database connection exhaustion
- **Resolution**: Rolled back v2.3.4, increased pool size
- **Postmortem**: [POSTMORTEM-89](https://docs/postmortem-89)
- **Follow-up tickets**: ENG-1230, ENG-1231
---
## 📋 Recent Changes
### Deployments
| Service | Version | Time | Notes |
|---------|---------|------|-------|
| api-gateway | v3.2.1 | 01/21 14:00 | Bug fix for header parsing |
| user-service | v2.8.0 | 01/20 10:00 | New profile features |
| auth-service | v4.1.2 | 01/19 16:00 | Security patch |
### Configuration Changes
- 01/21: Increased API rate limit from 1000 to 1500 RPS
- 01/20: Updated database connection pool max from 50 to 75
### Infrastructure
- 01/20: Added 2 nodes to Kubernetes cluster
- 01/19: Upgraded Redis from 6.2 to 7.0
---
## ⚠️ Known Issues & Workarounds
### 1. Slow Dashboard Loading
**Issue**: Grafana dashboards slow on Monday mornings
**Workaround**: Wait 5 min after 08:00 UTC for cache warm-up
**Ticket**: OPS-456 (P3)
### 2. Flaky Integration Test
**Issue**: `test_payment_flow` fails intermittently in CI
**Workaround**: Re-run failed job (usually passes on retry)
**Ticket**: ENG-1200 (P2)
---
## 📅 Upcoming Events
| Date | Event | Impact | Contact |
|------|-------|--------|---------|
| 01/23 02:00 | Database maintenance | 5 min read-only | @dba-team |
| 01/24 14:00 | Major release v5.0 | Monitor closely | @release-team |
| 01/25 | Marketing campaign | 2x traffic expected | @platform |
---
## 📞 Escalation Reminders
| Issue Type | First Escalation | Second Escalation |
|------------|------------------|-------------------|
| Payment issues | @payments-oncall | @payments-manager |
| Auth issues | @auth-oncall | @security-team |
| Database issues | @dba-team | @infra-manager |
| Unknown/severe | @engineering-manager | @vp-engineering |
---
## 🔧 Quick Reference
### Common Commands
```bash
# Check service health
kubectl get pods -A | grep -v Running
# Recent deployments
kubectl get events --sort-by='.lastTimestamp' | tail -20
# Database connections
psql -c "SELECT count(*) FROM pg_stat_activity;"
# Clear cache (emergency only)
redis-cli FLUSHDB
```
### Important Links
- [Runbooks](https://wiki/runbooks)
- [Service Catalog](https://wiki/services)
- [Incident Slack](https://slack.com/incidents)
- [PagerDuty](https://pagerduty.com/schedules)
---
## Handoff Checklist
### Outgoing Engineer
- [x] Document active incidents
- [x] Document ongoing investigations
- [x] List recent changes
- [x] Note known issues
- [x] Add upcoming events
- [x] Sync with incoming engineer
### Incoming Engineer
- [ ] Read this document
- [ ] Join sync call
- [ ] Verify PagerDuty is routing to you
- [ ] Verify Slack notifications working
- [ ] Check VPN/access working
- [ ] Review critical dashboards
```
### Template 2: Quick Handoff (Async)
```markdown
# Quick Handoff: @alice → @bob
## TL;DR
- No active incidents
- 1 investigation ongoing (API timeouts, see ENG-1234)
- Major release tomorrow (01/24) - be ready for issues
## Watch List
1. API latency around 02:00-03:00 UTC (backup window)
2. Auth service memory (restart if > 80%)
## Recent
- Deployed api-gateway v3.2.1 yesterday (stable)
- Increased rate limits to 1500 RPS
## Coming Up
- 01/23 02:00 - DB maintenance (5 min read-only)
- 01/24 14:00 - v5.0 release
## Questions?
I'll be available on Slack until 17:00 today.
```
### Template 3: Incident Handoff (Mid-Incident)
```markdown
# INCIDENT HANDOFF: Payment Service Degradation
**Incident Start**: 2024-01-22 08:15 UTC
**Current Status**: Mitigating
**Severity**: SEV2
---
## Current State
- Error rate: 15% (down from 40%)
- Mitigation in progress: scaling up pods
- ETA to resolution: ~30 min
## What We Know
1. Root cause: Memory pressure on payment-service pods
2. Triggered by: Unusual traffic spike (3x normal)
3. Contributing: Inefficient query in checkout flow
## What We've Done
- Scaled payment-service from 5 → 15 pods
- Enabled rate limiting on checkout endpoint
- Disabled non-critical features
## What Needs to Happen
1. Monitor error rate - should reach <1% in ~15 min
2. If not improving, escalate to @payments-manager
3. Once stable, begin root cause investigation
## Key People
- Incident Commander: @alice (handing off)
- Comms Lead: @charlie
- Technical Lead: @bob (incoming)
## Communication
- Status page: Updated at 08:45
- Customer support: Notified
- Exec team: Aware
## Resources
- Incident channel: #inc-20240122-payment
- Dashboard: [Payment Service](https://grafana/d/payments)
- Runbook: [Payment Degradation](https://wiki/runbooks/payments)
---
**Incoming on-call (@bob) - Please confirm you have:**
- [ ] Joined #inc-20240122-payment
- [ ] Access to dashboards
- [ ] Understand current state
- [ ] Know escalation path
```
## Handoff Sync Meeting
### Agenda (15 minutes)
```markdown
## Handoff Sync: @alice → @bob
1. **Active Issues** (5 min)
- Walk through any ongoing incidents
- Discuss investigation status
- Transfer context and theories
2. **Recent Changes** (3 min)
- Deployments to watch
- Config changes
- Known regressions
3. **Upcoming Events** (3 min)
- Maintenance windows
- Expected traffic changes
- Releases planned
4. **Questions** (4 min)
- Clarify anything unclear
- Confirm access and alerting
- Exchange contact info
```
## On-Call Best Practices
### Before Your Shift
```markdown
## Pre-Shift Checklist
### Access Verification
- [ ] VPN working
- [ ] kubectl access to all clusters
- [ ] Database read access
- [ ] Log aggregator access (Splunk/Datadog)
- [ ] PagerDuty app installed and logged in
### Alerting Setup
- [ ] PagerDuty schedule shows you as primary
- [ ] Phone notifications enabled
- [ ] Slack notifications for incident channels
- [ ] Test alert received and acknowledged
### Knowledge Refresh
- [ ] Review recent incidents (past 2 weeks)
- [ ] Check service changelog
- [ ] Skim critical runbooks
- [ ] Know escalation contacts
### Environment Ready
- [ ] Laptop charged and accessible
- [ ] Phone charged
- [ ] Quiet space available for calls
- [ ] Secondary contact identified (if traveling)
```
### During Your Shift
```markdown
## Daily On-Call Routine
### Morning (start of day)
- [ ] Check overnight alerts
- [ ] Review dashboards for anomalies
- [ ] Check for any P0/P1 tickets created
- [ ] Skim incident channels for context
### Throughout Day
- [ ] Respond to alerts within SLA
- [ ] Document investigation progress
- [ ] Update team on significant issues
- [ ] Triage incoming pages
### End of Day
- [ ] Hand off any active issues
- [ ] Update investigation docs
- [ ] Note anything for next shift
```
### After Your Shift
```markdown
## Post-Shift Checklist
- [ ] Complete handoff document
- [ ] Sync with incoming on-call
- [ ] Verify PagerDuty routing changed
- [ ] Close/update investigation tickets
- [ ] File postmortems for any incidents
- [ ] Take time off if shift was stressful
```
## Escalation Guidelines
### When to Escalate
```markdown
## Escalation Triggers
### Immediate Escalation
- SEV1 incident declared
- Data breach suspected
- Unable to diagnose within 30 min
- Customer or legal escalation received
### Consider Escalation
- Issue spans multiple teams
- Requires expertise you don't have
- Business impact exceeds threshold
- You're uncertain about next steps
### How to Escalate
1. Page the appropriate escalation path
2. Provide brief context in Slack
3. Stay engaged until escalation acknowledges
4. Hand off cleanly, don't just disappear
```
## Best Practices
### Do's
- **Document everything** - Future you will thank you
- **Escalate early** - Better safe than sorry
- **Take breaks** - Alert fatigue is real
- **Keep handoffs synchronous** - Async loses context
- **Test your setup** - Before incidents, not during
### Don'ts
- **Don't skip handoffs** - Context loss causes incidents
- **Don't hero** - Escalate when needed
- **Don't ignore alerts** - Even if they seem minor
- **Don't work sick** - Swap shifts instead
- **Don't disappear** - Stay reachable during shift
## Resources
- [Google SRE - Being On-Call](https://sre.google/sre-book/being-on-call/)
- [PagerDuty On-Call Guide](https://www.pagerduty.com/resources/learn/on-call-management/)
- [Increment On-Call Issue](https://increment.com/on-call/)

View File

@@ -0,0 +1,374 @@
---
name: postmortem-writing
description: Write effective blameless postmortems with root cause analysis, timelines, and action items. Use when conducting incident reviews, writing postmortem documents, or improving incident response processes.
---
# Postmortem Writing
Comprehensive guide to writing effective, blameless postmortems that drive organizational learning and prevent incident recurrence.
## When to Use This Skill
- Conducting post-incident reviews
- Writing postmortem documents
- Facilitating blameless postmortem meetings
- Identifying root causes and contributing factors
- Creating actionable follow-up items
- Building organizational learning culture
## Core Concepts
### 1. Blameless Culture
| Blame-Focused | Blameless |
|---------------|-----------|
| "Who caused this?" | "What conditions allowed this?" |
| "Someone made a mistake" | "The system allowed this mistake" |
| Punish individuals | Improve systems |
| Hide information | Share learnings |
| Fear of speaking up | Psychological safety |
### 2. Postmortem Triggers
- SEV1 or SEV2 incidents
- Customer-facing outages > 15 minutes
- Data loss or security incidents
- Near-misses that could have been severe
- Novel failure modes
- Incidents requiring unusual intervention
## Quick Start
### Postmortem Timeline
```
Day 0: Incident occurs
Day 1-2: Draft postmortem document
Day 3-5: Postmortem meeting
Day 5-7: Finalize document, create tickets
Week 2+: Action item completion
Quarterly: Review patterns across incidents
```
## Templates
### Template 1: Standard Postmortem
```markdown
# Postmortem: [Incident Title]
**Date**: 2024-01-15
**Authors**: @alice, @bob
**Status**: Draft | In Review | Final
**Incident Severity**: SEV2
**Incident Duration**: 47 minutes
## Executive Summary
On January 15, 2024, the payment processing service experienced a 47-minute outage affecting approximately 12,000 customers. The root cause was a database connection pool exhaustion triggered by a configuration change in deployment v2.3.4. The incident was resolved by rolling back to v2.3.3 and increasing connection pool limits.
**Impact**:
- 12,000 customers unable to complete purchases
- Estimated revenue loss: $45,000
- 847 support tickets created
- No data loss or security implications
## Timeline (All times UTC)
| Time | Event |
|------|-------|
| 14:23 | Deployment v2.3.4 completed to production |
| 14:31 | First alert: `payment_error_rate > 5%` |
| 14:33 | On-call engineer @alice acknowledges alert |
| 14:35 | Initial investigation begins, error rate at 23% |
| 14:41 | Incident declared SEV2, @bob joins |
| 14:45 | Database connection exhaustion identified |
| 14:52 | Decision to rollback deployment |
| 14:58 | Rollback to v2.3.3 initiated |
| 15:10 | Rollback complete, error rate dropping |
| 15:18 | Service fully recovered, incident resolved |
## Root Cause Analysis
### What Happened
The v2.3.4 deployment included a change to the database query pattern that inadvertently removed connection pooling for a frequently-called endpoint. Each request opened a new database connection instead of reusing pooled connections.
### Why It Happened
1. **Proximate Cause**: Code change in `PaymentRepository.java` replaced pooled `DataSource` with direct `DriverManager.getConnection()` calls.
2. **Contributing Factors**:
- Code review did not catch the connection handling change
- No integration tests specifically for connection pool behavior
- Staging environment has lower traffic, masking the issue
- Database connection metrics alert threshold was too high (90%)
3. **5 Whys Analysis**:
- Why did the service fail? → Database connections exhausted
- Why were connections exhausted? → Each request opened new connection
- Why did each request open new connection? → Code bypassed connection pool
- Why did code bypass connection pool? → Developer unfamiliar with codebase patterns
- Why was developer unfamiliar? → No documentation on connection management patterns
### System Diagram
```
[Client] → [Load Balancer] → [Payment Service] → [Database]
Connection Pool (broken)
Direct connections (cause)
```
## Detection
### What Worked
- Error rate alert fired within 8 minutes of deployment
- Grafana dashboard clearly showed connection spike
- On-call response was swift (2 minute acknowledgment)
### What Didn't Work
- Database connection metric alert threshold too high
- No deployment-correlated alerting
- Canary deployment would have caught this earlier
### Detection Gap
The deployment completed at 14:23, but the first alert didn't fire until 14:31 (8 minutes). A deployment-aware alert could have detected the issue faster.
## Response
### What Worked
- On-call engineer quickly identified database as the issue
- Rollback decision was made decisively
- Clear communication in incident channel
### What Could Be Improved
- Took 10 minutes to correlate issue with recent deployment
- Had to manually check deployment history
- Rollback took 12 minutes (could be faster)
## Impact
### Customer Impact
- 12,000 unique customers affected
- Average impact duration: 35 minutes
- 847 support tickets (~7% of affected users)
- Customer satisfaction score dropped 12 points
### Business Impact
- Estimated revenue loss: $45,000
- Support cost: ~$2,500 (agent time)
- Engineering time: ~8 person-hours
### Technical Impact
- Database primary experienced elevated load
- Some replica lag during incident
- No permanent damage to systems
## Lessons Learned
### What Went Well
1. Alerting detected the issue before customer reports
2. Team collaborated effectively under pressure
3. Rollback procedure worked smoothly
4. Communication was clear and timely
### What Went Wrong
1. Code review missed critical change
2. Test coverage gap for connection pooling
3. Staging environment doesn't reflect production traffic
4. Alert thresholds were not tuned properly
### Where We Got Lucky
1. Incident occurred during business hours with full team available
2. Database handled the load without failing completely
3. No other incidents occurred simultaneously
## Action Items
| Priority | Action | Owner | Due Date | Ticket |
|----------|--------|-------|----------|--------|
| P0 | Add integration test for connection pool behavior | @alice | 2024-01-22 | ENG-1234 |
| P0 | Lower database connection alert threshold to 70% | @bob | 2024-01-17 | OPS-567 |
| P1 | Document connection management patterns | @alice | 2024-01-29 | DOC-89 |
| P1 | Implement deployment-correlated alerting | @bob | 2024-02-05 | OPS-568 |
| P2 | Evaluate canary deployment strategy | @charlie | 2024-02-15 | ENG-1235 |
| P2 | Load test staging with production-like traffic | @dave | 2024-02-28 | QA-123 |
## Appendix
### Supporting Data
#### Error Rate Graph
[Link to Grafana dashboard snapshot]
#### Database Connection Graph
[Link to metrics]
### Related Incidents
- 2023-11-02: Similar connection issue in User Service (POSTMORTEM-42)
### References
- [Connection Pool Best Practices](internal-wiki/connection-pools)
- [Deployment Runbook](internal-wiki/deployment-runbook)
```
### Template 2: 5 Whys Analysis
```markdown
# 5 Whys Analysis: [Incident]
## Problem Statement
Payment service experienced 47-minute outage due to database connection exhaustion.
## Analysis
### Why #1: Why did the service fail?
**Answer**: Database connections were exhausted, causing all new requests to fail.
**Evidence**: Metrics showed connection count at 100/100 (max), with 500+ pending requests.
---
### Why #2: Why were database connections exhausted?
**Answer**: Each incoming request opened a new database connection instead of using the connection pool.
**Evidence**: Code diff shows direct `DriverManager.getConnection()` instead of pooled `DataSource`.
---
### Why #3: Why did the code bypass the connection pool?
**Answer**: A developer refactored the repository class and inadvertently changed the connection acquisition method.
**Evidence**: PR #1234 shows the change, made while fixing a different bug.
---
### Why #4: Why wasn't this caught in code review?
**Answer**: The reviewer focused on the functional change (the bug fix) and didn't notice the infrastructure change.
**Evidence**: Review comments only discuss business logic.
---
### Why #5: Why isn't there a safety net for this type of change?
**Answer**: We lack automated tests that verify connection pool behavior and lack documentation about our connection patterns.
**Evidence**: Test suite has no tests for connection handling; wiki has no article on database connections.
## Root Causes Identified
1. **Primary**: Missing automated tests for infrastructure behavior
2. **Secondary**: Insufficient documentation of architectural patterns
3. **Tertiary**: Code review checklist doesn't include infrastructure considerations
## Systemic Improvements
| Root Cause | Improvement | Type |
|------------|-------------|------|
| Missing tests | Add infrastructure behavior tests | Prevention |
| Missing docs | Document connection patterns | Prevention |
| Review gaps | Update review checklist | Detection |
| No canary | Implement canary deployments | Mitigation |
```
### Template 3: Quick Postmortem (Minor Incidents)
```markdown
# Quick Postmortem: [Brief Title]
**Date**: 2024-01-15 | **Duration**: 12 min | **Severity**: SEV3
## What Happened
API latency spiked to 5s due to cache miss storm after cache flush.
## Timeline
- 10:00 - Cache flush initiated for config update
- 10:02 - Latency alerts fire
- 10:05 - Identified as cache miss storm
- 10:08 - Enabled cache warming
- 10:12 - Latency normalized
## Root Cause
Full cache flush for minor config update caused thundering herd.
## Fix
- Immediate: Enabled cache warming
- Long-term: Implement partial cache invalidation (ENG-999)
## Lessons
Don't full-flush cache in production; use targeted invalidation.
```
## Facilitation Guide
### Running a Postmortem Meeting
```markdown
## Meeting Structure (60 minutes)
### 1. Opening (5 min)
- Remind everyone of blameless culture
- "We're here to learn, not to blame"
- Review meeting norms
### 2. Timeline Review (15 min)
- Walk through events chronologically
- Ask clarifying questions
- Identify gaps in timeline
### 3. Analysis Discussion (20 min)
- What failed?
- Why did it fail?
- What conditions allowed this?
- What would have prevented it?
### 4. Action Items (15 min)
- Brainstorm improvements
- Prioritize by impact and effort
- Assign owners and due dates
### 5. Closing (5 min)
- Summarize key learnings
- Confirm action item owners
- Schedule follow-up if needed
## Facilitation Tips
- Keep discussion on track
- Redirect blame to systems
- Encourage quiet participants
- Document dissenting views
- Time-box tangents
```
## Anti-Patterns to Avoid
| Anti-Pattern | Problem | Better Approach |
|--------------|---------|-----------------|
| **Blame game** | Shuts down learning | Focus on systems |
| **Shallow analysis** | Doesn't prevent recurrence | Ask "why" 5 times |
| **No action items** | Waste of time | Always have concrete next steps |
| **Unrealistic actions** | Never completed | Scope to achievable tasks |
| **No follow-up** | Actions forgotten | Track in ticketing system |
## Best Practices
### Do's
- **Start immediately** - Memory fades fast
- **Be specific** - Exact times, exact errors
- **Include graphs** - Visual evidence
- **Assign owners** - No orphan action items
- **Share widely** - Organizational learning
### Don'ts
- **Don't name and shame** - Ever
- **Don't skip small incidents** - They reveal patterns
- **Don't make it a blame doc** - That kills learning
- **Don't create busywork** - Actions should be meaningful
- **Don't skip follow-up** - Verify actions completed
## Resources
- [Google SRE - Postmortem Culture](https://sre.google/sre-book/postmortem-culture/)
- [Etsy's Blameless Postmortems](https://codeascraft.com/2012/05/22/blameless-postmortems/)
- [PagerDuty Postmortem Guide](https://postmortems.pagerduty.com/)

View File

@@ -0,0 +1,43 @@
# Vector Database Engineer
Expert in vector databases, embedding strategies, and semantic search implementation. Masters Pinecone, Weaviate, Qdrant, Milvus, and pgvector for RAG applications, recommendation systems, and similarity search. Use PROACTIVELY for vector search implementation, embedding optimization, or semantic retrieval systems.
## Capabilities
- Vector database selection and architecture
- Embedding model selection and optimization
- Index configuration (HNSW, IVF, PQ)
- Hybrid search (vector + keyword) implementation
- Chunking strategies for documents
- Metadata filtering and pre/post-filtering
- Performance tuning and scaling
## When to Use
- Building RAG (Retrieval Augmented Generation) systems
- Implementing semantic search over documents
- Creating recommendation engines
- Building image/audio similarity search
- Optimizing vector search latency and recall
- Scaling vector operations to millions of vectors
## Workflow
1. Analyze data characteristics and query patterns
2. Select appropriate embedding model
3. Design chunking and preprocessing pipeline
4. Choose vector database and index type
5. Configure metadata schema for filtering
6. Implement hybrid search if needed
7. Optimize for latency/recall tradeoffs
8. Set up monitoring and reindexing strategies
## Best Practices
- Choose embedding dimensions based on use case (384-3072)
- Implement proper chunking with overlap
- Use metadata filtering to reduce search space
- Monitor embedding drift over time
- Plan for index rebuilding
- Cache frequent queries
- Test recall vs latency tradeoffs

View File

@@ -0,0 +1,479 @@
---
name: embedding-strategies
description: Select and optimize embedding models for semantic search and RAG applications. Use when choosing embedding models, implementing chunking strategies, or optimizing embedding quality for specific domains.
---
# Embedding Strategies
Guide to selecting and optimizing embedding models for vector search applications.
## When to Use This Skill
- Choosing embedding models for RAG
- Optimizing chunking strategies
- Fine-tuning embeddings for domains
- Comparing embedding model performance
- Reducing embedding dimensions
- Handling multilingual content
## Core Concepts
### 1. Embedding Model Comparison
| Model | Dimensions | Max Tokens | Best For |
|-------|------------|------------|----------|
| **text-embedding-3-large** | 3072 | 8191 | High accuracy |
| **text-embedding-3-small** | 1536 | 8191 | Cost-effective |
| **voyage-2** | 1024 | 4000 | Code, legal |
| **bge-large-en-v1.5** | 1024 | 512 | Open source |
| **all-MiniLM-L6-v2** | 384 | 256 | Fast, lightweight |
| **multilingual-e5-large** | 1024 | 512 | Multi-language |
### 2. Embedding Pipeline
```
Document → Chunking → Preprocessing → Embedding Model → Vector
[Overlap, Size] [Clean, Normalize] [API/Local]
```
## Templates
### Template 1: OpenAI Embeddings
```python
from openai import OpenAI
from typing import List
import numpy as np
client = OpenAI()
def get_embeddings(
    texts: List[str],
    model: str = "text-embedding-3-small",
    dimensions: int = None
) -> List[List[float]]:
    """Embed a list of texts via the OpenAI embeddings API.

    Requests are issued in batches of 100 inputs. When ``dimensions`` is
    given it is forwarded to the API (Matryoshka truncation); otherwise the
    model's native dimensionality is used. Returns one vector per input,
    in the original order.
    """
    batch_size = 100
    results: List[List[float]] = []
    for start in range(0, len(texts), batch_size):
        window = texts[start:start + batch_size]
        params = {"input": window, "model": model}
        if dimensions:
            params["dimensions"] = dimensions
        response = client.embeddings.create(**params)
        results.extend(item.embedding for item in response.data)
    return results
def get_embedding(text: str, **kwargs) -> List[float]:
    """Embed a single text; thin convenience wrapper over get_embeddings."""
    (embedding,) = get_embeddings([text], **kwargs)
    return embedding
# Dimension reduction with OpenAI
def get_reduced_embedding(text: str, dimensions: int = 512) -> List[float]:
    """Get embedding with reduced dimensions (Matryoshka)."""
    # text-embedding-3 models support server-side dimension truncation.
    params = {"model": "text-embedding-3-small", "dimensions": dimensions}
    return get_embedding(text, **params)
```
### Template 2: Local Embeddings with Sentence Transformers
```python
from sentence_transformers import SentenceTransformer
from typing import List, Optional
import numpy as np
class LocalEmbedder:
    """Local embedding with sentence-transformers.

    Wraps a SentenceTransformer model and exposes helpers for embedding
    queries (with the BGE retrieval-instruction prefix when applicable)
    and documents.
    """

    def __init__(
        self,
        model_name: str = "BAAI/bge-large-en-v1.5",
        device: str = "cuda"
    ):
        # Keep the model name so embed_query can detect BGE-family models.
        self.model_name = model_name
        self.model = SentenceTransformer(model_name, device=device)

    def embed(
        self,
        texts: List[str],
        normalize: bool = True,
        show_progress: bool = False
    ) -> np.ndarray:
        """Embed texts with optional L2 normalization; returns a numpy array."""
        embeddings = self.model.encode(
            texts,
            normalize_embeddings=normalize,
            show_progress_bar=show_progress,
            convert_to_numpy=True
        )
        return embeddings

    def embed_query(self, query: str) -> np.ndarray:
        """Embed a query, adding the BGE-style instruction prefix if needed."""
        # BUG FIX: the original tested
        # `"bge" in self.model.get_sentence_embedding_dimension()`, which is
        # `str in int` and raises TypeError at runtime. BGE models are
        # identified by their name instead.
        if "bge" in self.model_name.lower():
            query = f"Represent this sentence for searching relevant passages: {query}"
        return self.embed([query])[0]

    def embed_documents(self, documents: List[str]) -> np.ndarray:
        """Embed documents for indexing."""
        return self.embed(documents)
# E5 model with instructions
class E5Embedder:
    """multilingual-e5 wrapper; E5 models expect "query:"/"passage:" prefixes."""

    def __init__(self, model_name: str = "intfloat/multilingual-e5-large"):
        self.model = SentenceTransformer(model_name)

    def embed_query(self, query: str) -> np.ndarray:
        # E5 was trained with an explicit query-side prefix.
        prefixed = f"query: {query}"
        return self.model.encode(prefixed)

    def embed_document(self, document: str) -> np.ndarray:
        # Documents use the matching passage-side prefix.
        prefixed = f"passage: {document}"
        return self.model.encode(prefixed)
```
### Template 3: Chunking Strategies
```python
from typing import List, Tuple
import re
def chunk_by_tokens(
    text: str,
    chunk_size: int = 512,
    chunk_overlap: int = 50,
    tokenizer=None
) -> List[str]:
    """Split text into overlapping windows of at most chunk_size tokens.

    Fixes vs. the original:
    - tiktoken is imported only when no tokenizer is supplied, so callers
      providing their own encoder don't need the dependency installed.
    - chunk_overlap >= chunk_size is rejected (it made the loop never
      terminate, since start would not advance).
    - the loop stops once a window reaches the end of the token stream,
      instead of emitting one extra tail chunk fully contained in the
      previous window's overlap.
    """
    if chunk_overlap >= chunk_size:
        raise ValueError("chunk_overlap must be smaller than chunk_size")
    if tokenizer is None:
        import tiktoken
        tokenizer = tiktoken.get_encoding("cl100k_base")
    tokens = tokenizer.encode(text)
    chunks = []
    start = 0
    while start < len(tokens):
        end = start + chunk_size
        chunks.append(tokenizer.decode(tokens[start:end]))
        if end >= len(tokens):
            break  # final window already covers the remaining tokens
        start = end - chunk_overlap
    return chunks
def chunk_by_sentences(
    text: str,
    max_chunk_size: int = 1000,
    min_chunk_size: int = 100
) -> List[str]:
    """Chunk text by sentences, respecting size limits.

    Sentences are packed greedily into chunks of at most ~max_chunk_size
    characters (a single sentence longer than the limit still becomes its
    own chunk).

    Fix vs. the original: min_chunk_size was accepted but never used; a
    trailing chunk shorter than min_chunk_size is now merged into the
    previous chunk so no undersized fragment is emitted.
    """
    import nltk
    sentences = nltk.sent_tokenize(text)
    chunks = []
    current_chunk = []
    current_size = 0
    for sentence in sentences:
        sentence_size = len(sentence)
        # Flush the current chunk before it would exceed the limit.
        if current_size + sentence_size > max_chunk_size and current_chunk:
            chunks.append(" ".join(current_chunk))
            current_chunk = []
            current_size = 0
        current_chunk.append(sentence)
        current_size += sentence_size
    if current_chunk:
        chunks.append(" ".join(current_chunk))
    # Honor min_chunk_size: fold an undersized tail into its predecessor.
    if len(chunks) >= 2 and len(chunks[-1]) < min_chunk_size:
        tail = chunks.pop()
        chunks[-1] = chunks[-1] + " " + tail
    return chunks
def chunk_by_semantic_sections(
    text: str,
    headers_pattern: str = r'^#{1,3}\s+.+$'
) -> List[Tuple[str, str]]:
    """Chunk markdown by headers, preserving hierarchy.

    Returns (header, content) pairs in document order. Text appearing
    before the first header is returned with an empty-string header.

    Fix vs. the original: a header immediately followed by another header
    (or ending the document) was silently dropped because sections were
    only flushed when they had body content; empty sections are now kept
    as (header, "") pairs.
    """
    lines = text.split('\n')
    chunks = []
    current_header = ""
    current_content = []
    for line in lines:
        if re.match(headers_pattern, line, re.MULTILINE):
            # Flush the previous section unless we are still in the
            # initial state (no header seen, no preamble accumulated).
            if current_content or current_header:
                chunks.append((current_header, '\n'.join(current_content)))
            current_header = line
            current_content = []
        else:
            current_content.append(line)
    if current_content or current_header:
        chunks.append((current_header, '\n'.join(current_content)))
    return chunks
def recursive_character_splitter(
    text: str,
    chunk_size: int = 1000,
    chunk_overlap: int = 200,
    separators: List[str] = None
) -> List[str]:
    """LangChain-style recursive splitter.

    Splits on the coarsest separator first ("\\n\\n", then "\\n", ". ", " ",
    finally character-level), packing consecutive pieces into chunks of at
    most ~chunk_size characters. Up to chunk_overlap characters of trailing
    pieces are carried into the next chunk. A chunk still larger than
    chunk_size is re-split recursively with the finer separators.

    NOTE(review): if chunk_overlap >= chunk_size, the character-level
    branch uses a non-positive range step (ValueError for equal, empty
    output for greater) — confirm callers keep overlap < chunk_size.
    """
    separators = separators or ["\n\n", "\n", ". ", " ", ""]
    def split_text(text: str, separators: List[str]) -> List[str]:
        if not text:
            return []
        # Current separator is the coarsest; the rest are for recursion.
        separator = separators[0]
        remaining_separators = separators[1:]
        if separator == "":
            # Character-level split (base case): fixed-size windows with overlap.
            return [text[i:i+chunk_size] for i in range(0, len(text), chunk_size - chunk_overlap)]
        splits = text.split(separator)
        chunks = []
        current_chunk = []   # pieces accumulated for the chunk being built
        current_length = 0   # approximate character length incl. separators
        for split in splits:
            split_length = len(split) + len(separator)
            if current_length + split_length > chunk_size and current_chunk:
                chunk_text = separator.join(current_chunk)
                # Recursively split if still too large
                if len(chunk_text) > chunk_size and remaining_separators:
                    chunks.extend(split_text(chunk_text, remaining_separators))
                else:
                    chunks.append(chunk_text)
                # Start new chunk with overlap: take trailing pieces of the
                # flushed chunk, newest-first, until chunk_overlap is filled.
                overlap_splits = []
                overlap_length = 0
                for s in reversed(current_chunk):
                    if overlap_length + len(s) <= chunk_overlap:
                        overlap_splits.insert(0, s)
                        overlap_length += len(s)
                    else:
                        break
                current_chunk = overlap_splits
                current_length = overlap_length
            current_chunk.append(split)
            current_length += split_length
        if current_chunk:
            chunks.append(separator.join(current_chunk))
        return chunks
    return split_text(text, separators)
```
### Template 4: Domain-Specific Embedding Pipeline
```python
class DomainEmbeddingPipeline:
    """Pipeline for domain-specific embeddings.

    Cleans, chunks, and embeds documents, producing one record per chunk
    ready for insertion into a vector store. Relies on the module-level
    helpers get_embeddings() and chunk_by_tokens() defined earlier in
    this file.
    """
    def __init__(
        self,
        embedding_model: str = "text-embedding-3-small",
        chunk_size: int = 512,
        chunk_overlap: int = 50,
        preprocessing_fn=None
    ):
        # Model name is forwarded to get_embeddings().
        self.embedding_model = embedding_model
        self.chunk_size = chunk_size
        self.chunk_overlap = chunk_overlap
        # Caller-supplied cleaner, or the built-in whitespace/punctuation cleanup.
        self.preprocess = preprocessing_fn or self._default_preprocess

    def _default_preprocess(self, text: str) -> str:
        """Default preprocessing: collapse whitespace, strip odd characters."""
        # Remove excessive whitespace
        text = re.sub(r'\s+', ' ', text)
        # Remove special characters (keeps word chars, spaces, basic punctuation)
        text = re.sub(r'[^\w\s.,!?-]', '', text)
        return text.strip()

    async def process_documents(
        self,
        documents: List[dict],
        id_field: str = "id",
        content_field: str = "content",
        metadata_fields: List[str] = None
    ) -> List[dict]:
        """Process documents for vector storage.

        Each input document is cleaned, chunked by tokens, and embedded;
        the result is a flat list of chunk records with ids of the form
        "<doc_id>_chunk_<i>", the chunk text, its embedding, and any
        requested metadata fields copied from the source document.

        NOTE(review): declared async but contains no await — the helpers
        it calls are synchronous; confirm whether callers rely on the
        coroutine interface.
        """
        processed = []
        for doc in documents:
            content = doc[content_field]
            doc_id = doc[id_field]
            # Preprocess
            cleaned = self.preprocess(content)
            # Chunk
            chunks = chunk_by_tokens(
                cleaned,
                self.chunk_size,
                self.chunk_overlap
            )
            # Create embeddings
            embeddings = get_embeddings(chunks, self.embedding_model)
            # Create records (one per chunk, linked back to the document)
            for i, (chunk, embedding) in enumerate(zip(chunks, embeddings)):
                record = {
                    "id": f"{doc_id}_chunk_{i}",
                    "document_id": doc_id,
                    "chunk_index": i,
                    "text": chunk,
                    "embedding": embedding
                }
                # Add metadata (only fields actually present on the document)
                if metadata_fields:
                    for field in metadata_fields:
                        if field in doc:
                            record[field] = doc[field]
                processed.append(record)
        return processed
# Code-specific pipeline
class CodeEmbeddingPipeline:
    """Embedding pipeline tailored to source code rather than prose."""

    def __init__(self, model: str = "voyage-code-2"):
        # Identifier of the code-specialized embedding model.
        self.model = model

    def chunk_code(self, code: str, language: str) -> List[dict]:
        """Split source code into function/class-level chunks.

        Stub: a real implementation would parse ``code`` with
        tree-sitter, extract functions/classes/methods, and return
        chunks carrying their surrounding context.
        """
        import tree_sitter
        # TODO: parse, walk the syntax tree, and build chunk dicts.
        pass

    def embed_with_context(self, chunk: str, context: str) -> List[float]:
        """Embed a code chunk prefixed with its surrounding context."""
        prompt = f"Context: {context}\n\nCode:\n{chunk}"
        return get_embedding(prompt, model=self.model)
```
### Template 5: Embedding Quality Evaluation
```python
import numpy as np
from typing import List, Tuple
def evaluate_retrieval_quality(
    queries: List[str],
    relevant_docs: List[List[str]],  # List of relevant doc IDs per query
    retrieved_docs: List[List[str]],  # List of retrieved doc IDs per query
    k: int = 10
) -> dict:
    """Score retrieval quality with precision, recall, MRR, and nDCG at ``k``.

    Args:
        queries: Query texts (kept for API symmetry; not used in scoring).
        relevant_docs: Ground-truth relevant doc IDs, one list per query.
        retrieved_docs: Retrieved doc IDs in rank order, one list per query.
        k: Rank cutoff for the @k metrics.

    Returns:
        Mapping of metric name to its mean over all queries.
    """
    def _precision(gold: set, ranked: List[str]) -> float:
        # Fraction of the top-k slots filled with relevant docs.
        return len(set(ranked[:k]) & gold) / k

    def _recall(gold: set, ranked: List[str]) -> float:
        # Fraction of relevant docs recovered within the top k.
        hits = len(set(ranked[:k]) & gold)
        return hits / len(gold) if gold else 0

    def _mrr(gold: set, ranked: List[str]) -> float:
        # Reciprocal rank of the first relevant hit, 0 if none found.
        for position, doc_id in enumerate(ranked):
            if doc_id in gold:
                return 1 / (position + 1)
        return 0

    def _ndcg(gold: set, ranked: List[str]) -> float:
        # Binary-gain DCG normalized by the ideal DCG at the same cutoff.
        gain = sum(
            1 / np.log2(position + 2) if doc_id in gold else 0
            for position, doc_id in enumerate(ranked[:k])
        )
        ideal = sum(1 / np.log2(position + 2) for position in range(min(len(gold), k)))
        return gain / ideal if ideal > 0 else 0

    per_query = {
        f"precision@{k}": [],
        f"recall@{k}": [],
        "mrr": [],
        f"ndcg@{k}": []
    }
    for gold_ids, ranked in zip(relevant_docs, retrieved_docs):
        gold = set(gold_ids)
        per_query[f"precision@{k}"].append(_precision(gold, ranked))
        per_query[f"recall@{k}"].append(_recall(gold, ranked))
        per_query["mrr"].append(_mrr(gold, ranked))
        per_query[f"ndcg@{k}"].append(_ndcg(gold, ranked))
    return {name: np.mean(values) for name, values in per_query.items()}
def compute_embedding_similarity(
    embeddings1: np.ndarray,
    embeddings2: np.ndarray,
    metric: str = "cosine"
) -> np.ndarray:
    """Compute a pairwise similarity matrix between two embedding sets.

    Args:
        embeddings1: Array of shape (n1, dim).
        embeddings2: Array of shape (n2, dim).
        metric: "cosine", "euclidean", or "dot".

    Returns:
        (n1, n2) matrix where larger values mean more similar
        (euclidean distances are negated for this reason).

    Raises:
        ValueError: If ``metric`` is not one of the supported names
            (previously an unknown metric silently returned None).
    """
    if metric == "cosine":
        # Normalize rows so the dot product becomes cosine similarity.
        norm1 = embeddings1 / np.linalg.norm(embeddings1, axis=1, keepdims=True)
        norm2 = embeddings2 / np.linalg.norm(embeddings2, axis=1, keepdims=True)
        return norm1 @ norm2.T
    elif metric == "euclidean":
        from scipy.spatial.distance import cdist
        # Negate so that larger is always "more similar", matching the others.
        return -cdist(embeddings1, embeddings2, metric='euclidean')
    elif metric == "dot":
        return embeddings1 @ embeddings2.T
    else:
        raise ValueError(f"Unsupported similarity metric: {metric!r}")
```
## Best Practices
### Do's
- **Match model to use case** - Code vs prose vs multilingual
- **Chunk thoughtfully** - Preserve semantic boundaries
- **Normalize embeddings** - For cosine similarity
- **Batch requests** - More efficient than one-by-one
- **Cache embeddings** - Avoid recomputing
### Don'ts
- **Don't ignore token limits** - Truncation loses info
- **Don't mix embedding models** - Incompatible spaces
- **Don't skip preprocessing** - Garbage in, garbage out
- **Don't over-chunk** - Lose context
## Resources
- [OpenAI Embeddings](https://platform.openai.com/docs/guides/embeddings)
- [Sentence Transformers](https://www.sbert.net/)
- [MTEB Benchmark](https://huggingface.co/spaces/mteb/leaderboard)

View File

@@ -0,0 +1,568 @@
---
name: hybrid-search-implementation
description: Combine vector and keyword search for improved retrieval. Use when implementing RAG systems, building search engines, or when neither approach alone provides sufficient recall.
---
# Hybrid Search Implementation
Patterns for combining vector similarity and keyword-based search.
## When to Use This Skill
- Building RAG systems with improved recall
- Combining semantic understanding with exact matching
- Handling queries with specific terms (names, codes)
- Improving search for domain-specific vocabulary
- When pure vector search misses keyword matches
## Core Concepts
### 1. Hybrid Search Architecture
```
Query → ┬─► Vector Search ──► Candidates ─┐
│ │
└─► Keyword Search ─► Candidates ─┴─► Fusion ─► Results
```
### 2. Fusion Methods
| Method | Description | Best For |
|--------|-------------|----------|
| **RRF** | Reciprocal Rank Fusion | General purpose |
| **Linear** | Weighted sum of scores | Tunable balance |
| **Cross-encoder** | Rerank with neural model | Highest quality |
| **Cascade** | Filter then rerank | Efficiency |
## Templates
### Template 1: Reciprocal Rank Fusion
```python
from typing import List, Dict, Tuple
from collections import defaultdict
def reciprocal_rank_fusion(
    result_lists: List[List[Tuple[str, float]]],
    k: int = 60,
    weights: List[float] = None
) -> List[Tuple[str, float]]:
    """Fuse several ranked result lists with Reciprocal Rank Fusion.

    Args:
        result_lists: One (doc_id, score) list per search method; only the
            rank positions matter, the raw scores are ignored.
        k: RRF constant — larger values flatten the contribution of
            top-ranked documents.
        weights: Optional per-list weights (defaults to 1.0 each).

    Returns:
        (doc_id, fused_score) tuples sorted by fused score, descending.
    """
    list_weights = weights if weights is not None else [1.0] * len(result_lists)
    fused = defaultdict(float)
    for ranking, weight in zip(result_lists, list_weights):
        for position, (doc_id, _) in enumerate(ranking):
            # RRF contribution: weight / (k + 1-based rank).
            fused[doc_id] += weight * (1.0 / (k + position + 1))
    return sorted(fused.items(), key=lambda item: item[1], reverse=True)
def linear_combination(
    vector_results: List[Tuple[str, float]],
    keyword_results: List[Tuple[str, float]],
    alpha: float = 0.5
) -> List[Tuple[str, float]]:
    """Blend vector and keyword rankings with a weighted score sum.

    Args:
        vector_results: (doc_id, similarity_score) from vector search.
        keyword_results: (doc_id, bm25_score) from keyword search.
        alpha: Weight for the vector side; the keyword side gets 1 - alpha.

    Returns:
        (doc_id, combined_score) tuples sorted descending by score.
    """
    def _min_max(results: List[Tuple[str, float]]) -> Dict[str, float]:
        # Min-max normalize scores into [0, 1]; empty input -> empty map.
        if not results:
            return {}
        raw = [score for _, score in results]
        low, high = min(raw), max(raw)
        span = high - low if high != low else 1
        return {doc_id: (score - low) / span for doc_id, score in results}

    dense = _min_max(vector_results)
    sparse = _min_max(keyword_results)
    # Missing documents contribute 0 from the side that did not return them.
    merged = {
        doc_id: alpha * dense.get(doc_id, 0) + (1 - alpha) * sparse.get(doc_id, 0)
        for doc_id in set(dense) | set(sparse)
    }
    return sorted(merged.items(), key=lambda item: item[1], reverse=True)
```
### Template 2: PostgreSQL Hybrid Search
```python
import asyncpg
from typing import List, Dict, Optional
import numpy as np
class PostgresHybridSearch:
    """Hybrid search over PostgreSQL using pgvector + full-text search.

    Vector similarity (HNSW index on ``embedding``) and keyword relevance
    (GIN index on a generated ``tsvector`` column) are fused with
    Reciprocal Rank Fusion directly in SQL.
    """

    def __init__(self, pool: asyncpg.Pool):
        self.pool = pool

    async def setup_schema(self):
        """Create the documents table plus the vector and full-text indexes."""
        async with self.pool.acquire() as conn:
            await conn.execute("""
                CREATE EXTENSION IF NOT EXISTS vector;
                CREATE TABLE IF NOT EXISTS documents (
                    id TEXT PRIMARY KEY,
                    content TEXT NOT NULL,
                    embedding vector(1536),
                    metadata JSONB DEFAULT '{}',
                    ts_content tsvector GENERATED ALWAYS AS (
                        to_tsvector('english', content)
                    ) STORED
                );
                -- Vector index (HNSW)
                CREATE INDEX IF NOT EXISTS documents_embedding_idx
                ON documents USING hnsw (embedding vector_cosine_ops);
                -- Full-text index (GIN)
                CREATE INDEX IF NOT EXISTS documents_fts_idx
                ON documents USING gin (ts_content);
            """)

    async def hybrid_search(
        self,
        query: str,
        query_embedding: List[float],
        limit: int = 10,
        vector_weight: float = 0.5,
        filter_metadata: Optional[Dict] = None
    ) -> List[Dict]:
        """
        Perform hybrid search combining vector and full-text.
        Uses RRF fusion for combining results.

        Args:
            query: Keyword query text (websearch syntax).
            query_embedding: Query embedding vector.
            limit: Number of fused results to return.
            vector_weight: RRF weight of the vector ranking (0..1).
            filter_metadata: Optional exact-match filters on metadata keys.
                NOTE: keys are interpolated into the SQL text, so they must
                come from trusted code — never from user input.
        """
        async with self.pool.acquire() as conn:
            # Fixed positional parameters: $1 = embedding, $2 = query text,
            # $3 = candidate pool size, $4 = vector weight. Filter values are
            # appended AFTER these four so the $N placeholders generated for
            # the filter clause stay correct. (Previously vector_weight was
            # passed last, so the hard-coded $4 in the SQL pointed at the
            # first filter value whenever filter_metadata was supplied.)
            params = [query_embedding, query, limit * 3, vector_weight]
            where_clause = "1=1"
            if filter_metadata:
                for key, value in filter_metadata.items():
                    params.append(value)
                    where_clause += f" AND metadata->>'{key}' = ${len(params)}"
            results = await conn.fetch(f"""
                WITH vector_search AS (
                    SELECT
                        id,
                        content,
                        metadata,
                        ROW_NUMBER() OVER (ORDER BY embedding <=> $1::vector) as vector_rank,
                        1 - (embedding <=> $1::vector) as vector_score
                    FROM documents
                    WHERE {where_clause}
                    ORDER BY embedding <=> $1::vector
                    LIMIT $3
                ),
                keyword_search AS (
                    SELECT
                        id,
                        content,
                        metadata,
                        ROW_NUMBER() OVER (ORDER BY ts_rank(ts_content, websearch_to_tsquery('english', $2)) DESC) as keyword_rank,
                        ts_rank(ts_content, websearch_to_tsquery('english', $2)) as keyword_score
                    FROM documents
                    WHERE ts_content @@ websearch_to_tsquery('english', $2)
                    AND {where_clause}
                    ORDER BY ts_rank(ts_content, websearch_to_tsquery('english', $2)) DESC
                    LIMIT $3
                )
                SELECT
                    COALESCE(v.id, k.id) as id,
                    COALESCE(v.content, k.content) as content,
                    COALESCE(v.metadata, k.metadata) as metadata,
                    v.vector_score,
                    k.keyword_score,
                    -- RRF fusion
                    COALESCE(1.0 / (60 + v.vector_rank), 0) * $4::float +
                    COALESCE(1.0 / (60 + k.keyword_rank), 0) * (1 - $4::float) as rrf_score
                FROM vector_search v
                FULL OUTER JOIN keyword_search k ON v.id = k.id
                ORDER BY rrf_score DESC
                LIMIT $3 / 3
            """, *params)
            return [dict(row) for row in results]

    async def search_with_rerank(
        self,
        query: str,
        query_embedding: List[float],
        limit: int = 10,
        rerank_candidates: int = 50
    ) -> List[Dict]:
        """Hybrid search followed by cross-encoder reranking.

        Over-fetches ``rerank_candidates`` fused results, rescores each
        (query, content) pair with a cross-encoder, and returns the top
        ``limit`` by rerank score.
        """
        from sentence_transformers import CrossEncoder
        # Get candidates from the fused hybrid ranking.
        candidates = await self.hybrid_search(
            query, query_embedding, limit=rerank_candidates
        )
        if not candidates:
            return []
        # Rerank with a cross-encoder. NOTE: the model is loaded on every
        # call; cache it on the instance for latency-sensitive paths.
        model = CrossEncoder('cross-encoder/ms-marco-MiniLM-L-6-v2')
        pairs = [(query, c["content"]) for c in candidates]
        scores = model.predict(pairs)
        for candidate, score in zip(candidates, scores):
            candidate["rerank_score"] = float(score)
        # Sort by rerank score and return top results.
        reranked = sorted(candidates, key=lambda x: x["rerank_score"], reverse=True)
        return reranked[:limit]
```
### Template 3: Elasticsearch Hybrid Search
```python
from elasticsearch import Elasticsearch
from typing import List, Dict, Optional
class ElasticsearchHybridSearch:
    """Hybrid search with Elasticsearch and dense vectors.

    Combines BM25 text relevance with dense-vector similarity, either via
    a boolean "should" query (script_score + match) or via the sub_searches
    RRF API used in ``hybrid_search_rrf``.
    """
    def __init__(
        self,
        es_client: Elasticsearch,
        index_name: str = "documents"
    ):
        self.es = es_client
        self.index_name = index_name
    def create_index(self, vector_dims: int = 1536):
        """Create index with dense vector and text fields."""
        mapping = {
            "mappings": {
                "properties": {
                    "content": {
                        "type": "text",
                        "analyzer": "english"
                    },
                    "embedding": {
                        "type": "dense_vector",
                        "dims": vector_dims,
                        "index": True,
                        "similarity": "cosine"
                    },
                    "metadata": {
                        "type": "object",
                        "enabled": True
                    }
                }
            }
        }
        # ignore=400 makes repeated calls idempotent by swallowing the
        # "index already exists" error. NOTE(review): the ignore kwarg is
        # deprecated in newer elasticsearch-py clients — confirm against
        # the installed client version.
        self.es.indices.create(index=self.index_name, body=mapping, ignore=400)
    def hybrid_search(
        self,
        query: str,
        query_embedding: List[float],
        limit: int = 10,
        boost_vector: float = 1.0,
        boost_text: float = 1.0,
        filter: Optional[Dict] = None
    ) -> List[Dict]:
        """
        Hybrid search using Elasticsearch's built-in capabilities.

        Both clauses sit in a bool "should", so each document's score is
        the sum of its (boosted) vector and BM25 contributions.
        """
        # Build the hybrid query
        search_body = {
            "size": limit,
            "query": {
                "bool": {
                    "should": [
                        # Vector search (kNN)
                        {
                            "script_score": {
                                "query": {"match_all": {}},
                                "script": {
                                    # +1.0 shifts cosine similarity into a
                                    # non-negative range, presumably because
                                    # ES rejects negative scores — verify.
                                    "source": f"cosineSimilarity(params.query_vector, 'embedding') * {boost_vector} + 1.0",
                                    "params": {"query_vector": query_embedding}
                                }
                            }
                        },
                        # Text search (BM25)
                        {
                            "match": {
                                "content": {
                                    "query": query,
                                    "boost": boost_text
                                }
                            }
                        }
                    ],
                    "minimum_should_match": 1
                }
            }
        }
        # Add filter if provided
        if filter:
            search_body["query"]["bool"]["filter"] = filter
        response = self.es.search(index=self.index_name, body=search_body)
        return [
            {
                "id": hit["_id"],
                "content": hit["_source"]["content"],
                "metadata": hit["_source"].get("metadata", {}),
                "score": hit["_score"]
            }
            for hit in response["hits"]["hits"]
        ]
    def hybrid_search_rrf(
        self,
        query: str,
        query_embedding: List[float],
        limit: int = 10,
        window_size: int = 100
    ) -> List[Dict]:
        """
        Hybrid search using Elasticsearch 8.x RRF.

        Runs a BM25 sub-search and a kNN sub-search, then lets the server
        fuse the two rankings with reciprocal rank fusion
        (rank_constant=60, fused over the top ``window_size`` hits).
        """
        search_body = {
            "size": limit,
            "sub_searches": [
                {
                    "query": {
                        "match": {
                            "content": query
                        }
                    }
                },
                {
                    "query": {
                        "knn": {
                            "field": "embedding",
                            "query_vector": query_embedding,
                            "k": window_size,
                            "num_candidates": window_size * 2
                        }
                    }
                }
            ],
            "rank": {
                "rrf": {
                    "window_size": window_size,
                    "rank_constant": 60
                }
            }
        }
        response = self.es.search(index=self.index_name, body=search_body)
        return [
            {
                "id": hit["_id"],
                "content": hit["_source"]["content"],
                "score": hit["_score"]
            }
            for hit in response["hits"]["hits"]
        ]
```
### Template 4: Custom Hybrid RAG Pipeline
```python
from typing import List, Dict, Optional, Callable
from dataclasses import dataclass
@dataclass
class SearchResult:
    """A single retrieved document from any search backend.

    Attributes:
        id: Document identifier.
        content: Document text.
        score: Relevance score from the producing stage.
        source: Which stage produced the score: "vector", "keyword",
            or "hybrid".
        metadata: Optional backend-specific metadata for the document.
    """
    id: str
    content: str
    score: float
    source: str  # "vector", "keyword", "hybrid"
    # Annotated Optional to match the None default (was `Dict = None`).
    metadata: Optional[Dict] = None
class HybridRAGPipeline:
    """Complete hybrid search pipeline for RAG.

    Runs vector and keyword searches concurrently, fuses the two rankings
    (RRF or weighted linear), and optionally reranks the fused list with a
    cross-encoder.
    """
    def __init__(
        self,
        vector_store,
        keyword_store,
        embedder,
        reranker=None,
        fusion_method: str = "rrf",
        vector_weight: float = 0.5
    ):
        # Stores must expose an async search(...) method, the embedder an
        # embed(text) method, and the optional reranker a predict(pairs).
        self.vector_store = vector_store
        self.keyword_store = keyword_store
        self.embedder = embedder
        self.reranker = reranker
        self.fusion_method = fusion_method  # "rrf" or "linear"
        self.vector_weight = vector_weight  # used by linear fusion only
    async def search(
        self,
        query: str,
        top_k: int = 10,
        filter: Optional[Dict] = None,
        use_rerank: bool = True
    ) -> List[SearchResult]:
        """Execute hybrid search pipeline."""
        import asyncio  # local import keeps the template self-contained
        # Step 1: Get query embedding
        query_embedding = self.embedder.embed(query)
        # Step 2: Execute parallel searches (over-fetch 3x for fusion)
        vector_results, keyword_results = await asyncio.gather(
            self._vector_search(query_embedding, top_k * 3, filter),
            self._keyword_search(query, top_k * 3, filter)
        )
        # Step 3: Fuse results
        if self.fusion_method == "rrf":
            fused = self._rrf_fusion(vector_results, keyword_results)
        else:
            fused = self._linear_fusion(vector_results, keyword_results)
        # Step 4: Rerank if enabled and a reranker was configured
        if use_rerank and self.reranker:
            fused = await self._rerank(query, fused[:top_k * 2])
        return fused[:top_k]
    async def _vector_search(
        self,
        embedding: List[float],
        limit: int,
        filter: Dict
    ) -> List[SearchResult]:
        """Run the vector store query and wrap hits as SearchResults."""
        results = await self.vector_store.search(embedding, limit, filter)
        return [
            SearchResult(
                id=r["id"],
                content=r["content"],
                score=r["score"],
                source="vector",
                metadata=r.get("metadata")
            )
            for r in results
        ]
    async def _keyword_search(
        self,
        query: str,
        limit: int,
        filter: Dict
    ) -> List[SearchResult]:
        """Run the keyword store query and wrap hits as SearchResults."""
        results = await self.keyword_store.search(query, limit, filter)
        return [
            SearchResult(
                id=r["id"],
                content=r["content"],
                score=r["score"],
                source="keyword",
                metadata=r.get("metadata")
            )
            for r in results
        ]
    def _rrf_fusion(
        self,
        vector_results: List[SearchResult],
        keyword_results: List[SearchResult]
    ) -> List[SearchResult]:
        """Fuse with RRF (rank constant 60, equal weight per list)."""
        k = 60
        scores = {}
        content_map = {}
        for rank, result in enumerate(vector_results):
            scores[result.id] = scores.get(result.id, 0) + 1 / (k + rank + 1)
            content_map[result.id] = result
        for rank, result in enumerate(keyword_results):
            scores[result.id] = scores.get(result.id, 0) + 1 / (k + rank + 1)
            if result.id not in content_map:
                content_map[result.id] = result
        sorted_ids = sorted(scores.keys(), key=lambda x: scores[x], reverse=True)
        return [
            SearchResult(
                id=doc_id,
                content=content_map[doc_id].content,
                score=scores[doc_id],
                source="hybrid",
                metadata=content_map[doc_id].metadata
            )
            for doc_id in sorted_ids
        ]
    def _linear_fusion(
        self,
        vector_results: List[SearchResult],
        keyword_results: List[SearchResult]
    ) -> List[SearchResult]:
        """Fuse with a min-max-normalized weighted sum.

        Referenced by ``search`` but previously missing from the template.
        Uses ``self.vector_weight`` for the vector side and
        ``1 - vector_weight`` for the keyword side.
        """
        def normalize(results: List[SearchResult]) -> Dict[str, float]:
            # Min-max normalize per-list scores into [0, 1].
            if not results:
                return {}
            raw = [r.score for r in results]
            low, high = min(raw), max(raw)
            span = high - low if high != low else 1
            return {r.id: (r.score - low) / span for r in results}
        vector_scores = normalize(vector_results)
        keyword_scores = normalize(keyword_results)
        # One representative result per id; the vector side wins ties.
        content_map = {r.id: r for r in keyword_results}
        content_map.update({r.id: r for r in vector_results})
        combined = {
            doc_id: self.vector_weight * vector_scores.get(doc_id, 0)
            + (1 - self.vector_weight) * keyword_scores.get(doc_id, 0)
            for doc_id in content_map
        }
        return [
            SearchResult(
                id=doc_id,
                content=content_map[doc_id].content,
                score=combined[doc_id],
                source="hybrid",
                metadata=content_map[doc_id].metadata
            )
            for doc_id in sorted(combined, key=combined.get, reverse=True)
        ]
    async def _rerank(
        self,
        query: str,
        results: List[SearchResult]
    ) -> List[SearchResult]:
        """Rerank with cross-encoder.

        NOTE: mutates each result's ``score`` in place with the
        cross-encoder score before re-sorting.
        """
        if not results:
            return results
        pairs = [(query, r.content) for r in results]
        scores = self.reranker.predict(pairs)
        for result, score in zip(results, scores):
            result.score = float(score)
        return sorted(results, key=lambda x: x.score, reverse=True)
```
## Best Practices
### Do's
- **Tune weights empirically** - Test on your data
- **Use RRF for simplicity** - Works well without tuning
- **Add reranking** - Significant quality improvement
- **Log both scores** - Helps with debugging
- **A/B test** - Measure real user impact
### Don'ts
- **Don't assume one size fits all** - Different queries need different weights
- **Don't skip keyword search** - Handles exact matches better
- **Don't over-fetch** - Balance recall vs latency
- **Don't ignore edge cases** - Empty results, single word queries
## Resources
- [RRF Paper](https://plg.uwaterloo.ca/~gvcormac/cormacksigir09-rrf.pdf)
- [Vespa Hybrid Search](https://blog.vespa.ai/improving-text-ranking-with-few-shot-prompting/)
- [Cohere Rerank](https://docs.cohere.com/docs/reranking)

View File

@@ -0,0 +1,558 @@
---
name: similarity-search-patterns
description: Implement efficient similarity search with vector databases. Use when building semantic search, implementing nearest neighbor queries, or optimizing retrieval performance.
---
# Similarity Search Patterns
Patterns for implementing efficient similarity search in production systems.
## When to Use This Skill
- Building semantic search systems
- Implementing RAG retrieval
- Creating recommendation engines
- Optimizing search latency
- Scaling to millions of vectors
- Combining semantic and keyword search
## Core Concepts
### 1. Distance Metrics
| Metric | Formula | Best For |
|--------|---------|----------|
| **Cosine** | 1 - (A·B)/(‖A‖‖B‖) | Normalized embeddings |
| **Euclidean (L2)** | √Σ(a-b)² | Raw embeddings |
| **Dot Product** | A·B | Magnitude matters |
| **Manhattan (L1)** | Σ\|a-b\| | Sparse vectors |
### 2. Index Types
```
┌─────────────────────────────────────────────────┐
│ Index Types │
├─────────────┬───────────────┬───────────────────┤
│ Flat │ HNSW │ IVF+PQ │
│ (Exact) │ (Graph-based) │ (Quantized) │
├─────────────┼───────────────┼───────────────────┤
│ O(n) search │ O(log n) │ O(√n) │
│ 100% recall │ ~95-99% │ ~90-95% │
│ Small data │ Medium-Large │ Very Large │
└─────────────┴───────────────┴───────────────────┘
```
## Templates
### Template 1: Pinecone Implementation
```python
from pinecone import Pinecone, ServerlessSpec
from typing import List, Dict, Optional
import hashlib
class PineconeVectorStore:
    """Vector store backed by a Pinecone serverless index.

    Creates the index on first use and wraps upsert, query, rerank, and
    delete operations behind a plain-dict interface.
    """
    def __init__(
        self,
        api_key: str,
        index_name: str,
        dimension: int = 1536,
        metric: str = "cosine"
    ):
        self.pc = Pinecone(api_key=api_key)
        # Create index if not exists
        if index_name not in self.pc.list_indexes().names():
            self.pc.create_index(
                name=index_name,
                dimension=dimension,
                metric=metric,
                # Region is hard-coded; parameterize for other deployments.
                spec=ServerlessSpec(cloud="aws", region="us-east-1")
            )
        self.index = self.pc.Index(index_name)
    def upsert(
        self,
        vectors: List[Dict],
        namespace: str = ""
    ) -> int:
        """
        Upsert vectors.
        vectors: [{"id": str, "values": List[float], "metadata": dict}]

        Returns the number of vectors written.
        """
        # Batch upsert
        batch_size = 100
        total = 0
        for i in range(0, len(vectors), batch_size):
            batch = vectors[i:i + batch_size]
            self.index.upsert(vectors=batch, namespace=namespace)
            total += len(batch)
        return total
    def search(
        self,
        query_vector: List[float],
        top_k: int = 10,
        namespace: str = "",
        filter: Optional[Dict] = None,
        include_metadata: bool = True
    ) -> List[Dict]:
        """Search for similar vectors."""
        results = self.index.query(
            vector=query_vector,
            top_k=top_k,
            namespace=namespace,
            filter=filter,
            include_metadata=include_metadata
        )
        return [
            {
                "id": match.id,
                "score": match.score,
                "metadata": match.metadata
            }
            for match in results.matches
        ]
    def search_with_rerank(
        self,
        query: str,
        query_vector: List[float],
        top_k: int = 10,
        rerank_top_n: int = 50,
        namespace: str = ""
    ) -> List[Dict]:
        """Search and rerank results."""
        # Over-fetch for reranking
        initial_results = self.search(
            query_vector,
            top_k=rerank_top_n,
            namespace=namespace
        )
        # Rerank with cross-encoder or LLM
        reranked = self._rerank(query, initial_results)
        return reranked[:top_k]
    def _rerank(self, query: str, results: List[Dict]) -> List[Dict]:
        """Rerank results using cross-encoder.

        NOTE: loads the cross-encoder on every call and assumes each
        match's metadata contains a "text" field — cache the model and
        confirm the metadata schema for production use.
        """
        from sentence_transformers import CrossEncoder
        model = CrossEncoder('cross-encoder/ms-marco-MiniLM-L-6-v2')
        pairs = [(query, r["metadata"]["text"]) for r in results]
        scores = model.predict(pairs)
        for result, score in zip(results, scores):
            result["rerank_score"] = float(score)
        return sorted(results, key=lambda x: x["rerank_score"], reverse=True)
    def delete(self, ids: List[str], namespace: str = ""):
        """Delete vectors by ID."""
        self.index.delete(ids=ids, namespace=namespace)
    def delete_by_filter(self, filter: Dict, namespace: str = ""):
        """Delete vectors matching filter.

        NOTE(review): metadata-filtered deletes are not supported on all
        Pinecone index types — verify for serverless indexes.
        """
        self.index.delete(filter=filter, namespace=namespace)
```
### Template 2: Qdrant Implementation
```python
from qdrant_client import QdrantClient
from qdrant_client.http import models
from typing import List, Dict, Optional
class QdrantVectorStore:
    """Vector store backed by a Qdrant collection.

    Creates the collection on first use with cosine distance and INT8
    scalar quantization, and wraps upsert/search behind plain dicts.
    """
    def __init__(
        self,
        url: str = "localhost",
        port: int = 6333,
        collection_name: str = "documents",
        vector_size: int = 1536
    ):
        self.client = QdrantClient(url=url, port=port)
        self.collection_name = collection_name
        # Create collection if not exists
        collections = self.client.get_collections().collections
        if collection_name not in [c.name for c in collections]:
            self.client.create_collection(
                collection_name=collection_name,
                vectors_config=models.VectorParams(
                    size=vector_size,
                    distance=models.Distance.COSINE
                ),
                # Optional: enable quantization for memory efficiency
                quantization_config=models.ScalarQuantization(
                    scalar=models.ScalarQuantizationConfig(
                        type=models.ScalarType.INT8,
                        quantile=0.99,
                        always_ram=True
                    )
                )
            )
    def upsert(self, points: List[Dict]) -> int:
        """
        Upsert points.
        points: [{"id": str/int, "vector": List[float], "payload": dict}]

        Returns the number of points submitted.
        """
        qdrant_points = [
            models.PointStruct(
                id=p["id"],
                vector=p["vector"],
                payload=p.get("payload", {})
            )
            for p in points
        ]
        self.client.upsert(
            collection_name=self.collection_name,
            points=qdrant_points
        )
        return len(points)
    def search(
        self,
        query_vector: List[float],
        limit: int = 10,
        filter: Optional[models.Filter] = None,
        score_threshold: Optional[float] = None
    ) -> List[Dict]:
        """Search for similar vectors."""
        results = self.client.search(
            collection_name=self.collection_name,
            query_vector=query_vector,
            limit=limit,
            query_filter=filter,
            score_threshold=score_threshold
        )
        return [
            {
                "id": r.id,
                "score": r.score,
                "payload": r.payload
            }
            for r in results
        ]
    def search_with_filter(
        self,
        query_vector: List[float],
        must_conditions: List[Dict] = None,
        should_conditions: List[Dict] = None,
        must_not_conditions: List[Dict] = None,
        limit: int = 10
    ) -> List[Dict]:
        """Search with complex filters.

        NOTE: only ``must_conditions`` are applied in this template;
        ``should_conditions`` and ``must_not_conditions`` are accepted
        but currently ignored.
        """
        conditions = []
        if must_conditions:
            conditions.extend([
                models.FieldCondition(
                    key=c["key"],
                    match=models.MatchValue(value=c["value"])
                )
                for c in must_conditions
            ])
        filter = models.Filter(must=conditions) if conditions else None
        return self.search(query_vector, limit=limit, filter=filter)
    def search_with_sparse(
        self,
        dense_vector: List[float],
        sparse_vector: Dict[int, float],
        limit: int = 10,
        dense_weight: float = 0.7
    ) -> List[Dict]:
        """Hybrid search with dense and sparse vectors.

        NOTE: as written, only the dense named vector is queried —
        ``sparse_vector`` and ``dense_weight`` are not yet used; a full
        implementation would also query the sparse vector and fuse.
        """
        # Requires collection with named vectors
        results = self.client.search(
            collection_name=self.collection_name,
            query_vector=models.NamedVector(
                name="dense",
                vector=dense_vector
            ),
            limit=limit
        )
        return [{"id": r.id, "score": r.score, "payload": r.payload} for r in results]
```
### Template 3: pgvector with PostgreSQL
```python
import asyncpg
from typing import List, Dict, Optional
import numpy as np
class PgVectorStore:
    """Vector store backed by PostgreSQL + pgvector.

    Owns an asyncpg pool, creates the table/HNSW index on init(), and
    provides vector, filtered, and hybrid (vector + full-text) search.
    """
    def __init__(self, connection_string: str):
        self.connection_string = connection_string
    async def init(self):
        """Initialize connection pool and extension."""
        self.pool = await asyncpg.create_pool(self.connection_string)
        async with self.pool.acquire() as conn:
            # Enable extension
            await conn.execute("CREATE EXTENSION IF NOT EXISTS vector")
            # Create table
            await conn.execute("""
                CREATE TABLE IF NOT EXISTS documents (
                    id TEXT PRIMARY KEY,
                    content TEXT,
                    metadata JSONB,
                    embedding vector(1536)
                )
            """)
            # Create index (HNSW for better performance)
            await conn.execute("""
                CREATE INDEX IF NOT EXISTS documents_embedding_idx
                ON documents
                USING hnsw (embedding vector_cosine_ops)
                WITH (m = 16, ef_construction = 64)
            """)
    async def upsert(self, documents: List[Dict]):
        """Upsert documents with embeddings.

        NOTE(review): asyncpg only maps Python dicts to JSONB and lists
        to pgvector columns when the corresponding codecs are registered
        on the pool — confirm the pool setup, otherwise encode metadata
        as a JSON string and the embedding via a registered vector codec.
        """
        async with self.pool.acquire() as conn:
            await conn.executemany(
                """
                INSERT INTO documents (id, content, metadata, embedding)
                VALUES ($1, $2, $3, $4)
                ON CONFLICT (id) DO UPDATE SET
                    content = EXCLUDED.content,
                    metadata = EXCLUDED.metadata,
                    embedding = EXCLUDED.embedding
                """,
                [
                    (
                        doc["id"],
                        doc["content"],
                        doc.get("metadata", {}),
                        # Normalizes any array-like embedding to a plain list.
                        np.array(doc["embedding"]).tolist()
                    )
                    for doc in documents
                ]
            )
    async def search(
        self,
        query_embedding: List[float],
        limit: int = 10,
        filter_metadata: Optional[Dict] = None
    ) -> List[Dict]:
        """Search for similar documents.

        NOTE: filter keys are interpolated into the SQL text, so they
        must come from trusted code, never from user input; filter
        values are passed as proper query parameters.
        """
        query = """
            SELECT id, content, metadata,
                   1 - (embedding <=> $1::vector) as similarity
            FROM documents
        """
        params = [query_embedding]
        if filter_metadata:
            conditions = []
            for key, value in filter_metadata.items():
                params.append(value)
                conditions.append(f"metadata->>'{key}' = ${len(params)}")
            query += " WHERE " + " AND ".join(conditions)
        query += f" ORDER BY embedding <=> $1::vector LIMIT ${len(params) + 1}"
        params.append(limit)
        async with self.pool.acquire() as conn:
            rows = await conn.fetch(query, *params)
        return [
            {
                "id": row["id"],
                "content": row["content"],
                "metadata": row["metadata"],
                "score": row["similarity"]
            }
            for row in rows
        ]
    async def hybrid_search(
        self,
        query_embedding: List[float],
        query_text: str,
        limit: int = 10,
        vector_weight: float = 0.5
    ) -> List[Dict]:
        """Hybrid search combining vector and full-text.

        Over-fetches 2x candidates from each side, then blends
        normalized-ish scores with a weighted sum ($4 = vector weight).
        """
        async with self.pool.acquire() as conn:
            rows = await conn.fetch(
                """
                WITH vector_results AS (
                    SELECT id, content, metadata,
                           1 - (embedding <=> $1::vector) as vector_score
                    FROM documents
                    ORDER BY embedding <=> $1::vector
                    LIMIT $3 * 2
                ),
                text_results AS (
                    SELECT id, content, metadata,
                           ts_rank(to_tsvector('english', content),
                                  plainto_tsquery('english', $2)) as text_score
                    FROM documents
                    WHERE to_tsvector('english', content) @@ plainto_tsquery('english', $2)
                    LIMIT $3 * 2
                )
                SELECT
                    COALESCE(v.id, t.id) as id,
                    COALESCE(v.content, t.content) as content,
                    COALESCE(v.metadata, t.metadata) as metadata,
                    COALESCE(v.vector_score, 0) * $4 +
                    COALESCE(t.text_score, 0) * (1 - $4) as combined_score
                FROM vector_results v
                FULL OUTER JOIN text_results t ON v.id = t.id
                ORDER BY combined_score DESC
                LIMIT $3
                """,
                query_embedding, query_text, limit, vector_weight
            )
            return [dict(row) for row in rows]
```
### Template 4: Weaviate Implementation
```python
import weaviate
from weaviate.util import generate_uuid5
from typing import List, Dict, Optional
class WeaviateVectorStore:
    """Vector store backed by a Weaviate class with externally-supplied vectors.

    NOTE(review): written against the weaviate-client v3 API
    (``weaviate.Client`` + GraphQL query builder) — confirm the installed
    client major version before use.
    """
    def __init__(
        self,
        url: str = "http://localhost:8080",
        class_name: str = "Document"
    ):
        self.client = weaviate.Client(url=url)
        self.class_name = class_name
        self._ensure_schema()
    def _ensure_schema(self):
        """Create schema if not exists."""
        schema = {
            "class": self.class_name,
            "vectorizer": "none",  # We provide vectors
            "properties": [
                {"name": "content", "dataType": ["text"]},
                {"name": "source", "dataType": ["string"]},
                {"name": "chunk_id", "dataType": ["int"]}
            ]
        }
        if not self.client.schema.exists(self.class_name):
            self.client.schema.create_class(schema)
    def upsert(self, documents: List[Dict]):
        """Batch upsert documents.

        Deterministic UUIDs derived from each doc id make re-runs
        idempotent (same id overwrites the same object).
        """
        with self.client.batch as batch:
            batch.batch_size = 100
            for doc in documents:
                batch.add_data_object(
                    data_object={
                        "content": doc["content"],
                        "source": doc.get("source", ""),
                        "chunk_id": doc.get("chunk_id", 0)
                    },
                    class_name=self.class_name,
                    uuid=generate_uuid5(doc["id"]),
                    vector=doc["embedding"]
                )
    def search(
        self,
        query_vector: List[float],
        limit: int = 10,
        where_filter: Optional[Dict] = None
    ) -> List[Dict]:
        """Vector search."""
        query = (
            self.client.query
            .get(self.class_name, ["content", "source", "chunk_id"])
            .with_near_vector({"vector": query_vector})
            .with_limit(limit)
            .with_additional(["distance", "id"])
        )
        if where_filter:
            query = query.with_where(where_filter)
        results = query.do()
        return [
            {
                "id": item["_additional"]["id"],
                "content": item["content"],
                "source": item["source"],
                # Convert returned distance into a similarity-style score.
                "score": 1 - item["_additional"]["distance"]
            }
            for item in results["data"]["Get"][self.class_name]
        ]
    def hybrid_search(
        self,
        query: str,
        query_vector: List[float],
        limit: int = 10,
        alpha: float = 0.5  # 0 = keyword, 1 = vector
    ) -> List[Dict]:
        """Hybrid search combining BM25 and vector."""
        results = (
            self.client.query
            .get(self.class_name, ["content", "source"])
            .with_hybrid(query=query, vector=query_vector, alpha=alpha)
            .with_limit(limit)
            .with_additional(["score"])
            .do()
        )
        return [
            {
                "content": item["content"],
                "source": item["source"],
                "score": item["_additional"]["score"]
            }
            for item in results["data"]["Get"][self.class_name]
        ]
```
## Best Practices
### Do's
- **Use appropriate index** - HNSW for most cases
- **Tune parameters** - ef_search, nprobe for recall/speed
- **Implement hybrid search** - Combine with keyword search
- **Monitor recall** - Measure search quality
- **Pre-filter when possible** - Reduce search space
### Don'ts
- **Don't skip evaluation** - Measure before optimizing
- **Don't over-index** - Start with flat, scale up
- **Don't ignore latency** - P99 matters for UX
- **Don't forget costs** - Vector storage adds up
## Resources
- [Pinecone Docs](https://docs.pinecone.io/)
- [Qdrant Docs](https://qdrant.tech/documentation/)
- [pgvector](https://github.com/pgvector/pgvector)
- [Weaviate Docs](https://weaviate.io/developers/weaviate)

View File

@@ -0,0 +1,521 @@
---
name: vector-index-tuning
description: Optimize vector index performance for latency, recall, and memory. Use when tuning HNSW parameters, selecting quantization strategies, or scaling vector search infrastructure.
---
# Vector Index Tuning
Guide to optimizing vector indexes for production performance.
## When to Use This Skill
- Tuning HNSW parameters
- Implementing quantization
- Optimizing memory usage
- Reducing search latency
- Balancing recall vs speed
- Scaling to billions of vectors
## Core Concepts
### 1. Index Type Selection
```
Data Size Recommended Index
────────────────────────────────────────
< 10K vectors → Flat (exact search)
10K - 1M → HNSW
1M - 100M → HNSW + Quantization
> 100M → IVF + PQ or DiskANN
```
### 2. HNSW Parameters
| Parameter | Default | Effect |
|-----------|---------|--------|
| **M** | 16 | Connections per node, ↑ = better recall, more memory |
| **efConstruction** | 100 | Build quality, ↑ = better index, slower build |
| **efSearch** | 50 | Search quality, ↑ = better recall, slower search |
### 3. Quantization Types
```
Full Precision (FP32): 4 bytes × dimensions
Half Precision (FP16): 2 bytes × dimensions
INT8 Scalar: 1 byte × dimensions
Product Quantization: ~32-64 bytes total
Binary: dimensions/8 bytes
```
## Templates
### Template 1: HNSW Parameter Tuning
```python
import numpy as np
from typing import List, Tuple
import time
def benchmark_hnsw_parameters(
    vectors: np.ndarray,
    queries: np.ndarray,
    ground_truth: np.ndarray,
    m_values: List[int] = [8, 16, 32, 64],
    ef_construction_values: List[int] = [64, 128, 256],
    ef_search_values: List[int] = [32, 64, 128, 256]
) -> List[dict]:
    """Grid-benchmark HNSW build/search settings on a fixed corpus.

    For every (M, ef_construction) pair an index is built once, then each
    ef_search value is timed against the same index. Returns one result
    row per configuration with build time, per-query search latency,
    recall@10, and an approximate memory footprint.
    """
    import hnswlib
    rows = []
    dim, count = vectors.shape[1], vectors.shape[0]
    for m in m_values:
        for ef_build in ef_construction_values:
            # A fresh index per (M, ef_construction) combination.
            hnsw = hnswlib.Index(space='cosine', dim=dim)
            hnsw.init_index(max_elements=count, M=m, ef_construction=ef_build)
            started = time.time()
            hnsw.add_items(vectors)
            elapsed_build = time.time() - started
            # Rough estimate: FP32 vector storage plus ~2M int edges per node.
            approx_bytes = hnsw.element_count * (dim * 4 + m * 2 * 4)
            for ef_query in ef_search_values:
                hnsw.set_ef(ef_query)
                started = time.time()
                labels, _distances = hnsw.knn_query(queries, k=10)
                elapsed_search = time.time() - started
                rows.append({
                    "M": m,
                    "ef_construction": ef_build,
                    "ef_search": ef_query,
                    "build_time_s": elapsed_build,
                    "search_time_ms": elapsed_search * 1000 / len(queries),
                    "recall@10": calculate_recall(labels, ground_truth, k=10),
                    "memory_mb": approx_bytes / 1024 / 1024
                })
    return rows
def calculate_recall(predictions: np.ndarray, ground_truth: np.ndarray, k: int) -> float:
    """Mean recall@k: fraction of true top-k neighbors found, averaged
    over all queries."""
    overlap = sum(
        len(set(pred[:k]).intersection(truth[:k]))
        for pred, truth in zip(predictions, ground_truth)
    )
    return overlap / (len(predictions) * k)
def recommend_hnsw_params(
    num_vectors: int,
    target_recall: float = 0.95,
    max_latency_ms: float = 10,
    available_memory_gb: float = 8
) -> dict:
    """Recommend HNSW parameters based on requirements.

    M and ef_construction scale with corpus size; ef_search scales with the
    recall target.

    NOTE(review): max_latency_ms and available_memory_gb are accepted but not
    used by the current heuristic — confirm whether they should influence the
    recommendation.
    """
    # Graph density / build effort grow with corpus size.
    if num_vectors < 100_000:
        m, ef_construction = 16, 100
    elif num_vectors < 1_000_000:
        m, ef_construction = 32, 200
    else:
        m, ef_construction = 48, 256
    # Search-time effort grows with the recall target.
    if target_recall >= 0.99:
        ef_search = 256
    elif target_recall >= 0.95:
        ef_search = 128
    else:
        ef_search = 64
    return {
        "M": m,
        "ef_construction": ef_construction,
        "ef_search": ef_search,
        "notes": f"Estimated for {num_vectors:,} vectors, {target_recall:.0%} recall"
    }
```
### Template 2: Quantization Strategies
```python
import numpy as np
from typing import Optional
class VectorQuantizer:
    """Quantization strategies for vector compression.

    All methods are stateless staticmethods. FIX: the original annotated
    return types with `Tuple[...]` while this module only imports `Optional`
    from typing, raising NameError at class-definition time; builtin generic
    `tuple[...]` (Python 3.9+) is used instead.
    """

    @staticmethod
    def scalar_quantize_int8(
        vectors: np.ndarray,
        min_val: Optional[float] = None,
        max_val: Optional[float] = None
    ) -> tuple[np.ndarray, dict]:
        """Scalar quantization to INT8 (uint8 codes in [0, 255]).

        Args:
            vectors: Float array to compress.
            min_val/max_val: Clipping range; defaults to the data min/max.

        Returns:
            (uint8 codes, params dict needed for dequantization).
        """
        if min_val is None:
            min_val = vectors.min()
        if max_val is None:
            max_val = vectors.max()
        # Scale to 0-255 range
        scale = 255.0 / (max_val - min_val)
        quantized = np.clip(
            np.round((vectors - min_val) * scale),
            0, 255
        ).astype(np.uint8)
        params = {"min_val": min_val, "max_val": max_val, "scale": scale}
        return quantized, params

    @staticmethod
    def dequantize_int8(
        quantized: np.ndarray,
        params: dict
    ) -> np.ndarray:
        """Invert scalar_quantize_int8 (lossy: ~1/(2*scale) absolute error)."""
        return quantized.astype(np.float32) / params["scale"] + params["min_val"]

    @staticmethod
    def product_quantize(
        vectors: np.ndarray,
        n_subvectors: int = 8,
        n_centroids: int = 256
    ) -> tuple[np.ndarray, dict]:
        """Product quantization for aggressive compression.

        Splits each vector into n_subvectors chunks and k-means-codes each
        chunk independently (dim must be divisible by n_subvectors).

        Returns:
            (uint8 codes of shape (n, n_subvectors), params with codebooks).
        """
        from sklearn.cluster import KMeans
        n, dim = vectors.shape
        assert dim % n_subvectors == 0
        subvector_dim = dim // n_subvectors
        codebooks = []
        codes = np.zeros((n, n_subvectors), dtype=np.uint8)
        for i in range(n_subvectors):
            start = i * subvector_dim
            end = (i + 1) * subvector_dim
            subvectors = vectors[:, start:end]
            kmeans = KMeans(n_clusters=n_centroids, random_state=42)
            codes[:, i] = kmeans.fit_predict(subvectors)
            codebooks.append(kmeans.cluster_centers_)
        params = {
            "codebooks": codebooks,
            "n_subvectors": n_subvectors,
            "subvector_dim": subvector_dim
        }
        return codes, params

    @staticmethod
    def binary_quantize(vectors: np.ndarray) -> np.ndarray:
        """Binary quantization: one bit per dimension (sign), packed into
        bytes — dimension i lands in bit (i % 8) of byte (i // 8)."""
        # Convert to binary: positive = 1, negative = 0
        binary = (vectors > 0).astype(np.uint8)
        # Pack bits into bytes
        n, dim = vectors.shape
        packed_dim = (dim + 7) // 8
        packed = np.zeros((n, packed_dim), dtype=np.uint8)
        for i in range(dim):
            byte_idx = i // 8
            bit_idx = i % 8
            packed[:, byte_idx] |= (binary[:, i] << bit_idx)
        return packed
def estimate_memory_usage(
    num_vectors: int,
    dimensions: int,
    quantization: str = "fp32",
    index_type: str = "hnsw",
    hnsw_m: int = 16
) -> dict:
    """Estimate memory usage for different configurations.

    Args:
        num_vectors: Corpus size.
        dimensions: Vector dimensionality.
        quantization: One of fp32/fp16/int8/pq/binary (KeyError otherwise).
        index_type: "hnsw", "ivf", or anything else for no index overhead.
        hnsw_m: HNSW M parameter (graph connectivity), used for hnsw only.

    Returns:
        Dict with vector storage, index overhead, and totals in MB/GB.
    """
    bytes_per_dim = {
        "fp32": 4,
        "fp16": 2,
        "int8": 1,
        "pq": 0.05,  # Approximate
        "binary": 0.125,
    }[quantization]
    vector_bytes = num_vectors * dimensions * bytes_per_dim
    if index_type == "hnsw":
        # Each node has ~M*2 edges, each edge is 4 bytes (int32)
        index_bytes = num_vectors * hnsw_m * 2 * 4
    elif index_type == "ivf":
        # Inverted lists + centroids
        index_bytes = num_vectors * 8 + 65536 * dimensions * 4
    else:
        index_bytes = 0
    mib = 1024 * 1024
    total_bytes = vector_bytes + index_bytes
    return {
        "vector_storage_mb": vector_bytes / mib,
        "index_overhead_mb": index_bytes / mib,
        "total_mb": total_bytes / mib,
        "total_gb": total_bytes / mib / 1024
    }
```
### Template 3: Qdrant Index Configuration
```python
from qdrant_client import QdrantClient
from qdrant_client.http import models
def create_optimized_collection(
    client: QdrantClient,
    collection_name: str,
    vector_size: int,
    num_vectors: int,
    optimize_for: str = "balanced"  # "recall", "speed", "balanced", "memory"
) -> None:
    """Create a Qdrant collection using a preset tuned for one objective.

    Args:
        client: Connected Qdrant client.
        collection_name: Name of the collection to create.
        vector_size: Embedding dimensionality.
        num_vectors: Expected corpus size.
            NOTE(review): currently unused by the presets — confirm intent.
        optimize_for: Preset key; any other value raises KeyError.
    """
    # HNSW configuration based on optimization target
    hnsw_configs = {
        "recall": models.HnswConfigDiff(m=32, ef_construct=256),
        "speed": models.HnswConfigDiff(m=16, ef_construct=64),
        "balanced": models.HnswConfigDiff(m=16, ef_construct=128),
        "memory": models.HnswConfigDiff(m=8, ef_construct=64)
    }
    # Quantization configuration: none for max recall, scalar INT8 for
    # speed/balanced (RAM-resident only for "speed"), PQ for max compression.
    quantization_configs = {
        "recall": None,  # No quantization for max recall
        "speed": models.ScalarQuantization(
            scalar=models.ScalarQuantizationConfig(
                type=models.ScalarType.INT8,
                quantile=0.99,
                always_ram=True
            )
        ),
        "balanced": models.ScalarQuantization(
            scalar=models.ScalarQuantizationConfig(
                type=models.ScalarType.INT8,
                quantile=0.99,
                always_ram=False
            )
        ),
        "memory": models.ProductQuantization(
            product=models.ProductQuantizationConfig(
                compression=models.CompressionRatio.X16,
                always_ram=False
            )
        )
    }
    # Optimizer configuration: thresholds control when segments are indexed
    # and when they are moved to memory-mapped (disk-backed) storage.
    optimizer_configs = {
        "recall": models.OptimizersConfigDiff(
            indexing_threshold=10000,
            memmap_threshold=50000
        ),
        "speed": models.OptimizersConfigDiff(
            indexing_threshold=5000,
            memmap_threshold=20000
        ),
        "balanced": models.OptimizersConfigDiff(
            indexing_threshold=20000,
            memmap_threshold=50000
        ),
        "memory": models.OptimizersConfigDiff(
            indexing_threshold=50000,
            memmap_threshold=10000  # Use disk sooner
        )
    }
    client.create_collection(
        collection_name=collection_name,
        vectors_config=models.VectorParams(
            size=vector_size,
            distance=models.Distance.COSINE
        ),
        hnsw_config=hnsw_configs[optimize_for],
        quantization_config=quantization_configs[optimize_for],
        optimizers_config=optimizer_configs[optimize_for]
    )
def tune_search_parameters(
    client: QdrantClient,
    collection_name: str,
    target_recall: float = 0.95
) -> models.SearchParams:
    """Build SearchParams tuned for a target recall level.

    FIX: return annotation corrected from `dict` — this function returns a
    models.SearchParams instance, not a dict.

    Args:
        client: Qdrant client.
            NOTE(review): currently unused — confirm whether live measurement
            against the collection was intended.
        collection_name: Target collection.
            NOTE(review): currently unused, see above.
        target_recall: Desired recall; higher targets use larger hnsw_ef and
            quantization rescoring.

    Returns:
        SearchParams to pass to client.search(...).
    """
    # Search parameter recommendations
    if target_recall >= 0.99:
        search_params = models.SearchParams(
            hnsw_ef=256,
            exact=False,
            quantization=models.QuantizationSearchParams(
                ignore=True,  # Don't use quantization for search
                rescore=True
            )
        )
    elif target_recall >= 0.95:
        search_params = models.SearchParams(
            hnsw_ef=128,
            exact=False,
            quantization=models.QuantizationSearchParams(
                ignore=False,
                rescore=True,
                oversampling=2.0
            )
        )
    else:
        search_params = models.SearchParams(
            hnsw_ef=64,
            exact=False,
            quantization=models.QuantizationSearchParams(
                ignore=False,
                rescore=False
            )
        )
    return search_params
```
### Template 4: Performance Monitoring
```python
import time
from dataclasses import dataclass
from typing import List
import numpy as np
@dataclass
class SearchMetrics:
    """Latency, recall, and throughput summary for one benchmark run."""
    latency_p50_ms: float
    latency_p95_ms: float
    latency_p99_ms: float
    recall: float
    qps: float


class VectorSearchMonitor:
    """Monitor vector search performance.

    Optionally accepts ground_truth_fn(query, k=k) -> ids for recall
    measurement; without it, recall is reported as 0.
    """

    def __init__(self, ground_truth_fn=None):
        self.latencies = []
        self.recalls = []
        self.ground_truth_fn = ground_truth_fn

    def measure_search(
        self,
        search_fn,
        query_vectors: np.ndarray,
        k: int = 10,
        num_iterations: int = 100
    ) -> SearchMetrics:
        """Benchmark search_fn(query, k=k): per-query latency percentiles,
        throughput, and (if a ground-truth fn was provided) recall@k."""
        samples = []
        for _ in range(num_iterations):
            for query in query_vectors:
                begin = time.perf_counter()
                search_fn(query, k=k)
                samples.append((time.perf_counter() - begin) * 1000)
        samples = np.array(samples)
        n_queries = num_iterations * len(query_vectors)
        elapsed_s = samples.sum() / 1000  # total measured time in seconds
        p50, p95, p99 = np.percentile(samples, [50, 95, 99])
        measured_recall = (
            self._calculate_recall(search_fn, query_vectors, k)
            if self.ground_truth_fn else 0
        )
        return SearchMetrics(
            latency_p50_ms=p50,
            latency_p95_ms=p95,
            latency_p99_ms=p99,
            recall=measured_recall,
            qps=n_queries / elapsed_s
        )

    def _calculate_recall(self, search_fn, queries: np.ndarray, k: int) -> float:
        """Mean recall@k of search_fn against the ground-truth function."""
        if not self.ground_truth_fn:
            return 0
        hits, seen = 0, 0
        for query in queries:
            predicted = set(search_fn(query, k=k))
            expected = set(self.ground_truth_fn(query, k=k))
            hits += len(predicted & expected)
            seen += k
        return hits / seen
def profile_index_build(
    build_fn,
    vectors: np.ndarray,
    batch_sizes: List[int] = [1000, 10000, 50000]
) -> dict:
    """Profile index build throughput at several batch sizes.

    Calls build_fn on consecutive slices of `vectors` and reports the mean
    per-batch wall time and implied insert rate for each batch size.
    """
    results = {}
    for size in batch_sizes:
        durations = []
        for offset in range(0, len(vectors), size):
            chunk = vectors[offset:offset + size]
            t0 = time.perf_counter()
            build_fn(chunk)
            durations.append(time.perf_counter() - t0)
        mean_duration = np.mean(durations)
        results[size] = {
            "avg_batch_time_s": mean_duration,
            "vectors_per_second": size / mean_duration
        }
    return results
```
## Best Practices
### Do's
- **Benchmark with real queries** - Synthetic may not represent production
- **Monitor recall continuously** - Can degrade with data drift
- **Start with defaults** - Tune only when needed
- **Use quantization** - Significant memory savings
- **Consider tiered storage** - Hot/cold data separation
### Don'ts
- **Don't over-optimize early** - Profile first
- **Don't ignore build time** - Index updates have cost
- **Don't forget reindexing** - Plan for maintenance
- **Don't skip warming** - Cold indexes are slow
## Resources
- [HNSW Paper](https://arxiv.org/abs/1603.09320)
- [Faiss Wiki](https://github.com/facebookresearch/faiss/wiki)
- [ANN Benchmarks](https://ann-benchmarks.com/)

View File

@@ -0,0 +1,661 @@
---
name: backtesting-frameworks
description: Build robust backtesting systems for trading strategies with proper handling of look-ahead bias, survivorship bias, and transaction costs. Use when developing trading algorithms, validating strategies, or building backtesting infrastructure.
---
# Backtesting Frameworks
Build robust, production-grade backtesting systems that avoid common pitfalls and produce reliable strategy performance estimates.
## When to Use This Skill
- Developing trading strategy backtests
- Building backtesting infrastructure
- Validating strategy performance
- Avoiding common backtesting biases
- Implementing walk-forward analysis
- Comparing strategy alternatives
## Core Concepts
### 1. Backtesting Biases
| Bias | Description | Mitigation |
|------|-------------|------------|
| **Look-ahead** | Using future information | Point-in-time data |
| **Survivorship** | Only testing on survivors | Use delisted securities |
| **Overfitting** | Curve-fitting to history | Out-of-sample testing |
| **Selection** | Cherry-picking strategies | Pre-registration |
| **Transaction** | Ignoring trading costs | Realistic cost models |
### 2. Proper Backtest Structure
```
Historical Data
┌─────────────────────────────────────────┐
│ Training Set │
│ (Strategy Development & Optimization) │
└─────────────────────────────────────────┘
┌─────────────────────────────────────────┐
│ Validation Set │
│ (Parameter Selection, No Peeking) │
└─────────────────────────────────────────┘
┌─────────────────────────────────────────┐
│ Test Set │
│ (Final Performance Evaluation) │
└─────────────────────────────────────────┘
```
### 3. Walk-Forward Analysis
```
Window 1: [Train──────][Test]
Window 2: [Train──────][Test]
Window 3: [Train──────][Test]
Window 4: [Train──────][Test]
─────▶ Time
```
## Implementation Patterns
### Pattern 1: Event-Driven Backtester
```python
from abc import ABC, abstractmethod
from dataclasses import dataclass, field
from datetime import datetime
from decimal import Decimal
from enum import Enum
from typing import Dict, List, Optional
import pandas as pd
import numpy as np
class OrderSide(Enum):
    # Direction of an order.
    BUY = "buy"
    SELL = "sell"


class OrderType(Enum):
    # Execution style; SimpleExecutionModel only simulates MARKET orders.
    MARKET = "market"
    LIMIT = "limit"
    STOP = "stop"
@dataclass
class Order:
    """An order request produced by a Strategy for the next bar."""
    symbol: str
    side: OrderSide
    quantity: Decimal
    order_type: OrderType
    limit_price: Optional[Decimal] = None  # NOTE(review): not consumed by SimpleExecutionModel
    stop_price: Optional[Decimal] = None  # NOTE(review): not consumed by SimpleExecutionModel
    timestamp: Optional[datetime] = None
@dataclass
class Fill:
    """Result of executing an Order, including transaction costs."""
    order: Order
    fill_price: Decimal
    fill_quantity: Decimal
    commission: Decimal  # absolute cash amount charged
    slippage: Decimal  # absolute cash cost of slippage vs. the base price
    timestamp: datetime
@dataclass
class Position:
    """Running position in one symbol with average-cost accounting."""
    symbol: str
    quantity: Decimal = Decimal("0")
    avg_cost: Decimal = Decimal("0")  # average entry price of current holding
    realized_pnl: Decimal = Decimal("0")

    def update(self, fill: Fill) -> None:
        """Apply a fill: buys blend into the average cost, sells realize PnL
        against the current average cost.

        NOTE(review): a sell larger than the held quantity flips the position
        short without resetting avg_cost — confirm shorting is intended.
        """
        if fill.order.side == OrderSide.BUY:
            new_quantity = self.quantity + fill.fill_quantity
            if new_quantity != 0:
                # Weighted average of the existing holding and the new fill.
                self.avg_cost = (
                    (self.quantity * self.avg_cost + fill.fill_quantity * fill.fill_price)
                    / new_quantity
                )
            self.quantity = new_quantity
        else:
            # Realize PnL on the sold quantity at the current average cost.
            self.realized_pnl += fill.fill_quantity * (fill.fill_price - self.avg_cost)
            self.quantity -= fill.fill_quantity
@dataclass
class Portfolio:
    """Cash plus per-symbol positions; fills mutate both."""
    cash: Decimal
    positions: Dict[str, Position] = field(default_factory=dict)

    def get_position(self, symbol: str) -> Position:
        # Lazily create an empty position on first access.
        if symbol not in self.positions:
            self.positions[symbol] = Position(symbol=symbol)
        return self.positions[symbol]

    def process_fill(self, fill: Fill) -> None:
        """Apply a fill: update the position and move cash, net of commission."""
        position = self.get_position(fill.order.symbol)
        position.update(fill)
        if fill.order.side == OrderSide.BUY:
            self.cash -= fill.fill_price * fill.fill_quantity + fill.commission
        else:
            self.cash += fill.fill_price * fill.fill_quantity - fill.commission

    def get_equity(self, prices: Dict[str, Decimal]) -> Decimal:
        """Cash plus mark-to-market value of open positions.

        Positions whose symbol is missing from `prices` are skipped.
        """
        equity = self.cash
        for symbol, position in self.positions.items():
            if position.quantity != 0 and symbol in prices:
                equity += position.quantity * prices[symbol]
        return equity
class Strategy(ABC):
    """Trading strategy interface driven by the Backtester event loop."""

    @abstractmethod
    def on_bar(self, timestamp: datetime, data: pd.DataFrame) -> List[Order]:
        """Called once per bar with history up to `timestamp`; returns new orders."""
        pass

    @abstractmethod
    def on_fill(self, fill: Fill) -> None:
        """Called after each executed fill."""
        pass
class ExecutionModel(ABC):
    """Maps an Order plus a price bar to a Fill (or None if not executable)."""

    @abstractmethod
    def execute(self, order: Order, bar: pd.Series) -> Optional[Fill]:
        pass
class SimpleExecutionModel(ExecutionModel):
    """Fills MARKET orders at the bar's open with fixed basis-point slippage
    and per-share commission; LIMIT/STOP orders are not simulated."""

    def __init__(self, slippage_bps: float = 10, commission_per_share: float = 0.01):
        self.slippage_bps = slippage_bps
        self.commission_per_share = commission_per_share

    def execute(self, order: Order, bar: pd.Series) -> Optional[Fill]:
        """Return a Fill for a MARKET order, or None for any other type."""
        if order.order_type != OrderType.MARKET:
            return None
        base_price = Decimal(str(bar["open"]))
        slippage_mult = 1 + (self.slippage_bps / 10000)
        # Slippage always moves the fill price against the trader.
        adjustment = Decimal(str(slippage_mult))
        if order.side == OrderSide.BUY:
            fill_price = base_price * adjustment
        else:
            fill_price = base_price / adjustment
        commission = order.quantity * Decimal(str(self.commission_per_share))
        slippage = abs(fill_price - base_price) * order.quantity
        return Fill(
            order=order,
            fill_price=fill_price,
            fill_quantity=order.quantity,
            commission=commission,
            slippage=slippage,
            timestamp=bar.name
        )
class Backtester:
    """Event-driven backtest engine.

    Orders generated on bar t are executed on bar t+1 (no look-ahead);
    portfolio equity is recorded at each bar's close.
    """

    def __init__(
        self,
        strategy: Strategy,
        execution_model: ExecutionModel,
        initial_capital: Decimal = Decimal("100000")
    ):
        self.strategy = strategy
        self.execution_model = execution_model
        self.portfolio = Portfolio(cash=initial_capital)
        self.equity_curve: List[tuple] = []
        self.trades: List[Fill] = []

    def run(self, data: pd.DataFrame) -> pd.DataFrame:
        """Run backtest on OHLCV data with DatetimeIndex.

        NOTE(review): assumes single-instrument data — every open position is
        marked at this frame's close. Multi-asset backtests need per-symbol
        bars.
        """
        pending_orders: List[Order] = []
        for timestamp, bar in data.iterrows():
            # Execute orders queued on the previous bar at today's prices
            for order in pending_orders:
                fill = self.execution_model.execute(order, bar)
                if fill:
                    self.portfolio.process_fill(fill)
                    self.strategy.on_fill(fill)
                    self.trades.append(fill)
            pending_orders.clear()
            # FIX: mark every open position at this bar's close. The original
            # keyed prices on `data.index.name or "default"`, which never
            # matches Position.symbol, so open positions were silently
            # excluded from the equity curve.
            close_price = Decimal(str(bar["close"]))
            prices = {symbol: close_price for symbol in self.portfolio.positions}
            equity = self.portfolio.get_equity(prices)
            self.equity_curve.append((timestamp, float(equity)))
            # Generate new orders using only history up to this bar
            new_orders = self.strategy.on_bar(timestamp, data.loc[:timestamp])
            pending_orders.extend(new_orders)
        return self._create_results()

    def _create_results(self) -> pd.DataFrame:
        """Assemble the equity curve with per-bar returns."""
        equity_df = pd.DataFrame(self.equity_curve, columns=["timestamp", "equity"])
        equity_df.set_index("timestamp", inplace=True)
        equity_df["returns"] = equity_df["equity"].pct_change()
        return equity_df
```
### Pattern 2: Vectorized Backtester (Fast)
```python
import pandas as pd
import numpy as np
from typing import Callable, Dict, Any
class VectorizedBacktester:
    """Fast vectorized backtester for simple strategies.

    Signals are shifted one bar before being applied, so a signal computed on
    bar t earns bar t+1's return (no look-ahead).
    """

    def __init__(
        self,
        initial_capital: float = 100000,
        commission: float = 0.001,  # 0.1% per unit of position change
        slippage: float = 0.0005  # 0.05% per unit of position change
    ):
        self.initial_capital = initial_capital
        self.commission = commission
        self.slippage = slippage

    def run(
        self,
        prices: pd.DataFrame,
        signal_func: Callable[[pd.DataFrame], pd.Series]
    ) -> Dict[str, Any]:
        """
        Run backtest with signal function.

        Args:
            prices: DataFrame with 'close' column
            signal_func: Function that returns position signals (-1, 0, 1)

        Returns:
            Dictionary with equity curve, per-bar returns, signals, metrics
        """
        # Generate signals (shifted to avoid look-ahead)
        signals = signal_func(prices).shift(1).fillna(0)
        # Calculate returns
        returns = prices["close"].pct_change()
        # Calculate strategy returns with costs
        position_changes = signals.diff().abs()
        trading_costs = position_changes * (self.commission + self.slippage)
        # FIX: the first bar's pct_change()/diff() are NaN; without fillna the
        # NaN propagated through cumprod and the whole equity curve was NaN.
        strategy_returns = (signals * returns - trading_costs).fillna(0.0)
        # Build equity curve
        equity = (1 + strategy_returns).cumprod() * self.initial_capital
        # Calculate metrics
        results = {
            "equity": equity,
            "returns": strategy_returns,
            "signals": signals,
            "metrics": self._calculate_metrics(strategy_returns, equity)
        }
        return results

    def _calculate_metrics(
        self,
        returns: pd.Series,
        equity: pd.Series
    ) -> Dict[str, float]:
        """Calculate performance metrics (annualized assuming daily bars)."""
        total_return = (equity.iloc[-1] / self.initial_capital) - 1
        annual_return = (1 + total_return) ** (252 / len(returns)) - 1
        annual_vol = returns.std() * np.sqrt(252)
        sharpe = annual_return / annual_vol if annual_vol > 0 else 0
        # Drawdown
        rolling_max = equity.cummax()
        drawdown = (equity - rolling_max) / rolling_max
        max_drawdown = drawdown.min()
        # Win rate over bars with any strategy return
        winning_days = (returns > 0).sum()
        total_days = (returns != 0).sum()
        win_rate = winning_days / total_days if total_days > 0 else 0
        return {
            "total_return": total_return,
            "annual_return": annual_return,
            "annual_volatility": annual_vol,
            "sharpe_ratio": sharpe,
            "max_drawdown": max_drawdown,
            "win_rate": win_rate,
            # NOTE(review): counts bars with nonzero strategy return, not
            # round-trip trades — confirm the intended definition.
            "num_trades": int((returns != 0).sum())
        }
# Example usage
def momentum_signal(prices: pd.DataFrame, lookback: int = 20) -> pd.Series:
    """Simple momentum strategy: 1 when close is above its rolling SMA,
    otherwise 0 (bars inside the warm-up window are flat)."""
    close = prices["close"]
    above_sma = close.gt(close.rolling(lookback).mean())
    return above_sma.astype(int)
# Run backtest
# backtester = VectorizedBacktester()
# results = backtester.run(price_data, lambda p: momentum_signal(p, 50))
```
### Pattern 3: Walk-Forward Optimization
```python
from typing import Callable, Dict, List, Tuple, Any
import pandas as pd
import numpy as np
from itertools import product
class WalkForwardOptimizer:
    """Walk-forward analysis with anchored or rolling windows.

    Repeatedly optimizes strategy parameters on a training window and
    evaluates them on the immediately following out-of-sample test window.
    """

    def __init__(
        self,
        train_period: int,
        test_period: int,
        anchored: bool = False,
        n_splits: int = None
    ):
        """
        Args:
            train_period: Number of bars in training window
            test_period: Number of bars in test window
            anchored: If True, training always starts from beginning
                (training set grows with each split)
            n_splits: Number of train/test splits (auto-calculated if None)
        """
        self.train_period = train_period
        self.test_period = test_period
        self.anchored = anchored
        self.n_splits = n_splits

    def generate_splits(
        self,
        data: pd.DataFrame
    ) -> List[Tuple[pd.DataFrame, pd.DataFrame]]:
        """Generate train/test splits.

        Rolling mode slides both windows forward by `step`; anchored mode
        keeps the training start pinned at bar 0.
        """
        splits = []
        n = len(data)
        # Step between split starts: spread evenly when n_splits is given,
        # otherwise advance by one test window (non-overlapping test sets).
        if self.n_splits:
            step = (n - self.train_period) // self.n_splits
        else:
            step = self.test_period
        start = 0
        while start + self.train_period + self.test_period <= n:
            if self.anchored:
                train_start = 0
            else:
                train_start = start
            train_end = start + self.train_period
            test_end = min(train_end + self.test_period, n)
            train_data = data.iloc[train_start:train_end]
            test_data = data.iloc[train_end:test_end]
            splits.append((train_data, test_data))
            start += step
        return splits

    def optimize(
        self,
        data: pd.DataFrame,
        strategy_func: Callable,
        param_grid: Dict[str, List],
        metric: str = "sharpe_ratio"
    ) -> Dict[str, Any]:
        """
        Run walk-forward optimization.

        Args:
            data: Full dataset
            strategy_func: Function(data, **params) -> results dict; the dict
                must contain "metrics" (with `metric` as a key) and "equity"
            param_grid: Parameter combinations to test
            metric: Metric to optimize

        Returns:
            Combined results from all test periods
        """
        splits = self.generate_splits(data)
        all_results = []
        optimal_params_history = []
        for i, (train_data, test_data) in enumerate(splits):
            # Optimize on training data
            best_params, best_metric = self._grid_search(
                train_data, strategy_func, param_grid, metric
            )
            optimal_params_history.append(best_params)
            # Test with optimal params (out-of-sample)
            test_results = strategy_func(test_data, **best_params)
            test_results["split"] = i
            test_results["params"] = best_params
            all_results.append(test_results)
            print(f"Split {i+1}/{len(splits)}: "
                  f"Best {metric}={best_metric:.4f}, params={best_params}")
        return {
            "split_results": all_results,
            "param_history": optimal_params_history,
            "combined_equity": self._combine_equity_curves(all_results)
        }

    def _grid_search(
        self,
        data: pd.DataFrame,
        strategy_func: Callable,
        param_grid: Dict[str, List],
        metric: str
    ) -> Tuple[Dict, float]:
        """Exhaustive grid search; returns (best params, best metric value)."""
        best_params = None
        best_metric = -np.inf
        # Generate all parameter combinations (Cartesian product of the grid)
        param_names = list(param_grid.keys())
        param_values = list(param_grid.values())
        for values in product(*param_values):
            params = dict(zip(param_names, values))
            results = strategy_func(data, **params)
            if results["metrics"][metric] > best_metric:
                best_metric = results["metrics"][metric]
                best_params = params
        return best_params, best_metric

    def _combine_equity_curves(
        self,
        results: List[Dict]
    ) -> pd.Series:
        """Concatenate the out-of-sample equity curves from all test periods."""
        combined = pd.concat([r["equity"] for r in results])
        return combined
```
### Pattern 4: Monte Carlo Analysis
```python
import numpy as np
import pandas as pd
from typing import Dict, List
class MonteCarloAnalyzer:
    """Monte Carlo simulation for strategy robustness.

    Resamples historical returns (bootstrap with replacement) to estimate
    drawdown distributions, loss probabilities, and return confidence bands.
    """

    def __init__(self, n_simulations: int = 1000, confidence: float = 0.95, seed=None):
        """
        Args:
            n_simulations: Number of bootstrap paths per analysis
            confidence: Confidence level for percentile/interval estimates
            seed: Optional RNG seed for reproducible resampling; None keeps
                the previous nondeterministic behavior (backward compatible)
        """
        self.n_simulations = n_simulations
        self.confidence = confidence
        # FIX: use a dedicated Generator rather than the global np.random
        # state so results are reproducible when a seed is supplied.
        self._rng = np.random.default_rng(seed)

    def bootstrap_returns(
        self,
        returns: pd.Series,
        n_periods: int = None
    ) -> np.ndarray:
        """
        Bootstrap simulation by resampling returns with replacement.

        Args:
            returns: Historical returns series
            n_periods: Length of each simulation (default: same as input)

        Returns:
            Array of shape (n_simulations, n_periods)
        """
        if n_periods is None:
            n_periods = len(returns)
        # Vectorized draw: one RNG call instead of a Python loop per path.
        return self._rng.choice(
            returns.values,
            size=(self.n_simulations, n_periods),
            replace=True
        )

    def analyze_drawdowns(
        self,
        returns: pd.Series
    ) -> Dict[str, float]:
        """Estimate the max-drawdown distribution via bootstrap simulation."""
        simulations = self.bootstrap_returns(returns)
        max_drawdowns = []
        for sim_returns in simulations:
            equity = (1 + sim_returns).cumprod()
            rolling_max = np.maximum.accumulate(equity)
            drawdowns = (equity - rolling_max) / rolling_max
            max_drawdowns.append(drawdowns.min())
        max_drawdowns = np.array(max_drawdowns)
        return {
            "expected_max_dd": np.mean(max_drawdowns),
            "median_max_dd": np.median(max_drawdowns),
            # Lower tail of the distribution at the configured confidence
            f"worst_{int(self.confidence*100)}pct": np.percentile(
                max_drawdowns, (1 - self.confidence) * 100
            ),
            "worst_case": max_drawdowns.min()
        }

    def probability_of_loss(
        self,
        returns: pd.Series,
        holding_periods: List[int] = [21, 63, 126, 252]
    ) -> Dict[int, float]:
        """Probability of a negative total return over each holding period.

        Periods longer than the input series are skipped.
        """
        results = {}
        for period in holding_periods:
            if period > len(returns):
                continue
            simulations = self.bootstrap_returns(returns, period)
            total_returns = (1 + simulations).prod(axis=1) - 1
            prob_loss = (total_returns < 0).mean()
            results[period] = prob_loss
        return results

    def confidence_interval(
        self,
        returns: pd.Series,
        periods: int = 252
    ) -> Dict[str, float]:
        """Bootstrap confidence interval for the total return over `periods`."""
        simulations = self.bootstrap_returns(returns, periods)
        total_returns = (1 + simulations).prod(axis=1) - 1
        lower = (1 - self.confidence) / 2
        upper = 1 - lower
        return {
            "expected": total_returns.mean(),
            "lower_bound": np.percentile(total_returns, lower * 100),
            "upper_bound": np.percentile(total_returns, upper * 100),
            "std": total_returns.std()
        }
```
## Performance Metrics
```python
def calculate_metrics(returns: pd.Series, rf_rate: float = 0.02) -> Dict[str, float]:
    """Calculate comprehensive performance metrics from periodic returns.

    Annualization assumes daily bars (252 periods/year). Zero-return bars are
    excluded from win_rate and num_trades.
    """
    ann_factor = 252
    # Compounded growth and annualized return/volatility
    total_return = (1 + returns).prod() - 1
    annual_return = (1 + total_return) ** (ann_factor / len(returns)) - 1
    annual_vol = returns.std() * np.sqrt(ann_factor)
    sharpe = (annual_return - rf_rate) / annual_vol if annual_vol > 0 else 0
    # Sortino: penalize downside volatility only
    downside = returns[returns < 0]
    downside_vol = downside.std() * np.sqrt(ann_factor)
    sortino = (annual_return - rf_rate) / downside_vol if downside_vol > 0 else 0
    # Calmar: annual return relative to the worst peak-to-trough loss
    equity = (1 + returns).cumprod()
    peak = equity.cummax()
    max_drawdown = ((equity - peak) / peak).min()
    calmar = annual_return / abs(max_drawdown) if max_drawdown != 0 else 0
    # Hit rate and gross-win / gross-loss ratio
    wins = returns[returns > 0]
    losses = returns[returns < 0]
    active = returns[returns != 0]
    win_rate = len(wins) / len(active) if len(active) > 0 else 0
    profit_factor = wins.sum() / abs(losses.sum()) if losses.sum() != 0 else np.inf
    return {
        "total_return": total_return,
        "annual_return": annual_return,
        "annual_volatility": annual_vol,
        "sharpe_ratio": sharpe,
        "sortino_ratio": sortino,
        "calmar_ratio": calmar,
        "max_drawdown": max_drawdown,
        "win_rate": win_rate,
        "profit_factor": profit_factor,
        "num_trades": int((returns != 0).sum())
    }
```
## Best Practices
### Do's
- **Use point-in-time data** - Avoid look-ahead bias
- **Include transaction costs** - Realistic estimates
- **Test out-of-sample** - Always reserve data
- **Use walk-forward** - Not just train/test
- **Monte Carlo analysis** - Understand uncertainty
### Don'ts
- **Don't overfit** - Limit parameters
- **Don't ignore survivorship** - Include delisted
- **Don't use adjusted data carelessly** - Understand adjustments
- **Don't optimize on full history** - Reserve test set
- **Don't ignore capacity** - Market impact matters
## Resources
- [Advances in Financial Machine Learning (Marcos López de Prado)](https://www.amazon.com/Advances-Financial-Machine-Learning-Marcos/dp/1119482089)
- [Quantitative Trading (Ernest Chan)](https://www.amazon.com/Quantitative-Trading-Build-Algorithmic-Business/dp/1119800064)
- [Backtrader Documentation](https://www.backtrader.com/docu/)

View File

@@ -0,0 +1,555 @@
---
name: risk-metrics-calculation
description: Calculate portfolio risk metrics including VaR, CVaR, Sharpe, Sortino, and drawdown analysis. Use when measuring portfolio risk, implementing risk limits, or building risk monitoring systems.
---
# Risk Metrics Calculation
Comprehensive risk measurement toolkit for portfolio management, including Value at Risk, Expected Shortfall, and drawdown analysis.
## When to Use This Skill
- Measuring portfolio risk
- Implementing risk limits
- Building risk dashboards
- Calculating risk-adjusted returns
- Setting position sizes
- Regulatory reporting
## Core Concepts
### 1. Risk Metric Categories
| Category | Metrics | Use Case |
|----------|---------|----------|
| **Volatility** | Std Dev, Beta | General risk |
| **Tail Risk** | VaR, CVaR | Extreme losses |
| **Drawdown** | Max DD, Calmar | Capital preservation |
| **Risk-Adjusted** | Sharpe, Sortino | Performance |
### 2. Time Horizons
```
Intraday: Minute/hourly VaR for day traders
Daily: Standard risk reporting
Weekly: Rebalancing decisions
Monthly: Performance attribution
Annual: Strategic allocation
```
## Implementation
### Pattern 1: Core Risk Metrics
```python
import numpy as np
import pandas as pd
from scipy import stats
from typing import Dict, Optional, Tuple
class RiskMetrics:
"""Core risk metric calculations."""
def __init__(self, returns: pd.Series, rf_rate: float = 0.02):
"""
Args:
returns: Series of periodic returns
rf_rate: Annual risk-free rate
"""
self.returns = returns
self.rf_rate = rf_rate
self.ann_factor = 252 # Trading days per year
# Volatility Metrics
def volatility(self, annualized: bool = True) -> float:
"""Standard deviation of returns."""
vol = self.returns.std()
if annualized:
vol *= np.sqrt(self.ann_factor)
return vol
def downside_deviation(self, threshold: float = 0, annualized: bool = True) -> float:
"""Standard deviation of returns below threshold."""
downside = self.returns[self.returns < threshold]
if len(downside) == 0:
return 0.0
dd = downside.std()
if annualized:
dd *= np.sqrt(self.ann_factor)
return dd
def beta(self, market_returns: pd.Series) -> float:
"""Beta relative to market."""
aligned = pd.concat([self.returns, market_returns], axis=1).dropna()
if len(aligned) < 2:
return np.nan
cov = np.cov(aligned.iloc[:, 0], aligned.iloc[:, 1])
return cov[0, 1] / cov[1, 1] if cov[1, 1] != 0 else 0
# Value at Risk
def var_historical(self, confidence: float = 0.95) -> float:
"""Historical VaR at confidence level."""
return -np.percentile(self.returns, (1 - confidence) * 100)
def var_parametric(self, confidence: float = 0.95) -> float:
"""Parametric VaR assuming normal distribution."""
z_score = stats.norm.ppf(confidence)
return self.returns.mean() - z_score * self.returns.std()
def var_cornish_fisher(self, confidence: float = 0.95) -> float:
"""VaR with Cornish-Fisher expansion for non-normality."""
z = stats.norm.ppf(confidence)
s = stats.skew(self.returns) # Skewness
k = stats.kurtosis(self.returns) # Excess kurtosis
# Cornish-Fisher expansion
z_cf = (z + (z**2 - 1) * s / 6 +
(z**3 - 3*z) * k / 24 -
(2*z**3 - 5*z) * s**2 / 36)
return -(self.returns.mean() + z_cf * self.returns.std())
# Conditional VaR (Expected Shortfall)
def cvar(self, confidence: float = 0.95) -> float:
"""Expected Shortfall / CVaR / Average VaR."""
var = self.var_historical(confidence)
return -self.returns[self.returns <= -var].mean()
# Drawdown Analysis
def drawdowns(self) -> pd.Series:
"""Calculate drawdown series."""
cumulative = (1 + self.returns).cumprod()
running_max = cumulative.cummax()
return (cumulative - running_max) / running_max
def max_drawdown(self) -> float:
"""Maximum drawdown."""
return self.drawdowns().min()
def avg_drawdown(self) -> float:
"""Average drawdown."""
dd = self.drawdowns()
return dd[dd < 0].mean() if (dd < 0).any() else 0
def drawdown_duration(self) -> Dict[str, int]:
"""Drawdown duration statistics."""
dd = self.drawdowns()
in_drawdown = dd < 0
# Find drawdown periods
drawdown_starts = in_drawdown & ~in_drawdown.shift(1).fillna(False)
drawdown_ends = ~in_drawdown & in_drawdown.shift(1).fillna(False)
durations = []
current_duration = 0
for i in range(len(dd)):
if in_drawdown.iloc[i]:
current_duration += 1
elif current_duration > 0:
durations.append(current_duration)
current_duration = 0
if current_duration > 0:
durations.append(current_duration)
return {
"max_duration": max(durations) if durations else 0,
"avg_duration": np.mean(durations) if durations else 0,
"current_duration": current_duration
}
# Risk-Adjusted Returns
def sharpe_ratio(self) -> float:
    """Annualized Sharpe ratio.

    (Annualized mean return - risk-free rate) / annualized volatility.
    Returns 0 when volatility is zero to avoid division by zero.
    """
    excess_return = self.returns.mean() * self.ann_factor - self.rf_rate
    vol = self.volatility(annualized=True)
    return excess_return / vol if vol > 0 else 0
def sortino_ratio(self) -> float:
    """Sortino ratio using downside deviation.

    Like Sharpe but penalizes only volatility of returns below the zero
    threshold. Returns 0 when downside deviation is zero.
    """
    excess_return = self.returns.mean() * self.ann_factor - self.rf_rate
    dd = self.downside_deviation(threshold=0, annualized=True)
    return excess_return / dd if dd > 0 else 0
def calmar_ratio(self) -> float:
    """Calmar ratio (return / max drawdown).

    Geometric annualized return divided by the magnitude of the maximum
    drawdown. Returns 0 when there has been no drawdown.
    """
    annual_return = (1 + self.returns).prod() ** (self.ann_factor / len(self.returns)) - 1
    max_dd = abs(self.max_drawdown())
    return annual_return / max_dd if max_dd > 0 else 0
def omega_ratio(self, threshold: float = 0) -> float:
    """Omega ratio.

    Sum of gains above `threshold` divided by sum of shortfalls below it.
    Returns +inf when no return falls at or below the threshold.
    """
    returns_above = self.returns[self.returns > threshold] - threshold
    returns_below = threshold - self.returns[self.returns <= threshold]
    if returns_below.sum() == 0:
        return np.inf
    return returns_above.sum() / returns_below.sum()
# Information Ratio
def information_ratio(self, benchmark_returns: pd.Series) -> float:
    """Information ratio vs benchmark.

    Annualized active return divided by annualized tracking error.
    Assumes `benchmark_returns` is index-aligned with `self.returns`
    (misaligned indexes would introduce NaNs) -- TODO confirm callers.
    Returns 0 when tracking error is zero.
    """
    active_returns = self.returns - benchmark_returns
    tracking_error = active_returns.std() * np.sqrt(self.ann_factor)
    active_return = active_returns.mean() * self.ann_factor
    return active_return / tracking_error if tracking_error > 0 else 0
# Summary
def summary(self) -> Dict[str, float]:
    """Generate comprehensive risk summary.

    Bundles return, volatility, tail-risk, drawdown, risk-adjusted and
    distribution statistics into one flat dict for reporting.
    """
    dd_stats = self.drawdown_duration()
    return {
        # Returns
        "total_return": (1 + self.returns).prod() - 1,
        # Geometric annualization over the sample length
        "annual_return": (1 + self.returns).prod() ** (self.ann_factor / len(self.returns)) - 1,
        # Volatility
        "annual_volatility": self.volatility(),
        "downside_deviation": self.downside_deviation(),
        # VaR & CVaR
        "var_95_historical": self.var_historical(0.95),
        "var_99_historical": self.var_historical(0.99),
        "cvar_95": self.cvar(0.95),
        # Drawdowns
        "max_drawdown": self.max_drawdown(),
        "avg_drawdown": self.avg_drawdown(),
        "max_drawdown_duration": dd_stats["max_duration"],
        # Risk-Adjusted
        "sharpe_ratio": self.sharpe_ratio(),
        "sortino_ratio": self.sortino_ratio(),
        "calmar_ratio": self.calmar_ratio(),
        "omega_ratio": self.omega_ratio(),
        # Distribution
        "skewness": stats.skew(self.returns),
        "kurtosis": stats.kurtosis(self.returns),
    }
```
### Pattern 2: Portfolio Risk
```python
class PortfolioRisk:
    """Portfolio-level risk calculations.

    Operates on a returns DataFrame (one column per asset) plus a weight
    vector; all annualization uses 252 trading days.
    """
    def __init__(
        self,
        returns: pd.DataFrame,
        weights: Optional[pd.Series] = None
    ):
        """
        Args:
            returns: DataFrame with asset returns (columns = assets)
            weights: Portfolio weights (default: equal weight)
        """
        self.returns = returns
        # Default to an equal-weight portfolio across all columns.
        self.weights = weights if weights is not None else \
            pd.Series(1/len(returns.columns), index=returns.columns)
        self.ann_factor = 252  # Trading days per year

    def portfolio_return(self) -> float:
        """Weighted portfolio return, annualized (arithmetic mean * 252)."""
        return (self.returns @ self.weights).mean() * self.ann_factor

    def portfolio_volatility(self) -> float:
        """Portfolio volatility: sqrt(w' * annualized covariance * w)."""
        cov_matrix = self.returns.cov() * self.ann_factor
        port_var = self.weights @ cov_matrix @ self.weights
        return np.sqrt(port_var)

    def marginal_risk_contribution(self) -> pd.Series:
        """Marginal contribution to risk by asset: (cov @ w) / portfolio vol."""
        cov_matrix = self.returns.cov() * self.ann_factor
        port_vol = self.portfolio_volatility()
        # Marginal contribution
        mrc = (cov_matrix @ self.weights) / port_vol
        return mrc

    def component_risk(self) -> pd.Series:
        """Component contribution to total risk (weight * marginal contribution)."""
        mrc = self.marginal_risk_contribution()
        return self.weights * mrc

    def risk_parity_weights(self, target_vol: float = None) -> pd.Series:
        """Calculate risk parity weights.

        Numerically solves for long-only, fully-invested weights where each
        asset contributes equally to portfolio risk.
        NOTE(review): `target_vol` is accepted but never used in the body —
        confirm whether a volatility-targeting step was intended.
        """
        from scipy.optimize import minimize
        n = len(self.returns.columns)
        cov_matrix = self.returns.cov() * self.ann_factor
        def risk_budget_objective(weights):
            # Squared deviation of each risk contribution from the equal share.
            port_vol = np.sqrt(weights @ cov_matrix @ weights)
            mrc = (cov_matrix @ weights) / port_vol
            rc = weights * mrc
            target_rc = port_vol / n  # Equal risk contribution
            return np.sum((rc - target_rc) ** 2)
        constraints = [
            {"type": "eq", "fun": lambda w: np.sum(w) - 1},  # Weights sum to 1
        ]
        bounds = [(0.01, 1.0) for _ in range(n)]  # Min 1%, max 100%
        x0 = np.array([1/n] * n)  # Start from equal weights
        result = minimize(
            risk_budget_objective,
            x0,
            method="SLSQP",
            bounds=bounds,
            constraints=constraints
        )
        return pd.Series(result.x, index=self.returns.columns)

    def correlation_matrix(self) -> pd.DataFrame:
        """Asset correlation matrix."""
        return self.returns.corr()

    def diversification_ratio(self) -> float:
        """Diversification ratio (higher = more diversified).

        Weighted average of standalone vols over portfolio vol; defaults to
        1 when portfolio vol is zero.
        """
        asset_vols = self.returns.std() * np.sqrt(self.ann_factor)
        weighted_vol = (self.weights * asset_vols).sum()
        port_vol = self.portfolio_volatility()
        return weighted_vol / port_vol if port_vol > 0 else 1

    def tracking_error(self, benchmark_returns: pd.Series) -> float:
        """Tracking error vs benchmark (annualized std of active returns)."""
        port_returns = self.returns @ self.weights
        active_returns = port_returns - benchmark_returns
        return active_returns.std() * np.sqrt(self.ann_factor)

    def conditional_correlation(
        self,
        threshold_percentile: float = 10
    ) -> pd.DataFrame:
        """Correlation during stress periods.

        Recomputes correlations using only the observations where the
        portfolio return fell in its worst `threshold_percentile` percent.
        """
        port_returns = self.returns @ self.weights
        threshold = np.percentile(port_returns, threshold_percentile)
        stress_mask = port_returns <= threshold
        return self.returns[stress_mask].corr()
```
### Pattern 3: Rolling Risk Metrics
```python
class RollingRiskMetrics:
    """Rolling window risk calculations over a single return series."""

    def __init__(self, returns: pd.Series, window: int = 63):
        """
        Args:
            returns: Return series
            window: Rolling window size (default: 63 = ~3 months)
        """
        self.returns = returns
        self.window = window

    def rolling_volatility(self, annualized: bool = True) -> pd.Series:
        """Rolling standard deviation, annualized with sqrt(252) by default."""
        vol = self.returns.rolling(self.window).std()
        if annualized:
            vol *= np.sqrt(252)
        return vol

    def rolling_sharpe(self, rf_rate: float = 0.02) -> pd.Series:
        """Rolling Sharpe ratio (annualized mean excess return / rolling vol)."""
        rolling_return = self.returns.rolling(self.window).mean() * 252
        rolling_vol = self.rolling_volatility()
        return (rolling_return - rf_rate) / rolling_vol

    def rolling_var(self, confidence: float = 0.95) -> pd.Series:
        """Rolling historical VaR (positive-loss convention)."""
        return self.returns.rolling(self.window).apply(
            lambda x: -np.percentile(x, (1 - confidence) * 100),
            raw=True
        )

    def rolling_max_drawdown(self) -> pd.Series:
        """Rolling maximum drawdown within each window."""
        def max_dd(returns):
            # Worst peak-to-trough decline of cumulative wealth in the window.
            cumulative = (1 + returns).cumprod()
            running_max = cumulative.cummax()
            drawdowns = (cumulative - running_max) / running_max
            return drawdowns.min()
        return self.returns.rolling(self.window).apply(max_dd, raw=False)

    def rolling_beta(self, market_returns: pd.Series) -> pd.Series:
        """Rolling beta vs market (rolling covariance / rolling market variance).

        Bug fix: the previous version concatenated the two series and used
        Series-style rolling .apply with a two-column function; pandas applies
        that function to one column at a time, so the second column was never
        available and the computation failed at runtime. The standard,
        vectorized form is rolling cov divided by rolling variance.
        """
        cov = self.returns.rolling(self.window).cov(market_returns)
        market_var = market_returns.rolling(self.window).var()
        return cov / market_var

    def volatility_regime(
        self,
        low_threshold: float = 0.10,
        high_threshold: float = 0.20
    ) -> pd.Series:
        """Classify each observation's annualized rolling vol as low/normal/high."""
        vol = self.rolling_volatility()

        def classify(v):
            if v < low_threshold:
                return "low"
            elif v > high_threshold:
                return "high"
            else:
                return "normal"
        return vol.apply(classify)
```
### Pattern 4: Stress Testing
```python
class StressTester:
    """Historical and hypothetical stress testing."""
    # Historical crisis periods: label -> (start, end) date strings,
    # inclusive, used to slice a date-indexed return history.
    HISTORICAL_SCENARIOS = {
        "2008_financial_crisis": ("2008-09-01", "2009-03-31"),
        "2020_covid_crash": ("2020-02-19", "2020-03-23"),
        "2022_rate_hikes": ("2022-01-01", "2022-10-31"),
        "dot_com_bust": ("2000-03-01", "2002-10-01"),
        "flash_crash_2010": ("2010-05-06", "2010-05-06"),  # single-day event
    }
    def __init__(self, returns: pd.Series, weights: pd.Series = None):
        # weights is optional; required only for weighted/hypothetical tests.
        self.returns = returns
        self.weights = weights
    def historical_stress_test(
        self,
        scenario_name: str,
        historical_data: pd.DataFrame
    ) -> Dict[str, float]:
        """Test portfolio against historical crisis period.

        Args:
            scenario_name: Key into HISTORICAL_SCENARIOS.
            historical_data: Return history with a date-like index sliceable
                by the scenario date strings (presumably daily, since
                volatility is annualized with sqrt(252) -- TODO confirm).

        Raises:
            ValueError: If the scenario name is unknown.
        """
        if scenario_name not in self.HISTORICAL_SCENARIOS:
            raise ValueError(f"Unknown scenario: {scenario_name}")
        start, end = self.HISTORICAL_SCENARIOS[scenario_name]
        # Get returns during crisis
        crisis_returns = historical_data.loc[start:end]
        if self.weights is not None:
            port_returns = (crisis_returns @ self.weights)
        else:
            port_returns = crisis_returns
        total_return = (1 + port_returns).prod() - 1
        max_dd = self._calculate_max_dd(port_returns)
        worst_day = port_returns.min()
        return {
            "scenario": scenario_name,
            "period": f"{start} to {end}",
            "total_return": total_return,
            "max_drawdown": max_dd,
            "worst_day": worst_day,
            "volatility": port_returns.std() * np.sqrt(252)
        }
    def hypothetical_stress_test(
        self,
        shocks: Dict[str, float]
    ) -> float:
        """
        Test portfolio against hypothetical shocks.
        Args:
            shocks: Dict of {asset: shock_return}
        Returns:
            First-order portfolio impact: sum of weight * shock over shocked
            assets present in the weight index (unknown assets are ignored).
        Raises:
            ValueError: If no weights were supplied at construction.
        """
        if self.weights is None:
            raise ValueError("Weights required for hypothetical stress test")
        total_impact = 0
        for asset, shock in shocks.items():
            if asset in self.weights.index:
                total_impact += self.weights[asset] * shock
        return total_impact
    def monte_carlo_stress(
        self,
        n_simulations: int = 10000,
        horizon_days: int = 21,
        vol_multiplier: float = 2.0
    ) -> Dict[str, float]:
        """Monte Carlo stress test with elevated volatility.

        Draws i.i.d. normal daily returns with the sample mean and an
        inflated sample std, then summarizes the simulated horizon returns.
        NOTE(review): uses the global NumPy RNG, so results are not
        reproducible unless the caller seeds it.
        """
        mean = self.returns.mean()
        vol = self.returns.std() * vol_multiplier
        simulations = np.random.normal(
            mean,
            vol,
            (n_simulations, horizon_days)
        )
        # Compound the daily draws into one total return per simulated path.
        total_returns = (1 + simulations).prod(axis=1) - 1
        return {
            "expected_loss": -total_returns.mean(),
            "var_95": -np.percentile(total_returns, 5),
            "var_99": -np.percentile(total_returns, 1),
            "worst_case": -total_returns.min(),
            "prob_10pct_loss": (total_returns < -0.10).mean()
        }
    def _calculate_max_dd(self, returns: pd.Series) -> float:
        # Worst peak-to-trough decline of cumulative wealth over the slice.
        cumulative = (1 + returns).cumprod()
        running_max = cumulative.cummax()
        drawdowns = (cumulative - running_max) / running_max
        return drawdowns.min()
```
## Quick Reference
```python
# Daily usage
metrics = RiskMetrics(returns)
print(f"Sharpe: {metrics.sharpe_ratio():.2f}")
print(f"Max DD: {metrics.max_drawdown():.2%}")
print(f"VaR 95%: {metrics.var_historical(0.95):.2%}")
# Full summary
summary = metrics.summary()
for metric, value in summary.items():
print(f"{metric}: {value:.4f}")
```
## Best Practices
### Do's
- **Use multiple metrics** - No single metric captures all risk
- **Consider tail risk** - VaR isn't enough, use CVaR
- **Rolling analysis** - Risk changes over time
- **Stress test** - Historical and hypothetical
- **Document assumptions** - Distribution, lookback, etc.
### Don'ts
- **Don't rely on VaR alone** - Underestimates tail risk
- **Don't assume normality** - Returns are fat-tailed
- **Don't ignore correlation** - Increases in stress
- **Don't use short lookbacks** - Miss regime changes
- **Don't forget transaction costs** - Affects realized risk
## Resources
- [Risk Management and Financial Institutions (John Hull)](https://www.amazon.com/Risk-Management-Financial-Institutions-5th/dp/1119448115)
- [Quantitative Risk Management (McNeil, Frey, Embrechts)](https://www.amazon.com/Quantitative-Risk-Management-Techniques-Princeton/dp/0691166277)
- [pyfolio Documentation](https://quantopian.github.io/pyfolio/)

View File

@@ -0,0 +1,44 @@
# Threat Modeling Expert
Expert in threat modeling methodologies, security architecture review, and risk assessment. Masters STRIDE, PASTA, attack trees, and security requirement extraction. Use PROACTIVELY for security architecture reviews, threat identification, or building secure-by-design systems.
## Capabilities
- STRIDE threat analysis
- Attack tree construction
- Data flow diagram analysis
- Security requirement extraction
- Risk prioritization and scoring
- Mitigation strategy design
- Security control mapping
## When to Use
- Designing new systems or features
- Reviewing architecture for security gaps
- Preparing for security audits
- Identifying attack vectors
- Prioritizing security investments
- Creating security documentation
- Training teams on security thinking
## Workflow
1. Define system scope and trust boundaries
2. Create data flow diagrams
3. Identify assets and entry points
4. Apply STRIDE to each component
5. Build attack trees for critical paths
6. Score and prioritize threats
7. Design mitigations
8. Document residual risks
## Best Practices
- Involve developers in threat modeling sessions
- Focus on data flows, not just components
- Consider insider threats
- Update threat models with architecture changes
- Link threats to security requirements
- Track mitigations to implementation
- Review regularly, not just at design time

View File

@@ -0,0 +1,685 @@
---
name: attack-tree-construction
description: Build comprehensive attack trees to visualize threat paths. Use when mapping attack scenarios, identifying defense gaps, or communicating security risks to stakeholders.
---
# Attack Tree Construction
Systematic attack path visualization and analysis.
## When to Use This Skill
- Visualizing complex attack scenarios
- Identifying defense gaps and priorities
- Communicating risks to stakeholders
- Planning defensive investments
- Penetration test planning
- Security architecture review
## Core Concepts
### 1. Attack Tree Structure
```
[Root Goal]
|
┌────────────┴────────────┐
│ │
[Sub-goal 1] [Sub-goal 2]
(OR node) (AND node)
│ │
┌─────┴─────┐ ┌─────┴─────┐
│ │ │ │
[Attack] [Attack] [Attack] [Attack]
(leaf) (leaf) (leaf) (leaf)
```
### 2. Node Types
| Type | Symbol | Description |
|------|--------|-------------|
| **OR** | Oval | Any child achieves goal |
| **AND** | Rectangle | All children required |
| **Leaf** | Box | Atomic attack step |
### 3. Attack Attributes
| Attribute | Description | Values |
|-----------|-------------|--------|
| **Cost** | Resources needed | $, $$, $$$ |
| **Time** | Duration to execute | Hours, Days, Weeks |
| **Skill** | Expertise required | Low, Medium, High |
| **Detection** | Likelihood of detection | Low, Medium, High |
## Templates
### Template 1: Attack Tree Data Model
```python
from dataclasses import dataclass, field
from enum import Enum
from typing import List, Dict, Optional, Union
import json
class NodeType(Enum):
    # OR: any child suffices; AND: all children required; LEAF: atomic step.
    OR = "or"
    AND = "and"
    LEAF = "leaf"

class Difficulty(Enum):
    # Attacker skill required, ascending (1 = trivial, 5 = expert).
    TRIVIAL = 1
    LOW = 2
    MEDIUM = 3
    HIGH = 4
    EXPERT = 5

class Cost(Enum):
    # Resources the attacker must spend, ascending.
    FREE = 0
    LOW = 1
    MEDIUM = 2
    HIGH = 3
    VERY_HIGH = 4

class DetectionRisk(Enum):
    # Likelihood the attack step is detected, ascending.
    NONE = 0
    LOW = 1
    MEDIUM = 2
    HIGH = 3
    CERTAIN = 4

@dataclass
class AttackAttributes:
    """Per-attack-step attributes used for path scoring."""
    difficulty: Difficulty = Difficulty.MEDIUM
    cost: Cost = Cost.MEDIUM
    detection_risk: DetectionRisk = DetectionRisk.MEDIUM
    time_hours: float = 8.0  # Estimated execution time
    requires_insider: bool = False  # Needs insider access/cooperation
    requires_physical: bool = False  # Needs physical presence
@dataclass
class AttackNode:
    """One node in an attack tree: a goal (OR/AND) or an atomic attack (LEAF)."""
    id: str
    name: str
    description: str
    node_type: NodeType
    attributes: AttackAttributes = field(default_factory=AttackAttributes)
    children: List['AttackNode'] = field(default_factory=list)
    mitigations: List[str] = field(default_factory=list)  # Known countermeasures
    cve_refs: List[str] = field(default_factory=list)  # Related CVE identifiers
    def add_child(self, child: 'AttackNode') -> None:
        """Attach a child node (sub-goal or attack step)."""
        self.children.append(child)
    def calculate_path_difficulty(self) -> float:
        """Calculate aggregate difficulty for this path.

        OR nodes take the easiest child (min); AND nodes take the hardest
        required step (max). Childless non-leaf nodes score 0.
        """
        if self.node_type == NodeType.LEAF:
            return self.attributes.difficulty.value
        if not self.children:
            return 0
        child_difficulties = [c.calculate_path_difficulty() for c in self.children]
        if self.node_type == NodeType.OR:
            return min(child_difficulties)
        else:  # AND
            return max(child_difficulties)
    def calculate_path_cost(self) -> float:
        """Calculate aggregate cost for this path.

        OR nodes take the cheapest child (min); AND nodes must pay for every
        child (sum). Childless non-leaf nodes cost 0.
        """
        if self.node_type == NodeType.LEAF:
            return self.attributes.cost.value
        if not self.children:
            return 0
        child_costs = [c.calculate_path_cost() for c in self.children]
        if self.node_type == NodeType.OR:
            return min(child_costs)
        else:  # AND
            return sum(child_costs)
    def to_dict(self) -> Dict:
        """Convert to dictionary for serialization (recurses into children)."""
        return {
            "id": self.id,
            "name": self.name,
            "description": self.description,
            "type": self.node_type.value,
            "attributes": {
                "difficulty": self.attributes.difficulty.name,
                "cost": self.attributes.cost.name,
                "detection_risk": self.attributes.detection_risk.name,
                "time_hours": self.attributes.time_hours,
            },
            "mitigations": self.mitigations,
            "children": [c.to_dict() for c in self.children]
        }
@dataclass
class AttackTree:
    """A named attack tree with helpers for path analysis and export."""
    name: str
    description: str
    root: AttackNode
    version: str = "1.0"
    def find_easiest_path(self) -> List[AttackNode]:
        """Find the path with lowest difficulty (greedy choice at OR nodes)."""
        return self._find_path(self.root, minimize="difficulty")
    def find_cheapest_path(self) -> List[AttackNode]:
        """Find the path with lowest cost."""
        return self._find_path(self.root, minimize="cost")
    def find_stealthiest_path(self) -> List[AttackNode]:
        """Find the path with lowest detection risk."""
        return self._find_path(self.root, minimize="detection")
    def _find_path(
        self,
        node: AttackNode,
        minimize: str
    ) -> List[AttackNode]:
        """Recursive path finding.

        At OR nodes, picks the child subtree whose leaf score (per
        `minimize`) is smallest; at AND nodes, includes every child's path.
        """
        if node.node_type == NodeType.LEAF:
            return [node]
        if not node.children:
            return [node]
        if node.node_type == NodeType.OR:
            # Pick the best child path
            best_path = None
            best_score = float('inf')
            for child in node.children:
                child_path = self._find_path(child, minimize)
                score = self._path_score(child_path, minimize)
                if score < best_score:
                    best_score = score
                    best_path = child_path
            return [node] + (best_path or [])
        else:  # AND
            # Must traverse all children
            path = [node]
            for child in node.children:
                path.extend(self._find_path(child, minimize))
            return path
    def _path_score(self, path: List[AttackNode], metric: str) -> float:
        """Calculate score for a path (sums leaf values; 0 for unknown metric)."""
        if metric == "difficulty":
            return sum(n.attributes.difficulty.value for n in path if n.node_type == NodeType.LEAF)
        elif metric == "cost":
            return sum(n.attributes.cost.value for n in path if n.node_type == NodeType.LEAF)
        elif metric == "detection":
            return sum(n.attributes.detection_risk.value for n in path if n.node_type == NodeType.LEAF)
        return 0
    def get_all_leaf_attacks(self) -> List[AttackNode]:
        """Get all leaf attack nodes (depth-first order)."""
        leaves = []
        self._collect_leaves(self.root, leaves)
        return leaves
    def _collect_leaves(self, node: AttackNode, leaves: List[AttackNode]) -> None:
        # Depth-first accumulation of LEAF nodes into `leaves`.
        if node.node_type == NodeType.LEAF:
            leaves.append(node)
        for child in node.children:
            self._collect_leaves(child, leaves)
    def get_unmitigated_attacks(self) -> List[AttackNode]:
        """Find attacks without mitigations."""
        return [n for n in self.get_all_leaf_attacks() if not n.mitigations]
    def export_json(self) -> str:
        """Export tree to JSON (pretty-printed, 2-space indent)."""
        return json.dumps({
            "name": self.name,
            "description": self.description,
            "version": self.version,
            "root": self.root.to_dict()
        }, indent=2)
```
### Template 2: Attack Tree Builder
```python
class AttackTreeBuilder:
    """Fluent builder for attack trees.

    Maintains a stack of open nodes: or_node()/and_node() push a new
    sub-goal, end() pops back to the parent, attack() adds a leaf to the
    currently open node. All methods return self for chaining.
    """
    def __init__(self, name: str, description: str):
        self.name = name
        self.description = description
        self._node_stack: List[AttackNode] = []  # Currently open (unclosed) nodes
        self._root: Optional[AttackNode] = None
    def goal(self, id: str, name: str, description: str = "") -> 'AttackTreeBuilder':
        """Set the root goal (OR node by default)."""
        self._root = AttackNode(
            id=id,
            name=name,
            description=description,
            node_type=NodeType.OR
        )
        self._node_stack = [self._root]
        return self
    def or_node(self, id: str, name: str, description: str = "") -> 'AttackTreeBuilder':
        """Add an OR sub-goal (any child achieves it); becomes the open node."""
        node = AttackNode(
            id=id,
            name=name,
            description=description,
            node_type=NodeType.OR
        )
        self._current().add_child(node)
        self._node_stack.append(node)
        return self
    def and_node(self, id: str, name: str, description: str = "") -> 'AttackTreeBuilder':
        """Add an AND sub-goal (all children required); becomes the open node."""
        node = AttackNode(
            id=id,
            name=name,
            description=description,
            node_type=NodeType.AND
        )
        self._current().add_child(node)
        self._node_stack.append(node)
        return self
    def attack(
        self,
        id: str,
        name: str,
        description: str = "",
        difficulty: Difficulty = Difficulty.MEDIUM,
        cost: Cost = Cost.MEDIUM,
        detection: DetectionRisk = DetectionRisk.MEDIUM,
        time_hours: float = 8.0,
        mitigations: List[str] = None
    ) -> 'AttackTreeBuilder':
        """Add a leaf attack node to the currently open sub-goal."""
        node = AttackNode(
            id=id,
            name=name,
            description=description,
            node_type=NodeType.LEAF,
            attributes=AttackAttributes(
                difficulty=difficulty,
                cost=cost,
                detection_risk=detection,
                time_hours=time_hours
            ),
            mitigations=mitigations or []
        )
        self._current().add_child(node)
        return self
    def end(self) -> 'AttackTreeBuilder':
        """Close current node, return to parent (the root is never popped)."""
        if len(self._node_stack) > 1:
            self._node_stack.pop()
        return self
    def build(self) -> AttackTree:
        """Build the attack tree.

        Raises:
            ValueError: If goal() was never called.
        """
        if not self._root:
            raise ValueError("No root goal defined")
        return AttackTree(
            name=self.name,
            description=self.description,
            root=self._root
        )
    def _current(self) -> AttackNode:
        # Innermost open node; raises if goal() has not been called yet.
        if not self._node_stack:
            raise ValueError("No current node")
        return self._node_stack[-1]
# Example usage
def build_account_takeover_tree() -> AttackTree:
    """Build attack tree for account takeover scenario.

    Demonstrates the builder API: three OR branches (credential theft,
    authentication bypass, social engineering), where the social-engineering
    branch contains an AND chain requiring both reconnaissance and a
    support-desk call.
    """
    return (
        AttackTreeBuilder("Account Takeover", "Gain unauthorized access to user account")
        .goal("G1", "Take Over User Account")
        # Branch 1: obtain the victim's credentials directly.
        .or_node("S1", "Steal Credentials")
        .attack(
            "A1", "Phishing Attack",
            difficulty=Difficulty.LOW,
            cost=Cost.LOW,
            detection=DetectionRisk.MEDIUM,
            mitigations=["Security awareness training", "Email filtering"]
        )
        .attack(
            "A2", "Credential Stuffing",
            difficulty=Difficulty.TRIVIAL,
            cost=Cost.LOW,
            detection=DetectionRisk.HIGH,
            mitigations=["Rate limiting", "MFA", "Password breach monitoring"]
        )
        .attack(
            "A3", "Keylogger Malware",
            difficulty=Difficulty.MEDIUM,
            cost=Cost.MEDIUM,
            detection=DetectionRisk.MEDIUM,
            mitigations=["Endpoint protection", "MFA"]
        )
        .end()
        # Branch 2: skip credentials entirely.
        .or_node("S2", "Bypass Authentication")
        .attack(
            "A4", "Session Hijacking",
            difficulty=Difficulty.MEDIUM,
            cost=Cost.LOW,
            detection=DetectionRisk.LOW,
            mitigations=["Secure session management", "HTTPS only"]
        )
        .attack(
            "A5", "Authentication Bypass Vulnerability",
            difficulty=Difficulty.HIGH,
            cost=Cost.LOW,
            detection=DetectionRisk.LOW,
            mitigations=["Security testing", "Code review", "WAF"]
        )
        .end()
        # Branch 3: human-focused; the AND node needs both steps to succeed.
        .or_node("S3", "Social Engineering")
        .and_node("S3.1", "Account Recovery Attack")
        .attack(
            "A6", "Gather Personal Information",
            difficulty=Difficulty.LOW,
            cost=Cost.FREE,
            detection=DetectionRisk.NONE
        )
        .attack(
            "A7", "Call Support Desk",
            difficulty=Difficulty.MEDIUM,
            cost=Cost.FREE,
            detection=DetectionRisk.MEDIUM,
            mitigations=["Support verification procedures", "Security questions"]
        )
        .end()
        .end()
        .build()
    )
```
### Template 3: Mermaid Diagram Generator
```python
class MermaidExporter:
    """Export attack trees to Mermaid flowchart format."""

    def __init__(self, tree: AttackTree):
        self.tree = tree
        self._lines: List[str] = []
        self._node_count = 0  # Monotonic counter used to mint unique node ids

    def export(self) -> str:
        """Export tree to a Mermaid `flowchart TD` diagram string."""
        self._lines = ["flowchart TD"]
        # Reset the counter so repeated export() calls produce identical output
        # (previously ids kept growing across calls).
        self._node_count = 0
        self._export_node(self.tree.root, None)
        return "\n".join(self._lines)

    def _export_node(self, node: AttackNode, parent_id: Optional[str]) -> str:
        """Recursively emit one node, its style, its parent edge, then children."""
        node_id = f"N{self._node_count}"
        self._node_count += 1
        # Node shape encodes type: OR = circle, AND = rectangle, LEAF = parallelogram.
        if node.node_type == NodeType.OR:
            shape = f"{node_id}(({node.name}))"
        elif node.node_type == NodeType.AND:
            shape = f"{node_id}[{node.name}]"
        else:  # LEAF
            shape = f"{node_id}[/{node.name}/]"
        self._lines.append(f"    {shape}")
        if node.node_type == NodeType.LEAF:
            # Emit the style directive after the node it refers to is declared
            # (the original emitted it before the node line).
            style = self._get_leaf_style(node)
            self._lines.append(f"    style {node_id} {style}")
        if parent_id:
            # Thick arrow into AND nodes to visually mark "all required".
            connector = "-->" if node.node_type != NodeType.AND else "==>"
            self._lines.append(f"    {parent_id} {connector} {node_id}")
        for child in node.children:
            self._export_node(child, node_id)
        return node_id

    def _get_leaf_style(self, node: AttackNode) -> str:
        """Map attack difficulty to a fill color (red = easy, blue = hard)."""
        colors = {
            Difficulty.TRIVIAL: "fill:#ff6b6b",  # Red - easy attack
            Difficulty.LOW: "fill:#ffa06b",
            Difficulty.MEDIUM: "fill:#ffd93d",
            Difficulty.HIGH: "fill:#6bcb77",
            Difficulty.EXPERT: "fill:#4d96ff",  # Blue - hard attack
        }
        # Fix: "#gray" is not a valid color value; use a neutral hex fallback.
        return colors.get(node.attributes.difficulty, "fill:#cccccc")
class PlantUMLExporter:
    """Render an attack tree as a PlantUML mind map document."""

    def __init__(self, tree: AttackTree):
        self.tree = tree

    def export(self) -> str:
        """Produce the full @startmindmap ... @endmindmap text."""
        lines = [
            "@startmindmap",
            f"* {self.tree.name}",
        ]
        self._export_node(self.tree.root, lines, 1)
        lines.append("@endmindmap")
        return "\n".join(lines)

    def _export_node(self, node: AttackNode, lines: List[str], depth: int) -> None:
        """Append one mind-map line per node; depth is encoded by leading stars."""
        stars = "*" * (depth + 1)
        if node.node_type == NodeType.LEAF:
            # Leaves carry a stereotype naming their difficulty rating.
            tag = f"<<{node.attributes.difficulty.name}>>"
        else:
            tag = "[OR]" if node.node_type == NodeType.OR else "[AND]"
        lines.append(f"{stars} {tag} {node.name}")
        for sub in node.children:
            self._export_node(sub, lines, depth + 1)
```
### Template 4: Attack Path Analysis
```python
from typing import Set, Tuple
class AttackPathAnalyzer:
    """Analyze attack paths and coverage.

    Enumerates every root-to-leaf attack path (expanding AND nodes into
    the cartesian product of their children's paths) and derives metrics,
    critical nodes, and mitigation-coverage statistics from them.
    """
    def __init__(self, tree: AttackTree):
        self.tree = tree
    def get_all_paths(self) -> List[List[AttackNode]]:
        """Get all possible attack paths."""
        paths = []
        self._collect_paths(self.tree.root, [], paths)
        return paths
    def _collect_paths(
        self,
        node: AttackNode,
        current_path: List[AttackNode],
        all_paths: List[List[AttackNode]]
    ) -> None:
        """Recursively collect all paths.

        OR children each spawn an independent path; AND children are
        enumerated separately and then combined via cartesian product so
        that every path contains one alternative from each required child.
        """
        current_path = current_path + [node]
        if node.node_type == NodeType.LEAF:
            all_paths.append(current_path)
            return
        if not node.children:
            all_paths.append(current_path)
            return
        if node.node_type == NodeType.OR:
            # Each child is a separate path
            for child in node.children:
                self._collect_paths(child, current_path, all_paths)
        else:  # AND
            # Must combine all children
            child_paths = []
            for child in node.children:
                child_sub_paths = []
                self._collect_paths(child, [], child_sub_paths)
                child_paths.append(child_sub_paths)
            # Combine paths from all AND children
            combined = self._combine_and_paths(child_paths)
            for combo in combined:
                all_paths.append(current_path + combo)
    def _combine_and_paths(
        self,
        child_paths: List[List[List[AttackNode]]]
    ) -> List[List[AttackNode]]:
        """Combine paths from AND node children (cartesian product)."""
        if not child_paths:
            return [[]]
        if len(child_paths) == 1:
            return [path for paths in child_paths for path in paths]
        # Cartesian product of all child path combinations
        result = [[]]
        for paths in child_paths:
            new_result = []
            for existing in result:
                for path in paths:
                    new_result.append(existing + path)
            result = new_result
        return result
    def calculate_path_metrics(self, path: List[AttackNode]) -> Dict:
        """Calculate metrics for a specific path.

        Sums difficulty/cost/time across the path's leaf steps and reports
        the worst (max) detection risk among them.
        """
        leaves = [n for n in path if n.node_type == NodeType.LEAF]
        total_difficulty = sum(n.attributes.difficulty.value for n in leaves)
        total_cost = sum(n.attributes.cost.value for n in leaves)
        total_time = sum(n.attributes.time_hours for n in leaves)
        max_detection = max((n.attributes.detection_risk.value for n in leaves), default=0)
        return {
            "steps": len(leaves),
            "total_difficulty": total_difficulty,
            "avg_difficulty": total_difficulty / len(leaves) if leaves else 0,
            "total_cost": total_cost,
            "total_time_hours": total_time,
            "max_detection_risk": max_detection,
            "requires_insider": any(n.attributes.requires_insider for n in leaves),
            "requires_physical": any(n.attributes.requires_physical for n in leaves),
        }
    def identify_critical_nodes(self) -> List[Tuple[AttackNode, int]]:
        """Find nodes that appear in the most paths.

        Returns (node, occurrence count) pairs sorted descending by count.
        Nodes duplicated within one path (possible for shared AND subtrees)
        are counted once per occurrence.
        """
        paths = self.get_all_paths()
        node_counts: Dict[str, Tuple[AttackNode, int]] = {}
        for path in paths:
            for node in path:
                if node.id not in node_counts:
                    node_counts[node.id] = (node, 0)
                node_counts[node.id] = (node, node_counts[node.id][1] + 1)
        return sorted(
            node_counts.values(),
            key=lambda x: x[1],
            reverse=True
        )
    def coverage_analysis(self, mitigated_attacks: Set[str]) -> Dict:
        """Analyze how mitigations affect attack coverage.

        A path is considered blocked if ANY of its leaf attack ids is in
        `mitigated_attacks`. Returns totals plus details on up to five
        still-open paths.
        """
        all_paths = self.get_all_paths()
        blocked_paths = []
        open_paths = []
        for path in all_paths:
            path_attacks = {n.id for n in path if n.node_type == NodeType.LEAF}
            if path_attacks & mitigated_attacks:
                blocked_paths.append(path)
            else:
                open_paths.append(path)
        return {
            "total_paths": len(all_paths),
            "blocked_paths": len(blocked_paths),
            "open_paths": len(open_paths),
            "coverage_percentage": len(blocked_paths) / len(all_paths) * 100 if all_paths else 0,
            "open_path_details": [
                {"path": [n.name for n in p], "metrics": self.calculate_path_metrics(p)}
                for p in open_paths[:5]  # Top 5 open paths
            ]
        }
    def prioritize_mitigations(self) -> List[Dict]:
        """Prioritize mitigations by impact.

        Ranks leaf attacks that already have candidate mitigations by the
        share of all paths they appear in (i.e. how many paths mitigating
        them would touch).
        """
        critical_nodes = self.identify_critical_nodes()
        paths = self.get_all_paths()
        total_paths = len(paths)
        recommendations = []
        for node, count in critical_nodes:
            if node.node_type == NodeType.LEAF and node.mitigations:
                recommendations.append({
                    "attack": node.name,
                    "attack_id": node.id,
                    "paths_blocked": count,
                    "coverage_impact": count / total_paths * 100,
                    "difficulty": node.attributes.difficulty.name,
                    "mitigations": node.mitigations,
                })
        return sorted(recommendations, key=lambda x: x["coverage_impact"], reverse=True)
```
## Best Practices
### Do's
- **Start with clear goals** - Define what attacker wants
- **Be exhaustive** - Consider all attack vectors
- **Attribute attacks** - Cost, skill, and detection
- **Update regularly** - New threats emerge
- **Validate with experts** - Red team review
### Don'ts
- **Don't oversimplify** - Real attacks are complex
- **Don't ignore dependencies** - AND nodes matter
- **Don't forget insider threats** - Not all attackers are external
- **Don't skip mitigations** - Trees are for defense planning
- **Don't make it static** - Threat landscape evolves
## Resources
- [Attack Trees by Bruce Schneier](https://www.schneier.com/academic/archives/1999/12/attack_trees.html)
- [MITRE ATT&CK Framework](https://attack.mitre.org/)
- [OWASP Attack Surface Analysis](https://owasp.org/www-community/controls/Attack_Surface_Analysis_Cheat_Sheet)

View File

@@ -0,0 +1,677 @@
---
name: security-requirement-extraction
description: Derive security requirements from threat models and business context. Use when translating threats into actionable requirements, creating security user stories, or building security test cases.
---
# Security Requirement Extraction
Transform threat analysis into actionable security requirements.
## When to Use This Skill
- Converting threat models to requirements
- Writing security user stories
- Creating security test cases
- Building security acceptance criteria
- Compliance requirement mapping
- Security architecture documentation
## Core Concepts
### 1. Requirement Categories
```
Business Requirements → Security Requirements → Technical Controls
↓ ↓ ↓
"Protect customer "Encrypt PII at rest" "AES-256 encryption
data" with KMS key rotation"
```
### 2. Security Requirement Types
| Type | Focus | Example |
|------|-------|---------|
| **Functional** | What system must do | "System must authenticate users" |
| **Non-functional** | How system must perform | "Authentication must complete in <2s" |
| **Constraint** | Limitations imposed | "Must use approved crypto libraries" |
### 3. Requirement Attributes
| Attribute | Description |
|-----------|-------------|
| **Traceability** | Links to threats/compliance |
| **Testability** | Can be verified |
| **Priority** | Business importance |
| **Risk Level** | Impact if not met |
## Templates
### Template 1: Security Requirement Model
```python
from dataclasses import dataclass, field
from enum import Enum
from typing import List, Dict, Optional, Set
from datetime import datetime
class RequirementType(Enum):
    # Functional = what the system must do; non-functional = how well it
    # must do it; constraint = externally imposed limitation.
    FUNCTIONAL = "functional"
    NON_FUNCTIONAL = "non_functional"
    CONSTRAINT = "constraint"

class Priority(Enum):
    # Lower numeric value = higher priority.
    CRITICAL = 1
    HIGH = 2
    MEDIUM = 3
    LOW = 4

class SecurityDomain(Enum):
    # Security area a requirement belongs to.
    AUTHENTICATION = "authentication"
    AUTHORIZATION = "authorization"
    DATA_PROTECTION = "data_protection"
    AUDIT_LOGGING = "audit_logging"
    INPUT_VALIDATION = "input_validation"
    ERROR_HANDLING = "error_handling"
    SESSION_MANAGEMENT = "session_management"
    CRYPTOGRAPHY = "cryptography"
    NETWORK_SECURITY = "network_security"
    AVAILABILITY = "availability"

class ComplianceFramework(Enum):
    # Compliance/regulatory frameworks a requirement can be traced to.
    PCI_DSS = "pci_dss"
    HIPAA = "hipaa"
    GDPR = "gdpr"
    SOC2 = "soc2"
    NIST_CSF = "nist_csf"
    ISO_27001 = "iso_27001"
    OWASP = "owasp"
@dataclass
class SecurityRequirement:
    """A single, testable security requirement traceable to threats and compliance controls."""

    id: str                    # e.g. "SR-001"
    title: str
    description: str
    req_type: RequirementType
    domain: SecurityDomain
    priority: Priority
    rationale: str = ""        # why this requirement matters (used in the user-story rendering)
    acceptance_criteria: List[str] = field(default_factory=list)
    test_cases: List[str] = field(default_factory=list)
    threat_refs: List[str] = field(default_factory=list)      # IDs of threats this mitigates
    compliance_refs: List[str] = field(default_factory=list)  # framework control IDs
    dependencies: List[str] = field(default_factory=list)
    status: str = "draft"
    owner: str = ""
    created_date: datetime = field(default_factory=datetime.now)

    def to_user_story(self) -> str:
        """Render the requirement as a markdown user story with checkbox acceptance criteria."""
        # chr(10) is '\n'; joining inside the f-string produces one checkbox line per criterion.
        return f"""
**{self.id}: {self.title}**
As a security-conscious system,
I need to {self.description.lower()},
So that {self.rationale.lower()}.
**Acceptance Criteria:**
{chr(10).join(f'- [ ] {ac}' for ac in self.acceptance_criteria)}
**Priority:** {self.priority.name}
**Domain:** {self.domain.value}
**Threat References:** {', '.join(self.threat_refs)}
"""

    def to_test_spec(self) -> str:
        """Render the requirement as a markdown test specification with numbered test cases."""
        return f"""
## Test Specification: {self.id}
### Requirement
{self.description}
### Test Cases
{chr(10).join(f'{i+1}. {tc}' for i, tc in enumerate(self.test_cases))}
### Acceptance Criteria Verification
{chr(10).join(f'- {ac}' for ac in self.acceptance_criteria)}
"""
@dataclass
class RequirementSet:
    """A named, versioned collection of SecurityRequirement objects with query helpers."""

    name: str
    version: str
    requirements: List[SecurityRequirement] = field(default_factory=list)

    def add(self, req: SecurityRequirement) -> None:
        """Register one requirement in the set."""
        self.requirements.append(req)

    def get_by_domain(self, domain: SecurityDomain) -> List[SecurityRequirement]:
        """All requirements belonging to the given security domain."""
        return [item for item in self.requirements if item.domain == domain]

    def get_by_priority(self, priority: Priority) -> List[SecurityRequirement]:
        """All requirements at exactly the given priority."""
        return [item for item in self.requirements if item.priority == priority]

    def get_by_threat(self, threat_id: str) -> List[SecurityRequirement]:
        """All requirements that reference the given threat ID."""
        return [item for item in self.requirements if threat_id in item.threat_refs]

    def get_critical_requirements(self) -> List[SecurityRequirement]:
        """Shortcut for the CRITICAL-priority subset."""
        return self.get_by_priority(Priority.CRITICAL)

    def export_markdown(self) -> str:
        """Export all requirements as a markdown document, grouped by security domain."""
        lines = [f"# Security Requirements: {self.name}\n", f"Version: {self.version}\n"]
        for domain in SecurityDomain:
            grouped = self.get_by_domain(domain)
            if not grouped:
                continue
            lines.append(f"\n## {domain.value.replace('_', ' ').title()}\n")
            lines.extend(item.to_user_story() for item in grouped)
        return "\n".join(lines)

    def traceability_matrix(self) -> Dict[str, List[str]]:
        """Map each referenced threat ID to the requirement IDs that mitigate it."""
        matrix: Dict[str, List[str]] = {}
        for item in self.requirements:
            for threat_id in item.threat_refs:
                matrix.setdefault(threat_id, []).append(item.id)
        return matrix
```
### Template 2: Threat-to-Requirement Extractor
```python
from dataclasses import dataclass
from typing import List, Dict, Tuple
@dataclass
class ThreatInput:
    """Minimal threat description consumed by RequirementExtractor."""

    id: str
    category: str  # STRIDE category name, e.g. "SPOOFING" (keys of STRIDE_MAPPINGS)
    title: str
    description: str
    target: str      # component or asset the threat applies to
    impact: str      # LOW / MEDIUM / HIGH / CRITICAL
    likelihood: str  # LOW / MEDIUM / HIGH / CRITICAL
class RequirementExtractor:
    """Extract security requirements from threats.

    Each STRIDE category maps to one or more security domains plus a fixed set
    of (title, description) requirement patterns; `{target}` placeholders are
    filled from the threat's target component.
    """

    # Mapping of STRIDE categories to security domains and requirement patterns
    STRIDE_MAPPINGS = {
        "SPOOFING": {
            "domains": [SecurityDomain.AUTHENTICATION, SecurityDomain.SESSION_MANAGEMENT],
            "patterns": [
                ("Implement strong authentication for {target}",
                 "Ensure {target} authenticates all users before granting access"),
                ("Validate identity tokens for {target}",
                 "All authentication tokens must be cryptographically verified"),
                ("Implement session management for {target}",
                 "Sessions must be securely managed with proper expiration"),
            ]
        },
        "TAMPERING": {
            "domains": [SecurityDomain.INPUT_VALIDATION, SecurityDomain.DATA_PROTECTION],
            "patterns": [
                ("Validate all input to {target}",
                 "All input must be validated against expected formats"),
                ("Implement integrity checks for {target}",
                 "Data integrity must be verified using cryptographic signatures"),
                ("Protect {target} from modification",
                 "Implement controls to prevent unauthorized data modification"),
            ]
        },
        "REPUDIATION": {
            "domains": [SecurityDomain.AUDIT_LOGGING],
            "patterns": [
                ("Log all security events for {target}",
                 "Security-relevant events must be logged for audit purposes"),
                ("Implement non-repudiation for {target}",
                 "Critical actions must have cryptographic proof of origin"),
                ("Protect audit logs for {target}",
                 "Audit logs must be tamper-evident and protected"),
            ]
        },
        "INFORMATION_DISCLOSURE": {
            "domains": [SecurityDomain.DATA_PROTECTION, SecurityDomain.CRYPTOGRAPHY],
            "patterns": [
                ("Encrypt sensitive data in {target}",
                 "Sensitive data must be encrypted at rest and in transit"),
                ("Implement access controls for {target}",
                 "Data access must be restricted based on need-to-know"),
                ("Prevent information leakage from {target}",
                 "Error messages and logs must not expose sensitive information"),
            ]
        },
        "DENIAL_OF_SERVICE": {
            "domains": [SecurityDomain.AVAILABILITY, SecurityDomain.INPUT_VALIDATION],
            "patterns": [
                ("Implement rate limiting for {target}",
                 "Requests must be rate-limited to prevent resource exhaustion"),
                ("Ensure availability of {target}",
                 "System must remain available under high load conditions"),
                ("Implement resource quotas for {target}",
                 "Resource consumption must be bounded and monitored"),
            ]
        },
        "ELEVATION_OF_PRIVILEGE": {
            "domains": [SecurityDomain.AUTHORIZATION],
            "patterns": [
                ("Enforce authorization for {target}",
                 "All actions must be authorized based on user permissions"),
                ("Implement least privilege for {target}",
                 "Users must only have minimum necessary permissions"),
                ("Validate permissions for {target}",
                 "Permission checks must be performed server-side"),
            ]
        },
    }

    def extract_requirements(
        self,
        threats: List[ThreatInput],
        project_name: str
    ) -> RequirementSet:
        """Extract security requirements from threats.

        Requirement IDs are issued sequentially (SR-001, SR-002, ...) across
        all threats, in input order.
        """
        req_set = RequirementSet(
            name=f"{project_name} Security Requirements",
            version="1.0"
        )
        req_counter = 1
        for threat in threats:
            reqs = self._threat_to_requirements(threat, req_counter)
            for req in reqs:
                req_set.add(req)
            # Advance past the block of IDs just issued for this threat.
            req_counter += len(reqs)
        return req_set

    def _threat_to_requirements(
        self,
        threat: ThreatInput,
        start_id: int
    ) -> List[SecurityRequirement]:
        """Convert a single threat to requirements.

        Unknown STRIDE categories produce no requirements (empty mapping).
        """
        requirements = []
        mapping = self.STRIDE_MAPPINGS.get(threat.category, {})
        domains = mapping.get("domains", [])
        patterns = mapping.get("patterns", [])
        priority = self._calculate_priority(threat.impact, threat.likelihood)
        for i, (title_pattern, desc_pattern) in enumerate(patterns):
            req = SecurityRequirement(
                id=f"SR-{start_id + i:03d}",
                title=title_pattern.format(target=threat.target),
                description=desc_pattern.format(target=threat.target),
                req_type=RequirementType.FUNCTIONAL,
                # Domains are assigned round-robin across the category's domain list;
                # DATA_PROTECTION is the fallback when the category has no domains.
                domain=domains[i % len(domains)] if domains else SecurityDomain.DATA_PROTECTION,
                priority=priority,
                rationale=f"Mitigates threat: {threat.title}",
                threat_refs=[threat.id],
                acceptance_criteria=self._generate_acceptance_criteria(
                    threat.category, threat.target
                ),
                test_cases=self._generate_test_cases(
                    threat.category, threat.target
                )
            )
            requirements.append(req)
        return requirements

    def _calculate_priority(self, impact: str, likelihood: str) -> Priority:
        """Calculate requirement priority from threat attributes.

        Unrecognized impact/likelihood labels default to a score of 2 (MEDIUM).
        The combined score (impact * likelihood, range 1-16) is bucketed the
        same way as the threat risk matrix: >=12 Critical, >=6 High, >=3 Medium.
        """
        score_map = {"LOW": 1, "MEDIUM": 2, "HIGH": 3, "CRITICAL": 4}
        impact_score = score_map.get(impact.upper(), 2)
        likelihood_score = score_map.get(likelihood.upper(), 2)
        combined = impact_score * likelihood_score
        if combined >= 12:
            return Priority.CRITICAL
        elif combined >= 6:
            return Priority.HIGH
        elif combined >= 3:
            return Priority.MEDIUM
        return Priority.LOW

    def _generate_acceptance_criteria(
        self,
        category: str,
        target: str
    ) -> List[str]:
        """Generate acceptance criteria for a requirement; [] for unknown categories."""
        criteria_templates = {
            "SPOOFING": [
                f"Users must authenticate before accessing {target}",
                "Authentication failures are logged and monitored",
                "Multi-factor authentication is available for sensitive operations",
            ],
            "TAMPERING": [
                f"All input to {target} is validated",
                "Data integrity is verified before processing",
                "Modification attempts trigger alerts",
            ],
            "REPUDIATION": [
                f"All actions on {target} are logged with user identity",
                "Logs cannot be modified by regular users",
                "Log retention meets compliance requirements",
            ],
            "INFORMATION_DISCLOSURE": [
                f"Sensitive data in {target} is encrypted",
                "Access to sensitive data is logged",
                "Error messages do not reveal sensitive information",
            ],
            "DENIAL_OF_SERVICE": [
                f"Rate limiting is enforced on {target}",
                "System degrades gracefully under load",
                "Resource exhaustion triggers alerts",
            ],
            "ELEVATION_OF_PRIVILEGE": [
                f"Authorization is checked for all {target} operations",
                "Users cannot access resources beyond their permissions",
                "Privilege changes are logged and monitored",
            ],
        }
        return criteria_templates.get(category, [])

    def _generate_test_cases(
        self,
        category: str,
        target: str
    ) -> List[str]:
        """Generate test cases for a requirement; [] for unknown categories."""
        test_templates = {
            "SPOOFING": [
                f"Test: Unauthenticated access to {target} is denied",
                "Test: Invalid credentials are rejected",
                "Test: Session tokens cannot be forged",
            ],
            "TAMPERING": [
                f"Test: Invalid input to {target} is rejected",
                "Test: Tampered data is detected and rejected",
                "Test: SQL injection attempts are blocked",
            ],
            "REPUDIATION": [
                "Test: Security events are logged",
                "Test: Logs include sufficient detail for forensics",
                "Test: Log integrity is protected",
            ],
            "INFORMATION_DISCLOSURE": [
                f"Test: {target} data is encrypted in transit",
                f"Test: {target} data is encrypted at rest",
                "Test: Error messages are sanitized",
            ],
            "DENIAL_OF_SERVICE": [
                f"Test: Rate limiting on {target} works correctly",
                "Test: System handles burst traffic gracefully",
                "Test: Resource limits are enforced",
            ],
            "ELEVATION_OF_PRIVILEGE": [
                f"Test: Unauthorized access to {target} is denied",
                "Test: Privilege escalation attempts are blocked",
                "Test: IDOR vulnerabilities are not present",
            ],
        }
        return test_templates.get(category, [])
```
### Template 3: Compliance Mapping
```python
from typing import Dict, List, Set
class ComplianceMapper:
    """Map security requirements to compliance frameworks.

    Mapping is done at the SecurityDomain level: a requirement is considered to
    satisfy a control when its domain is associated with that control below.
    """

    # framework -> security domain -> control IDs in that framework's own notation
    FRAMEWORK_CONTROLS = {
        ComplianceFramework.PCI_DSS: {
            SecurityDomain.AUTHENTICATION: ["8.1", "8.2", "8.3"],
            SecurityDomain.AUTHORIZATION: ["7.1", "7.2"],
            SecurityDomain.DATA_PROTECTION: ["3.4", "3.5", "4.1"],
            SecurityDomain.AUDIT_LOGGING: ["10.1", "10.2", "10.3"],
            SecurityDomain.NETWORK_SECURITY: ["1.1", "1.2", "1.3"],
            SecurityDomain.CRYPTOGRAPHY: ["3.5", "3.6", "4.1"],
        },
        ComplianceFramework.HIPAA: {
            SecurityDomain.AUTHENTICATION: ["164.312(d)"],
            SecurityDomain.AUTHORIZATION: ["164.312(a)(1)"],
            SecurityDomain.DATA_PROTECTION: ["164.312(a)(2)(iv)", "164.312(e)(2)(ii)"],
            SecurityDomain.AUDIT_LOGGING: ["164.312(b)"],
        },
        ComplianceFramework.GDPR: {
            SecurityDomain.DATA_PROTECTION: ["Art. 32", "Art. 25"],
            SecurityDomain.AUDIT_LOGGING: ["Art. 30"],
            SecurityDomain.AUTHORIZATION: ["Art. 25"],
        },
        ComplianceFramework.OWASP: {
            SecurityDomain.AUTHENTICATION: ["V2.1", "V2.2", "V2.3"],
            SecurityDomain.SESSION_MANAGEMENT: ["V3.1", "V3.2", "V3.3"],
            SecurityDomain.INPUT_VALIDATION: ["V5.1", "V5.2", "V5.3"],
            SecurityDomain.CRYPTOGRAPHY: ["V6.1", "V6.2"],
            SecurityDomain.ERROR_HANDLING: ["V7.1", "V7.2"],
            SecurityDomain.DATA_PROTECTION: ["V8.1", "V8.2", "V8.3"],
            SecurityDomain.AUDIT_LOGGING: ["V7.1", "V7.2"],
        },
    }

    def map_requirement_to_compliance(
        self,
        requirement: SecurityRequirement,
        frameworks: List[ComplianceFramework]
    ) -> Dict[str, List[str]]:
        """Map a requirement to compliance controls.

        Returns {framework value: [control IDs]} for every requested framework
        whose control table covers the requirement's domain.
        """
        mapping = {}
        for framework in frameworks:
            controls = self.FRAMEWORK_CONTROLS.get(framework, {})
            domain_controls = controls.get(requirement.domain, [])
            if domain_controls:
                mapping[framework.value] = domain_controls
        return mapping

    def get_requirements_for_control(
        self,
        requirement_set: RequirementSet,
        framework: ComplianceFramework,
        control_id: str
    ) -> List[SecurityRequirement]:
        """Find requirements that satisfy a compliance control.

        A control can appear under several domains (e.g. OWASP V7.x); all
        matching domains contribute requirements.
        """
        matching = []
        framework_controls = self.FRAMEWORK_CONTROLS.get(framework, {})
        for domain, controls in framework_controls.items():
            if control_id in controls:
                matching.extend(requirement_set.get_by_domain(domain))
        return matching

    def generate_compliance_matrix(
        self,
        requirement_set: RequirementSet,
        frameworks: List[ComplianceFramework]
    ) -> Dict[str, Dict[str, List[str]]]:
        """Generate compliance traceability matrix: framework -> control -> requirement IDs."""
        matrix = {}
        for framework in frameworks:
            matrix[framework.value] = {}
            framework_controls = self.FRAMEWORK_CONTROLS.get(framework, {})
            for domain, controls in framework_controls.items():
                for control in controls:
                    reqs = self.get_requirements_for_control(
                        requirement_set, framework, control
                    )
                    if reqs:
                        matrix[framework.value][control] = [r.id for r in reqs]
        return matrix

    def gap_analysis(
        self,
        requirement_set: RequirementSet,
        framework: ComplianceFramework
    ) -> Dict[str, List[str]]:
        """Identify compliance gaps for one framework.

        Returns:
            {"missing_controls": controls with no mapped requirement,
             "weak_coverage":   controls covered by only one requirement}
        """
        gaps = {"missing_controls": [], "weak_coverage": []}
        framework_controls = self.FRAMEWORK_CONTROLS.get(framework, {})
        # FIX: the original fetched requirement_set.get_by_domain(domain) here
        # into an unused local on every iteration; removed as dead work.
        for domain, controls in framework_controls.items():
            for control in controls:
                matching = self.get_requirements_for_control(
                    requirement_set, framework, control
                )
                if not matching:
                    gaps["missing_controls"].append(f"{framework.value}:{control}")
                elif len(matching) < 2:
                    gaps["weak_coverage"].append(f"{framework.value}:{control}")
        return gaps
```
### Template 4: Security User Story Generator
```python
class SecurityUserStoryGenerator:
    """Generate security-focused user stories and epics from SecurityRequirement objects."""

    # Persona ("as_a") and motivation ("so_that") per security domain; domains
    # missing here fall back to a generic persona in generate_story().
    STORY_TEMPLATES = {
        SecurityDomain.AUTHENTICATION: {
            "as_a": "security-conscious user",
            "so_that": "my identity is protected from impersonation",
        },
        SecurityDomain.AUTHORIZATION: {
            "as_a": "system administrator",
            "so_that": "users can only access resources appropriate to their role",
        },
        SecurityDomain.DATA_PROTECTION: {
            "as_a": "data owner",
            "so_that": "my sensitive information remains confidential",
        },
        SecurityDomain.AUDIT_LOGGING: {
            "as_a": "security analyst",
            "so_that": "I can investigate security incidents",
        },
        SecurityDomain.INPUT_VALIDATION: {
            "as_a": "application developer",
            "so_that": "the system is protected from malicious input",
        },
    }

    def generate_story(self, requirement: SecurityRequirement) -> str:
        """Render one requirement as a markdown user story with DoD and traceability."""
        template = self.STORY_TEMPLATES.get(
            requirement.domain,
            {"as_a": "user", "so_that": "the system is secure"}
        )
        story = f"""
## {requirement.id}: {requirement.title}
**User Story:**
As a {template['as_a']},
I want the system to {requirement.description.lower()},
So that {template['so_that']}.
**Priority:** {requirement.priority.name}
**Type:** {requirement.req_type.value}
**Domain:** {requirement.domain.value}
**Acceptance Criteria:**
{self._format_acceptance_criteria(requirement.acceptance_criteria)}
**Definition of Done:**
- [ ] Implementation complete
- [ ] Security tests pass
- [ ] Code review complete
- [ ] Security review approved
- [ ] Documentation updated
**Security Test Cases:**
{self._format_test_cases(requirement.test_cases)}
**Traceability:**
- Threats: {', '.join(requirement.threat_refs) or 'N/A'}
- Compliance: {', '.join(requirement.compliance_refs) or 'N/A'}
"""
        return story

    def _format_acceptance_criteria(self, criteria: List[str]) -> str:
        # One markdown checkbox per criterion; "TBD" placeholder when empty.
        return "\n".join(f"- [ ] {c}" for c in criteria) if criteria else "- [ ] TBD"

    def _format_test_cases(self, tests: List[str]) -> str:
        # Plain bullet list; "TBD" placeholder when empty.
        return "\n".join(f"- {t}" for t in tests) if tests else "- TBD"

    def generate_epic(
        self,
        requirement_set: RequirementSet,
        domain: SecurityDomain
    ) -> str:
        """Render all requirements of one security domain as a markdown epic."""
        reqs = requirement_set.get_by_domain(domain)
        epic = f"""
# Security Epic: {domain.value.replace('_', ' ').title()}
## Overview
This epic covers all security requirements related to {domain.value.replace('_', ' ')}.
## Business Value
- Protect against {domain.value.replace('_', ' ')} related threats
- Meet compliance requirements
- Reduce security risk
## Stories in this Epic
{chr(10).join(f'- [{r.id}] {r.title}' for r in reqs)}
## Acceptance Criteria
- All stories complete
- Security tests passing
- Security review approved
- Compliance requirements met
## Risk if Not Implemented
- Vulnerability to {domain.value.replace('_', ' ')} attacks
- Compliance violations
- Potential data breach
## Dependencies
{chr(10).join(f'- {d}' for r in reqs for d in r.dependencies) or '- None identified'}
"""
        return epic
```
## Best Practices
### Do's
- **Trace to threats** - Every requirement should map to threats
- **Be specific** - Vague requirements can't be tested
- **Include acceptance criteria** - Define "done"
- **Consider compliance** - Map to frameworks early
- **Review regularly** - Requirements evolve with threats
### Don'ts
- **Don't be generic** - "Be secure" is not a requirement
- **Don't skip rationale** - Explain why it matters
- **Don't ignore priorities** - Not all requirements are equal
- **Don't forget testability** - If you can't test it, you can't verify it
- **Don't work in isolation** - Involve stakeholders
## Resources
- [OWASP ASVS](https://owasp.org/www-project-application-security-verification-standard/)
- [NIST SP 800-53](https://csrc.nist.gov/publications/detail/sp/800-53/rev-5/final)
- [Security User Stories](https://www.oreilly.com/library/view/agile-application-security/9781491938836/)

View File

@@ -0,0 +1,656 @@
---
name: stride-analysis-patterns
description: Apply STRIDE methodology to systematically identify threats. Use when analyzing system security, conducting threat modeling sessions, or creating security documentation.
---
# STRIDE Analysis Patterns
Systematic threat identification using the STRIDE methodology.
## When to Use This Skill
- Starting new threat modeling sessions
- Analyzing existing system architecture
- Reviewing security design decisions
- Creating threat documentation
- Training teams on threat identification
- Compliance and audit preparation
## Core Concepts
### 1. STRIDE Categories
```
S - Spoofing → Authentication threats
T - Tampering → Integrity threats
R - Repudiation → Non-repudiation threats
I - Information → Confidentiality threats
Disclosure
D - Denial of → Availability threats
Service
E - Elevation of → Authorization threats
Privilege
```
### 2. Threat Analysis Matrix
| Category | Question | Control Family |
|----------|----------|----------------|
| **Spoofing** | Can attacker pretend to be someone else? | Authentication |
| **Tampering** | Can attacker modify data in transit/rest? | Integrity |
| **Repudiation** | Can attacker deny actions? | Logging/Audit |
| **Info Disclosure** | Can attacker access unauthorized data? | Encryption |
| **DoS** | Can attacker disrupt availability? | Rate limiting |
| **Elevation** | Can attacker gain higher privileges? | Authorization |
## Templates
### Template 1: STRIDE Threat Model Document
```markdown
# Threat Model: [System Name]
## 1. System Overview
### 1.1 Description
[Brief description of the system and its purpose]
### 1.2 Data Flow Diagram
```
[User] --> [Web App] --> [API Gateway] --> [Backend Services]
|
v
[Database]
```
### 1.3 Trust Boundaries
- **External Boundary**: Internet to DMZ
- **Internal Boundary**: DMZ to Internal Network
- **Data Boundary**: Application to Database
## 2. Assets
| Asset | Sensitivity | Description |
|-------|-------------|-------------|
| User Credentials | High | Authentication tokens, passwords |
| Personal Data | High | PII, financial information |
| Session Data | Medium | Active user sessions |
| Application Logs | Medium | System activity records |
| Configuration | High | System settings, secrets |
## 3. STRIDE Analysis
### 3.1 Spoofing Threats
| ID | Threat | Target | Impact | Likelihood |
|----|--------|--------|--------|------------|
| S1 | Session hijacking | User sessions | High | Medium |
| S2 | Token forgery | JWT tokens | High | Low |
| S3 | Credential stuffing | Login endpoint | High | High |
**Mitigations:**
- [ ] Implement MFA
- [ ] Use secure session management
- [ ] Implement account lockout policies
### 3.2 Tampering Threats
| ID | Threat | Target | Impact | Likelihood |
|----|--------|--------|--------|------------|
| T1 | SQL injection | Database queries | Critical | Medium |
| T2 | Parameter manipulation | API requests | High | High |
| T3 | File upload abuse | File storage | High | Medium |
**Mitigations:**
- [ ] Input validation on all endpoints
- [ ] Parameterized queries
- [ ] File type validation
### 3.3 Repudiation Threats
| ID | Threat | Target | Impact | Likelihood |
|----|--------|--------|--------|------------|
| R1 | Transaction denial | Financial ops | High | Medium |
| R2 | Access log tampering | Audit logs | Medium | Low |
| R3 | Action attribution | User actions | Medium | Medium |
**Mitigations:**
- [ ] Comprehensive audit logging
- [ ] Log integrity protection
- [ ] Digital signatures for critical actions
### 3.4 Information Disclosure Threats
| ID | Threat | Target | Impact | Likelihood |
|----|--------|--------|--------|------------|
| I1 | Data breach | User PII | Critical | Medium |
| I2 | Error message leakage | System info | Low | High |
| I3 | Insecure transmission | Network traffic | High | Medium |
**Mitigations:**
- [ ] Encryption at rest and in transit
- [ ] Sanitize error messages
- [ ] Implement TLS 1.3
### 3.5 Denial of Service Threats
| ID | Threat | Target | Impact | Likelihood |
|----|--------|--------|--------|------------|
| D1 | Resource exhaustion | API servers | High | High |
| D2 | Database overload | Database | Critical | Medium |
| D3 | Bandwidth saturation | Network | High | Medium |
**Mitigations:**
- [ ] Rate limiting
- [ ] Auto-scaling
- [ ] DDoS protection
### 3.6 Elevation of Privilege Threats
| ID | Threat | Target | Impact | Likelihood |
|----|--------|--------|--------|------------|
| E1 | IDOR vulnerabilities | User resources | High | High |
| E2 | Role manipulation | Admin access | Critical | Low |
| E3 | JWT claim tampering | Authorization | High | Medium |
**Mitigations:**
- [ ] Proper authorization checks
- [ ] Principle of least privilege
- [ ] Server-side role validation
## 4. Risk Assessment
### 4.1 Risk Matrix
```
IMPACT
Low Med High Crit
Low 1 2 3 4
L Med 2 4 6 8
I High 3 6 9 12
K Crit 4 8 12 16
```
### 4.2 Prioritized Risks
| Rank | Threat | Risk Score | Priority |
|------|--------|------------|----------|
| 1 | SQL Injection (T1) | 12 | Critical |
| 2 | IDOR (E1) | 9 | High |
| 3 | Credential Stuffing (S3) | 9 | High |
| 4 | Data Breach (I1) | 8 | High |
## 5. Recommendations
### Immediate Actions
1. Implement input validation framework
2. Add rate limiting to authentication endpoints
3. Enable comprehensive audit logging
### Short-term (30 days)
1. Deploy WAF with OWASP ruleset
2. Implement MFA for sensitive operations
3. Encrypt all PII at rest
### Long-term (90 days)
1. Security awareness training
2. Penetration testing
3. Bug bounty program
```
### Template 2: STRIDE Analysis Code
```python
from dataclasses import dataclass, field
from enum import Enum
from typing import List, Dict, Optional
import json
class StrideCategory(Enum):
    """The six STRIDE threat categories, keyed by their single-letter code."""

    SPOOFING = "S"
    TAMPERING = "T"
    REPUDIATION = "R"
    INFORMATION_DISCLOSURE = "I"
    DENIAL_OF_SERVICE = "D"
    ELEVATION_OF_PRIVILEGE = "E"
class Impact(Enum):
    """Ordinal impact severity (1-4); multiplied with Likelihood for the risk score."""

    LOW = 1
    MEDIUM = 2
    HIGH = 3
    CRITICAL = 4
class Likelihood(Enum):
    """Ordinal likelihood (1-4); multiplied with Impact for the risk score."""

    LOW = 1
    MEDIUM = 2
    HIGH = 3
    CRITICAL = 4
@dataclass
class Threat:
    """A single identified threat with derived risk scoring."""

    id: str
    category: StrideCategory
    title: str
    description: str
    target: str
    impact: Impact
    likelihood: Likelihood
    mitigations: List[str] = field(default_factory=list)
    status: str = "open"

    @property
    def risk_score(self) -> int:
        """Risk score = impact x likelihood (range 1-16)."""
        return self.impact.value * self.likelihood.value

    @property
    def risk_level(self) -> str:
        """Bucket the numeric score: >=12 Critical, >=6 High, >=3 Medium, else Low."""
        score = self.risk_score
        for threshold, label in ((12, "Critical"), (6, "High"), (3, "Medium")):
            if score >= threshold:
                return label
        return "Low"
@dataclass
class Asset:
    """An asset worth protecting, with its sensitivity classification."""

    name: str
    sensitivity: str          # e.g. "High" / "Medium" / "Low"
    description: str
    data_classification: str  # organizational data-classification label
@dataclass
class TrustBoundary:
    """A boundary between two zones of differing trust (e.g. Internet -> DMZ)."""

    name: str
    description: str
    from_zone: str
    to_zone: str
@dataclass
class ThreatModel:
    """Container tying assets, trust boundaries, and threats into one model."""

    name: str
    version: str
    description: str
    assets: List[Asset] = field(default_factory=list)
    boundaries: List[TrustBoundary] = field(default_factory=list)
    threats: List[Threat] = field(default_factory=list)

    def add_threat(self, threat: Threat) -> None:
        """Record one identified threat."""
        self.threats.append(threat)

    def get_threats_by_category(self, category: StrideCategory) -> List[Threat]:
        """All threats in one STRIDE category."""
        return [item for item in self.threats if item.category == category]

    def get_critical_threats(self) -> List[Threat]:
        """Threats whose derived risk level is High or Critical."""
        return [item for item in self.threats if item.risk_level in ("Critical", "High")]

    def generate_report(self) -> Dict:
        """Build a summary report: totals, per-category counts, and the top-10 risks."""

        def count_at(level: str) -> int:
            # Number of threats whose bucketed risk level equals `level`.
            return sum(1 for item in self.threats if item.risk_level == level)

        ranked = sorted(self.threats, key=lambda item: item.risk_score, reverse=True)
        return {
            "summary": {
                "name": self.name,
                "version": self.version,
                "total_threats": len(self.threats),
                "critical_threats": count_at("Critical"),
                "high_threats": count_at("High"),
            },
            "by_category": {
                cat.name: len(self.get_threats_by_category(cat))
                for cat in StrideCategory
            },
            "top_risks": [
                {
                    "id": item.id,
                    "title": item.title,
                    "risk_score": item.risk_score,
                    "risk_level": item.risk_level
                }
                for item in ranked[:10]
            ]
        }
class StrideAnalyzer:
    """Automated STRIDE analysis helper: per-category questionnaires and mitigation catalogs."""

    # Four review questions per STRIDE category, used to drive threat-modeling sessions.
    STRIDE_QUESTIONS = {
        StrideCategory.SPOOFING: [
            "Can an attacker impersonate a legitimate user?",
            "Are authentication tokens properly validated?",
            "Can session identifiers be predicted or stolen?",
            "Is multi-factor authentication available?",
        ],
        StrideCategory.TAMPERING: [
            "Can data be modified in transit?",
            "Can data be modified at rest?",
            "Are input validation controls sufficient?",
            "Can an attacker manipulate application logic?",
        ],
        StrideCategory.REPUDIATION: [
            "Are all security-relevant actions logged?",
            "Can logs be tampered with?",
            "Is there sufficient attribution for actions?",
            "Are timestamps reliable and synchronized?",
        ],
        StrideCategory.INFORMATION_DISCLOSURE: [
            "Is sensitive data encrypted at rest?",
            "Is sensitive data encrypted in transit?",
            "Can error messages reveal sensitive information?",
            "Are access controls properly enforced?",
        ],
        StrideCategory.DENIAL_OF_SERVICE: [
            "Are rate limits implemented?",
            "Can resources be exhausted by malicious input?",
            "Is there protection against amplification attacks?",
            "Are there single points of failure?",
        ],
        StrideCategory.ELEVATION_OF_PRIVILEGE: [
            "Are authorization checks performed consistently?",
            "Can users access other users' resources?",
            "Can privilege escalation occur through parameter manipulation?",
            "Is the principle of least privilege followed?",
        ],
    }

    def generate_questionnaire(self, component: str) -> List[Dict]:
        """Generate the full STRIDE questionnaire for a component.

        One row per (category, question); "answer" and "notes" start empty for
        the analyst to fill in.
        """
        questionnaire = []
        for category, questions in self.STRIDE_QUESTIONS.items():
            for q in questions:
                questionnaire.append({
                    "component": component,
                    "category": category.name,
                    "question": q,
                    "answer": None,
                    "notes": ""
                })
        return questionnaire

    def suggest_mitigations(self, category: StrideCategory) -> List[str]:
        """Suggest common mitigations for a STRIDE category; [] if the category is unknown."""
        mitigations = {
            StrideCategory.SPOOFING: [
                "Implement multi-factor authentication",
                "Use secure session management",
                "Implement account lockout policies",
                "Use cryptographically secure tokens",
                "Validate authentication at every request",
            ],
            StrideCategory.TAMPERING: [
                "Implement input validation",
                "Use parameterized queries",
                "Apply integrity checks (HMAC, signatures)",
                "Implement Content Security Policy",
                "Use immutable infrastructure",
            ],
            StrideCategory.REPUDIATION: [
                "Enable comprehensive audit logging",
                "Protect log integrity",
                "Implement digital signatures",
                "Use centralized, tamper-evident logging",
                "Maintain accurate timestamps",
            ],
            StrideCategory.INFORMATION_DISCLOSURE: [
                "Encrypt data at rest and in transit",
                "Implement proper access controls",
                "Sanitize error messages",
                "Use secure defaults",
                "Implement data classification",
            ],
            StrideCategory.DENIAL_OF_SERVICE: [
                "Implement rate limiting",
                "Use auto-scaling",
                "Deploy DDoS protection",
                "Implement circuit breakers",
                "Set resource quotas",
            ],
            StrideCategory.ELEVATION_OF_PRIVILEGE: [
                "Implement proper authorization",
                "Follow principle of least privilege",
                "Validate permissions server-side",
                "Use role-based access control",
                "Implement security boundaries",
            ],
        }
        return mitigations.get(category, [])
```
### Template 3: Data Flow Diagram Analysis
```python
from dataclasses import dataclass
from typing import List, Set, Tuple
from enum import Enum
class ElementType(Enum):
    """DFD element kinds used to select which STRIDE categories apply to an element."""

    EXTERNAL_ENTITY = "external"
    PROCESS = "process"
    DATA_STORE = "datastore"
    DATA_FLOW = "dataflow"
@dataclass
class DFDElement:
    """A node in the data flow diagram."""

    id: str
    name: str
    type: ElementType
    trust_level: int  # 0 = untrusted, higher = more trusted
    description: str = ""
@dataclass
class DataFlow:
    """A directed data flow between two DFD elements, referenced by element id."""

    id: str
    name: str
    source: str       # id of the source DFDElement
    destination: str  # id of the destination DFDElement
    data_type: str
    protocol: str
    encrypted: bool = False  # whether the flow is protected in transit
class DFDAnalyzer:
    """Analyze Data Flow Diagrams for STRIDE threats.

    FIX: the original annotated with typing.Dict, but this template's import
    line only imports List/Set/Tuple, so defining the class raised NameError
    (return annotations are evaluated at definition time). Builtin generics
    (PEP 585, Python 3.9+) need no import; types defined elsewhere in this
    skill are referenced as string forward references.
    """

    def __init__(self) -> None:
        # Elements keyed by their id; flows kept in insertion order.
        self.elements: dict[str, "DFDElement"] = {}
        self.flows: list["DataFlow"] = []

    def add_element(self, element: "DFDElement") -> None:
        """Register a DFD element (process, data store, external entity)."""
        self.elements[element.id] = element

    def add_flow(self, flow: "DataFlow") -> None:
        """Register a data flow between two registered elements."""
        self.flows.append(flow)

    def find_trust_boundary_crossings(self) -> list[tuple["DataFlow", int]]:
        """Find data flows that cross trust boundaries.

        Returns (flow, trust_level_difference) pairs, largest difference first.
        Flows whose endpoints are not registered are skipped.
        """
        crossings = []
        for flow in self.flows:
            source = self.elements.get(flow.source)
            dest = self.elements.get(flow.destination)
            if source and dest and source.trust_level != dest.trust_level:
                trust_diff = abs(source.trust_level - dest.trust_level)
                crossings.append((flow, trust_diff))
        return sorted(crossings, key=lambda x: x[1], reverse=True)

    def identify_threats_per_element(self) -> dict[str, list["StrideCategory"]]:
        """Map applicable STRIDE categories to each registered element, by element type.

        External entities are subject to spoofing/repudiation; processes to all
        six categories; stores and flows to the data-centric subset.
        """
        threat_mapping = {
            ElementType.EXTERNAL_ENTITY: [
                StrideCategory.SPOOFING,
                StrideCategory.REPUDIATION,
            ],
            ElementType.PROCESS: [
                StrideCategory.SPOOFING,
                StrideCategory.TAMPERING,
                StrideCategory.REPUDIATION,
                StrideCategory.INFORMATION_DISCLOSURE,
                StrideCategory.DENIAL_OF_SERVICE,
                StrideCategory.ELEVATION_OF_PRIVILEGE,
            ],
            ElementType.DATA_STORE: [
                StrideCategory.TAMPERING,
                StrideCategory.REPUDIATION,
                StrideCategory.INFORMATION_DISCLOSURE,
                StrideCategory.DENIAL_OF_SERVICE,
            ],
            ElementType.DATA_FLOW: [
                StrideCategory.TAMPERING,
                StrideCategory.INFORMATION_DISCLOSURE,
                StrideCategory.DENIAL_OF_SERVICE,
            ],
        }
        result = {}
        for elem_id, elem in self.elements.items():
            result[elem_id] = threat_mapping.get(elem.type, [])
        return result

    def analyze_unencrypted_flows(self) -> list["DataFlow"]:
        """Find unencrypted data flows crossing trust boundaries (highest-risk flows)."""
        risky_flows = []
        for flow in self.flows:
            if not flow.encrypted:
                source = self.elements.get(flow.source)
                dest = self.elements.get(flow.destination)
                if source and dest and source.trust_level != dest.trust_level:
                    risky_flows.append(flow)
        return risky_flows

    def generate_threat_enumeration(self) -> list[dict]:
        """Generate one threat record per (element, applicable STRIDE category)."""
        threats = []
        element_threats = self.identify_threats_per_element()
        for elem_id, categories in element_threats.items():
            elem = self.elements[elem_id]
            for category in categories:
                threats.append({
                    "element_id": elem_id,
                    "element_name": elem.name,
                    "element_type": elem.type.value,
                    "stride_category": category.name,
                    "description": f"{category.name} threat against {elem.name}",
                    "trust_level": elem.trust_level
                })
        return threats
```
### Template 4: STRIDE per Interaction
```python
from typing import List, Dict, Optional
from dataclasses import dataclass
@dataclass
class Interaction:
"""Represents an interaction between two components."""
id: str
source: str
target: str
action: str
data: str
protocol: str
class StridePerInteraction:
    """Apply STRIDE to each interaction in the system."""

    # (source element type, target element type) -> threats that apply to
    # that pairing, keyed by single-letter STRIDE code.
    INTERACTION_THREATS = {
        # Source type -> Target type -> Applicable threats
        ("external", "process"): {
            "S": "External entity spoofing identity to process",
            "T": "Tampering with data sent to process",
            "R": "External entity denying sending data",
            "I": "Data exposure during transmission",
            "D": "Flooding process with requests",
            "E": "Exploiting process to gain privileges",
        },
        ("process", "datastore"): {
            "T": "Process tampering with stored data",
            "R": "Process denying data modifications",
            "I": "Unauthorized data access by process",
            "D": "Process exhausting storage resources",
        },
        ("process", "process"): {
            "S": "Process spoofing another process",
            "T": "Tampering with inter-process data",
            "I": "Data leakage between processes",
            "D": "One process overwhelming another",
            "E": "Process gaining elevated access",
        },
    }

    def analyze_interaction(
        self,
        interaction: Interaction,
        source_type: str,
        target_type: str
    ) -> List[Dict]:
        """Analyze a single interaction for STRIDE threats."""
        # Unlisted type pairings produce no threats.
        catalog = self.INTERACTION_THREATS.get((source_type, target_type), {})
        return [
            {
                "interaction_id": interaction.id,
                "source": interaction.source,
                "target": interaction.target,
                "stride_category": code,
                "threat_description": text,
                "context": f"{interaction.action} - {interaction.data}",
            }
            for code, text in catalog.items()
        ]

    def generate_threat_matrix(
        self,
        interactions: List[Interaction],
        element_types: Dict[str, str]
    ) -> List[Dict]:
        """Generate complete threat matrix for all interactions."""
        matrix: List[Dict] = []
        for item in interactions:
            matrix.extend(
                self.analyze_interaction(
                    item,
                    element_types.get(item.source, "unknown"),
                    element_types.get(item.target, "unknown"),
                )
            )
        return matrix
```
## Best Practices
### Do's
- **Involve stakeholders** - Security, dev, and ops perspectives
- **Be systematic** - Cover all STRIDE categories
- **Prioritize realistically** - Focus on high-impact threats
- **Update regularly** - Threat models are living documents
- **Use visual aids** - DFDs help communication
### Don'ts
- **Don't skip categories** - Each reveals different threats
- **Don't assume security** - Question every component
- **Don't work in isolation** - Collaborative modeling is better
- **Don't ignore low-probability** - High-impact threats matter
- **Don't stop at identification** - Follow through with mitigations
## Resources
- [Microsoft STRIDE Documentation](https://docs.microsoft.com/en-us/azure/security/develop/threat-modeling-tool-threats)
- [OWASP Threat Modeling](https://owasp.org/www-community/Threat_Modeling)
- [Threat Modeling: Designing for Security](https://www.wiley.com/en-us/Threat+Modeling%3A+Designing+for+Security-p-9781118809990)

View File

@@ -0,0 +1,745 @@
---
name: threat-mitigation-mapping
description: Map identified threats to appropriate security controls and mitigations. Use when prioritizing security investments, creating remediation plans, or validating control effectiveness.
---
# Threat Mitigation Mapping
Connect threats to controls for effective security planning.
## When to Use This Skill
- Prioritizing security investments
- Creating remediation roadmaps
- Validating control coverage
- Designing defense-in-depth
- Security architecture review
- Risk treatment planning
## Core Concepts
### 1. Control Categories
```
Preventive ────► Stop attacks before they occur
│ (Firewall, Input validation)
Detective ─────► Identify attacks in progress
│ (IDS, Log monitoring)
Corrective ────► Respond and recover from attacks
(Incident response, Backup restore)
```
### 2. Control Layers
| Layer | Examples |
|-------|----------|
| **Network** | Firewall, WAF, DDoS protection |
| **Application** | Input validation, authentication |
| **Data** | Encryption, access controls |
| **Endpoint** | EDR, patch management |
| **Process** | Security training, incident response |
### 3. Defense in Depth
```
┌──────────────────────┐
│ Perimeter │ ← Firewall, WAF
│ ┌──────────────┐ │
│ │ Network │ │ ← Segmentation, IDS
│ │ ┌────────┐ │ │
│ │ │ Host │ │ │ ← EDR, Hardening
│ │ │ ┌────┐ │ │ │
│ │ │ │App │ │ │ │ ← Auth, Validation
│ │ │ │Data│ │ │ │ ← Encryption
│ │ │ └────┘ │ │ │
│ │ └────────┘ │ │
│ └──────────────┘ │
└──────────────────────┘
```
## Templates
### Template 1: Mitigation Model
```python
from dataclasses import dataclass, field
from enum import Enum
from typing import List, Dict, Optional, Set
from datetime import datetime
class ControlType(Enum):
    """When a control acts relative to an attack: before, during, or after."""
    PREVENTIVE = "preventive"
    DETECTIVE = "detective"
    CORRECTIVE = "corrective"

class ControlLayer(Enum):
    """Architectural layer a control belongs to; drives defense-in-depth analysis."""
    NETWORK = "network"
    APPLICATION = "application"
    DATA = "data"
    ENDPOINT = "endpoint"
    PROCESS = "process"
    PHYSICAL = "physical"

class ImplementationStatus(Enum):
    """Rollout state of a control; scales its coverage contribution."""
    NOT_IMPLEMENTED = "not_implemented"
    PARTIAL = "partial"
    IMPLEMENTED = "implemented"
    VERIFIED = "verified"

class Effectiveness(Enum):
    """Ordinal effectiveness rating; numeric values feed coverage math."""
    NONE = 0
    LOW = 1
    MEDIUM = 2
    HIGH = 3
    VERY_HIGH = 4
@dataclass
class SecurityControl:
    """A single security control and its mapping to the threats it mitigates."""
    id: str
    name: str
    description: str
    control_type: ControlType
    layer: ControlLayer
    effectiveness: Effectiveness
    implementation_cost: str  # Low, Medium, High
    maintenance_cost: str     # same scale as implementation_cost
    status: ImplementationStatus = ImplementationStatus.NOT_IMPLEMENTED
    mitigates_threats: List[str] = field(default_factory=list)   # STRIDE category names
    dependencies: List[str] = field(default_factory=list)        # ids of prerequisite controls
    technologies: List[str] = field(default_factory=list)        # example implementations
    compliance_refs: List[str] = field(default_factory=list)     # standards this control satisfies
    def coverage_score(self) -> float:
        """Calculate coverage score based on status and effectiveness.

        Effectiveness (0-4) is discounted by rollout status: an
        unimplemented control contributes nothing, a verified one counts
        at full strength.
        """
        status_multiplier = {
            ImplementationStatus.NOT_IMPLEMENTED: 0.0,
            ImplementationStatus.PARTIAL: 0.5,
            ImplementationStatus.IMPLEMENTED: 0.8,
            ImplementationStatus.VERIFIED: 1.0,
        }
        return self.effectiveness.value * status_multiplier[self.status]
@dataclass
class Threat:
    """An identified threat to be mapped against mitigating controls."""
    id: str
    name: str
    category: str  # STRIDE category
    description: str
    impact: str  # Critical, High, Medium, Low
    likelihood: str   # same scale as impact
    risk_score: float  # weights this threat in overall risk-reduction math
@dataclass
class MitigationMapping:
    """Links one threat to the set of controls intended to mitigate it."""

    threat: Threat
    controls: List[SecurityControl]
    residual_risk: str = "Unknown"  # risk judged to remain after controls
    notes: str = ""

    def calculate_coverage(self) -> float:
        """Calculate how well controls cover the threat."""
        if not self.controls:
            return 0.0
        achieved = sum(control.coverage_score() for control in self.controls)
        ceiling = len(self.controls) * Effectiveness.VERY_HIGH.value
        if ceiling > 0:
            return (achieved / ceiling) * 100
        return 0

    def has_defense_in_depth(self) -> bool:
        """Check if multiple layers are covered."""
        active_layers = {
            control.layer
            for control in self.controls
            if control.status != ImplementationStatus.NOT_IMPLEMENTED
        }
        return len(active_layers) >= 2

    def has_control_diversity(self) -> bool:
        """Check if multiple control types are present."""
        active_types = {
            control.control_type
            for control in self.controls
            if control.status != ImplementationStatus.NOT_IMPLEMENTED
        }
        return len(active_types) >= 2
@dataclass
class MitigationPlan:
    """Aggregates threats, controls, and their mappings for one system."""

    name: str
    threats: List[Threat] = field(default_factory=list)
    controls: List[SecurityControl] = field(default_factory=list)
    mappings: List[MitigationMapping] = field(default_factory=list)

    def get_unmapped_threats(self) -> List[Threat]:
        """Find threats without mitigations."""
        mapped = {mapping.threat.id for mapping in self.mappings}
        return [threat for threat in self.threats if threat.id not in mapped]

    def get_control_coverage(self) -> Dict[str, float]:
        """Get coverage percentage for each threat."""
        coverage: Dict[str, float] = {}
        for mapping in self.mappings:
            coverage[mapping.threat.id] = mapping.calculate_coverage()
        return coverage

    def get_gaps(self) -> List[Dict]:
        """Identify mitigation gaps."""
        def gap(mapping, coverage, issue, recommendation):
            # All gap records share one shape; build them in one place.
            return {
                "threat": mapping.threat.id,
                "threat_name": mapping.threat.name,
                "coverage": coverage,
                "issue": issue,
                "recommendation": recommendation,
            }

        findings: List[Dict] = []
        for mapping in self.mappings:
            coverage = mapping.calculate_coverage()
            if coverage < 50:
                findings.append(gap(
                    mapping,
                    coverage,
                    "Insufficient control coverage",
                    "Add more controls or improve existing ones",
                ))
            if not mapping.has_defense_in_depth():
                findings.append(gap(
                    mapping,
                    coverage,
                    "No defense in depth",
                    "Add controls at different layers",
                ))
            if not mapping.has_control_diversity():
                findings.append(gap(
                    mapping,
                    coverage,
                    "No control diversity",
                    "Add detective/corrective controls",
                ))
        return findings
```
### Template 2: Control Library
```python
class ControlLibrary:
    """Library of standard security controls.

    STANDARD_CONTROLS is a class-level catalog keyed by control id.
    NOTE(review): entries are shared, mutable SecurityControl instances —
    mutating one (e.g. flipping its status) affects every consumer of the
    library; confirm whether per-instance copies are needed.
    """
    STANDARD_CONTROLS = {
        # Authentication Controls
        "AUTH-001": SecurityControl(
            id="AUTH-001",
            name="Multi-Factor Authentication",
            description="Require MFA for all user authentication",
            control_type=ControlType.PREVENTIVE,
            layer=ControlLayer.APPLICATION,
            effectiveness=Effectiveness.HIGH,
            implementation_cost="Medium",
            maintenance_cost="Low",
            mitigates_threats=["SPOOFING"],
            technologies=["TOTP", "WebAuthn", "SMS OTP"],
            compliance_refs=["PCI-DSS 8.3", "NIST 800-63B"]
        ),
        "AUTH-002": SecurityControl(
            id="AUTH-002",
            name="Account Lockout Policy",
            description="Lock accounts after failed authentication attempts",
            control_type=ControlType.PREVENTIVE,
            layer=ControlLayer.APPLICATION,
            effectiveness=Effectiveness.MEDIUM,
            implementation_cost="Low",
            maintenance_cost="Low",
            mitigates_threats=["SPOOFING"],
            technologies=["Custom implementation"],
            compliance_refs=["PCI-DSS 8.1.6"]
        ),
        # Input Validation Controls
        "VAL-001": SecurityControl(
            id="VAL-001",
            name="Input Validation Framework",
            description="Validate and sanitize all user input",
            control_type=ControlType.PREVENTIVE,
            layer=ControlLayer.APPLICATION,
            effectiveness=Effectiveness.HIGH,
            implementation_cost="Medium",
            maintenance_cost="Medium",
            mitigates_threats=["TAMPERING", "INJECTION"],
            technologies=["Joi", "Yup", "Pydantic"],
            compliance_refs=["OWASP ASVS V5"]
        ),
        "VAL-002": SecurityControl(
            id="VAL-002",
            name="Web Application Firewall",
            description="Deploy WAF to filter malicious requests",
            control_type=ControlType.PREVENTIVE,
            layer=ControlLayer.NETWORK,
            effectiveness=Effectiveness.MEDIUM,
            implementation_cost="Medium",
            maintenance_cost="Medium",
            mitigates_threats=["TAMPERING", "INJECTION", "DOS"],
            technologies=["AWS WAF", "Cloudflare", "ModSecurity"],
            compliance_refs=["PCI-DSS 6.6"]
        ),
        # Encryption Controls
        "ENC-001": SecurityControl(
            id="ENC-001",
            name="Data Encryption at Rest",
            description="Encrypt sensitive data in storage",
            control_type=ControlType.PREVENTIVE,
            layer=ControlLayer.DATA,
            effectiveness=Effectiveness.HIGH,
            implementation_cost="Medium",
            maintenance_cost="Low",
            mitigates_threats=["INFORMATION_DISCLOSURE"],
            technologies=["AES-256", "KMS", "HSM"],
            compliance_refs=["PCI-DSS 3.4", "GDPR Art. 32"]
        ),
        "ENC-002": SecurityControl(
            id="ENC-002",
            name="TLS Encryption",
            description="Encrypt data in transit using TLS 1.3",
            control_type=ControlType.PREVENTIVE,
            layer=ControlLayer.NETWORK,
            effectiveness=Effectiveness.HIGH,
            implementation_cost="Low",
            maintenance_cost="Low",
            mitigates_threats=["INFORMATION_DISCLOSURE", "TAMPERING"],
            technologies=["TLS 1.3", "Certificate management"],
            compliance_refs=["PCI-DSS 4.1", "HIPAA"]
        ),
        # Logging Controls
        "LOG-001": SecurityControl(
            id="LOG-001",
            name="Security Event Logging",
            description="Log all security-relevant events",
            control_type=ControlType.DETECTIVE,
            layer=ControlLayer.APPLICATION,
            effectiveness=Effectiveness.MEDIUM,
            implementation_cost="Low",
            maintenance_cost="Medium",
            mitigates_threats=["REPUDIATION"],
            technologies=["ELK Stack", "Splunk", "CloudWatch"],
            compliance_refs=["PCI-DSS 10.2", "SOC2"]
        ),
        "LOG-002": SecurityControl(
            id="LOG-002",
            name="Log Integrity Protection",
            description="Protect logs from tampering",
            control_type=ControlType.PREVENTIVE,
            layer=ControlLayer.DATA,
            effectiveness=Effectiveness.MEDIUM,
            implementation_cost="Medium",
            maintenance_cost="Low",
            mitigates_threats=["REPUDIATION", "TAMPERING"],
            technologies=["Immutable storage", "Log signing"],
            compliance_refs=["PCI-DSS 10.5"]
        ),
        # Access Control
        "ACC-001": SecurityControl(
            id="ACC-001",
            name="Role-Based Access Control",
            description="Implement RBAC for authorization",
            control_type=ControlType.PREVENTIVE,
            layer=ControlLayer.APPLICATION,
            effectiveness=Effectiveness.HIGH,
            implementation_cost="Medium",
            maintenance_cost="Medium",
            mitigates_threats=["ELEVATION_OF_PRIVILEGE", "INFORMATION_DISCLOSURE"],
            technologies=["RBAC", "ABAC", "Policy engines"],
            compliance_refs=["PCI-DSS 7.1", "SOC2"]
        ),
        # Availability Controls
        "AVL-001": SecurityControl(
            id="AVL-001",
            name="Rate Limiting",
            description="Limit request rates to prevent abuse",
            control_type=ControlType.PREVENTIVE,
            layer=ControlLayer.APPLICATION,
            effectiveness=Effectiveness.MEDIUM,
            implementation_cost="Low",
            maintenance_cost="Low",
            mitigates_threats=["DENIAL_OF_SERVICE"],
            technologies=["API Gateway", "Redis", "Token bucket"],
            compliance_refs=["OWASP API Security"]
        ),
        "AVL-002": SecurityControl(
            id="AVL-002",
            name="DDoS Protection",
            description="Deploy DDoS mitigation services",
            control_type=ControlType.PREVENTIVE,
            layer=ControlLayer.NETWORK,
            effectiveness=Effectiveness.HIGH,
            implementation_cost="High",
            maintenance_cost="Medium",
            mitigates_threats=["DENIAL_OF_SERVICE"],
            technologies=["Cloudflare", "AWS Shield", "Akamai"],
            compliance_refs=["NIST CSF"]
        ),
    }
    def get_controls_for_threat(self, threat_category: str) -> List[SecurityControl]:
        """Get all controls that mitigate a threat category (STRIDE name)."""
        return [
            c for c in self.STANDARD_CONTROLS.values()
            if threat_category in c.mitigates_threats
        ]
    def get_controls_by_layer(self, layer: ControlLayer) -> List[SecurityControl]:
        """Get controls for a specific layer."""
        return [c for c in self.STANDARD_CONTROLS.values() if c.layer == layer]
    def get_control(self, control_id: str) -> Optional[SecurityControl]:
        """Get a specific control by ID, or None if unknown."""
        return self.STANDARD_CONTROLS.get(control_id)
    def recommend_controls(
        self,
        threat: Threat,
        existing_controls: List[str]
    ) -> List[SecurityControl]:
        """Recommend additional controls for a threat.

        Returns catalog controls applicable to the threat's category whose
        ids are not already in existing_controls.
        """
        available = self.get_controls_for_threat(threat.category)
        return [c for c in available if c.id not in existing_controls]
```
### Template 3: Mitigation Analysis
```python
class MitigationAnalyzer:
    """Analyze and optimize mitigation strategies.

    Wraps a MitigationPlan plus a ControlLibrary and derives risk metrics,
    gap lists, budget-constrained recommendations, and a markdown report.
    """

    def __init__(self, plan: MitigationPlan, library: ControlLibrary):
        self.plan = plan
        self.library = library

    def calculate_overall_risk_reduction(self) -> float:
        """Calculate overall risk reduction percentage.

        Each mapping's coverage is weighted by its threat's risk_score, so
        covering high-risk threats moves the number more than low-risk ones.
        """
        if not self.plan.mappings:
            return 0.0
        weighted_coverage = 0
        total_weight = 0
        for mapping in self.plan.mappings:
            # Weight by threat risk score
            weight = mapping.threat.risk_score
            coverage = mapping.calculate_coverage()
            weighted_coverage += weight * coverage
            total_weight += weight
        return weighted_coverage / total_weight if total_weight > 0 else 0

    def get_critical_gaps(self) -> List[Dict]:
        """Find gaps belonging to Critical-impact threats."""
        gaps = self.plan.get_gaps()
        critical_threats = {t.id for t in self.plan.threats if t.impact == "Critical"}
        return [g for g in gaps if g["threat"] in critical_threats]

    def optimize_budget(
        self,
        budget: float,
        cost_map: Dict[str, float]
    ) -> List[SecurityControl]:
        """Select controls that maximize risk reduction within budget.

        Greedy knapsack: value = (unmapped threats covered * effectiveness)
        / cost; controls are taken in descending value order while budget
        remains. cost_map maps control id -> monetary cost; controls with
        no entry are treated as unaffordable.
        """
        # Simple greedy approach - can be replaced with optimization algorithm
        recommended = []
        remaining_budget = budget
        unmapped = self.plan.get_unmapped_threats()
        # Sort controls by effectiveness/cost ratio
        all_controls = list(self.library.STANDARD_CONTROLS.values())
        controls_with_value = []
        for control in all_controls:
            if control.status == ImplementationStatus.NOT_IMPLEMENTED:
                cost = cost_map.get(control.id, float('inf'))
                if cost <= remaining_budget:
                    # Calculate value as threats covered * effectiveness / cost
                    threats_covered = len([
                        t for t in unmapped
                        if t.category in control.mitigates_threats
                    ])
                    if threats_covered > 0:
                        value = (threats_covered * control.effectiveness.value) / cost
                        controls_with_value.append((control, value, cost))
        # Sort by value (higher is better)
        controls_with_value.sort(key=lambda x: x[1], reverse=True)
        for control, value, cost in controls_with_value:
            if cost <= remaining_budget:
                recommended.append(control)
                remaining_budget -= cost
        return recommended

    def generate_roadmap(self) -> List[Dict]:
        """Generate implementation roadmap by priority.

        Phase 1 addresses Critical-impact gap threats, phase 2 High-impact
        ones; at most five items per phase.
        """
        roadmap = []
        gaps = self.plan.get_gaps()
        # Phase 1: Critical threats with low coverage
        phase1 = []
        for gap in gaps:
            mapping = next(
                (m for m in self.plan.mappings if m.threat.id == gap["threat"]),
                None
            )
            if mapping and mapping.threat.impact == "Critical":
                controls = self.library.get_controls_for_threat(mapping.threat.category)
                phase1.extend([
                    {
                        "threat": gap["threat"],
                        "control": c.id,
                        "control_name": c.name,
                        "phase": 1,
                        "priority": "Critical"
                    }
                    for c in controls
                    if c.status == ImplementationStatus.NOT_IMPLEMENTED
                ])
        roadmap.extend(phase1[:5])  # Top 5 for phase 1
        # Phase 2: High impact threats
        phase2 = []
        for gap in gaps:
            mapping = next(
                (m for m in self.plan.mappings if m.threat.id == gap["threat"]),
                None
            )
            if mapping and mapping.threat.impact == "High":
                controls = self.library.get_controls_for_threat(mapping.threat.category)
                phase2.extend([
                    {
                        "threat": gap["threat"],
                        "control": c.id,
                        "control_name": c.name,
                        "phase": 2,
                        "priority": "High"
                    }
                    for c in controls
                    if c.status == ImplementationStatus.NOT_IMPLEMENTED
                ])
        roadmap.extend(phase2[:5])  # Top 5 for phase 2
        return roadmap

    def defense_in_depth_analysis(self) -> Dict[str, List[str]]:
        """Map each control layer to the ids of active (implemented/verified) controls."""
        layer_coverage = {layer.value: [] for layer in ControlLayer}
        for mapping in self.plan.mappings:
            for control in mapping.controls:
                if control.status in [ImplementationStatus.IMPLEMENTED, ImplementationStatus.VERIFIED]:
                    layer_coverage[control.layer.value].append(control.id)
        return layer_coverage

    def generate_report(self) -> str:
        """Generate comprehensive mitigation report as markdown."""
        risk_reduction = self.calculate_overall_risk_reduction()
        gaps = self.plan.get_gaps()
        critical_gaps = self.get_critical_gaps()
        layer_coverage = self.defense_in_depth_analysis()
        report = f"""
# Threat Mitigation Report
## Executive Summary
- **Overall Risk Reduction:** {risk_reduction:.1f}%
- **Total Threats:** {len(self.plan.threats)}
- **Total Controls:** {len(self.plan.controls)}
- **Identified Gaps:** {len(gaps)}
- **Critical Gaps:** {len(critical_gaps)}
## Defense in Depth Coverage
{self._format_layer_coverage(layer_coverage)}
## Critical Gaps Requiring Immediate Action
{self._format_gaps(critical_gaps)}
## Recommendations
{self._format_recommendations()}
## Implementation Roadmap
{self._format_roadmap()}
"""
        return report

    def _format_layer_coverage(self, coverage: Dict[str, List[str]]) -> str:
        """Render one bullet per layer with a covered/uncovered marker."""
        lines = []
        for layer, controls in coverage.items():
            # Fix: both branches previously produced "" (the check-mark
            # glyphs were lost), making covered and empty layers identical.
            status = "✓" if controls else "✗"
            lines.append(f"- {layer}: {status} ({len(controls)} controls)")
        return "\n".join(lines)

    def _format_gaps(self, gaps: List[Dict]) -> str:
        """Render gap records as nested markdown bullets."""
        if not gaps:
            return "No critical gaps identified."
        lines = []
        for gap in gaps:
            lines.append(f"- **{gap['threat_name']}**: {gap['issue']}")
            lines.append(f"  - Coverage: {gap['coverage']:.1f}%")
            lines.append(f"  - Recommendation: {gap['recommendation']}")
        return "\n".join(lines)

    def _format_recommendations(self) -> str:
        """Suggest missing layers and control-type diversity fixes."""
        recommendations = []
        layer_coverage = self.defense_in_depth_analysis()
        for layer, controls in layer_coverage.items():
            if not controls:
                recommendations.append(f"- Add {layer} layer controls")
        gaps = self.plan.get_gaps()
        if any(g["issue"] == "No control diversity" for g in gaps):
            recommendations.append("- Add more detective and corrective controls")
        return "\n".join(recommendations) if recommendations else "Current coverage is adequate."

    def _format_roadmap(self) -> str:
        """Render the phased roadmap, emitting a heading when the phase changes."""
        roadmap = self.generate_roadmap()
        if not roadmap:
            return "No additional controls recommended at this time."
        lines = []
        current_phase = 0
        for item in roadmap:
            if item["phase"] != current_phase:
                current_phase = item["phase"]
                lines.append(f"\n### Phase {current_phase}")
            lines.append(f"- [{item['priority']}] {item['control_name']} (for {item['threat']})")
        return "\n".join(lines)
```
### Template 4: Control Effectiveness Testing
```python
from dataclasses import dataclass
from typing import List, Callable, Any
import asyncio
@dataclass
class ControlTest:
    """One executable check that a security control behaves as intended."""
    control_id: str                    # id of the control under test
    test_name: str
    test_function: Callable[[], bool]  # zero-arg probe; exceptions count as failure
    expected_result: bool              # value test_function must return to pass
    description: str
class ControlTester:
    """Test control effectiveness.

    Collects ControlTest probes, runs them, and summarizes pass rates per
    control and overall.
    """

    def __init__(self):
        self.tests: List[ControlTest] = []
        self.results: List[Dict] = []

    def add_test(self, test: ControlTest) -> None:
        """Register a test to be executed by run_tests."""
        self.tests.append(test)

    async def run_tests(self) -> List[Dict]:
        """Run all control tests.

        A test passes when its function returns expected_result; any
        exception is captured and recorded as a failure rather than raised.
        NOTE(review): declared async but contains no awaits — presumably
        for API symmetry with async test functions; confirm intent.
        """
        self.results = []
        for test in self.tests:
            try:
                result = test.test_function()
                passed = result == test.expected_result
                self.results.append({
                    "control_id": test.control_id,
                    "test_name": test.test_name,
                    "passed": passed,
                    "actual_result": result,
                    "expected_result": test.expected_result,
                    "description": test.description,
                    "error": None
                })
            except Exception as e:
                self.results.append({
                    "control_id": test.control_id,
                    "test_name": test.test_name,
                    "passed": False,
                    "actual_result": None,
                    "expected_result": test.expected_result,
                    "description": test.description,
                    "error": str(e)
                })
        return self.results

    def get_effectiveness_score(self, control_id: str) -> float:
        """Percentage of this control's recorded tests that passed (0 if none)."""
        control_results = [r for r in self.results if r["control_id"] == control_id]
        if not control_results:
            return 0.0
        passed = sum(1 for r in control_results if r["passed"])
        return (passed / len(control_results)) * 100

    def generate_test_report(self) -> str:
        """Generate test results report as markdown, grouped by control."""
        if not self.results:
            return "No tests have been run."
        total = len(self.results)
        passed = sum(1 for r in self.results if r["passed"])
        report = f"""
# Control Effectiveness Test Report
## Summary
- **Total Tests:** {total}
- **Passed:** {passed}
- **Failed:** {total - passed}
- **Pass Rate:** {(passed/total)*100:.1f}%
## Results by Control
"""
        # Group by control
        controls = {}
        for result in self.results:
            cid = result["control_id"]
            if cid not in controls:
                controls[cid] = []
            controls[cid].append(result)
        for control_id, results in controls.items():
            score = self.get_effectiveness_score(control_id)
            report += f"\n### {control_id} (Effectiveness: {score:.1f}%)\n"
            for r in results:
                # Fix: both branches previously produced "" (the pass/fail
                # glyphs were lost), so passed and failed tests rendered
                # identically.
                status = "✓" if r["passed"] else "✗"
                report += f"- {status} {r['test_name']}\n"
                if r["error"]:
                    report += f"  - Error: {r['error']}\n"
        return report
```
## Best Practices
### Do's
- **Map all threats** - No threat should be unmapped
- **Layer controls** - Defense in depth is essential
- **Mix control types** - Preventive, detective, corrective
- **Track effectiveness** - Measure and improve
- **Review regularly** - Controls degrade over time
### Don'ts
- **Don't rely on single controls** - Single points of failure
- **Don't ignore cost** - ROI matters
- **Don't skip testing** - Untested controls may fail
- **Don't set and forget** - Continuous improvement
- **Don't ignore people/process** - Technology alone isn't enough
## Resources
- [NIST Cybersecurity Framework](https://www.nist.gov/cyberframework)
- [CIS Controls](https://www.cisecurity.org/controls)
- [MITRE D3FEND](https://d3fend.mitre.org/)

View File

@@ -0,0 +1,655 @@
---
name: go-concurrency-patterns
description: Master Go concurrency with goroutines, channels, sync primitives, and context. Use when building concurrent Go applications, implementing worker pools, or debugging race conditions.
---
# Go Concurrency Patterns
Production patterns for Go concurrency including goroutines, channels, synchronization primitives, and context management.
## When to Use This Skill
- Building concurrent Go applications
- Implementing worker pools and pipelines
- Managing goroutine lifecycles
- Using channels for communication
- Debugging race conditions
- Implementing graceful shutdown
## Core Concepts
### 1. Go Concurrency Primitives
| Primitive | Purpose |
|-----------|---------|
| `goroutine` | Lightweight concurrent execution |
| `channel` | Communication between goroutines |
| `select` | Multiplex channel operations |
| `sync.Mutex` | Mutual exclusion |
| `sync.WaitGroup` | Wait for goroutines to complete |
| `context.Context` | Cancellation and deadlines |
### 2. Go Concurrency Mantra
```
Don't communicate by sharing memory;
share memory by communicating.
```
## Quick Start
```go
package main
import (
"context"
"fmt"
"sync"
"time"
)
// Quick-start example: three workers race a 5-second deadline, each pushing
// one completion message onto a shared buffered channel that main drains.
func main() {
	ctx, cancel := context.WithTimeout(context.Background(), 5*time.Second)
	defer cancel()
	results := make(chan string, 10)
	var wg sync.WaitGroup
	// Spawn workers
	for i := 0; i < 3; i++ {
		wg.Add(1)
		go worker(ctx, i, results, &wg)
	}
	// Close results when done — closing from the sender side lets the
	// range loop below terminate.
	go func() {
		wg.Wait()
		close(results)
	}()
	// Collect results
	for result := range results {
		fmt.Println(result)
	}
}

// worker sends a single completion message, unless the context is
// cancelled first.
func worker(ctx context.Context, id int, results chan<- string, wg *sync.WaitGroup) {
	defer wg.Done()
	select {
	case <-ctx.Done():
		return
	case results <- fmt.Sprintf("Worker %d done", id):
	}
}
```
## Patterns
### Pattern 1: Worker Pool
```go
package main
import (
"context"
"fmt"
"sync"
)
// Job is one unit of work fed to the pool.
type Job struct {
	ID   int
	Data string
}

// Result is the outcome of processing one Job.
type Result struct {
	JobID  int
	Output string
	Err    error
}
// WorkerPool fans the jobs channel out to numWorkers goroutines and returns
// a results channel that is closed once every worker has finished.
//
// NOTE(review): len(jobs) is the number of jobs *currently buffered* in the
// channel at call time, not its capacity or the total job count — if the
// producer is still sending, the results buffer may be undersized and
// workers will block until the consumer drains. Also, the default branch
// means a job already received is still processed after ctx is cancelled;
// cancellation only stops pulling further jobs. Confirm both are intended.
func WorkerPool(ctx context.Context, numWorkers int, jobs <-chan Job) <-chan Result {
	results := make(chan Result, len(jobs))
	var wg sync.WaitGroup
	for i := 0; i < numWorkers; i++ {
		wg.Add(1)
		go func(workerID int) {
			defer wg.Done()
			for job := range jobs {
				select {
				case <-ctx.Done():
					return
				default:
					result := processJob(job)
					results <- result
				}
			}
		}(i)
	}
	// Close results from the sender side once all workers exit.
	go func() {
		wg.Wait()
		close(results)
	}()
	return results
}
// processJob converts a Job into a Result; stands in for real work.
func processJob(job Job) Result {
	// Simulate work
	return Result{
		JobID:  job.ID,
		Output: fmt.Sprintf("Processed: %s", job.Data),
	}
}
// Usage
// Example: enqueue 50 jobs, process them with 5 workers, drain the results.
func main() {
	ctx, cancel := context.WithCancel(context.Background())
	defer cancel()
	jobs := make(chan Job, 100)
	// Send jobs, then close so workers' range loops terminate.
	go func() {
		for i := 0; i < 50; i++ {
			jobs <- Job{ID: i, Data: fmt.Sprintf("job-%d", i)}
		}
		close(jobs)
	}()
	// Process with 5 workers
	results := WorkerPool(ctx, 5, jobs)
	for result := range results {
		fmt.Printf("Result: %+v\n", result)
	}
}
```
### Pattern 2: Fan-Out/Fan-In Pipeline
```go
package main
import (
"context"
"sync"
)
// Stage 1: Generate numbers
// generate emits the given numbers on a fresh channel, closing it when the
// numbers are exhausted or ctx is cancelled.
func generate(ctx context.Context, nums ...int) <-chan int {
	out := make(chan int)
	go func() {
		defer close(out)
		for _, n := range nums {
			select {
			case <-ctx.Done():
				return
			case out <- n:
			}
		}
	}()
	return out
}
// Stage 2: Square numbers (can run multiple instances)
// square reads from in and emits each value squared; multiple instances may
// share one input channel (fan-out). Closes its output when in is drained
// or ctx is cancelled.
func square(ctx context.Context, in <-chan int) <-chan int {
	out := make(chan int)
	go func() {
		defer close(out)
		for n := range in {
			select {
			case <-ctx.Done():
				return
			case out <- n * n:
			}
		}
	}()
	return out
}
// Fan-in: Merge multiple channels into one
// merge (fan-in) multiplexes any number of input channels onto one output
// channel, which is closed once every input is drained. Ordering across
// inputs is nondeterministic.
func merge(ctx context.Context, cs ...<-chan int) <-chan int {
	var wg sync.WaitGroup
	out := make(chan int)
	// Start output goroutine for each input channel
	output := func(c <-chan int) {
		defer wg.Done()
		for n := range c {
			select {
			case <-ctx.Done():
				return
			case out <- n:
			}
		}
	}
	wg.Add(len(cs))
	for _, c := range cs {
		go output(c)
	}
	// Close out after all inputs are done
	go func() {
		wg.Wait()
		close(out)
	}()
	return out
}
// main fans a generated stream out to three squarers and merges the squared
// values back into one channel; output order is nondeterministic.
// NOTE(review): this snippet prints with fmt, but the example's import block
// lists only "context" and "sync" — add "fmt" for it to compile standalone.
func main() {
	ctx, cancel := context.WithCancel(context.Background())
	defer cancel()
	// Generate input
	in := generate(ctx, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10)
	// Fan out to multiple squarers
	c1 := square(ctx, in)
	c2 := square(ctx, in)
	c3 := square(ctx, in)
	// Fan in results
	for result := range merge(ctx, c1, c2, c3) {
		fmt.Println(result)
	}
}
```
### Pattern 3: Bounded Concurrency with Semaphore
```go
package main
import (
"context"
"fmt"
"golang.org/x/sync/semaphore"
"sync"
)
// RateLimitedWorker runs tasks concurrently while capping in-flight work
// with a weighted semaphore.
type RateLimitedWorker struct {
	sem *semaphore.Weighted
}

// NewRateLimitedWorker caps concurrency at maxConcurrent simultaneous tasks.
func NewRateLimitedWorker(maxConcurrent int64) *RateLimitedWorker {
	return &RateLimitedWorker{
		sem: semaphore.NewWeighted(maxConcurrent),
	}
}

// Do runs every task, blocking on the semaphore when the limit is reached,
// and returns the errors collected from failed tasks.
// NOTE(review): if Acquire fails (ctx cancelled), Do returns immediately
// without wg.Wait(), leaving already-spawned tasks running detached —
// confirm that is acceptable for callers.
func (w *RateLimitedWorker) Do(ctx context.Context, tasks []func() error) []error {
	var (
		wg     sync.WaitGroup
		mu     sync.Mutex // guards errors
		errors []error
	)
	for _, task := range tasks {
		// Acquire semaphore (blocks if at limit)
		if err := w.sem.Acquire(ctx, 1); err != nil {
			return []error{err}
		}
		wg.Add(1)
		go func(t func() error) {
			defer wg.Done()
			defer w.sem.Release(1)
			if err := t(); err != nil {
				mu.Lock()
				errors = append(errors, err)
				mu.Unlock()
			}
		}(task)
	}
	wg.Wait()
	return errors
}
// Alternative: Channel-based semaphore
// Semaphore is a counting semaphore built from a buffered channel: the
// buffer capacity is the number of concurrent holders allowed.
type Semaphore chan struct{}

// NewSemaphore creates a semaphore permitting n concurrent holders.
func NewSemaphore(n int) Semaphore {
	return make(chan struct{}, n)
}

// Acquire takes a slot, blocking while the semaphore is full.
func (s Semaphore) Acquire() {
	s <- struct{}{}
}

// Release frees a slot previously taken by Acquire.
func (s Semaphore) Release() {
	<-s
}
```
### Pattern 4: Graceful Shutdown
```go
package main
import (
"context"
"fmt"
"os"
"os/signal"
"sync"
"syscall"
"time"
)
// Server runs a fixed pool of background workers and supports a
// bounded-time graceful shutdown.
type Server struct {
	// NOTE(review): shutdown is closed in Shutdown but nothing selects on
	// it — workers stop via ctx cancellation; confirm the field is needed.
	shutdown chan struct{}
	wg       sync.WaitGroup
}

// NewServer builds an idle server; call Start to launch workers.
func NewServer() *Server {
	return &Server{
		shutdown: make(chan struct{}),
	}
}

// Start launches five workers tied to ctx; cancelling ctx stops them.
func (s *Server) Start(ctx context.Context) {
	// Start workers
	for i := 0; i < 5; i++ {
		s.wg.Add(1)
		go s.worker(ctx, i)
	}
}

// worker ticks once per second until ctx is cancelled, then runs a short
// simulated cleanup before exiting.
func (s *Server) worker(ctx context.Context, id int) {
	defer s.wg.Done()
	defer fmt.Printf("Worker %d stopped\n", id)
	ticker := time.NewTicker(time.Second)
	defer ticker.Stop()
	for {
		select {
		case <-ctx.Done():
			// Cleanup
			fmt.Printf("Worker %d cleaning up...\n", id)
			time.Sleep(500 * time.Millisecond) // Simulated cleanup
			return
		case <-ticker.C:
			fmt.Printf("Worker %d working...\n", id)
		}
	}
}

// Shutdown waits up to timeout for all workers to finish their cleanup.
// NOTE(review): calling Shutdown twice panics on the double close.
func (s *Server) Shutdown(timeout time.Duration) {
	// Signal shutdown
	close(s.shutdown)
	// Wait with timeout
	done := make(chan struct{})
	go func() {
		s.wg.Wait()
		close(done)
	}()
	select {
	case <-done:
		fmt.Println("Clean shutdown completed")
	case <-time.After(timeout):
		fmt.Println("Shutdown timed out, forcing exit")
	}
}

// main wires SIGINT/SIGTERM to context cancellation, then shuts down with a
// 5-second grace period.
func main() {
	// Setup signal handling
	ctx, cancel := context.WithCancel(context.Background())
	sigCh := make(chan os.Signal, 1)
	signal.Notify(sigCh, syscall.SIGINT, syscall.SIGTERM)
	server := NewServer()
	server.Start(ctx)
	// Wait for signal
	sig := <-sigCh
	fmt.Printf("\nReceived signal: %v\n", sig)
	// Cancel context to stop workers
	cancel()
	// Wait for graceful shutdown
	server.Shutdown(5 * time.Second)
}
```
### Pattern 5: Error Group with Cancellation
```go
package main
import (
"context"
"fmt"
"golang.org/x/sync/errgroup"
"net/http"
)
// fetchAllURLs GETs every URL concurrently; the first failure cancels the
// group's context (aborting the remaining requests) and is returned.
// Results line up index-for-index with urls.
func fetchAllURLs(ctx context.Context, urls []string) ([]string, error) {
	g, ctx := errgroup.WithContext(ctx)
	results := make([]string, len(urls))
	for i, url := range urls {
		i, url := i, url // Capture loop variables (required before Go 1.22's per-iteration loop vars)
		g.Go(func() error {
			req, err := http.NewRequestWithContext(ctx, "GET", url, nil)
			if err != nil {
				return fmt.Errorf("creating request for %s: %w", url, err)
			}
			resp, err := http.DefaultClient.Do(req)
			if err != nil {
				return fmt.Errorf("fetching %s: %w", url, err)
			}
			defer resp.Body.Close()
			// Each goroutine writes only its own slot, so no lock is needed.
			results[i] = fmt.Sprintf("%s: %d", url, resp.StatusCode)
			return nil
		})
	}
	// Wait for all goroutines to complete or one to fail
	if err := g.Wait(); err != nil {
		return nil, err // First error cancels all others
	}
	return results, nil
}
// With concurrency limit
// fetchWithLimit is fetchAllURLs with a cap on concurrent fetches via
// errgroup's SetLimit.
// NOTE(review): this snippet references fetchURL, which is not defined
// here, and uses sync.Mutex while the example's import block omits "sync" —
// both need fixing for the example to compile standalone. The mutex also
// looks unnecessary: each goroutine writes a distinct results[i] slot.
func fetchWithLimit(ctx context.Context, urls []string, limit int) ([]string, error) {
	g, ctx := errgroup.WithContext(ctx)
	g.SetLimit(limit) // Max concurrent goroutines
	results := make([]string, len(urls))
	var mu sync.Mutex
	for i, url := range urls {
		i, url := i, url
		g.Go(func() error {
			result, err := fetchURL(ctx, url)
			if err != nil {
				return err
			}
			mu.Lock()
			results[i] = result
			mu.Unlock()
			return nil
		})
	}
	if err := g.Wait(); err != nil {
		return nil, err
	}
	return results, nil
}
```
### Pattern 6: Concurrent Map with sync.Map
```go
package main
import (
"sync"
)
// For frequent reads, infrequent writes
// Cache is a thin wrapper over sync.Map, which suits read-mostly workloads
// with relatively infrequent writes.
type Cache struct {
	m sync.Map
}

// Get returns the value stored for key and whether it was present.
func (c *Cache) Get(key string) (interface{}, bool) {
	return c.m.Load(key)
}

// Set stores value under key, replacing any previous value.
func (c *Cache) Set(key string, value interface{}) {
	c.m.Store(key, value)
}

// GetOrSet returns the existing value if present (loaded == true);
// otherwise it stores and returns the given value.
func (c *Cache) GetOrSet(key string, value interface{}) (interface{}, bool) {
	return c.m.LoadOrStore(key, value)
}

// Delete removes key from the cache (no-op if absent).
func (c *Cache) Delete(key string) {
	c.m.Delete(key)
}
// For write-heavy workloads, use sharded map
// ShardedMap splits the key space across independently locked shards so
// concurrent writers on different shards do not contend.
type ShardedMap struct {
	shards    []*shard
	numShards int
}

// shard is one lockable partition of the map.
type shard struct {
	sync.RWMutex
	data map[string]interface{}
}

// NewShardedMap allocates numShards independent shards.
func NewShardedMap(numShards int) *ShardedMap {
	m := &ShardedMap{
		shards:    make([]*shard, numShards),
		numShards: numShards,
	}
	for i := range m.shards {
		m.shards[i] = &shard{data: make(map[string]interface{})}
	}
	return m
}

// getShard picks the shard for key via a simple polynomial hash.
// The accumulator is unsigned: with the previous signed int, overflow on
// long keys could make h negative, and in Go a negative h%numShards is
// negative, panicking on the slice index.
func (m *ShardedMap) getShard(key string) *shard {
	// Simple hash
	h := uint(0)
	for _, c := range key {
		h = 31*h + uint(c)
	}
	return m.shards[h%uint(m.numShards)]
}

// Get returns the value stored under key and whether it was present.
func (m *ShardedMap) Get(key string) (interface{}, bool) {
	shard := m.getShard(key)
	shard.RLock()
	defer shard.RUnlock()
	v, ok := shard.data[key]
	return v, ok
}

// Set stores value under key in the appropriate shard.
func (m *ShardedMap) Set(key string, value interface{}) {
	shard := m.getShard(key)
	shard.Lock()
	defer shard.Unlock()
	shard.data[key] = value
}
```
### Pattern 7: Select with Timeout and Default
```go
// selectPatterns demonstrates common select idioms: timeout,
// non-blocking send, and two-level priority selection.
// NOTE(review): the final for/select loop has no break, so this
// function never returns; it is illustrative only.
func selectPatterns() {
	ch := make(chan int)
	// Timeout pattern
	select {
	case v := <-ch:
		fmt.Println("Received:", v)
	case <-time.After(time.Second):
		fmt.Println("Timeout!")
	}
	// Non-blocking send/receive
	select {
	case ch <- 42:
		fmt.Println("Sent")
	default:
		fmt.Println("Channel full, skipping")
	}
	// Priority select (check high priority first)
	highPriority := make(chan int)
	lowPriority := make(chan int)
	for {
		select {
		case msg := <-highPriority:
			fmt.Println("High priority:", msg)
		default:
			// Nothing high-priority ready: block on both channels.
			select {
			case msg := <-highPriority:
				fmt.Println("High priority:", msg)
			case msg := <-lowPriority:
				fmt.Println("Low priority:", msg)
			}
		}
	}
}
```
## Race Detection
```bash
# Run tests with race detector
go test -race ./...
# Build with race detector
go build -race .
# Run with race detector
go run -race main.go
```
## Best Practices
### Do's
- **Use context** - For cancellation and deadlines
- **Close channels** - From sender side only
- **Use errgroup** - For concurrent operations with errors
- **Buffer channels** - When you know the count
- **Prefer channels** - Over mutexes when possible
### Don'ts
- **Don't leak goroutines** - Always have exit path
- **Don't close from receiver** - Causes panic
- **Don't use shared memory** - Unless necessary
- **Don't ignore context cancellation** - Check ctx.Done()
- **Don't use time.Sleep for sync** - Use proper primitives
## Resources
- [Go Concurrency Patterns](https://go.dev/blog/pipelines)
- [Effective Go - Concurrency](https://go.dev/doc/effective_go#concurrency)
- [Go by Example - Goroutines](https://gobyexample.com/goroutines)

View File

@@ -0,0 +1,604 @@
---
name: memory-safety-patterns
description: Implement memory-safe programming with RAII, ownership, smart pointers, and resource management across Rust, C++, and C. Use when writing safe systems code, managing resources, or preventing memory bugs.
---
# Memory Safety Patterns
Cross-language patterns for memory-safe programming including RAII, ownership, smart pointers, and resource management.
## When to Use This Skill
- Writing memory-safe systems code
- Managing resources (files, sockets, memory)
- Preventing use-after-free and leaks
- Implementing RAII patterns
- Choosing between languages for safety
- Debugging memory issues
## Core Concepts
### 1. Memory Bug Categories
| Bug Type | Description | Prevention |
|----------|-------------|------------|
| **Use-after-free** | Access freed memory | Ownership, RAII |
| **Double-free** | Free same memory twice | Smart pointers |
| **Memory leak** | Never free memory | RAII, GC |
| **Buffer overflow** | Write past buffer end | Bounds checking |
| **Dangling pointer** | Pointer to freed memory | Lifetime tracking |
| **Data race** | Concurrent unsynchronized access | Ownership, Sync |
### 2. Safety Spectrum
```
Manual (C) → Smart Pointers (C++) → Ownership (Rust) → GC (Go, Java)
Less safe More safe
More control Less control
```
## Patterns by Language
### Pattern 1: RAII in C++
```cpp
// RAII: Resource Acquisition Is Initialization
// Resource lifetime tied to object lifetime
#include <memory>
#include <fstream>
#include <mutex>
// File handle with RAII
// Owns an fstream: the file is opened in the constructor and closed by
// the fstream destructor, so no explicit cleanup call exists.
class FileHandle {
public:
    // Throws std::runtime_error if the file cannot be opened.
    explicit FileHandle(const std::string& path)
        : file_(path) {
        if (!file_.is_open()) {
            throw std::runtime_error("Failed to open file");
        }
    }
    // Destructor automatically closes file
    ~FileHandle() = default; // fstream closes in its destructor
    // Delete copy (prevent double-close)
    FileHandle(const FileHandle&) = delete;
    FileHandle& operator=(const FileHandle&) = delete;
    // Allow move
    FileHandle(FileHandle&&) = default;
    FileHandle& operator=(FileHandle&&) = default;
    // Appends data to the stream; failures set the stream's error bits
    // rather than throwing.
    void write(const std::string& data) {
        file_ << data;
    }
private:
    std::fstream file_;
};
// Lock guard (RAII for mutexes)
// A single shared_mutex now protects data_. The original used two
// unrelated mutexes (mutex_ for update, shared_mutex_ for get), so
// readers and writers never excluded each other — a data race.
class Database {
public:
    // Writer: exclusive lock, released automatically on scope exit.
    void update(const std::string& key, const std::string& value) {
        std::unique_lock<std::shared_mutex> lock(mutex_);
        data_[key] = value;
    }
    // Reader: shared lock allows concurrent readers. Uses find() rather
    // than operator[], which inserts a default entry (mutates the map)
    // and is therefore unsafe under a shared lock.
    std::string get(const std::string& key) {
        std::shared_lock<std::shared_mutex> lock(mutex_);
        auto it = data_.find(key);
        return it != data_.end() ? it->second : std::string();
    }
private:
    std::shared_mutex mutex_;
    std::map<std::string, std::string> data_;
};
// Transaction with rollback (RAII)
// Snapshots the target on construction; unless commit() is called, the
// destructor restores the snapshot, undoing every change made via get().
template<typename T>
class Transaction {
public:
    explicit Transaction(T& target)
        : target_(target), backup_(target), committed_(false) {}

    // Roll back on scope exit unless the caller committed.
    ~Transaction() {
        if (committed_) {
            return;
        }
        target_ = backup_; // Rollback
    }

    // Marks the transaction successful; the destructor becomes a no-op.
    void commit() { committed_ = true; }

    // Mutable access to the live value being edited.
    T& get() { return target_; }

private:
    T& target_;      // value under transaction
    T backup_;       // pristine copy used for rollback
    bool committed_; // whether commit() was called
};
```
### Pattern 2: Smart Pointers in C++
```cpp
#include <memory>
// unique_ptr: Single ownership
class Engine {
public:
    void start() { /* ... */ }
};

// Car exclusively owns its Engine through unique_ptr; ownership can be
// handed off with std::move.
class Car {
public:
    Car() : engine_(std::make_unique<Engine>()) {}
    void start() {
        engine_->start();
    }
    // Transfer ownership
    // After this call engine_ is null; calling start() afterwards would
    // dereference a null pointer.
    std::unique_ptr<Engine> extractEngine() {
        return std::move(engine_);
    }
private:
    std::unique_ptr<Engine> engine_;
};

// shared_ptr: Shared ownership
class Node {
public:
    std::string data;
    std::shared_ptr<Node> next;
    // Use weak_ptr to break cycles
    std::weak_ptr<Node> parent;
};

// Demonstrates breaking a parent/child reference cycle with weak_ptr:
// without it, node1 and node2 would keep each other alive forever.
void sharedPtrExample() {
    auto node1 = std::make_shared<Node>();
    auto node2 = std::make_shared<Node>();
    node1->next = node2;
    node2->parent = node1; // Weak reference prevents cycle
    // Access weak_ptr
    // lock() yields a shared_ptr only if the target is still alive.
    if (auto parent = node2->parent.lock()) {
        // parent is valid shared_ptr
    }
}

// Custom deleter for resources
// NOTE(review): socket()/::close() come from <sys/socket.h>/<unistd.h>,
// which are not included in this snippet — confirm before compiling.
class Socket {
public:
    static void close(int* fd) {
        if (fd && *fd >= 0) {
            ::close(*fd);
            delete fd;
        }
    }
};

// Wraps a raw file descriptor so it is closed exactly once, even on
// early returns or exceptions.
auto createSocket() {
    int fd = socket(AF_INET, SOCK_STREAM, 0);
    return std::unique_ptr<int, decltype(&Socket::close)>(
        new int(fd),
        &Socket::close
    );
}

// make_unique/make_shared best practices
// (Widget is a placeholder type assumed to be defined elsewhere.)
void bestPractices() {
    // Good: Exception safe, single allocation
    auto ptr = std::make_shared<Widget>();
    // Bad: Two allocations, not exception safe
    std::shared_ptr<Widget> ptr2(new Widget());
    // For arrays
    auto arr = std::make_unique<int[]>(10);
}
```
### Pattern 3: Ownership in Rust
```rust
// Move semantics (default)
// Assigning a non-Copy value transfers ownership; the source binding
// becomes unusable afterwards.
fn move_example() {
    let s1 = String::from("hello");
    let s2 = s1; // s1 is MOVED, no longer valid
    // println!("{}", s1); // Compile error!
    println!("{}", s2);
}

// Borrowing (references)
// Shows the two borrow kinds: any number of shared (&T) borrows, or
// exactly one exclusive (&mut T) borrow.
fn borrow_example() {
    let s = String::from("hello");
    // Immutable borrow (multiple allowed)
    let len = calculate_length(&s);
    println!("{} has length {}", s, len);
    // Mutable borrow (only one allowed)
    // Shadowing `s` with a fresh mutable binding is idiomatic here.
    let mut s = String::from("hello");
    change(&mut s);
}
// Borrowed parameter: the caller keeps ownership. Taking `&str` instead
// of `&String` is strictly more general — it accepts `&String` (via
// deref coercion, so the existing call site still compiles) as well as
// string literals and slices.
fn calculate_length(s: &str) -> usize {
    s.len()
} // s goes out of scope, but doesn't drop since borrowed
// Appends ", world" through a mutable borrow; the caller's String is
// modified in place.
fn change(s: &mut String) {
    *s += ", world";
}
// Lifetimes: Compiler tracks reference validity
// Returns whichever input is longer; ties go to `y`, matching the
// strict `>` comparison. The shared lifetime 'a ties the returned
// reference to both inputs.
fn longest<'a>(x: &'a str, y: &'a str) -> &'a str {
    match x.len() > y.len() {
        true => x,
        false => y,
    }
}
// Struct with references needs lifetime annotation
// The 'a lifetime ties the struct to the borrowed string: an
// ImportantExcerpt cannot outlive the text its `part` points into.
struct ImportantExcerpt<'a> {
    part: &'a str,
}

impl<'a> ImportantExcerpt<'a> {
    // No references returned, so no lifetime appears in the signature.
    fn level(&self) -> i32 {
        3
    }
    // Lifetime elision: compiler infers 'a for &self
    // The returned &str therefore borrows from self, not from `announcement`.
    fn announce_and_return_part(&self, announcement: &str) -> &str {
        println!("Attention: {}", announcement);
        self.part
    }
}
// Interior mutability
use std::cell::{Cell, RefCell};
use std::rc::Rc;
// Counters and buffers that are mutable through a shared reference
// (&self), via interior mutability.
struct Stats {
    count: Cell<i32>,           // Copy types
    data: RefCell<Vec<String>>, // Non-Copy types
}

impl Stats {
    // Bump the counter without requiring &mut self.
    fn increment(&self) {
        let current = self.count.get();
        self.count.set(current + 1);
    }

    // Push onto the vector behind the RefCell; the exclusive borrow is
    // checked at runtime and released when `entries` goes out of scope.
    fn add_data(&self, item: String) {
        let mut entries = self.data.borrow_mut();
        entries.push(item);
    }
}
// Rc for shared ownership (single-threaded)
// Rc::clone only bumps the reference count; the underlying Vec is not copied.
fn rc_example() {
    let data = Rc::new(vec![1, 2, 3]);
    let data2 = Rc::clone(&data); // Increment reference count
    println!("Count: {}", Rc::strong_count(&data)); // 2
}
// Arc for shared ownership (thread-safe)
use std::sync::Arc;
use std::thread;

// Each spawned thread gets its own Arc handle; the Vec is freed when
// the last handle is dropped.
fn arc_example() {
    let data = Arc::new(vec![1, 2, 3]);
    let handles: Vec<_> = (0..3)
        .map(|_| {
            let data = Arc::clone(&data);
            thread::spawn(move || {
                println!("{:?}", data);
            })
        })
        .collect();
    // Join so the example does not return before the threads print.
    for handle in handles {
        handle.join().unwrap();
    }
}
```
### Pattern 4: Safe Resource Management in C
```c
// C doesn't have RAII, but we can use patterns
#include <stdlib.h>
#include <stdio.h>
// Pattern: goto cleanup
/*
 * Opens `path` and a scratch buffer, funnelling every failure path
 * through a single cleanup label so each resource is released exactly
 * once. Returns 0 on success, -1 on any failure.
 */
int process_file(const char* path) {
    FILE* file = NULL;
    char* buffer = NULL;
    int result = -1;
    file = fopen(path, "r");
    if (!file) {
        goto cleanup;
    }
    buffer = malloc(1024);
    if (!buffer) {
        goto cleanup;
    }
    // Process file...
    result = 0;
cleanup:
    /* free(NULL) is safe, but the guards keep the intent explicit. */
    if (buffer) free(buffer);
    if (file) fclose(file);
    return result;
}
// Pattern: Opaque pointer with create/destroy
// Public API: callers only see a forward declaration, so the layout of
// Context can change without breaking them.
typedef struct Context Context;
Context* context_create(void);
void context_destroy(Context* ctx);
int context_process(Context* ctx, const char* data);

// Implementation
struct Context {
    int* data;   // owned buffer of 100 ints
    size_t size;
    FILE* log;   // owned log file handle
};

/* Allocates a Context and all owned resources; on any failure, frees
 * everything acquired so far and returns NULL (never a partial object). */
Context* context_create(void) {
    Context* ctx = calloc(1, sizeof(Context));
    if (!ctx) return NULL;
    ctx->data = malloc(100 * sizeof(int));
    if (!ctx->data) {
        free(ctx);
        return NULL;
    }
    ctx->log = fopen("log.txt", "w");
    if (!ctx->log) {
        free(ctx->data);
        free(ctx);
        return NULL;
    }
    return ctx;
}

/* Releases members in reverse acquisition order; safe to call with NULL. */
void context_destroy(Context* ctx) {
    if (ctx) {
        if (ctx->log) fclose(ctx->log);
        if (ctx->data) free(ctx->data);
        free(ctx);
    }
}
// Pattern: Cleanup attribute (GCC/Clang extension)
// The handler runs when the annotated variable leaves scope and receives
// a pointer to the variable itself.
// NOTE(review): for a `char*` variable the handler is called with a
// char**, so the void** parameter relies on an implicit incompatible-
// pointer conversion — confirm the target toolchain accepts this.
#define AUTO_FREE __attribute__((cleanup(auto_free_func)))
void auto_free_func(void** ptr) {
    free(*ptr);
}
void auto_free_example(void) {
    AUTO_FREE char* buffer = malloc(1024);
    // buffer automatically freed at end of scope
}
```
### Pattern 5: Bounds Checking
```cpp
// C++: Use containers instead of raw arrays
#include <vector>
#include <array>
#include <span>
// Demonstrates checked vs unchecked element access on std::vector.
void safe_array_access() {
    std::vector<int> vec = {1, 2, 3, 4, 5};
    // Safe: throws std::out_of_range
    try {
        int val = vec.at(10);
    } catch (const std::out_of_range& e) {
        // Handle error
    }
    // Unsafe but faster (no bounds check)
    int val = vec[2];
    // Modern C++20: std::span for array views
    std::span<int> view(vec);
    // Iterators are bounds-safe
    for (int& x : view) {
        x *= 2;
    }
}

// Fixed-size arrays
// std::array carries its size in the type, enabling compile-time checks.
void fixed_array() {
    std::array<int, 5> arr = {1, 2, 3, 4, 5};
    // Compile-time size known
    static_assert(arr.size() == 5);
    // Safe access
    int val = arr.at(2);
}
```
```rust
// Rust: Bounds checking by default
fn rust_bounds_checking() {
let vec = vec![1, 2, 3, 4, 5];
// Runtime bounds check (panics if out of bounds)
let val = vec[2];
// Explicit option (no panic)
match vec.get(10) {
Some(val) => println!("Got {}", val),
None => println!("Index out of bounds"),
}
// Iterators (no bounds checking needed)
for val in &vec {
println!("{}", val);
}
// Slices are bounds-checked
let slice = &vec[1..3]; // [2, 3]
}
```
### Pattern 6: Preventing Data Races
```cpp
// C++: Thread-safe shared state
#include <mutex>
#include <shared_mutex>
#include <atomic>
// Lock-free counter backed by std::atomic<int>.
class ThreadSafeCounter {
public:
    // Relaxed ordering is sufficient: the count is standalone data, not
    // a synchronization point for other memory.
    void increment() {
        count_.fetch_add(1, std::memory_order_relaxed);
    }

    // Snapshot of the current count.
    int get() const {
        const int snapshot = count_.load(std::memory_order_relaxed);
        return snapshot;
    }

private:
    std::atomic<int> count_{0};
};
// Reader/writer map: shared_lock lets reads proceed in parallel while
// unique_lock gives writers exclusive access (C++17 CTAD on the locks).
class ThreadSafeMap {
public:
    void write(const std::string& key, int value) {
        std::unique_lock lock(mutex_);
        data_[key] = value;
    }
    // Returns nullopt when the key is absent. find() avoids the
    // map-mutating operator[], so a shared lock suffices.
    std::optional<int> read(const std::string& key) {
        std::shared_lock lock(mutex_);
        auto it = data_.find(key);
        if (it != data_.end()) {
            return it->second;
        }
        return std::nullopt;
    }
private:
    // mutable: locking is not logical state, so const members could lock too.
    mutable std::shared_mutex mutex_;
    std::map<std::string, int> data_;
};
```
```rust
// Rust: Data race prevention at compile time
use std::sync::{Arc, Mutex, RwLock};
use std::sync::atomic::{AtomicI32, Ordering};
use std::thread;
// Atomic for simple types
// Ten threads increment a shared counter without locks; SeqCst is the
// strongest (and simplest to reason about) ordering.
fn atomic_example() {
    let counter = Arc::new(AtomicI32::new(0));
    let handles: Vec<_> = (0..10)
        .map(|_| {
            let counter = Arc::clone(&counter);
            thread::spawn(move || {
                counter.fetch_add(1, Ordering::SeqCst);
            })
        })
        .collect();
    for handle in handles {
        handle.join().unwrap();
    }
    println!("Counter: {}", counter.load(Ordering::SeqCst));
}

// Mutex for complex types
// The Vec is only reachable through the lock; the guard releases the
// mutex when it is dropped at the end of the closure.
fn mutex_example() {
    let data = Arc::new(Mutex::new(vec![]));
    let handles: Vec<_> = (0..10)
        .map(|i| {
            let data = Arc::clone(&data);
            thread::spawn(move || {
                let mut vec = data.lock().unwrap();
                vec.push(i);
            })
        })
        .collect();
    for handle in handles {
        handle.join().unwrap();
    }
}
// RwLock for read-heavy workloads
// The original acquired the write lock while the read guard was still
// alive: std::sync::RwLock is not reentrant, so a thread that holds a
// read lock and then requests the write lock deadlocks. The guards are
// now scoped so each lock is released before the next is taken. The
// insert also fixes the original's unconstrained `HashMap::new()`,
// whose key/value types could not be inferred.
// (Requires `use std::collections::HashMap;` in this snippet.)
fn rwlock_example() {
    let data = Arc::new(RwLock::new(HashMap::new()));
    // Writer gets exclusive access, blocking readers
    {
        let mut write_guard = data.write().unwrap();
        write_guard.insert("key".to_string(), 1);
    } // write lock released here
    // Multiple readers OK once no writer holds the lock
    let read_guard = data.read().unwrap();
    assert_eq!(read_guard.get("key"), Some(&1));
}
```
## Best Practices
### Do's
- **Prefer RAII** - Tie resource lifetime to scope
- **Use smart pointers** - Avoid raw pointers in C++
- **Understand ownership** - Know who owns what
- **Check bounds** - Use safe access methods
- **Use tools** - AddressSanitizer, Valgrind, Miri
### Don'ts
- **Don't use raw pointers** - Unless interfacing with C
- **Don't return local references** - Dangling pointer
- **Don't ignore compiler warnings** - They catch bugs
- **Don't use `unsafe` carelessly** - In Rust, minimize it
- **Don't assume thread safety** - Be explicit
## Debugging Tools
```bash
# AddressSanitizer (Clang/GCC)
clang++ -fsanitize=address -g source.cpp
# Valgrind
valgrind --leak-check=full ./program
# Rust Miri (undefined behavior detector)
cargo +nightly miri run
# ThreadSanitizer
clang++ -fsanitize=thread -g source.cpp
```
## Resources
- [C++ Core Guidelines](https://isocpp.github.io/CppCoreGuidelines/)
- [Rust Ownership](https://doc.rust-lang.org/book/ch04-00-understanding-ownership.html)
- [AddressSanitizer](https://clang.llvm.org/docs/AddressSanitizer.html)

View File

@@ -0,0 +1,517 @@
---
name: rust-async-patterns
description: Master Rust async programming with Tokio, async traits, error handling, and concurrent patterns. Use when building async Rust applications, implementing concurrent systems, or debugging async code.
---
# Rust Async Patterns
Production patterns for async Rust programming with Tokio runtime, including tasks, channels, streams, and error handling.
## When to Use This Skill
- Building async Rust applications
- Implementing concurrent network services
- Using Tokio for async I/O
- Handling async errors properly
- Debugging async code issues
- Optimizing async performance
## Core Concepts
### 1. Async Execution Model
```
Future (lazy) → poll() → Ready(value) | Pending
↑ ↓
Waker ← Runtime schedules
```
### 2. Key Abstractions
| Concept | Purpose |
|---------|---------|
| `Future` | Lazy computation that may complete later |
| `async fn` | Function returning impl Future |
| `await` | Suspend until future completes |
| `Task` | Spawned future running concurrently |
| `Runtime` | Executor that polls futures |
## Quick Start
```toml
# Cargo.toml
[dependencies]
tokio = { version = "1", features = ["full"] }
futures = "0.3"
async-trait = "0.1"
anyhow = "1.0"
tracing = "0.1"
tracing-subscriber = "0.3"
```
```rust
use tokio::time::{sleep, Duration};
use anyhow::Result;
// Entry point: #[tokio::main] builds a multi-threaded runtime and
// blocks on this async fn until it completes.
#[tokio::main]
async fn main() -> Result<()> {
    // Initialize tracing
    tracing_subscriber::fmt::init();
    // Async operations
    let result = fetch_data("https://api.example.com").await?;
    println!("Got: {}", result);
    Ok(())
}

// Stands in for a network call: suspends on a timer instead of
// performing real I/O, then returns a canned payload.
async fn fetch_data(url: &str) -> Result<String> {
    // Simulated async operation
    sleep(Duration::from_millis(100)).await;
    Ok(format!("Data from {}", url))
}
```
## Patterns
### Pattern 1: Concurrent Task Execution
```rust
use tokio::task::JoinSet;
use anyhow::Result;
// Spawn multiple concurrent tasks
// Collects every successful fetch; task-level and join-level errors are
// logged and skipped rather than aborting the whole batch.
async fn fetch_all_concurrent(urls: Vec<String>) -> Result<Vec<String>> {
    let mut set = JoinSet::new();
    for url in urls {
        set.spawn(async move {
            fetch_data(&url).await
        });
    }
    let mut results = Vec::new();
    while let Some(res) = set.join_next().await {
        match res {
            // Outer Result: did the task panic/abort? Inner: did the fetch fail?
            Ok(Ok(data)) => results.push(data),
            Ok(Err(e)) => tracing::error!("Task failed: {}", e),
            Err(e) => tracing::error!("Join error: {}", e),
        }
    }
    Ok(results)
}

// With concurrency limit
use futures::stream::{self, StreamExt};

// buffer_unordered keeps at most `limit` fetches in flight and yields
// results in completion order, not input order.
async fn fetch_with_limit(urls: Vec<String>, limit: usize) -> Vec<Result<String>> {
    stream::iter(urls)
        .map(|url| async move { fetch_data(&url).await })
        .buffer_unordered(limit) // Max concurrent tasks
        .collect()
        .await
}

// Select first to complete
use tokio::select;

// Returns whichever fetch finishes first; the losing future is dropped
// (cancelled) when select! resolves.
async fn race_requests(url1: &str, url2: &str) -> Result<String> {
    select! {
        result = fetch_data(url1) => result,
        result = fetch_data(url2) => result,
    }
}
```
### Pattern 2: Channels for Communication
```rust
use tokio::sync::{mpsc, broadcast, oneshot, watch};
// Multi-producer, single-consumer
// Bounded channel: senders suspend once 100 messages are queued
// (backpressure).
async fn mpsc_example() {
    let (tx, mut rx) = mpsc::channel::<String>(100);
    // Spawn producer
    let tx2 = tx.clone();
    tokio::spawn(async move {
        tx2.send("Hello".to_string()).await.unwrap();
    });
    // Consume
    // recv() yields None once every sender is dropped, ending the loop.
    while let Some(msg) = rx.recv().await {
        println!("Got: {}", msg);
    }
}

// Broadcast: multi-producer, multi-consumer
// Every subscriber receives every message sent after it subscribed.
async fn broadcast_example() {
    let (tx, _) = broadcast::channel::<String>(100);
    let mut rx1 = tx.subscribe();
    let mut rx2 = tx.subscribe();
    tx.send("Event".to_string()).unwrap();
    // Both receivers get the message
    let _ = rx1.recv().await;
    let _ = rx2.recv().await;
}

// Oneshot: single value, single use
// The channel halves are consumed by the single send/await pair.
async fn oneshot_example() -> String {
    let (tx, rx) = oneshot::channel::<String>();
    tokio::spawn(async move {
        tx.send("Result".to_string()).unwrap();
    });
    rx.await.unwrap()
}

// Watch: single producer, multi-consumer, latest value
// Receivers only observe the most recent value; intermediate updates
// may be skipped.
async fn watch_example() {
    let (tx, mut rx) = watch::channel("initial".to_string());
    tokio::spawn(async move {
        loop {
            // Wait for changes
            rx.changed().await.unwrap();
            println!("New value: {}", *rx.borrow());
        }
    });
    tx.send("updated".to_string()).unwrap();
}
```
### Pattern 3: Async Error Handling
```rust
use anyhow::{Context, Result, bail};
use thiserror::Error;
// Library-style error enum: thiserror derives Display and
// std::error::Error; #[from] generates From impls so `?` converts the
// source errors automatically.
#[derive(Error, Debug)]
pub enum ServiceError {
    #[error("Network error: {0}")]
    Network(#[from] reqwest::Error),
    #[error("Database error: {0}")]
    Database(#[from] sqlx::Error),
    #[error("Not found: {0}")]
    NotFound(String),
    #[error("Timeout after {0:?}")]
    Timeout(std::time::Duration),
}

// Using anyhow for application errors
// context() annotates each failure with where it happened while keeping
// the source error in the chain.
async fn process_request(id: &str) -> Result<Response> {
    let data = fetch_data(id)
        .await
        .context("Failed to fetch data")?;
    let parsed = parse_response(&data)
        .context("Failed to parse response")?;
    Ok(parsed)
}

// Using custom errors for library code
// NOTE(review): `db` is not defined in this snippet — presumably a
// field on an enclosing type; confirm before reusing this example.
async fn get_user(id: &str) -> Result<User, ServiceError> {
    let result = db.query(id).await?;
    match result {
        Some(user) => Ok(user),
        None => Err(ServiceError::NotFound(id.to_string())),
    }
}

// Timeout wrapper
use tokio::time::timeout;

// Races `future` against the clock: an elapsed timer becomes
// ServiceError::Timeout (via the `?` on the outer Result), otherwise
// the future's own Result is returned unchanged.
async fn with_timeout<T, F>(duration: Duration, future: F) -> Result<T, ServiceError>
where
    F: std::future::Future<Output = Result<T, ServiceError>>,
{
    timeout(duration, future)
        .await
        .map_err(|_| ServiceError::Timeout(duration))?
}
```
### Pattern 4: Graceful Shutdown
```rust
use tokio::signal;
use tokio::sync::broadcast;
use tokio_util::sync::CancellationToken;
// Coordinates shutdown: a CancellationToken fans the stop signal out to
// every task that selects on cancelled().
async fn run_server() -> Result<()> {
    // Method 1: CancellationToken
    let token = CancellationToken::new();
    let token_clone = token.clone();
    // Spawn task that respects cancellation
    tokio::spawn(async move {
        loop {
            tokio::select! {
                _ = token_clone.cancelled() => {
                    tracing::info!("Task shutting down");
                    break;
                }
                _ = do_work() => {}
            }
        }
    });
    // Wait for shutdown signal
    signal::ctrl_c().await?;
    tracing::info!("Shutdown signal received");
    // Cancel all tasks
    token.cancel();
    // Give tasks time to cleanup
    // NOTE(review): a fixed sleep is only a grace period, not a guarantee
    // that tasks finished; await JoinHandles for a hard guarantee.
    tokio::time::sleep(Duration::from_secs(5)).await;
    Ok(())
}

// Method 2: Broadcast channel for shutdown
// Each worker subscribes; a single send() reaches all of them.
async fn run_with_broadcast() -> Result<()> {
    let (shutdown_tx, _) = broadcast::channel::<()>(1);
    let mut rx = shutdown_tx.subscribe();
    tokio::spawn(async move {
        tokio::select! {
            _ = rx.recv() => {
                tracing::info!("Received shutdown");
            }
            _ = async { loop { do_work().await } } => {}
        }
    });
    signal::ctrl_c().await?;
    let _ = shutdown_tx.send(());
    Ok(())
}
```
### Pattern 5: Async Traits
```rust
use async_trait::async_trait;
// async-trait rewrites these async fns into boxed-future returns so the
// trait stays object-safe (usable as &dyn Repository below).
#[async_trait]
pub trait Repository {
    async fn get(&self, id: &str) -> Result<Entity>;
    async fn save(&self, entity: &Entity) -> Result<()>;
    async fn delete(&self, id: &str) -> Result<()>;
}

// Postgres-backed implementation over an sqlx connection pool.
pub struct PostgresRepository {
    pool: sqlx::PgPool,
}

#[async_trait]
impl Repository for PostgresRepository {
    // query_as! checks the SQL against the database schema at compile time.
    async fn get(&self, id: &str) -> Result<Entity> {
        sqlx::query_as!(Entity, "SELECT * FROM entities WHERE id = $1", id)
            .fetch_one(&self.pool)
            .await
            .map_err(Into::into)
    }
    // Upsert: insert, or overwrite `data` when the id already exists.
    async fn save(&self, entity: &Entity) -> Result<()> {
        sqlx::query!(
            "INSERT INTO entities (id, data) VALUES ($1, $2)
            ON CONFLICT (id) DO UPDATE SET data = $2",
            entity.id,
            entity.data
        )
        .execute(&self.pool)
        .await?;
        Ok(())
    }
    async fn delete(&self, id: &str) -> Result<()> {
        sqlx::query!("DELETE FROM entities WHERE id = $1", id)
            .execute(&self.pool)
            .await?;
        Ok(())
    }
}

// Trait object usage
// Dynamic dispatch: works with any Repository implementation.
async fn process(repo: &dyn Repository, id: &str) -> Result<()> {
    let entity = repo.get(id).await?;
    // Process...
    repo.save(&entity).await
}
```
### Pattern 6: Streams and Async Iteration
```rust
use futures::stream::{self, Stream, StreamExt};
use async_stream::stream;
// Create stream from async iterator
// The stream! macro turns yield points into a Stream; each item is
// produced lazily as the consumer polls.
fn numbers_stream() -> impl Stream<Item = i32> {
    stream! {
        for i in 0..10 {
            tokio::time::sleep(Duration::from_millis(100)).await;
            yield i;
        }
    }
}

// Process stream
// StreamExt::filter's predicate must return a Future, hence future::ready().
async fn process_stream() {
    let stream = numbers_stream();
    // Map and filter
    let processed: Vec<_> = stream
        .filter(|n| futures::future::ready(*n % 2 == 0))
        .map(|n| n * 2)
        .collect()
        .await;
    println!("{:?}", processed);
}

// Chunked processing
// chunks(3) buffers up to three items before yielding them as a Vec.
async fn process_in_chunks() {
    let stream = numbers_stream();
    let mut chunks = stream.chunks(3);
    while let Some(chunk) = chunks.next().await {
        println!("Processing chunk: {:?}", chunk);
    }
}

// Merge multiple streams
// stream::select interleaves items from both streams as they become ready.
async fn merge_streams() {
    let stream1 = numbers_stream();
    let stream2 = numbers_stream();
    let merged = stream::select(stream1, stream2);
    merged
        .for_each(|n| async move {
            println!("Got: {}", n);
        })
        .await;
}
```
### Pattern 7: Resource Management
```rust
use std::sync::Arc;
use tokio::sync::{Mutex, RwLock, Semaphore};
// Shared state with RwLock (prefer for read-heavy)
struct Cache {
    data: RwLock<HashMap<String, String>>,
}

impl Cache {
    // Clones the value out so the read guard is released before returning.
    async fn get(&self, key: &str) -> Option<String> {
        self.data.read().await.get(key).cloned()
    }
    async fn set(&self, key: String, value: String) {
        self.data.write().await.insert(key, value);
    }
}

// Connection pool with semaphore
// The semaphore caps concurrent checkouts at the pool size; the Mutex
// guards the free-connection list itself.
struct Pool {
    semaphore: Semaphore,
    connections: Mutex<Vec<Connection>>,
}

impl Pool {
    fn new(size: usize) -> Self {
        Self {
            semaphore: Semaphore::new(size),
            connections: Mutex::new((0..size).map(|_| Connection::new()).collect()),
        }
    }
    // Waits for a permit, then takes a connection; the permit lives in the
    // guard so pool capacity is released together with the connection.
    async fn acquire(&self) -> PooledConnection<'_> {
        let permit = self.semaphore.acquire().await.unwrap();
        let conn = self.connections.lock().await.pop().unwrap();
        PooledConnection { pool: self, conn: Some(conn), _permit: permit }
    }
}

// RAII guard: returns the connection to the pool when dropped.
struct PooledConnection<'a> {
    pool: &'a Pool,
    conn: Option<Connection>,
    _permit: tokio::sync::SemaphorePermit<'a>,
}

impl Drop for PooledConnection<'_> {
    // NOTE(review): tokio::spawn requires a 'static future, but `pool`
    // here borrows &'a Pool — as written this should not compile; an
    // Arc<Pool> handle would be needed. Confirm before reusing.
    fn drop(&mut self) {
        if let Some(conn) = self.conn.take() {
            let pool = self.pool;
            tokio::spawn(async move {
                pool.connections.lock().await.push(conn);
            });
        }
    }
}
```
## Debugging Tips
```rust
// Enable tokio-console for runtime debugging
// Cargo.toml: tokio = { features = ["tracing"] }
// Run: RUSTFLAGS="--cfg tokio_unstable" cargo run
// Then: tokio-console
// Instrument async functions
use tracing::instrument;
#[instrument(skip(pool))]
async fn fetch_user(pool: &PgPool, id: &str) -> Result<User> {
tracing::debug!("Fetching user");
// ...
}
// Track task spawning
let span = tracing::info_span!("worker", id = %worker_id);
tokio::spawn(async move {
// Enters span when polled
}.instrument(span));
```
## Best Practices
### Do's
- **Use `tokio::select!`** - For racing futures
- **Prefer channels** - Over shared state when possible
- **Use `JoinSet`** - For managing multiple tasks
- **Instrument with tracing** - For debugging async code
- **Handle cancellation** - Check `CancellationToken`
### Don'ts
- **Don't block** - Never use `std::thread::sleep` in async
- **Don't hold locks across awaits** - Causes deadlocks
- **Don't spawn unboundedly** - Use semaphores for limits
- **Don't ignore errors** - Propagate with `?` or log
- **Don't forget Send bounds** - For spawned futures
## Resources
- [Tokio Tutorial](https://tokio.rs/tokio/tutorial)
- [Async Book](https://rust-lang.github.io/async-book/)
- [Tokio Console](https://github.com/tokio-rs/console)