diff --git a/src/deepwork/cli/install.py b/src/deepwork/cli/install.py index d65c5a4..fae68fc 100644 --- a/src/deepwork/cli/install.py +++ b/src/deepwork/cli/install.py @@ -87,6 +87,20 @@ def _inject_deepwork_policy(jobs_dir: Path, project_path: Path) -> None: _inject_standard_job("deepwork_policy", jobs_dir, project_path) +def _inject_env_investigate(jobs_dir: Path, project_path: Path) -> None: + """ + Inject the env_investigate job definition into the project. + + Args: + jobs_dir: Path to .deepwork/jobs directory + project_path: Path to project root (for relative path display) + + Raises: + InstallError: If injection fails + """ + _inject_standard_job("env_investigate", jobs_dir, project_path) + + def _create_deepwork_gitignore(deepwork_dir: Path) -> None: """ Create .gitignore file in .deepwork/ directory. @@ -272,6 +286,7 @@ def _install_deepwork(platform_name: str | None, project_path: Path) -> None: console.print("[yellow]→[/yellow] Installing core job definitions...") _inject_deepwork_jobs(jobs_dir, project_path) _inject_deepwork_policy(jobs_dir, project_path) + _inject_env_investigate(jobs_dir, project_path) # Step 3c: Create .gitignore for temporary files _create_deepwork_gitignore(deepwork_dir) diff --git a/src/deepwork/cli/sync.py b/src/deepwork/cli/sync.py index b533eb5..22281d0 100644 --- a/src/deepwork/cli/sync.py +++ b/src/deepwork/cli/sync.py @@ -7,6 +7,7 @@ from rich.table import Table from deepwork.core.adapters import AgentAdapter +from deepwork.core.agent_generator import AgentGenerator from deepwork.core.generator import CommandGenerator from deepwork.core.hooks_syncer import collect_job_hooks, sync_hooks_to_platform from deepwork.core.parser import parse_job_definition @@ -116,7 +117,8 @@ def sync_commands(project_path: Path) -> None: # Sync each platform generator = CommandGenerator() - stats = {"platforms": 0, "commands": 0, "hooks": 0} + agent_generator = AgentGenerator() + stats = {"platforms": 0, "commands": 0, "hooks": 0, "agents": 0} synced_adapters: list[AgentAdapter] = [] for platform_name in platforms: @@ -146,6 +148,18 @@ def sync_commands(project_path: Path) -> None: except Exception as e: console.print(f" [red]✗[/red] Failed for {job.name}: {e}") + # Generate agent files for platforms that support them + console.print(" [dim]•[/dim] Generating agent files...") + try: + agent_paths = agent_generator.generate_agents(adapter, project_path) + if agent_paths: + stats["agents"] += len(agent_paths) + console.print(f" [green]✓[/green] Generated {len(agent_paths)} agent file(s)") + else: + console.print(" [dim]•[/dim] No agent templates for this platform") + except Exception as e: + console.print(f" [red]✗[/red] Failed to generate agents: {e}") + # Sync hooks to platform settings if job_hooks_list: console.print(" [dim]•[/dim] Syncing hooks...") @@ -171,6 +185,8 @@ def sync_commands(project_path: Path) -> None: table.add_row("Platforms synced", str(stats["platforms"])) table.add_row("Total commands", str(stats["commands"])) + if stats["agents"] > 0: + table.add_row("Agent files", str(stats["agents"])) if stats["hooks"] > 0: table.add_row("Hooks synced", str(stats["hooks"])) diff --git a/src/deepwork/core/agent_generator.py b/src/deepwork/core/agent_generator.py new file mode 100644 index 0000000..8f126f9 --- /dev/null +++ b/src/deepwork/core/agent_generator.py @@ -0,0 +1,128 @@ +"""Agent file generator for subagent definitions.""" + +from pathlib import Path + +from jinja2 import Environment, FileSystemLoader + +from deepwork.core.adapters import AgentAdapter 
+from deepwork.utils.fs import safe_write + + +class AgentGeneratorError(Exception): + """Exception raised for agent generation errors.""" + + pass + + +class AgentGenerator: + """Generates agent definition files for platforms that support them.""" + + def __init__(self, templates_dir: Path | str | None = None): + """ + Initialize agent generator. + + Args: + templates_dir: Path to templates directory + (defaults to package templates directory) + """ + if templates_dir is None: + # Use package templates directory + templates_dir = Path(__file__).parent.parent / "templates" + + self.templates_dir = Path(templates_dir) + + if not self.templates_dir.exists(): + raise AgentGeneratorError(f"Templates directory not found: {self.templates_dir}") + + def _get_agent_templates(self, adapter: AgentAdapter) -> list[Path]: + """ + Get list of agent template files for an adapter. + + Args: + adapter: Agent adapter + + Returns: + List of agent template file paths (empty if none exist) + """ + platform_templates_dir = adapter.get_template_dir(self.templates_dir) + agents_dir = platform_templates_dir / "agents" + + if not agents_dir.exists(): + return [] + + # Find all .j2 template files in the agents directory + agent_templates = list(agents_dir.glob("*.j2")) + return agent_templates + + def generate_agents( + self, + adapter: AgentAdapter, + project_path: Path | str, + ) -> list[Path]: + """ + Generate agent definition files for a platform. + + This creates agent files (e.g., .claude/agents/*.md) from templates + in the templates directory. Only platforms that support agents will + have agent templates. + + Args: + adapter: Agent adapter for the target platform + project_path: Path to project root + + Returns: + List of paths to generated agent files + + Raises: + AgentGeneratorError: If generation fails + """ + project_path = Path(project_path) + + # Get agent templates for this platform + agent_templates = self._get_agent_templates(adapter) + + if not agent_templates: + # No agent templates for this platform - that's okay + return [] + + # Create agents directory in platform config + platform_dir = project_path / adapter.config_dir + agents_dir = platform_dir / "agents" + agents_dir.mkdir(parents=True, exist_ok=True) + + # Setup Jinja environment + platform_templates_dir = adapter.get_template_dir(self.templates_dir) + agents_templates_dir = platform_templates_dir / "agents" + + env = Environment( + loader=FileSystemLoader(agents_templates_dir), + trim_blocks=True, + lstrip_blocks=True, + ) + + generated_paths: list[Path] = [] + + # Process each agent template + for template_path in agent_templates: + template_name = template_path.name + + # Remove .j2 extension for output file + output_filename = template_name[:-3] if template_name.endswith(".j2") else template_name + + try: + # Load and render template + template = env.get_template(template_name) + rendered = template.render() + + # Write agent file + agent_path = agents_dir / output_filename + + safe_write(agent_path, rendered) + generated_paths.append(agent_path) + + except Exception as e: + raise AgentGeneratorError( + f"Failed to generate agent {template_name}: {e}" + ) from e + + return generated_paths diff --git a/src/deepwork/standard_jobs/env_investigate/AGENTS.md b/src/deepwork/standard_jobs/env_investigate/AGENTS.md new file mode 100644 index 0000000..9acf5f0 --- /dev/null +++ b/src/deepwork/standard_jobs/env_investigate/AGENTS.md @@ -0,0 +1,247 @@ +# Project Context for env_investigate + +This is the source of truth for the 
`env_investigate` standard job - a production debugging workflow using Grafana MCP with isolated observability subagents. + +## Codebase Structure + +- Source location: `src/deepwork/standard_jobs/env_investigate/` +- Working copy: `.deepwork/jobs/env_investigate/` +- Templates: `templates/` directory within each location +- Subagent definitions: `src/deepwork/templates/claude/agents/` + +## Dual Location Maintenance + +**Important**: This job exists in two locations that must be kept in sync: + +1. **Source of truth**: `src/deepwork/standard_jobs/env_investigate/` + - This is where changes should be made first + - Tracked in version control + +2. **Working copy**: `.deepwork/jobs/env_investigate/` + - Must be updated after changes to source + - Used by `deepwork sync` to generate commands + +After making changes to the source, copy files to the working copy: +```bash +cp src/deepwork/standard_jobs/env_investigate/job.yml .deepwork/jobs/env_investigate/ +cp src/deepwork/standard_jobs/env_investigate/steps/*.md .deepwork/jobs/env_investigate/steps/ +cp -r src/deepwork/standard_jobs/env_investigate/templates/* .deepwork/jobs/env_investigate/templates/ 2>/dev/null || true +``` + +## File Organization + +``` +env_investigate/ +├── AGENTS.md # This file +├── job.yml # Job definition +├── make_new_job.sh # Script to create investigation structure +├── steps/ +│ ├── triage.md # Define investigation scope +│ ├── alert_check.md # Query Alertmanager via subagent +│ ├── metrics_analysis.md # Query Prometheus via subagent +│ ├── log_investigation.md # Query Loki via subagent +│ ├── root_cause.md # Synthesize findings +│ └── remediation.md # Create action plan +└── templates/ + ├── triage.md.template # Investigation scope template + ├── alerts.md.template # Alerts summary template + ├── metrics.md.template # Metrics summary template + ├── logs.md.template # Logs summary template + ├── root_cause.md.template # Root cause template + ├── timeline.md.template # Timeline template + └── remediation.md.template # Remediation plan template +``` + +## Observability Subagent Meta-Framework + +This job demonstrates a **meta-framework pattern** for working with observability tools that return large amounts of data. The pattern prevents context bloat by delegating all raw data queries to isolated subagents. + +### The Problem + +Observability queries (especially logs) can return massive amounts of data: +- Prometheus: 1000+ data points per metric +- Alertmanager: 50+ alerts with full payloads +- Loki: 10,000+ log lines with stack traces + +This data quickly overwhelms the agent's context window, making iterative investigation impossible. + +### The Solution: Isolated Subagents + +**Key Principle**: Raw observability data never enters the main investigation context. + +Instead, specialized analyst subagents: +1. Receive specific query requirements from main agent +2. Query observability tools directly (Grafana MCP) +3. Apply strict output contracts (max data points, truncation) +4. 
Return only structured YAML summaries + +### Subagent Definitions + +Three analyst subagents for Claude Code (in `src/deepwork/templates/claude/agents/`): + +| Subagent | Tool | Output Contract | Files | +|----------|------|-----------------|-------| +| `alertmanager-analyst` | `mcp__grafana__get_alerts` | Max 10 alerts, grouped by severity | `.claude/agents/alertmanager-analyst.md` | +| `prometheus-analyst` | `mcp__grafana__query_prometheus` | Max 10 data points per metric, trend summaries | `.claude/agents/prometheus-analyst.md` | +| `loki-analyst` | `mcp__grafana__query_loki` | Max 5 log lines, 200 char truncation | `.claude/agents/loki-analyst.md` | + +**Platform Support**: +- **Claude Code**: Full support via Task tool + `.claude/agents/*.md` definitions +- **Gemini CLI**: Inline summarization rules (no isolated subagents available) + +### How Subagents Work (Claude Code) + +Step instructions delegate queries to subagents: + +```markdown +### For Claude Code Users (Recommended) + +Use the Task tool with the `alertmanager-analyst` subagent: + +\`\`\` +Use the Task tool to spawn the alertmanager-analyst subagent with this prompt: + +"Query Alertmanager for alerts related to the investigation in triage.md. +Focus on: +- Time range: [from triage.md] +- Services: [from triage.md] + +Return a YAML summary with: +- Total alert count +- Alerts grouped by severity +- Top 10 most relevant alerts +- Notable patterns + +Max 10 alerts in output." + +Wait for the subagent to complete and return its YAML summary. +\`\`\` +``` + +The subagent returns structured YAML (never raw data): +```yaml +summary: + total_alerts: 23 + critical: 5 +alerts: + - name: HighErrorRate + severity: critical + started_at: 2026-01-16T14:23:15Z + # ... max 10 total +patterns: + - pattern: "Multiple services showing error spikes" + count: 18 +``` + +### Adapting This Pattern + +To adapt this meta-framework for other observability tools: + +1. **Identify high-volume data sources** (e.g., traces, events, databases) +2. **Create subagent templates** in `src/deepwork/templates/claude/agents/` +3. **Define strict output contracts** (max items, truncation rules) +4. **Update step instructions** to delegate queries +5. **Add fallback rules** for platforms without subagent support + +Example subagent template structure: +```markdown +# [Tool Name] Analyst + +**Role**: Query [tool] and return structured summaries + +**Tools**: `mcp__[tool]__[method]` + +**Output Contract**: Max [N] items, [format], [truncation rules] + +**Critical Rules**: +1. Max [N] items in output +2. Truncate to [X] chars +3. Group similar items +4. Return YAML only +``` + +## Workflow Pattern + +The 6-step investigation workflow follows a structured pattern: + +1. **Triage** (manual) - Define scope, no external queries +2. **Alert Check** (subagent) - Delegate to alertmanager-analyst +3. **Metrics Analysis** (subagent) - Delegate to prometheus-analyst +4. **Log Investigation** (subagent) - Delegate to loki-analyst +5. **Root Cause** (synthesis) - Use summaries from steps 1-4 +6. 
**Remediation** (planning) - Create action plan with monitoring improvements + +**Key Characteristics**: +- Each step produces a markdown artifact +- Steps 2-4 use subagents (or inline limits for Gemini) +- Steps 5-6 synthesize without additional queries +- All artifacts stay in main context (summaries only) + +## Version Management + +- Version is tracked in `job.yml` +- Bump patch version (1.0.x) for instruction improvements +- Bump minor version (1.x.0) for new steps or subagent changes +- Bump major version (2.0.0) for breaking workflow changes +- Always update changelog when bumping version + +## Subagent Template Updates + +When updating subagent templates in `src/deepwork/templates/claude/agents/`: +1. Modify the `.j2` template file +2. Run `deepwork sync` to regenerate `.claude/agents/*.md` +3. Test with a sample investigation +4. Update this AGENTS.md if output contracts change + +## Dependencies + +- **Required**: Grafana MCP server configured with datasources + - Prometheus (metrics) + - Alertmanager (alerts) + - Loki (logs) +- **Recommended**: Claude Code for full subagent support +- **Alternative**: Gemini CLI with inline summarization + +## Known Issues and Workarounds + +- **Issue**: Gemini CLI doesn't support isolated subagents + - **Workaround**: Step instructions include inline output limits and manual summarization rules + +- **Issue**: Very large time ranges can still return too much data + - **Workaround**: Triage step instructs users to narrow time ranges to 1-24 hours + +- **Issue**: Stack traces in logs can exceed 200 char limit + - **Workaround**: loki-analyst truncates and summarizes (e.g., "Java NPE in Handler.process:234") + +## Example Investigation Flow + +```bash +# 1. Start investigation +/env_investigate.triage +# Creates: triage.md (scope, time range, services) + +# 2. Check alerts (spawns alertmanager-analyst subagent) +/env_investigate.alert_check +# Creates: alerts.md (max 10 alerts, patterns) + +# 3. Analyze metrics (spawns prometheus-analyst subagent) +/env_investigate.metrics_analysis +# Creates: metrics.md (max 10 metrics, trends) + +# 4. Investigate logs (spawns loki-analyst subagent) +/env_investigate.log_investigation +# Creates: logs.md (max 5 logs, patterns) + +# 5. Determine root cause (synthesis) +/env_investigate.root_cause +# Creates: root_cause.md, timeline.md + +# 6. Plan remediation +/env_investigate.remediation +# Creates: remediation.md (actions, monitoring improvements) +``` + +## Last Updated + +- Date: 2026-01-17 +- From conversation about: Restructuring to follow standard job pattern with AGENTS.md explaining the subagent meta-framework diff --git a/src/deepwork/standard_jobs/env_investigate/job.yml b/src/deepwork/standard_jobs/env_investigate/job.yml new file mode 100644 index 0000000..d4155fd --- /dev/null +++ b/src/deepwork/standard_jobs/env_investigate/job.yml @@ -0,0 +1,118 @@ +name: env_investigate +version: "1.0.0" +summary: "Investigate deployed environment issues via isolated observability subagents" +description: | + Multi-step investigation workflow for debugging production issues using Grafana MCP + (Prometheus, Alertmanager, Loki). Delegates all observability queries to isolated + subagents to prevent context bloat from large log volumes. + + This workflow structures the investigation into discrete phases with clear artifacts, + ensuring that only structured summaries (not raw query results) are returned to the + main context. 
Particularly useful when dealing with high-volume log data that would + otherwise overwhelm the agent's context window. + + **Requirements:** + - Grafana MCP server configured with Prometheus, Loki, and Alertmanager datasources + - Claude Code for full subagent support (recommended) + - For Gemini CLI: manual summarization guidelines are included in step instructions + +changelog: + - version: "1.0.0" + changes: "Initial release with 6-step investigation workflow" + +steps: + - id: triage + name: "Triage & Scope" + description: "Define investigation scope and gather initial context" + instructions_file: steps/triage.md + inputs: + - name: issue_description + description: "Description of the production issue to investigate" + - name: affected_services + description: "Services or components affected (optional)" + - name: time_range + description: "Time range to investigate (e.g., 'last 1 hour', '2026-01-16 14:00 to 15:00')" + outputs: + - triage.md + dependencies: [] + + - id: alert_check + name: "Check Alerts" + description: "Query Alertmanager for active and recent alerts" + instructions_file: steps/alert_check.md + inputs: + - file: triage.md + from_step: triage + outputs: + - alerts.md + dependencies: + - triage + + - id: metrics_analysis + name: "Metrics Analysis" + description: "Query Prometheus for relevant metrics and trends" + instructions_file: steps/metrics_analysis.md + inputs: + - file: triage.md + from_step: triage + - file: alerts.md + from_step: alert_check + outputs: + - metrics.md + dependencies: + - triage + - alert_check + + - id: log_investigation + name: "Log Investigation" + description: "Query Loki for relevant log entries and patterns" + instructions_file: steps/log_investigation.md + inputs: + - file: triage.md + from_step: triage + - file: alerts.md + from_step: alert_check + - file: metrics.md + from_step: metrics_analysis + outputs: + - logs.md + dependencies: + - triage + - alert_check + - metrics_analysis + + - id: root_cause + name: "Root Cause Analysis" + description: "Synthesize findings to identify root cause" + instructions_file: steps/root_cause.md + inputs: + - file: triage.md + from_step: triage + - file: alerts.md + from_step: alert_check + - file: metrics.md + from_step: metrics_analysis + - file: logs.md + from_step: log_investigation + outputs: + - root_cause.md + - timeline.md + dependencies: + - triage + - alert_check + - metrics_analysis + - log_investigation + + - id: remediation + name: "Remediation Plan" + description: "Create actionable remediation plan" + instructions_file: steps/remediation.md + inputs: + - file: root_cause.md + from_step: root_cause + - file: timeline.md + from_step: root_cause + outputs: + - remediation.md + dependencies: + - root_cause diff --git a/src/deepwork/standard_jobs/env_investigate/make_new_job.sh b/src/deepwork/standard_jobs/env_investigate/make_new_job.sh new file mode 100755 index 0000000..4bdbab8 --- /dev/null +++ b/src/deepwork/standard_jobs/env_investigate/make_new_job.sh @@ -0,0 +1,153 @@ +#!/usr/bin/env bash +# +# make_new_job.sh - Create directory structure for a new investigation +# +# Usage: ./make_new_job.sh +# + +set -euo pipefail + +# Color output helpers +RED='\033[0;31m' +GREEN='\033[0;32m' +YELLOW='\033[1;33m' +NC='\033[0m' # No Color + +info() { + echo -e "${GREEN}[INFO]${NC} $1" +} + +warn() { + echo -e "${YELLOW}[WARN]${NC} $1" +} + +error() { + echo -e "${RED}[ERROR]${NC} $1" + exit 1 +} + +# Validate investigation name format +validate_investigation_name() { + local name="$1" + if [[ ! 
"$name" =~ ^[a-z][a-z0-9_-]*$ ]]; then + error "Invalid investigation name '$name'. Must be lowercase, start with a letter, and contain only letters, numbers, underscores, and hyphens." + fi +} + +# Main script +main() { + if [[ $# -lt 1 ]]; then + echo "Usage: $0 " + echo "" + echo "Creates the directory structure for a new production investigation." + echo "" + echo "Arguments:" + echo " investigation_name Name of the investigation (lowercase, hyphens/underscores allowed)" + echo "" + echo "Example:" + echo " $0 api-outage-2026-01-16" + exit 1 + fi + + local investigation_name="$1" + validate_investigation_name "$investigation_name" + + # Determine base directory + local base_dir + if [[ -d ".deepwork/jobs" ]]; then + base_dir=".deepwork/jobs" + elif [[ -d "../.." && -d "../../.deepwork/jobs" ]]; then + base_dir="../../.deepwork/jobs" + else + error "Could not find .deepwork/jobs directory. Run this from project root or env_investigate directory." + fi + + local investigation_dir="${base_dir}/${investigation_name}" + + # Check if directory exists + if [[ -d "$investigation_dir" ]]; then + error "Investigation directory already exists: $investigation_dir" + fi + + # Create directory structure + info "Creating investigation directory structure..." + mkdir -p "${investigation_dir}" + mkdir -p "${investigation_dir}/artifacts" + + info "Created: ${investigation_dir}/" + info "Created: ${investigation_dir}/artifacts/" + + # Create README with investigation tracking + cat > "${investigation_dir}/README.md" << EOFREADME +# Investigation: ${investigation_name} + +## Status +- **Created**: $(date -u +"%Y-%m-%dT%H:%M:%SZ") +- **Status**: In Progress +- **Investigator**: [Name] + +## Quick Links +- [Triage Document](artifacts/triage.md) +- [Alerts Analysis](artifacts/alerts.md) +- [Metrics Analysis](artifacts/metrics.md) +- [Log Investigation](artifacts/logs.md) +- [Root Cause](artifacts/root_cause.md) +- [Remediation Plan](artifacts/remediation.md) + +## Investigation Steps + +### 1. Triage & Scope +\`\`\`bash +/env_investigate.triage +\`\`\` +Creates: \`artifacts/triage.md\` + +### 2. Check Alerts +\`\`\`bash +/env_investigate.alert_check +\`\`\` +Creates: \`artifacts/alerts.md\` + +### 3. Analyze Metrics +\`\`\`bash +/env_investigate.metrics_analysis +\`\`\` +Creates: \`artifacts/metrics.md\` + +### 4. Investigate Logs +\`\`\`bash +/env_investigate.log_investigation +\`\`\` +Creates: \`artifacts/logs.md\` + +### 5. Root Cause Analysis +\`\`\`bash +/env_investigate.root_cause +\`\`\` +Creates: \`artifacts/root_cause.md\`, \`artifacts/timeline.md\` + +### 6. Remediation Plan +\`\`\`bash +/env_investigate.remediation +\`\`\` +Creates: \`artifacts/remediation.md\` + +## Notes +[Add investigation-specific notes here] +EOFREADME + + info "Created: ${investigation_dir}/README.md" + + # Success message + echo "" + info "Investigation directory created successfully!" + echo "" + echo "Next steps:" + echo " 1. cd ${investigation_dir}" + echo " 2. Run: /env_investigate.triage" + echo " 3. 
Follow the 6-step investigation workflow" + echo "" + echo "All artifacts will be created in: ${investigation_dir}/artifacts/" +} + +main "$@" diff --git a/src/deepwork/standard_jobs/env_investigate/steps/alert_check.md b/src/deepwork/standard_jobs/env_investigate/steps/alert_check.md new file mode 100644 index 0000000..41ad4b8 --- /dev/null +++ b/src/deepwork/standard_jobs/env_investigate/steps/alert_check.md @@ -0,0 +1,130 @@ +# Check Alerts + +## Objective + +Query Alertmanager for active and recent alerts related to the investigation scope. This step uses a specialized subagent to prevent alert data from bloating the main context. + +## Task + +**CRITICAL**: Do NOT query Grafana MCP tools directly in this step. Instead, delegate all alert queries to an isolated subagent. + +### For Claude Code Users (Recommended) + +Use the Task tool with the `alertmanager-analyst` subagent: + +``` +Use the Task tool to spawn the alertmanager-analyst subagent with this prompt: + +"Query Alertmanager for alerts related to the investigation in triage.md. +Focus on: +- Time range: [from triage.md] +- Services: [from triage.md] + +Return a YAML summary with: +- Total alert count +- Alerts grouped by severity (critical, warning, info) +- Top 10 most relevant alerts with: name, severity, status, start time, labels +- Notable patterns or correlations + +Max 10 alerts in output. If more exist, summarize the patterns." + +Wait for the subagent to complete and return its YAML summary. +``` + +The `alertmanager-analyst` subagent is defined in `.claude/agents/alertmanager-analyst.md` and has access to the `mcp__grafana__get_alerts` tool. It will return a structured summary, NOT raw alert data. + +### For Gemini CLI Users (Fallback) + +Since Gemini CLI does not support isolated subagents, you MUST manually apply strict output limits: + +1. Read `triage.md` to understand the scope +2. Query Alertmanager using `mcp__grafana__get_alerts` with appropriate filters +3. **CRITICAL OUTPUT LIMITS**: + - Return max 10 alerts total + - Group by severity (critical, warning, info) + - Summarize patterns rather than listing every alert + - Truncate long alert messages to 100 characters +4. Do NOT include full alert payloads in your response + +### Step-by-Step Process + +1. **Read Triage Context** + - Read `triage.md` to understand time range, affected services, and investigation scope + +2. **Delegate to Subagent (Claude Code) OR Apply Limits (Gemini)** + - **Claude Code**: Use Task tool with `alertmanager-analyst` subagent + - **Gemini CLI**: Query directly with strict output limits + +3. **Review Alert Summary** + - Verify the summary is structured and concise + - Check that it answers: Are there active alerts? What patterns exist? + - Ensure no raw alert JSON is present + +4. **Create Alerts Document** + +Create `alerts.md` with this structure: + +```markdown +# Alertmanager Analysis + +**Investigation**: [from triage.md] +**Time Range**: [from triage.md] +**Query Time**: [current timestamp] + +## Summary + +- **Total Alerts**: [count] +- **Critical**: [count] +- **Warning**: [count] +- **Info**: [count] + +## Critical Alerts (Top Priority) + +| Alert Name | Status | Started | Labels | Description | +|------------|--------|---------|--------|-------------| +| [name] | [firing/resolved] | [timestamp] | [key=value] | [brief description] | +... 
+ +## Warning Alerts + +| Alert Name | Status | Started | Labels | Description | +|------------|--------|---------|--------|-------------| +| [name] | [firing/resolved] | [timestamp] | [key=value] | [brief description] | +... + +## Patterns & Insights + +- [Pattern 1: e.g., "All critical alerts are for service-x"] +- [Pattern 2: e.g., "Alerts started at same time as incident"] +- [Pattern 3: e.g., "No alerts for service-y despite issue"] + +## Correlation with Issue + +[Brief analysis of how alerts relate to the reported issue] + +## Next Steps + +- Investigate metrics for alerted components (metrics_analysis step) +- Cross-reference alert timing with log patterns (log_investigation step) +``` + +## Quality Criteria + +Before completing this step, verify: + +1. **Subagent Used (Claude) OR Limits Applied (Gemini)**: Alert queries were delegated or strictly limited +2. **Structured Output**: Alerts are in a table format, not raw JSON +3. **Max 10 Alerts**: No more than 10 alerts are detailed (others are summarized) +4. **Severity Grouped**: Alerts are grouped by severity level +5. **Patterns Identified**: Summary includes notable patterns or correlations +6. **No Context Bloat**: Main context does not contain raw alert payloads +7. **Document Created**: `alerts.md` file exists and is well-formatted + +## Output + +- `alerts.md` - Structured summary of Alertmanager alerts (max 10 detailed, rest summarized) + +## Platform Notes + +- **Claude Code**: The `alertmanager-analyst` subagent will be automatically available in `.claude/agents/` after `deepwork sync` +- **Gemini CLI**: No subagent support; instructions include inline summarization rules diff --git a/src/deepwork/standard_jobs/env_investigate/steps/log_investigation.md b/src/deepwork/standard_jobs/env_investigate/steps/log_investigation.md new file mode 100644 index 0000000..aab20aa --- /dev/null +++ b/src/deepwork/standard_jobs/env_investigate/steps/log_investigation.md @@ -0,0 +1,200 @@ +# Log Investigation + +## Objective + +Query Loki for relevant log entries to identify errors, patterns, and root causes. This step uses a specialized subagent to prevent log data from bloating the main context. + +## Task + +**CRITICAL**: Do NOT query Grafana MCP tools directly in this step. Instead, delegate all log queries to an isolated subagent. Logs are the highest risk for context bloat. + +### For Claude Code Users (Recommended) + +Use the Task tool with the `loki-analyst` subagent: + +``` +Use the Task tool to spawn the loki-analyst subagent with this prompt: + +"Query Loki for logs related to the investigation: +- Time range: [from triage.md] +- Services: [from triage.md] +- Focus on timestamps from: [anomaly times from metrics.md] +- Keywords: [error codes, service names from alerts.md] + +Return a YAML summary with: +- Total log entries found +- Log patterns identified (e.g., error types, frequency) +- Max 5 representative log lines (not full logs) +- Each log line must be truncated to 200 chars max +- Error categories and counts +- Timeline of key events + +Format: + patterns: + - pattern: [description] + count: [number] + sample: [truncated log line] + + timeline: + - timestamp: [time] + event: [brief description] + source: [service] + + representative_logs: + - [truncated log 1, max 200 chars] + - [truncated log 2, max 200 chars] + - ... (max 5 total) + +Max 5 log lines in output. Summarize patterns instead of listing every log." + +Wait for the subagent to complete and return its YAML summary. 
+``` + +The `loki-analyst` subagent is defined in `.claude/agents/loki-analyst.md` and has access to the `mcp__grafana__query_loki` tool. It will return a structured summary with patterns, NOT raw log dumps. + +### For Gemini CLI Users (Fallback) + +Since Gemini CLI does not support isolated subagents, you MUST manually apply STRICT output limits: + +1. Read `triage.md`, `alerts.md`, and `metrics.md` to understand scope and focus +2. Query Loki using `mcp__grafana__query_loki` with appropriate LogQL queries +3. **CRITICAL OUTPUT LIMITS**: + - Return max 5 log lines total + - Truncate each log line to 200 characters max + - Focus on error patterns and counts, not individual logs + - Describe patterns instead of listing many similar logs + - Use timestamp sampling (start, anomaly times, end) +4. Do NOT include full log dumps, stack traces, or verbose logs + +### Step-by-Step Process + +1. **Read Context** + - Read `triage.md` for time range and affected services + - Read `alerts.md` for error indicators + - Read `metrics.md` for anomaly timestamps to focus on + +2. **Identify Log Query Focus** + - Service names from triage + - Error keywords from alerts + - Timestamp windows around metric anomalies + - Specific error codes or patterns mentioned in investigation + +3. **Delegate to Subagent (Claude Code) OR Apply Limits (Gemini)** + - **Claude Code**: Use Task tool with `loki-analyst` subagent + - **Gemini CLI**: Query directly with STRICT output limits + +4. **Review Log Summary** + - Verify max 5 log lines are included + - Check that logs are truncated to 200 chars + - Ensure patterns are summarized, not listed exhaustively + +5. **Create Logs Document** + +Create `logs.md` with this structure: + +```markdown +# Loki Log Analysis + +**Investigation**: [from triage.md] +**Time Range**: [from triage.md] +**Query Time**: [current timestamp] + +## Summary + +- **Total Log Entries**: [approximate count] +- **Services Queried**: [list] +- **Primary Focus**: [error types, time windows, etc.] + +## Log Patterns + +| Pattern | Count | Severity | First Seen | Last Seen | +|---------|-------|----------|------------|-----------| +| [e.g., "Connection timeout to DB"] | [count] | [ERROR/WARN] | [timestamp] | [timestamp] | +| [e.g., "OOM killer invoked"] | [count] | [CRITICAL] | [timestamp] | [timestamp] | +... + +## Timeline of Key Events + +| Timestamp | Event | Service | Severity | +|-----------|-------|---------|----------| +| [time] | [brief event description] | [service] | [level] | +| [time] | [brief event description] | [service] | [level] | +... + +## Representative Log Entries (Max 5) + +### 1. [Brief description] +**Timestamp**: [time] +**Service**: [name] +**Level**: [ERROR/WARN/INFO] +``` +[Truncated log line, max 200 chars]... +``` + +### 2. [Brief description] +**Timestamp**: [time] +**Service**: [name] +**Level**: [ERROR/WARN/INFO] +``` +[Truncated log line, max 200 chars]... +``` + +[Repeat for max 5 total] + +## Error Categories + +### [Category 1: e.g., Database Errors] +- **Count**: [number] +- **Pattern**: [description] +- **Impact**: [brief note] + +### [Category 2: e.g., Network Errors] +- **Count**: [number] +- **Pattern**: [description] +- **Impact**: [brief note] + +... + +## Cross-Reference with Metrics + +- **Correlation 1**: [e.g., "Error spike at 14:23 matches CPU spike"] +- **Correlation 2**: [e.g., "No logs between 14:20-14:22 suggests service crash"] +... + +## Insights + +[1-2 paragraph analysis of what the logs reveal] + +Key findings: +1. [Finding 1] +2. [Finding 2] +3. 
[Finding 3] + +## Next Steps + +- Synthesize findings into root cause analysis (root_cause step) +- Create timeline of incident progression +``` + +## Quality Criteria + +Before completing this step, verify: + +1. **Subagent Used (Claude) OR Limits Applied (Gemini)**: Log queries were delegated or STRICTLY limited +2. **Max 5 Logs**: No more than 5 log lines are included in detail +3. **Truncated Logs**: Each log line is truncated to 200 characters max +4. **Patterns Summarized**: Similar logs are grouped and counted, not listed individually +5. **Timeline Created**: Key events are organized chronologically +6. **Cross-Referenced**: Logs are correlated with alerts and metrics +7. **No Context Bloat**: Main context does not contain log dumps or stack traces +8. **Document Created**: `logs.md` file exists and is well-formatted + +## Output + +- `logs.md` - Structured summary of Loki logs with patterns and representative samples (max 5 logs, 200 chars each) + +## Platform Notes + +- **Claude Code**: The `loki-analyst` subagent will be automatically available in `.claude/agents/` after `deepwork sync` +- **Gemini CLI**: No subagent support; instructions include inline summarization rules +- **Warning**: Loki logs are the highest risk for context bloat. Strict limits are essential. diff --git a/src/deepwork/standard_jobs/env_investigate/steps/metrics_analysis.md b/src/deepwork/standard_jobs/env_investigate/steps/metrics_analysis.md new file mode 100644 index 0000000..986fd92 --- /dev/null +++ b/src/deepwork/standard_jobs/env_investigate/steps/metrics_analysis.md @@ -0,0 +1,173 @@ +# Metrics Analysis + +## Objective + +Query Prometheus for relevant metrics to identify trends, anomalies, and correlations. This step uses a specialized subagent to prevent metric data from bloating the main context. + +## Task + +**CRITICAL**: Do NOT query Grafana MCP tools directly in this step. Instead, delegate all metric queries to an isolated subagent. + +### For Claude Code Users (Recommended) + +Use the Task tool with the `prometheus-analyst` subagent: + +``` +Use the Task tool to spawn the prometheus-analyst subagent with this prompt: + +"Query Prometheus for metrics related to the investigation: +- Time range: [from triage.md] +- Services: [from triage.md] +- Focus areas: [from alerts.md - metrics mentioned in alerts] + +Return a YAML summary with: +- Key metrics queried (with PromQL) +- Trend summaries (increasing/decreasing/stable) +- Max 10 data points per metric (start, middle, end values) +- Notable anomalies or spikes +- Correlations between metrics + +Format each metric section as: + metric_name: + query: [PromQL] + trend: [increasing/decreasing/stable/spike] + values: [max 10 sample points] + analysis: [brief interpretation] + +Max 10 metrics total. If more are relevant, prioritize by importance." + +Wait for the subagent to complete and return its YAML summary. +``` + +The `prometheus-analyst` subagent is defined in `.claude/agents/prometheus-analyst.md` and has access to the `mcp__grafana__query_prometheus` tool. It will return a structured summary with trends, NOT raw time series data. + +### For Gemini CLI Users (Fallback) + +Since Gemini CLI does not support isolated subagents, you MUST manually apply strict output limits: + +1. Read `triage.md` and `alerts.md` to understand scope and focus areas +2. Query Prometheus using `mcp__grafana__query_prometheus` with appropriate PromQL queries +3. 
**CRITICAL OUTPUT LIMITS**: + - Query max 10 metrics total + - Return max 10 data points per metric (sample key points, not every value) + - Describe trends (increasing/decreasing/stable/spike) instead of listing all values + - Focus on anomalies and changes, not steady-state data +4. Do NOT include full time series in your response + +### Step-by-Step Process + +1. **Read Context** + - Read `triage.md` for time range and affected services + - Read `alerts.md` for metrics mentioned in alerts + +2. **Identify Key Metrics** + - Determine which metrics to query based on: + - Services mentioned in triage + - Metrics referenced in alerts + - Common metrics: CPU, memory, request rate, error rate, latency + - Prioritize metrics most likely to reveal root cause + +3. **Delegate to Subagent (Claude Code) OR Apply Limits (Gemini)** + - **Claude Code**: Use Task tool with `prometheus-analyst` subagent + - **Gemini CLI**: Query directly with strict output limits + +4. **Review Metrics Summary** + - Verify the summary includes trends and anomalies + - Check that data points are sampled (not exhaustive) + - Ensure no raw time series data is present + +5. **Create Metrics Document** + +Create `metrics.md` with this structure: + +```markdown +# Prometheus Metrics Analysis + +**Investigation**: [from triage.md] +**Time Range**: [from triage.md] +**Query Time**: [current timestamp] + +## Key Metrics Overview + +| Metric | Trend | Anomaly | Correlation with Issue | +|--------|-------|---------|------------------------| +| [name] | [increasing/decreasing/stable/spike] | [yes/no] | [brief note] | +... + +## Detailed Metrics + +### [Metric 1: e.g., CPU Usage] + +**PromQL**: `[query]` + +**Trend**: [increasing/decreasing/stable/spike] + +**Sample Values** (max 10 points): +- [timestamp]: [value] +- [timestamp]: [value] +- ... + +**Analysis**: [Brief interpretation - what does this tell us?] + +--- + +### [Metric 2: e.g., Request Error Rate] + +**PromQL**: `[query]` + +**Trend**: [increasing/decreasing/stable/spike] + +**Sample Values** (max 10 points): +- [timestamp]: [value] +- [timestamp]: [value] +- ... + +**Analysis**: [Brief interpretation] + +--- + +[Repeat for up to 10 metrics total] + +## Anomalies & Correlations + +### Identified Anomalies +1. [Anomaly 1: e.g., "CPU spiked to 95% at 14:23:15"] +2. [Anomaly 2: e.g., "Error rate jumped 10x starting at 14:22:30"] +... + +### Cross-Metric Correlations +- [Correlation 1: e.g., "CPU spike coincides with error rate increase"] +- [Correlation 2: e.g., "Memory usage flat despite CPU spike"] +... + +## Insights + +[1-2 paragraph analysis of what the metrics reveal about the issue] + +## Next Steps + +- Investigate logs around anomaly timestamps (log_investigation step) +- Cross-reference metrics with alert timing +``` + +## Quality Criteria + +Before completing this step, verify: + +1. **Subagent Used (Claude) OR Limits Applied (Gemini)**: Metric queries were delegated or strictly limited +2. **Sampled Data**: Max 10 data points per metric (not full time series) +3. **Trend Summaries**: Each metric has a trend description (increasing/decreasing/stable/spike) +4. **Max 10 Metrics**: No more than 10 metrics are analyzed in detail +5. **Anomalies Highlighted**: Notable spikes, drops, or changes are called out +6. **Correlations Noted**: Relationships between metrics are identified +7. **No Context Bloat**: Main context does not contain raw time series data +8. 
**Document Created**: `metrics.md` file exists and is well-formatted + +## Output + +- `metrics.md` - Structured summary of Prometheus metrics with trends and anomalies (max 10 metrics, 10 points each) + +## Platform Notes + +- **Claude Code**: The `prometheus-analyst` subagent will be automatically available in `.claude/agents/` after `deepwork sync` +- **Gemini CLI**: No subagent support; instructions include inline summarization rules diff --git a/src/deepwork/standard_jobs/env_investigate/steps/remediation.md b/src/deepwork/standard_jobs/env_investigate/steps/remediation.md new file mode 100644 index 0000000..9e728ae --- /dev/null +++ b/src/deepwork/standard_jobs/env_investigate/steps/remediation.md @@ -0,0 +1,332 @@ +# Remediation Plan + +## Objective + +Create a comprehensive, actionable remediation plan based on the root cause analysis. This plan should address immediate fixes, long-term prevention, and monitoring improvements. + +## Task + +Develop a structured remediation plan that can be executed by the team to resolve the issue and prevent recurrence. + +### Step-by-Step Process + +1. **Review Root Cause** + - Read `root_cause.md` to understand what failed and why + - Read `timeline.md` to understand the incident progression + - Identify contributing factors and weaknesses + +2. **Develop Remediation Strategy** + +Create a multi-layered approach: + +#### Immediate Actions (0-24 hours) +- Emergency fixes to restore service +- Workarounds for user impact +- Communication to stakeholders + +#### Short-Term Fixes (1-7 days) +- Code fixes or configuration changes +- Infrastructure adjustments +- Process improvements + +#### Long-Term Prevention (1-4 weeks) +- Architectural changes +- Automation improvements +- Capacity planning + +#### Monitoring Enhancements +- New alerts to detect earlier +- Additional metrics to track +- Dashboard improvements +- Grafana MCP integration enhancements + +3. **Prioritize Actions** + +For each action, specify: +- **Priority**: P0 (critical), P1 (high), P2 (medium), P3 (low) +- **Owner**: Team or individual responsible +- **Timeline**: Expected completion date +- **Dependencies**: What must happen first +- **Success Criteria**: How to verify completion + +4. **Plan Monitoring Improvements** + +Based on evidence gaps from `root_cause.md`, specify: + +**Grafana MCP Enhancements**: +- New Prometheus metrics to add +- Additional Loki log streams +- Alertmanager rules to create +- Missing labels or tags to add + +**Observability Gaps to Address**: +- Services lacking proper instrumentation +- Blind spots in metric coverage +- Log verbosity adjustments +- Trace spans to add + +5. **Create Remediation Document** + +Create `remediation.md` with this structure: + +```markdown +# Remediation Plan + +**Investigation**: [from triage.md] +**Root Cause**: [brief summary from root_cause.md] +**Plan Created**: [current timestamp] + +## Executive Summary + +[1-2 paragraph summary of the remediation approach] + +## Immediate Actions (P0 - Next 24 Hours) + +### Action 1: [Title] +**Priority**: P0 +**Owner**: [Team/Person] +**Timeline**: [Hours/Date] +**Status**: [Not Started/In Progress/Complete] + +**Description**: [What needs to be done] + +**Steps**: +1. [Specific step 1] +2. [Specific step 2] +3. 
[Specific step 3] + +**Success Criteria**: [How to verify this is complete] + +**Dependencies**: [None or list dependencies] + +--- + +[Repeat for each P0 action] + +## Short-Term Fixes (P1 - Next 7 Days) + +### Action 1: [Title] +**Priority**: P1 +**Owner**: [Team/Person] +**Timeline**: [Date] + +**Description**: [What needs to be done] + +**Implementation Plan**: +1. [Step 1] +2. [Step 2] + +**Testing Plan**: [How to validate the fix] + +**Rollback Plan**: [What to do if this causes issues] + +**Success Criteria**: [How to verify this is complete] + +--- + +[Repeat for each P1 action] + +## Long-Term Prevention (P2 - Next 4 Weeks) + +### Action 1: [Title] +**Priority**: P2 +**Owner**: [Team/Person] +**Timeline**: [Date] + +**Description**: [What needs to be done] + +**Why This Prevents Recurrence**: [Connection to root cause] + +**Implementation Notes**: [Technical details, considerations] + +**Success Criteria**: [How to verify this is complete] + +--- + +[Repeat for each P2 action] + +## Monitoring & Observability Improvements + +### Grafana MCP Enhancements + +#### New Prometheus Metrics +- **Metric 1**: `[metric_name]` + - **Type**: [counter/gauge/histogram] + - **Labels**: [label1, label2] + - **Purpose**: [Why this metric helps] + - **Alert Threshold**: [When to alert] + +- **Metric 2**: `[metric_name]` + - **Type**: [counter/gauge/histogram] + - **Labels**: [label1, label2] + - **Purpose**: [Why this metric helps] + - **Alert Threshold**: [When to alert] + +#### New Loki Log Streams +- **Service 1**: [service_name] + - **Current Gap**: [What's missing in logs] + - **Add**: [What log lines to add] + - **Labels**: [label1, label2] + - **Format**: [JSON/text] + +- **Service 2**: [service_name] + - **Current Gap**: [What's missing] + - **Add**: [What to add] + +#### New Alertmanager Rules +- **Alert 1**: `[AlertName]` + - **Condition**: [PromQL query] + - **Severity**: [critical/warning/info] + - **For**: [duration - e.g., 5m] + - **Labels**: [labels to add] + - **Annotation**: [description template] + - **Why Needed**: [How this would have caught the issue earlier] + +- **Alert 2**: `[AlertName]` + - **Condition**: [PromQL query] + - **Severity**: [critical/warning/info] + - **For**: [duration] + - **Why Needed**: [Justification] + +### Dashboard Improvements +- **Dashboard 1**: [Name] + - **Add Panels**: [List of new panels] + - **Purpose**: [What this helps monitor] + +- **Dashboard 2**: [Name] + - **Add Panels**: [List] + - **Purpose**: [What this helps monitor] + +### Detection Time Improvements + +**Current State**: +- Time to first alert: [from timeline.md] +- Time to detection: [from timeline.md] + +**Target State**: +- Time to first alert: [goal] +- Time to detection: [goal] + +**How We'll Achieve This**: +1. [Improvement 1] +2. [Improvement 2] + +## Communication Plan + +### Internal Communication +- **Team Notification**: [When and how to notify team] +- **Post-Mortem**: [When to schedule, who to invite] +- **Runbook Updates**: [What documentation to update] + +### External Communication (if applicable) +- **User Communication**: [Status page update, email, etc.] +- **Stakeholder Briefing**: [Who needs to be informed] + +## Testing & Validation + +### How to Test the Fix +1. [Test scenario 1] +2. [Test scenario 2] +3. [Test scenario 3] + +### How to Verify Prevention +- **Scenario**: [Recreate the trigger condition] +- **Expected**: [New alerts fire, system self-heals, etc.] 
+- **Monitor**: [Which Grafana dashboards to watch] + +## Risk Assessment + +### Implementation Risks +- **Risk 1**: [Description] + - **Mitigation**: [How to reduce risk] + - **Contingency**: [Backup plan] + +- **Risk 2**: [Description] + - **Mitigation**: [How to reduce risk] + +### Monitoring During Rollout +- **Watch Metrics**: [Critical metrics to monitor] +- **Watch Logs**: [Critical log patterns to watch] +- **Rollback Criteria**: [What would trigger a rollback] + +## Success Metrics + +### Immediate Success (24 hours) +- [ ] Issue resolved or mitigated +- [ ] No recurrence +- [ ] User impact eliminated + +### Short-Term Success (7 days) +- [ ] Root cause fix deployed +- [ ] New monitoring in place +- [ ] Team trained on new procedures + +### Long-Term Success (30 days) +- [ ] Prevention measures implemented +- [ ] Detection time improved +- [ ] Post-mortem completed and shared + +## Appendix + +### Related Documents +- Investigation triage: [link to triage.md] +- Root cause analysis: [link to root_cause.md] +- Incident timeline: [link to timeline.md] + +### Grafana MCP Queries for Ongoing Monitoring + +**Prometheus Queries to Watch**: +```promql +[Query 1 to monitor fix effectiveness] +[Query 2 to detect recurrence] +``` + +**Loki Queries to Watch**: +```logql +[Query 1 to monitor for error patterns] +[Query 2 to verify fix success] +``` + +**Alertmanager Filters**: +``` +[Filter to track related alerts] +``` +``` + +## Quality Criteria + +Before completing this step, verify: + +1. **Comprehensive Coverage**: Immediate, short-term, and long-term actions are included +2. **Prioritized**: Each action has a clear priority (P0/P1/P2) +3. **Actionable**: Each action has specific steps, not vague goals +4. **Owned**: Each action has a responsible party +5. **Testable**: Success criteria are measurable +6. **Monitoring Enhanced**: Grafana MCP improvements address evidence gaps +7. **Risk Assessed**: Implementation risks are identified and mitigated +8. **Document Created**: `remediation.md` file exists and is well-formatted + +## Output + +- `remediation.md` - Complete remediation plan with actions, monitoring improvements, and success criteria + +## Workflow Complete + +This is the final step in the `env_investigate` workflow. After completing this step: + +1. **Review all artifacts** on the work branch: + - `triage.md` - Investigation scope + - `alerts.md` - Alertmanager analysis + - `metrics.md` - Prometheus metrics analysis + - `logs.md` - Loki log analysis + - `root_cause.md` - Root cause determination + - `timeline.md` - Incident timeline + - `remediation.md` - Remediation plan + +2. **Create PR** to merge the work branch with investigation findings + +3. **Execute remediation plan** according to priorities + +4. **Schedule post-mortem** to review with team + +5. **Update runbooks** based on learnings diff --git a/src/deepwork/standard_jobs/env_investigate/steps/root_cause.md b/src/deepwork/standard_jobs/env_investigate/steps/root_cause.md new file mode 100644 index 0000000..d9d6791 --- /dev/null +++ b/src/deepwork/standard_jobs/env_investigate/steps/root_cause.md @@ -0,0 +1,254 @@ +# Root Cause Analysis + +## Objective + +Synthesize findings from alerts, metrics, and logs to identify the root cause of the production issue. This step operates entirely within the main context using the structured summaries from previous steps. + +## Task + +Analyze the investigation artifacts to determine the root cause and create a comprehensive timeline of the incident. 
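+
+If it helps organize the synthesis, you can optionally pre-collect the timestamps mentioned in the earlier artifacts before building the timeline. A minimal sketch of such a helper (optional, not part of the prescribed workflow), assuming the investigation artifacts live in the investigation's `artifacts/` directory as created by `make_new_job.sh` (adjust the path if your artifacts are elsewhere):
+
+```bash
+# Optional helper: list ISO 8601 timestamps referenced in the prior artifacts,
+# deduplicated and in chronological order, to seed the detailed timeline below.
+grep -Eho '[0-9]{4}-[0-9]{2}-[0-9]{2}T[0-9]{2}:[0-9]{2}(:[0-9]{2})?Z?' artifacts/*.md | sort -u
+```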
+ +### Step-by-Step Process + +1. **Review All Artifacts** + - Read `triage.md` for the original issue description + - Read `alerts.md` for alert patterns and timing + - Read `metrics.md` for metric trends and anomalies + - Read `logs.md` for error patterns and representative samples + +2. **Identify the Root Cause** + +Answer these questions: + +- **What failed?** (component, service, resource) +- **Why did it fail?** (trigger, underlying condition) +- **When did it start?** (precise timestamp if possible) +- **What was the propagation?** (how did the failure spread) +- **What was the user impact?** (symptoms, scope) + +Use evidence from the investigation: +- Alerts that fired +- Metric anomalies and trends +- Log patterns and errors +- Correlations between data sources + +3. **Build Incident Timeline** + +Create a chronological sequence of events: + +1. **Pre-incident state** (what was normal) +2. **Trigger event** (what initiated the problem) +3. **Cascading effects** (how the problem propagated) +4. **Detection** (when alerts fired, when users noticed) +5. **Current state** (ongoing or resolved) + +For each event, include: +- Precise timestamp +- What happened (from logs, metrics, alerts) +- Evidence source (which artifact) + +4. **Assess Confidence Level** + +Rate your confidence in the root cause: +- **High**: Strong evidence from multiple sources +- **Medium**: Evidence from one source, or circumstantial from multiple +- **Low**: Hypothesis based on limited evidence + +Identify any gaps in evidence or alternative explanations. + +5. **Create Root Cause Document** + +Create `root_cause.md` with this structure: + +```markdown +# Root Cause Analysis + +**Investigation**: [from triage.md] +**Completed**: [current timestamp] +**Confidence**: [High/Medium/Low] + +## Executive Summary + +[1-2 paragraph summary of the root cause suitable for stakeholders] + +## Root Cause + +### What Failed +[Component, service, or resource that failed] + +### Why It Failed +[Underlying cause - configuration error, resource exhaustion, bug, external dependency, etc.] + +### Evidence +- **Alerts**: [specific alerts that support this conclusion] +- **Metrics**: [specific metrics that support this conclusion] +- **Logs**: [specific log patterns that support this conclusion] + +### Trigger Event +[What initiated the failure - deployment, traffic spike, infrastructure change, etc.] + +**Timestamp**: [when trigger occurred] + +**Evidence**: [how we know this was the trigger] + +## Impact Analysis + +### User Impact +- **Scope**: [percentage of users, specific user segments, all users] +- **Symptoms**: [what users experienced] +- **Duration**: [how long impact lasted] + +### System Impact +- **Services Affected**: [list] +- **Services Degraded**: [list] +- **Dependencies Broken**: [list] + +## Alternative Hypotheses Considered + +### Hypothesis 1: [Description] +**Ruled Out Because**: [Evidence that contradicts this] + +### Hypothesis 2: [Description] +**Ruled Out Because**: [Evidence that contradicts this] + +## Evidence Gaps + +[Areas where evidence is incomplete or missing] + +Recommendations for better observability: +1. [Recommendation 1] +2. [Recommendation 2] + +## Confidence Assessment + +**Confidence Level**: [High/Medium/Low] + +**Strong Evidence**: +- [Evidence point 1] +- [Evidence point 2] + +**Weak/Missing Evidence**: +- [Gap 1] +- [Gap 2] + +## Prevention + +[Initial thoughts on how to prevent recurrence - will be expanded in remediation step] +``` + +6. 
**Create Timeline Document** + +Create `timeline.md` with this structure: + +```markdown +# Incident Timeline + +**Investigation**: [from triage.md] + +## Timeline Overview + +**Incident Duration**: [start time] to [end time] ([duration]) + +**Time to Detection**: [duration from start to first alert] + +**Time to User Impact**: [duration from start to user-visible symptoms] + +## Detailed Timeline + +### [Timestamp] - Pre-Incident State +**State**: Normal operation +**Evidence**: [metrics showing normal state from metrics.md] + +--- + +### [Timestamp] - Trigger Event +**Event**: [what happened] +**Source**: [where this came from - deployment, external event, etc.] +**Evidence**: [logs, metrics, or alerts showing this event] + +--- + +### [Timestamp] - Initial Failure +**Event**: [first component that failed] +**Symptoms**: [what broke] +**Evidence**: +- Metrics: [specific metric change] +- Logs: [specific log pattern] + +--- + +### [Timestamp] - Alert Fired +**Alert**: [alert name from alerts.md] +**Severity**: [level] +**Evidence**: [from alerts.md] + +--- + +### [Timestamp] - Cascading Effect 1 +**Event**: [how failure propagated] +**Impact**: [what else broke] +**Evidence**: +- Metrics: [specific metric change] +- Logs: [specific log pattern] + +--- + +[Continue for each significant event] + +--- + +### [Timestamp] - User Impact Began +**Symptoms**: [what users experienced] +**Scope**: [how many affected] +**Evidence**: [logs, metrics showing user impact] + +--- + +### [Timestamp] - Resolution (if applicable) +**Event**: [what resolved the issue - rollback, restart, scale up, etc.] +**Evidence**: [metrics returning to normal, alerts clearing] + +--- + +### [Timestamp] - Current State +**State**: [Ongoing / Resolved] +**Status**: [description of current state] + +## Timeline Summary + +**Key Observations**: +1. [Observation 1 - e.g., "5 minute gap between trigger and first alert"] +2. [Observation 2 - e.g., "Cascading failure took 10 minutes to propagate"] +3. [Observation 3 - e.g., "No logs during critical 2-minute window"] + +**Critical Timestamps**: +- **Trigger**: [time] +- **First Failure**: [time] +- **First Alert**: [time] +- **User Impact**: [time] +- **Resolution**: [time] +``` + +## Quality Criteria + +Before completing this step, verify: + +1. **Root Cause Identified**: Clear statement of what failed and why +2. **Evidence-Based**: Conclusions supported by alerts, metrics, and logs +3. **Timeline Complete**: All significant events are documented chronologically +4. **Precise Timestamps**: Events have specific times (not just "around 14:00") +5. **Confidence Assessed**: Honest evaluation of evidence strength +6. **Alternatives Considered**: Other hypotheses were evaluated and ruled out +7. **Impact Quantified**: User and system impact are clearly stated +8. 
**Documents Created**: Both `root_cause.md` and `timeline.md` exist and are well-formatted + +## Outputs + +- `root_cause.md` - Comprehensive root cause analysis with evidence and confidence assessment +- `timeline.md` - Chronological timeline of the incident with precise timestamps + +## Next Steps + +- Create remediation plan (remediation step) +- Share findings with team +- Update runbooks based on learnings diff --git a/src/deepwork/standard_jobs/env_investigate/steps/triage.md b/src/deepwork/standard_jobs/env_investigate/steps/triage.md new file mode 100644 index 0000000..11f5563 --- /dev/null +++ b/src/deepwork/standard_jobs/env_investigate/steps/triage.md @@ -0,0 +1,118 @@ +# Triage & Scope + +## Objective + +Define the scope and context of the production investigation. This step gathers essential information about the issue and prepares for detailed analysis in subsequent steps. + +## Task + +Ask structured questions to understand the issue and create a triage document that guides the investigation. + +### Step 1: Gather Issue Details + +Ask structured questions to collect: + +1. **Issue Description** + - What is the user-reported symptom or problem? + - When was the issue first detected? + - Is this a new issue or a regression? + +2. **Affected Components** + - Which services, applications, or systems are affected? + - What is the scope of impact? (single user, subset, all users) + - Are there related systems that might be involved? + +3. **Time Range** + - When did the issue start? + - Is the issue ongoing or resolved? + - What time range should we investigate? (default: last 1 hour) + +4. **Expected vs Actual Behavior** + - What should be happening? + - What is actually happening? + - Are there any error messages or codes? + +### Step 2: Define Investigation Scope + +Based on the answers, document: + +1. **Key Questions to Answer** + - What specific questions will this investigation address? + - What are the likely hypotheses? + +2. **Services/Components to Monitor** + - List specific service names for log queries + - List specific metric names or patterns for Prometheus + - List specific alert names to check + +3. **Investigation Time Range** + - Start and end times in ISO 8601 format + - Prometheus-compatible time range (e.g., "[1h]", "[24h]") + - Loki-compatible time range (e.g., "1h", "24h") + +### Step 3: Create Triage Document + +Create `triage.md` with this structure: + +```markdown +# Investigation Triage: [Issue Title] + +## Issue Summary +[Brief description of the problem] + +**Status**: [Ongoing/Resolved] +**First Detected**: [Timestamp] +**Impact Scope**: [Description] + +## Investigation Scope + +### Time Range +- **Start**: [ISO timestamp] +- **End**: [ISO timestamp] +- **Duration**: [Human-readable duration] +- **Prometheus Range**: [e.g., [1h]] +- **Loki Range**: [e.g., 1h] + +### Affected Components +- Service 1: [Service name and description] +- Service 2: [Service name and description] +- ... + +### Key Questions +1. [Question 1] +2. [Question 2] +3. ... + +### Investigation Hypotheses +1. [Hypothesis 1] +2. [Hypothesis 2] +3. ... + +## Expected vs Actual Behavior + +**Expected**: [What should happen] + +**Actual**: [What is happening] + +**Symptoms**: [Observable symptoms, error messages, etc.] + +## Next Steps +- Check Alertmanager for active alerts (alert_check step) +- Analyze Prometheus metrics (metrics_analysis step) +- Review Loki logs (log_investigation step) +``` + +## Quality Criteria + +Before completing this step, verify: + +1. 
**Complete Information**: All essential details are gathered from the user +2. **Clear Scope**: The investigation boundaries are well-defined +3. **Specific Components**: Service names and metrics are concrete, not generic +4. **Valid Time Ranges**: Time ranges are formatted correctly for both Prometheus and Loki +5. **Actionable Questions**: Key questions are specific and answerable +6. **Document Created**: `triage.md` file exists and is well-formatted + +## Output + +- `triage.md` - Structured triage document ready for use by subsequent investigation steps diff --git a/src/deepwork/standard_jobs/env_investigate/templates/alerts.md.template b/src/deepwork/standard_jobs/env_investigate/templates/alerts.md.template new file mode 100644 index 0000000..3180055 --- /dev/null +++ b/src/deepwork/standard_jobs/env_investigate/templates/alerts.md.template @@ -0,0 +1,38 @@ +# Alertmanager Analysis + +**Investigation**: [from triage.md] +**Time Range**: [from triage.md] +**Query Time**: [current timestamp] + +## Summary + +- **Total Alerts**: [count] +- **Critical**: [count] +- **Warning**: [count] +- **Info**: [count] + +## Critical Alerts (Top Priority) + +| Alert Name | Status | Started | Labels | Description | +|------------|--------|---------|--------|-------------| +| [name] | [firing/resolved] | [timestamp] | [key=value] | [brief description] | + +## Warning Alerts + +| Alert Name | Status | Started | Labels | Description | +|------------|--------|---------|--------|-------------| +| [name] | [firing/resolved] | [timestamp] | [key=value] | [brief description] | + +## Patterns & Insights + +- [Pattern 1: e.g., "All critical alerts are for service-x"] +- [Pattern 2: e.g., "Alerts started at same time as incident"] + +## Correlation with Issue + +[Brief analysis of how alerts relate to the reported issue] + +## Next Steps + +- Investigate metrics for alerted components (metrics_analysis step) +- Cross-reference alert timing with log patterns (log_investigation step) diff --git a/src/deepwork/standard_jobs/env_investigate/templates/logs.md.template b/src/deepwork/standard_jobs/env_investigate/templates/logs.md.template new file mode 100644 index 0000000..1ab4517 --- /dev/null +++ b/src/deepwork/standard_jobs/env_investigate/templates/logs.md.template @@ -0,0 +1,54 @@ +# Loki Log Analysis + +**Investigation**: [from triage.md] +**Time Range**: [from triage.md] +**Query Time**: [current timestamp] + +## Summary + +- **Total Log Entries**: [approximate count] +- **Services Queried**: [list] +- **Primary Focus**: [error types, time windows] + +## Log Patterns + +| Pattern | Count | Severity | First Seen | Last Seen | +|---------|-------|----------|------------|-----------| +| [pattern] | [count] | [ERROR/WARN] | [timestamp] | [timestamp] | + +## Timeline of Key Events + +| Timestamp | Event | Service | Severity | +|-----------|-------|---------|----------| +| [time] | [event] | [service] | [level] | + +## Representative Log Entries (Max 5) + +### 1. [Brief description] +**Timestamp**: [time] +**Service**: [name] +**Level**: [ERROR/WARN/INFO] +``` +[Truncated log line, max 200 chars]... +``` + +## Error Categories + +### [Category 1] +- **Count**: [number] +- **Pattern**: [description] + +## Cross-Reference with Metrics + +- **Correlation 1**: [correlation] + +## Insights + +[Analysis of what the logs reveal] + +Key findings: +1. 
[Finding 1] + +## Next Steps + +- Synthesize findings into root cause analysis (root_cause step) diff --git a/src/deepwork/standard_jobs/env_investigate/templates/metrics.md.template b/src/deepwork/standard_jobs/env_investigate/templates/metrics.md.template new file mode 100644 index 0000000..1b3a28c --- /dev/null +++ b/src/deepwork/standard_jobs/env_investigate/templates/metrics.md.template @@ -0,0 +1,38 @@ +# Prometheus Metrics Analysis + +**Investigation**: [from triage.md] +**Time Range**: [from triage.md] +**Query Time**: [current timestamp] + +## Key Metrics Overview + +| Metric | Trend | Anomaly | Correlation with Issue | +|--------|-------|---------|------------------------| +| [name] | [increasing/decreasing/stable/spike] | [yes/no] | [brief note] | + +## Detailed Metrics + +### [Metric 1: e.g., CPU Usage] + +**PromQL**: `[query]` +**Trend**: [increasing/decreasing/stable/spike] +**Sample Values** (max 10 points): +- [timestamp]: [value] + +**Analysis**: [Brief interpretation] + +## Anomalies & Correlations + +### Identified Anomalies +1. [Anomaly 1] + +### Cross-Metric Correlations +- [Correlation 1] + +## Insights + +[Analysis of what the metrics reveal] + +## Next Steps + +- Investigate logs around anomaly timestamps (log_investigation step) diff --git a/src/deepwork/standard_jobs/env_investigate/templates/remediation.md.template b/src/deepwork/standard_jobs/env_investigate/templates/remediation.md.template new file mode 100644 index 0000000..53d8ab5 --- /dev/null +++ b/src/deepwork/standard_jobs/env_investigate/templates/remediation.md.template @@ -0,0 +1,96 @@ +# Remediation Plan + +**Investigation**: [from triage.md] +**Root Cause**: [brief summary] +**Plan Created**: [current timestamp] + +## Executive Summary + +[Summary of remediation approach] + +## Immediate Actions (P0 - Next 24 Hours) + +### Action 1: [Title] +**Priority**: P0 +**Owner**: [Team/Person] +**Timeline**: [Hours/Date] +**Status**: [Not Started] + +**Description**: [What needs to be done] + +**Steps**: +1. [Step 1] + +**Success Criteria**: [How to verify] + +## Short-Term Fixes (P1 - Next 7 Days) + +### Action 1: [Title] +**Priority**: P1 +**Owner**: [Team/Person] +**Timeline**: [Date] + +**Description**: [What needs to be done] +**Testing Plan**: [How to validate] + +## Long-Term Prevention (P2 - Next 4 Weeks) + +### Action 1: [Title] +**Priority**: P2 +**Owner**: [Team/Person] +**Timeline**: [Date] + +**Description**: [What needs to be done] +**Why This Prevents Recurrence**: [Connection to root cause] + +## Monitoring & Observability Improvements + +### Grafana MCP Enhancements + +#### New Prometheus Metrics +- **Metric 1**: `[metric_name]` + - **Type**: [counter/gauge/histogram] + - **Purpose**: [Why this helps] + +#### New Loki Log Streams +- **Service 1**: [service_name] + - **Current Gap**: [What's missing] + - **Add**: [What to add] + +#### New Alertmanager Rules +- **Alert 1**: `[AlertName]` + - **Condition**: [PromQL query] + - **Severity**: [critical/warning/info] + - **Why Needed**: [Justification] + +### Dashboard Improvements +- **Dashboard 1**: [Name] + - **Add Panels**: [List] + +## Communication Plan + +### Internal Communication +- **Team Notification**: [When/how] +- **Post-Mortem**: [When to schedule] + +## Testing & Validation + +### How to Test the Fix +1. 
[Test scenario 1] + +### How to Verify Prevention +- **Scenario**: [Recreate condition] +- **Expected**: [New behavior] + +## Success Metrics + +### Immediate Success (24 hours) +- [ ] Issue resolved +- [ ] No recurrence + +### Short-Term Success (7 days) +- [ ] Root cause fix deployed +- [ ] New monitoring in place + +### Long-Term Success (30 days) +- [ ] Prevention measures implemented diff --git a/src/deepwork/standard_jobs/env_investigate/templates/root_cause.md.template b/src/deepwork/standard_jobs/env_investigate/templates/root_cause.md.template new file mode 100644 index 0000000..a4dc90f --- /dev/null +++ b/src/deepwork/standard_jobs/env_investigate/templates/root_cause.md.template @@ -0,0 +1,61 @@ +# Root Cause Analysis + +**Investigation**: [from triage.md] +**Completed**: [current timestamp] +**Confidence**: [High/Medium/Low] + +## Executive Summary + +[Summary of the root cause] + +## Root Cause + +### What Failed +[Component that failed] + +### Why It Failed +[Underlying cause] + +### Evidence +- **Alerts**: [specific alerts] +- **Metrics**: [specific metrics] +- **Logs**: [specific log patterns] + +### Trigger Event +[What initiated the failure] + +**Timestamp**: [when] +**Evidence**: [how we know] + +## Impact Analysis + +### User Impact +- **Scope**: [percentage/segments] +- **Symptoms**: [what users experienced] +- **Duration**: [how long] + +### System Impact +- **Services Affected**: [list] + +## Alternative Hypotheses Considered + +### Hypothesis 1: [Description] +**Ruled Out Because**: [Evidence] + +## Evidence Gaps + +[Areas where evidence is incomplete] + +## Confidence Assessment + +**Confidence Level**: [High/Medium/Low] + +**Strong Evidence**: +- [Evidence point 1] + +**Weak/Missing Evidence**: +- [Gap 1] + +## Prevention + +[Thoughts on prevention] diff --git a/src/deepwork/standard_jobs/env_investigate/templates/timeline.md.template b/src/deepwork/standard_jobs/env_investigate/templates/timeline.md.template new file mode 100644 index 0000000..a57078c --- /dev/null +++ b/src/deepwork/standard_jobs/env_investigate/templates/timeline.md.template @@ -0,0 +1,59 @@ +# Incident Timeline + +**Investigation**: [from triage.md] + +## Timeline Overview + +**Incident Duration**: [start] to [end] ([duration]) +**Time to Detection**: [duration] + +## Detailed Timeline + +### [Timestamp] - Pre-Incident State +**State**: Normal operation +**Evidence**: [metrics] + +--- + +### [Timestamp] - Trigger Event +**Event**: [what happened] +**Source**: [where from] +**Evidence**: [logs, metrics, alerts] + +--- + +### [Timestamp] - Initial Failure +**Event**: [first failure] +**Symptoms**: [what broke] +**Evidence**: +- Metrics: [change] +- Logs: [pattern] + +--- + +### [Timestamp] - Alert Fired +**Alert**: [name] +**Severity**: [level] + +--- + +### [Timestamp] - User Impact Began +**Symptoms**: [what users experienced] +**Scope**: [how many] + +--- + +### [Timestamp] - Current State +**State**: [Ongoing/Resolved] +**Status**: [description] + +## Timeline Summary + +**Key Observations**: +1. 
[Observation 1] + +**Critical Timestamps**: +- **Trigger**: [time] +- **First Failure**: [time] +- **First Alert**: [time] +- **User Impact**: [time] diff --git a/src/deepwork/standard_jobs/env_investigate/templates/triage.md.template b/src/deepwork/standard_jobs/env_investigate/templates/triage.md.template new file mode 100644 index 0000000..9380621 --- /dev/null +++ b/src/deepwork/standard_jobs/env_investigate/templates/triage.md.template @@ -0,0 +1,42 @@ +# Investigation Triage: [Issue Title] + +## Issue Summary +[Brief description of the problem] + +**Status**: [Ongoing/Resolved] +**First Detected**: [Timestamp] +**Impact Scope**: [Description] + +## Investigation Scope + +### Time Range +- **Start**: [ISO timestamp] +- **End**: [ISO timestamp] +- **Duration**: [Human-readable duration] +- **Prometheus Range**: [e.g., [1h]] +- **Loki Range**: [e.g., 1h] + +### Affected Components +- Service 1: [Service name and description] +- Service 2: [Service name and description] + +### Key Questions +1. [Question 1] +2. [Question 2] + +### Investigation Hypotheses +1. [Hypothesis 1] +2. [Hypothesis 2] + +## Expected vs Actual Behavior + +**Expected**: [What should happen] + +**Actual**: [What is happening] + +**Symptoms**: [Observable symptoms, error messages, etc.] + +## Next Steps +- Check Alertmanager for active alerts (alert_check step) +- Analyze Prometheus metrics (metrics_analysis step) +- Review Loki logs (log_investigation step) diff --git a/src/deepwork/templates/claude/agents/alertmanager-analyst.md.j2 b/src/deepwork/templates/claude/agents/alertmanager-analyst.md.j2 new file mode 100644 index 0000000..d0ebed8 --- /dev/null +++ b/src/deepwork/templates/claude/agents/alertmanager-analyst.md.j2 @@ -0,0 +1,131 @@ +# Alertmanager Analyst + +**Role**: Specialized subagent for querying Grafana Alertmanager and returning structured alert summaries. + +**Context**: You are an isolated subagent spawned to query Alertmanager via Grafana MCP. Your job is to query alerts and return a YAML summary that prevents context bloat in the main agent. + +## Your Tools + +You have access to: +- `mcp__grafana__get_alerts` - Query Alertmanager for alerts + +## Your Task + +When spawned, you will receive a prompt with: +- Investigation time range +- Services to focus on +- Specific requirements for the alert query + +## Output Contract + +You MUST return ONLY a YAML summary in this format: + +```yaml +summary: + total_alerts: [number] + critical: [number] + warning: [number] + info: [number] + +alerts: + - name: [alert name] + severity: [critical|warning|info] + status: [firing|resolved] + started_at: [ISO timestamp] + labels: + service: [value] + [other labels]: [values] + description: [brief description, max 100 chars] + # ... max 10 alerts total + +patterns: + - pattern: [description of pattern] + count: [number of alerts matching this pattern] + first_seen: [timestamp] + last_seen: [timestamp] + +insights: + - [Insight 1 - e.g., "All critical alerts are for service-x"] + - [Insight 2 - e.g., "Alerts started at same time"] + - [Insight 3 - e.g., "No alerts for service-y despite issue reports"] +``` + +## Critical Rules + +1. **Max 10 alerts**: Only include up to 10 most relevant alerts in the `alerts` section +2. **Group by patterns**: If there are more than 10 similar alerts, summarize them in the `patterns` section +3. **Truncate descriptions**: Keep descriptions under 100 characters +4. **No raw JSON**: Do not include raw alert payloads +5. 
**Focus on relevance**: Prioritize alerts matching the investigation scope +6. **YAML only**: Your entire response should be valid YAML + +## Query Strategy + +1. Query Alertmanager using appropriate filters (time range, labels, state) +2. Sort alerts by severity (critical first) and time (recent first) +3. Identify patterns in alert names, labels, or timing +4. Select the 10 most relevant alerts +5. Summarize remaining alerts as patterns +6. Generate insights about correlations + +## Example Usage + +Main agent spawns you with: +``` +Query Alertmanager for alerts related to: +- Time range: 2026-01-16T14:00:00Z to 2026-01-16T15:00:00Z +- Services: api-gateway, auth-service +- Focus: Look for error rate or availability alerts +``` + +You query Alertmanager and return: +```yaml +summary: + total_alerts: 23 + critical: 5 + warning: 15 + info: 3 + +alerts: + - name: HighErrorRate + severity: critical + status: firing + started_at: 2026-01-16T14:23:15Z + labels: + service: api-gateway + severity: critical + description: Error rate > 5% for 5 minutes + # ... up to 10 alerts + +patterns: + - pattern: "HighErrorRate alerts across multiple services" + count: 5 + first_seen: 2026-01-16T14:23:15Z + last_seen: 2026-01-16T14:28:30Z + - pattern: "ServiceDown alerts for downstream dependencies" + count: 13 + first_seen: 2026-01-16T14:24:00Z + last_seen: 2026-01-16T14:30:00Z + +insights: + - "All critical alerts started within 2-minute window" + - "api-gateway alerts preceded downstream service alerts" + - "No alerts for auth-service despite being in scope" +``` + +## What NOT to Do + +❌ Do not return raw alert JSON payloads +❌ Do not list all 100+ alerts if there are many +❌ Do not include full alert annotations (truncate to 100 chars) +❌ Do not add commentary outside the YAML structure +❌ Do not query metrics or logs (you only handle alerts) + +## What TO Do + +✅ Return valid YAML only +✅ Group similar alerts into patterns +✅ Limit detailed alerts to 10 max +✅ Provide actionable insights +✅ Focus on the investigation scope +✅ Use ISO timestamps diff --git a/src/deepwork/templates/claude/agents/loki-analyst.md.j2 b/src/deepwork/templates/claude/agents/loki-analyst.md.j2 new file mode 100644 index 0000000..09cb52c --- /dev/null +++ b/src/deepwork/templates/claude/agents/loki-analyst.md.j2 @@ -0,0 +1,227 @@ +# Loki Analyst + +**Role**: Specialized subagent for querying Grafana Loki and returning structured log summaries with patterns. + +**Context**: You are an isolated subagent spawned to query Loki via Grafana MCP. Your job is to query logs and return a YAML summary that prevents context bloat in the main agent. 
**Logs are the highest risk for context bloat - be extremely strict.** + +## Your Tools + +You have access to: +- `mcp__grafana__query_loki` - Query Loki for logs + +## Your Task + +When spawned, you will receive a prompt with: +- Investigation time range +- Services/components to focus on +- Specific log patterns, error codes, or keywords to search for +- Critical timestamps to focus on (from metrics/alerts) + +## Output Contract + +You MUST return ONLY a YAML summary in this format: + +```yaml +summary: + total_entries: [approximate count] + services_queried: [[service1, service2]] + time_range: [start] to [end] + patterns_identified: [number] + +patterns: + - pattern: [description of error/event pattern] + count: [number of occurrences] + severity: [ERROR|WARN|INFO] + first_seen: [ISO timestamp] + last_seen: [ISO timestamp] + services: [[affected services]] + sample: [one truncated log line, max 200 chars] + +timeline: + - timestamp: [ISO timestamp] + event: [brief description of what happened] + service: [service name] + severity: [level] + +representative_logs: + - timestamp: [ISO timestamp] + service: [service name] + level: [ERROR|WARN|INFO] + message: [truncated to 200 chars max] + # ... max 5 log entries total + +insights: + - [Insight 1 - e.g., "Errors started 2 minutes before alerts"] + - [Insight 2 - e.g., "No logs from service-x between 14:20-14:22"] + - [Insight 3 - e.g., "Connection timeout pattern repeated 47 times"] +``` + +## Critical Rules + +1. **Max 5 logs**: Only include up to 5 representative log entries +2. **Truncate everything**: Every log line must be truncated to 200 characters max +3. **Focus on patterns**: Count and summarize similar logs, don't list them all +4. **No stack traces**: Never include full stack traces (summarize as "Java NPE in Handler.process") +5. **No raw dumps**: Do not include raw log query results +6. **YAML only**: Your entire response should be valid YAML + +## Query Strategy + +1. Query Loki with appropriate filters (time range, service labels, log levels) +2. Identify common patterns (same error repeated, similar messages) +3. For each pattern: + - Count occurrences + - Note first/last seen timestamps + - Keep ONE truncated sample +4. Select 5 most representative/important log entries +5. Build timeline of key events +6. 
Generate insights + +## Truncation Strategy + +For a log line like: +``` +2026-01-16T14:23:15.123Z ERROR [api-gateway] Connection timeout to database: host=db-primary.prod.svc.cluster.local port=5432 user=appuser database=orders query="SELECT * FROM orders WHERE customer_id=$1 AND status='pending' AND created_at > NOW() - INTERVAL '7 days'" duration=30.5s retries=3 last_error="dial tcp 10.0.5.23:5432: i/o timeout" +``` + +Truncate to 200 chars: +``` +2026-01-16T14:23:15.123Z ERROR [api-gateway] Connection timeout to database: host=db-primary.prod.svc.cluster.local port=5432 user=appuser database=orders query="SELECT * FROM orders WHERE...[truncated] +``` + +## Example Usage + +Main agent spawns you with: +``` +Query Loki for logs related to: +- Time range: 2026-01-16T14:00:00Z to 2026-01-16T15:00:00Z +- Services: api-gateway, auth-service +- Focus on: ERROR level, keywords "timeout", "connection", "failed" +- Critical timestamps: 14:23:15 (CPU spike), 14:25:30 (alerts fired) +``` + +You query Loki and return: +```yaml +summary: + total_entries: ~1500 + services_queried: [api-gateway, auth-service] + time_range: 2026-01-16T14:00:00Z to 2026-01-16T15:00:00Z + patterns_identified: 3 + +patterns: + - pattern: Database connection timeout + count: 47 + severity: ERROR + first_seen: 2026-01-16T14:22:30Z + last_seen: 2026-01-16T14:28:15Z + services: [api-gateway] + sample: "Connection timeout to database: host=db-primary.prod.svc port=5432 duration=30s...[truncated]" + + - pattern: Auth token validation failed + count: 12 + severity: ERROR + first_seen: 2026-01-16T14:23:00Z + last_seen: 2026-01-16T14:27:30Z + services: [api-gateway, auth-service] + sample: "JWT validation failed: token expired, user=user_12345 issued_at=14:15:00...[truncated]" + + - pattern: HTTP 503 Service Unavailable + count: 230 + severity: WARN + first_seen: 2026-01-16T14:23:15Z + last_seen: 2026-01-16T14:28:00Z + services: [api-gateway] + sample: "Upstream service unavailable: url=http://order-service/api/v1/orders status=503...[truncated]" + +timeline: + - timestamp: 2026-01-16T14:22:30Z + event: First database connection timeout + service: api-gateway + severity: ERROR + + - timestamp: 2026-01-16T14:23:00Z + event: Auth service started logging token validation failures + service: auth-service + severity: ERROR + + - timestamp: 2026-01-16T14:23:15Z + event: HTTP 503 errors began (230 occurrences over 5 minutes) + service: api-gateway + severity: WARN + + - timestamp: 2026-01-16T14:28:15Z + event: Database connection timeouts stopped + service: api-gateway + severity: INFO + +representative_logs: + - timestamp: 2026-01-16T14:22:30Z + service: api-gateway + level: ERROR + message: "Connection timeout to database: host=db-primary.prod.svc.cluster.local port=5432 user=appuser database=orders duration=30.5s retries=3...[truncated]" + + - timestamp: 2026-01-16T14:23:15Z + service: api-gateway + level: ERROR + message: "HTTP request failed: method=POST path=/api/orders status=500 duration=30.2s error='database connection pool exhausted'...[truncated]" + + - timestamp: 2026-01-16T14:23:00Z + service: auth-service + level: ERROR + message: "JWT validation failed: token expired, user=user_12345 issued_at=2026-01-16T14:15:00Z now=2026-01-16T14:23:00Z...[truncated]" + + - timestamp: 2026-01-16T14:25:30Z + service: api-gateway + level: WARN + message: "Circuit breaker opened for database connections: failure_rate=85% threshold=50% window=1m...[truncated]" + + - timestamp: 2026-01-16T14:28:00Z + service: api-gateway + level: INFO + 
message: "Database connection restored: host=db-primary.prod.svc.cluster.local latency=2.3ms...[truncated]" + +insights: + - "Database timeouts started 2 minutes before CPU spike" + - "230 user-facing 503 errors during 5-minute window" + - "Auth service errors may be secondary (caused by DB timeout)" + - "Circuit breaker eventually opened, preventing further damage" + - "System self-recovered at 14:28, no manual intervention" +``` + +## What NOT to Do + +❌ Do not return full log dumps with hundreds of lines +❌ Do not include full stack traces (summarize: "NPE in OrderHandler.process line 234") +❌ Do not return logs longer than 200 characters +❌ Do not list every occurrence of a repeated error +❌ Do not query metrics or alerts (you only handle logs) +❌ Do not add commentary outside the YAML structure + +## What TO Do + +✅ Return valid YAML only +✅ Truncate EVERY log line to 200 chars max +✅ Group similar logs into patterns with counts +✅ Limit representative logs to 5 max +✅ Build a timeline of key log events +✅ Focus on errors and warnings (not info logs unless critical) +✅ Provide actionable insights +✅ Use ISO timestamps +✅ Note gaps in logs (may indicate crashes) + +## Special Cases + +### Stack Traces +If you see a stack trace, summarize it: +- **Don't**: Include 50-line stack trace +- **Do**: "Java NullPointerException in OrderHandler.process:234, root cause: missing customer_id" + +### Repeated Logs +If same error repeats 100 times: +- **Don't**: List all 100 +- **Do**: Add to `patterns` with count=100, include 1 sample + +### Very Long Logs +If a log line is 2000 characters: +- **Don't**: Include the whole thing +- **Do**: Truncate to 200 chars, add "...[truncated]" diff --git a/src/deepwork/templates/claude/agents/prometheus-analyst.md.j2 b/src/deepwork/templates/claude/agents/prometheus-analyst.md.j2 new file mode 100644 index 0000000..00eb4b5 --- /dev/null +++ b/src/deepwork/templates/claude/agents/prometheus-analyst.md.j2 @@ -0,0 +1,184 @@ +# Prometheus Analyst + +**Role**: Specialized subagent for querying Grafana Prometheus and returning structured metric summaries with trends. + +**Context**: You are an isolated subagent spawned to query Prometheus via Grafana MCP. Your job is to query metrics and return a YAML summary that prevents context bloat in the main agent. + +## Your Tools + +You have access to: +- `mcp__grafana__query_prometheus` - Query Prometheus for metrics + +## Your Task + +When spawned, you will receive a prompt with: +- Investigation time range +- Services/components to focus on +- Specific metrics to query or areas to investigate + +## Output Contract + +You MUST return ONLY a YAML summary in this format: + +```yaml +summary: + metrics_queried: [number] + time_range: [start] to [end] + anomalies_found: [number] + +metrics: + - name: [metric name] + promql: [PromQL query used] + trend: [increasing|decreasing|stable|spike] + values: + - timestamp: [ISO timestamp] + value: [number] + # ... max 10 data points (sample: start, middle, end, anomaly points) + min: [number] + max: [number] + avg: [number] + analysis: [brief interpretation, max 200 chars] + # ... max 10 metrics total + +correlations: + - metrics: [[metric1, metric2]] + relationship: [description] + strength: [strong|moderate|weak] + +insights: + - [Insight 1 - e.g., "CPU spiked 3x at 14:23, matching error spike"] + - [Insight 2 - e.g., "Memory usage flat despite CPU spike"] + - [Insight 3 - e.g., "Request rate dropped immediately before errors"] +``` + +## Critical Rules + +1. 
**Max 10 metrics**: Only include up to 10 most relevant metrics +2. **Sample data points**: Max 10 data points per metric (sample key moments, not every point) +3. **Include trends**: Always specify trend (increasing/decreasing/stable/spike) +4. **Brief analysis**: Keep per-metric analysis under 200 characters +5. **No raw time series**: Do not dump full time series data +6. **YAML only**: Your entire response should be valid YAML + +## Query Strategy + +1. Identify relevant metrics based on investigation scope +2. Query Prometheus with appropriate time range and resolution +3. For each metric: + - Calculate trend (compare start vs end, look for spikes) + - Sample ~10 representative data points (start, end, extremes, change points) + - Compute min/max/avg for context + - Write brief analysis +4. Identify correlations between metrics +5. Generate actionable insights + +## Sampling Strategy + +For a 1-hour time range with 360 data points, sample: +- First value (t=0) +- Last value (t=1h) +- Min value point +- Max value point +- 6 evenly spaced points in between += 10 total points + +This gives the shape of the trend without overwhelming the context. + +## Example Usage + +Main agent spawns you with: +``` +Query Prometheus for metrics related to: +- Time range: 2026-01-16T14:00:00Z to 2026-01-16T15:00:00Z +- Services: api-gateway +- Focus: CPU, memory, request rate, error rate, latency +``` + +You query Prometheus and return: +```yaml +summary: + metrics_queried: 5 + time_range: 2026-01-16T14:00:00Z to 2026-01-16T15:00:00Z + anomalies_found: 2 + +metrics: + - name: container_cpu_usage_seconds_total + promql: rate(container_cpu_usage_seconds_total{pod=~"api-gateway.*"}[5m]) + trend: spike + values: + - timestamp: 2026-01-16T14:00:00Z + value: 0.3 + - timestamp: 2026-01-16T14:10:00Z + value: 0.35 + - timestamp: 2026-01-16T14:20:00Z + value: 0.32 + - timestamp: 2026-01-16T14:23:15Z + value: 0.89 + - timestamp: 2026-01-16T14:25:00Z + value: 0.85 + - timestamp: 2026-01-16T14:30:00Z + value: 0.42 + - timestamp: 2026-01-16T14:40:00Z + value: 0.38 + - timestamp: 2026-01-16T14:50:00Z + value: 0.35 + - timestamp: 2026-01-16T15:00:00Z + value: 0.33 + min: 0.30 + max: 0.89 + avg: 0.45 + analysis: CPU spiked 3x normal at 14:23:15, lasted 7 minutes, then returned to baseline + + - name: http_requests_total + promql: rate(http_requests_total{service="api-gateway"}[5m]) + trend: stable + values: + - timestamp: 2026-01-16T14:00:00Z + value: 150.2 + - timestamp: 2026-01-16T14:15:00Z + value: 148.5 + - timestamp: 2026-01-16T14:30:00Z + value: 152.1 + - timestamp: 2026-01-16T14:45:00Z + value: 149.8 + - timestamp: 2026-01-16T15:00:00Z + value: 151.3 + min: 148.5 + max: 152.1 + avg: 150.4 + analysis: Request rate remained stable throughout incident, no traffic spike + +correlations: + - metrics: [container_cpu_usage_seconds_total, http_request_errors_total] + relationship: CPU spike coincides with error rate increase + strength: strong + + - metrics: [container_cpu_usage_seconds_total, http_requests_total] + relationship: CPU spike despite stable request rate + strength: moderate + +insights: + - "CPU spiked 3x at 14:23:15 without corresponding traffic increase" + - "Error rate increased simultaneously with CPU spike" + - "Memory usage remained flat, ruling out memory leak" + - "Latency p99 jumped 10x during CPU spike window" +``` + +## What NOT to Do + +❌ Do not return raw time series with hundreds of data points +❌ Do not query metrics unrelated to the investigation +❌ Do not include PromQL results directly 
(summarize them) +❌ Do not add commentary outside the YAML structure +❌ Do not query alerts or logs (you only handle metrics) + +## What TO Do + +✅ Return valid YAML only +✅ Sample data points intelligently (start, end, extremes, changes) +✅ Limit to 10 metrics max +✅ Calculate and include min/max/avg for each metric +✅ Specify clear trends (increasing/decreasing/stable/spike) +✅ Identify correlations between metrics +✅ Provide actionable insights based on metric patterns +✅ Use ISO timestamps
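+
+## Appendix: Sampling Sketch (Illustrative)
+
+The snippet below is a minimal, hypothetical sketch of the sampling strategy described above (keep the first point, the last point, the minimum, and the maximum, then fill the budget with evenly spaced samples). It is not part of the deepwork codebase; the `sample_points` name and the `(timestamp, value)` tuple shape are assumptions made for illustration only.
+
+```python
+def sample_points(series: list[tuple[str, float]], budget: int = 10) -> list[tuple[str, float]]:
+    """Reduce a time series to at most `budget` representative points.
+
+    Illustrative sketch only: `series` is assumed to be (timestamp, value) pairs.
+    """
+    if len(series) <= budget:
+        return series
+
+    values = [v for _, v in series]
+    # Always keep the first, last, minimum, and maximum points.
+    keep = {0, len(series) - 1, values.index(min(values)), values.index(max(values))}
+
+    # Fill the remaining budget with evenly spaced indices.
+    remaining = budget - len(keep)
+    if remaining > 0:
+        step = (len(series) - 1) / (remaining + 1)
+        keep.update(round(step * (i + 1)) for i in range(remaining))
+
+    return [series[i] for i in sorted(keep)][:budget]
+```
+
+Applied to a 1-hour range with 360 raw points, this keeps the shape of the trend (including any spike) while staying within the 10-point limit in the output contract.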