diff --git a/.deepwork/rules/architecture-documentation-accuracy.md b/.deepwork/rules/architecture-documentation-accuracy.md
index 91798109..eafa3c47 100644
--- a/.deepwork/rules/architecture-documentation-accuracy.md
+++ b/.deepwork/rules/architecture-documentation-accuracy.md
@@ -3,9 +3,12 @@ name: Architecture Documentation Accuracy
 trigger: src/**/*
 safety: doc/architecture.md
 compare_to: base
+prompt_runtime: claude
 ---
 
 Source code in src/ has been modified. Please review doc/architecture.md for accuracy:
 
 1. Verify the documented architecture matches the current implementation
 2. Check that file paths and directory structures are still correct
 3. Ensure component descriptions reflect actual behavior
 4. Update any diagrams or flows that may have changed
+
+If the architecture documentation needs updates, make the changes directly. If the documentation is accurate, confirm it matches the current implementation.
diff --git a/.deepwork/rules/manual-test-claude-runtime.md b/.deepwork/rules/manual-test-claude-runtime.md
new file mode 100644
index 00000000..c58c4b14
--- /dev/null
+++ b/.deepwork/rules/manual-test-claude-runtime.md
@@ -0,0 +1,27 @@
+---
+name: "Manual Test: Claude Runtime"
+trigger: manual_tests/test_claude_runtime/test_claude_runtime_code.py
+compare_to: prompt
+prompt_runtime: claude
+---
+
+# Manual Test: Claude Runtime
+
+You are evaluating code changes as part of an automated rule check.
+
+**Review the code in the trigger file for:**
+1. Basic code quality (clear variable names, proper structure)
+2. Presence of docstrings or comments
+3. No obvious bugs or issues
+
+**This is a test rule.** For testing purposes:
+- If the code looks reasonable, respond with `allow`
+- If there are obvious issues (syntax errors, missing functions, etc.), respond with `block`
+
+Since this is a manual test, the code is intentionally simple and should pass review.
+
+## This tests:
+
+The `prompt_runtime: claude` feature: instead of returning the prompt to
+the triggering agent, Claude Code is invoked in headless mode to process
+the rule autonomously.
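For reference, the structured decision block that headless Claude is asked to emit (the format appended by `format_claude_prompt` later in this diff) looks like the sketch below; the reason text here is only an illustration:

```
---RULE_RESULT---
decision: allow
reason: Code reviewed; no obvious issues found
---END_RULE_RESULT---
```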
diff --git a/.deepwork/rules/manual-test-created-mode.md b/.deepwork/rules/manual-test-created-mode.md index abb6108d..8c9fb33d 100644 --- a/.deepwork/rules/manual-test-created-mode.md +++ b/.deepwork/rules/manual-test-created-mode.md @@ -2,6 +2,7 @@ name: "Manual Test: Created Mode" created: manual_tests/test_created_mode/*.yml compare_to: prompt +prompt_runtime: send_to_stopping_agent --- # Manual Test: Created Mode (File Creation Trigger) diff --git a/.deepwork/rules/manual-test-infinite-block-prompt.md b/.deepwork/rules/manual-test-infinite-block-prompt.md index 67c97414..7f9d629a 100644 --- a/.deepwork/rules/manual-test-infinite-block-prompt.md +++ b/.deepwork/rules/manual-test-infinite-block-prompt.md @@ -2,6 +2,7 @@ name: "Manual Test: Infinite Block Prompt" trigger: manual_tests/test_infinite_block_prompt/test_infinite_block_prompt.py compare_to: prompt +prompt_runtime: send_to_stopping_agent --- # Manual Test: Infinite Block Prompt (Promise Required) diff --git a/.deepwork/rules/manual-test-multi-safety.md b/.deepwork/rules/manual-test-multi-safety.md index 4ce978cb..3e19a710 100644 --- a/.deepwork/rules/manual-test-multi-safety.md +++ b/.deepwork/rules/manual-test-multi-safety.md @@ -5,6 +5,7 @@ safety: - manual_tests/test_multi_safety/test_multi_safety_changelog.md - manual_tests/test_multi_safety/test_multi_safety_version.txt compare_to: prompt +prompt_runtime: send_to_stopping_agent --- # Manual Test: Multiple Safety Patterns diff --git a/.deepwork/rules/manual-test-pair-mode.md b/.deepwork/rules/manual-test-pair-mode.md index 9c2379bf..d0ed65ef 100644 --- a/.deepwork/rules/manual-test-pair-mode.md +++ b/.deepwork/rules/manual-test-pair-mode.md @@ -4,6 +4,7 @@ pair: trigger: manual_tests/test_pair_mode/test_pair_mode_trigger.py expects: manual_tests/test_pair_mode/test_pair_mode_expected.md compare_to: prompt +prompt_runtime: send_to_stopping_agent --- # Manual Test: Pair Mode (Directional Correspondence) diff --git a/.deepwork/rules/manual-test-set-mode.md b/.deepwork/rules/manual-test-set-mode.md index abe504ec..41e38b63 100644 --- a/.deepwork/rules/manual-test-set-mode.md +++ b/.deepwork/rules/manual-test-set-mode.md @@ -4,6 +4,7 @@ set: - manual_tests/test_set_mode/test_set_mode_source.py - manual_tests/test_set_mode/test_set_mode_test.py compare_to: prompt +prompt_runtime: send_to_stopping_agent --- # Manual Test: Set Mode (Bidirectional Correspondence) diff --git a/.deepwork/rules/manual-test-trigger-safety.md b/.deepwork/rules/manual-test-trigger-safety.md index b144a2a0..be391dd6 100644 --- a/.deepwork/rules/manual-test-trigger-safety.md +++ b/.deepwork/rules/manual-test-trigger-safety.md @@ -3,6 +3,7 @@ name: "Manual Test: Trigger Safety" trigger: manual_tests/test_trigger_safety_mode/test_trigger_safety_mode.py safety: manual_tests/test_trigger_safety_mode/test_trigger_safety_mode_doc.md compare_to: prompt +prompt_runtime: send_to_stopping_agent --- # Manual Test: Trigger/Safety Mode diff --git a/.deepwork/rules/readme-accuracy.md b/.deepwork/rules/readme-accuracy.md index 9e75c596..e04672a3 100644 --- a/.deepwork/rules/readme-accuracy.md +++ b/.deepwork/rules/readme-accuracy.md @@ -3,9 +3,12 @@ name: README Accuracy trigger: src/**/* safety: README.md compare_to: base +prompt_runtime: claude --- Source code in src/ has been modified. Please review README.md for accuracy: 1. Verify project overview still reflects current functionality 2. Check that usage examples are still correct 3. Ensure installation/setup instructions remain valid 4. 
Update any sections that reference changed code + +If the README needs updates, make the changes directly. If the README is accurate, confirm it matches the current implementation. diff --git a/.deepwork/rules/skill-template-best-practices.md b/.deepwork/rules/skill-template-best-practices.md index ff33ecfd..941cff57 100644 --- a/.deepwork/rules/skill-template-best-practices.md +++ b/.deepwork/rules/skill-template-best-practices.md @@ -2,6 +2,7 @@ name: Skill Template Best Practices trigger: src/deepwork/templates/**/skill-job*.jinja compare_to: prompt +prompt_runtime: send_to_stopping_agent --- Skill template files are being modified. Ensure the generated skills follow these best practices: diff --git a/.deepwork/rules/standard-jobs-source-of-truth.md b/.deepwork/rules/standard-jobs-source-of-truth.md index 2d0092c9..086b5707 100644 --- a/.deepwork/rules/standard-jobs-source-of-truth.md +++ b/.deepwork/rules/standard-jobs-source-of-truth.md @@ -7,6 +7,7 @@ safety: - src/deepwork/standard_jobs/deepwork_jobs/**/* - src/deepwork/standard_jobs/deepwork_rules/**/* compare_to: base +prompt_runtime: send_to_stopping_agent --- You modified files in `.deepwork/jobs/deepwork_jobs/` or `.deepwork/jobs/deepwork_rules/`. diff --git a/.deepwork/rules/version-and-changelog-update.md b/.deepwork/rules/version-and-changelog-update.md index ac617f8e..9d0497af 100644 --- a/.deepwork/rules/version-and-changelog-update.md +++ b/.deepwork/rules/version-and-changelog-update.md @@ -5,6 +5,7 @@ safety: - pyproject.toml - CHANGELOG.md compare_to: base +prompt_runtime: send_to_stopping_agent --- Source code in src/ has been modified. **You MUST evaluate whether version and changelog updates are needed.** diff --git a/CHANGELOG.md b/CHANGELOG.md index 1a6a3ce0..654b696d 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -5,49 +5,10 @@ All notable changes to DeepWork will be documented in this file. The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/), and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html). 
-## [0.5.2] - 2026-01-22 - -### Fixed -- Fixed COMMAND rules promise handling to properly update queue status - - When an agent provides a promise tag for a FAILED command rule, the queue entry is now correctly updated to SKIPPED status - - Previously, FAILED queue entries remained in FAILED state even after being acknowledged via promise - - This ensures the rules queue accurately reflects rule state throughout the workflow - -## [0.5.1] - 2026-01-22 - -### Fixed -- Fixed quality criteria validation logic in skill template (#111) - - Changed promise condition from AND to OR: promise OR all criteria met now passes - - Changed failure condition from OR to AND: requires both criteria NOT met AND promise missing to fail - - This corrects the logic so the promise mechanism properly serves as a bypass for quality criteria - -## [0.5.0] - 2026-01-20 - -### Changed -- **BREAKING**: Renamed `document_type` to `doc_spec` throughout the codebase - - Job.yml field: `document_type` → `doc_spec` (e.g., `outputs: [{file: "report.md", doc_spec: ".deepwork/doc_specs/report.md"}]`) - - Class: `DocumentTypeDefinition` → `DocSpec` (backward compat alias provided) - - Methods: `has_document_type()` → `has_doc_spec()`, `validate_document_type_references()` → `validate_doc_spec_references()` - - Template variables: `has_document_type` → `has_doc_spec`, `document_type` → `doc_spec` - - Internal: `_load_document_type()` → `_load_doc_spec()`, `_doc_type_cache` → `_doc_spec_cache` +## [0.4.0] - 2026-01-22 ### Added -- Comprehensive tests for generator doc spec integration (9 new tests) - - `test_load_doc_spec_returns_parsed_spec` - Verifies doc spec loading - - `test_load_doc_spec_caches_result` - Verifies caching behavior - - `test_load_doc_spec_returns_none_for_missing_file` - Graceful handling of missing files - - `test_generate_step_skill_with_doc_spec` - End-to-end skill generation with doc spec - - `test_build_step_context_includes_doc_spec_info` - Context building verification - -### Migration Guide -- Update job.yml files: Change `document_type:` to `doc_spec:` in output definitions -- Update any code importing `DocumentTypeDefinition`: Use `DocSpec` instead (alias still works) -- Run `deepwork install` to regenerate skills with updated terminology - -## [0.4.0] - 2026-01-20 - -### Added -- Doc specs (document specifications) as a first-class feature for formalizing document quality criteria +- **Doc specs** (document specifications) as a first-class feature for formalizing document quality criteria - New `src/deepwork/schemas/doc_spec_schema.py` with JSON schema validation - New `src/deepwork/core/doc_spec_parser.py` with parser for frontmatter markdown doc spec files - Doc spec files stored in `.deepwork/doc_specs/` directory with quality criteria and example documents @@ -55,23 +16,47 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 - Extended job.yml output schema to support doc spec references - Outputs can now be strings (backward compatible) or objects with `file` and optional `doc_spec` fields - Example: `outputs: [{file: "report.md", doc_spec: ".deepwork/doc_specs/monthly_report.md"}]` - - The `doc_spec` uses the full path to the doc spec file, making references self-documenting -- Doc spec-aware skill generation - - Step skills now include doc spec quality criteria, target audience, and example documents - - Both Claude and Gemini templates updated for doc spec rendering -- Document detection workflow in `deepwork_jobs.define` - - Steps 1.5, 1.6, 1.7 guide users 
through creating doc specs for document-oriented jobs - - Pattern indicators: "report", "summary", "create", "monthly", "for stakeholders" -- Doc spec improvement workflow in `deepwork_jobs.learn` - - Steps 3.5, 4.5 capture doc spec-related learnings and update doc spec files -- New `OutputSpec` dataclass in parser for structured output handling -- Comprehensive doc spec documentation in `doc/doc-specs.md` -- New test fixtures for doc spec validation and parsing +- Doc spec-aware skill generation with quality criteria, target audience, and example documents +- **`prompt_runtime` setting** for rules to control how prompt-type actions are executed + - `send_to_stopping_agent` (default): Returns prompt to the agent that triggered the rule + - `claude`: Invokes Claude Code in headless mode to handle the rule independently +- Claude headless mode execution for automated rule remediation + - Rules with `prompt_runtime: claude` spawn a separate Claude process + - Claude performs required actions and returns structured `block`/`allow` decision + - Useful for automated tasks like documentation updates without blocking the main agent +- **`deepwork rules clear_queue` CLI command** for managing the rules queue (#117) + - Clears all entries from the rules queue to reset state +- Code review stage added to the `commit` standard job (#99) + - New `commit.review` step runs before testing to catch issues early +- Session start hook for version checking (#106) +- Manual tests job for validating hook/rule behavior (#102) ### Changed +- **BREAKING**: Renamed `document_type` to `doc_spec` throughout the codebase + - Job.yml field: `document_type` → `doc_spec` + - Class: `DocumentTypeDefinition` → `DocSpec` (backward compat alias provided) + - Methods: `has_document_type()` → `has_doc_spec()`, `validate_document_type_references()` → `validate_doc_spec_references()` - `Step.outputs` changed from `list[str]` to `list[OutputSpec]` for richer output metadata - `SkillGenerator.generate_all_skills()` now accepts `project_root` parameter for doc spec loading - Updated `deepwork_jobs` to v0.6.0 with doc spec-related quality criteria +- Skill template documentation now uses generic "agent" terminology (#115) + +### Fixed +- Fixed infinite loop bug in rules system when promise tags weren't recognized (#96) + - Rules now properly detect and honor promise acknowledgments +- Fixed COMMAND rules promise handling to properly update queue status (#120) + - FAILED queue entries now correctly update to SKIPPED when acknowledged via promise +- Fixed quality criteria validation logic in skill template (#113) + - Promise OR all criteria met now passes (was incorrectly AND) + - Requires both criteria NOT met AND promise missing to fail +- Fixed `compare_to: prompt` mode not detecting committed files during agent response (#95) + - Rules now search prompts for directory references +- Added timeout to deepwork install hook (#101) + +### Migration Guide +- Update job.yml files: Change `document_type:` to `doc_spec:` in output definitions +- Update any code importing `DocumentTypeDefinition`: Use `DocSpec` instead (alias still works) +- Run `deepwork install` to regenerate skills with updated terminology ## [0.3.1] - 2026-01-20 @@ -180,7 +165,6 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 Initial version. 
-[0.5.0]: https://github.com/anthropics/deepwork/releases/tag/0.5.0 [0.4.0]: https://github.com/anthropics/deepwork/releases/tag/0.4.0 [0.3.1]: https://github.com/anthropics/deepwork/releases/tag/0.3.1 [0.3.0]: https://github.com/anthropics/deepwork/releases/tag/0.3.0 diff --git a/README.md b/README.md index 7c54d29d..ea5c38ce 100644 --- a/README.md +++ b/README.md @@ -313,6 +313,21 @@ compare_to: prompt --- ``` +**Example Rule with Claude Runtime** (`.deepwork/rules/readme-accuracy.md`): +```markdown +--- +name: README Accuracy +trigger: "src/**/*.py" +compare_to: prompt +prompt_runtime: claude +--- +Source code has been modified. Review README.md for accuracy and update if needed. +``` + +The `prompt_runtime` setting controls how prompt-based rules are executed: +- `send_to_stopping_agent` (default): Returns the rule prompt to the agent that triggered it +- `claude`: Invokes Claude Code in headless mode to evaluate the rule independently + ### Multi-Platform Support Generate native commands and skills tailored for your AI coding assistant. - **Native Integration**: Works directly with the skill/command formats of supported agents. diff --git a/doc/architecture.md b/doc/architecture.md index 95617d1f..4997f447 100644 --- a/doc/architecture.md +++ b/doc/architecture.md @@ -1043,7 +1043,7 @@ Please create or update tests for the modified source files. ### Detection Modes -Rules support three detection modes: +Rules support four detection modes: **1. Trigger/Safety (default)** - Fire when trigger matches but safety doesn't: ```yaml @@ -1078,6 +1078,16 @@ compare_to: base --- ``` +**4. Created** - Fire when newly created files match patterns: +```yaml +--- +name: New Component Checklist +created: "src/components/**/*.tsx" +compare_to: base +--- +``` +This mode triggers only for files that are newly created (not modified), useful for enforcing standards on new files. + ### Action Types **1. Prompt (default)** - Show instructions to the agent: @@ -1102,6 +1112,42 @@ compare_to: prompt --- ``` +### Prompt Runtime + +For prompt-type actions, you can specify how the prompt is delivered using the `prompt_runtime` setting: + +**1. send_to_stopping_agent (default)** - Return the prompt to the agent that triggered the rule: +```yaml +--- +name: Security Review +trigger: "src/auth/**/*" +compare_to: base +prompt_runtime: send_to_stopping_agent +--- +Please check for hardcoded credentials and validate input. +``` + +**2. claude** - Invoke Claude Code in headless mode to handle the rule: +```yaml +--- +name: Architecture Documentation Accuracy +trigger: "src/deepwork/core/**/*.py" +safety: "doc/architecture.md" +compare_to: base +prompt_runtime: claude +--- +Review doc/architecture.md for accuracy against the current implementation. +``` + +When `prompt_runtime: claude` is set, the rule evaluation: +1. Spawns a separate Claude Code process in headless mode +2. Passes the rule instructions as a prompt +3. Claude performs the required actions (e.g., updating documentation) +4. Returns a structured `block` or `allow` decision +5. If `allow`, the rule is marked as passed without blocking the original agent + +This is useful for automated remediation tasks that don't require user interaction. + ### Rule Evaluation Flow 1. **Session Start**: When a Claude Code session begins, the baseline git state is captured @@ -1289,15 +1335,21 @@ See `doc/doc-specs.md` for complete documentation. ### Rule Schema -Rules are validated against a JSON Schema: +Rules are validated against a JSON Schema. 
The frontmatter supports these fields: -```yaml -- name: string # Required: Friendly name for the rule - trigger: string|array # Required: Glob pattern(s) for triggering files - safety: string|array # Optional: Glob pattern(s) for safety files - instructions: string # Required (unless instructions_file): What to do - instructions_file: string # Alternative: Path to instructions file -``` +| Field | Required | Description | +|-------|----------|-------------| +| `name` | Yes | Human-friendly name for the rule (displayed in promise tags) | +| `compare_to` | Yes | Baseline for detecting file changes: `base`, `default_tip`, or `prompt` | +| `trigger` | One mode required | Glob pattern(s) for triggering files (trigger/safety mode) | +| `safety` | No | Glob pattern(s) that suppress the rule if changed | +| `set` | One mode required | Array of patterns for bidirectional correspondence | +| `pair` | One mode required | Object with `trigger` and `expects` for directional correspondence | +| `created` | One mode required | Glob pattern(s) for newly created files | +| `action` | No | Object with `command` and optional `run_for` for command actions | +| `prompt_runtime` | No | `send_to_stopping_agent` (default) or `claude` for headless execution | + +The markdown body after the frontmatter contains the instructions for prompt-type rules. ### Defining Rules diff --git a/doc/debugging_history/claude_subprocess_investigation.md b/doc/debugging_history/claude_subprocess_investigation.md new file mode 100644 index 00000000..d3737e63 --- /dev/null +++ b/doc/debugging_history/claude_subprocess_investigation.md @@ -0,0 +1,164 @@ +# Claude Subprocess Investigation + +**Date**: 2026-01-22 +**Branch**: `claude/add-prompt-runtime-setting-gPJDA` +**Issue**: Running Claude as a subprocess from within Claude Code hangs indefinitely + +## Problem Statement + +The `prompt_runtime: claude` feature in DeepWork rules is designed to invoke Claude Code in headless mode to autonomously evaluate rules. When a rule with this setting triggers, the hook should: + +1. Spawn `claude --print` as a subprocess +2. Send the rule prompt to it +3. Parse the response for allow/block decision +4. Return the result + +However, when running inside a Claude Code session, this subprocess invocation hangs indefinitely. + +## Environment + +- Claude Code version: 2.1.15 +- Platform: macOS (Darwin 25.2.0) +- Python: 3.11.14 (nix-managed) +- Environment variables: `CLAUDECODE=1`, `CLAUDE_CODE_ENTRYPOINT=cli` + +## What Works + +### Direct Bash Execution (via Claude's Bash tool) +```bash +echo "Say TEST" | claude --print --output-format json 2>&1 | cat +# Returns JSON response immediately +``` + +### Piping through head -1 +```bash +echo "Say TEST" | claude --print --output-format json 2>&1 | head -1 +# Returns JSON and terminates cleanly +``` + +### Python heredoc script (run directly in bash) +```bash +python3 << 'EOF' +import subprocess +# ... subprocess code ... 
+EOF +# Works correctly +``` + +## What Doesn't Work + +### Python subprocess.run from within Claude +```python +# This hangs indefinitely when run from Python inside Claude Code +result = subprocess.run( + ["claude", "--print", "prompt"], + capture_output=True, + timeout=30, +) +``` + +### Shell=True with pipes +```python +# Also hangs +subprocess.run( + 'echo "prompt" | claude --print', + shell=True, + capture_output=True, +) +``` + +### Popen with various options +```python +# All of these hang: +# - start_new_session=True +# - close_fds=True +# - stdin=subprocess.DEVNULL +# - Writing to temp file instead of capture_output +``` + +### Environment variable clearing +```bash +# Still hangs +CLAUDECODE= timeout 15 bash -c 'echo "test" | claude --print' +env -i PATH="$PATH" HOME="$HOME" timeout 15 bash -c 'echo "test" | claude --print' +``` + +## Key Observations + +1. **Direct bash works, Python subprocess doesn't**: The exact same command that works when run via Claude's Bash tool hangs when run via Python's subprocess module. + +2. **Piping to `head -1` helps in some cases**: The command `| head -1` causes Claude to terminate after outputting the JSON line, but this doesn't help when the subprocess itself never starts producing output. + +3. **The hang occurs at the subprocess level**: Python's subprocess.run times out waiting for the process, suggesting Claude itself is blocked on something. + +4. **`--output-format json` is required**: Without this, Claude hangs even longer (possibly waiting for terminal interaction). + +5. **Hooks configuration doesn't prevent the hang**: Using `--settings '{"hooks": {}}'` to disable hooks in the subprocess doesn't help. + +## Research Findings + +### Related GitHub Issues +- [#1481 - Background Process Hangs](https://github.com/anthropics/claude-code/issues/1481): Claude Code waits for child processes even when backgrounded +- [#13598 - /dev/tty hang](https://github.com/anthropics/claude-code/issues/13598): Claude can hang when accessing terminal devices +- Subagent documentation states: "Subagents cannot spawn other subagents" - suggesting nested invocation is intentionally limited + +### Root Cause Hypothesis +Claude Code appears to manage its process tree in a way that blocks nested Claude invocations. When running as a subprocess of another Claude instance (detected via `CLAUDECODE=1` environment variable or process hierarchy), the child Claude may be waiting for resources held by the parent. + +## Attempted Solutions + +### 1. Use `--output-format json` + `| head -1` +**Result**: Works from bash, still hangs from Python subprocess + +### 2. Write to temp file instead of capturing output +**Result**: Still hangs - the file remains empty + +### 3. Clear CLAUDECODE environment variable +**Result**: Still hangs - the detection/blocking isn't based on this variable alone + +### 4. Use `start_new_session=True` for process isolation +**Result**: Still hangs + +### 5. Fall back to returning prompt to agent when inside Claude +**Result**: Works but defeats the purpose of `prompt_runtime: claude` + +### 6. Change hook command to use `uv run python` +**Result**: Still hangs - the issue is the nested Claude invocation, not Python version + +## Recommended Next Steps + +1. **Test immediate "allow" return**: Modify the code to immediately return "allow" for claude runtime rules to verify the rest of the flow works. + +2. **Create bash wrapper script**: Instead of invoking Claude from Python, create a standalone bash script that the hook can call. 
This might bypass the subprocess blocking. + +3. **Investigate Claude's process management**: Look at Claude Code's source or documentation for how it handles child processes and whether there's an API for nested invocation. + +4. **External execution approach**: Consider having the hook queue the rule evaluation and have an external process (outside Claude) handle the actual Claude invocation. + +5. **Test from CI/cron**: Verify that `prompt_runtime: claude` works correctly when invoked from outside a Claude session (e.g., from GitHub Actions or a cron job). + +## Code Changes Made + +The following changes were made to `src/deepwork/hooks/rules_check.py` during this investigation: + +1. Added `is_inside_claude_session()` function to detect nested Claude context +2. Added `--output-format json` to get structured output +3. Added `| head -1` pipe to force clean termination +4. Added temp file approach for prompt/output handling +5. Added extensive comments explaining the sensitivity of the subprocess code + +## Files Modified (not yet committed) + +- `src/deepwork/hooks/rules_check.py` - Multiple changes to invoke_claude_headless() +- `.claude/settings.json` - Changed hook command to use `uv run python` +- `.deepwork/jobs/manual_tests/job.yml` - Added functionality_tests step +- `.deepwork/jobs/manual_tests/steps/functionality_tests.md` - Test instructions + +## Conclusion + +Running Claude as a subprocess from within a Claude Code session appears to be blocked at a fundamental level. The solution likely requires either: +- An official API for nested Claude invocation +- Running the subprocess invocation from outside the Claude process tree +- Accepting the limitation and falling back to returning prompts to the agent + +The `prompt_runtime: claude` feature should work correctly when invoked from external automation (CI, cron, etc.) but cannot work when running inside Claude Code itself. diff --git a/doc/rules_syntax.md b/doc/rules_syntax.md index 2ab86be1..eba0dbfc 100644 --- a/doc/rules_syntax.md +++ b/doc/rules_syntax.md @@ -269,6 +269,19 @@ If an existing file `src/api/users.py` is modified: The markdown body after frontmatter serves as instructions shown to the agent. This is the default when no `action` field is specified. +**Prompt Runtime:** + +Prompt actions can be executed in two ways, controlled by the `prompt_runtime` field: + +| Runtime | Description | +|---------|-------------| +| `send_to_stopping_agent` | Return prompt to the triggering agent (default) | +| `claude` | Invoke Claude Code in headless mode | + +The default (`send_to_stopping_agent`) returns the rule's markdown instructions to whatever agent triggered the hook. The agent sees the instructions and responds accordingly. + +With `claude` runtime, the system invokes Claude Code in headless mode to process the rule autonomously. Claude receives the instructions, performs the requested task, and returns a structured result indicating success or failure. + **Template Variables in Instructions:** | Variable | Description | @@ -483,6 +496,50 @@ compare_to: base --- ``` +### prompt_runtime (optional) + +Determines how prompt actions are executed. Only applies to rules with prompt actions (no `action` field). 
+ +| Value | Description | +|-------|-------------| +| `send_to_stopping_agent` | Return the prompt to the agent that triggered the rule (default) | +| `claude` | Invoke Claude Code in headless mode to process the prompt | + +```yaml +--- +prompt_runtime: send_to_stopping_agent +--- +``` + +**Default behavior (`send_to_stopping_agent`):** + +The rule's markdown body is returned to the agent that triggered the hook. The agent sees the instructions and can respond accordingly, using promise tags to acknowledge the rule. + +**Claude runtime (`claude`):** + +Instead of returning instructions to the triggering agent, Claude Code is invoked in headless mode with the rule's instructions. Claude processes the prompt autonomously and returns a structured response indicating whether the rule was satisfied. + +This is useful when: +- You want rules to be handled by a dedicated Claude instance +- The triggering agent is not Claude (e.g., Gemini) +- You want consistent rule processing regardless of which agent triggered it + +Example with Claude runtime: +```yaml +--- +name: Auto Code Review +trigger: src/**/*.py +compare_to: prompt +prompt_runtime: claude +--- +Review the following Python code changes for: +1. Type safety issues +2. Missing error handling +3. Code style violations + +If issues found, fix them directly. +``` + ## Complete Examples ### Example 1: Test Coverage Rule @@ -623,6 +680,32 @@ action: Automatically lints newly created React components. ``` +### Example 8: Claude-Powered Code Review + +`.deepwork/rules/security-review.md`: +```markdown +--- +name: Security Review +trigger: + - src/auth/**/* + - src/api/**/* +compare_to: prompt +prompt_runtime: claude +--- +Security-sensitive code has been modified. Review for: + +1. **Input validation**: All user inputs are validated and sanitized +2. **Authentication**: Auth checks are properly implemented +3. **Authorization**: Access controls are correctly applied +4. **Secrets**: No hardcoded credentials or API keys +5. **SQL/Injection**: Parameterized queries used, no string concatenation + +If you find any issues, fix them directly in the code. +If the code passes review, confirm it meets security standards. +``` + +This rule invokes Claude Code in headless mode to perform an autonomous security review when auth or API code changes. Claude will analyze the changes and either fix issues directly or confirm the code is secure. + ## Promise Tags When a rule fires but should be dismissed, use promise tags in the conversation. The tag content should be human-readable, using the rule's `name` field: diff --git a/manual_tests/test_claude_runtime/test_claude_runtime_code.py b/manual_tests/test_claude_runtime/test_claude_runtime_code.py new file mode 100644 index 00000000..11208126 --- /dev/null +++ b/manual_tests/test_claude_runtime/test_claude_runtime_code.py @@ -0,0 +1,29 @@ +# Manual Test: Claude Runtime +# This file triggers a rule that uses the 'claude' prompt_runtime. +# +# When this file is edited, the rule should: +# 1. Invoke Claude Code in headless mode +# 2. Claude reviews the code and responds with a structured result +# 3. The hook parses Claude's response (block/allow) +# +# To test: +# 1. Introduce a BLATANT ERROR in the code below (e.g., undefined variable, +# obvious bug like dividing by zero, or completely wrong logic) +# 2. The Claude runtime will invoke Claude Code in headless mode +# 3. Claude should detect the error and return a "block" response +# 4. 
In Claude Code Web environment, you'll see the fallback prompt instead + + +def calculate_sum(numbers: list[int]) -> int: + """Calculate the sum of a list of numbers.""" + total = 0 + for num in numbers: + total += num + return total + + +def calculate_average(numbers: list[int]) -> float: + """Calculate the average of a list of numbers.""" + if not numbers: + return 0.0 + return calculate_sum(numbers) / len(numbers) diff --git a/pyproject.toml b/pyproject.toml index 1c5f8e02..78db1439 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -1,6 +1,6 @@ [project] name = "deepwork" -version = "0.5.2" +version = "0.4.0" description = "Framework for enabling AI agents to perform complex, multi-step work tasks" readme = "README.md" requires-python = ">=3.11" @@ -108,3 +108,9 @@ warn_redundant_casts = true warn_unused_ignores = true warn_no_return = true strict_equality = true + +[dependency-groups] +dev = [ + "gitpython>=3.1.46", + "pytest>=9.0.2", +] diff --git a/src/deepwork/core/rules_parser.py b/src/deepwork/core/rules_parser.py index 04b1e3d2..7f516592 100644 --- a/src/deepwork/core/rules_parser.py +++ b/src/deepwork/core/rules_parser.py @@ -39,9 +39,19 @@ class ActionType(Enum): COMMAND = "command" # Run an idempotent command +class PromptRuntime(Enum): + """Runtime for executing prompt actions.""" + + SEND_TO_STOPPING_AGENT = "send_to_stopping_agent" # Return prompt to agent (default) + CLAUDE = "claude" # Invoke Claude Code in headless mode + + # Valid compare_to values COMPARE_TO_VALUES = frozenset({"base", "default_tip", "prompt"}) +# Valid prompt_runtime values +PROMPT_RUNTIME_VALUES = frozenset({"send_to_stopping_agent", "claude"}) + @dataclass class CommandAction: @@ -85,6 +95,9 @@ class Rule: instructions: str = "" # For PROMPT action (markdown body) command_action: CommandAction | None = None # For COMMAND action + # Prompt runtime (only relevant for PROMPT action type) + prompt_runtime: PromptRuntime = PromptRuntime.SEND_TO_STOPPING_AGENT + @classmethod def from_frontmatter( cls, @@ -179,6 +192,16 @@ def from_frontmatter( # Get compare_to (required field) compare_to = frontmatter["compare_to"] + # Get prompt_runtime (optional, defaults to send_to_stopping_agent) + prompt_runtime_str = frontmatter.get("prompt_runtime", "send_to_stopping_agent") + try: + prompt_runtime = PromptRuntime(prompt_runtime_str) + except ValueError: + raise RulesParseError( + f"Rule '{name}' has invalid prompt_runtime '{prompt_runtime_str}'. " + f"Valid values: {', '.join(PROMPT_RUNTIME_VALUES)}" + ) from None + return cls( name=name, filename=filename, @@ -192,6 +215,7 @@ def from_frontmatter( instructions=markdown_body.strip(), command_action=command_action, compare_to=compare_to, + prompt_runtime=prompt_runtime, ) diff --git a/src/deepwork/hooks/rules_check.py b/src/deepwork/hooks/rules_check.py index 6ac2d652..ee952dc7 100644 --- a/src/deepwork/hooks/rules_check.py +++ b/src/deepwork/hooks/rules_check.py @@ -34,6 +34,7 @@ from deepwork.core.rules_parser import ( ActionType, DetectionMode, + PromptRuntime, Rule, RuleEvaluationResult, RulesParseError, @@ -534,6 +535,193 @@ def format_rules_message(results: list[RuleEvaluationResult]) -> str: return "\n".join(lines) +def format_claude_prompt(result: RuleEvaluationResult, transcript_path: str | None = None) -> str: + """ + Format a rule evaluation result as a prompt for Claude Code headless mode. + + The prompt includes the rule instructions and expects Claude to return + a structured response indicating whether to block or allow. 
+ + Args: + result: The rule evaluation result + transcript_path: Optional path to the conversation transcript file + + Returns: + Formatted prompt string for Claude + """ + rule = result.rule + lines = [ + "# DeepWork Rule Evaluation", + "", + f"Rule: {rule.name}", + "", + ] + + # Add transcript location for conversation context + if transcript_path: + lines.append("## Conversation Context") + lines.append("") + lines.append(f"The conversation transcript is located at: {transcript_path}") + lines.append("You can read this file to understand the context of the changes being made.") + lines.append("") + + # Add trigger file context + if result.trigger_files: + lines.append("Trigger files:") + for f in result.trigger_files: + lines.append(f" - {f}") + lines.append("") + + # For set/pair modes, show missing files + if result.missing_files: + lines.append("Expected files (not changed):") + for f in result.missing_files: + lines.append(f" - {f}") + lines.append("") + + # Add the rule instructions + lines.append("## Instructions") + lines.append("") + if rule.instructions: + lines.append(rule.instructions.strip()) + lines.append("") + + # Add response format instructions + lines.extend( + [ + "## Response Format", + "", + "After completing the task above, you MUST end your response with a structured block:", + "", + "```", + "---RULE_RESULT---", + 'decision: <"block" or "allow">', + "reason: ", + "---END_RULE_RESULT---", + "```", + "", + "Use 'block' if the rule violation was not resolved, 'allow' if it was resolved.", + ] + ) + + return "\n".join(lines) + + +def parse_claude_response(output: str) -> tuple[str, str]: + """ + Parse the structured response from Claude Code headless mode. + + Returns (decision, reason) tuple. Defaults to ("block", "No response") if parsing fails. + """ + # Look for the structured result block + pattern = r"---RULE_RESULT---\s*\n\s*decision:\s*[\"']?(\w+)[\"']?\s*\n\s*reason:\s*(.+?)\s*\n\s*---END_RULE_RESULT---" + match = re.search(pattern, output, re.IGNORECASE | re.DOTALL) + + if match: + decision = match.group(1).lower().strip() + reason = match.group(2).strip() + # Normalize decision + if decision not in ("block", "allow"): + decision = "block" + return decision, reason + + # If no structured block found, default to block + return "block", "Claude did not return a structured response" + + +def is_claude_code_remote() -> bool: + """Check if running in Claude Code Web/Remote environment.""" + return os.environ.get("CLAUDE_CODE_REMOTE", "").lower() == "true" + + +def invoke_claude_headless(prompt: str, rule_name: str) -> tuple[str, str, str | None]: + """ + Invoke Claude Code in headless mode with the given prompt. 
+ + Args: + prompt: The prompt to send to Claude + rule_name: Name of the rule being evaluated (for error messages) + + Returns: + Tuple of (decision, reason, fallback_prompt) where: + - decision is "block" or "allow" + - reason is the explanation + - fallback_prompt is the prompt to show to agent if Claude can't run (or None) + """ + import tempfile + + # Check if we're in Claude Code Web/Remote environment + if is_claude_code_remote(): + fallback_msg = ( + "**Cannot run `claude` command in Claude Code Web environment.**\n\n" + "Please evaluate the following rule in a sub-agent:\n\n" + f"---\n{prompt}\n---" + ) + return "block", f"Rule '{rule_name}' requires manual evaluation", fallback_msg + + output_path = None + try: + # Create a temporary file for capturing output + # IMPORTANT: We redirect stdout/stderr to a file instead of using pipes + # (capture_output=True). This is critical because when Claude runs as a + # subprocess of another Claude instance, using pipes holds the parent's + # stdout file descriptor open. This blocks the snapshotter in the parent + # Claude, causing a 60-second timeout delay when the subprocess runs. + # By writing to a file and reading it after, we avoid this blocking issue. + with tempfile.NamedTemporaryFile( + mode="w", + suffix="_claude_output.log", + delete=False, + prefix="deepwork_", + ) as tmp: + output_path = tmp.name + + # Run claude in headless mode with output redirected to file + with open(output_path, "w") as outfile: + process = subprocess.Popen( + ["claude", "--print", "--dangerously-skip-permissions", "-p", prompt], + stdout=outfile, + stderr=subprocess.STDOUT, # Merge stderr into the same file + close_fds=True, # Close inherited file descriptors to prevent blocking + cwd=Path.cwd(), + ) + + try: + # Wait for completion with timeout + process.wait(timeout=300) # 5 minute timeout + except subprocess.TimeoutExpired: + process.kill() + process.wait() + return "block", f"Claude timed out while processing rule '{rule_name}'", None + + # Read the output from the file + with open(output_path, "r") as f: + output = f.read().strip() + + if process.returncode != 0: + error_msg = output or "Unknown error" + return "block", f"Claude execution failed: {error_msg}", None + + decision, reason = parse_claude_response(output) + return decision, reason, None + + except FileNotFoundError: + return ( + "block", + "Claude CLI not found. Please ensure 'claude' is installed and in PATH", + None, + ) + except Exception as e: + return "block", f"Error invoking Claude: {str(e)}", None + finally: + # Clean up the temporary file + if output_path: + try: + Path(output_path).unlink(missing_ok=True) + except Exception: + pass # Ignore cleanup errors + + def rules_check_hook(hook_input: HookInput) -> HookOutput: """ Main hook logic for rules evaluation (v2). @@ -614,13 +802,17 @@ def rules_check_hook(hook_input: HookInput) -> HookOutput: ): continue - # For PROMPT rules, also skip if already QUEUED (already shown to agent). - # This prevents infinite loops when transcript is unavailable or promise - # tags haven't been written yet. The agent has already seen this rule. + # For PROMPT rules with send_to_stopping_agent runtime, also skip if + # already QUEUED (already shown to agent). This prevents infinite loops + # when transcript is unavailable or promise tags haven't been written yet. + # The agent has already seen this rule. 
+ # Note: Claude runtime rules should NOT be skipped here because they're + # executed by a separate Claude process, not shown to the stopping agent. if ( existing and existing.status == QueueEntryStatus.QUEUED and rule.action_type == ActionType.PROMPT + and rule.prompt_runtime == PromptRuntime.SEND_TO_STOPPING_AGENT ): continue @@ -705,6 +897,62 @@ def rules_check_hook(hook_input: HookInput) -> HookOutput: ), ) + # Separate prompt results by runtime + agent_prompt_results: list[RuleEvaluationResult] = [] + claude_prompt_results: list[RuleEvaluationResult] = [] + + for result in prompt_results: + if result.rule.prompt_runtime == PromptRuntime.CLAUDE: + claude_prompt_results.append(result) + else: + agent_prompt_results.append(result) + + # Process Claude runtime rules + claude_errors: list[str] = [] + claude_fallback_prompts: list[str] = [] + for result in claude_prompt_results: + rule = result.rule + + # Compute trigger hash for queue + baseline_ref = get_baseline_ref(rule.compare_to) + trigger_hash = compute_trigger_hash( + rule.name, + result.trigger_files, + baseline_ref, + ) + + # Invoke Claude in headless mode + prompt = format_claude_prompt(result, hook_input.transcript_path) + decision, reason, fallback_prompt = invoke_claude_headless(prompt, rule.name) + + if fallback_prompt: + # Claude can't run in this environment, return prompt to agent + claude_fallback_prompts.append(f"## {rule.name}\n\n{fallback_prompt}\n") + # Don't update queue status - let agent handle it + elif decision == "allow": + # Claude resolved the issue + queue.update_status( + trigger_hash, + QueueEntryStatus.PASSED, + ActionResult( + type="claude", + output=reason, + exit_code=0, + ), + ) + else: + # Claude could not resolve or blocked + claude_errors.append(f"## {rule.name}\n{reason}\n") + queue.update_status( + trigger_hash, + QueueEntryStatus.FAILED, + ActionResult( + type="claude", + output=reason, + exit_code=1, + ), + ) + # Build response messages: list[str] = [] @@ -715,9 +963,22 @@ def rules_check_hook(hook_input: HookInput) -> HookOutput: messages.extend(command_errors) messages.append("") - # Add prompt rules if any - if prompt_results: - messages.append(format_rules_message(prompt_results)) + # Add Claude errors if any + if claude_errors: + messages.append("## Claude Rule Errors\n") + messages.append("The following rules were processed by Claude but require attention.\n") + messages.extend(claude_errors) + messages.append("") + + # Add Claude fallback prompts (when Claude can't run in this environment) + if claude_fallback_prompts: + messages.append("## Rules Requiring Sub-Agent Evaluation\n") + messages.extend(claude_fallback_prompts) + messages.append("") + + # Add prompt rules if any (send_to_stopping_agent runtime) + if agent_prompt_results: + messages.append(format_rules_message(agent_prompt_results)) if messages: return HookOutput(decision="block", reason="\n".join(messages)) diff --git a/src/deepwork/schemas/rules_schema.py b/src/deepwork/schemas/rules_schema.py index bf091ab9..64e8501c 100644 --- a/src/deepwork/schemas/rules_schema.py +++ b/src/deepwork/schemas/rules_schema.py @@ -87,6 +87,12 @@ "enum": ["base", "default_tip", "prompt"], "description": "Baseline for detecting file changes", }, + "prompt_runtime": { + "type": "string", + "enum": ["send_to_stopping_agent", "claude"], + "default": "send_to_stopping_agent", + "description": "Runtime for prompt action: 'send_to_stopping_agent' returns prompt to the agent that triggered the rule, 'claude' invokes Claude Code in headless mode", + }, 
}, "additionalProperties": False, # Detection mode must be exactly one of: trigger, set, pair, or created diff --git a/tests/shell_script_tests/test_rules_stop_hook.py b/tests/shell_script_tests/test_rules_stop_hook.py index 23418021..e0dc2543 100644 --- a/tests/shell_script_tests/test_rules_stop_hook.py +++ b/tests/shell_script_tests/test_rules_stop_hook.py @@ -305,10 +305,13 @@ class TestRulesStopHookInfiniteLoopPrevention: def test_queued_prompt_rule_does_not_refire( self, src_dir: Path, git_repo_with_src_rule: Path ) -> None: - """Test that a prompt rule with QUEUED status doesn't fire again. + """Test that a send_to_stopping_agent prompt rule with QUEUED status doesn't fire again. This prevents infinite loops when the transcript is unavailable or - promise tags haven't been written yet. + promise tags haven't been written yet. The agent has already seen this rule. + + Note: This only applies to rules with prompt_runtime: send_to_stopping_agent (default). + Claude runtime rules should refire - see test_claude_runtime_rule_refires_when_queued. """ # Create a file that triggers the rule test_src_dir = git_repo_with_src_rule / "src" @@ -408,6 +411,92 @@ def test_promise_tag_still_prevents_firing( finally: os.unlink(transcript_path) + def test_claude_runtime_rule_refires_when_queued( + self, src_dir: Path, tmp_path: Path + ) -> None: + """Test that a claude runtime prompt rule DOES refire when QUEUED. + + Claude runtime rules execute in a separate subprocess, not shown to the + stopping agent. Therefore they should NOT be subject to the infinite loop + prevention that skips QUEUED rules. + + This test uses CLAUDE_CODE_REMOTE=true to simulate claude unavailability, + which causes the rule to remain QUEUED (fallback path) rather than + getting PASSED/FAILED status. + """ + # Create a git repo with a claude runtime rule + repo = Repo.init(tmp_path) + readme = tmp_path / "README.md" + readme.write_text("# Test Project\n") + repo.index.add(["README.md"]) + repo.index.commit("Initial commit") + + # Create rule with prompt_runtime: claude + rules_dir = tmp_path / ".deepwork" / "rules" + rules_dir.mkdir(parents=True, exist_ok=True) + + rule_file = rules_dir / "claude-runtime-rule.md" + rule_file.write_text( + """--- +name: Claude Runtime Rule +trigger: "src/**/*" +compare_to: prompt +prompt_runtime: claude +--- +This is a rule that runs in claude runtime. +Please check the code. 
+""" + ) + + # Set up baseline + deepwork_dir = tmp_path / ".deepwork" + (deepwork_dir / ".last_work_tree").write_text("") + + # Create a file that triggers the rule + test_src_dir = tmp_path / "src" + test_src_dir.mkdir(exist_ok=True) + (test_src_dir / "main.py").write_text("# New file\n") + repo.index.add(["src/main.py"]) + + # Run with CLAUDE_CODE_REMOTE=true so claude returns fallback (stays QUEUED) + env = os.environ.copy() + env["DEEPWORK_HOOK_PLATFORM"] = "claude" + env["CLAUDE_CODE_REMOTE"] = "true" + if src_dir: + env["PYTHONPATH"] = str(src_dir) + + import subprocess + + # First run: rule should fire and create queue entry + result1 = subprocess.run( + ["python", "-m", "deepwork.hooks.rules_check"], + cwd=tmp_path, + capture_output=True, + text=True, + input="", + env=env, + ) + output1 = json.loads(result1.stdout.strip()) + assert output1.get("decision") == "block", f"First run should block: {output1}" + assert "Claude Runtime Rule" in output1.get("reason", "") + + # Second run: rule SHOULD fire again (claude runtime rules are NOT skipped) + result2 = subprocess.run( + ["python", "-m", "deepwork.hooks.rules_check"], + cwd=tmp_path, + capture_output=True, + text=True, + input="", + env=env, + ) + output2 = json.loads(result2.stdout.strip()) + # Claude runtime rules should refire even when QUEUED + assert output2.get("decision") == "block", ( + f"Second run should also block (claude runtime rule should refire): {output2}" + ) + assert "Claude Runtime Rule" in output2.get("reason", "") + + class TestSubagentStopEvent: """Tests for SubagentStop event triggering agentFinished rules.""" diff --git a/tests/unit/test_rules_check.py b/tests/unit/test_rules_check.py index e672fd94..78c7233f 100644 --- a/tests/unit/test_rules_check.py +++ b/tests/unit/test_rules_check.py @@ -1,6 +1,22 @@ """Tests for rules_check hook module.""" -from deepwork.hooks.rules_check import extract_promise_tags +import os +from unittest.mock import patch + +from deepwork.core.rules_parser import ( + DetectionMode, + PairConfig, + PromptRuntime, + Rule, + RuleEvaluationResult, +) +from deepwork.hooks.rules_check import ( + extract_promise_tags, + format_claude_prompt, + invoke_claude_headless, + is_claude_code_remote, + parse_claude_response, +) class TestExtractPromiseTags: @@ -103,3 +119,493 @@ def test_promise_embedded_in_markdown(self) -> None: """ result = extract_promise_tags(text) assert result == {"Architecture Documentation Accuracy", "README Accuracy"} + + +class TestFormatClaudePrompt: + """Tests for format_claude_prompt function.""" + + def test_formats_basic_trigger_safety_rule(self) -> None: + """Test formatting a basic trigger/safety rule for Claude.""" + rule = Rule( + name="Security Review", + filename="security-review", + detection_mode=DetectionMode.TRIGGER_SAFETY, + triggers=["src/auth/**/*"], + safety=[], + instructions="Review the code for security issues.", + compare_to="prompt", + prompt_runtime=PromptRuntime.CLAUDE, + ) + result = RuleEvaluationResult( + rule=rule, + should_fire=True, + trigger_files=["src/auth/login.py"], + ) + + prompt = format_claude_prompt(result) + + assert "Security Review" in prompt + assert "src/auth/login.py" in prompt + assert "Review the code for security issues" in prompt + assert "---RULE_RESULT---" in prompt + assert 'decision: <"block" or "allow">' in prompt + + def test_formats_set_mode_rule_with_missing_files(self) -> None: + """Test formatting a set mode rule showing missing files.""" + rule = Rule( + name="Source/Test Pairing", + 
filename="source-test-pairing", + detection_mode=DetectionMode.SET, + set_patterns=["src/{path}.py", "tests/{path}_test.py"], + instructions="Update the corresponding test file.", + compare_to="base", + prompt_runtime=PromptRuntime.CLAUDE, + ) + result = RuleEvaluationResult( + rule=rule, + should_fire=True, + trigger_files=["src/auth/login.py"], + missing_files=["tests/auth/login_test.py"], + ) + + prompt = format_claude_prompt(result) + + assert "Source/Test Pairing" in prompt + assert "src/auth/login.py" in prompt + assert "tests/auth/login_test.py" in prompt + assert "Expected files (not changed)" in prompt + assert "Update the corresponding test file" in prompt + + def test_formats_pair_mode_rule(self) -> None: + """Test formatting a pair mode rule.""" + rule = Rule( + name="API Documentation", + filename="api-documentation", + detection_mode=DetectionMode.PAIR, + pair_config=PairConfig( + trigger="api/{path}.py", + expects=["docs/api/{path}.md"], + ), + instructions="Update the API documentation.", + compare_to="base", + prompt_runtime=PromptRuntime.CLAUDE, + ) + result = RuleEvaluationResult( + rule=rule, + should_fire=True, + trigger_files=["api/users.py"], + missing_files=["docs/api/users.md"], + ) + + prompt = format_claude_prompt(result) + + assert "API Documentation" in prompt + assert "api/users.py" in prompt + assert "docs/api/users.md" in prompt + assert "Update the API documentation" in prompt + + def test_includes_response_format_instructions(self) -> None: + """Test that prompt includes response format instructions.""" + rule = Rule( + name="Test Rule", + filename="test-rule", + detection_mode=DetectionMode.TRIGGER_SAFETY, + triggers=["src/**/*"], + safety=[], + instructions="Check the code.", + compare_to="base", + prompt_runtime=PromptRuntime.CLAUDE, + ) + result = RuleEvaluationResult( + rule=rule, + should_fire=True, + trigger_files=["src/main.py"], + ) + + prompt = format_claude_prompt(result) + + assert "Response Format" in prompt + assert "---RULE_RESULT---" in prompt + assert "---END_RULE_RESULT---" in prompt + assert "block" in prompt + assert "allow" in prompt + + def test_includes_transcript_path_when_provided(self) -> None: + """Test that prompt includes transcript path when provided.""" + rule = Rule( + name="Test Rule", + filename="test-rule", + detection_mode=DetectionMode.TRIGGER_SAFETY, + triggers=["src/**/*"], + safety=[], + instructions="Check the code.", + compare_to="base", + prompt_runtime=PromptRuntime.CLAUDE, + ) + result = RuleEvaluationResult( + rule=rule, + should_fire=True, + trigger_files=["src/main.py"], + ) + + prompt = format_claude_prompt(result, transcript_path="/tmp/conversation.jsonl") + + assert "Conversation Context" in prompt + assert "/tmp/conversation.jsonl" in prompt + assert "transcript" in prompt.lower() + + def test_omits_transcript_section_when_not_provided(self) -> None: + """Test that prompt omits transcript section when path is None.""" + rule = Rule( + name="Test Rule", + filename="test-rule", + detection_mode=DetectionMode.TRIGGER_SAFETY, + triggers=["src/**/*"], + safety=[], + instructions="Check the code.", + compare_to="base", + prompt_runtime=PromptRuntime.CLAUDE, + ) + result = RuleEvaluationResult( + rule=rule, + should_fire=True, + trigger_files=["src/main.py"], + ) + + prompt = format_claude_prompt(result, transcript_path=None) + + assert "Conversation Context" not in prompt + # But instructions and other parts should still be present + assert "Check the code" in prompt + assert "---RULE_RESULT---" in prompt + + 
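+# For orientation: the round-trip exercised by the classes below. A rule's
+# evaluation result is formatted into a prompt, headless Claude appends a
+# structured tail to its output, and parse_claude_response() recovers a
+# (decision, reason) pair from that tail. Sketch only; `claude_output` is an
+# illustrative stand-in, not real headless output:
+#
+#     prompt = format_claude_prompt(result)
+#     decision, reason = parse_claude_response(claude_output)
+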
+class TestParseClaudeResponse:
+    """Tests for parse_claude_response function."""
+
+    def test_parses_allow_decision(self) -> None:
+        """Test parsing an allow decision."""
+        output = """
+I've reviewed the code and it looks good.
+
+---RULE_RESULT---
+decision: allow
+reason: Code follows security best practices
+---END_RULE_RESULT---
+"""
+        decision, reason = parse_claude_response(output)
+
+        assert decision == "allow"
+        assert reason == "Code follows security best practices"
+
+    def test_parses_block_decision(self) -> None:
+        """Test parsing a block decision."""
+        output = """
+There are security issues in the code.
+
+---RULE_RESULT---
+decision: block
+reason: Found hardcoded credentials on line 42
+---END_RULE_RESULT---
+"""
+        decision, reason = parse_claude_response(output)
+
+        assert decision == "block"
+        assert reason == "Found hardcoded credentials on line 42"
+
+    def test_parses_quoted_decision(self) -> None:
+        """Test parsing decision with quotes."""
+        output = """
+---RULE_RESULT---
+decision: "allow"
+reason: All tests pass
+---END_RULE_RESULT---
+"""
+        decision, reason = parse_claude_response(output)
+
+        assert decision == "allow"
+        assert reason == "All tests pass"
+
+    def test_parses_single_quoted_decision(self) -> None:
+        """Test parsing decision with single quotes."""
+        output = """
+---RULE_RESULT---
+decision: 'block'
+reason: Missing test coverage
+---END_RULE_RESULT---
+"""
+        decision, reason = parse_claude_response(output)
+
+        assert decision == "block"
+        assert reason == "Missing test coverage"
+
+    def test_defaults_to_block_when_no_result_block(self) -> None:
+        """Test defaults to block when no result block found."""
+        output = "I reviewed the code but forgot to include the result block."
+
+        decision, reason = parse_claude_response(output)
+
+        assert decision == "block"
+        assert "did not return a structured response" in reason
+
+    def test_defaults_to_block_for_empty_output(self) -> None:
+        """Test defaults to block for empty output."""
+        decision, reason = parse_claude_response("")
+
+        assert decision == "block"
+        assert "did not return a structured response" in reason
+
+    def test_handles_invalid_decision_value(self) -> None:
+        """Test handles invalid decision value by defaulting to block."""
+        output = """
+---RULE_RESULT---
+decision: maybe
+reason: Not sure about this
+---END_RULE_RESULT---
+"""
+        decision, reason = parse_claude_response(output)
+
+        # Invalid decision should default to block
+        assert decision == "block"
+
+    def test_case_insensitive_decision(self) -> None:
+        """Test that decision parsing is case-insensitive."""
+        output = """
+---RULE_RESULT---
+decision: ALLOW
+reason: Everything looks good
+---END_RULE_RESULT---
+"""
+        decision, reason = parse_claude_response(output)
+
+        assert decision == "allow"
+        assert reason == "Everything looks good"
+
+    def test_handles_multiline_reason(self) -> None:
+        """Test that the reason is captured up to the end marker."""
+        output = """
+---RULE_RESULT---
+decision: block
+reason: Multiple issues found including security vulnerabilities
+---END_RULE_RESULT---
+"""
+        decision, reason = parse_claude_response(output)
+
+        assert decision == "block"
+        assert "Multiple issues found" in reason
+
+    def test_parses_result_embedded_in_longer_output(self) -> None:
+        """Test parsing result block embedded in longer output."""
+        output = """
+I've completed the security review of the authentication code.
+
+Here are my findings:
+1. The password hashing uses bcrypt which is good
+2. Input validation is properly implemented
+3. No SQL injection vulnerabilities found
+
+Overall, the code follows security best practices.
+
+---RULE_RESULT---
+decision: allow
+reason: Code passes security review - no vulnerabilities found
+---END_RULE_RESULT---
+
+Let me know if you need any clarification.
+"""
+        decision, reason = parse_claude_response(output)
+
+        assert decision == "allow"
+        assert "passes security review" in reason
+
+
+class TestIsClaudeCodeRemote:
+    """Tests for is_claude_code_remote function."""
+
+    def test_returns_true_when_env_var_is_true(self) -> None:
+        """Test returns True when CLAUDE_CODE_REMOTE=true."""
+        with patch.dict(os.environ, {"CLAUDE_CODE_REMOTE": "true"}):
+            assert is_claude_code_remote() is True
+
+    def test_returns_true_when_env_var_is_TRUE(self) -> None:
+        """Test returns True when CLAUDE_CODE_REMOTE=TRUE (case insensitive)."""
+        with patch.dict(os.environ, {"CLAUDE_CODE_REMOTE": "TRUE"}):
+            assert is_claude_code_remote() is True
+
+    def test_returns_false_when_env_var_is_false(self) -> None:
+        """Test returns False when CLAUDE_CODE_REMOTE=false."""
+        with patch.dict(os.environ, {"CLAUDE_CODE_REMOTE": "false"}):
+            assert is_claude_code_remote() is False
+
+    def test_returns_false_when_env_var_not_set(self) -> None:
+        """Test returns False when CLAUDE_CODE_REMOTE is not set."""
+        env = os.environ.copy()
+        env.pop("CLAUDE_CODE_REMOTE", None)
+        with patch.dict(os.environ, env, clear=True):
+            assert is_claude_code_remote() is False
+
+    def test_returns_false_when_env_var_is_empty(self) -> None:
+        """Test returns False when CLAUDE_CODE_REMOTE is empty."""
+        with patch.dict(os.environ, {"CLAUDE_CODE_REMOTE": ""}):
+            assert is_claude_code_remote() is False
+
+
+class TestInvokeClaudeHeadlessFallback:
+    """Tests for invoke_claude_headless fallback behavior in remote environments."""
+
+    def test_returns_fallback_prompt_in_remote_environment(self) -> None:
+        """Test returns fallback prompt when in Claude Code Remote environment."""
+        with patch.dict(os.environ, {"CLAUDE_CODE_REMOTE": "true"}):
+            decision, reason, fallback = invoke_claude_headless(
+                "Test prompt content", "Test Rule"
+            )
+
+        assert decision == "block"
+        assert "manual evaluation" in reason
+        assert fallback is not None
+        assert "Cannot run `claude` command" in fallback
+        assert "Claude Code Web" in fallback
+        assert "Test prompt content" in fallback
+
+    def test_no_fallback_in_local_environment(self) -> None:
+        """Test no fallback when not in remote environment (but may fail if claude not installed)."""
+        with patch.dict(os.environ, {"CLAUDE_CODE_REMOTE": "false"}):
+            # Mock subprocess to simulate claude not found
+            with patch("deepwork.hooks.rules_check.subprocess.run") as mock_run:
+                mock_run.side_effect = FileNotFoundError()
+                decision, reason, fallback = invoke_claude_headless(
+                    "Test prompt", "Test Rule"
+                )
+
+        assert decision == "block"
+        assert "not found" in reason
+        assert fallback is None  # No fallback, actual error
+
+
+class TestInvokeClaudeHeadlessExecution:
+    """Tests for invoke_claude_headless subprocess execution."""
+
+    def test_successful_allow_decision(self) -> None:
+        """Test successful execution with allow decision."""
+        mock_output = """
+I've reviewed the code.
+
+---RULE_RESULT---
+decision: allow
+reason: Code looks good
+---END_RULE_RESULT---
+"""
+        with patch.dict(os.environ, {"CLAUDE_CODE_REMOTE": "false"}):
+            with patch("deepwork.hooks.rules_check.subprocess.run") as mock_run:
+                mock_run.return_value.returncode = 0
+                mock_run.return_value.stdout = mock_output
+                mock_run.return_value.stderr = ""
+
+                decision, reason, fallback = invoke_claude_headless(
+                    "Test prompt", "Test Rule"
+                )
+
+        assert decision == "allow"
+        assert reason == "Code looks good"
+        assert fallback is None
+
+    def test_successful_block_decision(self) -> None:
+        """Test successful execution with block decision."""
+        mock_output = """
+Found issues in the code.
+
+---RULE_RESULT---
+decision: block
+reason: Security vulnerability detected
+---END_RULE_RESULT---
+"""
+        with patch.dict(os.environ, {"CLAUDE_CODE_REMOTE": "false"}):
+            with patch("deepwork.hooks.rules_check.subprocess.run") as mock_run:
+                mock_run.return_value.returncode = 0
+                mock_run.return_value.stdout = mock_output
+                mock_run.return_value.stderr = ""
+
+                decision, reason, fallback = invoke_claude_headless(
+                    "Test prompt", "Test Rule"
+                )
+
+        assert decision == "block"
+        assert reason == "Security vulnerability detected"
+        assert fallback is None
+
+    def test_nonzero_exit_code(self) -> None:
+        """Test handling of non-zero exit code from Claude."""
+        with patch.dict(os.environ, {"CLAUDE_CODE_REMOTE": "false"}):
+            with patch("deepwork.hooks.rules_check.subprocess.run") as mock_run:
+                mock_run.return_value.returncode = 1
+                mock_run.return_value.stdout = ""
+                mock_run.return_value.stderr = "API rate limit exceeded"
+
+                decision, reason, fallback = invoke_claude_headless(
+                    "Test prompt", "Test Rule"
+                )
+
+        assert decision == "block"
+        assert "execution failed" in reason
+        assert "rate limit" in reason
+        assert fallback is None
+
+    def test_timeout_handling(self) -> None:
+        """Test handling of subprocess timeout."""
+        import subprocess
+
+        with patch.dict(os.environ, {"CLAUDE_CODE_REMOTE": "false"}):
+            with patch("deepwork.hooks.rules_check.subprocess.run") as mock_run:
+                mock_run.side_effect = subprocess.TimeoutExpired(
+                    cmd=["claude"], timeout=300
+                )
+
+                decision, reason, fallback = invoke_claude_headless(
+                    "Test prompt", "Test Rule"
+                )
+
+        assert decision == "block"
+        assert "timed out" in reason
+        assert "Test Rule" in reason
+        assert fallback is None
+
+    def test_generic_exception_handling(self) -> None:
+        """Test handling of generic exceptions."""
+        with patch.dict(os.environ, {"CLAUDE_CODE_REMOTE": "false"}):
+            with patch("deepwork.hooks.rules_check.subprocess.run") as mock_run:
+                mock_run.side_effect = OSError("Permission denied")
+
+                decision, reason, fallback = invoke_claude_headless(
+                    "Test prompt", "Test Rule"
+                )
+
+        assert decision == "block"
+        assert "Error invoking Claude" in reason
+        assert "Permission denied" in reason
+        assert fallback is None
+
+    def test_calls_claude_with_correct_arguments(self) -> None:
+        """Test that Claude is called with the correct command-line arguments."""
+        mock_output = """
+---RULE_RESULT---
+decision: allow
+reason: OK
+---END_RULE_RESULT---
+"""
+        with patch.dict(os.environ, {"CLAUDE_CODE_REMOTE": "false"}):
+            with patch("deepwork.hooks.rules_check.subprocess.run") as mock_run:
+                mock_run.return_value.returncode = 0
+                mock_run.return_value.stdout = mock_output
+                mock_run.return_value.stderr = ""
+
+                invoke_claude_headless("My test prompt", "Test Rule")
+
+                mock_run.assert_called_once()
+                call_args = mock_run.call_args
+                cmd = call_args[0][0]
+
+                assert cmd[0] == "claude"
+                assert "--print" in cmd
+                assert "--dangerously-skip-permissions" in cmd
+                assert "-p" in cmd
+                assert "My test prompt" in cmd
diff --git a/tests/unit/test_rules_parser.py b/tests/unit/test_rules_parser.py
index ee8a2375..5e31c462 100644
--- a/tests/unit/test_rules_parser.py
+++ b/tests/unit/test_rules_parser.py
@@ -2,11 +2,16 @@
 
 from pathlib import Path
 
+import pytest
+
 from deepwork.core.pattern_matcher import matches_any_pattern as matches_pattern
 from deepwork.core.rules_parser import (
+    ActionType,
     DetectionMode,
     PairConfig,
+    PromptRuntime,
     Rule,
+    RulesParseError,
     evaluate_rule,
     evaluate_rules,
     load_rules_from_directory,
@@ -993,3 +998,243 @@ def test_loads_created_rule_with_command_action(self, temp_dir: Path) -> None:
         assert rules[0].action_type == ActionType.COMMAND
         assert rules[0].command_action is not None
         assert rules[0].command_action.command == "ruff check {file}"
+
+
+class TestPromptRuntime:
+    """Tests for prompt_runtime field parsing and behavior."""
+
+    def test_default_prompt_runtime_is_send_to_stopping_agent(self) -> None:
+        """Test that default prompt_runtime is send_to_stopping_agent."""
+        rule = Rule(
+            name="Test Rule",
+            filename="test-rule",
+            detection_mode=DetectionMode.TRIGGER_SAFETY,
+            triggers=["src/**/*"],
+            safety=[],
+            instructions="Check it",
+            compare_to="base",
+        )
+        assert rule.prompt_runtime == PromptRuntime.SEND_TO_STOPPING_AGENT
+
+    def test_explicit_send_to_stopping_agent_runtime(self) -> None:
+        """Test explicit send_to_stopping_agent runtime."""
+        rule = Rule(
+            name="Test Rule",
+            filename="test-rule",
+            detection_mode=DetectionMode.TRIGGER_SAFETY,
+            triggers=["src/**/*"],
+            safety=[],
+            instructions="Check it",
+            compare_to="base",
+            prompt_runtime=PromptRuntime.SEND_TO_STOPPING_AGENT,
+        )
+        assert rule.prompt_runtime == PromptRuntime.SEND_TO_STOPPING_AGENT
+
+    def test_claude_runtime(self) -> None:
+        """Test claude runtime."""
+        rule = Rule(
+            name="Test Rule",
+            filename="test-rule",
+            detection_mode=DetectionMode.TRIGGER_SAFETY,
+            triggers=["src/**/*"],
+            safety=[],
+            instructions="Check it",
+            compare_to="base",
+            prompt_runtime=PromptRuntime.CLAUDE,
+        )
+        assert rule.prompt_runtime == PromptRuntime.CLAUDE
+
+
+class TestLoadPromptRuntimeFromFile:
+    """Tests for loading rules with prompt_runtime from files."""
+
+    def test_loads_rule_without_prompt_runtime_defaults(self, temp_dir: Path) -> None:
+        """Test loading a rule without prompt_runtime defaults to send_to_stopping_agent."""
+        rules_dir = temp_dir / "rules"
+        rules_dir.mkdir()
+
+        rule_file = rules_dir / "test-rule.md"
+        rule_file.write_text(
+            """---
+name: Test Rule
+trigger: "src/**/*"
+compare_to: base
+---
+Please check the source files.
+"""
+        )
+
+        rules = load_rules_from_directory(rules_dir)
+
+        assert len(rules) == 1
+        assert rules[0].prompt_runtime == PromptRuntime.SEND_TO_STOPPING_AGENT
+
+    def test_loads_rule_with_send_to_stopping_agent_runtime(self, temp_dir: Path) -> None:
+        """Test loading a rule with explicit send_to_stopping_agent runtime."""
+        rules_dir = temp_dir / "rules"
+        rules_dir.mkdir()
+
+        rule_file = rules_dir / "test-rule.md"
+        rule_file.write_text(
+            """---
+name: Test Rule
+trigger: "src/**/*"
+compare_to: base
+prompt_runtime: send_to_stopping_agent
+---
+Please check the source files.
+""" + ) + + rules = load_rules_from_directory(rules_dir) + + assert len(rules) == 1 + assert rules[0].prompt_runtime == PromptRuntime.SEND_TO_STOPPING_AGENT + + def test_loads_rule_with_claude_runtime(self, temp_dir: Path) -> None: + """Test loading a rule with claude runtime.""" + rules_dir = temp_dir / "rules" + rules_dir.mkdir() + + rule_file = rules_dir / "test-rule.md" + rule_file.write_text( + """--- +name: Security Review +trigger: "src/auth/**/*" +compare_to: prompt +prompt_runtime: claude +--- +Review the security-sensitive code for vulnerabilities. +""" + ) + + rules = load_rules_from_directory(rules_dir) + + assert len(rules) == 1 + assert rules[0].name == "Security Review" + assert rules[0].prompt_runtime == PromptRuntime.CLAUDE + assert rules[0].action_type == ActionType.PROMPT + + def test_loads_command_action_rule_with_prompt_runtime(self, temp_dir: Path) -> None: + """Test loading a command action rule with prompt_runtime (ignored for command actions).""" + rules_dir = temp_dir / "rules" + rules_dir.mkdir() + + rule_file = rules_dir / "format-python.md" + rule_file.write_text( + """--- +name: Format Python +trigger: "**/*.py" +action: + command: "ruff format {file}" + run_for: each_match +compare_to: prompt +prompt_runtime: send_to_stopping_agent +--- +""" + ) + + rules = load_rules_from_directory(rules_dir) + + assert len(rules) == 1 + assert rules[0].action_type == ActionType.COMMAND + # prompt_runtime is still parsed even for command actions + assert rules[0].prompt_runtime == PromptRuntime.SEND_TO_STOPPING_AGENT + + def test_invalid_prompt_runtime_raises_error(self, temp_dir: Path) -> None: + """Test that invalid prompt_runtime value raises an error.""" + rules_dir = temp_dir / "rules" + rules_dir.mkdir() + + rule_file = rules_dir / "test-rule.md" + rule_file.write_text( + """--- +name: Test Rule +trigger: "src/**/*" +compare_to: base +prompt_runtime: invalid_value +--- +Please check the source files. +""" + ) + + with pytest.raises(RulesParseError) as exc_info: + load_rules_from_directory(rules_dir) + + # Schema validation catches invalid enum values + error_message = str(exc_info.value) + assert "invalid_value" in error_message + assert "send_to_stopping_agent" in error_message or "prompt_runtime" in error_message + + def test_loads_set_mode_rule_with_claude_runtime(self, temp_dir: Path) -> None: + """Test loading a set mode rule with claude runtime.""" + rules_dir = temp_dir / "rules" + rules_dir.mkdir() + + rule_file = rules_dir / "source-test-pairing.md" + rule_file.write_text( + """--- +name: Source/Test Pairing +set: + - src/{path}.py + - tests/{path}_test.py +compare_to: base +prompt_runtime: claude +--- +Source and test files should change together. +""" + ) + + rules = load_rules_from_directory(rules_dir) + + assert len(rules) == 1 + assert rules[0].detection_mode == DetectionMode.SET + assert rules[0].prompt_runtime == PromptRuntime.CLAUDE + + def test_loads_pair_mode_rule_with_claude_runtime(self, temp_dir: Path) -> None: + """Test loading a pair mode rule with claude runtime.""" + rules_dir = temp_dir / "rules" + rules_dir.mkdir() + + rule_file = rules_dir / "api-docs.md" + rule_file.write_text( + """--- +name: API Documentation +pair: + trigger: src/api/{name}.py + expects: docs/api/{name}.md +compare_to: base +prompt_runtime: claude +--- +API code requires documentation. 
+""" + ) + + rules = load_rules_from_directory(rules_dir) + + assert len(rules) == 1 + assert rules[0].detection_mode == DetectionMode.PAIR + assert rules[0].prompt_runtime == PromptRuntime.CLAUDE + + def test_loads_created_mode_rule_with_claude_runtime(self, temp_dir: Path) -> None: + """Test loading a created mode rule with claude runtime.""" + rules_dir = temp_dir / "rules" + rules_dir.mkdir() + + rule_file = rules_dir / "new-module-review.md" + rule_file.write_text( + """--- +name: New Module Review +created: src/**/*.py +compare_to: prompt +prompt_runtime: claude +--- +Review the new module for best practices. +""" + ) + + rules = load_rules_from_directory(rules_dir) + + assert len(rules) == 1 + assert rules[0].detection_mode == DetectionMode.CREATED + assert rules[0].prompt_runtime == PromptRuntime.CLAUDE diff --git a/uv.lock b/uv.lock index 474e30f9..ec35b2f3 100644 --- a/uv.lock +++ b/uv.lock @@ -126,7 +126,7 @@ toml = [ [[package]] name = "deepwork" -version = "0.5.2" +version = "0.4.0" source = { editable = "." } dependencies = [ { name = "click" }, @@ -147,6 +147,12 @@ dev = [ { name = "types-pyyaml" }, ] +[package.dev-dependencies] +dev = [ + { name = "gitpython" }, + { name = "pytest" }, +] + [package.metadata] requires-dist = [ { name = "click", specifier = ">=8.1.0" }, @@ -164,6 +170,12 @@ requires-dist = [ ] provides-extras = ["dev"] +[package.metadata.requires-dev] +dev = [ + { name = "gitpython", specifier = ">=3.1.46" }, + { name = "pytest", specifier = ">=9.0.2" }, +] + [[package]] name = "gitdb" version = "4.0.12"