diff --git a/src/agents_shipgate/cli/agent_result.py b/src/agents_shipgate/cli/agent_result.py index 0237164a..d109f5eb 100644 --- a/src/agents_shipgate/cli/agent_result.py +++ b/src/agents_shipgate/cli/agent_result.py @@ -11,9 +11,28 @@ parse_unified_diff, ) from agents_shipgate.schemas.codex_boundary_result import CodexBoundaryResultV1 -from agents_shipgate.triggers import _git_diff_context +from agents_shipgate.triggers import _git_diff_context, load_triggers from agents_shipgate.triggers import evaluate as evaluate_trigger +# Trigger rule IDs that mark a changed file as a *tool/capability source* — +# the surfaces ``verify`` compiles into the capability delta. Deliberately +# EXCLUDES the other ``run_shipgate`` rules that are not declarable tool +# sources: ``TRIGGER-SHIPGATE-MANIFEST`` (the manifest itself), +# ``TRIGGER-PROMPTS-OR-POLICIES`` (prompts/policies), ``TRIGGER-SHIPGATE-CI- +# WORKFLOW`` (the gate), and ``TRIGGER-CODEX-BOUNDARY-CONFIG-CHANGED`` (which +# the boundary check itself evaluates). A change to one of those must not be +# mislabeled an "undeclared tool surface". +_TOOL_SOURCE_TRIGGER_IDS = frozenset( + { + "TRIGGER-MCP-EXPORT-CHANGED", + "TRIGGER-OPENAPI-SPEC-CHANGED", + "TRIGGER-STATIC-TOOL-INVENTORY-CHANGED", + "TRIGGER-CODEX-PLUGIN-CHANGED", + "TRIGGER-N8N-WORKFLOW-CHANGED", + "TRIGGER-FUNCTION-TOOL-DECORATOR", + } +) + def build_codex_agent_result( *, @@ -24,7 +43,8 @@ def build_codex_agent_result( policy: Path | None, ) -> CodexBoundaryResultV1: workspace = workspace.resolve() - changed_files = sorted({item.path for item in parse_unified_diff(diff_text) if item.path}) + diff_files = parse_unified_diff(diff_text) + changed_files = sorted({item.path for item in diff_files if item.path}) config_path = config if config.is_absolute() else workspace / config trigger = evaluate_trigger( paths=changed_files, @@ -32,20 +52,80 @@ def build_codex_agent_result( manifest_present=config_path.is_file(), user_requested=True, ) + declared = _declared_tool_surfaces_changed( + workspace=workspace, + config_path=config_path, + changed_files=changed_files, + ) return evaluate_codex_boundary_result( workspace=workspace, diff_text=diff_text, agent=agent, policy_path=policy, trigger=trigger, - capability_surfaces_changed=_declared_tool_surfaces_changed( - workspace=workspace, - config_path=config_path, + capability_surfaces_changed=declared, + undeclared_capability_surfaces=_undeclared_tool_surfaces_changed( + diff_files=diff_files, changed_files=changed_files, + declared=declared, ), ) +def _undeclared_tool_surfaces_changed( + *, + diff_files: list[Any], + changed_files: list[str], + declared: list[str], +) -> list[str]: + """Changed files that are tool/capability surfaces the manifest does NOT declare. + + ``verify`` only gates *declared* tool sources, so an undeclared surface (a + new MCP/OpenAPI/tool-inventory/codex-plugin file, or an SDK/n8n tool the + manifest does not list) escapes the gate even though ``check`` returns a + clean ``allow``. Each changed file is classified per-file against only the + *tool-source* trigger rules (so the manifest, prompts/policies, the CI gate, + and ``.codex`` boundary config are never mislabeled), excluding boundary + paths (the boundary evaluator already inspects those) and files the manifest + already declares (``verify`` gates those). Computed independently of the + declared set so a mixed diff — one declared surface plus one undeclared — + still surfaces the undeclared one. + """ + if not changed_files: + return [] + declared_set = set(declared) + added_by_path = { + item.path: "\n".join(getattr(item, "added_lines", []) or []) + for item in diff_files + if item.path + } + catalog = load_triggers() + undeclared: list[str] = [] + for path in changed_files: + if path in declared_set or is_boundary_path(path): + continue + # Per-file classification: a glob rule matches on the path, a + # diff_contains rule (n8n, @function_tool) on this file's added lines. + result = evaluate_trigger( + paths=[path], + diff_text=added_by_path.get(path, ""), + manifest_present=False, + user_requested=False, + triggers=catalog, + ) + # Require the evaluator's WINNING verdict, not just a matched rule: a + # docs/test file that incidentally mentions ``@tool`` also matches + # ``TRIGGER-DOCS-ONLY-NEGATIVE``, which beats ``run_shipgate`` so the + # catalog skips it. Only treat the file as a tool surface when the + # trigger actually runs AND a tool-source rule is what carried it. + if result.get("run_shipgate") and any( + rule.get("id") in _TOOL_SOURCE_TRIGGER_IDS + for rule in result.get("matched_rules", []) + ): + undeclared.append(path) + return sorted(dict.fromkeys(undeclared)) + + def _declared_tool_surfaces_changed( *, workspace: Path, diff --git a/src/agents_shipgate/core/codex_boundary.py b/src/agents_shipgate/core/codex_boundary.py index 7ef3de9b..62fec0ea 100644 --- a/src/agents_shipgate/core/codex_boundary.py +++ b/src/agents_shipgate/core/codex_boundary.py @@ -376,16 +376,24 @@ def evaluate_codex_boundary_result( trigger: dict[str, Any] | None = None, release_decision: dict[str, Any] | None = None, capability_surfaces_changed: list[str] | None = None, + undeclared_capability_surfaces: list[str] | None = None, ) -> AgentResultV1: """Return the local Codex boundary-result projection for a unified diff. - ``capability_surfaces_changed`` lists changed files that the manifest - declares as tool sources. The boundary evaluator does not inspect tool - surfaces (only ``verify`` computes the capability delta), so when one - changed and the boundary result is otherwise a clean ``allow``, the - result is escalated to ``warn`` and routed to ``verify`` rather than - green-lighting a capability change ``check`` never evaluated. This keeps - ``check`` from disagreeing with the ``release_decision.decision`` gate. + The boundary evaluator does not inspect tool surfaces (only ``verify`` + computes the capability delta), so a clean ``allow`` over a changed tool + surface is escalated to ``warn`` rather than green-lighting a capability + change ``check`` never evaluated. This keeps ``check`` from disagreeing + with the ``release_decision.decision`` gate. Two inputs drive it (both + pre-computed by ``build_codex_agent_result``): + + - ``capability_surfaces_changed`` — changed files the manifest DECLARES as + tool sources. ``verify`` will gate these, so route to ``verify``. + - ``undeclared_capability_surfaces`` — changed files that ARE tool surfaces + but the manifest does not declare (or there is no manifest). ``verify`` + cannot gate an undeclared surface, so route to declare-then-verify + (``detect``). Takes precedence when a diff changes both, since ``verify`` + alone would miss the undeclared one. """ # Keep this local diff projector aligned with @@ -458,13 +466,21 @@ def add(rule_id: str, *, path: str | None, evidence: dict[str, Any]) -> None: decision = _decision_for(violations, release_decision=release_decision) # Coverage gap: check is boundary-only, so a clean ``allow`` over a diff - # that touches a manifest-declared tool surface would silently green-light - # a capability change that only ``verify`` gates. Escalate to ``warn`` and - # route to verify instead. Gated on ``release_decision is None`` because a - # provided release decision means the full capability scan already ran. + # that touches a tool/capability surface would silently green-light a + # capability change that only ``verify`` gates. Escalate to ``warn`` and + # route onward. Gated on ``release_decision is None`` because a provided + # release decision means the full capability scan already ran. The two + # surface lists are classified per-file upstream (declared vs undeclared); + # see the docstring. ``undeclared_gap`` is computed independently of the + # declared set and takes precedence so a mixed diff routes to + # declare-then-verify rather than a ``verify`` that misses the undeclared + # surface. + boundary_clean_allow = decision == "allow" and release_decision is None coverage_surfaces = sorted(dict.fromkeys(capability_surfaces_changed or [])) - coverage_gap = decision == "allow" and release_decision is None and bool(coverage_surfaces) - if coverage_gap: + undeclared_surfaces = sorted(dict.fromkeys(undeclared_capability_surfaces or [])) + coverage_gap = boundary_clean_allow and bool(coverage_surfaces) + undeclared_gap = boundary_clean_allow and bool(undeclared_surfaces) + if coverage_gap or undeclared_gap: decision = "warn" risk_level = _risk_for(violations) @@ -478,7 +494,13 @@ def add(rule_id: str, *, path: str | None, evidence: dict[str, Any]) -> None: finding_fingerprints=finding_fingerprints, evaluated_files=evaluated_files, ) - if coverage_gap: + if undeclared_gap: + first_next_action = _undeclared_next_action() + summary = _undeclared_summary(undeclared_surfaces) + diagnostics = [*diagnostics, _undeclared_diagnostic(undeclared_surfaces)] + trace = [*_trace_for(policy, decision, violations), _undeclared_trace(undeclared_surfaces)] + suggested_fixes = [_DETECT_COMMAND, _VERIFY_COMMAND] + elif coverage_gap: first_next_action = _coverage_next_action() summary = _coverage_summary(coverage_surfaces) diagnostics = [*diagnostics, _coverage_diagnostic(coverage_surfaces)] @@ -1338,6 +1360,51 @@ def _risk_for(violations: list[AgentResultViolatedRule]) -> AgentResultRiskLevel # auto-detects the base (v0.13) and emits the boundary-result surface, so it # works for both the local working tree and committed refs. _VERIFY_COMMAND = "agents-shipgate verify --json" +_DETECT_COMMAND = "agents-shipgate detect --json" + + +def _undeclared_next_action() -> AgentResultNextAction: + return AgentResultNextAction( + actor="coding_agent", + kind="warn", + command=_DETECT_COMMAND, + why=( + "This diff changes a tool/capability surface that shipgate.yaml does not " + "declare, so neither check nor verify gates it yet. Declare the surface " + "(run detect or add it to tool_sources), then run verify before completing." + ), + ) + + +def _undeclared_summary(surfaces: list[str]) -> str: + return ( + "No Codex boundary rule fired, but the diff changes a tool/capability surface " + f"({', '.join(surfaces[:5])}) that shipgate.yaml does not declare, so verify " + "cannot gate it yet. Declare it (detect or tool_sources) and run verify before " + "reporting completion." + ) + + +def _undeclared_diagnostic(surfaces: list[str]) -> AgentResultDiagnostic: + return AgentResultDiagnostic( + level="warning", + code="undeclared_capability_surface", + message=( + "Undeclared tool/capability surface(s) " + f"{', '.join(surfaces[:5])} changed; check is boundary-only and verify " + "only gates declared surfaces. Declare the surface, then verify." + ), + ) + + +def _undeclared_trace(surfaces: list[str]) -> AgentResultTraceEvent: + return AgentResultTraceEvent( + step="coverage", + summary=( + f"boundary_only: {len(surfaces)} undeclared tool surface(s) changed; " + "routed to detect + verify." + ), + ) def _coverage_next_action() -> AgentResultNextAction: diff --git a/tests/test_codex_boundary_check.py b/tests/test_codex_boundary_check.py index 287e6228..f6e0786a 100644 --- a/tests/test_codex_boundary_check.py +++ b/tests/test_codex_boundary_check.py @@ -246,6 +246,275 @@ def test_check_does_not_warn_on_docs_change_in_opted_in_repo(tmp_path: Path) -> assert result.decision == "allow" +# --- Undeclared coverage gap: a changed file IS a tool surface but the -------- +# manifest does not declare it (or there is no manifest). verify only gates +# declared surfaces, so route to declare-then-verify (detect) rather than a +# clean allow or a verify that never scans it. + +# A second changed file that is an *undeclared* tool surface (an OpenAPI spec), +# used to exercise mixed declared+undeclared diffs (review finding P1). +_MIXED_TOOL_SOURCE_DIFF = _TOOL_SOURCE_DIFF + ( + "diff --git a/api/openapi.yaml b/api/openapi.yaml\n" + "--- a/api/openapi.yaml\n" + "+++ b/api/openapi.yaml\n" + "@@ -1 +1,2 @@\n" + " openapi: 3.0.0\n" + "+paths: {}\n" +) + + +def test_undeclared_surface_warns_and_routes_to_detect(tmp_path: Path) -> None: + result = evaluate_codex_boundary_result( + workspace=tmp_path, + diff_text=_TOOL_SOURCE_DIFF, + agent="claude-code", + undeclared_capability_surfaces=["mcp-tools.json"], + ) + payload = result.model_dump(mode="json", exclude_none=True) + _validate(payload) + # Was a bare allow before the fix; now a warn that routes to detect/declare. + assert payload["decision"] == "warn" + assert payload["completion_allowed"] is True + assert payload["must_stop"] is False + assert payload["first_next_action"]["kind"] == "warn" + assert payload["first_next_action"]["command"].startswith("agents-shipgate detect") + assert any(d["code"] == "undeclared_capability_surface" for d in payload["diagnostics"]) + assert any(t["step"] == "coverage" for t in payload["trace"]) + assert payload["suggested_fixes"][0].startswith("agents-shipgate detect") + assert any(fix.startswith("agents-shipgate verify") for fix in payload["suggested_fixes"]) + + +def test_mixed_declared_and_undeclared_routes_to_detect(tmp_path: Path) -> None: + # Review finding P1: a diff that changes BOTH a declared surface (verify + # gates it) and an undeclared one (verify does not) must route to detect — + # declare-then-verify — not a verify that silently misses the undeclared + # surface. Undeclared takes precedence over the declared coverage gap. + result = evaluate_codex_boundary_result( + workspace=tmp_path, + diff_text=_MIXED_TOOL_SOURCE_DIFF, + agent="claude-code", + capability_surfaces_changed=["mcp-tools.json"], + undeclared_capability_surfaces=["api/openapi.yaml"], + ) + assert result.decision == "warn" + assert result.first_next_action.command.startswith("agents-shipgate detect") + payload = result.model_dump(mode="json", exclude_none=True) + diag = next(d for d in payload["diagnostics"] if d["code"] == "undeclared_capability_surface") + assert "api/openapi.yaml" in diag["message"] + + +def test_no_manifest_capability_add_via_check_warns_and_routes_to_detect( + tmp_path: Path, +) -> None: + # End-to-end: empty workspace (no shipgate.yaml). build_codex_agent_result + # classifies mcp-tools.json as an undeclared tool surface. + result = build_codex_agent_result( + agent="claude-code", + workspace=tmp_path, + diff_text=_TOOL_SOURCE_DIFF, + config=Path("shipgate.yaml"), + policy=None, + ) + assert result.decision == "warn" + assert result.first_next_action.command.startswith("agents-shipgate detect") + + +def test_capability_add_to_undeclared_surface_warns_when_manifest_declares_other( + tmp_path: Path, +) -> None: + # Manifest exists but declares a *different* tool source than the changed + # file. The declared-coverage path does not match, so the undeclared path + # must catch it (and route to detect, not a verify that never scans it). + _write_manifest( + tmp_path, + " - id: other\n type: mcp\n path: other-tools.json\n trust: internal\n", + ) + result = build_codex_agent_result( + agent="claude-code", + workspace=tmp_path, + diff_text=_TOOL_SOURCE_DIFF, + config=Path("shipgate.yaml"), + policy=None, + ) + assert result.decision == "warn" + assert result.first_next_action.command.startswith("agents-shipgate detect") + payload = result.model_dump(mode="json", exclude_none=True) + assert any(d["code"] == "undeclared_capability_surface" for d in payload["diagnostics"]) + + +def test_mixed_declared_and_undeclared_via_check_routes_to_detect( + tmp_path: Path, +) -> None: + # Review finding P1, end-to-end through build_codex_agent_result: manifest + # declares mcp-tools.json; the diff also adds an undeclared OpenAPI spec. + _write_manifest( + tmp_path, + " - id: mcp\n type: mcp\n path: mcp-tools.json\n trust: internal\n", + ) + result = build_codex_agent_result( + agent="claude-code", + workspace=tmp_path, + diff_text=_MIXED_TOOL_SOURCE_DIFF, + config=Path("shipgate.yaml"), + policy=None, + ) + assert result.decision == "warn" + assert result.first_next_action.command.startswith("agents-shipgate detect") + payload = result.model_dump(mode="json", exclude_none=True) + diag = next(d for d in payload["diagnostics"] if d["code"] == "undeclared_capability_surface") + assert "api/openapi.yaml" in diag["message"] + + +def test_manifest_edit_is_not_an_undeclared_tool_surface(tmp_path: Path) -> None: + # Review finding P2: editing shipgate.yaml in an opted-in repo fires a + # run_shipgate trigger rule (TRIGGER-SHIPGATE-MANIFEST) but is NOT a + # declarable tool source, so it must not be reported as an undeclared + # surface routed to detect. + _write_manifest( + tmp_path, + " - id: mcp\n type: mcp\n path: mcp-tools.json\n trust: internal\n", + ) + diff = ( + "diff --git a/shipgate.yaml b/shipgate.yaml\n" + "--- a/shipgate.yaml\n" + "+++ b/shipgate.yaml\n" + "@@ -1 +1,2 @@\n" + ' version: "0.1"\n' + "+# touch\n" + ) + result = build_codex_agent_result( + agent="claude-code", + workspace=tmp_path, + diff_text=diff, + config=Path("shipgate.yaml"), + policy=None, + ) + assert result.decision == "allow" + payload = result.model_dump(mode="json", exclude_none=True) + assert not any( + d["code"] == "undeclared_capability_surface" for d in payload.get("diagnostics", []) + ) + + +def test_prompts_edit_is_not_an_undeclared_tool_surface(tmp_path: Path) -> None: + # Review finding P2: a prompts/ edit fires TRIGGER-PROMPTS-OR-POLICIES but + # is not a declarable tool source — no undeclared-surface warn / detect. + _write_manifest( + tmp_path, + " - id: mcp\n type: mcp\n path: mcp-tools.json\n trust: internal\n", + ) + diff = ( + "diff --git a/prompts/system.md b/prompts/system.md\n" + "--- a/prompts/system.md\n" + "+++ b/prompts/system.md\n" + "@@ -1 +1,2 @@\n" + " You are helpful.\n" + "+Be concise.\n" + ) + result = build_codex_agent_result( + agent="claude-code", + workspace=tmp_path, + diff_text=diff, + config=Path("shipgate.yaml"), + policy=None, + ) + assert result.decision == "allow" + + +def test_docs_file_mentioning_tool_decorator_is_not_undeclared_surface( + tmp_path: Path, +) -> None: + # Review finding: a docs file that incidentally mentions @tool matches the + # FUNCTION-TOOL-DECORATOR rule but ALSO TRIGGER-DOCS-ONLY-NEGATIVE, which + # wins — the catalog skips it, so it must not be flagged as an undeclared + # tool surface routed to detect. + _write_manifest( + tmp_path, + " - id: mcp\n type: mcp\n path: mcp-tools.json\n trust: internal\n", + ) + diff = ( + "diff --git a/README.md b/README.md\n" + "--- a/README.md\n" + "+++ b/README.md\n" + "@@ -1 +1,2 @@\n" + " # Docs\n" + "+Use the @tool / @function_tool decorator to register a tool.\n" + ) + result = build_codex_agent_result( + agent="claude-code", + workspace=tmp_path, + diff_text=diff, + config=Path("shipgate.yaml"), + policy=None, + ) + assert result.decision == "allow" + payload = result.model_dump(mode="json", exclude_none=True) + assert not any( + d["code"] == "undeclared_capability_surface" for d in payload.get("diagnostics", []) + ) + + +def test_test_file_mentioning_tool_decorator_is_not_undeclared_surface( + tmp_path: Path, +) -> None: + # Same property for a tests/ file: docs-only-negative covers tests/**. + _write_manifest( + tmp_path, + " - id: mcp\n type: mcp\n path: mcp-tools.json\n trust: internal\n", + ) + diff = ( + "diff --git a/tests/test_docs.py b/tests/test_docs.py\n" + "--- a/tests/test_docs.py\n" + "+++ b/tests/test_docs.py\n" + "@@ -1 +1,2 @@\n" + " def test_x():\n" + "+ # documents the @function_tool example\n" + ) + result = build_codex_agent_result( + agent="claude-code", + workspace=tmp_path, + diff_text=diff, + config=Path("shipgate.yaml"), + policy=None, + ) + assert result.decision == "allow" + + +def test_undeclared_gap_never_downgrades_a_block(tmp_path: Path) -> None: + # A real boundary block plus an undeclared surface must stay blocked. + block_diff = (CORPUS / "mcp_auto_approve_write.diff").read_text(encoding="utf-8") + result = evaluate_codex_boundary_result( + workspace=tmp_path, + diff_text=block_diff, + agent="claude-code", + undeclared_capability_surfaces=["mcp-tools.json"], + ) + assert result.decision == "block" + + +def test_undeclared_gap_inactive_without_signal_or_when_release_decision_present( + tmp_path: Path, +) -> None: + # No undeclared surfaces supplied (bare call) preserves the clean allow. + no_signal = evaluate_codex_boundary_result( + workspace=tmp_path, + diff_text=_TOOL_SOURCE_DIFF, + agent="claude-code", + ) + assert no_signal.decision == "allow" + + # A supplied release_decision means the full scan already ran; the + # projection governs and the boundary-only heuristic stays out of it. + scanned = evaluate_codex_boundary_result( + workspace=tmp_path, + diff_text=_TOOL_SOURCE_DIFF, + agent="claude-code", + undeclared_capability_surfaces=["mcp-tools.json"], + release_decision={"decision": "passed", "reason": "clean"}, + ) + assert scanned.decision == "allow" + + def test_codex_check_reads_diff_from_stdin(tmp_path: Path) -> None: diff_text = (CORPUS / "docs_only.diff").read_text(encoding="utf-8") result = runner.invoke(