From 88dca3114688e5ab82cf6ef2c1c544cf39e83f5c Mon Sep 17 00:00:00 2001 From: Pengfei Hu Date: Mon, 22 Jun 2026 11:09:10 -0700 Subject: [PATCH 1/3] Defer undeclared tool-surface changes from check to verify MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit `shipgate check` (and the MCP `shipgate.check` tool) returned `allow` on a capability-adding diff that touches a tool/capability surface the manifest does not declare — or when there is no manifest — even though the trigger already flags the diff in scope. That is a false clear: only `verify` compiles the capability delta, so an agent acting on a boundary `allow` could ship a change `verify` would block. main already defers manifest-declared tool sources (`capability_surfaces_changed` → `coverage_gap`). This extends that coverage to the undeclared / no-manifest case: - Fire only when a content-driven `run_shipgate` trigger rule matched (e.g. TRIGGER-MCP-EXPORT-CHANGED), never the blanket `force_run` rule — so a docs-only edit in an opted-in repo stays `allow` (no noise). - Require a changed file the boundary evaluator does not itself inspect (`not is_boundary_path`), so a clean `allow` over `.codex/config.toml`, `AGENTS.md`, or the Shipgate workflow stays authoritative. - Escalate `allow` → `warn` and route to `agents-shipgate detect` (declare-then-verify), because `verify` only gates *declared* surfaces; routing straight to verify would falsely reassure on a surface it never scans. Never downgrades a real boundary `block` / `require_review`. The MCP `shipgate.check` tool inherits the fix automatically via the shared `build_codex_agent_result` path. No schema or check-ID changes. Co-Authored-By: Claude Opus 4.8 --- src/agents_shipgate/core/codex_boundary.py | 105 ++++++++++++++++- tests/test_codex_boundary_check.py | 131 +++++++++++++++++++++ 2 files changed, 230 insertions(+), 6 deletions(-) diff --git a/src/agents_shipgate/core/codex_boundary.py b/src/agents_shipgate/core/codex_boundary.py index 7ef3de9b..aab63100 100644 --- a/src/agents_shipgate/core/codex_boundary.py +++ b/src/agents_shipgate/core/codex_boundary.py @@ -458,13 +458,36 @@ def add(rule_id: str, *, path: str | None, evidence: dict[str, Any]) -> None: decision = _decision_for(violations, release_decision=release_decision) # Coverage gap: check is boundary-only, so a clean ``allow`` over a diff - # that touches a manifest-declared tool surface would silently green-light - # a capability change that only ``verify`` gates. Escalate to ``warn`` and - # route to verify instead. Gated on ``release_decision is None`` because a - # provided release decision means the full capability scan already ran. + # that touches a tool/capability surface would silently green-light a + # capability change that only ``verify`` gates. Escalate to ``warn`` and + # route onward. Gated on ``release_decision is None`` because a provided + # release decision means the full capability scan already ran. + # + # Two sub-cases: + # * ``coverage_gap`` — the changed surface is one the manifest DECLARES as + # a tool source, so ``verify`` will gate it: route straight to verify. + # * ``undeclared_gap`` — the trigger flags the diff as a tool-surface + # change but the manifest declares no matching source (or there is no + # manifest), so ``verify`` cannot gate it yet: route to declare-then- + # verify instead of a misleading clean ``verify`` over a surface it + # never scans. + # + # The undeclared case requires a changed file the boundary evaluator does + # NOT itself inspect (``not is_boundary_path``): a clean ``allow`` over a + # purely Codex-local surface (``.codex/config.toml``, ``AGENTS.md``, the + # Shipgate workflow) is authoritative — ``check`` already evaluated it — so + # it is not a coverage gap. Same exclusion ``_declared_tool_surfaces_changed`` + # applies to declared sources. + boundary_clean_allow = decision == "allow" and release_decision is None coverage_surfaces = sorted(dict.fromkeys(capability_surfaces_changed or [])) - coverage_gap = decision == "allow" and release_decision is None and bool(coverage_surfaces) - if coverage_gap: + coverage_gap = boundary_clean_allow and bool(coverage_surfaces) + undeclared_gap = ( + boundary_clean_allow + and not coverage_surfaces + and _trigger_in_scope(trigger) + and any(not is_boundary_path(path) for path in changed_files) + ) + if coverage_gap or undeclared_gap: decision = "warn" risk_level = _risk_for(violations) @@ -484,6 +507,12 @@ def add(rule_id: str, *, path: str | None, evidence: dict[str, Any]) -> None: diagnostics = [*diagnostics, _coverage_diagnostic(coverage_surfaces)] trace = [*_trace_for(policy, decision, violations), _coverage_trace(coverage_surfaces)] suggested_fixes = [_VERIFY_COMMAND] + elif undeclared_gap: + first_next_action = _undeclared_next_action() + summary = _undeclared_summary() + diagnostics = [*diagnostics, _undeclared_diagnostic()] + trace = [*_trace_for(policy, decision, violations), _undeclared_trace()] + suggested_fixes = [_DETECT_COMMAND, _VERIFY_COMMAND] else: first_next_action = _next_action_for(decision, violations, repair) summary = _summary_for(decision, violations) @@ -1338,6 +1367,70 @@ def _risk_for(violations: list[AgentResultViolatedRule]) -> AgentResultRiskLevel # auto-detects the base (v0.13) and emits the boundary-result surface, so it # works for both the local working tree and committed refs. _VERIFY_COMMAND = "agents-shipgate verify --json" +_DETECT_COMMAND = "agents-shipgate detect --json" + + +def _trigger_in_scope(trigger: dict[str, Any] | None) -> bool: + """True when a CONTENT-driven trigger rule flags a tool/capability-surface change. + + Keys on a matched rule whose ``action`` is ``run_shipgate`` — the + path/diff-driven tool-surface rules (MCP export, tool decorator, OpenAPI, + framework bump). Deliberately NOT the top-level ``run_shipgate`` boolean: + that is also ``True`` for the blanket ``force_run`` rule + (``TRIGGER-EXISTING-MANIFEST-PRESENT``), which fires on every diff in an + opted-in repo and would turn a docs-only edit into a coverage warn. ``None`` + (a bare function call with no trigger context) is out of scope, so the + boundary evaluator keeps its prior behavior when called directly. + """ + if not isinstance(trigger, dict): + return False + return any( + isinstance(rule, dict) and rule.get("action") == "run_shipgate" + for rule in trigger.get("matched_rules") or [] + ) + + +def _undeclared_next_action() -> AgentResultNextAction: + return AgentResultNextAction( + actor="coding_agent", + kind="warn", + command=_DETECT_COMMAND, + why=( + "This diff changes a tool/capability surface that shipgate.yaml does not " + "declare, so neither check nor verify gates it yet. Declare the surface " + "(run detect or add it to tool_sources), then run verify before completing." + ), + ) + + +def _undeclared_summary() -> str: + return ( + "No Codex boundary rule fired, but the diff changes a tool/capability surface " + "that shipgate.yaml does not declare, so verify cannot gate it yet. Declare it " + "(detect or tool_sources) and run verify before reporting completion." + ) + + +def _undeclared_diagnostic() -> AgentResultDiagnostic: + return AgentResultDiagnostic( + level="warning", + code="undeclared_capability_surface", + message=( + "A changed tool/capability surface is not declared in shipgate.yaml; " + "check is boundary-only and verify only gates declared surfaces. Declare " + "the surface, then verify." + ), + ) + + +def _undeclared_trace() -> AgentResultTraceEvent: + return AgentResultTraceEvent( + step="coverage", + summary=( + "boundary_only: in-scope diff changes an undeclared tool surface; " + "routed to detect + verify." + ), + ) def _coverage_next_action() -> AgentResultNextAction: diff --git a/tests/test_codex_boundary_check.py b/tests/test_codex_boundary_check.py index 287e6228..505aba20 100644 --- a/tests/test_codex_boundary_check.py +++ b/tests/test_codex_boundary_check.py @@ -246,6 +246,137 @@ def test_check_does_not_warn_on_docs_change_in_opted_in_repo(tmp_path: Path) -> assert result.decision == "allow" +# --- Undeclared coverage gap: the trigger flags a tool-surface change the ----- +# manifest does not declare (or there is no manifest). verify cannot gate an +# undeclared surface, so route to declare-then-verify rather than a clean allow. + +# A trigger context where a content-driven tool-surface rule fired (what the +# real trigger emits for an MCP/tool-export change), distinct from the blanket +# manifest-present force_run rule. +_SURFACE_TRIGGER = { + "run_shipgate": True, + "matched_rules": [ + {"id": "TRIGGER-MCP-EXPORT-CHANGED", "action": "run_shipgate"} + ], +} +# Manifest-present force_run only: in scope for "run shipgate" but NOT a +# tool-surface change, so it must not trip the undeclared coverage warn. +_FORCE_RUN_TRIGGER = { + "run_shipgate": True, + "matched_rules": [ + {"id": "TRIGGER-EXISTING-MANIFEST-PRESENT", "action": "force_run"} + ], +} + + +def test_undeclared_tool_surface_in_scope_trigger_warns_and_routes_to_detect( + tmp_path: Path, +) -> None: + result = evaluate_codex_boundary_result( + workspace=tmp_path, + diff_text=_TOOL_SOURCE_DIFF, + agent="claude-code", + trigger=_SURFACE_TRIGGER, + capability_surfaces_changed=None, + ) + payload = result.model_dump(mode="json", exclude_none=True) + _validate(payload) + # Was a bare allow before the fix; now a warn that routes to detect/declare. + assert payload["decision"] == "warn" + assert payload["completion_allowed"] is True + assert payload["must_stop"] is False + assert payload["first_next_action"]["kind"] == "warn" + assert payload["first_next_action"]["command"].startswith("agents-shipgate detect") + assert any(d["code"] == "undeclared_capability_surface" for d in payload["diagnostics"]) + assert any(t["step"] == "coverage" for t in payload["trace"]) + assert payload["suggested_fixes"][0].startswith("agents-shipgate detect") + assert any(fix.startswith("agents-shipgate verify") for fix in payload["suggested_fixes"]) + + +def test_no_manifest_capability_add_via_check_warns_and_routes_to_detect( + tmp_path: Path, +) -> None: + # End-to-end: empty workspace (no shipgate.yaml). build_codex_agent_result + # derives no declared surfaces, but the trigger flags the diff in scope. + result = build_codex_agent_result( + agent="claude-code", + workspace=tmp_path, + diff_text=_TOOL_SOURCE_DIFF, + config=Path("shipgate.yaml"), + policy=None, + ) + assert result.decision == "warn" + assert result.first_next_action.command.startswith("agents-shipgate detect") + + +def test_capability_add_to_undeclared_surface_warns_when_manifest_declares_other( + tmp_path: Path, +) -> None: + # Manifest exists but declares a *different* tool source than the changed + # file. The declared-coverage path does not match, so the undeclared path + # must catch it (and route to detect, not a verify that never scans it). + _write_manifest( + tmp_path, + " - id: other\n type: mcp\n path: other-tools.json\n trust: internal\n", + ) + result = build_codex_agent_result( + agent="claude-code", + workspace=tmp_path, + diff_text=_TOOL_SOURCE_DIFF, + config=Path("shipgate.yaml"), + policy=None, + ) + assert result.decision == "warn" + assert result.first_next_action.command.startswith("agents-shipgate detect") + payload = result.model_dump(mode="json", exclude_none=True) + assert any(d["code"] == "undeclared_capability_surface" for d in payload["diagnostics"]) + + +def test_undeclared_gap_never_downgrades_a_block(tmp_path: Path) -> None: + # A real boundary block plus an in-scope trigger must stay blocked, not warn. + block_diff = (CORPUS / "mcp_auto_approve_write.diff").read_text(encoding="utf-8") + result = evaluate_codex_boundary_result( + workspace=tmp_path, + diff_text=block_diff, + agent="claude-code", + trigger=_SURFACE_TRIGGER, + ) + assert result.decision == "block" + + +def test_undeclared_gap_inactive_out_of_scope_or_when_release_decision_present( + tmp_path: Path, +) -> None: + # Manifest-present force_run is "in scope" overall but is NOT a tool-surface + # change, so it must keep the clean allow (the "no noise on docs" property). + force_run_only = evaluate_codex_boundary_result( + workspace=tmp_path, + diff_text=_TOOL_SOURCE_DIFF, + agent="claude-code", + trigger=_FORCE_RUN_TRIGGER, + ) + assert force_run_only.decision == "allow" + + # No trigger context at all (bare call) preserves legacy behavior. + no_trigger = evaluate_codex_boundary_result( + workspace=tmp_path, + diff_text=_TOOL_SOURCE_DIFF, + agent="claude-code", + ) + assert no_trigger.decision == "allow" + + # A supplied release_decision means the full scan already ran; the + # projection governs and the boundary-only heuristic stays out of it. + scanned = evaluate_codex_boundary_result( + workspace=tmp_path, + diff_text=_TOOL_SOURCE_DIFF, + agent="claude-code", + trigger=_SURFACE_TRIGGER, + release_decision={"decision": "passed", "reason": "clean"}, + ) + assert scanned.decision == "allow" + + def test_codex_check_reads_diff_from_stdin(tmp_path: Path) -> None: diff_text = (CORPUS / "docs_only.diff").read_text(encoding="utf-8") result = runner.invoke( From a721bbae15383e7d725732e9c4783b42946df432 Mon Sep 17 00:00:00 2001 From: Pengfei Hu Date: Mon, 22 Jun 2026 11:31:43 -0700 Subject: [PATCH 2/3] Classify undeclared tool surfaces per-file (review P1/P2) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Addresses two review findings on the boundary coverage-gap defer. P1 — mixed declared + undeclared diffs only routed to `verify`. The undeclared gap was gated on `not coverage_surfaces`, making declared and undeclared mutually exclusive: a diff that changed a declared MCP export *and* an undeclared `api/openapi.yaml` routed to `verify`, which never scans the undeclared surface, so the new capability escaped the gate. Undeclared surfaces are now computed independently, per-file, and take precedence — a mixed diff routes to `detect` (declare-then-verify), which then covers both. P2 — `_trigger_in_scope` treated every `run_shipgate` rule as a tool-surface rule, so `shipgate.yaml` (TRIGGER-SHIPGATE-MANIFEST), `prompts/**` (TRIGGER-PROMPTS-OR-POLICIES), and the CI gate were mislabeled `undeclared_capability_surface` and routed to `detect`. Classification now keys on the actual tool-source rules only (MCP export, OpenAPI, static tool inventory, Codex plugin, n8n, @function_tool), excluding the manifest, prompts/policies, CI workflow, and `.codex` boundary config (which the boundary check itself evaluates). Implementation: `build_codex_agent_result` classifies each changed file per-file via the public trigger evaluator (glob rules match the path; diff_contains rules match the file's added lines), excluding boundary paths and already-declared files, and passes an explicit `undeclared_capability_surfaces` list to `evaluate_codex_boundary_result`. The coarse `_trigger_in_scope` + `is_boundary_path` heuristic is removed in favor of that explicit list. Tests: added mixed declared+undeclared cases (direct + via `check`) and manifest/prompts no-false-detect cases. Full suite green. Co-Authored-By: Claude Opus 4.8 --- src/agents_shipgate/cli/agent_result.py | 85 ++++++++++- src/agents_shipgate/core/codex_boundary.py | 106 +++++-------- tests/test_codex_boundary_check.py | 167 +++++++++++++++------ 3 files changed, 243 insertions(+), 115 deletions(-) diff --git a/src/agents_shipgate/cli/agent_result.py b/src/agents_shipgate/cli/agent_result.py index 0237164a..4d361ba0 100644 --- a/src/agents_shipgate/cli/agent_result.py +++ b/src/agents_shipgate/cli/agent_result.py @@ -11,9 +11,28 @@ parse_unified_diff, ) from agents_shipgate.schemas.codex_boundary_result import CodexBoundaryResultV1 -from agents_shipgate.triggers import _git_diff_context +from agents_shipgate.triggers import _git_diff_context, load_triggers from agents_shipgate.triggers import evaluate as evaluate_trigger +# Trigger rule IDs that mark a changed file as a *tool/capability source* — +# the surfaces ``verify`` compiles into the capability delta. Deliberately +# EXCLUDES the other ``run_shipgate`` rules that are not declarable tool +# sources: ``TRIGGER-SHIPGATE-MANIFEST`` (the manifest itself), +# ``TRIGGER-PROMPTS-OR-POLICIES`` (prompts/policies), ``TRIGGER-SHIPGATE-CI- +# WORKFLOW`` (the gate), and ``TRIGGER-CODEX-BOUNDARY-CONFIG-CHANGED`` (which +# the boundary check itself evaluates). A change to one of those must not be +# mislabeled an "undeclared tool surface". +_TOOL_SOURCE_TRIGGER_IDS = frozenset( + { + "TRIGGER-MCP-EXPORT-CHANGED", + "TRIGGER-OPENAPI-SPEC-CHANGED", + "TRIGGER-STATIC-TOOL-INVENTORY-CHANGED", + "TRIGGER-CODEX-PLUGIN-CHANGED", + "TRIGGER-N8N-WORKFLOW-CHANGED", + "TRIGGER-FUNCTION-TOOL-DECORATOR", + } +) + def build_codex_agent_result( *, @@ -24,7 +43,8 @@ def build_codex_agent_result( policy: Path | None, ) -> CodexBoundaryResultV1: workspace = workspace.resolve() - changed_files = sorted({item.path for item in parse_unified_diff(diff_text) if item.path}) + diff_files = parse_unified_diff(diff_text) + changed_files = sorted({item.path for item in diff_files if item.path}) config_path = config if config.is_absolute() else workspace / config trigger = evaluate_trigger( paths=changed_files, @@ -32,20 +52,75 @@ def build_codex_agent_result( manifest_present=config_path.is_file(), user_requested=True, ) + declared = _declared_tool_surfaces_changed( + workspace=workspace, + config_path=config_path, + changed_files=changed_files, + ) return evaluate_codex_boundary_result( workspace=workspace, diff_text=diff_text, agent=agent, policy_path=policy, trigger=trigger, - capability_surfaces_changed=_declared_tool_surfaces_changed( - workspace=workspace, - config_path=config_path, + capability_surfaces_changed=declared, + undeclared_capability_surfaces=_undeclared_tool_surfaces_changed( + diff_files=diff_files, changed_files=changed_files, + declared=declared, ), ) +def _undeclared_tool_surfaces_changed( + *, + diff_files: list[Any], + changed_files: list[str], + declared: list[str], +) -> list[str]: + """Changed files that are tool/capability surfaces the manifest does NOT declare. + + ``verify`` only gates *declared* tool sources, so an undeclared surface (a + new MCP/OpenAPI/tool-inventory/codex-plugin file, or an SDK/n8n tool the + manifest does not list) escapes the gate even though ``check`` returns a + clean ``allow``. Each changed file is classified per-file against only the + *tool-source* trigger rules (so the manifest, prompts/policies, the CI gate, + and ``.codex`` boundary config are never mislabeled), excluding boundary + paths (the boundary evaluator already inspects those) and files the manifest + already declares (``verify`` gates those). Computed independently of the + declared set so a mixed diff — one declared surface plus one undeclared — + still surfaces the undeclared one. + """ + if not changed_files: + return [] + declared_set = set(declared) + added_by_path = { + item.path: "\n".join(getattr(item, "added_lines", []) or []) + for item in diff_files + if item.path + } + catalog = load_triggers() + undeclared: list[str] = [] + for path in changed_files: + if path in declared_set or is_boundary_path(path): + continue + # Per-file classification: a glob rule matches on the path, a + # diff_contains rule (n8n, @function_tool) on this file's added lines. + result = evaluate_trigger( + paths=[path], + diff_text=added_by_path.get(path, ""), + manifest_present=False, + user_requested=False, + triggers=catalog, + ) + if any( + rule.get("id") in _TOOL_SOURCE_TRIGGER_IDS + for rule in result.get("matched_rules", []) + ): + undeclared.append(path) + return sorted(dict.fromkeys(undeclared)) + + def _declared_tool_surfaces_changed( *, workspace: Path, diff --git a/src/agents_shipgate/core/codex_boundary.py b/src/agents_shipgate/core/codex_boundary.py index aab63100..62fec0ea 100644 --- a/src/agents_shipgate/core/codex_boundary.py +++ b/src/agents_shipgate/core/codex_boundary.py @@ -376,16 +376,24 @@ def evaluate_codex_boundary_result( trigger: dict[str, Any] | None = None, release_decision: dict[str, Any] | None = None, capability_surfaces_changed: list[str] | None = None, + undeclared_capability_surfaces: list[str] | None = None, ) -> AgentResultV1: """Return the local Codex boundary-result projection for a unified diff. - ``capability_surfaces_changed`` lists changed files that the manifest - declares as tool sources. The boundary evaluator does not inspect tool - surfaces (only ``verify`` computes the capability delta), so when one - changed and the boundary result is otherwise a clean ``allow``, the - result is escalated to ``warn`` and routed to ``verify`` rather than - green-lighting a capability change ``check`` never evaluated. This keeps - ``check`` from disagreeing with the ``release_decision.decision`` gate. + The boundary evaluator does not inspect tool surfaces (only ``verify`` + computes the capability delta), so a clean ``allow`` over a changed tool + surface is escalated to ``warn`` rather than green-lighting a capability + change ``check`` never evaluated. This keeps ``check`` from disagreeing + with the ``release_decision.decision`` gate. Two inputs drive it (both + pre-computed by ``build_codex_agent_result``): + + - ``capability_surfaces_changed`` — changed files the manifest DECLARES as + tool sources. ``verify`` will gate these, so route to ``verify``. + - ``undeclared_capability_surfaces`` — changed files that ARE tool surfaces + but the manifest does not declare (or there is no manifest). ``verify`` + cannot gate an undeclared surface, so route to declare-then-verify + (``detect``). Takes precedence when a diff changes both, since ``verify`` + alone would miss the undeclared one. """ # Keep this local diff projector aligned with @@ -461,32 +469,17 @@ def add(rule_id: str, *, path: str | None, evidence: dict[str, Any]) -> None: # that touches a tool/capability surface would silently green-light a # capability change that only ``verify`` gates. Escalate to ``warn`` and # route onward. Gated on ``release_decision is None`` because a provided - # release decision means the full capability scan already ran. - # - # Two sub-cases: - # * ``coverage_gap`` — the changed surface is one the manifest DECLARES as - # a tool source, so ``verify`` will gate it: route straight to verify. - # * ``undeclared_gap`` — the trigger flags the diff as a tool-surface - # change but the manifest declares no matching source (or there is no - # manifest), so ``verify`` cannot gate it yet: route to declare-then- - # verify instead of a misleading clean ``verify`` over a surface it - # never scans. - # - # The undeclared case requires a changed file the boundary evaluator does - # NOT itself inspect (``not is_boundary_path``): a clean ``allow`` over a - # purely Codex-local surface (``.codex/config.toml``, ``AGENTS.md``, the - # Shipgate workflow) is authoritative — ``check`` already evaluated it — so - # it is not a coverage gap. Same exclusion ``_declared_tool_surfaces_changed`` - # applies to declared sources. + # release decision means the full capability scan already ran. The two + # surface lists are classified per-file upstream (declared vs undeclared); + # see the docstring. ``undeclared_gap`` is computed independently of the + # declared set and takes precedence so a mixed diff routes to + # declare-then-verify rather than a ``verify`` that misses the undeclared + # surface. boundary_clean_allow = decision == "allow" and release_decision is None coverage_surfaces = sorted(dict.fromkeys(capability_surfaces_changed or [])) + undeclared_surfaces = sorted(dict.fromkeys(undeclared_capability_surfaces or [])) coverage_gap = boundary_clean_allow and bool(coverage_surfaces) - undeclared_gap = ( - boundary_clean_allow - and not coverage_surfaces - and _trigger_in_scope(trigger) - and any(not is_boundary_path(path) for path in changed_files) - ) + undeclared_gap = boundary_clean_allow and bool(undeclared_surfaces) if coverage_gap or undeclared_gap: decision = "warn" @@ -501,18 +494,18 @@ def add(rule_id: str, *, path: str | None, evidence: dict[str, Any]) -> None: finding_fingerprints=finding_fingerprints, evaluated_files=evaluated_files, ) - if coverage_gap: + if undeclared_gap: + first_next_action = _undeclared_next_action() + summary = _undeclared_summary(undeclared_surfaces) + diagnostics = [*diagnostics, _undeclared_diagnostic(undeclared_surfaces)] + trace = [*_trace_for(policy, decision, violations), _undeclared_trace(undeclared_surfaces)] + suggested_fixes = [_DETECT_COMMAND, _VERIFY_COMMAND] + elif coverage_gap: first_next_action = _coverage_next_action() summary = _coverage_summary(coverage_surfaces) diagnostics = [*diagnostics, _coverage_diagnostic(coverage_surfaces)] trace = [*_trace_for(policy, decision, violations), _coverage_trace(coverage_surfaces)] suggested_fixes = [_VERIFY_COMMAND] - elif undeclared_gap: - first_next_action = _undeclared_next_action() - summary = _undeclared_summary() - diagnostics = [*diagnostics, _undeclared_diagnostic()] - trace = [*_trace_for(policy, decision, violations), _undeclared_trace()] - suggested_fixes = [_DETECT_COMMAND, _VERIFY_COMMAND] else: first_next_action = _next_action_for(decision, violations, repair) summary = _summary_for(decision, violations) @@ -1370,26 +1363,6 @@ def _risk_for(violations: list[AgentResultViolatedRule]) -> AgentResultRiskLevel _DETECT_COMMAND = "agents-shipgate detect --json" -def _trigger_in_scope(trigger: dict[str, Any] | None) -> bool: - """True when a CONTENT-driven trigger rule flags a tool/capability-surface change. - - Keys on a matched rule whose ``action`` is ``run_shipgate`` — the - path/diff-driven tool-surface rules (MCP export, tool decorator, OpenAPI, - framework bump). Deliberately NOT the top-level ``run_shipgate`` boolean: - that is also ``True`` for the blanket ``force_run`` rule - (``TRIGGER-EXISTING-MANIFEST-PRESENT``), which fires on every diff in an - opted-in repo and would turn a docs-only edit into a coverage warn. ``None`` - (a bare function call with no trigger context) is out of scope, so the - boundary evaluator keeps its prior behavior when called directly. - """ - if not isinstance(trigger, dict): - return False - return any( - isinstance(rule, dict) and rule.get("action") == "run_shipgate" - for rule in trigger.get("matched_rules") or [] - ) - - def _undeclared_next_action() -> AgentResultNextAction: return AgentResultNextAction( actor="coding_agent", @@ -1403,31 +1376,32 @@ def _undeclared_next_action() -> AgentResultNextAction: ) -def _undeclared_summary() -> str: +def _undeclared_summary(surfaces: list[str]) -> str: return ( "No Codex boundary rule fired, but the diff changes a tool/capability surface " - "that shipgate.yaml does not declare, so verify cannot gate it yet. Declare it " - "(detect or tool_sources) and run verify before reporting completion." + f"({', '.join(surfaces[:5])}) that shipgate.yaml does not declare, so verify " + "cannot gate it yet. Declare it (detect or tool_sources) and run verify before " + "reporting completion." ) -def _undeclared_diagnostic() -> AgentResultDiagnostic: +def _undeclared_diagnostic(surfaces: list[str]) -> AgentResultDiagnostic: return AgentResultDiagnostic( level="warning", code="undeclared_capability_surface", message=( - "A changed tool/capability surface is not declared in shipgate.yaml; " - "check is boundary-only and verify only gates declared surfaces. Declare " - "the surface, then verify." + "Undeclared tool/capability surface(s) " + f"{', '.join(surfaces[:5])} changed; check is boundary-only and verify " + "only gates declared surfaces. Declare the surface, then verify." ), ) -def _undeclared_trace() -> AgentResultTraceEvent: +def _undeclared_trace(surfaces: list[str]) -> AgentResultTraceEvent: return AgentResultTraceEvent( step="coverage", summary=( - "boundary_only: in-scope diff changes an undeclared tool surface; " + f"boundary_only: {len(surfaces)} undeclared tool surface(s) changed; " "routed to detect + verify." ), ) diff --git a/tests/test_codex_boundary_check.py b/tests/test_codex_boundary_check.py index 505aba20..4c025eeb 100644 --- a/tests/test_codex_boundary_check.py +++ b/tests/test_codex_boundary_check.py @@ -246,38 +246,29 @@ def test_check_does_not_warn_on_docs_change_in_opted_in_repo(tmp_path: Path) -> assert result.decision == "allow" -# --- Undeclared coverage gap: the trigger flags a tool-surface change the ----- -# manifest does not declare (or there is no manifest). verify cannot gate an -# undeclared surface, so route to declare-then-verify rather than a clean allow. - -# A trigger context where a content-driven tool-surface rule fired (what the -# real trigger emits for an MCP/tool-export change), distinct from the blanket -# manifest-present force_run rule. -_SURFACE_TRIGGER = { - "run_shipgate": True, - "matched_rules": [ - {"id": "TRIGGER-MCP-EXPORT-CHANGED", "action": "run_shipgate"} - ], -} -# Manifest-present force_run only: in scope for "run shipgate" but NOT a -# tool-surface change, so it must not trip the undeclared coverage warn. -_FORCE_RUN_TRIGGER = { - "run_shipgate": True, - "matched_rules": [ - {"id": "TRIGGER-EXISTING-MANIFEST-PRESENT", "action": "force_run"} - ], -} +# --- Undeclared coverage gap: a changed file IS a tool surface but the -------- +# manifest does not declare it (or there is no manifest). verify only gates +# declared surfaces, so route to declare-then-verify (detect) rather than a +# clean allow or a verify that never scans it. + +# A second changed file that is an *undeclared* tool surface (an OpenAPI spec), +# used to exercise mixed declared+undeclared diffs (review finding P1). +_MIXED_TOOL_SOURCE_DIFF = _TOOL_SOURCE_DIFF + ( + "diff --git a/api/openapi.yaml b/api/openapi.yaml\n" + "--- a/api/openapi.yaml\n" + "+++ b/api/openapi.yaml\n" + "@@ -1 +1,2 @@\n" + " openapi: 3.0.0\n" + "+paths: {}\n" +) -def test_undeclared_tool_surface_in_scope_trigger_warns_and_routes_to_detect( - tmp_path: Path, -) -> None: +def test_undeclared_surface_warns_and_routes_to_detect(tmp_path: Path) -> None: result = evaluate_codex_boundary_result( workspace=tmp_path, diff_text=_TOOL_SOURCE_DIFF, agent="claude-code", - trigger=_SURFACE_TRIGGER, - capability_surfaces_changed=None, + undeclared_capability_surfaces=["mcp-tools.json"], ) payload = result.model_dump(mode="json", exclude_none=True) _validate(payload) @@ -293,11 +284,30 @@ def test_undeclared_tool_surface_in_scope_trigger_warns_and_routes_to_detect( assert any(fix.startswith("agents-shipgate verify") for fix in payload["suggested_fixes"]) +def test_mixed_declared_and_undeclared_routes_to_detect(tmp_path: Path) -> None: + # Review finding P1: a diff that changes BOTH a declared surface (verify + # gates it) and an undeclared one (verify does not) must route to detect — + # declare-then-verify — not a verify that silently misses the undeclared + # surface. Undeclared takes precedence over the declared coverage gap. + result = evaluate_codex_boundary_result( + workspace=tmp_path, + diff_text=_MIXED_TOOL_SOURCE_DIFF, + agent="claude-code", + capability_surfaces_changed=["mcp-tools.json"], + undeclared_capability_surfaces=["api/openapi.yaml"], + ) + assert result.decision == "warn" + assert result.first_next_action.command.startswith("agents-shipgate detect") + payload = result.model_dump(mode="json", exclude_none=True) + diag = next(d for d in payload["diagnostics"] if d["code"] == "undeclared_capability_surface") + assert "api/openapi.yaml" in diag["message"] + + def test_no_manifest_capability_add_via_check_warns_and_routes_to_detect( tmp_path: Path, ) -> None: # End-to-end: empty workspace (no shipgate.yaml). build_codex_agent_result - # derives no declared surfaces, but the trigger flags the diff in scope. + # classifies mcp-tools.json as an undeclared tool surface. result = build_codex_agent_result( agent="claude-code", workspace=tmp_path, @@ -332,38 +342,107 @@ def test_capability_add_to_undeclared_surface_warns_when_manifest_declares_other assert any(d["code"] == "undeclared_capability_surface" for d in payload["diagnostics"]) +def test_mixed_declared_and_undeclared_via_check_routes_to_detect( + tmp_path: Path, +) -> None: + # Review finding P1, end-to-end through build_codex_agent_result: manifest + # declares mcp-tools.json; the diff also adds an undeclared OpenAPI spec. + _write_manifest( + tmp_path, + " - id: mcp\n type: mcp\n path: mcp-tools.json\n trust: internal\n", + ) + result = build_codex_agent_result( + agent="claude-code", + workspace=tmp_path, + diff_text=_MIXED_TOOL_SOURCE_DIFF, + config=Path("shipgate.yaml"), + policy=None, + ) + assert result.decision == "warn" + assert result.first_next_action.command.startswith("agents-shipgate detect") + payload = result.model_dump(mode="json", exclude_none=True) + diag = next(d for d in payload["diagnostics"] if d["code"] == "undeclared_capability_surface") + assert "api/openapi.yaml" in diag["message"] + + +def test_manifest_edit_is_not_an_undeclared_tool_surface(tmp_path: Path) -> None: + # Review finding P2: editing shipgate.yaml in an opted-in repo fires a + # run_shipgate trigger rule (TRIGGER-SHIPGATE-MANIFEST) but is NOT a + # declarable tool source, so it must not be reported as an undeclared + # surface routed to detect. + _write_manifest( + tmp_path, + " - id: mcp\n type: mcp\n path: mcp-tools.json\n trust: internal\n", + ) + diff = ( + "diff --git a/shipgate.yaml b/shipgate.yaml\n" + "--- a/shipgate.yaml\n" + "+++ b/shipgate.yaml\n" + "@@ -1 +1,2 @@\n" + ' version: "0.1"\n' + "+# touch\n" + ) + result = build_codex_agent_result( + agent="claude-code", + workspace=tmp_path, + diff_text=diff, + config=Path("shipgate.yaml"), + policy=None, + ) + assert result.decision == "allow" + payload = result.model_dump(mode="json", exclude_none=True) + assert not any( + d["code"] == "undeclared_capability_surface" for d in payload.get("diagnostics", []) + ) + + +def test_prompts_edit_is_not_an_undeclared_tool_surface(tmp_path: Path) -> None: + # Review finding P2: a prompts/ edit fires TRIGGER-PROMPTS-OR-POLICIES but + # is not a declarable tool source — no undeclared-surface warn / detect. + _write_manifest( + tmp_path, + " - id: mcp\n type: mcp\n path: mcp-tools.json\n trust: internal\n", + ) + diff = ( + "diff --git a/prompts/system.md b/prompts/system.md\n" + "--- a/prompts/system.md\n" + "+++ b/prompts/system.md\n" + "@@ -1 +1,2 @@\n" + " You are helpful.\n" + "+Be concise.\n" + ) + result = build_codex_agent_result( + agent="claude-code", + workspace=tmp_path, + diff_text=diff, + config=Path("shipgate.yaml"), + policy=None, + ) + assert result.decision == "allow" + + def test_undeclared_gap_never_downgrades_a_block(tmp_path: Path) -> None: - # A real boundary block plus an in-scope trigger must stay blocked, not warn. + # A real boundary block plus an undeclared surface must stay blocked. block_diff = (CORPUS / "mcp_auto_approve_write.diff").read_text(encoding="utf-8") result = evaluate_codex_boundary_result( workspace=tmp_path, diff_text=block_diff, agent="claude-code", - trigger=_SURFACE_TRIGGER, + undeclared_capability_surfaces=["mcp-tools.json"], ) assert result.decision == "block" -def test_undeclared_gap_inactive_out_of_scope_or_when_release_decision_present( +def test_undeclared_gap_inactive_without_signal_or_when_release_decision_present( tmp_path: Path, ) -> None: - # Manifest-present force_run is "in scope" overall but is NOT a tool-surface - # change, so it must keep the clean allow (the "no noise on docs" property). - force_run_only = evaluate_codex_boundary_result( - workspace=tmp_path, - diff_text=_TOOL_SOURCE_DIFF, - agent="claude-code", - trigger=_FORCE_RUN_TRIGGER, - ) - assert force_run_only.decision == "allow" - - # No trigger context at all (bare call) preserves legacy behavior. - no_trigger = evaluate_codex_boundary_result( + # No undeclared surfaces supplied (bare call) preserves the clean allow. + no_signal = evaluate_codex_boundary_result( workspace=tmp_path, diff_text=_TOOL_SOURCE_DIFF, agent="claude-code", ) - assert no_trigger.decision == "allow" + assert no_signal.decision == "allow" # A supplied release_decision means the full scan already ran; the # projection governs and the boundary-only heuristic stays out of it. @@ -371,7 +450,7 @@ def test_undeclared_gap_inactive_out_of_scope_or_when_release_decision_present( workspace=tmp_path, diff_text=_TOOL_SOURCE_DIFF, agent="claude-code", - trigger=_SURFACE_TRIGGER, + undeclared_capability_surfaces=["mcp-tools.json"], release_decision={"decision": "passed", "reason": "clean"}, ) assert scanned.decision == "allow" From 9528237d389eeac987fe73054e363bbf1a728dcd Mon Sep 17 00:00:00 2001 From: Pengfei Hu Date: Mon, 22 Jun 2026 22:51:34 -0700 Subject: [PATCH 3/3] Respect the trigger's skip verdict in undeclared-surface classification MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit `_undeclared_tool_surfaces_changed` treated any matched tool-source rule as a tool surface, ignoring the evaluator's winning verdict. A docs or test file that incidentally mentions `@tool` / `@function_tool` matches `TRIGGER-FUNCTION-TOOL-DECORATOR` but ALSO `TRIGGER-DOCS-ONLY-NEGATIVE`, which beats `run_shipgate` — so the catalog skips it. The classifier nonetheless flagged `README.md` / `tests/test_*.py` as an `undeclared_capability_surface` and routed to `detect`, contradicting the trigger catalog (docs/triggers.json: docs/tests that incidentally mention `@tool` must not run). Require the per-file trigger to actually run (`run_shipgate` true) in addition to a tool-source rule appearing in `matched_rules`. A real `@function_tool` source (run_shipgate true) still classifies; a docs/test mention (skip) no longer does. Tests: docs file and tests/ file that mention the decorator stay `allow`. Co-Authored-By: Claude Opus 4.8 --- src/agents_shipgate/cli/agent_result.py | 7 ++- tests/test_codex_boundary_check.py | 59 +++++++++++++++++++++++++ 2 files changed, 65 insertions(+), 1 deletion(-) diff --git a/src/agents_shipgate/cli/agent_result.py b/src/agents_shipgate/cli/agent_result.py index 4d361ba0..d109f5eb 100644 --- a/src/agents_shipgate/cli/agent_result.py +++ b/src/agents_shipgate/cli/agent_result.py @@ -113,7 +113,12 @@ def _undeclared_tool_surfaces_changed( user_requested=False, triggers=catalog, ) - if any( + # Require the evaluator's WINNING verdict, not just a matched rule: a + # docs/test file that incidentally mentions ``@tool`` also matches + # ``TRIGGER-DOCS-ONLY-NEGATIVE``, which beats ``run_shipgate`` so the + # catalog skips it. Only treat the file as a tool surface when the + # trigger actually runs AND a tool-source rule is what carried it. + if result.get("run_shipgate") and any( rule.get("id") in _TOOL_SOURCE_TRIGGER_IDS for rule in result.get("matched_rules", []) ): diff --git a/tests/test_codex_boundary_check.py b/tests/test_codex_boundary_check.py index 4c025eeb..f6e0786a 100644 --- a/tests/test_codex_boundary_check.py +++ b/tests/test_codex_boundary_check.py @@ -421,6 +421,65 @@ def test_prompts_edit_is_not_an_undeclared_tool_surface(tmp_path: Path) -> None: assert result.decision == "allow" +def test_docs_file_mentioning_tool_decorator_is_not_undeclared_surface( + tmp_path: Path, +) -> None: + # Review finding: a docs file that incidentally mentions @tool matches the + # FUNCTION-TOOL-DECORATOR rule but ALSO TRIGGER-DOCS-ONLY-NEGATIVE, which + # wins — the catalog skips it, so it must not be flagged as an undeclared + # tool surface routed to detect. + _write_manifest( + tmp_path, + " - id: mcp\n type: mcp\n path: mcp-tools.json\n trust: internal\n", + ) + diff = ( + "diff --git a/README.md b/README.md\n" + "--- a/README.md\n" + "+++ b/README.md\n" + "@@ -1 +1,2 @@\n" + " # Docs\n" + "+Use the @tool / @function_tool decorator to register a tool.\n" + ) + result = build_codex_agent_result( + agent="claude-code", + workspace=tmp_path, + diff_text=diff, + config=Path("shipgate.yaml"), + policy=None, + ) + assert result.decision == "allow" + payload = result.model_dump(mode="json", exclude_none=True) + assert not any( + d["code"] == "undeclared_capability_surface" for d in payload.get("diagnostics", []) + ) + + +def test_test_file_mentioning_tool_decorator_is_not_undeclared_surface( + tmp_path: Path, +) -> None: + # Same property for a tests/ file: docs-only-negative covers tests/**. + _write_manifest( + tmp_path, + " - id: mcp\n type: mcp\n path: mcp-tools.json\n trust: internal\n", + ) + diff = ( + "diff --git a/tests/test_docs.py b/tests/test_docs.py\n" + "--- a/tests/test_docs.py\n" + "+++ b/tests/test_docs.py\n" + "@@ -1 +1,2 @@\n" + " def test_x():\n" + "+ # documents the @function_tool example\n" + ) + result = build_codex_agent_result( + agent="claude-code", + workspace=tmp_path, + diff_text=diff, + config=Path("shipgate.yaml"), + policy=None, + ) + assert result.decision == "allow" + + def test_undeclared_gap_never_downgrades_a_block(tmp_path: Path) -> None: # A real boundary block plus an undeclared surface must stay blocked. block_diff = (CORPUS / "mcp_auto_approve_write.diff").read_text(encoding="utf-8")