Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
90 changes: 85 additions & 5 deletions src/agents_shipgate/cli/agent_result.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,9 +11,28 @@
parse_unified_diff,
)
from agents_shipgate.schemas.codex_boundary_result import CodexBoundaryResultV1
from agents_shipgate.triggers import _git_diff_context
from agents_shipgate.triggers import _git_diff_context, load_triggers
from agents_shipgate.triggers import evaluate as evaluate_trigger

# Trigger rule IDs that mark a changed file as a *tool/capability source* —
# the surfaces ``verify`` compiles into the capability delta. Deliberately
# EXCLUDES the other ``run_shipgate`` rules that are not declarable tool
# sources: ``TRIGGER-SHIPGATE-MANIFEST`` (the manifest itself),
# ``TRIGGER-PROMPTS-OR-POLICIES`` (prompts/policies), ``TRIGGER-SHIPGATE-CI-
# WORKFLOW`` (the gate), and ``TRIGGER-CODEX-BOUNDARY-CONFIG-CHANGED`` (which
# the boundary check itself evaluates). A change to one of those must not be
# mislabeled an "undeclared tool surface".
_TOOL_SOURCE_TRIGGER_IDS = frozenset(
{
"TRIGGER-MCP-EXPORT-CHANGED",
"TRIGGER-OPENAPI-SPEC-CHANGED",
"TRIGGER-STATIC-TOOL-INVENTORY-CHANGED",
"TRIGGER-CODEX-PLUGIN-CHANGED",
"TRIGGER-N8N-WORKFLOW-CHANGED",
"TRIGGER-FUNCTION-TOOL-DECORATOR",
}
)


def build_codex_agent_result(
*,
Expand All @@ -24,28 +43,89 @@ def build_codex_agent_result(
policy: Path | None,
) -> CodexBoundaryResultV1:
workspace = workspace.resolve()
changed_files = sorted({item.path for item in parse_unified_diff(diff_text) if item.path})
diff_files = parse_unified_diff(diff_text)
changed_files = sorted({item.path for item in diff_files if item.path})
config_path = config if config.is_absolute() else workspace / config
trigger = evaluate_trigger(
paths=changed_files,
diff_text=diff_text,
manifest_present=config_path.is_file(),
user_requested=True,
)
declared = _declared_tool_surfaces_changed(
workspace=workspace,
config_path=config_path,
changed_files=changed_files,
)
return evaluate_codex_boundary_result(
workspace=workspace,
diff_text=diff_text,
agent=agent,
policy_path=policy,
trigger=trigger,
capability_surfaces_changed=_declared_tool_surfaces_changed(
workspace=workspace,
config_path=config_path,
capability_surfaces_changed=declared,
undeclared_capability_surfaces=_undeclared_tool_surfaces_changed(
diff_files=diff_files,
changed_files=changed_files,
declared=declared,
),
)


def _undeclared_tool_surfaces_changed(
*,
diff_files: list[Any],
changed_files: list[str],
declared: list[str],
) -> list[str]:
"""Changed files that are tool/capability surfaces the manifest does NOT declare.

``verify`` only gates *declared* tool sources, so an undeclared surface (a
new MCP/OpenAPI/tool-inventory/codex-plugin file, or an SDK/n8n tool the
manifest does not list) escapes the gate even though ``check`` returns a
clean ``allow``. Each changed file is classified per-file against only the
*tool-source* trigger rules (so the manifest, prompts/policies, the CI gate,
and ``.codex`` boundary config are never mislabeled), excluding boundary
paths (the boundary evaluator already inspects those) and files the manifest
already declares (``verify`` gates those). Computed independently of the
declared set so a mixed diff — one declared surface plus one undeclared —
still surfaces the undeclared one.
"""
if not changed_files:
return []
declared_set = set(declared)
added_by_path = {
item.path: "\n".join(getattr(item, "added_lines", []) or [])
for item in diff_files
if item.path
}
catalog = load_triggers()
undeclared: list[str] = []
for path in changed_files:
if path in declared_set or is_boundary_path(path):
continue
# Per-file classification: a glob rule matches on the path, a
# diff_contains rule (n8n, @function_tool) on this file's added lines.
result = evaluate_trigger(
paths=[path],
diff_text=added_by_path.get(path, ""),
manifest_present=False,
user_requested=False,
triggers=catalog,
)
# Require the evaluator's WINNING verdict, not just a matched rule: a
# docs/test file that incidentally mentions ``@tool`` also matches
# ``TRIGGER-DOCS-ONLY-NEGATIVE``, which beats ``run_shipgate`` so the
# catalog skips it. Only treat the file as a tool surface when the
# trigger actually runs AND a tool-source rule is what carried it.
if result.get("run_shipgate") and any(
rule.get("id") in _TOOL_SOURCE_TRIGGER_IDS
for rule in result.get("matched_rules", [])
):
undeclared.append(path)
return sorted(dict.fromkeys(undeclared))


def _declared_tool_surfaces_changed(
*,
workspace: Path,
Expand Down
95 changes: 81 additions & 14 deletions src/agents_shipgate/core/codex_boundary.py
Original file line number Diff line number Diff line change
Expand Up @@ -376,16 +376,24 @@ def evaluate_codex_boundary_result(
trigger: dict[str, Any] | None = None,
release_decision: dict[str, Any] | None = None,
capability_surfaces_changed: list[str] | None = None,
undeclared_capability_surfaces: list[str] | None = None,
) -> AgentResultV1:
"""Return the local Codex boundary-result projection for a unified diff.

``capability_surfaces_changed`` lists changed files that the manifest
declares as tool sources. The boundary evaluator does not inspect tool
surfaces (only ``verify`` computes the capability delta), so when one
changed and the boundary result is otherwise a clean ``allow``, the
result is escalated to ``warn`` and routed to ``verify`` rather than
green-lighting a capability change ``check`` never evaluated. This keeps
``check`` from disagreeing with the ``release_decision.decision`` gate.
The boundary evaluator does not inspect tool surfaces (only ``verify``
computes the capability delta), so a clean ``allow`` over a changed tool
surface is escalated to ``warn`` rather than green-lighting a capability
change ``check`` never evaluated. This keeps ``check`` from disagreeing
with the ``release_decision.decision`` gate. Two inputs drive it (both
pre-computed by ``build_codex_agent_result``):

- ``capability_surfaces_changed`` — changed files the manifest DECLARES as
tool sources. ``verify`` will gate these, so route to ``verify``.
- ``undeclared_capability_surfaces`` — changed files that ARE tool surfaces
but the manifest does not declare (or there is no manifest). ``verify``
cannot gate an undeclared surface, so route to declare-then-verify
(``detect``). Takes precedence when a diff changes both, since ``verify``
alone would miss the undeclared one.
"""

# Keep this local diff projector aligned with
Expand Down Expand Up @@ -458,13 +466,21 @@ def add(rule_id: str, *, path: str | None, evidence: dict[str, Any]) -> None:
decision = _decision_for(violations, release_decision=release_decision)

# Coverage gap: check is boundary-only, so a clean ``allow`` over a diff
# that touches a manifest-declared tool surface would silently green-light
# a capability change that only ``verify`` gates. Escalate to ``warn`` and
# route to verify instead. Gated on ``release_decision is None`` because a
# provided release decision means the full capability scan already ran.
# that touches a tool/capability surface would silently green-light a
# capability change that only ``verify`` gates. Escalate to ``warn`` and
# route onward. Gated on ``release_decision is None`` because a provided
# release decision means the full capability scan already ran. The two
# surface lists are classified per-file upstream (declared vs undeclared);
# see the docstring. ``undeclared_gap`` is computed independently of the
# declared set and takes precedence so a mixed diff routes to
# declare-then-verify rather than a ``verify`` that misses the undeclared
# surface.
boundary_clean_allow = decision == "allow" and release_decision is None
coverage_surfaces = sorted(dict.fromkeys(capability_surfaces_changed or []))
coverage_gap = decision == "allow" and release_decision is None and bool(coverage_surfaces)
if coverage_gap:
undeclared_surfaces = sorted(dict.fromkeys(undeclared_capability_surfaces or []))
coverage_gap = boundary_clean_allow and bool(coverage_surfaces)
undeclared_gap = boundary_clean_allow and bool(undeclared_surfaces)
if coverage_gap or undeclared_gap:
decision = "warn"

risk_level = _risk_for(violations)
Expand All @@ -478,7 +494,13 @@ def add(rule_id: str, *, path: str | None, evidence: dict[str, Any]) -> None:
finding_fingerprints=finding_fingerprints,
evaluated_files=evaluated_files,
)
if coverage_gap:
if undeclared_gap:
first_next_action = _undeclared_next_action()
summary = _undeclared_summary(undeclared_surfaces)
diagnostics = [*diagnostics, _undeclared_diagnostic(undeclared_surfaces)]
trace = [*_trace_for(policy, decision, violations), _undeclared_trace(undeclared_surfaces)]
suggested_fixes = [_DETECT_COMMAND, _VERIFY_COMMAND]
elif coverage_gap:
first_next_action = _coverage_next_action()
summary = _coverage_summary(coverage_surfaces)
diagnostics = [*diagnostics, _coverage_diagnostic(coverage_surfaces)]
Expand Down Expand Up @@ -1338,6 +1360,51 @@ def _risk_for(violations: list[AgentResultViolatedRule]) -> AgentResultRiskLevel
# auto-detects the base (v0.13) and emits the boundary-result surface, so it
# works for both the local working tree and committed refs.
_VERIFY_COMMAND = "agents-shipgate verify --json"
# Command suggested when a diff changes a tool surface the manifest does not
# declare: it is the ``command`` of ``_undeclared_next_action`` and is listed
# ahead of ``_VERIFY_COMMAND`` in the suggested fixes for that case.
_DETECT_COMMAND = "agents-shipgate detect --json"


def _undeclared_next_action() -> AgentResultNextAction:
    """Build the next action for a diff that changes an undeclared tool surface.

    The action is a ``warn`` addressed to the coding agent whose command is the
    detect command, with an explanation that the surface must be declared
    before ``verify`` can gate it.
    """
    rationale = (
        "This diff changes a tool/capability surface that shipgate.yaml does not "
        "declare, so neither check nor verify gates it yet. Declare the surface "
        "(run detect or add it to tool_sources), then run verify before completing."
    )
    return AgentResultNextAction(
        actor="coding_agent",
        kind="warn",
        command=_DETECT_COMMAND,
        why=rationale,
    )


def _undeclared_summary(surfaces: list[str]) -> str:
return (
"No Codex boundary rule fired, but the diff changes a tool/capability surface "
f"({', '.join(surfaces[:5])}) that shipgate.yaml does not declare, so verify "
"cannot gate it yet. Declare it (detect or tool_sources) and run verify before "
"reporting completion."
)


def _undeclared_diagnostic(surfaces: list[str]) -> AgentResultDiagnostic:
    """Build the warning diagnostic that names the undeclared tool surfaces.

    Args:
        surfaces: Undeclared tool-surface paths; only the first five are
            named in the message.

    Returns:
        A ``warning``-level diagnostic with code
        ``undeclared_capability_surface``.
    """
    shown = ", ".join(surfaces[:5])
    text = (
        "Undeclared tool/capability surface(s) "
        f"{shown} changed; check is boundary-only and verify "
        "only gates declared surfaces. Declare the surface, then verify."
    )
    return AgentResultDiagnostic(
        level="warning",
        code="undeclared_capability_surface",
        message=text,
    )


def _undeclared_trace(surfaces: list[str]) -> AgentResultTraceEvent:
    """Build the ``coverage`` trace event recording the undeclared-surface routing.

    Args:
        surfaces: Undeclared tool-surface paths; only their count is reported.

    Returns:
        A trace event whose summary states how many undeclared surfaces changed.
    """
    count = len(surfaces)
    note = (
        f"boundary_only: {count} undeclared tool surface(s) changed; "
        "routed to detect + verify."
    )
    return AgentResultTraceEvent(step="coverage", summary=note)


def _coverage_next_action() -> AgentResultNextAction:
Expand Down
Loading
Loading