From dfb0efbe544dae37677470a9a4974895d9d9fc45 Mon Sep 17 00:00:00 2001 From: CIA Operations Officer Jennifer Pike Date: Mon, 15 Jun 2026 14:36:31 -0700 Subject: [PATCH 1/4] feat(hopper): model pool routing in cecli.hopper Move fast/code/think hopper pool, prompt classification, escalation, and apply-route from host integration into cecli.hopper with optional preload resolver hooks. Add 65 unit tests under cecli/tests/hopper/. Co-authored-by: Cursor --- cecli/hopper/__init__.py | 85 ++ cecli/hopper/apply.py | 90 ++ cecli/hopper/router.py | 1184 +++++++++++++++++++++ tests/basic/test_sessions.py | 2 +- tests/helpers/monorepo/LOCAL_WORKSPACE.md | 2 +- tests/hopper/test_model_pool.py | 30 + tests/hopper/test_model_router.py | 611 +++++++++++ tests/hopper/test_model_router_apply.py | 208 ++++ tests/hopper/test_model_router_preload.py | 251 +++++ tests/hopper/test_model_router_warmup.py | 142 +++ 10 files changed, 2603 insertions(+), 2 deletions(-) create mode 100644 cecli/hopper/__init__.py create mode 100644 cecli/hopper/apply.py create mode 100644 cecli/hopper/router.py create mode 100644 tests/hopper/test_model_pool.py create mode 100644 tests/hopper/test_model_router.py create mode 100644 tests/hopper/test_model_router_apply.py create mode 100644 tests/hopper/test_model_router_preload.py create mode 100644 tests/hopper/test_model_router_warmup.py diff --git a/cecli/hopper/__init__.py b/cecli/hopper/__init__.py new file mode 100644 index 00000000000..2e08171401f --- /dev/null +++ b/cecli/hopper/__init__.py @@ -0,0 +1,85 @@ +"""Model hopper pool + prompt routing (fast / code / think tiers).""" + +from cecli.hopper.apply import ( + apply_hopper_extra_params, + apply_route_to_coder, + apply_thinking_extra_params, + merge_extra_params, +) +from cecli.hopper.router import ( + ModelPoolEntry, + ModelRouterConfig, + OllamaClient, + ResolvedModelPool, + RouteDecision, + RouteRole, + RouteTier, + RouteTurnContext, + classify_prompt, + context_exceeds_fast_model_limit, + 
escalation_target, + estimate_message_tokens, + estimate_prompt_tokens, + find_pool_entry, + inject_backend_extra_params, + lookup_model_max_input_tokens, + normalize_keep_alive_for_tier, + normalize_pool_tier, + normalize_route_role, + pick_tier_model, + pool_prefers_think, + pool_thinking_for_model, + preload_priority_list, + resolve_model_pool, + resolve_pool_entry_thinking, + resolve_provider_prefix, + resolve_tier_models, + role_to_legacy_tier, + set_backend_client_resolver, + set_static_vram_bytes_resolver, + should_escalate_code_turn, + should_escalate_fast_turn, + thinking_for_role, + warmup_keep_alive, +) + +__all__ = [ + "ModelPoolEntry", + "ModelRouterConfig", + "OllamaClient", + "ResolvedModelPool", + "RouteDecision", + "RouteRole", + "RouteTier", + "RouteTurnContext", + "apply_hopper_extra_params", + "apply_route_to_coder", + "apply_thinking_extra_params", + "classify_prompt", + "context_exceeds_fast_model_limit", + "escalation_target", + "estimate_message_tokens", + "estimate_prompt_tokens", + "find_pool_entry", + "inject_backend_extra_params", + "lookup_model_max_input_tokens", + "merge_extra_params", + "normalize_keep_alive_for_tier", + "normalize_pool_tier", + "normalize_route_role", + "pick_tier_model", + "pool_prefers_think", + "pool_thinking_for_model", + "preload_priority_list", + "resolve_model_pool", + "resolve_pool_entry_thinking", + "resolve_provider_prefix", + "resolve_tier_models", + "role_to_legacy_tier", + "set_backend_client_resolver", + "set_static_vram_bytes_resolver", + "should_escalate_code_turn", + "should_escalate_fast_turn", + "thinking_for_role", + "warmup_keep_alive", +] diff --git a/cecli/hopper/apply.py b/cecli/hopper/apply.py new file mode 100644 index 00000000000..28fbb1ced29 --- /dev/null +++ b/cecli/hopper/apply.py @@ -0,0 +1,90 @@ +"""Apply a route decision to a live cecli Coder (swap main_model + Ollama keep_alive).""" + +from __future__ import annotations + +from typing import Any + +from cecli import models + +from 
cecli.hopper.router import ( + ModelRouterConfig, + RouteDecision, + RouteRole, + find_pool_entry, + normalize_keep_alive_for_tier, + normalize_route_role, + resolve_pool_entry_thinking, +) + + +def merge_extra_params(into: dict[str, Any], patch: dict[str, Any]) -> None: + """Deep-merge LiteLLM kwargs (cecli-style); router owns ``keep_alive``.""" + for key, value in patch.items(): + if key == "keep_alive": + continue + if isinstance(value, dict) and isinstance(into.get(key), dict): + merge_extra_params(into[key], value) + else: + into[key] = value + + +def apply_hopper_extra_params(model, extra: dict[str, Any] | None) -> None: + if not extra: + return + model._ensure_extra_params_dict() + merge_extra_params(model.extra_params, extra) + + +def apply_thinking_extra_params(model, enable: bool | None) -> None: + """Set Ollama ``think`` for this model; overrides hopper/global ``think``.""" + if enable is None: + return + model._ensure_extra_params_dict() + model.extra_params["think"] = enable + name = (getattr(model, "name", "") or "").lower() + if "qwen3" in name: + if enable: + if getattr(model, "system_prompt_prefix", "") == "/no_think": + model.system_prompt_prefix = "" + else: + model.system_prompt_prefix = "/no_think" + + +def _resolve_enable_thinking( + decision: RouteDecision, + router: ModelRouterConfig, + role: RouteRole, + pool_entry, +) -> bool | None: + enable = decision.enable_thinking + if enable is not None: + return enable + if pool_entry is not None: + return resolve_pool_entry_thinking(pool_entry) + return None + + +def apply_route_to_coder(coder, decision: RouteDecision, router: ModelRouterConfig) -> None: + """Point the coder at the routed model for this turn.""" + prev = coder.main_model + new_model = models.Model(decision.model_name, from_model=prev) + role = decision.role or normalize_route_role(decision.tier) or "code" + pool_entry = ( + find_pool_entry(router.model_pool, decision.model_name, role) + if router.model_pool + else None + ) + 
apply_hopper_extra_params( + new_model, + pool_entry.extra_params if pool_entry else None, + ) + if new_model.is_ollama(): + new_model._ensure_extra_params_dict() + keep_alive = normalize_keep_alive_for_tier( + role, + router.keep_alive_fast if role == "fast" else router.keep_alive_heavy, + ) + new_model.extra_params["keep_alive"] = keep_alive + enable = _resolve_enable_thinking(decision, router, role, pool_entry) + apply_thinking_extra_params(new_model, enable) + coder.main_model = new_model diff --git a/cecli/hopper/router.py b/cecli/hopper/router.py new file mode 100644 index 00000000000..67f8d45ae01 --- /dev/null +++ b/cecli/hopper/router.py @@ -0,0 +1,1184 @@ +""" +Model hopper + local LLM routing: classify prompts and pick fast vs code vs think models. + +Security: only uses model names supplied in config — no runtime fetch of arbitrary models. +Hosts may register optional preload resolvers via :func:`set_backend_client_resolver`. +""" + +from __future__ import annotations + +import json +import logging +import os +import re +from collections.abc import Callable +from dataclasses import dataclass, field +from functools import lru_cache +from typing import Any, Literal, Protocol, runtime_checkable + +logger = logging.getLogger(__name__) + +_backend_client_resolver: Callable[[], Any] | None = None +_static_vram_bytes_resolver: Callable[[str], int | None] | None = None + + +def set_backend_client_resolver(fn: Callable[[], Any] | None) -> None: + """Host hook: return active backend client for preload when none is passed explicitly.""" + global _backend_client_resolver + _backend_client_resolver = fn + + +def set_static_vram_bytes_resolver(fn: Callable[[str], int | None] | None) -> None: + """Host hook: estimate model VRAM in bytes from a bare tag (no live show API).""" + global _static_vram_bytes_resolver + _static_vram_bytes_resolver = fn + +RouteRole = Literal["fast", "code", "think"] +RouteTier = Literal["fast", "heavy", "code", "think"] + + 
@runtime_checkable
class OllamaClient(Protocol):
    """Protocol for an async Ollama HTTP client (preload / show).

    Decorated with ``runtime_checkable`` so hosts can ``isinstance``-check a
    client; that check only verifies the two methods exist, not their
    signatures.
    """

    async def post_generate(self, model: str, *, keep_alive: int = -1) -> None:
        """Issue a zero-token generate to preload the model into VRAM."""
        ...

    async def show_model(self, model: str) -> dict[str, Any]:
        """Return model info (at minimum ``size`` in bytes). Empty dict on failure."""
        ...


def normalize_route_role(tier_or_role: str | None) -> RouteRole | None:
    """Map API/UI tier names to a routing role (``heavy`` → ``code``).

    Matching is case-insensitive and ignores surrounding whitespace.

    Returns:
        ``"fast"``, ``"code"`` or ``"think"``; ``None`` for empty input or a
        name that is not a known tier/role.
    """
    if not tier_or_role:
        return None
    # "heavy" is the legacy name for the code tier.
    aliases: dict[str, RouteRole] = {
        "fast": "fast",
        "heavy": "code",
        "code": "code",
        "think": "think",
    }
    return aliases.get(tier_or_role.strip().lower())


def role_to_legacy_tier(role: RouteRole) -> RouteTier:
    """SSE/UI tier field: fast stays fast; code+think map to distinct tiers.

    Currently the identity mapping: every role name is also a valid tier name.
    """
    return role


def normalize_pool_tier(raw: str | None) -> RouteRole | None:
    """Normalize a hopper row's ``tier`` value to a routing role.

    Thin alias of :func:`normalize_route_role`, which already returns ``None``
    for empty or unknown input.
    """
    return normalize_route_role(raw)


# Tiers whose models must stay resident between turns of an agent loop.
_PINNED_KEEP_ALIVE_TIERS = ("heavy", "code", "think")

# Duration strings that evaluate to zero: the bare "0", or one or more
# zero-valued ``<number><unit>`` segments such as "0s", "0.0m" or "0h0m0s".
# Units are mandatory inside segments, which keeps the match unambiguous.
_ZERO_KEEP_ALIVE_RE = re.compile(
    r"[+-]?(?:0|(?:(?:0+(?:\.0*)?|\.0+)(?:ns|us|ms|s|m|h))+)"
)


def _is_zero_keep_alive(value: object) -> bool:
    """Return True when *value* asks the backend to unload immediately."""
    if value in (0, "0"):
        return True
    if not isinstance(value, str):
        return False
    return _ZERO_KEEP_ALIVE_RE.fullmatch(value.strip()) is not None


# Code + think tiers keep models loaded during agent loops (keep_alive=0 → empty Ollama).
def normalize_keep_alive_for_tier(tier: RouteTier | RouteRole, value: int | str) -> int | str:
    """Return the ``keep_alive`` to send for *tier*, never zero for heavy tiers.

    Args:
        tier: Tier or role name of the routed model.
        value: Configured ``keep_alive`` (seconds, or a duration string).

    Returns:
        ``-1`` (keep loaded) when *tier* is ``heavy``/``code``/``think`` and
        *value* is a zero duration — numeric ``0``, ``"0"``, or a zero
        duration string such as ``"0s"`` or ``"0m"``. Otherwise *value* is
        returned unchanged; the fast tier is never rewritten.
    """
    if tier in _PINNED_KEEP_ALIVE_TIERS and _is_zero_keep_alive(value):
        return -1
    return value


# Per-file context bump for *display* only (routing uses message_tokens).
_FILE_TOKEN_PER_FILE = 500
_FILE_TOKEN_CAP = 2_000

# Reserve completion tokens when comparing session context to fast model window.
_FAST_CONTEXT_OUTPUT_RESERVE = 2_048

# Intent signals (case-insensitive word boundaries).
+_THINK_PATTERNS = re.compile( + r"\b(" + r"architect(?:ure|ural)?|refactor|rewrite|migrate|migration|" + r"race\s+condition|deadlock|concurrency|distributed|microservice|" + r"security|vulnerability|root\s+cause|design\s+review|" + r"performance|scalability|profil(?:e|ing)|" + r"from\s+scratch|greenfield|system\s+design|" + r"analyze|analyse|debug|why\s+does|explain\s+why|investigate|" + r"tradeoff|trade-off|compare\s+approaches|plan\s+the" + r")\b", + re.IGNORECASE, +) + +_FAST_PATTERNS = re.compile( + r"\b(" + r"rename|typo|whitespace|format(?:ting)?|lint|prettier|" + r"color|colour|style|css|spacing|margin|padding|" + r"label|tooltip|copy|wording|comment(?:s)?|" + r"tweak|ui\s+text|button\s+text|" + r"references?|chips?|filesystem|autocomplete|mention|" + r"chat\s+panel|message\s+input|text\s+field|component|" + r"like\s+we\s+have|@\s*\w" + r")\b", + re.IGNORECASE, +) + +# "add" alone is ambiguous (UI copy vs new feature); routing uses stronger verbs only. +_CODE_TASK_STRONG = re.compile( + r"\b(implement|fix|create|update|change|patch|write|build)\b", + re.IGNORECASE, +) + + +def _parse_pool_extra_params(raw: Any) -> dict[str, Any] | None: + if isinstance(raw, dict) and raw: + return dict(raw) + if isinstance(raw, str) and raw.strip(): + try: + parsed = json.loads(raw) + except json.JSONDecodeError: + return None + return parsed if isinstance(parsed, dict) and parsed else None + return None + + +def _parse_capabilities(raw: Any) -> dict[str, Any] | None: + """Parse model capabilities from payload (dict with bool/int values).""" + if isinstance(raw, dict) and raw: + return dict(raw) + return None + + +@dataclass +class ModelPoolEntry: + model: str + tier: RouteRole + enabled: bool = True + """Per-model LiteLLM ``think`` override; ``None`` → derive from tier.""" + enable_thinking: bool | None = None + """Per-model LiteLLM kwargs when this hopper row is routed.""" + extra_params: dict[str, Any] | None = None + """Priority rank within the global priority list (0 = 
highest). None when unset.""" + priority_rank: int | None = None + """When True, route to the second-highest-priority model in this tier.""" + prefer_secondary: bool = False + """Model capabilities: vision, large_context, specializations.""" + capabilities: dict[str, Any] | None = None + + @property + def has_vision(self) -> bool: + """True when this model supports multimodal/vision input.""" + return bool(self.capabilities and self.capabilities.get("vision")) + + @property + def max_context(self) -> int | None: + """Max context window size in tokens, if declared.""" + if not self.capabilities: + return None + raw = self.capabilities.get("max_context") + return int(raw) if raw is not None and int(raw) > 0 else None + + +def resolve_tier_models(pool: list[ModelPoolEntry], tier: RouteRole) -> list[ModelPoolEntry]: + """Return all enabled models for a tier, sorted by priority_rank (ascending = highest priority first). + + Models with priority_rank=None are sorted after those with a rank. + """ + filtered = [e for e in pool if e.enabled and e.tier == tier] + filtered.sort(key=lambda e: (e.priority_rank is None, e.priority_rank if e.priority_rank is not None else 0)) + return filtered + + +def pick_tier_model( + pool: list[ModelPoolEntry], + tier: RouteRole, + *, + resident_models: set[str] | None = None, + require_vision: bool = False, + context_tokens: int | None = None, + prefer_warm: bool = False, +) -> tuple[str, bool]: + """Pick the model to route to for a tier. + + Returns (model_name, is_swap). + Respects capability requirements, context limits, residency preference, + prefer_secondary flag, and priority ordering. + + Fallback logic (in order): + 1. If require_vision: filter to vision-capable models. If none, fall through to all. + 2. If context_tokens set: filter out models whose max_context < context_tokens. If none pass, use all. + 3. If prefer_warm and resident_models: prefer resident models (but don't require). + 4. 
Apply prefer_secondary / priority ordering on the remaining candidates. + """ + models = resolve_tier_models(pool, tier) + if not models: + raise ValueError(f"No enabled models available for tier '{tier}'") + + candidates = models + + # --- Capability filter: vision --- + if require_vision: + vision_models = [m for m in candidates if m.has_vision] + if vision_models: + candidates = vision_models + + # --- Context window filter --- + if context_tokens is not None and context_tokens > 0: + fits = [m for m in candidates if m.max_context is None or m.max_context >= context_tokens] + if fits: + candidates = fits + # If none fit, keep all candidates (best-effort routing) + + # --- Residency preference (soft — prefer warm, don't require) --- + if prefer_warm and resident_models and len(candidates) > 1: + warm = [m for m in candidates if m.model in resident_models] + if warm: + candidates = warm + + # --- Priority / prefer_secondary selection --- + prefer_secondary = any(m.prefer_secondary for m in candidates) + if prefer_secondary and len(candidates) >= 2: + chosen = candidates[1] + else: + chosen = candidates[0] + + # Determine is_swap: True when model not in resident_models + is_swap = False + if resident_models is not None and chosen.model not in resident_models: + is_swap = True + + return (chosen.model, is_swap) + + +async def preload_priority_list( + priority_list: list[str], + *, + ollama_client: Any | None = None, + vram_budget_bytes: int | None = None, + backend_client: Any | None = None, +) -> list[str]: + """Preload models in priority order, respecting VRAM budget. + + Iterates ``priority_list`` from index 0 (highest priority) onward. For each model: + - If ``vram_budget_bytes`` is set, fetches model size info and checks cumulative VRAM. + When the budget would be exceeded, logs deferred models and stops. + - Attempts to preload via ``backend_client``, optional host resolver, or ``ollama_client``. + - On success, appends to the returned list. 
+ - On failure, logs the error, skips the model, and continues with the next. + + Returns the list of successfully preloaded model tags. + """ + preloaded: list[str] = [] + cumulative_vram: int = 0 + + for idx, model_tag in enumerate(priority_list): + tag = model_tag.strip() + if not tag: + continue + + raw_tag = _strip_ollama_prefix(tag) + + model_size: int | None = None + if vram_budget_bytes is not None: + model_size = await _get_model_size_for_budget( + raw_tag, ollama_client=ollama_client + ) + if model_size is not None: + if cumulative_vram + model_size > vram_budget_bytes: + deferred = [t.strip() for t in priority_list[idx:] if t.strip()] + logger.info( + "VRAM budget exceeded (%.1f MB used of %.1f MB). " + "Deferring models: %s", + cumulative_vram / (1024 * 1024), + vram_budget_bytes / (1024 * 1024), + deferred, + ) + break + + if await _preload_single_model( + raw_tag, + ollama_client=ollama_client, + backend_client=backend_client, + ): + preloaded.append(tag) + if model_size is not None: + cumulative_vram += model_size + + return preloaded + + +async def warmup_keep_alive( + priority_list: list[str], + *, + ollama_client: Any | None = None, + backend_client: Any | None = None, +) -> list[str]: + """Send keep-alive requests in priority order to refresh model TTLs. + + Iterates ``priority_list`` from index 0 (highest priority) onward. For each model: + - Strips the ``ollama_chat/`` or ``ollama/`` prefix for backend API calls. + - Sends a keep-alive/preload request via the active backend (or legacy client). + - On success, appends to the returned list. + - On failure, logs the error, skips the model, and continues with the next. + + Returns the list of model tags that were successfully kept alive. 
+ """ + kept_alive: list[str] = [] + + for model_tag in priority_list: + tag = model_tag.strip() + if not tag: + continue + + raw_tag = _strip_ollama_prefix(tag) + + if await _preload_single_model( + raw_tag, + ollama_client=ollama_client, + backend_client=backend_client, + ): + kept_alive.append(tag) + else: + logger.error("Keep-alive warmup failed for model '%s'", tag) + + return kept_alive + + +def _strip_ollama_prefix(tag: str) -> str: + """Remove ``ollama_chat/`` or ``ollama/`` prefix from a model tag.""" + if tag.startswith("ollama_chat/"): + return tag[len("ollama_chat/"):] + if tag.startswith("ollama/"): + return tag[len("ollama/"):] + return tag + + +async def _get_model_size(ollama_client: Any, raw_tag: str) -> int | None: + """Attempt to get model size in bytes via ollama show. Returns None on failure.""" + try: + info = await ollama_client.show_model(raw_tag) + size = info.get("size") if isinstance(info, dict) else None + if isinstance(size, (int, float)) and size > 0: + return int(size) + return None + except Exception: + return None + + +def _estimate_model_size_bytes(raw_tag: str) -> int | None: + """Static VRAM estimate via optional host resolver (bytes).""" + if _static_vram_bytes_resolver is None: + return None + return _static_vram_bytes_resolver(raw_tag) + + +async def _get_model_size_for_budget( + raw_tag: str, + *, + ollama_client: Any | None, +) -> int | None: + """Resolve model size for VRAM budgeting (Ollama show or static metadata).""" + if ollama_client is not None: + return await _get_model_size(ollama_client, raw_tag) + return _estimate_model_size_bytes(raw_tag) + + +async def _preload_single_model( + raw_tag: str, + *, + ollama_client: Any | None = None, + backend_client: Any | None = None, +) -> bool: + """Preload one model via Ollama client or host-injected backend client.""" + if ollama_client is not None: + try: + await ollama_client.post_generate(raw_tag, keep_alive=-1) + return True + except Exception as exc: + 
logger.error("Preload failed for model '%s': %s", raw_tag, exc) + return False + + client = backend_client + if client is None and _backend_client_resolver is not None: + client = _backend_client_resolver() + if client is None: + return False + try: + loaded = await client.preload_models([raw_tag]) + return raw_tag in loaded + except Exception as exc: + logger.error("Preload failed for model '%s': %s", raw_tag, exc) + return False + + +def find_pool_entry( + pool: list[ModelPoolEntry], + model_name: str, + role: RouteRole, +) -> ModelPoolEntry | None: + """Match hopper row for a routed model (empty code id → session code tier).""" + target = (model_name or "").strip() + for entry in pool: + if not entry.enabled: + continue + name = entry.model.strip() + if name and name != target: + continue + if name == target or (not name and role == "code"): + return entry + return None + + +def thinking_for_pool_tier(tier: RouteRole) -> bool: + return tier == "think" + + +def resolve_pool_entry_thinking(entry: ModelPoolEntry) -> bool: + if entry.enable_thinking is not None: + return entry.enable_thinking + return thinking_for_pool_tier(entry.tier) + + +def pool_thinking_for_model(model_name: str, pool: list[ModelPoolEntry]) -> bool | None: + """Explicit hopper ``enable_thinking`` for a resolved model id.""" + target = (model_name or "").strip() + if not target: + return None + for entry in pool: + if not entry.enabled: + continue + name = entry.model.strip() + if name and name == target: + return resolve_pool_entry_thinking(entry) + return None + + +@dataclass +class ResolvedModelPool: + fast: str + code: str + think: str | None + + +def resolve_model_pool( + pool: list[ModelPoolEntry], + *, + session_code: str, + fallback_fast: str = "", + fallback_code: str | None = None, + fallback_think: str | None = None, +) -> ResolvedModelPool: + """Pick first enabled fast/code/think from hopper order.""" + fast = fallback_fast.strip() + code = (fallback_code or "").strip() or 
session_code + think = (fallback_think or "").strip() or None + for entry in pool: + if not entry.enabled: + continue + name = entry.model.strip() + if entry.tier == "fast" and name and not fast: + fast = name + elif entry.tier == "code": + if name: + code = name + else: + code = session_code + elif entry.tier == "think" and name and not think: + think = name + return ResolvedModelPool(fast=fast, code=code, think=think) + + +def pool_prefers_think(pool: list[ModelPoolEntry]) -> bool: + """True when the first enabled think entry appears before the first enabled code entry. + + This reflects the user dragging think to the top of the hopper (highest priority). + """ + think_idx: int | None = None + code_idx: int | None = None + for i, entry in enumerate(pool): + if not entry.enabled: + continue + if entry.tier == "think" and entry.model.strip() and think_idx is None: + think_idx = i + elif entry.tier == "code" and code_idx is None: + code_idx = i + if think_idx is None or code_idx is None: + return False + return think_idx < code_idx + + +def _parse_env_bool(key: str) -> bool | None: + """Parse CODE_THINK / FAST_THINK from process env or local-llm config files.""" + # Check process env first + val = os.environ.get(key, "").strip().lower() + if val in ("1", "true", "yes", "on"): + return True + if val in ("0", "false", "no", "off"): + return False + # Fall back to optional local-llm env files on disk + return _read_local_llm_env_bool(key) + + +def _local_llm_env_paths() -> list: + """Candidate env files for tier think flags (last file wins).""" + from pathlib import Path + + paths: list[Path] = [] + explicit = os.environ.get("CECLI_LLM_ENV", "").strip() + if explicit: + paths.append(Path(explicit)) + home = Path.home() + xdg = os.environ.get("XDG_CONFIG_HOME", "").strip() + config_home = Path(xdg) if xdg else home / ".config" + paths.append(config_home / "local-llm" / "env") + repo_root = os.environ.get("CECLI_REPO_ROOT", "").strip() + if repo_root: + 
paths.append(Path(repo_root) / "local-llm.env") + paths.append(Path.cwd() / "local-llm.env") + return paths + + +def _read_local_llm_env_bool(key: str) -> bool | None: + """Read a key from the local-llm env file chain (last file wins).""" + + result: bool | None = None + for p in _local_llm_env_paths(): + try: + if not p.is_file(): + continue + for line in p.read_text().splitlines(): + line = line.strip() + if line.startswith("#") or "=" not in line: + continue + k, _, v = line.partition("=") + if k.strip() != key: + continue + v = v.strip().strip("'\"").lower() + if v in ("1", "true", "yes", "on"): + result = True + elif v in ("0", "false", "no", "off"): + result = False + except OSError: + continue + return result + + +def _apply_env_think_to_pool(pool: list[ModelPoolEntry]) -> None: + """Override pool enable_thinking from per-slot or tier-level env vars. + + Resolution order (highest priority first): + 1. Per-slot: ``CODE_MODEL_THINK=1`` (slot 0), ``CODE_MODEL_1_THINK=1`` (slot 1) + 2. Tier-level: ``CODE_THINK=1`` (applies to all slots in tier without per-slot override) + + The frontend may send stale localStorage values; the env file is authoritative. 
+ """ + code_think = _parse_env_bool("CODE_THINK") + fast_think = _parse_env_bool("FAST_THINK") + + # Per-slot overrides: {TIER}_MODEL_THINK (slot 0), {TIER}_MODEL_{N}_THINK (slots 1-9) + slot_think: dict[tuple[str, int], bool | None] = {} + for tier_prefix in ("FAST", "CODE", "THINK"): + # Slot 0: {TIER}_MODEL_THINK + val = _parse_env_bool(f"{tier_prefix}_MODEL_THINK") + if val is not None: + slot_think[(tier_prefix.lower(), 0)] = val + # Slots 1-9: {TIER}_MODEL_{N}_THINK + for n in range(1, 10): + val = _parse_env_bool(f"{tier_prefix}_MODEL_{n}_THINK") + if val is not None: + slot_think[(tier_prefix.lower(), n)] = val + + if code_think is None and fast_think is None and not slot_think: + return + + for entry in pool: + if not entry.enabled: + continue + # Determine slot index from priority_rank or default to 0 + slot_idx = entry.priority_rank if entry.priority_rank is not None else 0 + tier = entry.tier + + # Per-slot override takes priority + per_slot = slot_think.get((tier, slot_idx)) + if per_slot is not None: + entry.enable_thinking = per_slot + elif tier == "code" and code_think is not None: + entry.enable_thinking = code_think + elif tier == "fast" and fast_think is not None: + entry.enable_thinking = fast_think + + +@dataclass +class RouteTurnContext: + agent_cmd: bool = False + implement_turn: bool = False + inject_todo_spec: bool = False + spec_gen_turn: bool = False + exploration_aborted: bool = False + + +_BACKEND_PROVIDER_PREFIXES: dict[str, str] = { + "ollama": "ollama_chat/", + "vllm": "openai/", + "tgi": "openai/", + "llamacpp": "openai/", + "mlx-lm": "openai/", +} + + +def resolve_provider_prefix(backend: str) -> str: + """Map a backend name to its LiteLLM provider prefix. + + Defaults to ``ollama_chat/`` for unknown backends. 
+ """ + return _BACKEND_PROVIDER_PREFIXES.get((backend or "").strip().lower(), "ollama_chat/") + + +def inject_backend_extra_params(backend: str, extra_params: dict[str, object] | None) -> dict[str, object]: + """Merge ``LITELLM_EXTRA_PARAMS`` for non-Ollama backends. + + Ollama uses its own env wiring; other backends may need auth headers or base URLs + via JSON in ``LITELLM_EXTRA_PARAMS``. Existing *extra_params* keys are preserved. + """ + merged: dict[str, object] = dict(extra_params or {}) + name = (backend or "").strip().lower() + if name in ("", "ollama"): + return merged + raw = os.environ.get("LITELLM_EXTRA_PARAMS", "").strip() + if not raw: + return merged + try: + parsed = json.loads(raw) + except json.JSONDecodeError: + logger.warning("LITELLM_EXTRA_PARAMS is not valid JSON — ignoring for backend %s", name) + return merged + if isinstance(parsed, dict): + merged.update(parsed) + return merged + + +@dataclass +class ModelRouterConfig: + enabled: bool = False + fast_model: str = "" + heavy_model: str | None = None + code_model: str | None = None + think_model: str | None = None + model_pool: list[ModelPoolEntry] = field(default_factory=list) + token_fast_max: int = 4_096 + token_heavy_min: int = 12_000 + keep_alive_fast: int | str = 300 + keep_alive_heavy: int | str = -1 + escalate_on_failure: bool = True + prefer_think: bool = False + """Global priority list of model tags in priority order (index 0 = highest).""" + priority_list: list[str] = field(default_factory=list) + backend: str = "ollama" + provider_prefix: str = "ollama_chat/" + + def __post_init__(self) -> None: + self.keep_alive_heavy = normalize_keep_alive_for_tier("code", self.keep_alive_heavy) + if not self.code_model and self.heavy_model: + self.code_model = self.heavy_model + self.provider_prefix = resolve_provider_prefix(self.backend) + + @property + def resolved_code_model(self) -> str: + return (self.code_model or self.heavy_model or self.fast_model or "").strip() + + @property + def 
resolved_think_model(self) -> str | None: + name = (self.think_model or "").strip() + return name or None + + @classmethod + def from_payload(cls, raw: dict[str, Any] | None) -> ModelRouterConfig | None: + if not raw: + return None + enabled = bool(raw.get("enabled")) + if not enabled: + return cls(enabled=False) + pool_raw = raw.get("model_pool") or [] + pool: list[ModelPoolEntry] = [] + if isinstance(pool_raw, list): + for item in pool_raw: + if not isinstance(item, dict): + continue + tier = normalize_pool_tier(str(item.get("tier") or "")) + if tier is None: + continue + raw_rank = item.get("priority_rank") + priority_rank: int | None = ( + int(raw_rank) if raw_rank is not None else None + ) + pool.append( + ModelPoolEntry( + model=str(item.get("model") or ""), + tier=tier, + enabled=bool(item.get("enabled", True)), + enable_thinking=( + item["enable_thinking"] + if item.get("enable_thinking") is not None + else None + ), + extra_params=_parse_pool_extra_params(item.get("extra_params")), + priority_rank=priority_rank, + prefer_secondary=bool(item.get("prefer_secondary", False)), + capabilities=_parse_capabilities(item.get("capabilities")), + ) + ) + fallback_fast = str(raw.get("fast_model") or "").strip() + fallback_code = ( + str(raw.get("code_model") or raw.get("heavy_model") or "").strip() or None + ) + fallback_think = str(raw.get("think_model") or "").strip() or None + session_code = fallback_code or fallback_fast or "" + if pool: + resolved = resolve_model_pool( + pool, + session_code=session_code or fallback_fast, + fallback_fast=fallback_fast, + fallback_code=fallback_code, + fallback_think=fallback_think, + ) + fast, code, think = resolved.fast, resolved.code, resolved.think + else: + fast = fallback_fast + code = fallback_code or fallback_fast + think = fallback_think + if not fast: + return None + # Override pool enable_thinking from env (CODE_THINK / FAST_THINK) — + # the frontend may send stale localStorage values. 
+ _apply_env_think_to_pool(pool) + # Parse global priority list from payload (list of model tag strings). + priority_list_raw = raw.get("priority_list") + priority_list: list[str] = [] + if isinstance(priority_list_raw, list): + for tag in priority_list_raw: + s = str(tag).strip() + if s: + priority_list.append(s) + return cls( + enabled=True, + fast_model=fast, + heavy_model=code or None, + code_model=code or None, + think_model=think, + model_pool=pool, + token_fast_max=int(raw.get("token_fast_max") or 4_096), + token_heavy_min=int(raw.get("token_heavy_min") or 12_000), + keep_alive_fast=raw.get("keep_alive_fast", 300), + keep_alive_heavy=normalize_keep_alive_for_tier( + "code", raw.get("keep_alive_heavy", -1) + ), + escalate_on_failure=bool(raw.get("escalate_on_failure", True)), + prefer_think=bool( + raw.get("prefer_think") + if raw.get("prefer_think") is not None + else pool_prefers_think(pool) + ), + priority_list=priority_list, + ) + + +@dataclass +class RouteDecision: + tier: RouteTier + model_name: str + estimated_tokens: int + reasons: list[str] = field(default_factory=list) + role: RouteRole = "code" + enable_thinking: bool | None = None + """Priority rank of the chosen model within the global priority list (0 = highest). None for single-model tiers.""" + priority_rank: int | None = None + """Snapshot of the config's priority_list at decision time. 
None when not applicable.""" + priority_list_snapshot: list[str] | None = None + """True when the chosen model is not currently resident in Ollama memory (cold-start swap).""" + swap: bool = False + + +def thinking_for_role( + role: RouteRole, + model_name: str, + *, + pool: list[ModelPoolEntry] | None = None, +) -> bool | None: + """Per-model LiteLLM ``think`` for this route (hopper entry overrides role).""" + if pool: + explicit = pool_thinking_for_model(model_name, pool) + if explicit is not None: + return explicit + if role == "think": + return True + if role in ("fast", "code"): + return False + return None + + +def estimate_message_tokens( + user_message: str, + *, + message_token_count: int | None = None, +) -> int: + """Tokens from the user message only — used for routing.""" + if message_token_count is not None and message_token_count > 0: + return message_token_count + return max(len(user_message) // 4, 32) + + +def estimate_prompt_tokens( + user_message: str, + *, + files_in_chat: int = 0, + message_token_count: int | None = None, +) -> int: + """Rough context size for UI (message + capped file bump). Not used for tier choice.""" + base = estimate_message_tokens(user_message, message_token_count=message_token_count) + file_part = min(max(files_in_chat, 0) * _FILE_TOKEN_PER_FILE, _FILE_TOKEN_CAP) + return base + file_part + + +@lru_cache(maxsize=64) +def lookup_model_max_input_tokens(model_name: str) -> int | None: + """Cecli/LiteLLM metadata for a model id (e.g. 
def context_exceeds_fast_model_limit(
    context_tokens: int,
    fast_model_name: str,
    *,
    fast_max_input: int | None = None,
    output_reserve: int = _FAST_CONTEXT_OUTPUT_RESERVE,
) -> tuple[bool, int | None]:
    """
    Report whether the session context overflows the fast model's input window.

    Returns ``(exceeds, limit)``. ``exceeds`` is True when
    ``context_tokens + output_reserve`` is greater than the limit. The limit is
    ``fast_max_input`` when given (tests), else the metadata lookup for
    ``fast_model_name``. A non-positive context or an unknown limit yields
    ``(False, None)``.
    """
    if context_tokens <= 0:
        return False, None
    if fast_max_input is not None:
        limit = fast_max_input
    else:
        limit = lookup_model_max_input_tokens(fast_model_name)
    if limit is None:
        return False, None
    overflows = context_tokens + output_reserve > limit
    return overflows, limit


def _pick_think_model(
    router: ModelRouterConfig,
    *,
    reasons: list[str],
) -> tuple[RouteRole, str]:
    """Return the think role/model, or fall back to code (noting why in ``reasons``)."""
    think_model = router.resolved_think_model
    if not think_model:
        reasons.append("think_unconfigured→code")
        return "code", router.resolved_code_model
    return "think", think_model


def _has_multi_model_tier(pool: list[ModelPoolEntry], tier: RouteRole) -> bool:
    """True when the pool has multiple enabled entries for `tier` with priority_rank set."""
    ranked_count = sum(
        1
        for entry in pool
        if entry.enabled and entry.tier == tier and entry.priority_rank is not None
    )
    return ranked_count >= 2


def _apply_multi_model_routing(
    role: RouteRole,
    model_name: str,
    *,
    router: ModelRouterConfig,
    display_tokens: int,
    reasons: list[str],
    resident_models: set[str] | None = None,
    require_vision: bool = False,
    context_tokens: int | None = None,
) -> RouteDecision:
    """Build the final decision, choosing among several tier models when possible.

    When the tier resolved for ``role`` has at least two enabled, ranked pool
    entries, ``pick_tier_model`` selects the model; otherwise the ``model_name``
    the caller already determined is used unchanged. ``reasons`` is extended in
    place.
    """
    pool = router.model_pool
    if not (pool and _has_multi_model_tier(pool, role)):
        # Single-model tier: nothing to pick, no rank/snapshot/swap metadata.
        return _finish_decision(
            role,
            model_name,
            router=router,
            display_tokens=display_tokens,
            reasons=reasons,
            priority_rank=None,
            priority_list_snapshot=None,
            swap=False,
        )

    picked, needs_swap = pick_tier_model(
        pool,
        role,
        resident_models=resident_models,
        require_vision=require_vision,
        context_tokens=context_tokens,
        prefer_warm=True,
    )

    # Rank of the first enabled pool entry matching the picked model in this tier.
    rank = next(
        (
            entry.priority_rank
            for entry in pool
            if entry.enabled and entry.model == picked and entry.tier == role
        ),
        None,
    )

    # Snapshot the config's priority_list only when it is non-empty.
    snapshot: list[str] | None = None
    if router.priority_list:
        snapshot = list(router.priority_list)

    if require_vision:
        reasons.append("vision_required")
    if context_tokens and context_tokens > 0:
        # Note when the pick differs from the tier's top-priority model.
        tier_models = resolve_tier_models(pool, role)
        if tier_models and picked != tier_models[0].model:
            short_name = picked.split("/")[-1]
            reasons.append(f"context_fallback:{short_name}")

    return _finish_decision(
        role,
        picked,
        router=router,
        display_tokens=display_tokens,
        reasons=reasons,
        priority_rank=rank,
        priority_list_snapshot=snapshot,
        swap=needs_swap,
    )
# Matches the ``/agent`` slash command but not path segments such as
# ``cecli/agent/router.py`` or ``~/agent``: the slash must not be preceded by
# a word character, ".", "/", "~" or "-".
_AGENT_SLASH_RE = re.compile(r"(?<![\w./~-])/agent\b", re.IGNORECASE)


def classify_prompt(
    user_message: str,
    *,
    message_tokens: int,
    router: ModelRouterConfig,
    code_model_name: str | None = None,
    think_model_name: str | None = None,
    context_tokens: int | None = None,
    force_tier: RouteTier | None = None,
    turn: RouteTurnContext | None = None,
    # Back-compat for tests calling estimated_tokens=
    estimated_tokens: int | None = None,
    heavy_model_name: str | None = None,
    fast_max_input: int | None = None,
    resident_models: set[str] | None = None,
    has_images: bool = False,
) -> RouteDecision:
    """Choose the role and model for one user turn.

    Rules are checked in order and the first match wins:

    1. ``force_tier`` (think falls back to code when no think model is set).
    2. Turn flags: implement/agent turns go to code; todo-spec injection,
       spec generation and aborted exploration go to think.
    3. An ``/agent`` slash command in the message goes to code.
    4. Context that overflows the fast model stays on fast only if another
       fast-tier pool model fits it (including the completion reserve);
       otherwise it goes to think (``prefer_think``) or code.
    5. Messages of at least ``token_heavy_min`` tokens, then think/fast/code
       keyword patterns, then the ``token_fast_max`` band, then the default.

    ``code_model_name`` / ``heavy_model_name`` / ``think_model_name`` override
    the router's resolved models on every path. ``estimated_tokens`` is a
    legacy alias used only when ``context_tokens`` is not given.
    """
    if estimated_tokens is not None and context_tokens is None:
        context_tokens = estimated_tokens
    display_tokens = context_tokens if context_tokens is not None else message_tokens
    ctx = turn or RouteTurnContext()
    code = (code_model_name or heavy_model_name or router.resolved_code_model).strip()
    think = (think_model_name or router.resolved_think_model or "").strip() or None

    def _route(role: RouteRole, model: str, *, reasons: list[str]) -> RouteDecision:
        # Common kwargs for every routing exit in this function.
        return _apply_multi_model_routing(
            role,
            model,
            router=router,
            display_tokens=display_tokens,
            reasons=reasons,
            resident_models=resident_models,
            require_vision=has_images,
            context_tokens=context_tokens,
        )

    def _route_think(reasons: list[str]) -> RouteDecision:
        # Think when available, else code. Uses the same (possibly overridden)
        # model names as the other paths.
        if think:
            return _route("think", think, reasons=reasons)
        reasons.append("think_unconfigured→code")
        return _route("code", code, reasons=reasons)

    def _route_unless_prefer_think(
        role: RouteRole, model: str, reasons: list[str]
    ) -> RouteDecision:
        # ``prefer_think`` upgrades the route when a think model exists.
        if router.prefer_think and think:
            reasons.append("prefer_think")
            return _route("think", think, reasons=reasons)
        return _route(role, model, reasons=reasons)

    forced = normalize_route_role(force_tier)
    if forced:
        if forced == "think" and not think:
            forced = "code"
        forced_models = {
            "fast": router.fast_model,
            "code": code,
            "think": think or code,
        }
        return _route(forced, forced_models[forced], reasons=[f"forced:{forced}"])

    reasons: list[str] = []

    # Implement/agent turns need the tool-capable code model.
    if ctx.implement_turn or ctx.agent_cmd:
        reasons.append("implement_turn" if ctx.implement_turn else "agent_cmd")
        return _route("code", code, reasons=reasons)

    think_turn_flags = (
        (ctx.inject_todo_spec, "inject_todo_spec"),
        (ctx.spec_gen_turn, "spec_gen"),
        (ctx.exploration_aborted, "exploration_aborted"),
    )
    for flag, tag in think_turn_flags:
        if flag:
            reasons.append(tag)
            return _route_think(reasons)

    if _AGENT_SLASH_RE.search(user_message):
        reasons.append("slash:/agent")
        return _route("code", code, reasons=reasons)

    if context_tokens is not None and context_tokens > 0:
        exceeds_fast, fast_limit = context_exceeds_fast_model_limit(
            context_tokens, router.fast_model, fast_max_input=fast_max_input
        )
        if exceeds_fast and fast_limit is not None:
            overflow_note = (
                f"context_tokens>={fast_limit - _FAST_CONTEXT_OUTPUT_RESERVE} "
                f"(fast_max={fast_limit})"
            )
            pool = router.model_pool
            if pool:
                # A larger-context fast model may still fit. Apply the same
                # completion reserve as the overflow check above.
                needed = context_tokens + _FAST_CONTEXT_OUTPUT_RESERVE
                fast_fits = [
                    entry
                    for entry in resolve_tier_models(pool, "fast")
                    if entry.max_context is not None and entry.max_context >= needed
                ]
                if fast_fits:
                    reasons.append(f"{overflow_note} but fast pool has larger model")
                    return _route("fast", fast_fits[0].model, reasons=reasons)

            reasons.append(overflow_note)
            return _route_unless_prefer_think("code", code, reasons)

    if message_tokens >= router.token_heavy_min:
        reasons.append(f"msg_tokens>={router.token_heavy_min}")
        if _CODE_TASK_STRONG.search(user_message) and not router.prefer_think:
            return _route("code", code, reasons=reasons)
        return _route_think(reasons)

    think_hit = _THINK_PATTERNS.search(user_message)
    fast_hit = _FAST_PATTERNS.search(user_message)
    code_task = _CODE_TASK_STRONG.search(user_message) is not None

    if think_hit:
        reasons.append(f"keyword:{think_hit.group(0).lower()}")
        return _route_think(reasons)

    if fast_hit and not router.prefer_think:
        reasons.append(f"keyword:{fast_hit.group(0).lower()}")
        return _route("fast", router.fast_model, reasons=reasons)

    if code_task:
        reasons.append("code_task")
        return _route_unless_prefer_think("code", code, reasons)

    if message_tokens < router.token_fast_max:
        reasons.append(f"msg_tokens<{router.token_fast_max}")
        return _route_unless_prefer_think("fast", router.fast_model, reasons)

    reasons.append("default_code")
    return _route_unless_prefer_think("code", code, reasons)


_CONTEXT_LIMIT_RE = re.compile(
    r"exceeds the\s+[\d,]+\s+token limit",
    re.IGNORECASE,
)


def should_escalate_fast_turn(
    decision: RouteDecision,
    *,
    router: ModelRouterConfig,
    user_message: str,
    edited_files: list[str],
    assistant_text: str,
    had_tool_error: bool = False,
    tool_error_text: str = "",
) -> bool:
    """Decide whether a fast-tier turn should be retried on a higher tier.

    Only applies when escalation is enabled, the turn ran on the fast role and
    no files were edited. A context-limit tool error always escalates; any
    other tool error, or a short reply (at most 400 characters after
    stripping), escalates only for a strong code-task prompt.
    """
    role = decision.role or normalize_route_role(decision.tier) or "code"
    if not router.escalate_on_failure or role != "fast":
        return False
    if edited_files:
        return False
    if had_tool_error:
        # ``tool_error_text`` may arrive as None from callers; treat as empty.
        if _CONTEXT_LIMIT_RE.search(tool_error_text or ""):
            return True
        return _CODE_TASK_STRONG.search(user_message) is not None
    if len(assistant_text.strip()) > 400:
        return False
    return _CODE_TASK_STRONG.search(user_message) is not None


def should_escalate_code_turn(
    decision: RouteDecision,
    *,
    router: ModelRouterConfig,
    user_message: str,
    edited_files: list[str],
    assistant_text: str,
    had_tool_error: bool = False,
) -> bool:
    """Offer think tier when code model stalled on a reasoning-heavy prompt.

    Requires escalation enabled, a code-role turn, a configured think model,
    no edited files and a think-pattern prompt; then escalates on a tool error
    or when the stripped reply is shorter than 400 characters.
    """
    role = decision.role or normalize_route_role(decision.tier) or "code"
    if not router.escalate_on_failure or role != "code":
        return False
    if not router.resolved_think_model:
        return False
    if edited_files:
        return False
    if not _THINK_PATTERNS.search(user_message):
        return False
    return had_tool_error or len(assistant_text.strip()) < 400
"code": + return "think" + return "code" diff --git a/tests/basic/test_sessions.py b/tests/basic/test_sessions.py index 7fafd220963..1c79b34d43f 100644 --- a/tests/basic/test_sessions.py +++ b/tests/basic/test_sessions.py @@ -73,7 +73,7 @@ def session_manager(mock_coder): async def test_load_session_quiet_skips_tool_error_on_invalid_json( session_manager, mock_coder, tmp_path ): - """BrightVision auto-load uses quiet=True when restore is best-effort.""" + """Headless hosts may auto-load with quiet=True when restore is best-effort.""" session_dir = tmp_path / ".cecli" / "sessions" os.makedirs(session_dir, exist_ok=True) mock_coder.abs_root_path.side_effect = lambda x: str(tmp_path / x) diff --git a/tests/helpers/monorepo/LOCAL_WORKSPACE.md b/tests/helpers/monorepo/LOCAL_WORKSPACE.md index dd4311195cb..ff9fc3fce91 100644 --- a/tests/helpers/monorepo/LOCAL_WORKSPACE.md +++ b/tests/helpers/monorepo/LOCAL_WORKSPACE.md @@ -6,7 +6,7 @@ Extends cecli’s existing **clone** workspace mode (`repo:` URLs under `~/.cecl ## Motivation -IDE clients (e.g. BrightVision) open a **primary git repo** but need agent context across **sibling repos** without cloning into `~/.cecli/workspaces/`. Submodule-only setups are a different layout; this PR adds an explicit, reviewable config surface. +IDE clients (e.g. desktop agents with a primary git root) open a **primary git repo** but need agent context across **sibling repos** without cloning into `~/.cecli/workspaces/`. Submodule-only setups are a different layout; this PR adds an explicit, reviewable config surface. 
## Config diff --git a/tests/hopper/test_model_pool.py b/tests/hopper/test_model_pool.py new file mode 100644 index 00000000000..14e2a0db1de --- /dev/null +++ b/tests/hopper/test_model_pool.py @@ -0,0 +1,30 @@ +from cecli.hopper.router import ModelPoolEntry, resolve_model_pool + + +def test_resolve_pool_priority_order(): + pool = [ + ModelPoolEntry(model="ollama_chat/fast-a", tier="fast", enabled=False), + ModelPoolEntry(model="ollama_chat/fast-b", tier="fast", enabled=True), + ModelPoolEntry(model="ollama_chat/code-x", tier="code", enabled=True), + ] + resolved = resolve_model_pool( + pool, + session_code="ollama_chat/session", + fallback_fast="", + fallback_code=None, + ) + assert resolved.fast == "ollama_chat/fast-b" + assert resolved.code == "ollama_chat/code-x" + + +def test_empty_code_row_uses_session(): + pool = [ + ModelPoolEntry(model="ollama_chat/fast", tier="fast", enabled=True), + ModelPoolEntry(model="", tier="code", enabled=True), + ] + resolved = resolve_model_pool( + pool, + session_code="ollama_chat/session", + ) + assert resolved.fast == "ollama_chat/fast" + assert resolved.code == "ollama_chat/session" diff --git a/tests/hopper/test_model_router.py b/tests/hopper/test_model_router.py new file mode 100644 index 00000000000..0b1a7c94ac5 --- /dev/null +++ b/tests/hopper/test_model_router.py @@ -0,0 +1,611 @@ +from cecli.hopper.router import ( + ModelPoolEntry, + ModelRouterConfig, + RouteTurnContext, + classify_prompt, + context_exceeds_fast_model_limit, + escalation_target, + estimate_message_tokens, + estimate_prompt_tokens, + resolve_model_pool, + should_escalate_code_turn, + should_escalate_fast_turn, + thinking_for_role, +) + + +def test_from_payload_normalizes_heavy_keep_alive_zero(): + cfg = ModelRouterConfig.from_payload( + { + "enabled": True, + "fast_model": "ollama_chat/small", + "heavy_model": "ollama_chat/big", + "keep_alive_heavy": 0, + } + ) + assert cfg is not None + assert cfg.keep_alive_heavy == -1 + + +def 
test_from_payload_think_and_code_models(): + cfg = ModelRouterConfig.from_payload( + { + "enabled": True, + "fast_model": "ollama_chat/fast", + "code_model": "ollama_chat/code", + "think_model": "ollama_chat/think", + "model_pool": [ + {"model": "ollama_chat/fast", "tier": "fast", "enabled": True}, + {"model": "ollama_chat/code", "tier": "code", "enabled": True}, + {"model": "ollama_chat/think", "tier": "think", "enabled": True}, + ], + } + ) + assert cfg is not None + assert cfg.resolved_code_model == "ollama_chat/code" + assert cfg.resolved_think_model == "ollama_chat/think" + + +def test_classify_low_tokens_fast_keyword(): + router = ModelRouterConfig( + enabled=True, + fast_model="ollama_chat/small", + code_model="ollama_chat/big", + ) + d = classify_prompt( + "Rename the button label to Save", + message_tokens=500, + router=router, + code_model_name="ollama_chat/big", + ) + assert d.role == "fast" + assert d.model_name == "ollama_chat/small" + assert d.enable_thinking is False + + +def test_classify_architect_think(): + router = ModelRouterConfig( + enabled=True, + fast_model="ollama_chat/small", + code_model="ollama_chat/code", + think_model="ollama_chat/think", + ) + d = classify_prompt( + "Refactor the race condition in the session pool", + message_tokens=800, + router=router, + code_model_name="ollama_chat/code", + ) + assert d.role == "think" + assert d.model_name == "ollama_chat/think" + assert d.enable_thinking is True + + +def test_classify_architect_falls_back_to_code_without_think(): + router = ModelRouterConfig( + enabled=True, + fast_model="ollama_chat/small", + code_model="ollama_chat/code", + ) + d = classify_prompt( + "Refactor the race condition in the session pool", + message_tokens=800, + router=router, + code_model_name="ollama_chat/code", + ) + assert d.role == "code" + assert "think_unconfigured" in " ".join(d.reasons) + + +def test_classify_agent_command_code(): + router = ModelRouterConfig( + enabled=True, + 
fast_model="ollama_chat/small", + code_model="ollama_chat/big", + ) + d = classify_prompt( + "/agent explore the repo and update the checklist", + message_tokens=400, + router=router, + code_model_name="ollama_chat/big", + ) + assert d.role == "code" + assert "slash:/agent" in d.reasons + + +def test_classify_implement_turn_code(): + router = ModelRouterConfig( + enabled=True, + fast_model="ollama_chat/small", + code_model="ollama_chat/code", + think_model="ollama_chat/think", + ) + d = classify_prompt( + "Implement only implementation task 1.2 per the injected spec.", + message_tokens=400, + router=router, + code_model_name="ollama_chat/code", + turn=RouteTurnContext(implement_turn=True), + ) + assert d.role == "code" + assert d.model_name == "ollama_chat/code" + + +def test_classify_inject_todo_spec_think(): + router = ModelRouterConfig( + enabled=True, + fast_model="ollama_chat/small", + code_model="ollama_chat/code", + think_model="ollama_chat/think", + ) + d = classify_prompt( + "Continue planning the auth module", + message_tokens=400, + router=router, + code_model_name="ollama_chat/code", + turn=RouteTurnContext(inject_todo_spec=True), + ) + assert d.role == "think" + + +def test_classify_high_message_tokens_think_without_code_task(): + router = ModelRouterConfig( + enabled=True, + fast_model="ollama_chat/small", + code_model="ollama_chat/code", + think_model="ollama_chat/think", + token_heavy_min=12_000, + ) + d = classify_prompt( + "summarize the session so far", + message_tokens=15_000, + router=router, + code_model_name="ollama_chat/code", + ) + assert d.role == "think" + assert "msg_tokens>=" in d.reasons[0] + + +def test_classify_high_message_tokens_code_task(): + router = ModelRouterConfig( + enabled=True, + fast_model="ollama_chat/small", + code_model="ollama_chat/code", + think_model="ollama_chat/think", + token_heavy_min=12_000, + ) + d = classify_prompt( + "implement the whole module", + message_tokens=15_000, + router=router, + 
code_model_name="ollama_chat/code", + ) + assert d.role == "code" + + +def test_files_in_chat_do_not_force_code_when_under_fast_window(): + router = ModelRouterConfig( + enabled=True, + fast_model="ollama_chat/small", + code_model="ollama_chat/big", + ) + msg = "I'd like to add @ references like we have for /add with chips" + message_tokens = estimate_message_tokens(msg) + context_tokens = estimate_prompt_tokens(msg, files_in_chat=4) + assert context_tokens > message_tokens + assert message_tokens < router.token_fast_max + assert not context_exceeds_fast_model_limit( + context_tokens, router.fast_model, fast_max_input=32_768 + )[0] + d = classify_prompt( + msg, + message_tokens=message_tokens, + context_tokens=context_tokens, + router=router, + code_model_name="ollama_chat/big", + ) + assert d.role == "fast" + assert d.estimated_tokens == context_tokens + + +def test_context_exceeds_fast_model_limit(): + exceeds, limit = context_exceeds_fast_model_limit( + 17_670, + "ollama_chat/deepseek-coder:6.7b", + fast_max_input=16_384, + ) + assert exceeds is True + assert limit == 16_384 + fits, _ = context_exceeds_fast_model_limit( + 10_000, + "ollama_chat/deepseek-coder:6.7b", + fast_max_input=16_384, + ) + assert fits is False + + +def test_classify_routes_code_when_context_exceeds_fast_window(): + router = ModelRouterConfig( + enabled=True, + fast_model="ollama_chat/deepseek-coder:6.7b", + code_model="ollama_chat/qwen3.6:27b-q4_K_M", + ) + msg = "tweak the chat panel label" + message_tokens = estimate_message_tokens(msg) + assert message_tokens < router.token_fast_max + d = classify_prompt( + msg, + message_tokens=message_tokens, + context_tokens=17_670, + router=router, + code_model_name="ollama_chat/qwen3.6:27b-q4_K_M", + fast_max_input=16_000, + ) + assert d.role == "code" + assert d.model_name == "ollama_chat/qwen3.6:27b-q4_K_M" + assert any("fast_max=" in r for r in d.reasons) + + +def test_fast_keyword_loses_to_context_overflow(): + router = ModelRouterConfig( + 
enabled=True, + fast_model="ollama_chat/deepseek-coder:6.7b", + code_model="ollama_chat/big", + ) + d = classify_prompt( + "Rename the button label to Save", + message_tokens=200, + context_tokens=20_000, + router=router, + code_model_name="ollama_chat/big", + fast_max_input=16_000, + ) + assert d.role == "code" + + +def test_code_task_middle_band_defaults_code(): + router = ModelRouterConfig( + enabled=True, + fast_model="ollama_chat/small", + code_model="ollama_chat/big", + ) + d = classify_prompt( + "implement the login form", + message_tokens=800, + router=router, + code_model_name="ollama_chat/big", + ) + assert d.role == "code" + assert "code_task" in d.reasons + + +def test_escalate_when_fast_no_edits(): + router = ModelRouterConfig(enabled=True, fast_model="a", code_model="b") + decision = classify_prompt( + "implement the login form", + message_tokens=800, + router=router, + code_model_name="b", + force_tier="fast", + ) + assert should_escalate_fast_turn( + decision, + router=router, + user_message="implement the login form", + edited_files=[], + assistant_text="ok", + ) + + +def test_escalate_code_to_think(): + router = ModelRouterConfig( + enabled=True, + fast_model="a", + code_model="b", + think_model="c", + ) + decision = classify_prompt( + "Refactor the auth layer", + message_tokens=800, + router=router, + code_model_name="b", + force_tier="code", + ) + assert should_escalate_code_turn( + decision, + router=router, + user_message="Refactor the auth layer", + edited_files=[], + assistant_text="Here is a plan", + ) + + +def test_escalation_target_chain(): + fast = classify_prompt( + "fix", + message_tokens=100, + router=ModelRouterConfig(enabled=True, fast_model="a", code_model="b", think_model="c"), + code_model_name="b", + force_tier="fast", + ) + assert escalation_target(fast) == "code" + code = classify_prompt( + "fix", + message_tokens=100, + router=ModelRouterConfig(enabled=True, fast_model="a", code_model="b", think_model="c"), + 
code_model_name="b", + force_tier="code", + ) + assert escalation_target(code) == "think" + + +def test_escalate_on_context_limit_tool_error(): + router = ModelRouterConfig( + enabled=True, + fast_model="ollama_chat/deepseek-coder:6.7b", + code_model="ollama_chat/big", + ) + decision = classify_prompt( + "tweak git tab", + message_tokens=200, + context_tokens=5_000, + router=router, + code_model_name="ollama_chat/big", + force_tier="fast", + ) + err = ( + "Your estimated chat context of 32,672 tokens exceeds the " + "16,384 token limit for ollama_chat/deepseek-coder:6.7b!" + ) + assert should_escalate_fast_turn( + decision, + router=router, + user_message="tweak git tab", + edited_files=[], + assistant_text="", + had_tool_error=True, + tool_error_text=err, + ) + + +def test_lint_in_long_message_not_used_when_routing_short_intent(): + router = ModelRouterConfig( + enabled=True, + fast_model="ollama_chat/small", + code_model="ollama_chat/big", + ) + preamble = "## Spec-focus mode\nEARS lint requirements\n" + ("x" * 5000) + short = "In the Git tab, add revert and open-in-editor cues." 
+ d = classify_prompt( + short, + message_tokens=estimate_message_tokens(short), + context_tokens=estimate_prompt_tokens(preamble + short, files_in_chat=0), + router=router, + code_model_name="ollama_chat/big", + ) + assert d.role != "fast" or "keyword:lint" not in " ".join(d.reasons) + + +def test_estimate_tokens_with_files_capped(): + bare = estimate_prompt_tokens("hello") + with_files = estimate_prompt_tokens("hello", files_in_chat=10) + assert with_files > bare + assert with_files <= bare + 2000 + + +def test_thinking_for_role(): + assert thinking_for_role("think", "ollama_chat/deepseek-r1:32b") is True + assert thinking_for_role("code", "ollama_chat/qwen3.6:27b") is False + + +def test_pool_entry_overrides_role_thinking(): + pool = [ + ModelPoolEntry( + model="ollama_chat/custom", + tier="code", + enabled=True, + enable_thinking=True, + ) + ] + assert thinking_for_role("code", "ollama_chat/custom", pool=pool) is True + assert thinking_for_role("code", "ollama_chat/other", pool=pool) is False + + +def test_resolve_model_pool_roles(): + pool = [ + ModelPoolEntry(model="ollama_chat/fast-a", tier="fast", enabled=True), + ModelPoolEntry(model="", tier="code", enabled=True), + ModelPoolEntry(model="ollama_chat/r1", tier="think", enabled=True), + ] + resolved = resolve_model_pool( + pool, + session_code="ollama_chat/session", + fallback_fast="", + ) + assert resolved.fast == "ollama_chat/fast-a" + assert resolved.code == "ollama_chat/session" + assert resolved.think == "ollama_chat/r1" + + +def test_from_payload_parses_hopper_extra_params(): + cfg = ModelRouterConfig.from_payload( + { + "enabled": True, + "fast_model": "ollama_chat/fast", + "code_model": "ollama_chat/code", + "model_pool": [ + { + "model": "ollama_chat/code", + "tier": "code", + "enabled": True, + "extra_params": {"top_p": 0.85}, + } + ], + } + ) + assert cfg is not None + assert cfg.model_pool[0].extra_params == {"top_p": 0.85} + + +def test_pool_prefers_think_when_think_above_code(): + from 
def test_pool_prefers_think_false_when_code_above_think():
    """A code row listed before the think row means think is not preferred."""
    from cecli.hopper.router import pool_prefers_think

    pool = [
        ModelPoolEntry(model="ollama_chat/qwen3:27b", tier="code", enabled=True),
        ModelPoolEntry(model="ollama_chat/r1:32b", tier="think", enabled=True),
        ModelPoolEntry(model="ollama_chat/small", tier="fast", enabled=True),
    ]
    assert pool_prefers_think(pool) is False


def test_pool_prefers_think_false_when_no_think():
    """A pool without any think row never prefers think."""
    from cecli.hopper.router import pool_prefers_think

    pool = [
        ModelPoolEntry(model="ollama_chat/qwen3:27b", tier="code", enabled=True),
        ModelPoolEntry(model="ollama_chat/small", tier="fast", enabled=True),
    ]
    assert pool_prefers_think(pool) is False


def test_prefer_think_keeps_agent_on_code():
    """Agent turns always use code model (tool-capable) even with prefer_think."""
    router = ModelRouterConfig(
        enabled=True,
        fast_model="ollama_chat/small",
        code_model="ollama_chat/code",
        think_model="ollama_chat/think",
        prefer_think=True,
    )
    d = classify_prompt(
        "/agent explore the repo",
        message_tokens=400,
        router=router,
        code_model_name="ollama_chat/code",
    )
    assert d.role == "code"
    assert d.model_name == "ollama_chat/code"
    assert "slash:/agent" in d.reasons


def test_prefer_think_keeps_implement_turn_on_code():
    """Implement turns always use code model (tool-capable) even with prefer_think."""
    router = ModelRouterConfig(
        enabled=True,
        fast_model="ollama_chat/small",
        code_model="ollama_chat/code",
        think_model="ollama_chat/think",
        prefer_think=True,
    )
    d = classify_prompt(
        "implement the EncryptedStorageRepository",
        message_tokens=400,
        router=router,
        code_model_name="ollama_chat/code",
        turn=RouteTurnContext(implement_turn=True),
    )
    assert d.role == "code"
    assert d.model_name == "ollama_chat/code"
    assert "implement_turn" in d.reasons


def test_prefer_think_falls_back_to_code_without_think_model():
    """prefer_think cannot upgrade a code task when no think model is configured."""
    router = ModelRouterConfig(
        enabled=True,
        fast_model="ollama_chat/small",
        code_model="ollama_chat/code",
        prefer_think=True,
    )
    # A code-task prompt reaches a prefer_think branch; an /agent prompt would
    # return before prefer_think is ever consulted and prove nothing.
    d = classify_prompt(
        "implement the login form",
        message_tokens=800,
        router=router,
        code_model_name="ollama_chat/code",
    )
    # No think model configured; falls back to code despite prefer_think
    assert d.role == "code"
    assert d.model_name == "ollama_chat/code"
    assert "code_task" in d.reasons
    assert "prefer_think" not in d.reasons


def test_from_payload_derives_prefer_think_from_pool_order():
    """With no explicit prefer_think, a think-first pool order enables it."""
    cfg = ModelRouterConfig.from_payload(
        {
            "enabled": True,
            "fast_model": "ollama_chat/fast",
            "code_model": "ollama_chat/code",
            "think_model": "ollama_chat/think",
            "model_pool": [
                {"model": "ollama_chat/think", "tier": "think", "enabled": True},
                {"model": "ollama_chat/code", "tier": "code", "enabled": True},
                {"model": "ollama_chat/fast", "tier": "fast", "enabled": True},
            ],
        }
    )
    assert cfg is not None
    assert cfg.prefer_think is True


def test_from_payload_no_prefer_think_when_code_first():
    """With no explicit prefer_think, a code-first pool order leaves it off."""
    cfg = ModelRouterConfig.from_payload(
        {
            "enabled": True,
            "fast_model": "ollama_chat/fast",
            "code_model": "ollama_chat/code",
            "think_model": "ollama_chat/think",
            "model_pool": [
                {"model": "ollama_chat/code", "tier": "code", "enabled": True},
                {"model": "ollama_chat/think", "tier": "think", "enabled": True},
                {"model": "ollama_chat/fast", "tier": "fast", "enabled": True},
            ],
        }
    )
    assert cfg is not None
    assert cfg.prefer_think is False
+ + Hopper order fast → code → think (think last) ⇒ ``prefer_think`` is False, so a + trivial fast-keyword prompt must route to the fast tier even though a think model + is enabled. The e2e ``fast tier routes to Fighter pilot`` test asserts exactly this; + if routing here returned ``think`` the e2e would fail (and previously did, when the + fast model was cold-evicted and the turn escalated fast→code→think). + """ + cfg = ModelRouterConfig.from_payload( + { + "enabled": True, + "fast_model": "ollama_chat/qwen2.5-coder:7b", + "code_model": "ollama_chat/qwen3.6:27b-q4_K_M", + "think_model": "ollama_chat/deepseek-r1:32b", + "model_pool": [ + {"model": "ollama_chat/qwen2.5-coder:7b", "tier": "fast", "enabled": True}, + {"model": "ollama_chat/qwen3.6:27b-q4_K_M", "tier": "code", "enabled": True}, + {"model": "ollama_chat/deepseek-r1:32b", "tier": "think", "enabled": True}, + ], + } + ) + assert cfg is not None + assert cfg.prefer_think is False + d = classify_prompt( + 'Suggest a better button label than "Start" in one sentence only. 
' + "No code blocks, no file edits.", + message_tokens=30, + router=cfg, + code_model_name="ollama_chat/qwen3.6:27b-q4_K_M", + think_model_name="ollama_chat/deepseek-r1:32b", + ) + assert d.role == "fast", d.reasons + assert d.model_name == "ollama_chat/qwen2.5-coder:7b" diff --git a/tests/hopper/test_model_router_apply.py b/tests/hopper/test_model_router_apply.py new file mode 100644 index 00000000000..fa1b3603129 --- /dev/null +++ b/tests/hopper/test_model_router_apply.py @@ -0,0 +1,208 @@ +"""Apply-route tests — per-turn LiteLLM think override + keep_alive.""" + +from __future__ import annotations + +from unittest.mock import MagicMock, patch + +from cecli.hopper.router import ModelPoolEntry, ModelRouterConfig, RouteDecision +from cecli.hopper.apply import ( + apply_hopper_extra_params, + apply_route_to_coder, + apply_thinking_extra_params, + merge_extra_params, +) + + +def test_apply_thinking_extra_params_sets_bool(): + model = MagicMock() + model.extra_params = {} + apply_thinking_extra_params(model, True) + assert model.extra_params["think"] is True + apply_thinking_extra_params(model, False) + assert model.extra_params["think"] is False + + +def test_merge_extra_params_deep_merges_dicts(): + base = {"extra_headers": {"A": "1"}, "top_p": 0.5} + merge_extra_params(base, {"extra_headers": {"B": "2"}, "top_p": 0.9}) + assert base["extra_headers"] == {"A": "1", "B": "2"} + assert base["top_p"] == 0.9 + + +def test_apply_hopper_extra_params_skips_keep_alive(): + model = MagicMock() + model.extra_params = {"keep_alive": 99} + apply_hopper_extra_params(model, {"keep_alive": 0, "top_p": 0.8}) + assert model.extra_params.get("keep_alive") == 99 + assert model.extra_params.get("top_p") == 0.8 + + +def test_apply_route_merges_hopper_extra_params(): + prev = MagicMock() + prev.name = "ollama_chat/qwen3.6:27b" + prev.is_ollama.return_value = True + prev.extra_params = {"think": False} + + created: dict = {} + + def _model_ctor(name, from_model=None): + m = MagicMock() + 
m.name = name + m.is_ollama.return_value = True + m.extra_params = dict(from_model.extra_params) + m._ensure_extra_params_dict = lambda: None + created["model"] = m + return m + + coder = MagicMock() + coder.main_model = prev + + router = ModelRouterConfig( + enabled=True, + fast_model="ollama_chat/fast", + code_model="ollama_chat/code", + model_pool=[ + ModelPoolEntry( + model="ollama_chat/code", + tier="code", + enabled=True, + extra_params={"top_p": 0.85, "think": True}, + ) + ], + ) + decision = RouteDecision( + tier="code", + role="code", + model_name="ollama_chat/code", + estimated_tokens=100, + enable_thinking=False, + ) + + with patch("cecli.hopper.apply.models.Model", side_effect=_model_ctor): + apply_route_to_coder(coder, decision, router) + + assert created["model"].extra_params.get("top_p") == 0.85 + assert created["model"].extra_params.get("think") is False + assert created["model"].extra_params.get("keep_alive") == -1 + + +def test_apply_route_code_disables_think(): + prev = MagicMock() + prev.name = "ollama_chat/qwen3.6:27b" + prev.is_ollama.return_value = True + prev.extra_params = {"think": False} + + created: dict = {} + + def _model_ctor(name, from_model=None): + m = MagicMock() + m.name = name + m.is_ollama.return_value = True + m.extra_params = dict(from_model.extra_params) + m._ensure_extra_params_dict = lambda: None + created["model"] = m + return m + + coder = MagicMock() + coder.main_model = prev + + router = ModelRouterConfig( + enabled=True, + fast_model="ollama_chat/fast", + code_model="ollama_chat/code", + ) + decision = RouteDecision( + tier="code", + role="code", + model_name="ollama_chat/code", + estimated_tokens=100, + enable_thinking=False, + ) + + with patch("cecli.hopper.apply.models.Model", side_effect=_model_ctor): + apply_route_to_coder(coder, decision, router) + + assert created["model"].extra_params.get("think") is False + assert created["model"].extra_params.get("keep_alive") == -1 + + +def 
test_apply_route_think_enables_think(): + prev = MagicMock() + prev.name = "ollama_chat/qwen3.6:27b" + prev.is_ollama.return_value = True + prev.extra_params = {"think": False} + + created: dict = {} + + def _model_ctor(name, from_model=None): + m = MagicMock() + m.name = name + m.is_ollama.return_value = True + m.extra_params = dict(from_model.extra_params) + m._ensure_extra_params_dict = lambda: None + created["model"] = m + return m + + coder = MagicMock() + coder.main_model = prev + + router = ModelRouterConfig( + enabled=True, + fast_model="ollama_chat/fast", + code_model="ollama_chat/code", + think_model="ollama_chat/deepseek-r1:32b", + ) + decision = RouteDecision( + tier="think", + role="think", + model_name="ollama_chat/deepseek-r1:32b", + estimated_tokens=100, + enable_thinking=True, + ) + + with patch("cecli.hopper.apply.models.Model", side_effect=_model_ctor): + apply_route_to_coder(coder, decision, router) + + assert created["model"].extra_params.get("think") is True + + +def test_apply_route_qwen_sets_no_think_prefix(): + prev = MagicMock() + prev.name = "ollama_chat/qwen3.6:27b" + prev.is_ollama.return_value = True + prev.extra_params = {} + prev.system_prompt_prefix = "" + + created: dict = {} + + def _model_ctor(name, from_model=None): + m = MagicMock() + m.name = name + m.is_ollama.return_value = True + m.extra_params = {} + m.system_prompt_prefix = "" + m._ensure_extra_params_dict = lambda: None + created["model"] = m + return m + + coder = MagicMock() + coder.main_model = prev + + router = ModelRouterConfig( + enabled=True, + fast_model="ollama_chat/fast", + code_model="ollama_chat/qwen3.6:27b", + ) + decision = RouteDecision( + tier="code", + role="code", + model_name="ollama_chat/qwen3.6:27b", + estimated_tokens=100, + enable_thinking=False, + ) + + with patch("cecli.hopper.apply.models.Model", side_effect=_model_ctor): + apply_route_to_coder(coder, decision, router) + + assert created["model"].extra_params.get("think") is False + assert 
created["model"].system_prompt_prefix == "/no_think" diff --git a/tests/hopper/test_model_router_preload.py b/tests/hopper/test_model_router_preload.py new file mode 100644 index 00000000000..bc7d138a764 --- /dev/null +++ b/tests/hopper/test_model_router_preload.py @@ -0,0 +1,251 @@ +"""Tests for preload_priority_list — priority-ordered preloading with VRAM budget.""" + +from __future__ import annotations + +import asyncio +from dataclasses import dataclass, field +from typing import Any + +import pytest + +from cecli.hopper.router import ( + preload_priority_list, + _strip_ollama_prefix, +) + + +# --------------------------------------------------------------------------- +# Mock Ollama client +# --------------------------------------------------------------------------- + + +@dataclass +class MockOllamaClient: + """Mock OllamaClient for testing preload_priority_list.""" + + # Track calls for assertions + generate_calls: list[tuple[str, int]] = field(default_factory=list) + show_calls: list[str] = field(default_factory=list) + + # Configurable behavior + model_sizes: dict[str, int] = field(default_factory=dict) + failing_models: set[str] = field(default_factory=set) + show_failures: set[str] = field(default_factory=set) + + async def post_generate(self, model: str, *, keep_alive: int = -1) -> None: + self.generate_calls.append((model, keep_alive)) + if model in self.failing_models: + raise RuntimeError(f"Preload failed: model '{model}' not found") + + async def show_model(self, model: str) -> dict[str, Any]: + self.show_calls.append(model) + if model in self.show_failures: + raise RuntimeError(f"Show failed for '{model}'") + size = self.model_sizes.get(model) + if size is not None: + return {"size": size} + return {} + + +# --------------------------------------------------------------------------- +# Tests +# --------------------------------------------------------------------------- + + +@pytest.mark.asyncio +async def test_preload_all_models_in_order(): + 
"""All models preload successfully in priority order.""" + client = MockOllamaClient() + priority = ["model-a:7b", "model-b:13b", "model-c:32b"] + + result = await preload_priority_list(priority, ollama_client=client) + + assert result == ["model-a:7b", "model-b:13b", "model-c:32b"] + assert client.generate_calls == [ + ("model-a:7b", -1), + ("model-b:13b", -1), + ("model-c:32b", -1), + ] + + +@pytest.mark.asyncio +async def test_preload_strips_ollama_prefix(): + """ollama_chat/ prefix is stripped for API calls but preserved in results.""" + client = MockOllamaClient() + priority = ["ollama_chat/deepseek-r1:32b", "ollama/qwen:7b"] + + result = await preload_priority_list(priority, ollama_client=client) + + assert result == ["ollama_chat/deepseek-r1:32b", "ollama/qwen:7b"] + assert client.generate_calls == [ + ("deepseek-r1:32b", -1), + ("qwen:7b", -1), + ] + + +@pytest.mark.asyncio +async def test_preload_failure_skips_and_continues(): + """On preload failure, log error, skip model, continue with next.""" + client = MockOllamaClient(failing_models={"model-b:13b"}) + priority = ["model-a:7b", "model-b:13b", "model-c:32b"] + + result = await preload_priority_list(priority, ollama_client=client) + + assert result == ["model-a:7b", "model-c:32b"] + # All three were attempted + assert len(client.generate_calls) == 3 + + +@pytest.mark.asyncio +async def test_preload_vram_budget_stops_when_exceeded(): + """When cumulative VRAM exceeds budget, stop preloading remaining models.""" + client = MockOllamaClient( + model_sizes={ + "model-a:7b": 4_000_000_000, # 4 GB + "model-b:13b": 8_000_000_000, # 8 GB + "model-c:32b": 18_000_000_000, # 18 GB + } + ) + priority = ["model-a:7b", "model-b:13b", "model-c:32b"] + # Budget: 11 GB → model-a (4 GB) + model-b (8 GB) = 12 GB > 11 GB + # So model-a fits, model-b does NOT fit, stop. 
+ budget = 11_000_000_000 + + result = await preload_priority_list( + priority, ollama_client=client, vram_budget_bytes=budget + ) + + assert result == ["model-a:7b"] + # Only model-a was actually preloaded (generate called) + assert len(client.generate_calls) == 1 + assert client.generate_calls[0][0] == "model-a:7b" + + +@pytest.mark.asyncio +async def test_preload_vram_budget_all_fit(): + """All models fit within VRAM budget.""" + client = MockOllamaClient( + model_sizes={ + "model-a:7b": 4_000_000_000, + "model-b:7b": 4_000_000_000, + } + ) + priority = ["model-a:7b", "model-b:7b"] + budget = 10_000_000_000 # 10 GB — both fit + + result = await preload_priority_list( + priority, ollama_client=client, vram_budget_bytes=budget + ) + + assert result == ["model-a:7b", "model-b:7b"] + assert len(client.generate_calls) == 2 + + +@pytest.mark.asyncio +async def test_preload_vram_unknown_size_proceeds(): + """When model size info unavailable, skip budget check and preload anyway.""" + client = MockOllamaClient( + model_sizes={ + "model-a:7b": 4_000_000_000, + # model-b has no size info + } + ) + priority = ["model-a:7b", "model-b:unknown"] + budget = 5_000_000_000 # 5 GB + + result = await preload_priority_list( + priority, ollama_client=client, vram_budget_bytes=budget + ) + + # Both preloaded — model-b has no size info, so budget check skipped for it + assert result == ["model-a:7b", "model-b:unknown"] + + +@pytest.mark.asyncio +async def test_preload_no_budget_preloads_all(): + """Without VRAM budget, all models preloaded regardless of size.""" + client = MockOllamaClient( + model_sizes={ + "huge:70b": 40_000_000_000, + "also-huge:70b": 40_000_000_000, + } + ) + priority = ["huge:70b", "also-huge:70b"] + + result = await preload_priority_list(priority, ollama_client=client) + + assert result == ["huge:70b", "also-huge:70b"] + # No show calls when budget is None + assert client.show_calls == [] + + +@pytest.mark.asyncio +async def test_preload_empty_list(): + """Empty 
priority list returns empty result.""" + client = MockOllamaClient() + result = await preload_priority_list([], ollama_client=client) + assert result == [] + assert client.generate_calls == [] + + +@pytest.mark.asyncio +async def test_preload_skips_whitespace_only_entries(): + """Whitespace-only entries in priority list are skipped.""" + client = MockOllamaClient() + priority = ["model-a:7b", " ", "", "model-b:7b"] + + result = await preload_priority_list(priority, ollama_client=client) + + assert result == ["model-a:7b", "model-b:7b"] + assert len(client.generate_calls) == 2 + + +@pytest.mark.asyncio +async def test_preload_uses_backend_resolver_when_no_ollama_client(): + """Host resolver hook supplies BackendClient.preload_models when no ollama_client.""" + from unittest.mock import AsyncMock + + from cecli.hopper.router import set_backend_client_resolver + + mock_client = AsyncMock() + mock_client.preload_models = AsyncMock(return_value=[]) + + set_backend_client_resolver(lambda: mock_client) + try: + result = await preload_priority_list(["model-a:7b"]) + finally: + set_backend_client_resolver(None) + + assert result == [] + mock_client.preload_models.assert_called_once_with(["model-a:7b"]) + + +@pytest.mark.asyncio +async def test_preload_show_failure_skips_budget_check(): + """When show_model fails, skip budget check and preload anyway.""" + client = MockOllamaClient(show_failures={"model-a:7b"}) + priority = ["model-a:7b"] + budget = 1_000 # Tiny budget — but show fails, so budget check skipped + + result = await preload_priority_list( + priority, ollama_client=client, vram_budget_bytes=budget + ) + + assert result == ["model-a:7b"] + + +# --------------------------------------------------------------------------- +# Unit tests for _strip_ollama_prefix +# --------------------------------------------------------------------------- + + +def test_strip_ollama_prefix_chat(): + assert _strip_ollama_prefix("ollama_chat/deepseek-r1:32b") == "deepseek-r1:32b" + + 
+def test_strip_ollama_prefix_plain(): + assert _strip_ollama_prefix("ollama/qwen:7b") == "qwen:7b" + + +def test_strip_ollama_prefix_no_prefix(): + assert _strip_ollama_prefix("deepseek-r1:32b") == "deepseek-r1:32b" diff --git a/tests/hopper/test_model_router_warmup.py b/tests/hopper/test_model_router_warmup.py new file mode 100644 index 00000000000..3345397fc0d --- /dev/null +++ b/tests/hopper/test_model_router_warmup.py @@ -0,0 +1,142 @@ +"""Tests for warmup_keep_alive — keep-alive requests in priority order.""" + +from __future__ import annotations + +import asyncio +from dataclasses import dataclass, field +from typing import Any + +import pytest + +from cecli.hopper.router import warmup_keep_alive + + +# --------------------------------------------------------------------------- +# Mock Ollama client +# --------------------------------------------------------------------------- + + +@dataclass +class MockOllamaClient: + """Mock OllamaClient for testing warmup_keep_alive.""" + + generate_calls: list[tuple[str, int]] = field(default_factory=list) + failing_models: set[str] = field(default_factory=set) + + async def post_generate(self, model: str, *, keep_alive: int = -1) -> None: + self.generate_calls.append((model, keep_alive)) + if model in self.failing_models: + raise RuntimeError(f"Keep-alive failed: model '{model}' not found") + + async def show_model(self, model: str) -> dict[str, Any]: + return {} + + +# --------------------------------------------------------------------------- +# Tests +# --------------------------------------------------------------------------- + + +@pytest.mark.asyncio +async def test_warmup_sends_requests_in_priority_order(): + """Keep-alive requests are sent in priority-list index order.""" + client = MockOllamaClient() + priority = ["model-a:7b", "model-b:13b", "model-c:32b"] + + result = await warmup_keep_alive(priority, ollama_client=client) + + assert result == ["model-a:7b", "model-b:13b", "model-c:32b"] + assert 
client.generate_calls == [ + ("model-a:7b", -1), + ("model-b:13b", -1), + ("model-c:32b", -1), + ] + + +@pytest.mark.asyncio +async def test_warmup_higher_priority_refreshes_first(): + """Index 0 (highest priority) refreshes TTL before index N-1.""" + client = MockOllamaClient() + priority = ["high-priority:7b", "mid-priority:13b", "low-priority:32b"] + + await warmup_keep_alive(priority, ollama_client=client) + + # Verify ordering: high-priority called first + call_models = [call[0] for call in client.generate_calls] + assert call_models == ["high-priority:7b", "mid-priority:13b", "low-priority:32b"] + + +@pytest.mark.asyncio +async def test_warmup_strips_ollama_prefix(): + """ollama_chat/ prefix is stripped for API calls but preserved in results.""" + client = MockOllamaClient() + priority = ["ollama_chat/deepseek-r1:32b", "ollama/qwen:7b"] + + result = await warmup_keep_alive(priority, ollama_client=client) + + assert result == ["ollama_chat/deepseek-r1:32b", "ollama/qwen:7b"] + assert client.generate_calls == [ + ("deepseek-r1:32b", -1), + ("qwen:7b", -1), + ] + + +@pytest.mark.asyncio +async def test_warmup_failure_skips_and_continues(): + """On keep-alive failure, log error, skip model, continue with next.""" + client = MockOllamaClient(failing_models={"model-b:13b"}) + priority = ["model-a:7b", "model-b:13b", "model-c:32b"] + + result = await warmup_keep_alive(priority, ollama_client=client) + + assert result == ["model-a:7b", "model-c:32b"] + # All three were attempted + assert len(client.generate_calls) == 3 + + +@pytest.mark.asyncio +async def test_warmup_empty_list(): + """Empty priority list returns empty result.""" + client = MockOllamaClient() + result = await warmup_keep_alive([], ollama_client=client) + assert result == [] + assert client.generate_calls == [] + + +@pytest.mark.asyncio +async def test_warmup_skips_whitespace_only_entries(): + """Whitespace-only entries in priority list are skipped.""" + client = MockOllamaClient() + priority = 
["model-a:7b", " ", "", "model-b:7b"] + + result = await warmup_keep_alive(priority, ollama_client=client) + + assert result == ["model-a:7b", "model-b:7b"] + assert len(client.generate_calls) == 2 + + +@pytest.mark.asyncio +async def test_warmup_uses_keep_alive_minus_one(): + """All keep-alive requests use keep_alive=-1 to refresh TTL indefinitely.""" + client = MockOllamaClient() + priority = ["model-a:7b", "model-b:7b"] + + await warmup_keep_alive(priority, ollama_client=client) + + for _, keep_alive_val in client.generate_calls: + assert keep_alive_val == -1 + + +@pytest.mark.asyncio +async def test_warmup_all_failures_returns_empty(): + """When all models fail, returns empty list.""" + client = MockOllamaClient( + failing_models={"model-a:7b", "model-b:7b"} + ) + priority = ["model-a:7b", "model-b:7b"] + + result = await warmup_keep_alive(priority, ollama_client=client) + + assert result == [] + # Both attempted + assert len(client.generate_calls) == 2 From bd33b32efb063358f392643df23cf18f4ae0bca2 Mon Sep 17 00:00:00 2001 From: CIA Operations Officer Jennifer Pike Date: Mon, 15 Jun 2026 14:38:59 -0700 Subject: [PATCH 2/4] chore(hopper): drop unused asyncio imports in hopper tests Co-authored-by: Cursor --- tests/hopper/test_model_router_preload.py | 1 - tests/hopper/test_model_router_warmup.py | 1 - 2 files changed, 2 deletions(-) diff --git a/tests/hopper/test_model_router_preload.py b/tests/hopper/test_model_router_preload.py index bc7d138a764..72e7003e047 100644 --- a/tests/hopper/test_model_router_preload.py +++ b/tests/hopper/test_model_router_preload.py @@ -2,7 +2,6 @@ from __future__ import annotations -import asyncio from dataclasses import dataclass, field from typing import Any diff --git a/tests/hopper/test_model_router_warmup.py b/tests/hopper/test_model_router_warmup.py index 3345397fc0d..44287b162fc 100644 --- a/tests/hopper/test_model_router_warmup.py +++ b/tests/hopper/test_model_router_warmup.py @@ -2,7 +2,6 @@ from __future__ import 
annotations -import asyncio from dataclasses import dataclass, field from typing import Any From 881c086120f57ba313ef3752a098775bc3c9ab22 Mon Sep 17 00:00:00 2001 From: CIA Operations Officer Jennifer Pike Date: Mon, 15 Jun 2026 14:43:57 -0700 Subject: [PATCH 3/4] style(hopper): black/isort for cecli pre-commit CI Run verify-cecli-pre-commit on pr/hopper before upstream merge. Co-authored-by: Cursor --- cecli/hopper/apply.py | 5 +-- cecli/hopper/router.py | 47 +++++++++++++---------- tests/hopper/test_model_router.py | 1 - tests/hopper/test_model_router_apply.py | 2 +- tests/hopper/test_model_router_preload.py | 21 +++------- tests/hopper/test_model_router_warmup.py | 5 +-- 6 files changed, 35 insertions(+), 46 deletions(-) diff --git a/cecli/hopper/apply.py b/cecli/hopper/apply.py index 28fbb1ced29..495d71643ed 100644 --- a/cecli/hopper/apply.py +++ b/cecli/hopper/apply.py @@ -5,7 +5,6 @@ from typing import Any from cecli import models - from cecli.hopper.router import ( ModelRouterConfig, RouteDecision, @@ -70,9 +69,7 @@ def apply_route_to_coder(coder, decision: RouteDecision, router: ModelRouterConf new_model = models.Model(decision.model_name, from_model=prev) role = decision.role or normalize_route_role(decision.tier) or "code" pool_entry = ( - find_pool_entry(router.model_pool, decision.model_name, role) - if router.model_pool - else None + find_pool_entry(router.model_pool, decision.model_name, role) if router.model_pool else None ) apply_hopper_extra_params( new_model, diff --git a/cecli/hopper/router.py b/cecli/hopper/router.py index 67f8d45ae01..ed1f1c7fd59 100644 --- a/cecli/hopper/router.py +++ b/cecli/hopper/router.py @@ -33,6 +33,7 @@ def set_static_vram_bytes_resolver(fn: Callable[[str], int | None] | None) -> No global _static_vram_bytes_resolver _static_vram_bytes_resolver = fn + RouteRole = Literal["fast", "code", "think"] RouteTier = Literal["fast", "heavy", "code", "think"] @@ -178,7 +179,12 @@ def resolve_tier_models(pool: list[ModelPoolEntry], 
tier: RouteRole) -> list[Mod Models with priority_rank=None are sorted after those with a rank. """ filtered = [e for e in pool if e.enabled and e.tier == tier] - filtered.sort(key=lambda e: (e.priority_rank is None, e.priority_rank if e.priority_rank is not None else 0)) + filtered.sort( + key=lambda e: ( + e.priority_rank is None, + e.priority_rank if e.priority_rank is not None else 0, + ) + ) return filtered @@ -273,15 +279,12 @@ async def preload_priority_list( model_size: int | None = None if vram_budget_bytes is not None: - model_size = await _get_model_size_for_budget( - raw_tag, ollama_client=ollama_client - ) + model_size = await _get_model_size_for_budget(raw_tag, ollama_client=ollama_client) if model_size is not None: if cumulative_vram + model_size > vram_budget_bytes: deferred = [t.strip() for t in priority_list[idx:] if t.strip()] logger.info( - "VRAM budget exceeded (%.1f MB used of %.1f MB). " - "Deferring models: %s", + "VRAM budget exceeded (%.1f MB used of %.1f MB). " "Deferring models: %s", cumulative_vram / (1024 * 1024), vram_budget_bytes / (1024 * 1024), deferred, @@ -340,9 +343,9 @@ async def warmup_keep_alive( def _strip_ollama_prefix(tag: str) -> str: """Remove ``ollama_chat/`` or ``ollama/`` prefix from a model tag.""" if tag.startswith("ollama_chat/"): - return tag[len("ollama_chat/"):] + return tag[len("ollama_chat/") :] if tag.startswith("ollama/"): - return tag[len("ollama/"):] + return tag[len("ollama/") :] return tag @@ -627,7 +630,9 @@ def resolve_provider_prefix(backend: str) -> str: return _BACKEND_PROVIDER_PREFIXES.get((backend or "").strip().lower(), "ollama_chat/") -def inject_backend_extra_params(backend: str, extra_params: dict[str, object] | None) -> dict[str, object]: +def inject_backend_extra_params( + backend: str, extra_params: dict[str, object] | None +) -> dict[str, object]: """Merge ``LITELLM_EXTRA_PARAMS`` for non-Ollama backends. 
Ollama uses its own env wiring; other backends may need auth headers or base URLs @@ -701,9 +706,7 @@ def from_payload(cls, raw: dict[str, Any] | None) -> ModelRouterConfig | None: if tier is None: continue raw_rank = item.get("priority_rank") - priority_rank: int | None = ( - int(raw_rank) if raw_rank is not None else None - ) + priority_rank: int | None = int(raw_rank) if raw_rank is not None else None pool.append( ModelPoolEntry( model=str(item.get("model") or ""), @@ -721,9 +724,7 @@ def from_payload(cls, raw: dict[str, Any] | None) -> ModelRouterConfig | None: ) ) fallback_fast = str(raw.get("fast_model") or "").strip() - fallback_code = ( - str(raw.get("code_model") or raw.get("heavy_model") or "").strip() or None - ) + fallback_code = str(raw.get("code_model") or raw.get("heavy_model") or "").strip() or None fallback_think = str(raw.get("think_model") or "").strip() or None session_code = fallback_code or fallback_fast or "" if pool: @@ -762,9 +763,7 @@ def from_payload(cls, raw: dict[str, Any] | None) -> ModelRouterConfig | None: token_fast_max=int(raw.get("token_fast_max") or 4_096), token_heavy_min=int(raw.get("token_heavy_min") or 12_000), keep_alive_fast=raw.get("keep_alive_fast", 300), - keep_alive_heavy=normalize_keep_alive_for_tier( - "code", raw.get("keep_alive_heavy", -1) - ), + keep_alive_heavy=normalize_keep_alive_for_tier("code", raw.get("keep_alive_heavy", -1)), escalate_on_failure=bool(raw.get("escalate_on_failure", True)), prefer_think=bool( raw.get("prefer_think") @@ -912,7 +911,8 @@ def _apply_multi_model_routing( if pool and _has_multi_model_tier(pool, role): chosen_model, is_swap = pick_tier_model( - pool, role, + pool, + role, resident_models=resident_models, require_vision=require_vision, context_tokens=context_tokens, @@ -1001,7 +1001,8 @@ def classify_prompt( # Common kwargs for all _apply_multi_model_routing calls in this function. 
def _route(role: RouteRole, model: str, *, reasons: list[str]) -> RouteDecision: return _apply_multi_model_routing( - role, model, + role, + model, router=router, display_tokens=display_tokens, reasons=reasons, @@ -1057,7 +1058,11 @@ def _route(role: RouteRole, model: str, *, reasons: list[str]) -> RouteDecision: pool = router.model_pool if pool: fast_models = resolve_tier_models(pool, "fast") - fast_fits = [m for m in fast_models if m.max_context is not None and m.max_context >= context_tokens] + fast_fits = [ + m + for m in fast_models + if m.max_context is not None and m.max_context >= context_tokens + ] if fast_fits: # A fast model with sufficient context exists — stay in fast tier reasons.append( diff --git a/tests/hopper/test_model_router.py b/tests/hopper/test_model_router.py index 0b1a7c94ac5..491c711078b 100644 --- a/tests/hopper/test_model_router.py +++ b/tests/hopper/test_model_router.py @@ -574,7 +574,6 @@ def test_from_payload_no_prefer_think_when_code_first(): assert cfg.prefer_think is False - def test_router_lane_fast_prompt_routes_fast_with_think_enabled(): """E2E router lane contract (regression for e2e/router-llm.spec.ts). 
diff --git a/tests/hopper/test_model_router_apply.py b/tests/hopper/test_model_router_apply.py index fa1b3603129..bedbe6766fe 100644 --- a/tests/hopper/test_model_router_apply.py +++ b/tests/hopper/test_model_router_apply.py @@ -4,13 +4,13 @@ from unittest.mock import MagicMock, patch -from cecli.hopper.router import ModelPoolEntry, ModelRouterConfig, RouteDecision from cecli.hopper.apply import ( apply_hopper_extra_params, apply_route_to_coder, apply_thinking_extra_params, merge_extra_params, ) +from cecli.hopper.router import ModelPoolEntry, ModelRouterConfig, RouteDecision def test_apply_thinking_extra_params_sets_bool(): diff --git a/tests/hopper/test_model_router_preload.py b/tests/hopper/test_model_router_preload.py index 72e7003e047..db52b62fdf4 100644 --- a/tests/hopper/test_model_router_preload.py +++ b/tests/hopper/test_model_router_preload.py @@ -8,11 +8,10 @@ import pytest from cecli.hopper.router import ( - preload_priority_list, _strip_ollama_prefix, + preload_priority_list, ) - # --------------------------------------------------------------------------- # Mock Ollama client # --------------------------------------------------------------------------- @@ -100,7 +99,7 @@ async def test_preload_vram_budget_stops_when_exceeded(): """When cumulative VRAM exceeds budget, stop preloading remaining models.""" client = MockOllamaClient( model_sizes={ - "model-a:7b": 4_000_000_000, # 4 GB + "model-a:7b": 4_000_000_000, # 4 GB "model-b:13b": 8_000_000_000, # 8 GB "model-c:32b": 18_000_000_000, # 18 GB } @@ -110,9 +109,7 @@ async def test_preload_vram_budget_stops_when_exceeded(): # So model-a fits, model-b does NOT fit, stop. 
budget = 11_000_000_000 - result = await preload_priority_list( - priority, ollama_client=client, vram_budget_bytes=budget - ) + result = await preload_priority_list(priority, ollama_client=client, vram_budget_bytes=budget) assert result == ["model-a:7b"] # Only model-a was actually preloaded (generate called) @@ -132,9 +129,7 @@ async def test_preload_vram_budget_all_fit(): priority = ["model-a:7b", "model-b:7b"] budget = 10_000_000_000 # 10 GB — both fit - result = await preload_priority_list( - priority, ollama_client=client, vram_budget_bytes=budget - ) + result = await preload_priority_list(priority, ollama_client=client, vram_budget_bytes=budget) assert result == ["model-a:7b", "model-b:7b"] assert len(client.generate_calls) == 2 @@ -152,9 +147,7 @@ async def test_preload_vram_unknown_size_proceeds(): priority = ["model-a:7b", "model-b:unknown"] budget = 5_000_000_000 # 5 GB - result = await preload_priority_list( - priority, ollama_client=client, vram_budget_bytes=budget - ) + result = await preload_priority_list(priority, ollama_client=client, vram_budget_bytes=budget) # Both preloaded — model-b has no size info, so budget check skipped for it assert result == ["model-a:7b", "model-b:unknown"] @@ -226,9 +219,7 @@ async def test_preload_show_failure_skips_budget_check(): priority = ["model-a:7b"] budget = 1_000 # Tiny budget — but show fails, so budget check skipped - result = await preload_priority_list( - priority, ollama_client=client, vram_budget_bytes=budget - ) + result = await preload_priority_list(priority, ollama_client=client, vram_budget_bytes=budget) assert result == ["model-a:7b"] diff --git a/tests/hopper/test_model_router_warmup.py b/tests/hopper/test_model_router_warmup.py index 44287b162fc..1de974c6ebd 100644 --- a/tests/hopper/test_model_router_warmup.py +++ b/tests/hopper/test_model_router_warmup.py @@ -9,7 +9,6 @@ from cecli.hopper.router import warmup_keep_alive - # 
--------------------------------------------------------------------------- # Mock Ollama client # --------------------------------------------------------------------------- @@ -129,9 +128,7 @@ async def test_warmup_uses_keep_alive_minus_one(): @pytest.mark.asyncio async def test_warmup_all_failures_returns_empty(): """When all models fail, returns empty list.""" - client = MockOllamaClient( - failing_models={"model-a:7b", "model-b:7b"} - ) + client = MockOllamaClient(failing_models={"model-a:7b", "model-b:7b"}) priority = ["model-a:7b", "model-b:7b"] result = await warmup_keep_alive(priority, ollama_client=client) From 4536367a2d8a66e9a0881db6dcb85640f2cefc2d Mon Sep 17 00:00:00 2001 From: CIA Operations Officer Jennifer Pike Date: Mon, 15 Jun 2026 14:45:50 -0700 Subject: [PATCH 4/4] fix(hopper): satisfy codespell on think-tier keyword regex Expand architect(?:ure|ural)? to explicit words so codespell does not flag "ure". Co-authored-by: Cursor --- cecli/hopper/router.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/cecli/hopper/router.py b/cecli/hopper/router.py index ed1f1c7fd59..c7457cbf73f 100644 --- a/cecli/hopper/router.py +++ b/cecli/hopper/router.py @@ -93,7 +93,7 @@ def normalize_keep_alive_for_tier(tier: RouteTier | RouteRole, value: int | str) # Intent signals (case-insensitive word boundaries). _THINK_PATTERNS = re.compile( r"\b(" - r"architect(?:ure|ural)?|refactor|rewrite|migrate|migration|" + r"architecture|architectural|architect|refactor|rewrite|migrate|migration|" r"race\s+condition|deadlock|concurrency|distributed|microservice|" r"security|vulnerability|root\s+cause|design\s+review|" r"performance|scalability|profil(?:e|ing)|"