From dfb0efbe544dae37677470a9a4974895d9d9fc45 Mon Sep 17 00:00:00 2001
From: CIA Operations Officer Jennifer Pike <agent.jennifer.pike@gmail.com>
Date: Mon, 15 Jun 2026 14:36:31 -0700
Subject: [PATCH 1/4] feat(hopper): model pool routing in cecli.hopper

Move fast/code/think hopper pool, prompt classification, escalation, and apply-route from host integration into cecli.hopper with optional preload resolver hooks. Add 65 unit tests under tests/hopper/.

Co-authored-by: Cursor <cursoragent@cursor.com>
---
 cecli/hopper/__init__.py                  |   85 ++
 cecli/hopper/apply.py                     |   90 ++
 cecli/hopper/router.py                    | 1184 +++++++++++++++++++++
 tests/basic/test_sessions.py              |    2 +-
 tests/helpers/monorepo/LOCAL_WORKSPACE.md |    2 +-
 tests/hopper/test_model_pool.py           |   30 +
 tests/hopper/test_model_router.py         |  611 +++++++++++
 tests/hopper/test_model_router_apply.py   |  208 ++++
 tests/hopper/test_model_router_preload.py |  251 +++++
 tests/hopper/test_model_router_warmup.py  |  142 +++
 10 files changed, 2603 insertions(+), 2 deletions(-)
 create mode 100644 cecli/hopper/__init__.py
 create mode 100644 cecli/hopper/apply.py
 create mode 100644 cecli/hopper/router.py
 create mode 100644 tests/hopper/test_model_pool.py
 create mode 100644 tests/hopper/test_model_router.py
 create mode 100644 tests/hopper/test_model_router_apply.py
 create mode 100644 tests/hopper/test_model_router_preload.py
 create mode 100644 tests/hopper/test_model_router_warmup.py

diff --git a/cecli/hopper/__init__.py b/cecli/hopper/__init__.py
new file mode 100644
index 00000000000..2e08171401f
--- /dev/null
+++ b/cecli/hopper/__init__.py
@@ -0,0 +1,85 @@
+"""Model hopper pool + prompt routing (fast / code / think tiers)."""
+
+from cecli.hopper.apply import (
+    apply_hopper_extra_params,
+    apply_route_to_coder,
+    apply_thinking_extra_params,
+    merge_extra_params,
+)
+from cecli.hopper.router import (
+    ModelPoolEntry,
+    ModelRouterConfig,
+    OllamaClient,
+    ResolvedModelPool,
+    RouteDecision,
+    RouteRole,
+    RouteTier,
+    RouteTurnContext,
+    classify_prompt,
+    context_exceeds_fast_model_limit,
+    escalation_target,
+    estimate_message_tokens,
+    estimate_prompt_tokens,
+    find_pool_entry,
+    inject_backend_extra_params,
+    lookup_model_max_input_tokens,
+    normalize_keep_alive_for_tier,
+    normalize_pool_tier,
+    normalize_route_role,
+    pick_tier_model,
+    pool_prefers_think,
+    pool_thinking_for_model,
+    preload_priority_list,
+    resolve_model_pool,
+    resolve_pool_entry_thinking,
+    resolve_provider_prefix,
+    resolve_tier_models,
+    role_to_legacy_tier,
+    set_backend_client_resolver,
+    set_static_vram_bytes_resolver,
+    should_escalate_code_turn,
+    should_escalate_fast_turn,
+    thinking_for_role,
+    warmup_keep_alive,
+)
+
+__all__ = [
+    "ModelPoolEntry",
+    "ModelRouterConfig",
+    "OllamaClient",
+    "ResolvedModelPool",
+    "RouteDecision",
+    "RouteRole",
+    "RouteTier",
+    "RouteTurnContext",
+    "apply_hopper_extra_params",
+    "apply_route_to_coder",
+    "apply_thinking_extra_params",
+    "classify_prompt",
+    "context_exceeds_fast_model_limit",
+    "escalation_target",
+    "estimate_message_tokens",
+    "estimate_prompt_tokens",
+    "find_pool_entry",
+    "inject_backend_extra_params",
+    "lookup_model_max_input_tokens",
+    "merge_extra_params",
+    "normalize_keep_alive_for_tier",
+    "normalize_pool_tier",
+    "normalize_route_role",
+    "pick_tier_model",
+    "pool_prefers_think",
+    "pool_thinking_for_model",
+    "preload_priority_list",
+    "resolve_model_pool",
+    "resolve_pool_entry_thinking",
+    "resolve_provider_prefix",
+    "resolve_tier_models",
+    "role_to_legacy_tier",
+    "set_backend_client_resolver",
+    "set_static_vram_bytes_resolver",
+    "should_escalate_code_turn",
+    "should_escalate_fast_turn",
+    "thinking_for_role",
+    "warmup_keep_alive",
+]
diff --git a/cecli/hopper/apply.py b/cecli/hopper/apply.py
new file mode 100644
index 00000000000..28fbb1ced29
--- /dev/null
+++ b/cecli/hopper/apply.py
@@ -0,0 +1,90 @@
+"""Apply a route decision to a live cecli Coder (swap main_model + Ollama keep_alive)."""
+
+from __future__ import annotations
+
+from typing import Any
+
+from cecli import models
+
+from cecli.hopper.router import (
+    ModelRouterConfig,
+    RouteDecision,
+    RouteRole,
+    find_pool_entry,
+    normalize_keep_alive_for_tier,
+    normalize_route_role,
+    resolve_pool_entry_thinking,
+)
+
+
+def merge_extra_params(into: dict[str, Any], patch: dict[str, Any]) -> None:
+    """Deep-merge LiteLLM kwargs (cecli-style); router owns ``keep_alive``."""
+    for key, value in patch.items():
+        if key == "keep_alive":
+            continue
+        if isinstance(value, dict) and isinstance(into.get(key), dict):
+            merge_extra_params(into[key], value)
+        else:
+            into[key] = value
+
+
+def apply_hopper_extra_params(model, extra: dict[str, Any] | None) -> None:
+    if not extra:
+        return
+    model._ensure_extra_params_dict()
+    merge_extra_params(model.extra_params, extra)
+
+
+def apply_thinking_extra_params(model, enable: bool | None) -> None:
+    """Set Ollama ``think`` for this model; overrides hopper/global ``think``."""
+    if enable is None:
+        return
+    model._ensure_extra_params_dict()
+    model.extra_params["think"] = enable
+    name = (getattr(model, "name", "") or "").lower()
+    if "qwen3" in name:
+        if enable:
+            if getattr(model, "system_prompt_prefix", "") == "/no_think":
+                model.system_prompt_prefix = ""
+        else:
+            model.system_prompt_prefix = "/no_think"
+
+
+def _resolve_enable_thinking(
+    decision: RouteDecision,
+    router: ModelRouterConfig,
+    role: RouteRole,
+    pool_entry,
+) -> bool | None:
+    enable = decision.enable_thinking
+    if enable is not None:
+        return enable
+    if pool_entry is not None:
+        return resolve_pool_entry_thinking(pool_entry)
+    return None
+
+
+def apply_route_to_coder(coder, decision: RouteDecision, router: ModelRouterConfig) -> None:
+    """Point the coder at the routed model for this turn."""
+    prev = coder.main_model
+    new_model = models.Model(decision.model_name, from_model=prev)
+    role = decision.role or normalize_route_role(decision.tier) or "code"
+    pool_entry = (
+        find_pool_entry(router.model_pool, decision.model_name, role)
+        if router.model_pool
+        else None
+    )
+    apply_hopper_extra_params(
+        new_model,
+        pool_entry.extra_params if pool_entry else None,
+    )
+    if new_model.is_ollama():
+        new_model._ensure_extra_params_dict()
+        keep_alive = normalize_keep_alive_for_tier(
+            role,
+            router.keep_alive_fast if role == "fast" else router.keep_alive_heavy,
+        )
+        new_model.extra_params["keep_alive"] = keep_alive
+    enable = _resolve_enable_thinking(decision, router, role, pool_entry)
+    apply_thinking_extra_params(new_model, enable)
+    coder.main_model = new_model
diff --git a/cecli/hopper/router.py b/cecli/hopper/router.py
new file mode 100644
index 00000000000..67f8d45ae01
--- /dev/null
+++ b/cecli/hopper/router.py
@@ -0,0 +1,1184 @@
+"""
+Model hopper + local LLM routing: classify prompts and pick fast vs code vs think models.
+
+Security: only uses model names supplied in config — no runtime fetch of arbitrary models.
+Hosts may register optional preload resolvers via :func:`set_backend_client_resolver`.
+"""
+
+from __future__ import annotations
+
+import json
+import logging
+import os
+import re
+from collections.abc import Callable
+from dataclasses import dataclass, field
+from functools import lru_cache
+from typing import Any, Literal, Protocol, runtime_checkable
+
+logger = logging.getLogger(__name__)
+
+_backend_client_resolver: Callable[[], Any] | None = None
+_static_vram_bytes_resolver: Callable[[str], int | None] | None = None
+
+
+def set_backend_client_resolver(fn: Callable[[], Any] | None) -> None:
+    """Host hook: return active backend client for preload when none is passed explicitly."""
+    global _backend_client_resolver
+    _backend_client_resolver = fn
+
+
+def set_static_vram_bytes_resolver(fn: Callable[[str], int | None] | None) -> None:
+    """Host hook: estimate model VRAM in bytes from a bare tag (no live show API)."""
+    global _static_vram_bytes_resolver
+    _static_vram_bytes_resolver = fn
+
+RouteRole = Literal["fast", "code", "think"]
+RouteTier = Literal["fast", "heavy", "code", "think"]
+
+
+@runtime_checkable
+class OllamaClient(Protocol):
+    """Protocol for an async Ollama HTTP client (preload / show)."""
+
+    async def post_generate(self, model: str, *, keep_alive: int = -1) -> None:
+        """Issue a zero-token generate to preload the model into VRAM."""
+        ...
+
+    async def show_model(self, model: str) -> dict[str, Any]:
+        """Return model info (at minimum ``size`` in bytes). Empty dict on failure."""
+        ...
+
+
+def normalize_route_role(tier_or_role: str | None) -> RouteRole | None:
+    """Map API/UI tier names to a routing role (``heavy`` → ``code``)."""
+    if not tier_or_role:
+        return None
+    key = tier_or_role.strip().lower()
+    if key == "fast":
+        return "fast"
+    if key in ("heavy", "code"):
+        return "code"
+    if key == "think":
+        return "think"
+    return None
+
+
+def role_to_legacy_tier(role: RouteRole) -> RouteTier:
+    """SSE/UI tier field: fast stays fast; code+think map to distinct tiers."""
+    return role
+
+
+def normalize_pool_tier(raw: str | None) -> RouteRole | None:
+    if not raw:
+        return None
+    return normalize_route_role(raw)
+
+
+# Code + think tiers keep models loaded during agent loops (keep_alive=0 → empty Ollama).
+def normalize_keep_alive_for_tier(tier: RouteTier | RouteRole, value: int | str) -> int | str:
+    if tier in ("heavy", "code", "think") and value in (0, "0"):
+        return -1
+    return value
+
+
+# Per-file context bump for *display* only (routing uses message_tokens).
+_FILE_TOKEN_PER_FILE = 500
+_FILE_TOKEN_CAP = 2_000
+
+# Reserve completion tokens when comparing session context to fast model window.
+_FAST_CONTEXT_OUTPUT_RESERVE = 2_048
+
+# Intent signals (case-insensitive word boundaries).
+_THINK_PATTERNS = re.compile(
+    r"\b("
+    r"architect(?:ure|ural)?|refactor|rewrite|migrate|migration|"
+    r"race\s+condition|deadlock|concurrency|distributed|microservice|"
+    r"security|vulnerability|root\s+cause|design\s+review|"
+    r"performance|scalability|profil(?:e|ing)|"
+    r"from\s+scratch|greenfield|system\s+design|"
+    r"analyze|analyse|debug|why\s+does|explain\s+why|investigate|"
+    r"tradeoff|trade-off|compare\s+approaches|plan\s+the"
+    r")\b",
+    re.IGNORECASE,
+)
+
+_FAST_PATTERNS = re.compile(
+    r"\b("
+    r"rename|typo|whitespace|format(?:ting)?|lint|prettier|"
+    r"color|colour|style|css|spacing|margin|padding|"
+    r"label|tooltip|copy|wording|comment(?:s)?|"
+    r"tweak|ui\s+text|button\s+text|"
+    r"references?|chips?|filesystem|autocomplete|mention|"
+    r"chat\s+panel|message\s+input|text\s+field|component|"
+    r"like\s+we\s+have|@\s*\w"
+    r")\b",
+    re.IGNORECASE,
+)
+
+# "add" alone is ambiguous (UI copy vs new feature); routing uses stronger verbs only.
+_CODE_TASK_STRONG = re.compile(
+    r"\b(implement|fix|create|update|change|patch|write|build)\b",
+    re.IGNORECASE,
+)
+
+
+def _parse_pool_extra_params(raw: Any) -> dict[str, Any] | None:
+    if isinstance(raw, dict) and raw:
+        return dict(raw)
+    if isinstance(raw, str) and raw.strip():
+        try:
+            parsed = json.loads(raw)
+        except json.JSONDecodeError:
+            return None
+        return parsed if isinstance(parsed, dict) and parsed else None
+    return None
+
+
+def _parse_capabilities(raw: Any) -> dict[str, Any] | None:
+    """Parse model capabilities from payload (dict with bool/int values)."""
+    if isinstance(raw, dict) and raw:
+        return dict(raw)
+    return None
+
+
+@dataclass
+class ModelPoolEntry:
+    """One hopper row: a model tag bound to a routing tier, plus per-row overrides."""
+    model: str
+    tier: RouteRole
+    enabled: bool = True
+    # Per-model LiteLLM ``think`` override; ``None`` → derive from tier.
+    enable_thinking: bool | None = None
+    # Per-model LiteLLM kwargs when this hopper row is routed.
+    extra_params: dict[str, Any] | None = None
+    # Priority rank within the global priority list (0 = highest). None when unset.
+    priority_rank: int | None = None
+    # When True, route to the second-highest-priority model in this tier.
+    prefer_secondary: bool = False
+    # Model capabilities, e.g. ``vision`` (truthy flag) and ``max_context`` (tokens).
+    capabilities: dict[str, Any] | None = None
+
+    @property
+    def has_vision(self) -> bool:
+        """True when this model supports multimodal/vision input."""
+        return bool(self.capabilities and self.capabilities.get("vision"))
+
+    @property
+    def max_context(self) -> int | None:
+        """Declared ``max_context`` capability in tokens; ``None`` when unset or not positive."""
+        raw = (self.capabilities or {}).get("max_context")
+        limit = 0 if raw is None else int(raw)
+        return limit if limit > 0 else None
+
+
+def resolve_tier_models(pool: list[ModelPoolEntry], tier: RouteRole) -> list[ModelPoolEntry]:
+    """Return all enabled models for a tier, sorted by priority_rank (ascending = highest priority first).
+
+    Models with priority_rank=None are sorted after those with a rank.
+    """
+    filtered = [e for e in pool if e.enabled and e.tier == tier]
+    filtered.sort(key=lambda e: (e.priority_rank is None, e.priority_rank if e.priority_rank is not None else 0))
+    return filtered
+
+
+def pick_tier_model(
+    pool: list[ModelPoolEntry],
+    tier: RouteRole,
+    *,
+    resident_models: set[str] | None = None,
+    require_vision: bool = False,
+    context_tokens: int | None = None,
+    prefer_warm: bool = False,
+) -> tuple[str, bool]:
+    """Pick the model to route to for a tier.
+
+    Returns (model_name, is_swap).
+    Respects capability requirements, context limits, residency preference,
+    prefer_secondary flag, and priority ordering.
+
+    Fallback logic (in order):
+    1. If require_vision: filter to vision-capable models. If none, fall through to all.
+    2. If context_tokens set: filter out models whose max_context < context_tokens. If none pass, use all.
+    3. If prefer_warm and resident_models: prefer resident models (but don't require).
+    4. Apply prefer_secondary / priority ordering on the remaining candidates.
+    """
+    models = resolve_tier_models(pool, tier)
+    if not models:
+        raise ValueError(f"No enabled models available for tier '{tier}'")
+
+    candidates = models
+
+    # --- Capability filter: vision ---
+    if require_vision:
+        vision_models = [m for m in candidates if m.has_vision]
+        if vision_models:
+            candidates = vision_models
+
+    # --- Context window filter ---
+    if context_tokens is not None and context_tokens > 0:
+        fits = [m for m in candidates if m.max_context is None or m.max_context >= context_tokens]
+        if fits:
+            candidates = fits
+        # If none fit, keep all candidates (best-effort routing)
+
+    # --- Residency preference (soft — prefer warm, don't require) ---
+    if prefer_warm and resident_models and len(candidates) > 1:
+        warm = [m for m in candidates if m.model in resident_models]
+        if warm:
+            candidates = warm
+
+    # --- Priority / prefer_secondary selection ---
+    prefer_secondary = any(m.prefer_secondary for m in candidates)
+    if prefer_secondary and len(candidates) >= 2:
+        chosen = candidates[1]
+    else:
+        chosen = candidates[0]
+
+    # Determine is_swap: True when model not in resident_models
+    is_swap = False
+    if resident_models is not None and chosen.model not in resident_models:
+        is_swap = True
+
+    return (chosen.model, is_swap)
+
+
+async def preload_priority_list(
+    priority_list: list[str],
+    *,
+    ollama_client: Any | None = None,
+    vram_budget_bytes: int | None = None,
+    backend_client: Any | None = None,
+) -> list[str]:
+    """Preload models in priority order, respecting VRAM budget.
+
+    Iterates ``priority_list`` from index 0 (highest priority) onward. For each model:
+    - If ``vram_budget_bytes`` is set, fetches model size info and checks cumulative VRAM.
+      When the budget would be exceeded, logs deferred models and stops.
+    - Preloads via ``ollama_client`` when given; otherwise ``backend_client``, then the host resolver.
+    - On success, appends to the returned list.
+    - On failure, logs the error, skips the model, and continues with the next.
+
+    Returns the list of successfully preloaded model tags.
+    """
+    preloaded: list[str] = []
+    cumulative_vram: int = 0
+
+    for idx, model_tag in enumerate(priority_list):
+        tag = model_tag.strip()
+        if not tag:
+            continue
+
+        raw_tag = _strip_ollama_prefix(tag)
+
+        model_size: int | None = None
+        if vram_budget_bytes is not None:
+            model_size = await _get_model_size_for_budget(
+                raw_tag, ollama_client=ollama_client
+            )
+            if model_size is not None:
+                if cumulative_vram + model_size > vram_budget_bytes:
+                    deferred = [t.strip() for t in priority_list[idx:] if t.strip()]
+                    logger.info(
+                        "VRAM budget exceeded (%.1f MB used of %.1f MB). "
+                        "Deferring models: %s",
+                        cumulative_vram / (1024 * 1024),
+                        vram_budget_bytes / (1024 * 1024),
+                        deferred,
+                    )
+                    break
+
+        if await _preload_single_model(
+            raw_tag,
+            ollama_client=ollama_client,
+            backend_client=backend_client,
+        ):
+            preloaded.append(tag)
+            if model_size is not None:
+                cumulative_vram += model_size
+
+    return preloaded
+
+
+async def warmup_keep_alive(
+    priority_list: list[str],
+    *,
+    ollama_client: Any | None = None,
+    backend_client: Any | None = None,
+) -> list[str]:
+    """Send keep-alive requests in priority order to refresh model TTLs.
+
+    Iterates ``priority_list`` from index 0 (highest priority) onward. For each model:
+    - Strips the ``ollama_chat/`` or ``ollama/`` prefix for backend API calls.
+    - Sends a keep-alive/preload request via the active backend (or legacy client).
+    - On success, appends to the returned list.
+    - On failure, logs the error, skips the model, and continues with the next.
+
+    Returns the list of model tags that were successfully kept alive.
+    """
+    kept_alive: list[str] = []
+
+    for model_tag in priority_list:
+        tag = model_tag.strip()
+        if not tag:
+            continue
+
+        raw_tag = _strip_ollama_prefix(tag)
+
+        if await _preload_single_model(
+            raw_tag,
+            ollama_client=ollama_client,
+            backend_client=backend_client,
+        ):
+            kept_alive.append(tag)
+        else:
+            logger.error("Keep-alive warmup failed for model '%s'", tag)
+
+    return kept_alive
+
+
+def _strip_ollama_prefix(tag: str) -> str:
+    """Remove ``ollama_chat/`` or ``ollama/`` prefix from a model tag."""
+    if tag.startswith("ollama_chat/"):
+        return tag[len("ollama_chat/"):]
+    if tag.startswith("ollama/"):
+        return tag[len("ollama/"):]
+    return tag
+
+
+async def _get_model_size(ollama_client: Any, raw_tag: str) -> int | None:
+    """Attempt to get model size in bytes via ollama show. Returns None on failure."""
+    try:
+        info = await ollama_client.show_model(raw_tag)
+        size = info.get("size") if isinstance(info, dict) else None
+        if isinstance(size, (int, float)) and size > 0:
+            return int(size)
+        return None
+    except Exception:
+        return None
+
+
+def _estimate_model_size_bytes(raw_tag: str) -> int | None:
+    """Static VRAM estimate via optional host resolver (bytes)."""
+    if _static_vram_bytes_resolver is None:
+        return None
+    return _static_vram_bytes_resolver(raw_tag)
+
+
+async def _get_model_size_for_budget(
+    raw_tag: str,
+    *,
+    ollama_client: Any | None,
+) -> int | None:
+    """Resolve model size for VRAM budgeting (Ollama show or static metadata)."""
+    if ollama_client is not None:
+        return await _get_model_size(ollama_client, raw_tag)
+    return _estimate_model_size_bytes(raw_tag)
+
+
+async def _preload_single_model(
+    raw_tag: str,
+    *,
+    ollama_client: Any | None = None,
+    backend_client: Any | None = None,
+) -> bool:
+    """Preload one model via Ollama client or host-injected backend client."""
+    if ollama_client is not None:
+        try:
+            await ollama_client.post_generate(raw_tag, keep_alive=-1)
+            return True
+        except Exception as exc:
+            logger.error("Preload failed for model '%s': %s", raw_tag, exc)
+            return False
+
+    client = backend_client
+    if client is None and _backend_client_resolver is not None:
+        client = _backend_client_resolver()
+    if client is None:
+        return False
+    try:
+        loaded = await client.preload_models([raw_tag])
+        return raw_tag in loaded
+    except Exception as exc:
+        logger.error("Preload failed for model '%s': %s", raw_tag, exc)
+        return False
+
+
+def find_pool_entry(
+    pool: list[ModelPoolEntry],
+    model_name: str,
+    role: RouteRole,
+) -> ModelPoolEntry | None:
+    """Match hopper row for a routed model (blank code-tier row → session code model)."""
+    target = (model_name or "").strip()
+    for entry in pool:
+        if not entry.enabled:
+            continue
+        name = entry.model.strip()
+        # Blank rows stand in for the session model, but only on the code tier.
+        blank_code_row = not name and role == "code" and entry.tier == "code"
+        if name == target or blank_code_row:
+            return entry
+    return None
+
+
+def thinking_for_pool_tier(tier: RouteRole) -> bool:
+    return tier == "think"
+
+
+def resolve_pool_entry_thinking(entry: ModelPoolEntry) -> bool:
+    if entry.enable_thinking is not None:
+        return entry.enable_thinking
+    return thinking_for_pool_tier(entry.tier)
+
+
+def pool_thinking_for_model(model_name: str, pool: list[ModelPoolEntry]) -> bool | None:
+    """Effective ``think`` (explicit or tier default) for a model id; ``None`` if no enabled row matches."""
+    target = (model_name or "").strip()
+    if not target:
+        return None
+    for entry in pool:
+        if not entry.enabled:
+            continue
+        name = entry.model.strip()
+        if name and name == target:
+            return resolve_pool_entry_thinking(entry)
+    return None
+
+
+@dataclass
+class ResolvedModelPool:
+    fast: str
+    code: str
+    think: str | None
+
+
+def resolve_model_pool(
+    pool: list[ModelPoolEntry],
+    *,
+    session_code: str,
+    fallback_fast: str = "",
+    fallback_code: str | None = None,
+    fallback_think: str | None = None,
+) -> ResolvedModelPool:
+    """Resolve tier models: fallbacks win for fast/think (else first enabled row); last enabled code row wins."""
+    fast = fallback_fast.strip()
+    code = (fallback_code or "").strip() or session_code
+    think = (fallback_think or "").strip() or None
+    for entry in pool:
+        if not entry.enabled:
+            continue
+        name = entry.model.strip()
+        if entry.tier == "fast" and name and not fast:
+            fast = name
+        elif entry.tier == "code":
+            if name:
+                code = name
+            else:
+                code = session_code
+        elif entry.tier == "think" and name and not think:
+            think = name
+    return ResolvedModelPool(fast=fast, code=code, think=think)
+
+
+def pool_prefers_think(pool: list[ModelPoolEntry]) -> bool:
+    """True when the first enabled think entry appears before the first enabled code entry.
+
+    This reflects the user dragging think to the top of the hopper (highest priority).
+    """
+    think_idx: int | None = None
+    code_idx: int | None = None
+    for i, entry in enumerate(pool):
+        if not entry.enabled:
+            continue
+        if entry.tier == "think" and entry.model.strip() and think_idx is None:
+            think_idx = i
+        elif entry.tier == "code" and code_idx is None:
+            code_idx = i
+    if think_idx is None or code_idx is None:
+        return False
+    return think_idx < code_idx
+
+
+def _parse_env_bool(key: str) -> bool | None:
+    """Parse CODE_THINK / FAST_THINK from process env or local-llm config files."""
+    # Check process env first
+    val = os.environ.get(key, "").strip().lower()
+    if val in ("1", "true", "yes", "on"):
+        return True
+    if val in ("0", "false", "no", "off"):
+        return False
+    # Fall back to optional local-llm env files on disk
+    return _read_local_llm_env_bool(key)
+
+
+def _local_llm_env_paths() -> list:
+    """Candidate env files for tier think flags (last file wins)."""
+    from pathlib import Path
+
+    paths: list[Path] = []
+    explicit = os.environ.get("CECLI_LLM_ENV", "").strip()
+    if explicit:
+        paths.append(Path(explicit))
+    home = Path.home()
+    xdg = os.environ.get("XDG_CONFIG_HOME", "").strip()
+    config_home = Path(xdg) if xdg else home / ".config"
+    paths.append(config_home / "local-llm" / "env")
+    repo_root = os.environ.get("CECLI_REPO_ROOT", "").strip()
+    if repo_root:
+        paths.append(Path(repo_root) / "local-llm.env")
+    paths.append(Path.cwd() / "local-llm.env")
+    return paths
+
+
+def _read_local_llm_env_bool(key: str) -> bool | None:
+    """Read a boolean *key* from the local-llm env file chain (last match wins)."""
+    # Unreadable or undecodable files are skipped so config resolution never fails on them.
+    result: bool | None = None
+    for p in _local_llm_env_paths():
+        try:
+            if not p.is_file():
+                continue
+            for line in p.read_text().splitlines():
+                line = line.strip()
+                if line.startswith("#") or "=" not in line:
+                    continue
+                k, _, v = line.partition("=")
+                if k.strip() != key:
+                    continue
+                v = v.strip().strip("'\"").lower()
+                if v in ("1", "true", "yes", "on"):
+                    result = True
+                elif v in ("0", "false", "no", "off"):
+                    result = False
+        except (OSError, UnicodeError):
+            continue
+    return result
+
+
+def _apply_env_think_to_pool(pool: list[ModelPoolEntry]) -> None:
+    """Override pool enable_thinking from per-slot or tier-level env vars.
+
+    Resolution order (highest priority first):
+      1. Per-slot: ``CODE_MODEL_THINK=1`` (slot 0), ``CODE_MODEL_1_THINK=1`` (slot 1)
+      2. Tier-level: ``CODE_THINK=1`` (applies to all slots in tier without per-slot override)
+
+    The frontend may send stale localStorage values; the env file is authoritative.
+    """
+    code_think = _parse_env_bool("CODE_THINK")
+    fast_think = _parse_env_bool("FAST_THINK")
+
+    # Per-slot overrides: {TIER}_MODEL_THINK (slot 0), {TIER}_MODEL_{N}_THINK (slots 1-9)
+    slot_think: dict[tuple[str, int], bool | None] = {}
+    for tier_prefix in ("FAST", "CODE", "THINK"):
+        # Slot 0: {TIER}_MODEL_THINK
+        val = _parse_env_bool(f"{tier_prefix}_MODEL_THINK")
+        if val is not None:
+            slot_think[(tier_prefix.lower(), 0)] = val
+        # Slots 1-9: {TIER}_MODEL_{N}_THINK
+        for n in range(1, 10):
+            val = _parse_env_bool(f"{tier_prefix}_MODEL_{n}_THINK")
+            if val is not None:
+                slot_think[(tier_prefix.lower(), n)] = val
+
+    if code_think is None and fast_think is None and not slot_think:
+        return
+
+    for entry in pool:
+        if not entry.enabled:
+            continue
+        # Determine slot index from priority_rank or default to 0
+        slot_idx = entry.priority_rank if entry.priority_rank is not None else 0
+        tier = entry.tier
+
+        # Per-slot override takes priority
+        per_slot = slot_think.get((tier, slot_idx))
+        if per_slot is not None:
+            entry.enable_thinking = per_slot
+        elif tier == "code" and code_think is not None:
+            entry.enable_thinking = code_think
+        elif tier == "fast" and fast_think is not None:
+            entry.enable_thinking = fast_think
+
+
+@dataclass
+class RouteTurnContext:
+    agent_cmd: bool = False
+    implement_turn: bool = False
+    inject_todo_spec: bool = False
+    spec_gen_turn: bool = False
+    exploration_aborted: bool = False
+
+
+_BACKEND_PROVIDER_PREFIXES: dict[str, str] = {
+    "ollama": "ollama_chat/",
+    "vllm": "openai/",
+    "tgi": "openai/",
+    "llamacpp": "openai/",
+    "mlx-lm": "openai/",
+}
+
+
+def resolve_provider_prefix(backend: str) -> str:
+    """Map a backend name to its LiteLLM provider prefix.
+
+    Defaults to ``ollama_chat/`` for unknown backends.
+    """
+    return _BACKEND_PROVIDER_PREFIXES.get((backend or "").strip().lower(), "ollama_chat/")
+
+
+def inject_backend_extra_params(backend: str, extra_params: dict[str, object] | None) -> dict[str, object]:
+    """Merge ``LITELLM_EXTRA_PARAMS`` for non-Ollama backends.
+
+    Ollama uses its own env wiring; other backends may need auth headers or base URLs
+    via JSON in ``LITELLM_EXTRA_PARAMS``. Env keys override same-named *extra_params* keys.
+    """
+    merged: dict[str, object] = dict(extra_params or {})
+    name = (backend or "").strip().lower()
+    if name in ("", "ollama"):
+        return merged
+    raw = os.environ.get("LITELLM_EXTRA_PARAMS", "").strip()
+    if not raw:
+        return merged
+    try:
+        parsed = json.loads(raw)
+    except json.JSONDecodeError:
+        logger.warning("LITELLM_EXTRA_PARAMS is not valid JSON — ignoring for backend %s", name)
+        return merged
+    if isinstance(parsed, dict):
+        merged.update(parsed)
+    return merged
+
+
+@dataclass
+class ModelRouterConfig:
+    """Resolved fast/code/think models plus the thresholds used to route a prompt."""
+
+    enabled: bool = False
+    fast_model: str = ""
+    heavy_model: str | None = None
+    code_model: str | None = None
+    think_model: str | None = None
+    model_pool: list[ModelPoolEntry] = field(default_factory=list)
+    token_fast_max: int = 4_096
+    token_heavy_min: int = 12_000
+    keep_alive_fast: int | str = 300
+    keep_alive_heavy: int | str = -1
+    escalate_on_failure: bool = True
+    prefer_think: bool = False
+    # Global priority list of model tags in priority order (index 0 = highest).
+    priority_list: list[str] = field(default_factory=list)
+    backend: str = "ollama"
+    provider_prefix: str = "ollama_chat/"
+
+    def __post_init__(self) -> None:
+        # heavy_model is the fallback for code_model; provider_prefix is derived from backend.
+        self.keep_alive_heavy = normalize_keep_alive_for_tier("code", self.keep_alive_heavy)
+        if not self.code_model and self.heavy_model:
+            self.code_model = self.heavy_model
+        self.provider_prefix = resolve_provider_prefix(self.backend)
+
+    @property
+    def resolved_code_model(self) -> str:
+        """Code-tier model, falling back to heavy, then fast, then ``""``."""
+        return (self.code_model or self.heavy_model or self.fast_model or "").strip()
+
+    @property
+    def resolved_think_model(self) -> str | None:
+        """Think-tier model name, or None when it is unset or blank."""
+        return (self.think_model or "").strip() or None
+
+    @classmethod
+    def from_payload(cls, raw: dict[str, Any] | None) -> ModelRouterConfig | None:
+        """Build a config from a payload dict (None when empty or no fast model resolves).
+
+        A falsy ``enabled`` yields a disabled config. Non-dict or unknown-tier pool rows
+        are skipped; an unparseable ``priority_rank`` leaves that row unranked.
+        """
+        if not raw:
+            return None
+        if not raw.get("enabled"):
+            return cls(enabled=False)
+        pool_raw = raw.get("model_pool") or []
+        pool: list[ModelPoolEntry] = []
+        if isinstance(pool_raw, list):
+            for item in pool_raw:
+                if not isinstance(item, dict):
+                    continue
+                tier = normalize_pool_tier(str(item.get("tier") or ""))
+                if tier is None:
+                    continue
+                raw_rank = item.get("priority_rank")
+                try:
+                    priority_rank = int(raw_rank) if raw_rank is not None else None
+                except (TypeError, ValueError, OverflowError):
+                    priority_rank = None  # malformed rank: keep the row, unranked
+                pool.append(
+                    ModelPoolEntry(
+                        model=str(item.get("model") or ""),
+                        tier=tier,
+                        enabled=bool(item.get("enabled", True)),
+                        enable_thinking=item.get("enable_thinking"),
+                        extra_params=_parse_pool_extra_params(item.get("extra_params")),
+                        priority_rank=priority_rank,
+                        prefer_secondary=bool(item.get("prefer_secondary", False)),
+                        capabilities=_parse_capabilities(item.get("capabilities")),
+                    )
+                )
+        fallback_fast = str(raw.get("fast_model") or "").strip()
+        fallback_code = (
+            str(raw.get("code_model") or raw.get("heavy_model") or "").strip() or None
+        )
+        fallback_think = str(raw.get("think_model") or "").strip() or None
+        if pool:
+            resolved = resolve_model_pool(
+                pool,
+                session_code=fallback_code or fallback_fast,
+                fallback_fast=fallback_fast,
+                fallback_code=fallback_code,
+                fallback_think=fallback_think,
+            )
+            fast, code, think = resolved.fast, resolved.code, resolved.think
+        else:
+            fast = fallback_fast
+            code = fallback_code or fallback_fast
+            think = fallback_think
+        if not fast:
+            return None
+        # Override pool enable_thinking from env (CODE_THINK / FAST_THINK) —
+        # the frontend may send stale localStorage values.
+        _apply_env_think_to_pool(pool)
+        # Global priority list: keep non-blank model tags, stripped, in payload order.
+        priority_list_raw = raw.get("priority_list")
+        priority_list: list[str] = []
+        if isinstance(priority_list_raw, list):
+            priority_list = [s for s in (str(tag).strip() for tag in priority_list_raw) if s]
+        prefer_think = raw.get("prefer_think")
+        if prefer_think is None:
+            prefer_think = pool_prefers_think(pool)
+        return cls(
+            enabled=True,
+            fast_model=fast,
+            heavy_model=code or None,
+            code_model=code or None,
+            think_model=think,
+            model_pool=pool,
+            token_fast_max=int(raw.get("token_fast_max") or 4_096),
+            token_heavy_min=int(raw.get("token_heavy_min") or 12_000),
+            keep_alive_fast=raw.get("keep_alive_fast", 300),
+            keep_alive_heavy=normalize_keep_alive_for_tier(
+                "code", raw.get("keep_alive_heavy", -1)
+            ),
+            escalate_on_failure=bool(raw.get("escalate_on_failure", True)),
+            prefer_think=bool(prefer_think),
+            priority_list=priority_list,
+        )
+
+
+@dataclass
+class RouteDecision:
+    tier: RouteTier
+    model_name: str
+    estimated_tokens: int
+    reasons: list[str] = field(default_factory=list)
+    role: RouteRole = "code"
+    enable_thinking: bool | None = None
+    # Rank of the chosen model in the global priority list (0 = highest); None for single-model tiers.
+    priority_rank: int | None = None
+    # Copy of the config's priority_list taken at decision time; None when not applicable.
+    priority_list_snapshot: list[str] | None = None
+    # True when the chosen model is not currently resident in Ollama memory (cold-start swap).
+    swap: bool = False
+
+
+def thinking_for_role(
+    role: RouteRole,
+    model_name: str,
+    *,
+    pool: list[ModelPoolEntry] | None = None,
+) -> bool | None:
+    """Resolve the LiteLLM ``think`` flag for a route.
+
+    An explicit per-model pool setting wins; otherwise "think" enables it,
+    "fast"/"code" disable it, and any other role gives None.
+    """
+    override = pool_thinking_for_model(model_name, pool) if pool else None
+    if override is not None:
+        return override
+    role_defaults = {"think": True, "fast": False, "code": False}
+    return role_defaults.get(role)
+
+
+def estimate_message_tokens(
+    user_message: str,
+    *,
+    message_token_count: int | None = None,
+) -> int:
+    """Token estimate for the user message alone: a positive supplied count, else a heuristic."""
+    # Fallback heuristic: one token per four characters, never below 32.
+    supplied = message_token_count is not None and message_token_count > 0
+    return message_token_count if supplied else max(32, len(user_message) // 4)
+
+
+def estimate_prompt_tokens(
+    user_message: str,
+    *,
+    files_in_chat: int = 0,
+    message_token_count: int | None = None,
+) -> int:
+    """Rough context size: the message estimate plus a capped per-file token bump."""
+    file_count = files_in_chat if files_in_chat > 0 else 0
+    file_bump = min(file_count * _FILE_TOKEN_PER_FILE, _FILE_TOKEN_CAP)
+    return file_bump + estimate_message_tokens(user_message, message_token_count=message_token_count)
+
+
+@lru_cache(maxsize=64)
+def lookup_model_max_input_tokens(model_name: str) -> int | None:
+    """Look up ``max_input_tokens`` for a model id via ``model_info_manager``; None if unknown."""
+    key = (model_name or "").strip()
+    if not key:
+        return None
+    # Best-effort: a failed import, lookup, or int() conversion all mean "unknown".
+    try:
+        from cecli.models import model_info_manager
+
+        limit = int((model_info_manager.get_model_info(key) or {}).get("max_input_tokens") or 0)
+    except Exception:
+        return None
+    return limit if limit > 0 else None
+
+
+def context_exceeds_fast_model_limit(
+    context_tokens: int,
+    fast_model_name: str,
+    *,
+    fast_max_input: int | None = None,
+    output_reserve: int = _FAST_CONTEXT_OUTPUT_RESERVE,
+) -> tuple[bool, int | None]:
+    """
+    Report whether ``context_tokens`` plus ``output_reserve`` overflows the fast model.
+
+    Returns ``(exceeds, limit)``. ``limit`` is ``fast_max_input`` when given, else the
+    metadata lookup; a non-positive context or unknown limit gives ``(False, None)``.
+    """
+    if context_tokens > 0:
+        limit = fast_max_input
+        if limit is None:
+            limit = lookup_model_max_input_tokens(fast_model_name)
+        if limit is not None:
+            return context_tokens + output_reserve > limit, limit
+    return False, None
+
+
+def _pick_think_model(
+    router: ModelRouterConfig,
+    *,
+    reasons: list[str],
+) -> tuple[RouteRole, str]:
+    """Return ("think", model) when a think model is configured, else ("code", code model)."""
+    think_model = router.resolved_think_model
+    if not think_model:
+        reasons.append("think_unconfigured→code")
+    return ("think", think_model) if think_model else ("code", router.resolved_code_model)
+
+
+def _has_multi_model_tier(pool: list[ModelPoolEntry], tier: RouteRole) -> bool:
+    """Whether at least two enabled `tier` entries in the pool carry a priority_rank."""
+    ranked_count = sum(1 for e in pool if e.enabled and e.tier == tier and e.priority_rank is not None)
+    return ranked_count >= 2
+
+
+def _apply_multi_model_routing(
+    role: RouteRole,
+    model_name: str,
+    *,
+    router: ModelRouterConfig,
+    display_tokens: int,
+    reasons: list[str],
+    resident_models: set[str] | None = None,
+    require_vision: bool = False,
+    context_tokens: int | None = None,
+) -> RouteDecision:
+    """Finish a route decision, consulting the pool for multi-model tiers.
+
+    When `role` has two or more enabled, ranked pool entries, pick_tier_model
+    chooses the model; otherwise `model_name` from the caller is used unchanged.
+    """
+    pool = router.model_pool
+    if not (pool and _has_multi_model_tier(pool, role)):
+        # Single-model tier: no rank, no snapshot, no swap.
+        return _finish_decision(
+            role,
+            model_name,
+            router=router,
+            display_tokens=display_tokens,
+            reasons=reasons,
+        )
+
+    picked, needs_swap = pick_tier_model(
+        pool, role,
+        resident_models=resident_models,
+        require_vision=require_vision,
+        context_tokens=context_tokens,
+        prefer_warm=True,
+    )
+    # Rank comes from the first enabled pool row matching the picked model and tier.
+    rank = next(
+        (e.priority_rank for e in pool if e.enabled and e.model == picked and e.tier == role),
+        None,
+    )
+    snapshot = list(router.priority_list) if router.priority_list else None
+
+    if require_vision:
+        reasons.append("vision_required")
+    if context_tokens and context_tokens > 0:
+        # Note when the pick is not the first model resolve_tier_models returns.
+        tier_models = resolve_tier_models(pool, role)
+        if tier_models and picked != tier_models[0].model:
+            reasons.append(f"context_fallback:{picked.split('/')[-1]}")
+
+    return _finish_decision(
+        role,
+        picked,
+        router=router,
+        display_tokens=display_tokens,
+        reasons=reasons,
+        priority_rank=rank,
+        priority_list_snapshot=snapshot,
+        swap=needs_swap,
+    )
+
+
+def _finish_decision(
+    role: RouteRole,
+    model_name: str,
+    *,
+    router: ModelRouterConfig,
+    display_tokens: int,
+    reasons: list[str],
+    priority_rank: int | None = None,
+    priority_list_snapshot: list[str] | None = None,
+    swap: bool = False,
+) -> RouteDecision:
+    """Assemble the RouteDecision, deriving the legacy tier and thinking flag from `role`."""
+    return RouteDecision(
+        tier=role_to_legacy_tier(role),
+        role=role,
+        model_name=model_name,
+        estimated_tokens=display_tokens,
+        reasons=reasons,
+        enable_thinking=thinking_for_role(role, model_name, pool=router.model_pool),
+        priority_rank=priority_rank,
+        priority_list_snapshot=priority_list_snapshot,
+        swap=swap,
+    )
+
+
+def classify_prompt(
+    user_message: str,
+    *,
+    message_tokens: int,
+    router: ModelRouterConfig,
+    code_model_name: str | None = None,
+    think_model_name: str | None = None,
+    context_tokens: int | None = None,
+    force_tier: RouteTier | None = None,
+    turn: RouteTurnContext | None = None,
+    # Back-compat for tests calling estimated_tokens=
+    estimated_tokens: int | None = None,
+    heavy_model_name: str | None = None,
+    fast_max_input: int | None = None,
+    resident_models: set[str] | None = None,
+    has_images: bool = False,
+) -> RouteDecision:
+    """Choose the fast / code / think route for one user turn.
+
+    Rules are tried in order and the first match returns: forced tier, turn-context
+    flags, ``/agent``, live context overflowing the fast model, large message, think
+    keyword, fast keyword, strong code task, small message, then the code default.
+    The decision's ``reasons`` records which rule fired. On overflow, another fast
+    pool model is used only if it fits the context plus the completion reserve.
+    """
+    if estimated_tokens is not None and context_tokens is None:
+        context_tokens = estimated_tokens
+    display_tokens = context_tokens if context_tokens is not None else message_tokens
+    ctx = turn or RouteTurnContext()
+    code = (code_model_name or heavy_model_name or router.resolved_code_model).strip()
+    think = (think_model_name or router.resolved_think_model or "").strip() or None
+
+    # Common kwargs for all _apply_multi_model_routing calls in this function.
+    def _route(role: RouteRole, model: str, *, reasons: list[str]) -> RouteDecision:
+        return _apply_multi_model_routing(
+            role, model,
+            router=router,
+            display_tokens=display_tokens,
+            reasons=reasons,
+            resident_models=resident_models,
+            require_vision=has_images,
+            context_tokens=context_tokens,
+        )
+
+    forced = normalize_route_role(force_tier)
+    if forced:
+        if forced == "think" and not think:
+            forced = "code"
+        model = {
+            "fast": router.fast_model,
+            "code": code,
+            "think": think or code,
+        }[forced]
+        return _route(forced, model, reasons=[f"forced:{forced}"])
+
+    reasons: list[str] = []
+
+    # With prefer_think and a think model, go to think; otherwise to the given tier.
+    def _think_if_preferred(role: RouteRole, model: str) -> RouteDecision:
+        if router.prefer_think and think:
+            reasons.append("prefer_think")
+            return _route("think", think, reasons=reasons)
+        return _route(role, model, reasons=reasons)
+
+    if ctx.implement_turn or ctx.agent_cmd:
+        tag = "implement_turn" if ctx.implement_turn else "agent_cmd"
+        reasons.append(tag)
+        return _route("code", code, reasons=reasons)
+
+    if ctx.inject_todo_spec and not ctx.implement_turn:
+        reasons.append("inject_todo_spec")
+        role, model = _pick_think_model(router, reasons=reasons)
+        return _route(role, model, reasons=reasons)
+
+    if ctx.spec_gen_turn:
+        reasons.append("spec_gen")
+        role, model = _pick_think_model(router, reasons=reasons)
+        return _route(role, model, reasons=reasons)
+
+    if ctx.exploration_aborted:
+        reasons.append("exploration_aborted")
+        role, model = _pick_think_model(router, reasons=reasons)
+        return _route(role, model, reasons=reasons)
+
+    if re.search(r"/agent\b", user_message, re.IGNORECASE):
+        reasons.append("slash:/agent")
+        return _route("code", code, reasons=reasons)
+
+    if context_tokens is not None and context_tokens > 0:
+        exceeds_fast, fast_limit = context_exceeds_fast_model_limit(
+            context_tokens, router.fast_model, fast_max_input=fast_max_input
+        )
+        if exceeds_fast and fast_limit is not None:
+            overflow = (
+                f"context_tokens>={fast_limit - _FAST_CONTEXT_OUTPUT_RESERVE} "
+                f"(fast_max={fast_limit})"
+            )
+            # Multi-model pool: stay in the fast tier only if another fast model holds the
+            # context plus the same completion reserve the overflow check above applied.
+            needed = context_tokens + _FAST_CONTEXT_OUTPUT_RESERVE
+            pool = router.model_pool
+            if pool:
+                fast_models = resolve_tier_models(pool, "fast")
+                fast_fits = [m for m in fast_models if m.max_context is not None and m.max_context >= needed]
+                if fast_fits:
+                    reasons.append(f"{overflow} but fast pool has larger model")
+                    return _route("fast", fast_fits[0].model, reasons=reasons)
+
+            reasons.append(overflow)
+            return _think_if_preferred("code", code)
+
+    if message_tokens >= router.token_heavy_min:
+        reasons.append(f"msg_tokens>={router.token_heavy_min}")
+        if _CODE_TASK_STRONG.search(user_message) and not router.prefer_think:
+            return _route("code", code, reasons=reasons)
+        role, model = _pick_think_model(router, reasons=reasons)
+        return _route(role, model, reasons=reasons)
+
+    think_hit = _THINK_PATTERNS.search(user_message)
+    fast_hit = _FAST_PATTERNS.search(user_message)
+    code_task = _CODE_TASK_STRONG.search(user_message) is not None
+
+    if think_hit:
+        reasons.append(f"keyword:{think_hit.group(0).lower()}")
+        role, model = _pick_think_model(router, reasons=reasons)
+        return _route(role, model, reasons=reasons)
+
+    if fast_hit and not router.prefer_think:
+        reasons.append(f"keyword:{fast_hit.group(0).lower()}")
+        return _route("fast", router.fast_model, reasons=reasons)
+
+    if code_task:
+        reasons.append("code_task")
+        return _think_if_preferred("code", code)
+
+    if message_tokens < router.token_fast_max:
+        reasons.append(f"msg_tokens<{router.token_fast_max}")
+        return _think_if_preferred("fast", router.fast_model)
+
+    reasons.append("default_code")
+    return _think_if_preferred("code", code)
+
+_CONTEXT_LIMIT_RE = re.compile(
+    r"exceeds the\s+[\d,]+\s+token limit",
+    re.IGNORECASE,
+)
+
+
+def should_escalate_fast_turn(
+    decision: RouteDecision,
+    *,
+    router: ModelRouterConfig,
+    user_message: str,
+    edited_files: list[str],
+    assistant_text: str,
+    had_tool_error: bool = False,
+    tool_error_text: str = "",
+) -> bool:
+    """Whether a fast-tier turn should escalate.
+
+    Never escalates when escalation is disabled, the route was not "fast", or
+    files were edited.
+    """
+    role = decision.role if decision.role else normalize_route_role(decision.tier) or "code"
+    if role != "fast" or not router.escalate_on_failure or edited_files:
+        return False
+    wants_code = _CODE_TASK_STRONG.search(user_message) is not None
+    if had_tool_error:
+        # A context-limit tool error always escalates; other tool errors only for code tasks.
+        return bool(_CONTEXT_LIMIT_RE.search(tool_error_text)) or wants_code
+    # No tool error: escalate a code task only when the reply is at most 400 characters.
+    return wants_code and len(assistant_text.strip()) <= 400
+
+
+def should_escalate_code_turn(
+    decision: RouteDecision,
+    *,
+    router: ModelRouterConfig,
+    user_message: str,
+    edited_files: list[str],
+    assistant_text: str,
+    had_tool_error: bool = False,
+) -> bool:
+    """Offer the think tier when the code model stalled on a reasoning-heavy prompt."""
+    role = decision.role if decision.role else normalize_route_role(decision.tier) or "code"
+    # Preconditions: escalation on, a "code" route, a think model configured, nothing edited.
+    eligible = (
+        router.escalate_on_failure
+        and role == "code"
+        and bool(router.resolved_think_model)
+        and not edited_files
+    )
+    if not eligible or not _THINK_PATTERNS.search(user_message):
+        return False
+    # Think-pattern prompt: escalate after a tool error or a reply under 400 characters.
+    return bool(had_tool_error) or len(assistant_text.strip()) < 400
+
+
+def escalation_target(decision: RouteDecision | None) -> RouteRole:
+    """Next tier to try when auto-escalating after a failed attempt.
+
+    A "code" decision escalates to "think"; a missing decision and every other
+    role map to "code".
+    """
+    if decision is None:
+        return "code"
+    current = decision.role if decision.role else normalize_route_role(decision.tier) or "code"
+    return "think" if current == "code" else "code"
diff --git a/tests/basic/test_sessions.py b/tests/basic/test_sessions.py
index 7fafd220963..1c79b34d43f 100644
--- a/tests/basic/test_sessions.py
+++ b/tests/basic/test_sessions.py
@@ -73,7 +73,7 @@ def session_manager(mock_coder):
 async def test_load_session_quiet_skips_tool_error_on_invalid_json(
     session_manager, mock_coder, tmp_path
 ):
-    """BrightVision auto-load uses quiet=True when restore is best-effort."""
+    """Headless hosts may auto-load with quiet=True when restore is best-effort."""
     session_dir = tmp_path / ".cecli" / "sessions"
     os.makedirs(session_dir, exist_ok=True)
     mock_coder.abs_root_path.side_effect = lambda x: str(tmp_path / x)
diff --git a/tests/helpers/monorepo/LOCAL_WORKSPACE.md b/tests/helpers/monorepo/LOCAL_WORKSPACE.md
index dd4311195cb..ff9fc3fce91 100644
--- a/tests/helpers/monorepo/LOCAL_WORKSPACE.md
+++ b/tests/helpers/monorepo/LOCAL_WORKSPACE.md
@@ -6,7 +6,7 @@ Extends cecli’s existing **clone** workspace mode (`repo:` URLs under `~/.cecl
 
 ## Motivation
 
-IDE clients (e.g. BrightVision) open a **primary git repo** but need agent context across **sibling repos** without cloning into `~/.cecli/workspaces/`. Submodule-only setups are a different layout; this PR adds an explicit, reviewable config surface.
+IDE clients (e.g. desktop agents with a primary git root) open a **primary git repo** but need agent context across **sibling repos** without cloning into `~/.cecli/workspaces/`. Submodule-only setups are a different layout; this PR adds an explicit, reviewable config surface.
 
 ## Config
 
diff --git a/tests/hopper/test_model_pool.py b/tests/hopper/test_model_pool.py
new file mode 100644
index 00000000000..14e2a0db1de
--- /dev/null
+++ b/tests/hopper/test_model_pool.py
@@ -0,0 +1,30 @@
+from cecli.hopper.router import ModelPoolEntry, resolve_model_pool
+
+
+def test_resolve_pool_priority_order():
+    pool = [
+        ModelPoolEntry(model="ollama_chat/fast-a", tier="fast", enabled=False),
+        ModelPoolEntry(model="ollama_chat/fast-b", tier="fast", enabled=True),
+        ModelPoolEntry(model="ollama_chat/code-x", tier="code", enabled=True),
+    ]
+    resolved = resolve_model_pool(
+        pool,
+        session_code="ollama_chat/session",
+        fallback_fast="",
+        fallback_code=None,
+    )
+    assert resolved.fast == "ollama_chat/fast-b"
+    assert resolved.code == "ollama_chat/code-x"
+
+
+def test_empty_code_row_uses_session():
+    pool = [
+        ModelPoolEntry(model="ollama_chat/fast", tier="fast", enabled=True),
+        ModelPoolEntry(model="", tier="code", enabled=True),
+    ]
+    resolved = resolve_model_pool(
+        pool,
+        session_code="ollama_chat/session",
+    )
+    assert resolved.fast == "ollama_chat/fast"
+    assert resolved.code == "ollama_chat/session"
diff --git a/tests/hopper/test_model_router.py b/tests/hopper/test_model_router.py
new file mode 100644
index 00000000000..0b1a7c94ac5
--- /dev/null
+++ b/tests/hopper/test_model_router.py
@@ -0,0 +1,611 @@
+from cecli.hopper.router import (
+    ModelPoolEntry,
+    ModelRouterConfig,
+    RouteTurnContext,
+    classify_prompt,
+    context_exceeds_fast_model_limit,
+    escalation_target,
+    estimate_message_tokens,
+    estimate_prompt_tokens,
+    resolve_model_pool,
+    should_escalate_code_turn,
+    should_escalate_fast_turn,
+    thinking_for_role,
+)
+
+
+def test_from_payload_normalizes_heavy_keep_alive_zero():
+    cfg = ModelRouterConfig.from_payload(
+        {
+            "enabled": True,
+            "fast_model": "ollama_chat/small",
+            "heavy_model": "ollama_chat/big",
+            "keep_alive_heavy": 0,
+        }
+    )
+    assert cfg is not None
+    assert cfg.keep_alive_heavy == -1
+
+
+def test_from_payload_think_and_code_models():
+    cfg = ModelRouterConfig.from_payload(
+        {
+            "enabled": True,
+            "fast_model": "ollama_chat/fast",
+            "code_model": "ollama_chat/code",
+            "think_model": "ollama_chat/think",
+            "model_pool": [
+                {"model": "ollama_chat/fast", "tier": "fast", "enabled": True},
+                {"model": "ollama_chat/code", "tier": "code", "enabled": True},
+                {"model": "ollama_chat/think", "tier": "think", "enabled": True},
+            ],
+        }
+    )
+    assert cfg is not None
+    assert cfg.resolved_code_model == "ollama_chat/code"
+    assert cfg.resolved_think_model == "ollama_chat/think"
+
+
+def test_classify_low_tokens_fast_keyword():
+    router = ModelRouterConfig(
+        enabled=True,
+        fast_model="ollama_chat/small",
+        code_model="ollama_chat/big",
+    )
+    d = classify_prompt(
+        "Rename the button label to Save",
+        message_tokens=500,
+        router=router,
+        code_model_name="ollama_chat/big",
+    )
+    assert d.role == "fast"
+    assert d.model_name == "ollama_chat/small"
+    assert d.enable_thinking is False
+
+
+def test_classify_architect_think():
+    router = ModelRouterConfig(
+        enabled=True,
+        fast_model="ollama_chat/small",
+        code_model="ollama_chat/code",
+        think_model="ollama_chat/think",
+    )
+    d = classify_prompt(
+        "Refactor the race condition in the session pool",
+        message_tokens=800,
+        router=router,
+        code_model_name="ollama_chat/code",
+    )
+    assert d.role == "think"
+    assert d.model_name == "ollama_chat/think"
+    assert d.enable_thinking is True
+
+
+def test_classify_architect_falls_back_to_code_without_think():
+    router = ModelRouterConfig(
+        enabled=True,
+        fast_model="ollama_chat/small",
+        code_model="ollama_chat/code",
+    )
+    d = classify_prompt(
+        "Refactor the race condition in the session pool",
+        message_tokens=800,
+        router=router,
+        code_model_name="ollama_chat/code",
+    )
+    assert d.role == "code"
+    assert "think_unconfigured" in " ".join(d.reasons)
+
+
+def test_classify_agent_command_code():
+    router = ModelRouterConfig(
+        enabled=True,
+        fast_model="ollama_chat/small",
+        code_model="ollama_chat/big",
+    )
+    d = classify_prompt(
+        "/agent explore the repo and update the checklist",
+        message_tokens=400,
+        router=router,
+        code_model_name="ollama_chat/big",
+    )
+    assert d.role == "code"
+    assert "slash:/agent" in d.reasons
+
+
+def test_classify_implement_turn_code():
+    router = ModelRouterConfig(
+        enabled=True,
+        fast_model="ollama_chat/small",
+        code_model="ollama_chat/code",
+        think_model="ollama_chat/think",
+    )
+    d = classify_prompt(
+        "Implement only implementation task 1.2 per the injected spec.",
+        message_tokens=400,
+        router=router,
+        code_model_name="ollama_chat/code",
+        turn=RouteTurnContext(implement_turn=True),
+    )
+    assert d.role == "code"
+    assert d.model_name == "ollama_chat/code"
+
+
+def test_classify_inject_todo_spec_think():
+    router = ModelRouterConfig(
+        enabled=True,
+        fast_model="ollama_chat/small",
+        code_model="ollama_chat/code",
+        think_model="ollama_chat/think",
+    )
+    d = classify_prompt(
+        "Continue planning the auth module",
+        message_tokens=400,
+        router=router,
+        code_model_name="ollama_chat/code",
+        turn=RouteTurnContext(inject_todo_spec=True),
+    )
+    assert d.role == "think"
+
+
+def test_classify_high_message_tokens_think_without_code_task():
+    router = ModelRouterConfig(
+        enabled=True,
+        fast_model="ollama_chat/small",
+        code_model="ollama_chat/code",
+        think_model="ollama_chat/think",
+        token_heavy_min=12_000,
+    )
+    d = classify_prompt(
+        "summarize the session so far",
+        message_tokens=15_000,
+        router=router,
+        code_model_name="ollama_chat/code",
+    )
+    assert d.role == "think"
+    assert "msg_tokens>=" in d.reasons[0]
+
+
+def test_classify_high_message_tokens_code_task():
+    router = ModelRouterConfig(
+        enabled=True,
+        fast_model="ollama_chat/small",
+        code_model="ollama_chat/code",
+        think_model="ollama_chat/think",
+        token_heavy_min=12_000,
+    )
+    d = classify_prompt(
+        "implement the whole module",
+        message_tokens=15_000,
+        router=router,
+        code_model_name="ollama_chat/code",
+    )
+    assert d.role == "code"
+
+
+def test_files_in_chat_do_not_force_code_when_under_fast_window():
+    router = ModelRouterConfig(
+        enabled=True,
+        fast_model="ollama_chat/small",
+        code_model="ollama_chat/big",
+    )
+    msg = "I'd like to add @ references like we have for /add with chips"
+    message_tokens = estimate_message_tokens(msg)
+    context_tokens = estimate_prompt_tokens(msg, files_in_chat=4)
+    assert context_tokens > message_tokens
+    assert message_tokens < router.token_fast_max
+    assert not context_exceeds_fast_model_limit(
+        context_tokens, router.fast_model, fast_max_input=32_768
+    )[0]
+    d = classify_prompt(
+        msg,
+        message_tokens=message_tokens,
+        context_tokens=context_tokens,
+        router=router,
+        code_model_name="ollama_chat/big",
+    )
+    assert d.role == "fast"
+    assert d.estimated_tokens == context_tokens
+
+
+def test_context_exceeds_fast_model_limit():
+    exceeds, limit = context_exceeds_fast_model_limit(
+        17_670,
+        "ollama_chat/deepseek-coder:6.7b",
+        fast_max_input=16_384,
+    )
+    assert exceeds is True
+    assert limit == 16_384
+    fits, _ = context_exceeds_fast_model_limit(
+        10_000,
+        "ollama_chat/deepseek-coder:6.7b",
+        fast_max_input=16_384,
+    )
+    assert fits is False
+
+
+def test_classify_routes_code_when_context_exceeds_fast_window():
+    router = ModelRouterConfig(
+        enabled=True,
+        fast_model="ollama_chat/deepseek-coder:6.7b",
+        code_model="ollama_chat/qwen3.6:27b-q4_K_M",
+    )
+    msg = "tweak the chat panel label"
+    message_tokens = estimate_message_tokens(msg)
+    assert message_tokens < router.token_fast_max
+    d = classify_prompt(
+        msg,
+        message_tokens=message_tokens,
+        context_tokens=17_670,
+        router=router,
+        code_model_name="ollama_chat/qwen3.6:27b-q4_K_M",
+        fast_max_input=16_000,
+    )
+    assert d.role == "code"
+    assert d.model_name == "ollama_chat/qwen3.6:27b-q4_K_M"
+    assert any("fast_max=" in r for r in d.reasons)
+
+
+def test_fast_keyword_loses_to_context_overflow():
+    router = ModelRouterConfig(
+        enabled=True,
+        fast_model="ollama_chat/deepseek-coder:6.7b",
+        code_model="ollama_chat/big",
+    )
+    d = classify_prompt(
+        "Rename the button label to Save",
+        message_tokens=200,
+        context_tokens=20_000,
+        router=router,
+        code_model_name="ollama_chat/big",
+        fast_max_input=16_000,
+    )
+    assert d.role == "code"
+
+
+def test_code_task_middle_band_defaults_code():
+    router = ModelRouterConfig(
+        enabled=True,
+        fast_model="ollama_chat/small",
+        code_model="ollama_chat/big",
+    )
+    d = classify_prompt(
+        "implement the login form",
+        message_tokens=800,
+        router=router,
+        code_model_name="ollama_chat/big",
+    )
+    assert d.role == "code"
+    assert "code_task" in d.reasons
+
+
+def test_escalate_when_fast_no_edits():
+    router = ModelRouterConfig(enabled=True, fast_model="a", code_model="b")
+    decision = classify_prompt(
+        "implement the login form",
+        message_tokens=800,
+        router=router,
+        code_model_name="b",
+        force_tier="fast",
+    )
+    assert should_escalate_fast_turn(
+        decision,
+        router=router,
+        user_message="implement the login form",
+        edited_files=[],
+        assistant_text="ok",
+    )
+
+
+def test_escalate_code_to_think():
+    router = ModelRouterConfig(
+        enabled=True,
+        fast_model="a",
+        code_model="b",
+        think_model="c",
+    )
+    decision = classify_prompt(
+        "Refactor the auth layer",
+        message_tokens=800,
+        router=router,
+        code_model_name="b",
+        force_tier="code",
+    )
+    assert should_escalate_code_turn(
+        decision,
+        router=router,
+        user_message="Refactor the auth layer",
+        edited_files=[],
+        assistant_text="Here is a plan",
+    )
+
+
+def test_escalation_target_chain():
+    fast = classify_prompt(
+        "fix",
+        message_tokens=100,
+        router=ModelRouterConfig(enabled=True, fast_model="a", code_model="b", think_model="c"),
+        code_model_name="b",
+        force_tier="fast",
+    )
+    assert escalation_target(fast) == "code"
+    code = classify_prompt(
+        "fix",
+        message_tokens=100,
+        router=ModelRouterConfig(enabled=True, fast_model="a", code_model="b", think_model="c"),
+        code_model_name="b",
+        force_tier="code",
+    )
+    assert escalation_target(code) == "think"
+
+
+def test_escalate_on_context_limit_tool_error():
+    router = ModelRouterConfig(
+        enabled=True,
+        fast_model="ollama_chat/deepseek-coder:6.7b",
+        code_model="ollama_chat/big",
+    )
+    decision = classify_prompt(
+        "tweak git tab",
+        message_tokens=200,
+        context_tokens=5_000,
+        router=router,
+        code_model_name="ollama_chat/big",
+        force_tier="fast",
+    )
+    err = (
+        "Your estimated chat context of 32,672 tokens exceeds the "
+        "16,384 token limit for ollama_chat/deepseek-coder:6.7b!"
+    )
+    assert should_escalate_fast_turn(
+        decision,
+        router=router,
+        user_message="tweak git tab",
+        edited_files=[],
+        assistant_text="",
+        had_tool_error=True,
+        tool_error_text=err,
+    )
+
+
+def test_lint_in_long_message_not_used_when_routing_short_intent():
+    router = ModelRouterConfig(
+        enabled=True,
+        fast_model="ollama_chat/small",
+        code_model="ollama_chat/big",
+    )
+    preamble = "## Spec-focus mode\nEARS lint requirements\n" + ("x" * 5000)
+    short = "In the Git tab, add revert and open-in-editor cues."
+    d = classify_prompt(
+        short,
+        message_tokens=estimate_message_tokens(short),
+        context_tokens=estimate_prompt_tokens(preamble + short, files_in_chat=0),
+        router=router,
+        code_model_name="ollama_chat/big",
+    )
+    assert d.role != "fast" or "keyword:lint" not in " ".join(d.reasons)
+
+
+def test_estimate_tokens_with_files_capped():
+    bare = estimate_prompt_tokens("hello")
+    with_files = estimate_prompt_tokens("hello", files_in_chat=10)
+    assert with_files > bare
+    assert with_files <= bare + 2000
+
+
+def test_thinking_for_role():
+    assert thinking_for_role("think", "ollama_chat/deepseek-r1:32b") is True
+    assert thinking_for_role("code", "ollama_chat/qwen3.6:27b") is False
+
+
+def test_pool_entry_overrides_role_thinking():
+    pool = [
+        ModelPoolEntry(
+            model="ollama_chat/custom",
+            tier="code",
+            enabled=True,
+            enable_thinking=True,
+        )
+    ]
+    assert thinking_for_role("code", "ollama_chat/custom", pool=pool) is True
+    assert thinking_for_role("code", "ollama_chat/other", pool=pool) is False
+
+
+def test_resolve_model_pool_roles():
+    pool = [
+        ModelPoolEntry(model="ollama_chat/fast-a", tier="fast", enabled=True),
+        ModelPoolEntry(model="", tier="code", enabled=True),
+        ModelPoolEntry(model="ollama_chat/r1", tier="think", enabled=True),
+    ]
+    resolved = resolve_model_pool(
+        pool,
+        session_code="ollama_chat/session",
+        fallback_fast="",
+    )
+    assert resolved.fast == "ollama_chat/fast-a"
+    assert resolved.code == "ollama_chat/session"
+    assert resolved.think == "ollama_chat/r1"
+
+
+def test_from_payload_parses_hopper_extra_params():
+    cfg = ModelRouterConfig.from_payload(
+        {
+            "enabled": True,
+            "fast_model": "ollama_chat/fast",
+            "code_model": "ollama_chat/code",
+            "model_pool": [
+                {
+                    "model": "ollama_chat/code",
+                    "tier": "code",
+                    "enabled": True,
+                    "extra_params": {"top_p": 0.85},
+                }
+            ],
+        }
+    )
+    assert cfg is not None
+    assert cfg.model_pool[0].extra_params == {"top_p": 0.85}
+
+
+def test_pool_prefers_think_when_think_above_code():
+    from cecli.hopper.router import pool_prefers_think
+
+    pool = [
+        ModelPoolEntry(model="ollama_chat/r1:32b", tier="think", enabled=True),
+        ModelPoolEntry(model="ollama_chat/qwen3:27b", tier="code", enabled=True),
+        ModelPoolEntry(model="ollama_chat/small", tier="fast", enabled=True),
+    ]
+    assert pool_prefers_think(pool) is True
+
+
+def test_pool_prefers_think_false_when_code_above_think():
+    from cecli.hopper.router import pool_prefers_think
+
+    pool = [
+        ModelPoolEntry(model="ollama_chat/qwen3:27b", tier="code", enabled=True),
+        ModelPoolEntry(model="ollama_chat/r1:32b", tier="think", enabled=True),
+        ModelPoolEntry(model="ollama_chat/small", tier="fast", enabled=True),
+    ]
+    assert pool_prefers_think(pool) is False
+
+
+def test_pool_prefers_think_false_when_no_think():
+    from cecli.hopper.router import pool_prefers_think
+
+    pool = [
+        ModelPoolEntry(model="ollama_chat/qwen3:27b", tier="code", enabled=True),
+        ModelPoolEntry(model="ollama_chat/small", tier="fast", enabled=True),
+    ]
+    assert pool_prefers_think(pool) is False
+
+
+def test_prefer_think_routes_agent_to_think():
+    """Despite the name, /agent turns use the code model (tool-capable) under prefer_think."""
+    router = ModelRouterConfig(
+        enabled=True,
+        fast_model="ollama_chat/small",
+        code_model="ollama_chat/code",
+        think_model="ollama_chat/think",
+        prefer_think=True,
+    )
+    d = classify_prompt(
+        "/agent explore the repo",
+        message_tokens=400,
+        router=router,
+        code_model_name="ollama_chat/code",
+    )
+    assert d.role == "code"
+    assert d.model_name == "ollama_chat/code"
+    assert "slash:/agent" in d.reasons
+
+
+def test_prefer_think_routes_implement_turn_to_think():
+    """Despite the name, implement turns use the code model (tool-capable) under prefer_think."""
+    router = ModelRouterConfig(
+        enabled=True,
+        fast_model="ollama_chat/small",
+        code_model="ollama_chat/code",
+        think_model="ollama_chat/think",
+        prefer_think=True,
+    )
+    d = classify_prompt(
+        "implement the EncryptedStorageRepository",
+        message_tokens=400,
+        router=router,
+        code_model_name="ollama_chat/code",
+        turn=RouteTurnContext(implement_turn=True),
+    )
+    assert d.role == "code"
+    assert d.model_name == "ollama_chat/code"
+    assert "implement_turn" in d.reasons
+
+
+def test_prefer_think_falls_back_to_code_without_think_model():
+    router = ModelRouterConfig(
+        enabled=True,
+        fast_model="ollama_chat/small",
+        code_model="ollama_chat/code",
+        prefer_think=True,
+    )
+    d = classify_prompt(
+        "/agent explore the repo",
+        message_tokens=400,
+        router=router,
+        code_model_name="ollama_chat/code",
+    )
+    # No think model configured; falls back to code despite prefer_think
+    assert d.role == "code"
+    assert "prefer_think" not in d.reasons
+
+
+def test_from_payload_derives_prefer_think_from_pool_order():
+    cfg = ModelRouterConfig.from_payload(
+        {
+            "enabled": True,
+            "fast_model": "ollama_chat/fast",
+            "code_model": "ollama_chat/code",
+            "think_model": "ollama_chat/think",
+            "model_pool": [
+                {"model": "ollama_chat/think", "tier": "think", "enabled": True},
+                {"model": "ollama_chat/code", "tier": "code", "enabled": True},
+                {"model": "ollama_chat/fast", "tier": "fast", "enabled": True},
+            ],
+        }
+    )
+    assert cfg is not None
+    assert cfg.prefer_think is True
+
+
+def test_from_payload_no_prefer_think_when_code_first():
+    cfg = ModelRouterConfig.from_payload(
+        {
+            "enabled": True,
+            "fast_model": "ollama_chat/fast",
+            "code_model": "ollama_chat/code",
+            "think_model": "ollama_chat/think",
+            "model_pool": [
+                {"model": "ollama_chat/code", "tier": "code", "enabled": True},
+                {"model": "ollama_chat/think", "tier": "think", "enabled": True},
+                {"model": "ollama_chat/fast", "tier": "fast", "enabled": True},
+            ],
+        }
+    )
+    assert cfg is not None
+    assert cfg.prefer_think is False
+
+
+
+def test_router_lane_fast_prompt_routes_fast_with_think_enabled():
+    """E2E router lane contract (regression for e2e/router-llm.spec.ts).
+
+    Hopper order fast → code → think (think last) ⇒ ``prefer_think`` is False, so a
+    trivial fast-keyword prompt must route to the fast tier even though a think model
+    is enabled. The e2e ``fast tier routes to Fighter pilot`` test asserts exactly this;
+    if routing here returned ``think`` the e2e would fail (and previously did, when the
+    fast model was cold-evicted and the turn escalated fast→code→think).
+    """
+    cfg = ModelRouterConfig.from_payload(
+        {
+            "enabled": True,
+            "fast_model": "ollama_chat/qwen2.5-coder:7b",
+            "code_model": "ollama_chat/qwen3.6:27b-q4_K_M",
+            "think_model": "ollama_chat/deepseek-r1:32b",
+            "model_pool": [
+                {"model": "ollama_chat/qwen2.5-coder:7b", "tier": "fast", "enabled": True},
+                {"model": "ollama_chat/qwen3.6:27b-q4_K_M", "tier": "code", "enabled": True},
+                {"model": "ollama_chat/deepseek-r1:32b", "tier": "think", "enabled": True},
+            ],
+        }
+    )
+    assert cfg is not None
+    assert cfg.prefer_think is False
+    d = classify_prompt(
+        'Suggest a better button label than "Start" in one sentence only. '
+        "No code blocks, no file edits.",
+        message_tokens=30,
+        router=cfg,
+        code_model_name="ollama_chat/qwen3.6:27b-q4_K_M",
+        think_model_name="ollama_chat/deepseek-r1:32b",
+    )
+    assert d.role == "fast", d.reasons
+    assert d.model_name == "ollama_chat/qwen2.5-coder:7b"
diff --git a/tests/hopper/test_model_router_apply.py b/tests/hopper/test_model_router_apply.py
new file mode 100644
index 00000000000..fa1b3603129
--- /dev/null
+++ b/tests/hopper/test_model_router_apply.py
@@ -0,0 +1,208 @@
+"""Apply-route tests — per-turn LiteLLM think override + keep_alive."""
+
+from __future__ import annotations
+
+from unittest.mock import MagicMock, patch
+
+from cecli.hopper.router import ModelPoolEntry, ModelRouterConfig, RouteDecision
+from cecli.hopper.apply import (
+    apply_hopper_extra_params,
+    apply_route_to_coder,
+    apply_thinking_extra_params,
+    merge_extra_params,
+)
+
+
+def test_apply_thinking_extra_params_sets_bool():
+    model = MagicMock()
+    model.extra_params = {}
+    apply_thinking_extra_params(model, True)
+    assert model.extra_params["think"] is True
+    apply_thinking_extra_params(model, False)
+    assert model.extra_params["think"] is False
+
+
+def test_merge_extra_params_deep_merges_dicts():
+    base = {"extra_headers": {"A": "1"}, "top_p": 0.5}
+    merge_extra_params(base, {"extra_headers": {"B": "2"}, "top_p": 0.9})
+    assert base["extra_headers"] == {"A": "1", "B": "2"}
+    assert base["top_p"] == 0.9
+
+
+def test_apply_hopper_extra_params_skips_keep_alive():
+    model = MagicMock()
+    model.extra_params = {"keep_alive": 99}
+    apply_hopper_extra_params(model, {"keep_alive": 0, "top_p": 0.8})
+    assert model.extra_params.get("keep_alive") == 99
+    assert model.extra_params.get("top_p") == 0.8
+
+
+def test_apply_route_merges_hopper_extra_params():
+    prev = MagicMock()
+    prev.name = "ollama_chat/qwen3.6:27b"
+    prev.is_ollama.return_value = True
+    prev.extra_params = {"think": False}
+
+    created: dict = {}
+
+    def _model_ctor(name, from_model=None):
+        m = MagicMock()
+        m.name = name
+        m.is_ollama.return_value = True
+        m.extra_params = dict(from_model.extra_params)
+        m._ensure_extra_params_dict = lambda: None
+        created["model"] = m
+        return m
+
+    coder = MagicMock()
+    coder.main_model = prev
+
+    router = ModelRouterConfig(
+        enabled=True,
+        fast_model="ollama_chat/fast",
+        code_model="ollama_chat/code",
+        model_pool=[
+            ModelPoolEntry(
+                model="ollama_chat/code",
+                tier="code",
+                enabled=True,
+                extra_params={"top_p": 0.85, "think": True},
+            )
+        ],
+    )
+    decision = RouteDecision(
+        tier="code",
+        role="code",
+        model_name="ollama_chat/code",
+        estimated_tokens=100,
+        enable_thinking=False,
+    )
+
+    with patch("cecli.hopper.apply.models.Model", side_effect=_model_ctor):
+        apply_route_to_coder(coder, decision, router)
+
+    assert created["model"].extra_params.get("top_p") == 0.85
+    assert created["model"].extra_params.get("think") is False
+    assert created["model"].extra_params.get("keep_alive") == -1
+
+
+def test_apply_route_code_disables_think():
+    prev = MagicMock()
+    prev.name = "ollama_chat/qwen3.6:27b"
+    prev.is_ollama.return_value = True
+    prev.extra_params = {"think": False}
+
+    created: dict = {}
+
+    def _model_ctor(name, from_model=None):
+        m = MagicMock()
+        m.name = name
+        m.is_ollama.return_value = True
+        m.extra_params = dict(from_model.extra_params)
+        m._ensure_extra_params_dict = lambda: None
+        created["model"] = m
+        return m
+
+    coder = MagicMock()
+    coder.main_model = prev
+
+    router = ModelRouterConfig(
+        enabled=True,
+        fast_model="ollama_chat/fast",
+        code_model="ollama_chat/code",
+    )
+    decision = RouteDecision(
+        tier="code",
+        role="code",
+        model_name="ollama_chat/code",
+        estimated_tokens=100,
+        enable_thinking=False,
+    )
+
+    with patch("cecli.hopper.apply.models.Model", side_effect=_model_ctor):
+        apply_route_to_coder(coder, decision, router)
+
+    assert created["model"].extra_params.get("think") is False
+    assert created["model"].extra_params.get("keep_alive") == -1
+
+
+def test_apply_route_think_enables_think():
+    prev = MagicMock()
+    prev.name = "ollama_chat/qwen3.6:27b"
+    prev.is_ollama.return_value = True
+    prev.extra_params = {"think": False}
+
+    created: dict = {}
+
+    def _model_ctor(name, from_model=None):
+        m = MagicMock()
+        m.name = name
+        m.is_ollama.return_value = True
+        m.extra_params = dict(from_model.extra_params)
+        m._ensure_extra_params_dict = lambda: None
+        created["model"] = m
+        return m
+
+    coder = MagicMock()
+    coder.main_model = prev
+
+    router = ModelRouterConfig(
+        enabled=True,
+        fast_model="ollama_chat/fast",
+        code_model="ollama_chat/code",
+        think_model="ollama_chat/deepseek-r1:32b",
+    )
+    decision = RouteDecision(
+        tier="think",
+        role="think",
+        model_name="ollama_chat/deepseek-r1:32b",
+        estimated_tokens=100,
+        enable_thinking=True,
+    )
+
+    with patch("cecli.hopper.apply.models.Model", side_effect=_model_ctor):
+        apply_route_to_coder(coder, decision, router)
+
+    assert created["model"].extra_params.get("think") is True
+
+
+def test_apply_route_qwen_sets_no_think_prefix():
+    prev = MagicMock()
+    prev.name = "ollama_chat/qwen3.6:27b"
+    prev.is_ollama.return_value = True
+    prev.extra_params = {}
+    prev.system_prompt_prefix = ""
+
+    created: dict = {}
+
+    def _model_ctor(name, from_model=None):
+        m = MagicMock()
+        m.name = name
+        m.is_ollama.return_value = True
+        m.extra_params = {}
+        m.system_prompt_prefix = ""
+        m._ensure_extra_params_dict = lambda: None
+        created["model"] = m
+        return m
+
+    coder = MagicMock()
+    coder.main_model = prev
+
+    router = ModelRouterConfig(
+        enabled=True,
+        fast_model="ollama_chat/fast",
+        code_model="ollama_chat/qwen3.6:27b",
+    )
+    decision = RouteDecision(
+        tier="code",
+        role="code",
+        model_name="ollama_chat/qwen3.6:27b",
+        estimated_tokens=100,
+        enable_thinking=False,
+    )
+
+    with patch("cecli.hopper.apply.models.Model", side_effect=_model_ctor):
+        apply_route_to_coder(coder, decision, router)
+
+    assert created["model"].extra_params.get("think") is False
+    assert created["model"].system_prompt_prefix == "/no_think"
diff --git a/tests/hopper/test_model_router_preload.py b/tests/hopper/test_model_router_preload.py
new file mode 100644
index 00000000000..bc7d138a764
--- /dev/null
+++ b/tests/hopper/test_model_router_preload.py
@@ -0,0 +1,251 @@
+"""Tests for preload_priority_list — priority-ordered preloading with VRAM budget."""
+
+from __future__ import annotations
+
+import asyncio
+from dataclasses import dataclass, field
+from typing import Any
+
+import pytest
+
+from cecli.hopper.router import (
+    preload_priority_list,
+    _strip_ollama_prefix,
+)
+
+
+# ---------------------------------------------------------------------------
+# Mock Ollama client
+# ---------------------------------------------------------------------------
+
+
+@dataclass
+class MockOllamaClient:
+    """Mock OllamaClient for testing preload_priority_list."""
+
+    # Track calls for assertions
+    generate_calls: list[tuple[str, int]] = field(default_factory=list)
+    show_calls: list[str] = field(default_factory=list)
+
+    # Configurable behavior
+    model_sizes: dict[str, int] = field(default_factory=dict)
+    failing_models: set[str] = field(default_factory=set)
+    show_failures: set[str] = field(default_factory=set)
+
+    async def post_generate(self, model: str, *, keep_alive: int = -1) -> None:
+        self.generate_calls.append((model, keep_alive))
+        if model in self.failing_models:
+            raise RuntimeError(f"Preload failed: model '{model}' not found")
+
+    async def show_model(self, model: str) -> dict[str, Any]:
+        self.show_calls.append(model)
+        if model in self.show_failures:
+            raise RuntimeError(f"Show failed for '{model}'")
+        size = self.model_sizes.get(model)
+        if size is not None:
+            return {"size": size}
+        return {}
+
+
+# ---------------------------------------------------------------------------
+# Tests
+# ---------------------------------------------------------------------------
+
+
+@pytest.mark.asyncio
+async def test_preload_all_models_in_order():
+    """All models preload successfully in priority order."""
+    client = MockOllamaClient()
+    priority = ["model-a:7b", "model-b:13b", "model-c:32b"]
+
+    result = await preload_priority_list(priority, ollama_client=client)
+
+    assert result == ["model-a:7b", "model-b:13b", "model-c:32b"]
+    assert client.generate_calls == [
+        ("model-a:7b", -1),
+        ("model-b:13b", -1),
+        ("model-c:32b", -1),
+    ]
+
+
+@pytest.mark.asyncio
+async def test_preload_strips_ollama_prefix():
+    """ollama_chat/ and ollama/ prefixes are stripped for API calls but kept in results."""
+    client = MockOllamaClient()
+    priority = ["ollama_chat/deepseek-r1:32b", "ollama/qwen:7b"]
+
+    result = await preload_priority_list(priority, ollama_client=client)
+
+    assert result == ["ollama_chat/deepseek-r1:32b", "ollama/qwen:7b"]
+    assert client.generate_calls == [
+        ("deepseek-r1:32b", -1),
+        ("qwen:7b", -1),
+    ]
+
+
+@pytest.mark.asyncio
+async def test_preload_failure_skips_and_continues():
+    """On preload failure, log error, skip model, continue with next."""
+    client = MockOllamaClient(failing_models={"model-b:13b"})
+    priority = ["model-a:7b", "model-b:13b", "model-c:32b"]
+
+    result = await preload_priority_list(priority, ollama_client=client)
+
+    assert result == ["model-a:7b", "model-c:32b"]
+    # All three were attempted
+    assert len(client.generate_calls) == 3
+
+
+@pytest.mark.asyncio
+async def test_preload_vram_budget_stops_when_exceeded():
+    """When cumulative VRAM exceeds budget, stop preloading remaining models."""
+    client = MockOllamaClient(
+        model_sizes={
+            "model-a:7b": 4_000_000_000,   # 4 GB
+            "model-b:13b": 8_000_000_000,  # 8 GB
+            "model-c:32b": 18_000_000_000,  # 18 GB
+        }
+    )
+    priority = ["model-a:7b", "model-b:13b", "model-c:32b"]
+    # Budget: 11 GB → model-a (4 GB) + model-b (8 GB) = 12 GB > 11 GB
+    # So model-a fits, model-b does NOT fit, stop.
+    budget = 11_000_000_000
+
+    result = await preload_priority_list(
+        priority, ollama_client=client, vram_budget_bytes=budget
+    )
+
+    assert result == ["model-a:7b"]
+    # Only model-a was actually preloaded (generate called)
+    assert len(client.generate_calls) == 1
+    assert client.generate_calls[0][0] == "model-a:7b"
+
+
+@pytest.mark.asyncio
+async def test_preload_vram_budget_all_fit():
+    """All models fit within VRAM budget."""
+    client = MockOllamaClient(
+        model_sizes={
+            "model-a:7b": 4_000_000_000,
+            "model-b:7b": 4_000_000_000,
+        }
+    )
+    priority = ["model-a:7b", "model-b:7b"]
+    budget = 10_000_000_000  # 10 GB — both fit
+
+    result = await preload_priority_list(
+        priority, ollama_client=client, vram_budget_bytes=budget
+    )
+
+    assert result == ["model-a:7b", "model-b:7b"]
+    assert len(client.generate_calls) == 2
+
+
+@pytest.mark.asyncio
+async def test_preload_vram_unknown_size_proceeds():
+    """When model size info unavailable, skip budget check and preload anyway."""
+    client = MockOllamaClient(
+        model_sizes={
+            "model-a:7b": 4_000_000_000,
+            # model-b has no size info
+        }
+    )
+    priority = ["model-a:7b", "model-b:unknown"]
+    budget = 5_000_000_000  # 5 GB
+
+    result = await preload_priority_list(
+        priority, ollama_client=client, vram_budget_bytes=budget
+    )
+
+    # Both preloaded — model-b has no size info, so budget check skipped for it
+    assert result == ["model-a:7b", "model-b:unknown"]
+
+
+@pytest.mark.asyncio
+async def test_preload_no_budget_preloads_all():
+    """Without VRAM budget, all models preloaded regardless of size."""
+    client = MockOllamaClient(
+        model_sizes={
+            "huge:70b": 40_000_000_000,
+            "also-huge:70b": 40_000_000_000,
+        }
+    )
+    priority = ["huge:70b", "also-huge:70b"]
+
+    result = await preload_priority_list(priority, ollama_client=client)
+
+    assert result == ["huge:70b", "also-huge:70b"]
+    # No show calls when budget is None
+    assert client.show_calls == []
+
+
+@pytest.mark.asyncio
+async def test_preload_empty_list():
+    """Empty priority list returns empty result."""
+    client = MockOllamaClient()
+    result = await preload_priority_list([], ollama_client=client)
+    assert result == []
+    assert client.generate_calls == []
+
+
+@pytest.mark.asyncio
+async def test_preload_skips_whitespace_only_entries():
+    """Whitespace-only entries in priority list are skipped."""
+    client = MockOllamaClient()
+    priority = ["model-a:7b", "  ", "", "model-b:7b"]
+
+    result = await preload_priority_list(priority, ollama_client=client)
+
+    assert result == ["model-a:7b", "model-b:7b"]
+    assert len(client.generate_calls) == 2
+
+
+@pytest.mark.asyncio
+async def test_preload_uses_backend_resolver_when_no_ollama_client():
+    """Host resolver hook supplies BackendClient.preload_models when no ollama_client."""
+    from unittest.mock import AsyncMock
+
+    from cecli.hopper.router import set_backend_client_resolver
+
+    mock_client = AsyncMock()
+    mock_client.preload_models = AsyncMock(return_value=[])
+
+    set_backend_client_resolver(lambda: mock_client)
+    try:
+        result = await preload_priority_list(["model-a:7b"])
+    finally:
+        set_backend_client_resolver(None)
+
+    assert result == []
+    mock_client.preload_models.assert_called_once_with(["model-a:7b"])
+
+
+@pytest.mark.asyncio
+async def test_preload_show_failure_skips_budget_check():
+    """When show_model fails, skip budget check and preload anyway."""
+    client = MockOllamaClient(show_failures={"model-a:7b"})
+    priority = ["model-a:7b"]
+    budget = 1_000  # Tiny budget — but show fails, so budget check skipped
+
+    result = await preload_priority_list(
+        priority, ollama_client=client, vram_budget_bytes=budget
+    )
+
+    assert result == ["model-a:7b"]
+
+
+# ---------------------------------------------------------------------------
+# Unit tests for _strip_ollama_prefix
+# ---------------------------------------------------------------------------
+
+
+def test_strip_ollama_prefix_chat():
+    assert _strip_ollama_prefix("ollama_chat/deepseek-r1:32b") == "deepseek-r1:32b"
+
+
+def test_strip_ollama_prefix_plain():
+    assert _strip_ollama_prefix("ollama/qwen:7b") == "qwen:7b"
+
+
+def test_strip_ollama_prefix_no_prefix():
+    assert _strip_ollama_prefix("deepseek-r1:32b") == "deepseek-r1:32b"
diff --git a/tests/hopper/test_model_router_warmup.py b/tests/hopper/test_model_router_warmup.py
new file mode 100644
index 00000000000..3345397fc0d
--- /dev/null
+++ b/tests/hopper/test_model_router_warmup.py
@@ -0,0 +1,142 @@
+"""Tests for warmup_keep_alive — keep-alive requests in priority order."""
+
+from __future__ import annotations
+
+import asyncio
+from dataclasses import dataclass, field
+from typing import Any
+
+import pytest
+
+from cecli.hopper.router import warmup_keep_alive
+
+
+# ---------------------------------------------------------------------------
+# Mock Ollama client
+# ---------------------------------------------------------------------------
+
+
+@dataclass
+class MockOllamaClient:
+    """Mock OllamaClient for testing warmup_keep_alive."""
+
+    generate_calls: list[tuple[str, int]] = field(default_factory=list)
+    failing_models: set[str] = field(default_factory=set)
+
+    async def post_generate(self, model: str, *, keep_alive: int = -1) -> None:
+        self.generate_calls.append((model, keep_alive))
+        if model in self.failing_models:
+            raise RuntimeError(f"Keep-alive failed: model '{model}' not found")
+
+    async def show_model(self, model: str) -> dict[str, Any]:
+        return {}
+
+
+# ---------------------------------------------------------------------------
+# Tests
+# ---------------------------------------------------------------------------
+
+
+@pytest.mark.asyncio
+async def test_warmup_sends_requests_in_priority_order():
+    """Keep-alive requests are sent in priority-list index order."""
+    client = MockOllamaClient()
+    priority = ["model-a:7b", "model-b:13b", "model-c:32b"]
+
+    result = await warmup_keep_alive(priority, ollama_client=client)
+
+    assert result == ["model-a:7b", "model-b:13b", "model-c:32b"]
+    assert client.generate_calls == [
+        ("model-a:7b", -1),
+        ("model-b:13b", -1),
+        ("model-c:32b", -1),
+    ]
+
+
+@pytest.mark.asyncio
+async def test_warmup_higher_priority_refreshes_first():
+    """Index 0 (highest priority) refreshes TTL before index N-1."""
+    client = MockOllamaClient()
+    priority = ["high-priority:7b", "mid-priority:13b", "low-priority:32b"]
+
+    await warmup_keep_alive(priority, ollama_client=client)
+
+    # Verify ordering: high-priority called first
+    call_models = [call[0] for call in client.generate_calls]
+    assert call_models == ["high-priority:7b", "mid-priority:13b", "low-priority:32b"]
+
+
+@pytest.mark.asyncio
+async def test_warmup_strips_ollama_prefix():
+    """ollama_chat/ and ollama/ prefixes are stripped for API calls but kept in results."""
+    client = MockOllamaClient()
+    priority = ["ollama_chat/deepseek-r1:32b", "ollama/qwen:7b"]
+
+    result = await warmup_keep_alive(priority, ollama_client=client)
+
+    assert result == ["ollama_chat/deepseek-r1:32b", "ollama/qwen:7b"]
+    assert client.generate_calls == [
+        ("deepseek-r1:32b", -1),
+        ("qwen:7b", -1),
+    ]
+
+
+@pytest.mark.asyncio
+async def test_warmup_failure_skips_and_continues():
+    """On keep-alive failure, log error, skip model, continue with next."""
+    client = MockOllamaClient(failing_models={"model-b:13b"})
+    priority = ["model-a:7b", "model-b:13b", "model-c:32b"]
+
+    result = await warmup_keep_alive(priority, ollama_client=client)
+
+    assert result == ["model-a:7b", "model-c:32b"]
+    # All three were attempted
+    assert len(client.generate_calls) == 3
+
+
+@pytest.mark.asyncio
+async def test_warmup_empty_list():
+    """Empty priority list returns empty result."""
+    client = MockOllamaClient()
+    result = await warmup_keep_alive([], ollama_client=client)
+    assert result == []
+    assert client.generate_calls == []
+
+
+@pytest.mark.asyncio
+async def test_warmup_skips_whitespace_only_entries():
+    """Whitespace-only entries in priority list are skipped."""
+    client = MockOllamaClient()
+    priority = ["model-a:7b", "  ", "", "model-b:7b"]
+
+    result = await warmup_keep_alive(priority, ollama_client=client)
+
+    assert result == ["model-a:7b", "model-b:7b"]
+    assert len(client.generate_calls) == 2
+
+
+@pytest.mark.asyncio
+async def test_warmup_uses_keep_alive_minus_one():
+    """Every listed model gets a keep-alive request, and each one uses keep_alive=-1."""
+    client = MockOllamaClient()
+    priority = ["model-a:7b", "model-b:7b"]
+
+    await warmup_keep_alive(priority, ollama_client=client)
+
+    assert len(client.generate_calls) == len(priority)  # guard: all() passes on an empty list
+    assert all(keep_alive_val == -1 for _, keep_alive_val in client.generate_calls)
+
+
+@pytest.mark.asyncio
+async def test_warmup_all_failures_returns_empty():
+    """When all models fail, returns empty list."""
+    client = MockOllamaClient(
+        failing_models={"model-a:7b", "model-b:7b"}
+    )
+    priority = ["model-a:7b", "model-b:7b"]
+
+    result = await warmup_keep_alive(priority, ollama_client=client)
+
+    assert result == []
+    # Both attempted
+    assert len(client.generate_calls) == 2

From bd33b32efb063358f392643df23cf18f4ae0bca2 Mon Sep 17 00:00:00 2001
From: CIA Operations Officer Jennifer Pike <agent.jennifer.pike@gmail.com>
Date: Mon, 15 Jun 2026 14:38:59 -0700
Subject: [PATCH 2/4] chore(hopper): drop unused asyncio imports in hopper
 tests

Co-authored-by: Cursor <cursoragent@cursor.com>
---
 tests/hopper/test_model_router_preload.py | 1 -
 tests/hopper/test_model_router_warmup.py  | 1 -
 2 files changed, 2 deletions(-)

diff --git a/tests/hopper/test_model_router_preload.py b/tests/hopper/test_model_router_preload.py
index bc7d138a764..72e7003e047 100644
--- a/tests/hopper/test_model_router_preload.py
+++ b/tests/hopper/test_model_router_preload.py
@@ -2,7 +2,6 @@
 
 from __future__ import annotations
 
-import asyncio
 from dataclasses import dataclass, field
 from typing import Any
 
diff --git a/tests/hopper/test_model_router_warmup.py b/tests/hopper/test_model_router_warmup.py
index 3345397fc0d..44287b162fc 100644
--- a/tests/hopper/test_model_router_warmup.py
+++ b/tests/hopper/test_model_router_warmup.py
@@ -2,7 +2,6 @@
 
 from __future__ import annotations
 
-import asyncio
 from dataclasses import dataclass, field
 from typing import Any
 

From 881c086120f57ba313ef3752a098775bc3c9ab22 Mon Sep 17 00:00:00 2001
From: CIA Operations Officer Jennifer Pike <agent.jennifer.pike@gmail.com>
Date: Mon, 15 Jun 2026 14:43:57 -0700
Subject: [PATCH 3/4] style(hopper): black/isort for cecli pre-commit CI

Run verify-cecli-pre-commit on pr/hopper before upstream merge.

Co-authored-by: Cursor <cursoragent@cursor.com>
---
 cecli/hopper/apply.py                     |  5 +--
 cecli/hopper/router.py                    | 47 +++++++++++++----------
 tests/hopper/test_model_router.py         |  1 -
 tests/hopper/test_model_router_apply.py   |  2 +-
 tests/hopper/test_model_router_preload.py | 21 +++-------
 tests/hopper/test_model_router_warmup.py  |  5 +--
 6 files changed, 35 insertions(+), 46 deletions(-)

diff --git a/cecli/hopper/apply.py b/cecli/hopper/apply.py
index 28fbb1ced29..495d71643ed 100644
--- a/cecli/hopper/apply.py
+++ b/cecli/hopper/apply.py
@@ -5,7 +5,6 @@
 from typing import Any
 
 from cecli import models
-
 from cecli.hopper.router import (
     ModelRouterConfig,
     RouteDecision,
@@ -70,9 +69,7 @@ def apply_route_to_coder(coder, decision: RouteDecision, router: ModelRouterConf
     new_model = models.Model(decision.model_name, from_model=prev)
     role = decision.role or normalize_route_role(decision.tier) or "code"
     pool_entry = (
-        find_pool_entry(router.model_pool, decision.model_name, role)
-        if router.model_pool
-        else None
+        find_pool_entry(router.model_pool, decision.model_name, role) if router.model_pool else None
     )
     apply_hopper_extra_params(
         new_model,
diff --git a/cecli/hopper/router.py b/cecli/hopper/router.py
index 67f8d45ae01..ed1f1c7fd59 100644
--- a/cecli/hopper/router.py
+++ b/cecli/hopper/router.py
@@ -33,6 +33,7 @@ def set_static_vram_bytes_resolver(fn: Callable[[str], int | None] | None) -> No
     global _static_vram_bytes_resolver
     _static_vram_bytes_resolver = fn
 
+
 RouteRole = Literal["fast", "code", "think"]
 RouteTier = Literal["fast", "heavy", "code", "think"]
 
@@ -178,7 +179,12 @@ def resolve_tier_models(pool: list[ModelPoolEntry], tier: RouteRole) -> list[Mod
     Models with priority_rank=None are sorted after those with a rank.
     """
     filtered = [e for e in pool if e.enabled and e.tier == tier]
-    filtered.sort(key=lambda e: (e.priority_rank is None, e.priority_rank if e.priority_rank is not None else 0))
+    filtered.sort(
+        key=lambda e: (
+            e.priority_rank is None,
+            e.priority_rank if e.priority_rank is not None else 0,
+        )
+    )
     return filtered
 
 
@@ -273,15 +279,12 @@ async def preload_priority_list(
 
         model_size: int | None = None
         if vram_budget_bytes is not None:
-            model_size = await _get_model_size_for_budget(
-                raw_tag, ollama_client=ollama_client
-            )
+            model_size = await _get_model_size_for_budget(raw_tag, ollama_client=ollama_client)
             if model_size is not None:
                 if cumulative_vram + model_size > vram_budget_bytes:
                     deferred = [t.strip() for t in priority_list[idx:] if t.strip()]
                     logger.info(
-                        "VRAM budget exceeded (%.1f MB used of %.1f MB). "
-                        "Deferring models: %s",
+                        "VRAM budget exceeded (%.1f MB used of %.1f MB). " "Deferring models: %s",
                         cumulative_vram / (1024 * 1024),
                         vram_budget_bytes / (1024 * 1024),
                         deferred,
@@ -340,9 +343,9 @@ async def warmup_keep_alive(
 def _strip_ollama_prefix(tag: str) -> str:
     """Remove ``ollama_chat/`` or ``ollama/`` prefix from a model tag."""
     if tag.startswith("ollama_chat/"):
-        return tag[len("ollama_chat/"):]
+        return tag[len("ollama_chat/") :]
     if tag.startswith("ollama/"):
-        return tag[len("ollama/"):]
+        return tag[len("ollama/") :]
     return tag
 
 
@@ -627,7 +630,9 @@ def resolve_provider_prefix(backend: str) -> str:
     return _BACKEND_PROVIDER_PREFIXES.get((backend or "").strip().lower(), "ollama_chat/")
 
 
-def inject_backend_extra_params(backend: str, extra_params: dict[str, object] | None) -> dict[str, object]:
+def inject_backend_extra_params(
+    backend: str, extra_params: dict[str, object] | None
+) -> dict[str, object]:
     """Merge ``LITELLM_EXTRA_PARAMS`` for non-Ollama backends.
 
     Ollama uses its own env wiring; other backends may need auth headers or base URLs
@@ -701,9 +706,7 @@ def from_payload(cls, raw: dict[str, Any] | None) -> ModelRouterConfig | None:
                 if tier is None:
                     continue
                 raw_rank = item.get("priority_rank")
-                priority_rank: int | None = (
-                    int(raw_rank) if raw_rank is not None else None
-                )
+                priority_rank: int | None = int(raw_rank) if raw_rank is not None else None
                 pool.append(
                     ModelPoolEntry(
                         model=str(item.get("model") or ""),
@@ -721,9 +724,7 @@ def from_payload(cls, raw: dict[str, Any] | None) -> ModelRouterConfig | None:
                     )
                 )
         fallback_fast = str(raw.get("fast_model") or "").strip()
-        fallback_code = (
-            str(raw.get("code_model") or raw.get("heavy_model") or "").strip() or None
-        )
+        fallback_code = str(raw.get("code_model") or raw.get("heavy_model") or "").strip() or None
         fallback_think = str(raw.get("think_model") or "").strip() or None
         session_code = fallback_code or fallback_fast or ""
         if pool:
@@ -762,9 +763,7 @@ def from_payload(cls, raw: dict[str, Any] | None) -> ModelRouterConfig | None:
             token_fast_max=int(raw.get("token_fast_max") or 4_096),
             token_heavy_min=int(raw.get("token_heavy_min") or 12_000),
             keep_alive_fast=raw.get("keep_alive_fast", 300),
-            keep_alive_heavy=normalize_keep_alive_for_tier(
-                "code", raw.get("keep_alive_heavy", -1)
-            ),
+            keep_alive_heavy=normalize_keep_alive_for_tier("code", raw.get("keep_alive_heavy", -1)),
             escalate_on_failure=bool(raw.get("escalate_on_failure", True)),
             prefer_think=bool(
                 raw.get("prefer_think")
@@ -912,7 +911,8 @@ def _apply_multi_model_routing(
 
     if pool and _has_multi_model_tier(pool, role):
         chosen_model, is_swap = pick_tier_model(
-            pool, role,
+            pool,
+            role,
             resident_models=resident_models,
             require_vision=require_vision,
             context_tokens=context_tokens,
@@ -1001,7 +1001,8 @@ def classify_prompt(
     # Common kwargs for all _apply_multi_model_routing calls in this function.
     def _route(role: RouteRole, model: str, *, reasons: list[str]) -> RouteDecision:
         return _apply_multi_model_routing(
-            role, model,
+            role,
+            model,
             router=router,
             display_tokens=display_tokens,
             reasons=reasons,
@@ -1057,7 +1058,11 @@ def _route(role: RouteRole, model: str, *, reasons: list[str]) -> RouteDecision:
             pool = router.model_pool
             if pool:
                 fast_models = resolve_tier_models(pool, "fast")
-                fast_fits = [m for m in fast_models if m.max_context is not None and m.max_context >= context_tokens]
+                fast_fits = [
+                    m
+                    for m in fast_models
+                    if m.max_context is not None and m.max_context >= context_tokens
+                ]
                 if fast_fits:
                     # A fast model with sufficient context exists — stay in fast tier
                     reasons.append(
diff --git a/tests/hopper/test_model_router.py b/tests/hopper/test_model_router.py
index 0b1a7c94ac5..491c711078b 100644
--- a/tests/hopper/test_model_router.py
+++ b/tests/hopper/test_model_router.py
@@ -574,7 +574,6 @@ def test_from_payload_no_prefer_think_when_code_first():
     assert cfg.prefer_think is False
 
 
-
 def test_router_lane_fast_prompt_routes_fast_with_think_enabled():
     """E2E router lane contract (regression for e2e/router-llm.spec.ts).
 
diff --git a/tests/hopper/test_model_router_apply.py b/tests/hopper/test_model_router_apply.py
index fa1b3603129..bedbe6766fe 100644
--- a/tests/hopper/test_model_router_apply.py
+++ b/tests/hopper/test_model_router_apply.py
@@ -4,13 +4,13 @@
 
 from unittest.mock import MagicMock, patch
 
-from cecli.hopper.router import ModelPoolEntry, ModelRouterConfig, RouteDecision
 from cecli.hopper.apply import (
     apply_hopper_extra_params,
     apply_route_to_coder,
     apply_thinking_extra_params,
     merge_extra_params,
 )
+from cecli.hopper.router import ModelPoolEntry, ModelRouterConfig, RouteDecision
 
 
 def test_apply_thinking_extra_params_sets_bool():
diff --git a/tests/hopper/test_model_router_preload.py b/tests/hopper/test_model_router_preload.py
index 72e7003e047..db52b62fdf4 100644
--- a/tests/hopper/test_model_router_preload.py
+++ b/tests/hopper/test_model_router_preload.py
@@ -8,11 +8,10 @@
 import pytest
 
 from cecli.hopper.router import (
-    preload_priority_list,
     _strip_ollama_prefix,
+    preload_priority_list,
 )
 
-
 # ---------------------------------------------------------------------------
 # Mock Ollama client
 # ---------------------------------------------------------------------------
@@ -100,7 +99,7 @@ async def test_preload_vram_budget_stops_when_exceeded():
     """When cumulative VRAM exceeds budget, stop preloading remaining models."""
     client = MockOllamaClient(
         model_sizes={
-            "model-a:7b": 4_000_000_000,   # 4 GB
+            "model-a:7b": 4_000_000_000,  # 4 GB
             "model-b:13b": 8_000_000_000,  # 8 GB
             "model-c:32b": 18_000_000_000,  # 18 GB
         }
@@ -110,9 +109,7 @@ async def test_preload_vram_budget_stops_when_exceeded():
     # So model-a fits, model-b does NOT fit, stop.
     budget = 11_000_000_000
 
-    result = await preload_priority_list(
-        priority, ollama_client=client, vram_budget_bytes=budget
-    )
+    result = await preload_priority_list(priority, ollama_client=client, vram_budget_bytes=budget)
 
     assert result == ["model-a:7b"]
     # Only model-a was actually preloaded (generate called)
@@ -132,9 +129,7 @@ async def test_preload_vram_budget_all_fit():
     priority = ["model-a:7b", "model-b:7b"]
     budget = 10_000_000_000  # 10 GB — both fit
 
-    result = await preload_priority_list(
-        priority, ollama_client=client, vram_budget_bytes=budget
-    )
+    result = await preload_priority_list(priority, ollama_client=client, vram_budget_bytes=budget)
 
     assert result == ["model-a:7b", "model-b:7b"]
     assert len(client.generate_calls) == 2
@@ -152,9 +147,7 @@ async def test_preload_vram_unknown_size_proceeds():
     priority = ["model-a:7b", "model-b:unknown"]
     budget = 5_000_000_000  # 5 GB
 
-    result = await preload_priority_list(
-        priority, ollama_client=client, vram_budget_bytes=budget
-    )
+    result = await preload_priority_list(priority, ollama_client=client, vram_budget_bytes=budget)
 
     # Both preloaded — model-b has no size info, so budget check skipped for it
     assert result == ["model-a:7b", "model-b:unknown"]
@@ -226,9 +219,7 @@ async def test_preload_show_failure_skips_budget_check():
     priority = ["model-a:7b"]
     budget = 1_000  # Tiny budget — but show fails, so budget check skipped
 
-    result = await preload_priority_list(
-        priority, ollama_client=client, vram_budget_bytes=budget
-    )
+    result = await preload_priority_list(priority, ollama_client=client, vram_budget_bytes=budget)
 
     assert result == ["model-a:7b"]
 
diff --git a/tests/hopper/test_model_router_warmup.py b/tests/hopper/test_model_router_warmup.py
index 44287b162fc..1de974c6ebd 100644
--- a/tests/hopper/test_model_router_warmup.py
+++ b/tests/hopper/test_model_router_warmup.py
@@ -9,7 +9,6 @@
 
 from cecli.hopper.router import warmup_keep_alive
 
-
 # ---------------------------------------------------------------------------
 # Mock Ollama client
 # ---------------------------------------------------------------------------
@@ -129,9 +128,7 @@ async def test_warmup_uses_keep_alive_minus_one():
 @pytest.mark.asyncio
 async def test_warmup_all_failures_returns_empty():
     """When all models fail, returns empty list."""
-    client = MockOllamaClient(
-        failing_models={"model-a:7b", "model-b:7b"}
-    )
+    client = MockOllamaClient(failing_models={"model-a:7b", "model-b:7b"})
     priority = ["model-a:7b", "model-b:7b"]
 
     result = await warmup_keep_alive(priority, ollama_client=client)

From 4536367a2d8a66e9a0881db6dcb85640f2cefc2d Mon Sep 17 00:00:00 2001
From: CIA Operations Officer Jennifer Pike <agent.jennifer.pike@gmail.com>
Date: Mon, 15 Jun 2026 14:45:50 -0700
Subject: [PATCH 4/4] fix(hopper): satisfy codespell on think-tier keyword
 regex

Expand architect(?:ure|ural)? to explicit words so codespell does not flag "ure".

Co-authored-by: Cursor <cursoragent@cursor.com>
---
 cecli/hopper/router.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/cecli/hopper/router.py b/cecli/hopper/router.py
index ed1f1c7fd59..c7457cbf73f 100644
--- a/cecli/hopper/router.py
+++ b/cecli/hopper/router.py
@@ -93,7 +93,7 @@ def normalize_keep_alive_for_tier(tier: RouteTier | RouteRole, value: int | str)
 # Intent signals (case-insensitive word boundaries).
 _THINK_PATTERNS = re.compile(
     r"\b("
-    r"architect(?:ure|ural)?|refactor|rewrite|migrate|migration|"
+    r"architecture|architectural|architect|refactor|rewrite|migrate|migration|"
     r"race\s+condition|deadlock|concurrency|distributed|microservice|"
     r"security|vulnerability|root\s+cause|design\s+review|"
     r"performance|scalability|profil(?:e|ing)|"