11 changes: 7 additions & 4 deletions dlc_developer_config.toml
@@ -1,6 +1,6 @@
[dev]
# Set to "huggingface", for example, if you are a huggingface developer. Default is ""
partner_developer = ""
partner_developer = "huggingface"
# Please only set it to true if you are preparing an EI related PR
# Do remember to revert it back to false before merging any PR (including EI dedicated PR)
ei_mode = false
@@ -36,12 +36,12 @@ deep_canary_mode = false

[build]
# Add in frameworks you would like to build. By default, builds are disabled unless you specify building an image.
# available frameworks - ["base", "vllm", "sglang", "autogluon", "huggingface_vllm", "huggingface_sglang", "huggingface_tensorflow", "huggingface_pytorch", "huggingface_tensorflow_trcomp", "huggingface_pytorch_trcomp", "pytorch_trcomp", "tensorflow", "pytorch", "stabilityai_pytorch"]
build_frameworks = []
# available frameworks - ["base", "vllm", "sglang", "autogluon", "huggingface_vllm", "huggingface_sglang", "huggingface_llamacpp", "huggingface_tensorflow", "huggingface_pytorch", "huggingface_tensorflow_trcomp", "huggingface_pytorch_trcomp", "pytorch_trcomp", "tensorflow", "pytorch", "stabilityai_pytorch"]
build_frameworks = ["huggingface_llamacpp"]


# By default we build both training and inference containers. Set true/false values to determine which to build.
build_training = true
build_training = false
build_inference = true

# Set do_build to "false" to skip builds and test the latest image built by this PR
@@ -192,5 +192,8 @@ dlc-pr-huggingface-vllm = ""
# HuggingFace SGLang
dlc-pr-huggingface-sglang = ""

# HuggingFace Llamacpp
dlc-pr-huggingface-llamacpp = "/huggingface/llamacpp/buildspec.yml"

# sglang
dlc-pr-sglang = ""
211 changes: 211 additions & 0 deletions huggingface/llamacpp/build_artifacts/llamacpp_sagemaker_serve.py
@@ -0,0 +1,211 @@
# Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved.
# SPDX-License-Identifier: Apache-2.0
"""SageMaker HTTP proxy for llama.cpp llama-server.

SageMaker invokes POST /invocations and GET /ping on port 8080. llama-server
speaks OpenAI-style routes (e.g. /v1/chat/completions) and does not expose
/invocations.

Behavior mirrors scripts/vllm/omni_sagemaker_serve.py routing:

- GET /ping is proxied to GET {backend}/health.
- POST /invocations: if ``X-Amzn-SageMaker-Custom-Attributes`` contains
``route=/some/path``, the request is forwarded to that path on llama-server.
Otherwise the target path is inferred from the JSON body (messages ->
/v1/chat/completions, prompt -> /v1/completions, input+model -> /v1/embeddings),
defaulting to /v1/chat/completions.

For routes that require multipart/form-data (parity with vLLM-Omni), JSON bodies
are converted when ``route=`` targets those paths.

Environment:

- LLAMACPP_SAGEMAKER_BACKEND_URL: upstream base URL (default http://127.0.0.1:8081)
"""

from __future__ import annotations

import json
import logging
import os
import re
import uuid
from collections.abc import AsyncIterator

import httpx
from starlette.applications import Starlette
from starlette.requests import Request
from starlette.responses import Response, StreamingResponse
from starlette.routing import Route

logger = logging.getLogger("llamacpp_sagemaker")

BACKEND = os.environ.get("LLAMACPP_SAGEMAKER_BACKEND_URL", "http://127.0.0.1:8081").rstrip("/")

FORM_DATA_ROUTES = frozenset({"/v1/videos", "/v1/videos/sync"})

_HOP_BY_HOP = frozenset(
    {
        "connection",
        "keep-alive",
        "proxy-authenticate",
        "proxy-authorization",
        "te",
        "trailers",
        "transfer-encoding",
        "upgrade",
        "host",
        "content-length",
    }
)

_RESP_DROP = frozenset({"transfer-encoding", "content-length", "connection"})


def _parse_route_from_header(raw: str | None) -> str | None:
    if not raw:
        return None
    m = re.search(r"route=(/[^\s,]+)", raw)
    return m.group(1) if m else None


def _parse_route(request: Request) -> str | None:
    h = request.headers
    v = h.get("x-amzn-sagemaker-custom-attributes")
    return _parse_route_from_header(v)


def _build_multipart_body(data: dict, boundary: str) -> bytes:
    parts: list[str] = []
    for key, value in data.items():
        parts.append(
            f'--{boundary}\r\nContent-Disposition: form-data; name="{key}"\r\n\r\n{value}\r\n'
        )
    parts.append(f"--{boundary}--\r\n")
    return "".join(parts).encode()


def _default_path_for_invocation(content_type: str, body: bytes) -> str:
    ct = (content_type or "").lower()
    if "json" not in ct:
        return "/v1/chat/completions"
    try:
        data = json.loads(body)
    except (json.JSONDecodeError, UnicodeDecodeError):
        return "/v1/chat/completions"
    if not isinstance(data, dict):
        return "/v1/chat/completions"
    if "messages" in data:
        return "/v1/chat/completions"
    if "prompt" in data:
        return "/v1/completions"
    if "input" in data and "model" in data:
        return "/v1/embeddings"
    return "/v1/chat/completions"


def _forward_request_headers(request: Request, body_len: int, content_type: str | None) -> dict[str, str]:
    out: dict[str, str] = {}
    for key, value in request.headers.items():
        lk = key.lower()
        if lk in _HOP_BY_HOP or lk == "x-amzn-sagemaker-custom-attributes":
            continue
        out[key] = value
    out["content-length"] = str(body_len)
    if content_type is not None:
        out["content-type"] = content_type
    return out


def _response_headers_from_httpx(resp: httpx.Response) -> dict[str, str]:
    h: dict[str, str] = {}
    for key, value in resp.headers.items():
        lk = key.lower()
        if lk in _RESP_DROP:
            continue
        h[key] = value
    return h


async def ping(request: Request) -> Response:
    url = f"{BACKEND}/health"
    try:
        async with httpx.AsyncClient(timeout=httpx.Timeout(10.0, connect=2.0)) as client:
            r = await client.get(url)
    except httpx.RequestError as e:
        logger.warning("Backend health request failed: %s", e)
        return Response(status_code=503, content=b'{"error":"backend_unavailable"}')
    return Response(
        status_code=r.status_code,
        content=r.content,
        headers=_response_headers_from_httpx(r),
    )


async def invocations(request: Request) -> Response:
    if request.method != "POST":
        return Response(status_code=405, content=b"Method Not Allowed")

    body = await request.body()
    route = _parse_route(request)
    content_type = request.headers.get("content-type")

    if route:
        target = route
        logger.info("Rerouting /invocations -> %s", target)
        ct = (content_type or "").lower()
        if target in FORM_DATA_ROUTES and "json" in ct:
            try:
                data = json.loads(body)
            except (json.JSONDecodeError, UnicodeDecodeError):
                data = None
            if isinstance(data, dict):
                boundary = uuid.uuid4().hex
                body = _build_multipart_body(data, boundary)
                content_type = f"multipart/form-data; boundary={boundary}"
                logger.info("Converted JSON to form-data for %s", target)
    else:
        target = _default_path_for_invocation(content_type or "", body)
        logger.info("Inferred /invocations -> %s", target)

    url = f"{BACKEND}{target}"
    fwd_headers = _forward_request_headers(request, len(body), content_type)

    timeout = httpx.Timeout(600.0, connect=30.0)
    client = httpx.AsyncClient(timeout=timeout)
    try:
        req = client.build_request("POST", url, headers=fwd_headers, content=body)
        r = await client.send(req, stream=True)
    except httpx.RequestError as e:
        await client.aclose()
        logger.exception("Upstream request failed: %s", e)
        return Response(status_code=502, content=json.dumps({"error": "upstream_error"}).encode())

    async def stream_body() -> AsyncIterator[bytes]:
        try:
            async for chunk in r.aiter_bytes():
                yield chunk
        finally:
            await r.aclose()
            await client.aclose()

    return StreamingResponse(
        stream_body(),
        status_code=r.status_code,
        headers=_response_headers_from_httpx(r),
        media_type=r.headers.get("content-type"),
    )


routes = [
    Route("/ping", ping, methods=["GET"]),
    Route("/invocations", invocations, methods=["POST"]),
]

app = Starlette(routes=routes)

logging.basicConfig(
    level=logging.INFO,
    format="%(asctime)s %(levelname)s %(name)s %(message)s",
    force=True,
)
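
The routing rules described in the module docstring can be exercised from the client side. The following is a minimal, hypothetical sketch (the endpoint name, payloads, and model identifier are assumptions, not part of this PR): a plain JSON body with "messages" is inferred as /v1/chat/completions, while a route= custom attribute forwards the request to an explicit llama-server path.

# Hypothetical client-side sketch of the /invocations routing behavior.
import json

import boto3

smr = boto3.client("sagemaker-runtime")

# No route= attribute: the proxy infers /v1/chat/completions from "messages".
chat = smr.invoke_endpoint(
    EndpointName="llamacpp-endpoint",  # hypothetical endpoint name
    ContentType="application/json",
    Body=json.dumps({"messages": [{"role": "user", "content": "Hello"}]}),
)
print(json.loads(chat["Body"].read()))

# Explicit route= attribute: the proxy forwards to that llama-server path.
emb = smr.invoke_endpoint(
    EndpointName="llamacpp-endpoint",
    ContentType="application/json",
    CustomAttributes="route=/v1/embeddings",
    Body=json.dumps({"input": "Hello", "model": "default"}),
)
print(json.loads(emb["Body"].read()))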
95 changes: 95 additions & 0 deletions huggingface/llamacpp/build_artifacts/sagemaker_entrypoint.sh
@@ -0,0 +1,95 @@
#!/bin/bash
set -euo pipefail

# Run the telemetry script if it exists, suppressing any errors
bash /usr/local/bin/bash_telemetry.sh >/dev/null 2>&1 || true

# Source CUDA compat for older drivers (e.g., g5 instances)
if command -v nvidia-smi >/dev/null 2>&1 && command -v nvcc >/dev/null 2>&1; then
    source /usr/local/bin/start_cuda_compat.sh
fi

# SageMaker sends traffic to port 8080 on /ping and /invocations. llama-server
# listens on a loopback-only port; a small Python proxy (llamacpp_sagemaker_serve)
# binds 8080 and forwards to llama-server, similar to vLLM-Omni middleware.
INTERNAL_HOST="${LLAMACPP_SAGEMAKER_INTERNAL_HOST:-127.0.0.1}"
INTERNAL_PORT="${LLAMACPP_SAGEMAKER_INTERNAL_PORT:-8081}"
PROXY_PORT="${LLAMACPP_SAGEMAKER_PROXY_PORT:-8080}"
export LLAMACPP_SAGEMAKER_BACKEND_URL="${LLAMACPP_SAGEMAKER_BACKEND_URL:-http://${INTERNAL_HOST}:${INTERNAL_PORT}}"

PREFIX="SM_LLAMACPP_"
ARG_PREFIX="--"

ARGS=()

while IFS='=' read -r key value; do
    arg_name=$(echo "${key#"${PREFIX}"}" | tr '[:upper:]' '[:lower:]' | tr '_' '-')

    ARGS+=("${ARG_PREFIX}${arg_name}")
    if [ -n "$value" ]; then
        ARGS+=("$value")
    fi
done < <(env | grep "^${PREFIX}" || true)
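# Hypothetical example of the mapping above:
#   SM_LLAMACPP_CTX_SIZE=4096  -> --ctx-size 4096
#   SM_LLAMACPP_FLASH_ATTN=""  -> --flash-attn (flag only, since the value is empty)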

# Drop any user-supplied --host / --port so inference stays on the internal bind.
normalized=()
skip_next=0
for a in "${ARGS[@]}"; do
    if [ "$skip_next" -eq 1 ]; then
        skip_next=0
        continue
    fi
    if [ "$a" = "--host" ] || [ "$a" = "--port" ]; then
        skip_next=1
        continue
    fi
    normalized+=("$a")
done
ARGS=("${normalized[@]}")
ARGS+=(--host "$INTERNAL_HOST" --port "$INTERNAL_PORT")

echo "[sagemaker] llama-server args: ${ARGS[*]}" >&2

/app/llama-server "${ARGS[@]}" &
LLAMA_PID=$!

wait_for_llama() {
    local i
    for i in $(seq 1 120); do
        if curl -sf "http://${INTERNAL_HOST}:${INTERNAL_PORT}/health" >/dev/null 2>&1; then
            return 0
        fi
        sleep 1
    done
    return 1
}

if ! wait_for_llama; then
    echo "[sagemaker] llama-server did not become healthy on ${INTERNAL_HOST}:${INTERNAL_PORT}" >&2
    kill -TERM "$LLAMA_PID" 2>/dev/null || true
    wait "$LLAMA_PID" 2>/dev/null || true
    exit 1
fi

shutdown() {
    kill -TERM "$UVICORN_PID" 2>/dev/null || true
    kill -TERM "$LLAMA_PID" 2>/dev/null || true
    wait "$UVICORN_PID" 2>/dev/null || true
    wait "$LLAMA_PID" 2>/dev/null || true
}

trap shutdown SIGTERM SIGINT

if [ -n "${PYTHONPATH:-}" ]; then
    export PYTHONPATH="${PYTHONPATH}:/usr/local/lib/llamacpp_sagemaker"
else
    export PYTHONPATH="/usr/local/lib/llamacpp_sagemaker"
fi
python3 -m uvicorn llamacpp_sagemaker_serve:app --host 0.0.0.0 --port "$PROXY_PORT" --log-level info &
UVICORN_PID=$!

wait "$UVICORN_PID"
exit_code=$?
shutdown
exit "$exit_code"
25 changes: 25 additions & 0 deletions huggingface/llamacpp/build_artifacts/start_cuda_compat.sh
@@ -0,0 +1,25 @@
#!/bin/bash

verlte() {
    # Succeeds (returns 0) only when the first version sorts strictly before the second.
    [ "$1" = "$2" ] && return 1 || [ "$1" = "$(echo -e "$1\n$2" | sort -V | head -n1)" ]
}

COMPAT_FILE=/usr/local/cuda/compat/libcuda.so.1
if [ -f $COMPAT_FILE ]; then
    CUDA_COMPAT_MAX_DRIVER_VERSION=$(readlink $COMPAT_FILE | cut -d'.' -f 3-)
    echo "CUDA compat package should be installed for NVIDIA driver smaller than ${CUDA_COMPAT_MAX_DRIVER_VERSION}"
    NVIDIA_DRIVER_VERSION=$(sed -n 's/^NVRM.*Kernel Module *\([0-9.]*\).*$/\1/p' /proc/driver/nvidia/version 2>/dev/null || true)
    if [ -z "$NVIDIA_DRIVER_VERSION" ]; then
        NVIDIA_DRIVER_VERSION=$(nvidia-smi --query-gpu=driver_version --format=csv,noheader --id=0 2>/dev/null || true)
    fi
    echo "Current installed NVIDIA driver version is ${NVIDIA_DRIVER_VERSION}"
    if verlte $NVIDIA_DRIVER_VERSION $CUDA_COMPAT_MAX_DRIVER_VERSION; then
        echo "Adding CUDA compat to LD_LIBRARY_PATH"
        export LD_LIBRARY_PATH=/usr/local/cuda/compat:$LD_LIBRARY_PATH
        echo $LD_LIBRARY_PATH
    else
        echo "Skipping CUDA compat setup as newer NVIDIA driver is installed"
    fi
else
    echo "Skipping CUDA compat setup as package not found"
fi