11 changes: 7 additions & 4 deletions dlc_developer_config.toml
@@ -1,6 +1,6 @@
[dev]
# Set to "huggingface", for example, if you are a huggingface developer. Default is ""
partner_developer = ""
partner_developer = "huggingface"
# Please only set it to true if you are preparing an EI related PR
# Do remember to revert it back to false before merging any PR (including EI dedicated PR)
ei_mode = false
@@ -36,12 +36,12 @@ deep_canary_mode = false

[build]
# Add in frameworks you would like to build. By default, builds are disabled unless you specify building an image.
# available frameworks - ["base", "vllm", "sglang", "autogluon", "huggingface_vllm", "huggingface_sglang", "huggingface_tensorflow", "huggingface_pytorch", "huggingface_tensorflow_trcomp", "huggingface_pytorch_trcomp", "pytorch_trcomp", "tensorflow", "pytorch", "stabilityai_pytorch"]
build_frameworks = []
# available frameworks - ["base", "vllm", "sglang", "autogluon", "huggingface_vllm", "huggingface_sglang", "huggingface_llamacpp", "huggingface_tensorflow", "huggingface_pytorch", "huggingface_tensorflow_trcomp", "huggingface_pytorch_trcomp", "pytorch_trcomp", "tensorflow", "pytorch", "stabilityai_pytorch"]
build_frameworks = ["huggingface_llamacpp"]


# By default we build both training and inference containers. Set true/false values to determine which to build.
build_training = true
build_training = false
build_inference = true

# Set do_build to "false" to skip builds and test the latest image built by this PR
@@ -192,5 +192,8 @@ dlc-pr-huggingface-vllm = ""
# HuggingFace SGLang
dlc-pr-huggingface-sglang = ""

# HuggingFace Llamacpp
dlc-pr-huggingface-llamacpp = "/huggingface/llamacpp/buildspec.yml"

# sglang
dlc-pr-sglang = ""
211 changes: 211 additions & 0 deletions huggingface/llamacpp/build_artifacts/llamacpp_sagemaker_serve.py
@@ -0,0 +1,211 @@
# Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved.
# SPDX-License-Identifier: Apache-2.0
"""SageMaker HTTP proxy for llama.cpp llama-server.

SageMaker invokes POST /invocations and GET /ping on port 8080. llama-server
speaks OpenAI-style routes (e.g. /v1/chat/completions) and does not expose
/invocations.

Behavior mirrors scripts/vllm/omni_sagemaker_serve.py routing:

- GET /ping is proxied to GET {backend}/health.
- POST /invocations: if ``X-Amzn-SageMaker-Custom-Attributes`` contains
``route=/some/path``, the request is forwarded to that path on llama-server.
Otherwise the target path is inferred from the JSON body (messages ->
/v1/chat/completions, prompt -> /v1/completions, input+model -> /v1/embeddings),
defaulting to /v1/chat/completions.

For routes that require multipart/form-data (parity with vLLM-Omni), JSON bodies
are converted when ``route=`` targets those paths.

Environment:

- LLAMACPP_SAGEMAKER_BACKEND_URL: upstream base URL (default http://127.0.0.1:8081)
"""

from __future__ import annotations

import json
import logging
import os
import re
import uuid
from collections.abc import AsyncIterator

import httpx
from starlette.applications import Starlette
from starlette.requests import Request
from starlette.responses import Response, StreamingResponse
from starlette.routing import Route

logger = logging.getLogger("llamacpp_sagemaker")

BACKEND = os.environ.get("LLAMACPP_SAGEMAKER_BACKEND_URL", "http://127.0.0.1:8081").rstrip("/")

FORM_DATA_ROUTES = frozenset({"/v1/videos", "/v1/videos/sync"})

_HOP_BY_HOP = frozenset(
    {
        "connection",
        "keep-alive",
        "proxy-authenticate",
        "proxy-authorization",
        "te",
        "trailers",
        "transfer-encoding",
        "upgrade",
        "host",
        "content-length",
    }
)

_RESP_DROP = frozenset({"transfer-encoding", "content-length", "connection"})


def _parse_route_from_header(raw: str | None) -> str | None:
    if not raw:
        return None
    m = re.search(r"route=(/[^\s,]+)", raw)
    return m.group(1) if m else None


def _parse_route(request: Request) -> str | None:
    h = request.headers
    v = h.get("x-amzn-sagemaker-custom-attributes")
    return _parse_route_from_header(v)


def _build_multipart_body(data: dict, boundary: str) -> bytes:
    parts: list[str] = []
    for key, value in data.items():
        parts.append(
            f'--{boundary}\r\nContent-Disposition: form-data; name="{key}"\r\n\r\n{value}\r\n'
        )
    parts.append(f"--{boundary}--\r\n")
    return "".join(parts).encode()


def _default_path_for_invocation(content_type: str, body: bytes) -> str:
    ct = (content_type or "").lower()
    if "json" not in ct:
        return "/v1/chat/completions"
    try:
        data = json.loads(body)
    except (json.JSONDecodeError, UnicodeDecodeError):
        return "/v1/chat/completions"
    if not isinstance(data, dict):
        return "/v1/chat/completions"
    if "messages" in data:
        return "/v1/chat/completions"
    if "prompt" in data:
        return "/v1/completions"
    if "input" in data and "model" in data:
        return "/v1/embeddings"
    return "/v1/chat/completions"


def _forward_request_headers(request: Request, body_len: int, content_type: str | None) -> dict[str, str]:
    out: dict[str, str] = {}
    for key, value in request.headers.items():
        lk = key.lower()
        if lk in _HOP_BY_HOP or lk == "x-amzn-sagemaker-custom-attributes":
            continue
        out[key] = value
    out["content-length"] = str(body_len)
    if content_type is not None:
        out["content-type"] = content_type
    return out


def _response_headers_from_httpx(resp: httpx.Response) -> dict[str, str]:
    h: dict[str, str] = {}
    for key, value in resp.headers.items():
        lk = key.lower()
        if lk in _RESP_DROP:
            continue
        h[key] = value
    return h


async def ping(request: Request) -> Response:
    url = f"{BACKEND}/health"
    try:
        async with httpx.AsyncClient(timeout=httpx.Timeout(10.0, connect=2.0)) as client:
            r = await client.get(url)
    except httpx.RequestError as e:
        logger.warning("Backend health request failed: %s", e)
        return Response(status_code=503, content=b'{"error":"backend_unavailable"}')
    return Response(
        status_code=r.status_code,
        content=r.content,
        headers=_response_headers_from_httpx(r),
    )


async def invocations(request: Request) -> Response:
    if request.method != "POST":
        return Response(status_code=405, content=b"Method Not Allowed")

    body = await request.body()
    route = _parse_route(request)
    content_type = request.headers.get("content-type")

    if route:
        target = route
        logger.info("Rerouting /invocations -> %s", target)
        ct = (content_type or "").lower()
        if target in FORM_DATA_ROUTES and "json" in ct:
            try:
                data = json.loads(body)
            except (json.JSONDecodeError, UnicodeDecodeError):
                data = None
            if isinstance(data, dict):
                boundary = uuid.uuid4().hex
                body = _build_multipart_body(data, boundary)
                content_type = f"multipart/form-data; boundary={boundary}"
                logger.info("Converted JSON to form-data for %s", target)
    else:
        target = _default_path_for_invocation(content_type or "", body)
        logger.info("Inferred /invocations -> %s", target)

    url = f"{BACKEND}{target}"
    fwd_headers = _forward_request_headers(request, len(body), content_type)

    timeout = httpx.Timeout(600.0, connect=30.0)
    client = httpx.AsyncClient(timeout=timeout)
    try:
        req = client.build_request("POST", url, headers=fwd_headers, content=body)
        r = await client.send(req, stream=True)
    except httpx.RequestError as e:
        await client.aclose()
        logger.exception("Upstream request failed: %s", e)
        return Response(status_code=502, content=json.dumps({"error": "upstream_error"}).encode())

    async def stream_body() -> AsyncIterator[bytes]:
        try:
            async for chunk in r.aiter_bytes():
                yield chunk
        finally:
            await r.aclose()
            await client.aclose()

    return StreamingResponse(
        stream_body(),
        status_code=r.status_code,
        headers=_response_headers_from_httpx(r),
        media_type=r.headers.get("content-type"),
    )


routes = [
    Route("/ping", ping, methods=["GET"]),
    Route("/invocations", invocations, methods=["POST"]),
]

app = Starlette(routes=routes)

logging.basicConfig(
    level=logging.INFO,
    format="%(asctime)s %(levelname)s %(name)s %(message)s",
    force=True,
)
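
The routing rules described in the module docstring can be exercised from the client side. The following is a minimal, hypothetical sketch (the endpoint name, payloads, and model identifier are assumptions, not part of this PR): a plain JSON body with "messages" is inferred as /v1/chat/completions, while a route= custom attribute forwards the request to an explicit llama-server path.

# Hypothetical client-side sketch of the /invocations routing behavior.
import json

import boto3

smr = boto3.client("sagemaker-runtime")

# No route= attribute: the proxy infers /v1/chat/completions from "messages".
chat = smr.invoke_endpoint(
    EndpointName="llamacpp-endpoint",  # hypothetical endpoint name
    ContentType="application/json",
    Body=json.dumps({"messages": [{"role": "user", "content": "Hello"}]}),
)
print(json.loads(chat["Body"].read()))

# Explicit route= attribute: the proxy forwards to that llama-server path.
emb = smr.invoke_endpoint(
    EndpointName="llamacpp-endpoint",
    ContentType="application/json",
    CustomAttributes="route=/v1/embeddings",
    Body=json.dumps({"input": "Hello", "model": "default"}),
)
print(json.loads(emb["Body"].read()))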
95 changes: 95 additions & 0 deletions huggingface/llamacpp/build_artifacts/sagemaker_entrypoint.sh
@@ -0,0 +1,95 @@
#!/bin/bash
set -euo pipefail

# Run the telemetry script if it exists, suppressing any errors
bash /usr/local/bin/bash_telemetry.sh >/dev/null 2>&1 || true

# Source CUDA compat for older drivers (e.g., g5 instances)
if command -v nvidia-smi >/dev/null 2>&1 && command -v nvcc >/dev/null 2>&1; then
    source /usr/local/bin/start_cuda_compat.sh
fi

# SageMaker sends traffic to port 8080 on /ping and /invocations. llama-server
# listens on a loopback-only port; a small Python proxy (llamacpp_sagemaker_serve)
# binds 8080 and forwards to llama-server, similar to vLLM-Omni middleware.
INTERNAL_HOST="${LLAMACPP_SAGEMAKER_INTERNAL_HOST:-127.0.0.1}"
INTERNAL_PORT="${LLAMACPP_SAGEMAKER_INTERNAL_PORT:-8081}"
PROXY_PORT="${LLAMACPP_SAGEMAKER_PROXY_PORT:-8080}"
export LLAMACPP_SAGEMAKER_BACKEND_URL="${LLAMACPP_SAGEMAKER_BACKEND_URL:-http://${INTERNAL_HOST}:${INTERNAL_PORT}}"

PREFIX="SM_LLAMACPP_"
ARG_PREFIX="--"

ARGS=()

while IFS='=' read -r key value; do
    arg_name=$(echo "${key#"${PREFIX}"}" | tr '[:upper:]' '[:lower:]' | tr '_' '-')

    ARGS+=("${ARG_PREFIX}${arg_name}")
    if [ -n "$value" ]; then
        ARGS+=("$value")
    fi
done < <(env | grep "^${PREFIX}" || true)
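# Hypothetical example of the mapping above:
#   SM_LLAMACPP_CTX_SIZE=4096  -> --ctx-size 4096
#   SM_LLAMACPP_FLASH_ATTN=""  -> --flash-attn (flag only, since the value is empty)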

# Drop any user-supplied --host / --port so inference stays on the internal bind.
normalized=()
skip_next=0
for a in "${ARGS[@]}"; do
    if [ "$skip_next" -eq 1 ]; then
        skip_next=0
        continue
    fi
    if [ "$a" = "--host" ] || [ "$a" = "--port" ]; then
        skip_next=1
        continue
    fi
    normalized+=("$a")
done
ARGS=("${normalized[@]}")
ARGS+=(--host "$INTERNAL_HOST" --port "$INTERNAL_PORT")

echo "[sagemaker] llama-server args: ${ARGS[*]}" >&2

/app/llama-server "${ARGS[@]}" &
LLAMA_PID=$!

wait_for_llama() {
    local i
    for i in $(seq 1 120); do
        if curl -sf "http://${INTERNAL_HOST}:${INTERNAL_PORT}/health" >/dev/null 2>&1; then
            return 0
        fi
        sleep 1
    done
    return 1
}

if ! wait_for_llama; then
    echo "[sagemaker] llama-server did not become healthy on ${INTERNAL_HOST}:${INTERNAL_PORT}" >&2
    kill -TERM "$LLAMA_PID" 2>/dev/null || true
    wait "$LLAMA_PID" 2>/dev/null || true
    exit 1
fi

shutdown() {
    kill -TERM "$UVICORN_PID" 2>/dev/null || true
    kill -TERM "$LLAMA_PID" 2>/dev/null || true
    wait "$UVICORN_PID" 2>/dev/null || true
    wait "$LLAMA_PID" 2>/dev/null || true
}

trap shutdown SIGTERM SIGINT

if [ -n "${PYTHONPATH:-}" ]; then
    export PYTHONPATH="${PYTHONPATH}:/usr/local/lib/llamacpp_sagemaker"
else
    export PYTHONPATH="/usr/local/lib/llamacpp_sagemaker"
fi
python3 -m uvicorn llamacpp_sagemaker_serve:app --host 0.0.0.0 --port "$PROXY_PORT" --log-level info &
UVICORN_PID=$!

wait "$UVICORN_PID"
exit_code=$?
shutdown
exit "$exit_code"
25 changes: 25 additions & 0 deletions huggingface/llamacpp/build_artifacts/start_cuda_compat.sh
@@ -0,0 +1,25 @@
#!/bin/bash

verlte() {
    # Succeeds (returns 0) only when the first version sorts strictly before the second.
    [ "$1" = "$2" ] && return 1 || [ "$1" = "$(echo -e "$1\n$2" | sort -V | head -n1)" ]
}

COMPAT_FILE=/usr/local/cuda/compat/libcuda.so.1
if [ -f $COMPAT_FILE ]; then
    CUDA_COMPAT_MAX_DRIVER_VERSION=$(readlink $COMPAT_FILE | cut -d'.' -f 3-)
    echo "CUDA compat package should be installed for NVIDIA driver smaller than ${CUDA_COMPAT_MAX_DRIVER_VERSION}"
    NVIDIA_DRIVER_VERSION=$(sed -n 's/^NVRM.*Kernel Module *\([0-9.]*\).*$/\1/p' /proc/driver/nvidia/version 2>/dev/null || true)
    if [ -z "$NVIDIA_DRIVER_VERSION" ]; then
        NVIDIA_DRIVER_VERSION=$(nvidia-smi --query-gpu=driver_version --format=csv,noheader --id=0 2>/dev/null || true)
    fi
    echo "Current installed NVIDIA driver version is ${NVIDIA_DRIVER_VERSION}"
    if verlte $NVIDIA_DRIVER_VERSION $CUDA_COMPAT_MAX_DRIVER_VERSION; then
        echo "Adding CUDA compat to LD_LIBRARY_PATH"
        export LD_LIBRARY_PATH=/usr/local/cuda/compat:$LD_LIBRARY_PATH
        echo $LD_LIBRARY_PATH
    else
        echo "Skipping CUDA compat setup as newer NVIDIA driver is installed"
    fi
else
    echo "Skipping CUDA compat setup as package not found"
fi