diff --git a/dlc_developer_config.toml b/dlc_developer_config.toml index 89b740a5e315..60593de299b7 100644 --- a/dlc_developer_config.toml +++ b/dlc_developer_config.toml @@ -1,6 +1,6 @@ [dev] # Set to "huggingface", for example, if you are a huggingface developer. Default is "" -partner_developer = "" +partner_developer = "huggingface" # Please only set it to true if you are preparing an EI related PR # Do remember to revert it back to false before merging any PR (including EI dedicated PR) ei_mode = false @@ -36,12 +36,12 @@ deep_canary_mode = false [build] # Add in frameworks you would like to build. By default, builds are disabled unless you specify building an image. -# available frameworks - ["base", "vllm", "sglang", "autogluon", "huggingface_vllm", "huggingface_sglang", "huggingface_tensorflow", "huggingface_pytorch", "huggingface_tensorflow_trcomp", "huggingface_pytorch_trcomp", "pytorch_trcomp", "tensorflow", "pytorch", "stabilityai_pytorch"] -build_frameworks = [] +# available frameworks - ["base", "vllm", "sglang", "autogluon", "huggingface_vllm", "huggingface_sglang", "huggingface_llamacpp", "huggingface_tensorflow", "huggingface_pytorch", "huggingface_tensorflow_trcomp", "huggingface_pytorch_trcomp", "pytorch_trcomp", "tensorflow", "pytorch", "stabilityai_pytorch"] +build_frameworks = ["huggingface_llamacpp"] # By default we build both training and inference containers. Set true/false values to determine which to build. -build_training = true +build_training = false build_inference = true # Set do_build to "false" to skip builds and test the latest image built by this PR @@ -192,5 +192,8 @@ dlc-pr-huggingface-vllm = "" # HuggingFace SGLang dlc-pr-huggingface-sglang = "" +# Huggingface Llamacpp +dlc-pr-huggingface-llamacpp = "/huggingface/llamacpp/buildspec.yml" + # sglang dlc-pr-sglang = "" diff --git a/huggingface/llamacpp/build_artifacts/llamacpp_sagemaker_server.patch b/huggingface/llamacpp/build_artifacts/llamacpp_sagemaker_server.patch new file mode 100644 index 000000000000..a8491c93a26a --- /dev/null +++ b/huggingface/llamacpp/build_artifacts/llamacpp_sagemaker_server.patch @@ -0,0 +1,133 @@ +--- a/tools/server/server.cpp ++++ b/tools/server/server.cpp +@@ -11,7 +11,9 @@ + #include "llama.h" + #include "log.h" + ++#include + #include ++#include + #include + #include + #include +@@ -69,6 +71,81 @@ + } + return res; + }; ++} ++ ++static std::string sagemaker_header(const server_http_req & req, const std::string & name) { ++ for (const auto & h : req.headers) { ++ std::string key = h.first; ++ std::transform(key.begin(), key.end(), key.begin(), [](unsigned char c) { return std::tolower(c); }); ++ if (key == name) { ++ return h.second; ++ } ++ } ++ return ""; ++} ++ ++static std::string sagemaker_route_from_attrs(const server_http_req & req) { ++ const std::string attrs = sagemaker_header(req, "x-amzn-sagemaker-custom-attributes"); ++ const std::string key = "route="; ++ const size_t pos = attrs.find(key); ++ if (pos == std::string::npos) { ++ return ""; ++ } ++ const size_t start = pos + key.size(); ++ const size_t end = attrs.find_first_of(",; \t\r\n", start); ++ return attrs.substr(start, end == std::string::npos ? 
std::string::npos : end - start); ++} ++ ++static bool sagemaker_route_syntax_ok(const std::string & route) { ++ return !route.empty() && route[0] == '/' && route.find("..") == std::string::npos && ++ route.find("://") == std::string::npos && route.find('?') == std::string::npos && ++ route.find('#') == std::string::npos; ++} ++ ++static std::string sagemaker_default_route(const server_http_req & req) { ++ const json body = json::parse(req.body, nullptr, false); ++ if (body.is_object()) { ++ if (body.contains("messages")) { ++ return "/v1/chat/completions"; ++ } ++ if (body.contains("prompt")) { ++ return "/v1/completions"; ++ } ++ if (body.contains("input")) { ++ return "/v1/embeddings"; ++ } ++ } ++ return "/v1/chat/completions"; ++} ++ ++static server_http_res_ptr sagemaker_error(int status, const std::string & message) { ++ auto res = std::make_unique(); ++ res->status = status; ++ res->data = safe_json_to_str({ ++ { "error", { ++ { "code", status }, ++ { "message", message }, ++ { "type", "invalid_request_error" }, ++ } }, ++ }); ++ return res; ++} ++ ++static server_http_res_ptr sagemaker_invocations( ++ const server_http_req & req, ++ const std::map & routes) { ++ const std::string requested = sagemaker_route_from_attrs(req); ++ const std::string route = requested.empty() ? sagemaker_default_route(req) : requested; ++ if (!sagemaker_route_syntax_ok(route)) { ++ return sagemaker_error(400, "invalid SageMaker route: " + route); ++ } ++ const auto it = routes.find(route); ++ if (it == routes.end()) { ++ return sagemaker_error(400, "unsupported SageMaker route: " + route); ++ } ++ server_http_req routed_req = req; ++ routed_req.path = route; ++ return it->second(routed_req); + } + + int main(int argc, char ** argv) { +@@ -169,6 +246,38 @@ + ctx_http.post("/models/unload", ex_wrapper(models_routes->post_router_models_unload)); + } + ++ ++ const std::map sagemaker_routes = { ++ {"/props", routes.post_props}, ++ {"/completion", routes.post_completions}, ++ {"/completions", routes.post_completions}, ++ {"/v1/completions", routes.post_completions_oai}, ++ {"/chat/completions", routes.post_chat_completions}, ++ {"/v1/chat/completions", routes.post_chat_completions}, ++ {"/v1/responses", routes.post_responses_oai}, ++ {"/responses", routes.post_responses_oai}, ++ {"/v1/audio/transcriptions", routes.post_transcriptions_oai}, ++ {"/audio/transcriptions", routes.post_transcriptions_oai}, ++ {"/v1/messages", routes.post_anthropic_messages}, ++ {"/v1/messages/count_tokens", routes.post_anthropic_count_tokens}, ++ {"/infill", routes.post_infill}, ++ {"/embedding", routes.post_embeddings}, ++ {"/embeddings", routes.post_embeddings}, ++ {"/v1/embeddings", routes.post_embeddings_oai}, ++ {"/rerank", routes.post_rerank}, ++ {"/reranking", routes.post_rerank}, ++ {"/v1/rerank", routes.post_rerank}, ++ {"/v1/reranking", routes.post_rerank}, ++ {"/tokenize", routes.post_tokenize}, ++ {"/detokenize", routes.post_detokenize}, ++ {"/apply-template", routes.post_apply_template}, ++ {"/lora-adapters", routes.post_lora_adapters}, ++ }; ++ ++ ctx_http.get ("/ping", ex_wrapper(routes.get_health)); // SageMaker health endpoint ++ ctx_http.post("/invocations", ex_wrapper([&sagemaker_routes](const server_http_req & req) { ++ return sagemaker_invocations(req, sagemaker_routes); ++ })); + ctx_http.get ("/health", ex_wrapper(routes.get_health)); // public endpoint (no API key check) + ctx_http.get ("/v1/health", ex_wrapper(routes.get_health)); // public endpoint (no API key check) + ctx_http.get ("/metrics", 
ex_wrapper(routes.get_metrics)); diff --git a/huggingface/llamacpp/build_artifacts/sagemaker_entrypoint.sh b/huggingface/llamacpp/build_artifacts/sagemaker_entrypoint.sh new file mode 100644 index 000000000000..1d06097ef569 --- /dev/null +++ b/huggingface/llamacpp/build_artifacts/sagemaker_entrypoint.sh @@ -0,0 +1,53 @@ +#!/bin/bash +set -euo pipefail + +# Check if telemetry file exists before executing +# Execute telemetry script if it exists, suppress errors +bash /usr/local/bin/bash_telemetry.sh >/dev/null 2>&1 || true + +# Source CUDA compat for older drivers (e.g., g5 instances) +if [ -f /usr/local/bin/start_cuda_compat.sh ] \ + && command -v nvidia-smi >/dev/null 2>&1 \ + && command -v nvcc >/dev/null 2>&1; then + source /usr/local/bin/start_cuda_compat.sh +fi + +# SageMaker sends traffic to port 8080 on /ping and /invocations. The custom +# llama-server build handles those routes directly. +HOST="${LLAMACPP_SAGEMAKER_HOST:-0.0.0.0}" +PORT="${SAGEMAKER_BIND_TO_PORT:-${LLAMACPP_SAGEMAKER_PORT:-8080}}" + +PREFIX="SM_LLAMACPP_" +ARG_PREFIX="--" + +ARGS=() + +while IFS='=' read -r key value; do + arg_name=$(echo "${key#"${PREFIX}"}" | tr '[:upper:]' '[:lower:]' | tr '_' '-') + + ARGS+=("${ARG_PREFIX}${arg_name}") + if [ -n "$value" ]; then + ARGS+=("$value") + fi +done < <(env | grep "^${PREFIX}" || true) + +# Drop any user-supplied --host / --port so SageMaker can always reach the server. +normalized=() +skip_next=0 +for a in "${ARGS[@]}"; do + if [ "$skip_next" -eq 1 ]; then + skip_next=0 + continue + fi + if [ "$a" = "--host" ] || [ "$a" = "--port" ]; then + skip_next=1 + continue + fi + normalized+=("$a") +done +ARGS=("${normalized[@]}") +ARGS+=(--host "$HOST" --port "$PORT") + +echo "[sagemaker] llama-server args: ${ARGS[*]}" >&2 + +exec /app/llama-server "${ARGS[@]}" diff --git a/huggingface/llamacpp/build_artifacts/start_cuda_compat.sh b/huggingface/llamacpp/build_artifacts/start_cuda_compat.sh new file mode 100644 index 000000000000..791d355c5abe --- /dev/null +++ b/huggingface/llamacpp/build_artifacts/start_cuda_compat.sh @@ -0,0 +1,25 @@ +#!/bin/bash + +verlte() { + [ "$1" = "$2" ] && return 1 || [ "$1" = "$(echo -e "$1\n$2" | sort -V | head -n1)" ] +} + +COMPAT_FILE=/usr/local/cuda/compat/libcuda.so.1 +if [ -f $COMPAT_FILE ]; then + CUDA_COMPAT_MAX_DRIVER_VERSION=$(readlink $COMPAT_FILE | cut -d'.' 
-f 3-) + echo "CUDA compat package should be installed for NVIDIA driver smaller than ${CUDA_COMPAT_MAX_DRIVER_VERSION}" + NVIDIA_DRIVER_VERSION=$(sed -n 's/^NVRM.*Kernel Module *\([0-9.]*\).*$/\1/p' /proc/driver/nvidia/version 2>/dev/null || true) + if [ -z "$NVIDIA_DRIVER_VERSION" ]; then + NVIDIA_DRIVER_VERSION=$(nvidia-smi --query-gpu=driver_version --format=csv,noheader --id=0 2>/dev/null || true) + fi + echo "Current installed NVIDIA driver version is ${NVIDIA_DRIVER_VERSION}" + if verlte $NVIDIA_DRIVER_VERSION $CUDA_COMPAT_MAX_DRIVER_VERSION; then + echo "Adding CUDA compat to LD_LIBRARY_PATH" + export LD_LIBRARY_PATH=/usr/local/cuda/compat:$LD_LIBRARY_PATH + echo $LD_LIBRARY_PATH + else + echo "Skipping CUDA compat setup as newer NVIDIA driver is installed" + fi +else + echo "Skipping CUDA compat setup as package not found" +fi diff --git a/huggingface/llamacpp/buildspec.yml b/huggingface/llamacpp/buildspec.yml new file mode 100644 index 000000000000..8b05831cbadf --- /dev/null +++ b/huggingface/llamacpp/buildspec.yml @@ -0,0 +1,78 @@ +account_id: &ACCOUNT_ID +prod_account_id: &PROD_ACCOUNT_ID 763104351884 +region: ®ION +base_framework: &BASE_FRAMEWORK llamacpp +framework: &FRAMEWORK !join [ "huggingface_", *BASE_FRAMEWORK] +version: &VERSION "b8882" +short_version: &SHORT_VERSION "b8882" +arch_type: &ARCH_TYPE x86_64 +autopatch_build: "False" + +repository_info: + build_repository: &BUILD_REPOSITORY + image_type: &IMAGE_TYPE inference + root: huggingface/llamacpp + repository_name: &REPOSITORY_NAME !join [ "pr", "-", "huggingface", "-", *BASE_FRAMEWORK ] + repository: &REPOSITORY !join [ *ACCOUNT_ID, .dkr.ecr., *REGION, .amazonaws.com/, *REPOSITORY_NAME ] + release_repository_name: &RELEASE_REPOSITORY_NAME !join [ "huggingface", "-", *BASE_FRAMEWORK ] + release_repository: &RELEASE_REPOSITORY !join [ *PROD_ACCOUNT_ID, .dkr.ecr., *REGION, .amazonaws.com/, *RELEASE_REPOSITORY_NAME ] + +context: + build_context: &BUILD_CONTEXT + deep_learning_container: + source: ../../src/deep_learning_container.py + target: deep_learning_container.py + start_cuda_compat: + source: build_artifacts/start_cuda_compat.sh + target: start_cuda_compat.sh + sagemaker_entrypoint: + source: build_artifacts/sagemaker_entrypoint.sh + target: sagemaker_entrypoint.sh + llamacpp_sagemaker_server_patch: + source: build_artifacts/llamacpp_sagemaker_server.patch + target: llamacpp_sagemaker_server.patch + + +images: + BuildHuggingFaceLlamacppGpuCu130DockerImage: + <<: *BUILD_REPOSITORY + context: + <<: *BUILD_CONTEXT + image_size_baseline: 40000 + device_type: &DEVICE_TYPE gpu + cuda_version: &CUDA_VERSION cu130 + os_version: &OS_VERSION ubuntu24.04 + python_version: &DOCKER_PYTHON_VERSION py3 + tag_python_version: &TAG_PYTHON_VERSION py312 + transformers_version: &TRANSFORMERS_VERSION 4.57.3 + tag: !join [ *VERSION, '-', 'transformers', *TRANSFORMERS_VERSION, '-', *DEVICE_TYPE, '-', *CUDA_VERSION, '-', *OS_VERSION ] + docker_file: !join [ docker/, *SHORT_VERSION, /, *CUDA_VERSION, /Dockerfile., *DEVICE_TYPE ] + target: sagemaker + build: true + enable_common_stage_build: false + test_configs: + test_platforms: + - sanity + - security + - sagemaker + + BuildHuggingFaceLlamacppCpuDockerImage: + <<: *BUILD_REPOSITORY + context: + <<: *BUILD_CONTEXT + image_size_baseline: 40000 + device_type: &DEVICE_TYPE cpu + os_version: &OS_VERSION ubuntu24.04 + python_version: &DOCKER_PYTHON_VERSION py3 + tag_python_version: &TAG_PYTHON_VERSION py312 + transformers_version: &TRANSFORMERS_VERSION 4.57.3 + tag: !join [ *VERSION, '-', 
'transformers', *TRANSFORMERS_VERSION, '-', *DEVICE_TYPE, '-', *OS_VERSION ] + docker_file: !join [ docker/, *SHORT_VERSION, /Dockerfile., *DEVICE_TYPE ] + target: sagemaker + build: true + enable_common_stage_build: false + test_configs: + test_platforms: + - sanity + - security + - sagemaker diff --git a/huggingface/llamacpp/docker/b8882/Dockerfile.cpu b/huggingface/llamacpp/docker/b8882/Dockerfile.cpu new file mode 100644 index 000000000000..a577306960f9 --- /dev/null +++ b/huggingface/llamacpp/docker/b8882/Dockerfile.cpu @@ -0,0 +1,91 @@ +ARG UBUNTU_VERSION=24.04 +ARG LLAMACPP_VERSION=b8882 + +FROM ubuntu:${UBUNTU_VERSION} AS build + +ARG LLAMACPP_VERSION + +RUN apt-get update \ + && DEBIAN_FRONTEND=noninteractive apt-get install -y --no-install-recommends \ + build-essential \ + ca-certificates \ + cmake \ + gcc-14 \ + g++-14 \ + git \ + libgomp1 \ + libssl-dev \ + patch \ + python3 \ + && rm -rf /var/lib/apt/lists/* + +ENV CC=gcc-14 \ + CXX=g++-14 + +WORKDIR /src/llama.cpp + +RUN git clone --branch "${LLAMACPP_VERSION}" --depth 1 https://github.com/ggml-org/llama.cpp.git . + +COPY llamacpp_sagemaker_server.patch /tmp/llamacpp_sagemaker_server.patch + +RUN patch -p1 < /tmp/llamacpp_sagemaker_server.patch +RUN cmake -B build \ + -DGGML_NATIVE=OFF \ + -DGGML_BACKEND_DL=ON \ + -DGGML_CPU_ALL_VARIANTS=ON \ + -DLLAMA_BUILD_TESTS=OFF \ + . \ + && cmake --build build --config Release -j"$(nproc)" --target llama-server + +RUN mkdir -p /app/lib \ + && find build -name "*.so*" -exec cp -P {} /app/lib \; \ + && cp build/bin/llama-server /app/llama-server + +FROM ubuntu:${UBUNTU_VERSION} AS base + +LABEL maintainer="Amazon AI" +LABEL dlc_major_version="1" + +WORKDIR /app + +ENV LD_LIBRARY_PATH=/app:${LD_LIBRARY_PATH} + +RUN apt-get update \ + && DEBIAN_FRONTEND=noninteractive apt-get install -y --no-install-recommends \ + ca-certificates \ + curl \ + libgomp1 \ + && apt-get autoremove -y \ + && apt-get clean -y \ + && rm -rf /tmp/* /var/tmp/* \ + && find /var/cache/apt/archives /var/lib/apt/lists -not -name lock -type f -delete \ + && find /var/cache -type f -delete \ + && rm -rf /var/lib/apt/lists/* + +COPY --from=build /app/lib/ /app/ +COPY --from=build /app/llama-server /app/llama-server + +FROM base AS sagemaker + +COPY deep_learning_container.py /usr/local/bin/deep_learning_container.py +COPY bash_telemetry.sh /usr/local/bin/bash_telemetry.sh +COPY --chmod=0755 sagemaker_entrypoint.sh /usr/local/bin/sagemaker_entrypoint.sh + +# Fix several CVEs: +# CVE-2026-31789, CVE-2025-15467, CVE-2025-13151, CVE-2025-15281, +# CVE-2025-69419, CVE-2025-68973, CVE-2025-69421, CVE-2026-28390, +# CVE-2025-69420, CVE-2026-0915, CVE-2026-0861, CVE-2026-28388, +# CVE-2026-31790, CVE-2026-28387, CVE-2026-28389 +RUN apt-get update \ + && apt-get install -y --only-upgrade \ + libssl3t64 \ + openssl \ + libtasn1-6 \ + libc6 \ + libc-bin \ + gnupg \ + gpg \ + gpgv \ + && rm -rf /var/lib/apt/lists/* + +ENTRYPOINT ["/usr/local/bin/sagemaker_entrypoint.sh"] diff --git a/huggingface/llamacpp/docker/b8882/cu130/Dockerfile.gpu b/huggingface/llamacpp/docker/b8882/cu130/Dockerfile.gpu new file mode 100644 index 000000000000..c7110bd0b007 --- /dev/null +++ b/huggingface/llamacpp/docker/b8882/cu130/Dockerfile.gpu @@ -0,0 +1,100 @@ +ARG UBUNTU_VERSION=24.04 +ARG CUDA_VERSION=13.0.2 +ARG LLAMACPP_VERSION=b8882 +ARG BASE_CUDA_DEV_CONTAINER=nvidia/cuda:${CUDA_VERSION}-devel-ubuntu${UBUNTU_VERSION} +ARG BASE_CUDA_RUN_CONTAINER=nvidia/cuda:${CUDA_VERSION}-runtime-ubuntu${UBUNTU_VERSION} + +FROM ${BASE_CUDA_DEV_CONTAINER} AS build + 
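+# Build stage: clones the pinned llama.cpp tag, applies the SageMaker routing
+# patch, and compiles llama-server with CUDA enabled (GGML_CUDA=ON); the runtime
+# stage below copies only the resulting binary and its shared libraries.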
+ARG LLAMACPP_VERSION +ARG CUDA_DOCKER_ARCH=default + +RUN apt-get update \ + && DEBIAN_FRONTEND=noninteractive apt-get install -y --no-install-recommends \ + build-essential \ + ca-certificates \ + cmake \ + gcc-14 \ + g++-14 \ + git \ + libgomp1 \ + libssl-dev \ + patch \ + python3 \ + && rm -rf /var/lib/apt/lists/* + +ENV CC=gcc-14 \ + CXX=g++-14 \ + CUDAHOSTCXX=g++-14 + +WORKDIR /src/llama.cpp + +RUN git clone --branch "${LLAMACPP_VERSION}" --depth 1 https://github.com/ggml-org/llama.cpp.git . +COPY llamacpp_sagemaker_server.patch /tmp/llamacpp_sagemaker_server.patch +RUN patch -p1 < /tmp/llamacpp_sagemaker_server.patch +RUN if [ "${CUDA_DOCKER_ARCH}" != "default" ]; then \ + export CMAKE_ARGS="-DCMAKE_CUDA_ARCHITECTURES=${CUDA_DOCKER_ARCH}"; \ + fi \ + && cmake -B build \ + -DGGML_NATIVE=OFF \ + -DGGML_CUDA=ON \ + -DGGML_BACKEND_DL=ON \ + -DGGML_CPU_ALL_VARIANTS=ON \ + -DLLAMA_BUILD_TESTS=OFF \ + ${CMAKE_ARGS:-} \ + -DCMAKE_EXE_LINKER_FLAGS=-Wl,--allow-shlib-undefined \ + . \ + && cmake --build build --config Release -j"$(nproc)" --target llama-server + +RUN mkdir -p /app/lib \ + && find build -name "*.so*" -exec cp -P {} /app/lib \; \ + && cp build/bin/llama-server /app/llama-server + +FROM ${BASE_CUDA_RUN_CONTAINER} AS base + +LABEL maintainer="Amazon AI" +LABEL dlc_major_version="1" + +WORKDIR /app +ENV LD_LIBRARY_PATH=/app:${LD_LIBRARY_PATH} + +RUN apt-get update \ + && DEBIAN_FRONTEND=noninteractive apt-get install -y --no-install-recommends \ + ca-certificates \ + curl \ + libgomp1 \ + && apt-get autoremove -y \ + && apt-get clean -y \ + && rm -rf /var/lib/apt/lists/* \ + && rm -rf /tmp/* /var/tmp/* \ + && find /var/cache/apt/archives /var/lib/apt/lists -not -name lock -type f -delete \ + && find /var/cache -type f -delete + +COPY --from=build /app/lib/ /app/ +COPY --from=build /app/llama-server /app/llama-server + +FROM base AS sagemaker + +COPY deep_learning_container.py /usr/local/bin/deep_learning_container.py +COPY bash_telemetry.sh /usr/local/bin/bash_telemetry.sh +COPY --chmod=0755 start_cuda_compat.sh /usr/local/bin/start_cuda_compat.sh +COPY --chmod=0755 sagemaker_entrypoint.sh /usr/local/bin/sagemaker_entrypoint.sh + +# Fix several CVEs: +# CVE-2026-31789, CVE-2025-15467, CVE-2025-13151, CVE-2025-15281, +# CVE-2025-69419, CVE-2025-68973, CVE-2025-69421, CVE-2026-28390, +# CVE-2025-69420, CVE-2026-0915, CVE-2026-0861, CVE-2026-28388, +# CVE-2026-31790, CVE-2026-28387, CVE-2026-28389 +RUN apt-get update \ + && apt-get install -y --only-upgrade \ + libssl3t64 \ + openssl \ + libtasn1-6 \ + libc6 \ + libc-bin \ + gnupg \ + gpg \ + gpgv \ + && rm -rf /var/lib/apt/lists/* + +ENTRYPOINT ["/usr/local/bin/sagemaker_entrypoint.sh"] diff --git a/src/constants.py b/src/constants.py index 037414380bca..42275c5532f1 100644 --- a/src/constants.py +++ b/src/constants.py @@ -29,6 +29,7 @@ "sglang", "huggingface_vllm", "huggingface_sglang", + "huggingface_llamacpp", } DEVICE_TYPES = {"cpu", "gpu", "hpu", "eia", "inf", "neuron", "neuronx"} IMAGE_TYPES = {"training", "inference"} diff --git a/src/image_builder.py b/src/image_builder.py index f101f33fa7fb..cc401a6a5e11 100644 --- a/src/image_builder.py +++ b/src/image_builder.py @@ -686,6 +686,7 @@ def get_job_type(image_repo_uri): "base": "general", "vllm": "general", "sglang": "general", + "llamacpp": "general", } for key, job_type in job_type_mapping.items(): diff --git a/test/sagemaker_tests/huggingface/llamacpp/__init__.py b/test/sagemaker_tests/huggingface/llamacpp/__init__.py new file mode 100644 index 000000000000..199e66b95926 --- 
/dev/null +++ b/test/sagemaker_tests/huggingface/llamacpp/__init__.py @@ -0,0 +1,13 @@ +# Copyright 2019-2020 Amazon.com, Inc. or its affiliates. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"). You +# may not use this file except in compliance with the License. A copy of +# the License is located at +# +# http://aws.amazon.com/apache2.0/ +# +# or in the "license" file accompanying this file. This file is +# distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF +# ANY KIND, either express or implied. See the License for the specific +# language governing permissions and limitations under the License. +from __future__ import absolute_import diff --git a/test/sagemaker_tests/huggingface/llamacpp/conftest.py b/test/sagemaker_tests/huggingface/llamacpp/conftest.py new file mode 100644 index 000000000000..57374310db49 --- /dev/null +++ b/test/sagemaker_tests/huggingface/llamacpp/conftest.py @@ -0,0 +1,391 @@ +# Copyright 2019-2020 Amazon.com, Inc. or its affiliates. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"). You +# may not use this file except in compliance with the License. A copy of +# the License is located at +# +# http://aws.amazon.com/apache2.0/ +# +# or in the "license" file accompanying this file. This file is +# distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF +# ANY KIND, either express or implied. See the License for the specific +# language governing permissions and limitations under the License. +from __future__ import absolute_import + +import json +import logging +import os +import platform +import shutil +import sys +import tempfile + +import boto3 +import pytest + +from botocore.exceptions import ClientError +from sagemaker import LocalSession, Session +from sagemaker.pytorch import PyTorch + +from .utils import image_utils, get_ecr_registry + +NO_P4_REGIONS = [ + "af-south-1", + "ap-east-1", + "ap-northeast-3", + "ap-southeast-1", + "ap-southeast-2", + "ap-south-1", + "ca-central-1", + "eu-central-1", + "eu-north-1", + "eu-west-2", + "eu-west-3", + "eu-south-1", + "me-south-1", + "sa-east-1", + "us-west-1", + "cn-northwest-1", + "il-central-1", +] + +NO_G5_REGIONS = [ + "us-west-1", + "ca-west-1", + "mx-cental-1", + "af-south-1", + "ap-east-1", + "ap-south-2", + "ap-southeast-5", + "ap-southeast-4", + "ap-northeast-3", + "ap-southeast-1", + "ap-southeast-7", + "eu-south-1", + "eu-west-3", + "eu-south-2", + "eu-central-2", + "me-south-1", +] + + +logger = logging.getLogger(__name__) +logging.getLogger("boto").setLevel(logging.INFO) +logging.getLogger("boto3").setLevel(logging.INFO) +logging.getLogger("botocore").setLevel(logging.INFO) +logging.getLogger("factory.py").setLevel(logging.INFO) +logging.getLogger("auth.py").setLevel(logging.INFO) +logging.getLogger("connectionpool.py").setLevel(logging.INFO) + + +dir_path = os.path.dirname(os.path.realpath(__file__)) + + +def pytest_addoption(parser): + parser.addoption("--build-image", "-D", action="store_true") + parser.addoption("--build-base-image", "-B", action="store_true") + parser.addoption("--aws-id") + parser.addoption("--instance-type") + parser.addoption("--accelerator-type", default=None) + parser.addoption("--docker-base-name", default="huggingface_llamacpp") + parser.addoption("--region", default="us-west-2") + parser.addoption("--framework-version", default="") + parser.addoption( + "--py-version", + choices=["2", "3", "37", "38", "39", "310", "311", "312"], + 
default=str(sys.version_info.major), + ) + # Processor is still "cpu" for EIA tests + parser.addoption( + "--processor", choices=["gpu", "cpu", "eia", "neuron", "neuronx"], default="cpu" + ) + # If not specified, will default to {framework-version}-{processor}-py{py-version} + parser.addoption("--tag", default=None) + parser.addoption( + "--generate-coverage-doc", + default=False, + action="store_true", + help="use this option to generate test coverage doc", + ) + parser.addoption( + "--efa", + action="store_true", + default=False, + help="Run only efa tests", + ) + parser.addoption("--sagemaker-regions", default="us-west-2") + + +def pytest_configure(config): + config.addinivalue_line("markers", "efa(): explicitly mark to run efa tests") + + +def pytest_runtest_setup(item): + if item.config.getoption("--efa"): + efa_tests = [mark for mark in item.iter_markers(name="efa")] + if not efa_tests: + pytest.skip("Skipping non-efa tests") + + +def pytest_collection_modifyitems(session, config, items): + for item in items: + print(f"item {item}") + for marker in item.iter_markers(name="team"): + print(f"item {marker}") + team_name = marker.args[0] + item.user_properties.append(("team_marker", team_name)) + print(f"item.user_properties {item.user_properties}") + + if config.getoption("--generate-coverage-doc"): + from test.test_utils.test_reporting import TestReportGenerator + + report_generator = TestReportGenerator(items, is_sagemaker=True) + report_generator.generate_coverage_doc(framework="huggingface_llamacpp", job_type="inference") + + +@pytest.fixture(scope="session", name="docker_base_name") +def fixture_docker_base_name(request): + return request.config.getoption("--docker-base-name") + + +@pytest.fixture(scope="session", name="region") +def fixture_region(request): + return request.config.getoption("--region") + + +@pytest.fixture(scope="session", name="framework_version") +def fixture_framework_version(request): + return request.config.getoption("--framework-version") + + +@pytest.fixture(scope="session", name="py_version") +def fixture_py_version(request): + return "py{}".format(int(request.config.getoption("--py-version"))) + + +@pytest.fixture(scope="session", name="processor") +def fixture_processor(request): + return request.config.getoption("--processor") + + +@pytest.fixture(scope="session", name="tag") +def fixture_tag(request, framework_version, processor, py_version): + provided_tag = request.config.getoption("--tag") + default_tag = "{}-{}-{}".format(framework_version, processor, py_version) + return provided_tag if provided_tag else default_tag + + +@pytest.fixture(scope="session", name="docker_image") +def fixture_docker_image(docker_base_name, tag): + return "{}:{}".format(docker_base_name, tag) + + +@pytest.fixture +def opt_ml(): + tmp = tempfile.mkdtemp() + os.mkdir(os.path.join(tmp, "output")) + + # Docker cannot mount Mac OS /var folder properly see + # https://forums.docker.com/t/var-folders-isnt-mounted-properly/9600 + opt_ml_dir = "/private{}".format(tmp) if platform.system() == "Darwin" else tmp + yield opt_ml_dir + + shutil.rmtree(tmp, True) + + +@pytest.fixture(scope="session", name="use_gpu") +def fixture_use_gpu(processor): + return processor == "gpu" + + +@pytest.fixture(scope="session", name="build_base_image", autouse=True) +def fixture_build_base_image( + request, framework_version, py_version, processor, tag, docker_base_name +): + build_base_image = request.config.getoption("--build-base-image") + if build_base_image: + return image_utils.build_base_image( 
+ framework_name=docker_base_name, + framework_version=framework_version, + py_version=py_version, + base_image_tag=tag, + processor=processor, + cwd=os.path.join(dir_path, ".."), + ) + + return tag + + +@pytest.fixture(scope="session", name="sagemaker_session") +def fixture_sagemaker_session(region): + return Session(boto_session=boto3.Session(region_name=region)) + + +@pytest.fixture(scope="session", name="sagemaker_regions") +def fixture_sagemaker_regions(request): + sagemaker_regions = request.config.getoption("--sagemaker-regions") + return sagemaker_regions.split(",") + + +@pytest.fixture(scope="session", name="sagemaker_local_session") +def fixture_sagemaker_local_session(region): + return LocalSession(boto_session=boto3.Session(region_name=region)) + + +@pytest.fixture(name="aws_id", scope="session") +def fixture_aws_id(request): + return request.config.getoption("--aws-id") + + +@pytest.fixture(name="instance_type", scope="session") +def fixture_instance_type(request, processor): + provided_instance_type = request.config.getoption("--instance-type") + default_instance_type = "local" if processor == "cpu" else "local_gpu" + return provided_instance_type or default_instance_type + + +@pytest.fixture(name="accelerator_type", scope="session") +def fixture_accelerator_type(request): + return request.config.getoption("--accelerator-type") + + +@pytest.fixture(name="docker_registry", scope="session") +def fixture_docker_registry(aws_id, region): + return get_ecr_registry(aws_id, region) + + +@pytest.fixture(name="ecr_image", scope="session") +def fixture_ecr_image(docker_registry, docker_base_name, tag): + return "{}/{}:{}".format(docker_registry, docker_base_name, tag) + + +@pytest.fixture(autouse=True) +def skip_by_device_type(request, use_gpu, instance_type, accelerator_type): + is_gpu = use_gpu or instance_type[3] in ["g", "p"] + is_eia = accelerator_type is not None + is_neuron = instance_type.startswith("ml.inf1") + is_neuronx = instance_type.startswith("ml.inf2") or instance_type.startswith("ml.trn1") + + # Separate out cases for clearer logic. + # When running Neuron test, skip CPU and GPU test. + if request.node.get_closest_marker("neuron_test") and not is_neuron: + pytest.skip("Skipping because running on '{}' instance".format(instance_type)) + elif request.node.get_closest_marker("neuronx_test") and not is_neuronx: + pytest.skip("Skipping because running on '{}' instance".format(instance_type)) + + # When running GPU test, skip CPU and neuron test. When running CPU test, skip GPU and neuron test. + elif (request.node.get_closest_marker("gpu_test") and not is_gpu) or ( + request.node.get_closest_marker("cpu_test") and (is_gpu or is_neuron or is_neuronx) + ): + pytest.skip("Skipping because running on '{}' instance".format(instance_type)) + + # When running EIA test, skip the CPU, GPU and Neuron functions + elif ( + request.node.get_closest_marker("neuron_test") + or request.node.get_closest_marker("gpu_test") + or request.node.get_closest_marker("cpu_test") + ) and is_eia: + pytest.skip("Skipping because running on '{}' instance".format(instance_type)) + + # When running CPU or GPU or Neuron test, skip EIA test. 
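+    # (Tests declare their device expectations through markers; for example, the
+    #  SageMaker suite in this package marks GPU-only tests with @pytest.mark.gpu_test.)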
+ elif request.node.get_closest_marker("eia_test") and not is_eia: + pytest.skip("Skipping because running on '{}' instance".format(instance_type)) + + +@pytest.fixture(autouse=True) +def skip_by_py_version(request, py_version): + if request.node.get_closest_marker("skip_py2") and py_version != "py3": + pytest.skip("Skipping the test because Python 2 is not supported.") + + +@pytest.fixture(autouse=True) +def skip_gpu_instance_restricted_regions(region, instance_type): + if (region in NO_P4_REGIONS and instance_type.startswith("ml.p4")) or ( + region in NO_G5_REGIONS and instance_type.startswith("ml.g5") + ): + pytest.skip( + "Skipping GPU test in region {} with instance type {}".format(region, instance_type) + ) + + +@pytest.fixture(autouse=True) +def skip_gpu_py2(request, use_gpu, instance_type, py_version, framework_version): + is_gpu = use_gpu or instance_type[3] in ["g", "p"] + if ( + request.node.get_closest_marker("skip_gpu_py2") + and is_gpu + and py_version != "py3" + and framework_version == "1.4.0" + ): + pytest.skip("Skipping the test until mms issue resolved.") + + +def _get_remote_override_flags(): + try: + s3_client = boto3.client("s3") + sts_client = boto3.client("sts") + account_id = sts_client.get_caller_identity().get("Account") + result = s3_client.get_object( + Bucket=f"dlc-cicd-helper-{account_id}", Key="override_tests_flags.json" + ) + json_content = json.loads(result["Body"].read().decode("utf-8")) + except ClientError as e: + logger.warning("ClientError when performing S3/STS operation: {}".format(e)) + json_content = {} + return json_content + + +def _is_test_disabled(test_name, build_name, version): + """ + Expected format of remote_override_flags: + { + "CB Project Name for Test Type A": { + "CodeBuild Resolved Source Version": ["test_type_A_test_function_1", "test_type_A_test_function_2"] + }, + "CB Project Name for Test Type B": { + "CodeBuild Resolved Source Version": ["test_type_B_test_function_1", "test_type_B_test_function_2"] + } + } + + :param test_name: str Test Function node name (includes parametrized values in string) + :param build_name: str Build Project name of current execution + :param version: str Source Version of current execution + :return: bool True if test is disabled as per remote override, False otherwise + """ + remote_override_flags = _get_remote_override_flags() + remote_override_build = remote_override_flags.get(build_name, {}) + if version in remote_override_build: + return not remote_override_build[version] or any( + [test_keyword in test_name for test_keyword in remote_override_build[version]] + ) + return False + + +@pytest.fixture(autouse=True) +def disable_test(request): + test_name = request.node.name + # We do not have a regex pattern to find CB name, which means we must resort to string splitting + build_arn = os.getenv("CODEBUILD_BUILD_ARN") + build_name = build_arn.split("/")[-1].split(":")[0] if build_arn else None + version = os.getenv("CODEBUILD_RESOLVED_SOURCE_VERSION") + + if build_name and version and _is_test_disabled(test_name, build_name, version): + pytest.skip(f"Skipping {test_name} test because it has been disabled.") + + +@pytest.fixture(autouse=True) +def skip_test_successfully_executed_before(request): + """ + "cache/lastfailed" contains information about failed tests only. We're running SM tests in separate threads for each image. + So when we retry SM tests, successfully executed tests executed again because pytest doesn't have that info in /.cache. 
+ But the flag "--last-failed-no-failures all" requires pytest to execute all the available tests. + The only sign that a test passed last time - lastfailed file exists and the test name isn't in that file. + The method checks whether lastfailed file exists and the test name is not in it. + """ + test_name = request.node.name + lastfailed = request.config.cache.get("cache/lastfailed", None) + + # if lastfailed is not None and not any( + # test_name in failed_test_name for failed_test_name in lastfailed.keys() + # ): + # pytest.skip(f"Skipping {test_name} because it was successfully executed for this commit") diff --git a/test/sagemaker_tests/huggingface/llamacpp/integration/__init__.py b/test/sagemaker_tests/huggingface/llamacpp/integration/__init__.py new file mode 100644 index 000000000000..44e2b20386e5 --- /dev/null +++ b/test/sagemaker_tests/huggingface/llamacpp/integration/__init__.py @@ -0,0 +1,137 @@ +# Copyright 2019-2020 Amazon.com, Inc. or its affiliates. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"). You +# may not use this file except in compliance with the License. A copy of +# the License is located at +# +# http://aws.amazon.com/apache2.0/ +# +# or in the "license" file accompanying this file. This file is +# distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF +# ANY KIND, either express or implied. See the License for the specific +# language governing permissions and limitations under the License. +from __future__ import absolute_import + +import json +import os +import re +import shutil +import tarfile + +import boto3 + +# Path to test resources +resources_path = os.path.abspath(os.path.join(os.path.dirname(__file__), "..", "resources")) + +# Model artifacts for local mode tests - downloaded from HuggingFace Hub at runtime +MODEL_ID = "unsloth/Qwen3.5-0.8B-GGUF" +MODEL_FILENAME = "Qwen3.5-0.8B-UD-IQ2_XXS.gguf" +model_dir = os.path.join(resources_path, "qwen3.5-0.8b") +model_data = "qwen3.5-0.8b.tar.gz" +model_data_path = os.path.join(model_dir, model_data) + + +def _tar_contains_expected_model(tar_path): + if not os.path.exists(tar_path): + return False + try: + with tarfile.open(tar_path, "r:gz") as tar: + return any( + os.path.basename(member.name) == MODEL_FILENAME + for member in tar.getmembers() + if member.isfile() + ) + except tarfile.TarError: + return False + + +def ensure_model_downloaded(): + """Download model from HuggingFace Hub and create tarball if not already present.""" + if _tar_contains_expected_model(model_data_path): + return model_data_path + + from huggingface_hub import hf_hub_download + + os.makedirs(model_dir, exist_ok=True) + local_model_dir = os.path.join(model_dir, "model") + if os.path.exists(local_model_dir): + shutil.rmtree(local_model_dir) + os.makedirs(local_model_dir, exist_ok=True) + + print(f"Downloading {MODEL_FILENAME} from {MODEL_ID} on HuggingFace Hub...") + hf_hub_download( + repo_id=MODEL_ID, + filename=MODEL_FILENAME, + local_dir=local_model_dir, + ) + + # Remove cache folder if present + cache_dir = os.path.join(local_model_dir, ".cache") + if os.path.exists(cache_dir): + shutil.rmtree(cache_dir) + + print(f"Creating tarball {model_data}...") + with tarfile.open(model_data_path, "w:gz") as tar: + for item in os.listdir(local_model_dir): + tar.add(os.path.join(local_model_dir, item), arcname=item) + + # Clean up extracted model + shutil.rmtree(local_model_dir) + + print(f"Model ready at {model_data_path}") + return model_data_path + + +# Role for local mode (not used 
but required by SageMaker SDK) +ROLE = "dummy/unused-role" +DEFAULT_TIMEOUT = 45 + +# Llama.cpp SageMaker images listen on port 8080 with a custom llama-server build +# that serves SageMaker-compatible /ping and /invocations routes directly. + + +class NoLogStreamFoundError(Exception): + pass + + +class SageMakerEndpointFailure(Exception): + pass + + +def dump_logs_from_cloudwatch(e, region="us-west-2"): + """ + Function to dump logs from cloudwatch during error handling. + Gracefully handles missing log groups/streams. + """ + error_hosting_endpoint_regex = re.compile(r"Error hosting endpoint ((\w|-)+):") + endpoint_url_regex = re.compile(r"/aws/sagemaker/Endpoints/((\w|-)+)") + endpoint_match = error_hosting_endpoint_regex.search(str(e)) or endpoint_url_regex.search( + str(e) + ) + if endpoint_match: + logs_client = boto3.client("logs", region_name=region) + endpoint = endpoint_match.group(1) + log_group_name = f"/aws/sagemaker/Endpoints/{endpoint}" + try: + log_stream_resp = logs_client.describe_log_streams(logGroupName=log_group_name) + all_traffic_log_stream = "" + for log_stream in log_stream_resp.get("logStreams", []): + log_stream_name = log_stream.get("logStreamName") + if log_stream_name.startswith("AllTraffic"): + all_traffic_log_stream = log_stream_name + break + if not all_traffic_log_stream: + raise NoLogStreamFoundError( + f"Cannot find all traffic log streams for endpoint {endpoint}" + ) from e + events = logs_client.get_log_events( + logGroupName=log_group_name, logStreamName=all_traffic_log_stream + ) + raise SageMakerEndpointFailure( + f"Error from endpoint {endpoint}:\n{json.dumps(events, indent=4)}" + ) from e + except logs_client.exceptions.ResourceNotFoundException: + # Log group doesn't exist yet - endpoint may have failed before creating logs + raise SageMakerEndpointFailure( + f"Endpoint {endpoint} failed. No CloudWatch logs available yet." + ) from e diff --git a/test/sagemaker_tests/huggingface/llamacpp/integration/local/__init__.py b/test/sagemaker_tests/huggingface/llamacpp/integration/local/__init__.py new file mode 100644 index 000000000000..199e66b95926 --- /dev/null +++ b/test/sagemaker_tests/huggingface/llamacpp/integration/local/__init__.py @@ -0,0 +1,13 @@ +# Copyright 2019-2020 Amazon.com, Inc. or its affiliates. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"). You +# may not use this file except in compliance with the License. A copy of +# the License is located at +# +# http://aws.amazon.com/apache2.0/ +# +# or in the "license" file accompanying this file. This file is +# distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF +# ANY KIND, either express or implied. See the License for the specific +# language governing permissions and limitations under the License. +from __future__ import absolute_import diff --git a/test/sagemaker_tests/huggingface/llamacpp/integration/local/test_serving.py b/test/sagemaker_tests/huggingface/llamacpp/integration/local/test_serving.py new file mode 100644 index 000000000000..f4807f5c4cf0 --- /dev/null +++ b/test/sagemaker_tests/huggingface/llamacpp/integration/local/test_serving.py @@ -0,0 +1,109 @@ +# Copyright 2019-2020 Amazon.com, Inc. or its affiliates. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"). You +# may not use this file except in compliance with the License. A copy of +# the License is located at +# +# http://aws.amazon.com/apache2.0/ +# +# or in the "license" file accompanying this file. 
This file is +# distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF +# ANY KIND, either express or implied. See the License for the specific +# language governing permissions and limitations under the License. +from __future__ import absolute_import + +from contextlib import contextmanager + +import pytest +import requests +from sagemaker.model import Model +from sagemaker.predictor import Predictor +from sagemaker.serializers import JSONSerializer +from sagemaker.deserializers import JSONDeserializer + +from ...integration import ROLE, ensure_model_downloaded +from ...utils import local_mode_utils + + +@contextmanager +def _predictor(image, sagemaker_local_session, instance_type): + """Context manager for Llama.cpp model deployment and cleanup. + + Model is extracted to /opt/ml/model by SageMaker from model_data tar.gz. + The container entrypoint runs a custom llama-server build with + SageMaker-compatible /ping and /invocations routes on port 8080. + """ + # Download model from HuggingFace Hub if not already present + model_data_path = ensure_model_downloaded() + + env = { + "SM_LLAMACPP_MODEL": "/opt/ml/model/Qwen3.5-0.8B-UD-IQ2_XXS.gguf", + } + + model = Model( + model_data=f"file://{model_data_path}", + role=ROLE, + image_uri=image, + env=env, + sagemaker_session=sagemaker_local_session, + predictor_cls=Predictor, + ) + with local_mode_utils.lock(): + predictor = None + try: + predictor = model.deploy(1, instance_type) + yield predictor + finally: + if predictor is not None: + predictor.delete_endpoint() + + +def _assert_sagemaker_ping_local(): + """SageMaker contract: GET /ping on the container HTTP port (local mode: 8080).""" + response = requests.get("http://127.0.0.1:8080/ping", timeout=60) + assert response.status_code == 200 + + +def _assert_llamacpp_chat_prediction(predictor): + """Test Llama.cpp inference using OpenAI-compatible chat completions API.""" + predictor.serializer = JSONSerializer() + predictor.deserializer = JSONDeserializer() + + data = { + "messages": [{"role": "user", "content": "What is Deep Learning?"}], + "max_tokens": 50, + "temperature": 0.7, + } + output = predictor.predict(data) + + assert output is not None + assert "choices" in output + + +def _assert_llamacpp_chat_prediction_explicit_route(predictor): + """Same as chat test but forces target path via SageMaker CustomAttributes route=.""" + predictor.serializer = JSONSerializer() + predictor.deserializer = JSONDeserializer() + + data = { + "messages": [{"role": "user", "content": "Say hello in one word."}], + "max_tokens": 16, + "temperature": 0.3, + } + output = predictor.predict( + data, + custom_attributes="route=/v1/chat/completions", + ) + + assert output is not None + assert "choices" in output + + +@pytest.mark.model("qwen3.5-0.8b") +@pytest.mark.team("sagemaker-1p-algorithms") +def test_llamacpp_local_chat(docker_image, sagemaker_local_session, instance_type): + """Test Llama.cpp local deployment: /ping shim, /invocations chat, and explicit route=.""" + with _predictor(docker_image, sagemaker_local_session, instance_type) as predictor: + _assert_sagemaker_ping_local() + _assert_llamacpp_chat_prediction(predictor) + _assert_llamacpp_chat_prediction_explicit_route(predictor) diff --git a/test/sagemaker_tests/huggingface/llamacpp/integration/sagemaker/__init__.py b/test/sagemaker_tests/huggingface/llamacpp/integration/sagemaker/__init__.py new file mode 100644 index 000000000000..04fbf5d9a144 --- /dev/null +++ 
b/test/sagemaker_tests/huggingface/llamacpp/integration/sagemaker/__init__.py @@ -0,0 +1,12 @@ +# Copyright 2019-2020 Amazon.com, Inc. or its affiliates. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"). You +# may not use this file except in compliance with the License. A copy of +# the License is located at +# +# http://aws.amazon.com/apache2.0/ +# +# or in the "license" file accompanying this file. This file is +# distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF +# ANY KIND, either express or implied. See the License for the specific +# language governing permissions and limitations under the License. diff --git a/test/sagemaker_tests/huggingface/llamacpp/integration/sagemaker/test_llamacpp.py b/test/sagemaker_tests/huggingface/llamacpp/integration/sagemaker/test_llamacpp.py new file mode 100644 index 000000000000..370ae0f51e1b --- /dev/null +++ b/test/sagemaker_tests/huggingface/llamacpp/integration/sagemaker/test_llamacpp.py @@ -0,0 +1,116 @@ +# Copyright 2019-2020 Amazon.com, Inc. or its affiliates. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"). You +# may not use this file except in compliance with the License. A copy of +# the License is located at +# +# http://aws.amazon.com/apache2.0/ +# +# or in the "license" file accompanying this file. This file is +# distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF +# ANY KIND, either express or implied. See the License for the specific +# language governing permissions and limitations under the License. +from __future__ import absolute_import + +import json +import logging + +import pytest +import sagemaker +from sagemaker.model import Model +from sagemaker.predictor import Predictor +from sagemaker.serializers import JSONSerializer +from sagemaker.deserializers import JSONDeserializer + +from ...integration import dump_logs_from_cloudwatch +from ...integration.sagemaker.timeout import timeout_and_delete_endpoint +from ..... import invoke_sm_endpoint_helper_function + +LOGGER = logging.getLogger(__name__) + + +@pytest.mark.model("qwen3.5-0.8b") +@pytest.mark.processor("gpu") +@pytest.mark.gpu_test +@pytest.mark.team("sagemaker-1p-algorithms") +def test_llamacpp_qwen(framework_version, ecr_image, instance_type, sagemaker_regions): + invoke_sm_endpoint_helper_function( + ecr_image=ecr_image, + sagemaker_regions=sagemaker_regions, + test_function=_test_llamacpp_model, + dump_logs_from_cloudwatch=dump_logs_from_cloudwatch, + framework_version=framework_version, + instance_type=instance_type, + model_id="unsloth/Qwen3.5-0.8B-GGUF", + ) + + +def _test_llamacpp_model( + image_uri, + sagemaker_session, + instance_type, + model_id, + framework_version=None, + **kwargs, +): + """Test Llama.cpp model deployment and inference using OpenAI-compatible API format + + Uses sagemaker.model.Model for SDK v3 compatibility instead of HuggingFaceModel. + + Args: + image_uri: ECR image URI + sagemaker_session: SageMaker session + instance_type: ML instance type + model_id: HuggingFace model ID + framework_version: Optional version info + **kwargs: Additional args from helper (boto_session, sagemaker_client, etc.) 
+ """ + endpoint_name = sagemaker.utils.unique_name_from_base("sagemaker-hf-llamacpp-serving") + + env = { + "SM_LLAMACPP_HF_REPO": model_id, + } + + model = Model( + name=endpoint_name, + image_uri=image_uri, + role="SageMakerRole", + env=env, + sagemaker_session=sagemaker_session, + predictor_cls=Predictor, + ) + + with timeout_and_delete_endpoint(endpoint_name, sagemaker_session, minutes=45): + predictor = model.deploy( + initial_instance_count=1, + instance_type=instance_type, + endpoint_name=endpoint_name, + container_startup_health_check_timeout=1800, + inference_ami_version="al2-ami-sagemaker-inference-gpu-3-1", + ) + + predictor.serializer = JSONSerializer() + predictor.deserializer = JSONDeserializer() + + # Llama.cpp SageMaker uses OpenAI-compatible chat completions API format + data = { + "messages": [{"role": "user", "content": "What is Deep Learning?"}], + "max_tokens": 50, + "temperature": 0.7, + } + + LOGGER.info(f"Running inference with data: {data}") + output = predictor.predict(data) + LOGGER.info(f"Output: {json.dumps(output)}") + + assert output is not None + assert "choices" in output + + # Explicit route= uses SageMaker CustomAttributes routing in the custom llama-server build. + output_routed = predictor.predict( + data, + custom_attributes="route=/v1/chat/completions", + ) + LOGGER.info(f"Output (routed): {json.dumps(output_routed)}") + assert output_routed is not None + assert "choices" in output_routed diff --git a/test/sagemaker_tests/huggingface/llamacpp/integration/sagemaker/timeout.py b/test/sagemaker_tests/huggingface/llamacpp/integration/sagemaker/timeout.py new file mode 100644 index 000000000000..1d13878031f7 --- /dev/null +++ b/test/sagemaker_tests/huggingface/llamacpp/integration/sagemaker/timeout.py @@ -0,0 +1,66 @@ +# Copyright 2019-2020 Amazon.com, Inc. or its affiliates. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"). You +# may not use this file except in compliance with the License. A copy of +# the License is located at +# +# http://aws.amazon.com/apache2.0/ +# +# or in the "license" file accompanying this file. This file is +# distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF +# ANY KIND, either express or implied. See the License for the specific +# language governing permissions and limitations under the License. +from __future__ import absolute_import +import signal +from contextlib import contextmanager +import logging + +from botocore.exceptions import ClientError + +LOGGER = logging.getLogger("timeout") + + +class TimeoutError(Exception): + pass + + +@contextmanager +def timeout(seconds=0, minutes=0, hours=0): + """Add a signal-based timeout to any block of code. + If multiple time units are specified, they will be added together to determine time limit. + Usage: + with timeout(seconds=5): + my_slow_function(...) + Args: + - seconds: The time limit, in seconds. + - minutes: The time limit, in minutes. + - hours: The time limit, in hours. 
+ """ + + limit = seconds + 60 * minutes + 3600 * hours + + def handler(signum, frame): + raise TimeoutError("timed out after {} seconds".format(limit)) + + try: + signal.signal(signal.SIGALRM, handler) + signal.alarm(limit) + + yield + finally: + signal.alarm(0) + + +@contextmanager +def timeout_and_delete_endpoint(endpoint_name, sagemaker_session, seconds=0, minutes=0, hours=0): + with timeout(seconds=seconds, minutes=minutes, hours=hours) as t: + try: + yield [t] + finally: + try: + sagemaker_session.delete_endpoint(endpoint_name) + LOGGER.info("deleted endpoint {}".format(endpoint_name)) + except ClientError as ce: + if ce.response["Error"]["Code"] == "ValidationException": + # avoids the inner exception to be overwritten + pass diff --git a/test/sagemaker_tests/huggingface/llamacpp/requirements.txt b/test/sagemaker_tests/huggingface/llamacpp/requirements.txt new file mode 100644 index 000000000000..890bbe499718 --- /dev/null +++ b/test/sagemaker_tests/huggingface/llamacpp/requirements.txt @@ -0,0 +1,29 @@ +boto3 +coverage +# Docker v7.0.0 breaks compatibility with Docker Compose v1 (SageMaker Local) +docker>=5,<=6.1.3 +flake8==3.7.7 +Flask==1.1.1 +mock +pytest==8.3.5 +pytest-cov +pytest-rerunfailures +pytest-xdist +PyYAML +protobuf>=3.20,<=3.20.2 +sagemaker>=2.237.0,<3 +six +requests<2.32.0 +requests_mock +Pillow +retrying==1.3.3 +urllib3>=1.26.8 +pluggy>=1.5,<2 +requests_mock +sagemaker-inference +tenacity +fabric +invoke +gitpython +toml +huggingface_hub diff --git a/test/sagemaker_tests/huggingface/llamacpp/utils/__init__.py b/test/sagemaker_tests/huggingface/llamacpp/utils/__init__.py new file mode 100644 index 000000000000..6932ed1abd5b --- /dev/null +++ b/test/sagemaker_tests/huggingface/llamacpp/utils/__init__.py @@ -0,0 +1,36 @@ +# Copyright 2019-2020 Amazon.com, Inc. or its affiliates. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"). You +# may not use this file except in compliance with the License. A copy of +# the License is located at +# +# http://aws.amazon.com/apache2.0/ +# +# or in the "license" file accompanying this file. This file is +# distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF +# ANY KIND, either express or implied. See the License for the specific +# language governing permissions and limitations under the License. +from __future__ import absolute_import + +import boto3 +import botocore + + +def _botocore_resolver(): + """ + Get the DNS suffix for the given region. + :return: endpoint object + """ + loader = botocore.loaders.create_loader() + return botocore.regions.EndpointResolver(loader.load_data("endpoints")) + + +def get_ecr_registry(account, region): + """ + Get prefix of ECR image URI + :param account: Account ID + :param region: region where ECR repo exists + :return: AWS ECR registry + """ + endpoint_data = _botocore_resolver().construct_endpoint("ecr", region) + return "{}.dkr.{}".format(account, endpoint_data["hostname"]) diff --git a/test/sagemaker_tests/huggingface/llamacpp/utils/image_utils.py b/test/sagemaker_tests/huggingface/llamacpp/utils/image_utils.py new file mode 100644 index 000000000000..3421e6ce2b42 --- /dev/null +++ b/test/sagemaker_tests/huggingface/llamacpp/utils/image_utils.py @@ -0,0 +1,67 @@ +# Copyright 2019-2020 Amazon.com, Inc. or its affiliates. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"). You +# may not use this file except in compliance with the License. 
A copy of +# the License is located at +# +# http://aws.amazon.com/apache2.0/ +# +# or in the "license" file accompanying this file. This file is +# distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF +# ANY KIND, either express or implied. See the License for the specific +# language governing permissions and limitations under the License. +from __future__ import absolute_import + +import os +import subprocess +import sys + +CYAN_COLOR = "\033[36m" +END_COLOR = "\033[0m" + + +def build_base_image( + framework_name, framework_version, py_version, processor, base_image_tag, cwd="." +): + base_image_uri = get_base_image_uri(framework_name, base_image_tag) + + dockerfile_location = os.path.join( + "docker", framework_version, "base", "Dockerfile.{}".format(processor) + ) + + subprocess.check_call( + [ + "docker", + "build", + "-t", + base_image_uri, + "-f", + dockerfile_location, + "--build-arg", + "py_version={}".format(py_version[-1]), + cwd, + ], + cwd=cwd, + ) + print("created image {}".format(base_image_uri)) + return base_image_uri + + +def get_base_image_uri(framework_name, base_image_tag): + return "{}-base:{}".format(framework_name, base_image_tag) + + +def get_image_uri(framework_name, tag): + return "{}:{}".format(framework_name, tag) + + +def _check_call(cmd, *popenargs, **kwargs): + if isinstance(cmd, str): + cmd = cmd.split(" ") + _print_cmd(cmd) + subprocess.check_call(cmd, *popenargs, **kwargs) + + +def _print_cmd(cmd): + print("executing docker command: {}{}{}".format(CYAN_COLOR, " ".join(cmd), END_COLOR)) + sys.stdout.flush() diff --git a/test/sagemaker_tests/huggingface/llamacpp/utils/local_mode_utils.py b/test/sagemaker_tests/huggingface/llamacpp/utils/local_mode_utils.py new file mode 100644 index 000000000000..fa6b3cf00c36 --- /dev/null +++ b/test/sagemaker_tests/huggingface/llamacpp/utils/local_mode_utils.py @@ -0,0 +1,46 @@ +# Copyright 2019-2020 Amazon.com, Inc. or its affiliates. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"). You +# may not use this file except in compliance with the License. A copy of +# the License is located at +# +# http://aws.amazon.com/apache2.0/ +# +# or in the "license" file accompanying this file. This file is +# distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF +# ANY KIND, either express or implied. See the License for the specific +# language governing permissions and limitations under the License. +from __future__ import absolute_import + +from contextlib import contextmanager +import fcntl +import os +import tarfile +import time + +from ..integration import resources_path + +LOCK_PATH = os.path.join(resources_path, "local_mode_lock") + + +@contextmanager +def lock(): + # Since Local Mode uses the same port for serving, we need a lock in order + # to allow concurrent test execution. 
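+    # fcntl.lockf takes an exclusive advisory lock (LOCK_EX) on the shared lock
+    # file, so parallel pytest workers serialize their local-mode deployments
+    # instead of racing for the container port; it is released in the finally block.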
+ local_mode_lock_fd = open(LOCK_PATH, "w") + local_mode_lock = local_mode_lock_fd.fileno() + + fcntl.lockf(local_mode_lock, fcntl.LOCK_EX) + + try: + yield + finally: + time.sleep(5) + fcntl.lockf(local_mode_lock, fcntl.LOCK_UN) + + +def assert_files_exist(output_path, directory_file_map): + for directory, files in directory_file_map.items(): + with tarfile.open(os.path.join(output_path, "{}.tar.gz".format(directory))) as tar: + for f in files: + tar.getmember(f) diff --git a/test/test_utils/__init__.py b/test/test_utils/__init__.py index d593deea76e7..571b9fb26ed3 100644 --- a/test/test_utils/__init__.py +++ b/test/test_utils/__init__.py @@ -1822,6 +1822,7 @@ def get_framework_and_version_from_tag(image_uri): "huggingface_pytorch", "huggingface_vllm", "huggingface_sglang", + "huggingface_llamacpp", "stabilityai_pytorch", "pytorch_trcomp", "tensorflow", @@ -1939,6 +1940,7 @@ def get_framework_from_image_uri(image_uri): "huggingface-pytorch": "huggingface_pytorch", "huggingface-vllm": "huggingface_vllm", "huggingface-sglang": "huggingface_sglang", + "huggingface-llamacpp": "huggingface_llamacpp", "stabilityai-pytorch": "stabilityai_pytorch", "mxnet": "mxnet", "pytorch": "pytorch", @@ -2080,6 +2082,7 @@ def get_job_type_from_image(image_uri): "base": "general", "vllm": "general", "sglang": "general", + "llamacpp": "general", } for key, job_type in job_type_mapping.items(): diff --git a/test/test_utils/sagemaker.py b/test/test_utils/sagemaker.py index 0ab4d69e4829..24f256f66253 100644 --- a/test/test_utils/sagemaker.py +++ b/test/test_utils/sagemaker.py @@ -164,6 +164,8 @@ def generate_sagemaker_pytest_cmd(image, sagemaker_test_type): path = os.path.join("test", "sagemaker_tests", "huggingface", "vllm") elif framework == "huggingface_sglang": path = os.path.join("test", "sagemaker_tests", "huggingface", "sglang") + elif framework == "huggingface_llamacpp": + path = os.path.join("test", "sagemaker_tests", "huggingface", "llamacpp") else: path = os.path.join("test", "sagemaker_tests", framework, job_type) aws_id_arg = "--aws-id" @@ -286,6 +288,8 @@ def generate_sagemaker_pytest_cmd(image, sagemaker_test_type): path = os.path.join("test", "sagemaker_tests", "huggingface", "vllm") elif "huggingface" in framework and "sglang" in framework: path = os.path.join("test", "sagemaker_tests", "huggingface", "sglang") + elif "huggingface" in framework and "llamacpp" in framework: + path = os.path.join("test", "sagemaker_tests", "huggingface", "llamacpp") elif "huggingface" in framework and job_type == "inference": path = os.path.join("test", "sagemaker_tests", "huggingface", "inference") if "trcomp" in framework: diff --git a/test/testrunner.py b/test/testrunner.py index 2d7deb2cfe24..9773d11e1604 100644 --- a/test/testrunner.py +++ b/test/testrunner.py @@ -629,6 +629,15 @@ def main(): sm_utils.generate_empty_report(report, test_type, "sglang") return + # Skip base llamacpp (not huggingface_llamacpp) - huggingface_llamacpp has local tests + if "llamacpp" in dlc_images and "huggingface" not in dlc_images: + LOGGER.info( + f"Skipping - there are no local mode tests for base Llamacpp. Images: {dlc_images}" + ) + report = os.path.join(os.getcwd(), "test", f"{test_type}.xml") + sm_utils.generate_empty_report(report, test_type, "llamacpp") + return + testing_image_list = [ image for image in standard_images_list
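For reference (not part of the diff): a minimal client-side sketch of how the CustomAttributes routing added in the llama-server patch can be exercised against a deployed endpoint. The endpoint name and payload below are placeholders; when no route= attribute is supplied, the patched server infers a default route from the request body (message-style bodies go to /v1/chat/completions).

```python
import json

import boto3

# Placeholder endpoint name; any endpoint backed by this image behaves the same way.
ENDPOINT_NAME = "my-llamacpp-endpoint"

runtime = boto3.client("sagemaker-runtime")

payload = {
    "messages": [{"role": "user", "content": "What is Deep Learning?"}],
    "max_tokens": 50,
}

# SageMaker forwards CustomAttributes as the X-Amzn-SageMaker-Custom-Attributes
# header, which the patched server scans for an explicit "route=" target.
response = runtime.invoke_endpoint(
    EndpointName=ENDPOINT_NAME,
    ContentType="application/json",
    Body=json.dumps(payload),
    CustomAttributes="route=/v1/chat/completions",
)

result = json.loads(response["Body"].read())
print(result["choices"][0]["message"]["content"])
```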