diff --git a/.github/workflows/build.yaml b/.github/workflows/build.yaml index c43d8eb9..2e3fe7ef 100644 --- a/.github/workflows/build.yaml +++ b/.github/workflows/build.yaml @@ -31,15 +31,28 @@ jobs: group: ${{ github.workflow }}-build-and-push-image-${{ inputs.hardware }}-${{ github.head_ref || github.run_id }} cancel-in-progress: true runs-on: - group: aws-highmemory-32-plus-priv + group: aws-highmemory-64-plus-priv permissions: contents: write packages: write + id-token: write steps: - name: Checkout repository uses: actions/checkout@v4 - name: Inject slug/short variables uses: rlespinasse/github-slug-action@v4.4.1 + - name: Extract TensorRT-LLM version + run: | + echo "TENSORRT_LLM_VERSION=$(grep -oP '([a-z,0-9]{40})' $GITHUB_WORKSPACE/backends/trtllm/cmake/trtllm.cmake)" >> $GITHUB_ENV + echo "TensorRT-LLM version: ${{ env.TENSORRT_LLM_VERSION }}" + - name: "Configure AWS Credentials" + id: aws-creds + uses: aws-actions/configure-aws-credentials@v4 + with: + aws-region: us-east-1 + role-to-assume: ${{ secrets.AWS_ROLE_GITHUB_TGI_TEST }} + role-duration-seconds: 7200 + output-credentials: true - name: Construct harware variables shell: bash run: | @@ -52,6 +65,7 @@ jobs: export runs_on="aws-g6-12xl-plus-priv-cache" export platform="" export extra_pytest="" + export target="nil" ;; cuda-trtllm) export dockerfile="Dockerfile_trtllm" @@ -61,6 +75,10 @@ jobs: export runs_on="ubuntu-latest" export platform="" export extra_pytest="" + export target="ci-runtime" + export sccache_s3_key_prefix="trtllm" + export sccache_region="us-east-1" + export build_type="dev" ;; rocm) export dockerfile="Dockerfile_amd" @@ -71,6 +89,7 @@ jobs: export runs_on="ubuntu-latest" export platform="" export extra_pytest="-k test_flash_gemma_gptq_load" + export target="nil" ;; intel-xpu) export dockerfile="Dockerfile_intel" @@ -80,6 +99,7 @@ jobs: export runs_on="ubuntu-latest" export platform="xpu" export extra_pytest="" + export target="nil" ;; intel-cpu) export dockerfile="Dockerfile_intel" @@ -90,6 +110,7 @@ jobs: export runs_on="aws-highmemory-32-plus-priv" export platform="cpu" export extra_pytest="-k test_flash_gemma_simple" + export target="nil" ;; esac echo $dockerfile @@ -106,6 +127,10 @@ jobs: echo "RUNS_ON=${runs_on}" >> $GITHUB_ENV echo "EXTRA_PYTEST=${extra_pytest}" >> $GITHUB_ENV echo REGISTRY_MIRROR=$REGISTRY_MIRROR >> $GITHUB_ENV + echo "TARGET=${target}" >> $GITHUB_ENV + echo "SCCACHE_S3_KEY_PREFIX=${sccache_s3_key_prefix}" >> $GITHUB_ENV + echo "SCCACHE_REGION=${sccache_region}" >> $GITHUB_ENV + echo "BUILD_TYPE=${build_type}" >> $GITHUB_ENV - name: Initialize Docker Buildx uses: docker/setup-buildx-action@v3 with: @@ -170,6 +195,14 @@ jobs: GIT_SHA=${{ env.GITHUB_SHA }} DOCKER_LABEL=sha-${{ env.GITHUB_SHA_SHORT }}${{ env.LABEL }} PLATFORM=${{ env.PLATFORM }} + build_type=${{ env.BUILD_TYPE }} + is_gha_build=true + aws_access_key_id=${{ steps.aws-creds.outputs.aws-access-key-id }} + aws_secret_access_key=${{ steps.aws-creds.outputs.aws-secret-access-key }} + aws_session_token=${{ steps.aws-creds.outputs.aws-session-token }} + sccache_bucket=${{ secrets.AWS_S3_BUCKET_GITHUB_TGI_TEST }} + sccache_s3_key_prefix=${{ env.SCCACHE_S3_KEY_PREFIX }} + sccache_region=${{ env.SCCACHE_REGION }} tags: ${{ steps.meta.outputs.tags || steps.meta-pr.outputs.tags }} labels: ${{ steps.meta.outputs.labels || steps.meta-pr.outputs.labels }} cache-from: type=s3,region=us-east-1,bucket=ci-docker-buildx-cache,name=text-generation-inference-cache${{ env.LABEL }},mode=min,access_key_id=${{ 
secrets.S3_CI_DOCKER_BUILDX_CACHE_ACCESS_KEY_ID }},secret_access_key=${{ secrets.S3_CI_DOCKER_BUILDX_CACHE_SECRET_ACCESS_KEY }},mode=min @@ -215,3 +248,22 @@ jobs: echo $DOCKER_IMAGE docker pull $DOCKER_IMAGE pytest -s -vv integration-tests ${PYTEST_FLAGS} ${EXTRA_PYTEST} + + backend_trtllm_cxx_tests: + needs: build-and-push + if: needs.build-and-push.outputs.label == '-trtllm' + concurrency: + group: ${{ github.workflow }}-${{ github.job }}-trtllm-${{ github.head_ref || github.run_id }} + cancel-in-progress: true + runs-on: + group: aws-g6-12xl-plus-priv-cache + container: + image: ${{ needs.build-and-push.outputs.docker_image }} + credentials: + username: ${{ secrets.REGISTRY_USERNAME }} + password: ${{ secrets.REGISTRY_PASSWORD }} + options: --gpus all --shm-size=8g + + steps: + - name: Run C++/CUDA tests + run: /usr/local/tgi/bin/tgi_trtllm_backend_tests diff --git a/.github/workflows/ci_build.yaml b/.github/workflows/ci_build.yaml index 0d87cb29..d8746b65 100644 --- a/.github/workflows/ci_build.yaml +++ b/.github/workflows/ci_build.yaml @@ -42,6 +42,7 @@ jobs: permissions: contents: write packages: write + id-token: write with: hardware: ${{ matrix.hardware }} # https://github.com/actions/runner/issues/2206 diff --git a/Cargo.toml b/Cargo.toml index d8155153..9f49c9ab 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -1,21 +1,21 @@ [workspace] members = [ - "benchmark", - "backends/v2", - "backends/v3", - "backends/grpc-metadata", - "backends/trtllm", - "launcher", - "router" + "benchmark", + "backends/v2", + "backends/v3", + "backends/grpc-metadata", + "backends/trtllm", + "launcher", + "router" ] default-members = [ - "benchmark", - "backends/v2", - "backends/v3", - "backends/grpc-metadata", - # "backends/trtllm", - "launcher", - "router" + "benchmark", + "backends/v2", + "backends/v3", + "backends/grpc-metadata", + # "backends/trtllm", + "launcher", + "router" ] resolver = "2" diff --git a/Dockerfile_trtllm b/Dockerfile_trtllm index ecefc048..40972764 100644 --- a/Dockerfile_trtllm +++ b/Dockerfile_trtllm @@ -1,19 +1,7 @@ -ARG CUDA_ARCH_LIST="75-real;80-real;86-real;89-real;90-real" -ARG OMPI_VERSION="4.1.7rc1" - -# Build dependencies resolver stage -FROM lukemathwalker/cargo-chef:latest-rust-1.84.0 AS chef -WORKDIR /usr/src/text-generation-inference/backends/trtllm - -FROM chef AS planner -COPY Cargo.lock Cargo.lock -COPY Cargo.toml Cargo.toml -COPY rust-toolchain.toml rust-toolchain.toml -COPY router router -COPY benchmark/ benchmark/ -COPY backends/ backends/ -COPY launcher/ launcher/ -RUN cargo chef prepare --recipe-path recipe.json +ARG cuda_arch_list="75-real;80-real;86-real;89-real;90-real" +ARG ompi_version="4.1.7rc1" +ARG build_type=release +ARG is_gha_build=false # CUDA dependent dependencies resolver stage FROM nvidia/cuda:12.6.3-cudnn-devel-ubuntu24.04 AS cuda-builder @@ -26,8 +14,11 @@ RUN apt-get update && DEBIAN_FRONTEND=noninteractive apt-get install -y \ g++-14 \ git \ git-lfs \ + lld \ libssl-dev \ libucx-dev \ + libasan8 \ + libubsan1 \ ninja-build \ pkg-config \ pipx \ @@ -43,9 +34,9 @@ ENV TENSORRT_INSTALL_PREFIX=/usr/local/tensorrt # Install OpenMPI FROM cuda-builder AS mpi-builder -ARG OMPI_VERSION +ARG ompi_version -ENV OMPI_TARBALL_FILENAME="openmpi-$OMPI_VERSION.tar.bz2" +ENV OMPI_TARBALL_FILENAME="openmpi-$ompi_version.tar.bz2" RUN wget "https://download.open-mpi.org/release/open-mpi/v4.1/$OMPI_TARBALL_FILENAME" -P /opt/src && \ mkdir /usr/src/mpi && \ tar -xf "/opt/src/$OMPI_TARBALL_FILENAME" -C /usr/src/mpi --strip-components=1 && \ @@ -65,34 +56,56 @@ RUN 
chmod +x /opt/install_tensorrt.sh && \ FROM cuda-builder AS tgi-builder WORKDIR /usr/src/text-generation-inference +# Scoped global args reuse +ARG is_gha_build +ARG build_type + # Install Rust +ENV PATH="/root/.cargo/bin:$PATH" RUN curl --proto '=https' --tlsv1.2 -sSf https://sh.rustup.rs | bash -s -- -y && \ chmod -R a+w /root/.rustup && \ - chmod -R a+w /root/.cargo + chmod -R a+w /root/.cargo && \ + cargo install sccache --locked -ENV PATH="/root/.cargo/bin:$PATH" -RUN cargo install cargo-chef +# SCCACHE Specifics args - before finding a better, more generic, way... +ARG aws_access_key_id +ARG aws_secret_access_key +ARG aws_session_token +ARG sccache_bucket +ARG sccache_s3_key_prefix +ARG sccache_region -# Cache dependencies -COPY --from=planner /usr/src/text-generation-inference/backends/trtllm/recipe.json . -RUN cargo chef cook --release --recipe-path recipe.json +ENV AWS_ACCESS_KEY_ID=$aws_access_key_id +ENV AWS_SECRET_ACCESS_KEY=$aws_secret_access_key +ENV AWS_SESSION_TOKEN=$aws_session_token +ENV SCCACHE_BUCKET=$sccache_bucket +ENV SCCACHE_S3_KEY_PREFIX=$sccache_s3_key_prefix +ENV SCCACHE_REGION=$sccache_region -# Build actual TGI -ARG CUDA_ARCH_LIST -ENV CMAKE_PREFIX_PATH="/usr/local/mpi:/usr/local/tensorrt:$CMAKE_PREFIX_PATH" ENV LD_LIBRARY_PATH="/usr/local/mpi/lib:$LD_LIBRARY_PATH" ENV PKG_CONFIG_PATH="/usr/local/mpi/lib/pkgconfig:$PKG_CONFIG_PATH" +ENV CMAKE_PREFIX_PATH="/usr/local/mpi:/usr/local/tensorrt:$CMAKE_PREFIX_PATH" + +ENV USE_LLD_LINKER=ON +ENV CUDA_ARCH_LIST=${cuda_arch_list} +ENV IS_GHA_BUILD=${is_gha_build} COPY Cargo.lock Cargo.lock COPY Cargo.toml Cargo.toml COPY rust-toolchain.toml rust-toolchain.toml COPY router router -COPY backends/trtllm backends/trtllm +COPY backends backends +COPY benchmark benchmark +COPY launcher launcher COPY --from=trt-builder /usr/local/tensorrt /usr/local/tensorrt COPY --from=mpi-builder /usr/local/mpi /usr/local/mpi + RUN mkdir $TGI_INSTALL_PREFIX && mkdir "$TGI_INSTALL_PREFIX/include" && mkdir "$TGI_INSTALL_PREFIX/lib" && \ - cd backends/trtllm && \ - CMAKE_INSTALL_PREFIX=$TGI_INSTALL_PREFIX cargo build --release + python3 backends/trtllm/scripts/setup_sccache.py --is-gha-build ${is_gha_build} && \ + CMAKE_INSTALL_PREFIX=$TGI_INSTALL_PREFIX \ + RUSTC_WRAPPER=sccache \ + cargo build --profile ${build_type} --package text-generation-backends-trtllm --bin text-generation-backends-trtllm && \ + sccache --show-stats FROM nvidia/cuda:12.6.3-cudnn-runtime-ubuntu24.04 AS runtime RUN apt update && apt install -y libucx0 pipx python3-minimal python3-dev python3-pip python3-venv && \ @@ -116,6 +129,28 @@ FROM runtime LABEL co.huggingface.vendor="Hugging Face Inc." 
LABEL org.opencontainers.image.authors="hardware@hf.co" +LABEL org.opencontainers.title="Text-Generation-Inference TensorRT-LLM Backend" ENTRYPOINT ["./text-generation-launcher"] CMD ["--executor-worker", "/usr/local/tgi/bin/executorWorker"] + +# This is used only for the CI/CD +FROM nvidia/cuda:12.6.3-cudnn-runtime-ubuntu24.04 AS ci-runtime +RUN apt update && apt install -y libasan8 libubsan1 libucx0 pipx python3-minimal python3-dev python3-pip python3-venv && \ + rm -rf /var/lib/{apt,dpkg,cache,log}/ && \ + pipx ensurepath && \ + pipx install --include-deps transformers tokenizers + +WORKDIR /usr/local/tgi/bin + +ENV PATH=/root/.local/share/pipx/venvs/transformers/bin/:$PATH +ENV LD_LIBRARY_PATH="/usr/local/tgi/lib:/usr/local/mpi/lib:/usr/local/tensorrt/lib:/usr/local/cuda/lib64/stubs:$LD_LIBRARY_PATH" +ENV TOKENIZERS_PARALLELISM=false +ENV OMPI_MCA_plm_rsh_agent="" + +COPY --from=mpi-builder /usr/local/mpi /usr/local/mpi +COPY --from=trt-builder /usr/local/tensorrt /usr/local/tensorrt +COPY --from=tgi-builder /usr/local/tgi /usr/local/tgi + +# Basically we copy from target/debug instead of target/release +COPY --from=tgi-builder /usr/src/text-generation-inference/target/debug/text-generation-backends-trtllm /usr/local/tgi/bin/text-generation-launcher \ No newline at end of file diff --git a/backends/trtllm/CMakeLists.txt b/backends/trtllm/CMakeLists.txt index 9c1f3436..2cbd3a07 100644 --- a/backends/trtllm/CMakeLists.txt +++ b/backends/trtllm/CMakeLists.txt @@ -1,11 +1,18 @@ cmake_minimum_required(VERSION 3.20) -if (NOT DEFINED CMAKE_CXX_COMPILER_LAUNCHER AND CMAKE_BUILD_TYPE STREQUAL "Debug") +if (NOT DEFINED CMAKE_CXX_COMPILER_LAUNCHER) find_program(CCACHE_EXECUTABLE "ccache") if (CCACHE_EXECUTABLE) message(STATUS "Using ccache") - set(CMAKE_CXX_COMPILER_LAUNCHER "${CCACHE_EXECUTABLE}" CACHE PATH "Path to ccache" FORCE) + set(CMAKE_C_COMPILER_LAUNCHER "${CCACHE_EXECUTABLE}") + set(CMAKE_CXX_COMPILER_LAUNCHER "${CCACHE_EXECUTABLE}") + set(CMAKE_CUDA_COMPILER_LAUNCHER "${CCACHE_EXECUTABLE}") endif () +else () + message(STATUS "Using user specified cmake cxx compiler launcher: ${CMAKE_CXX_COMPILER_LAUNCHER}") + set(CMAKE_C_COMPILER_LAUNCHER "${CMAKE_CXX_COMPILER_LAUNCHER}") + set(CMAKE_CXX_COMPILER_LAUNCHER "${CMAKE_CXX_COMPILER_LAUNCHER}") + set(CMAKE_CUDA_COMPILER_LAUNCHER "${CMAKE_CXX_COMPILER_LAUNCHER}") endif () if (CMAKE_VERSION VERSION_GREATER_EQUAL "3.24.0") @@ -21,28 +28,31 @@ include(CheckCXXCompilerFlag) option(TGI_TRTLLM_BACKEND_BUILD_TESTS "Enable building the unittests suite" OFF) option(TGI_TRTLLM_BACKEND_BUILD_EXAMPLES "Enable building the examples suite" OFF) +option(TGI_TRTLLM_BACKEND_BUILD_USE_LLD "Enable lld linker instead of ld" OFF) set(TGI_TRTLLM_BACKEND_TARGET_CUDA_ARCH_LIST "89-real" CACHE STRING "List of CUDA architectures to support") -set(TGI_TRTLLM_BACKEND_TRT_ROOT "/usr/local/tensorrt" CACHE STRING "Path where TensorRT libraries and headers are located") +set(TGI_TRTLLM_BACKEND_TRT_ROOT "/usr/local/tensorrt" CACHE STRING "Path rgo where TensorRT libraries and headers are located") set(TGI_TRTLLM_BACKEND_TRT_INCLUDE_DIR "${TGI_TRTLLM_BACKEND_TRT_ROOT}/include" CACHE STRING "Path where TensorRT headers are located") set(TGI_TRTLLM_BACKEND_TRT_LIB_DIR "${TGI_TRTLLM_BACKEND_TRT_ROOT}/lib" CACHE STRING "Path where TensorRT libraries are located") # We are using nvidia-ml to query at runtime device information to enable some architecture-specific features find_package(CUDAToolkit 12.6 REQUIRED COMPONENTS CUDA::cudart CUDA::nvml) +find_package(MPI REQUIRED) #### 
External dependencies #### include(cmake/json.cmake) include(cmake/spdlog.cmake) include(cmake/trtllm.cmake) -if(${CMAKE_BUILD_TYPE} STREQUAL "Debug") +if (CMAKE_BUILD_TYPE STREQUAL "Debug") + set(TGI_TRTLLM_BACKEND_DEBUG ON) add_compile_definitions(TGI_TRTLLM_BACKEND_DEBUG=1) -endif() + add_compile_definitions(SPDLOG_ACTIVE_LEVEL SPDLOG_LEVEL_TRACE) +endif () -# This attempt to detect if the compiler can emit warning if it can't apply return value optimization from a function -check_cxx_compiler_flag("-Wnrvo" COMPILER_SUPPORT_WARNING_ON_NVRO) -if(${COMPILER_SUPPORT_WARNING_ON_NVRO}) - set(CMAKE_CXX_FLAGS "{CMAKE_CXX_FLAGS} -Wnvro") -endif() +if (${TGI_TRTLLM_BACKEND_BUILD_USE_LLD}) + message(STATUS "Using lld linker") + add_link_options("-fuse-ld=lld") +endif () # Let's build TRTLLM as part of CMake add_subdirectory("${trtllm_SOURCE_DIR}/cpp" "${trtllm_SOURCE_DIR}/..") @@ -55,51 +65,68 @@ add_library(tgi_trtllm_backend_impl STATIC csrc/hardware.hpp csrc/backend.hpp cs include_directories(${TGI_TRTLLM_BACKEND_TRT_INCLUDE_DIR}) target_include_directories(tgi_trtllm_backend_impl PRIVATE $ -# $ + # $ ) target_include_directories(tgi_trtllm_backend_impl PUBLIC "${trtllm_SOURCE_DIR}/cpp/include") target_link_libraries(tgi_trtllm_backend_impl PRIVATE CUDA::cudart CUDA::nvml) target_link_libraries(tgi_trtllm_backend_impl PUBLIC nlohmann_json::nlohmann_json spdlog::spdlog) - -if(${CMAKE_BUILD_TYPE} STREQUAL "Debug") - target_link_libraries(tgi_trtllm_backend_impl PRIVATE tensorrt_llm nvinfer_plugin_tensorrt_llm) -else() - target_link_libraries(tgi_trtllm_backend_impl PRIVATE tensorrt_llm nvinfer_plugin_tensorrt_llm tensorrt_llm_nvrtc_wrapperm) -endif () +target_link_libraries(tgi_trtllm_backend_impl PRIVATE tensorrt_llm nvinfer_plugin_tensorrt_llm tensorrt_llm_nvrtc_wrapper) # This install all the artifacts in CMAKE_INSTALL_PREFIX under include/ lib/ bin/ to make easy to link / find it back -install(TARGETS tgi_trtllm_backend_impl tensorrt_llm nvinfer_plugin_tensorrt_llm decoder_attention executorWorker) -install(FILES ${TRTLLM_NVRTC_WRAPPER_LIBRARY_PATH} ${TRTLLM_EXECUTOR_STATIC_LIBRARY_PATH} TYPE LIB) +install(TARGETS tgi_trtllm_backend_impl) +install(TARGETS tensorrt_llm nvinfer_plugin_tensorrt_llm decoder_attention executorWorker) +install(FILES ${TRTLLM_NVRTC_WRAPPER_LIBRARY_PATH} TYPE LIB) +if (NOT ${TGI_TRTLLM_BACKEND_DEBUG}) + install(FILES ${TRTLLM_EXECUTOR_STATIC_LIBRARY_PATH} TYPE LIB) +endif () + #### Unit Tests #### -if (${TGI_TRTLLM_BACKEND_BUILD_TESTS}) +if (${TGI_TRTLLM_BACKEND_BUILD_TESTS} AND CMAKE_BUILD_TYPE MATCHES "Debug") message(STATUS "Building tests") + option(TGI_TRTLLM_BACKEND_ENABLE_ASAN "Enable AddressSanitizer") + option(TGI_TRTLLM_BACKEND_ENABLE_UBSAN "Enable UndefinedSanitizer") + FetchContent_Declare( Catch2 URL https://github.com/catchorg/Catch2/archive/refs/tags/v3.7.1.tar.gz ) FetchContent_MakeAvailable(Catch2) + # Detect whether the compiler can emit a warning when it cannot apply return value optimization (NRVO) to a function + check_cxx_compiler_flag("-Wnrvo" COMPILER_SUPPORT_WARNING_ON_NVRO) + if (${COMPILER_SUPPORT_WARNING_ON_NVRO}) + message(STATUS "Enabling non-NRVO detection") + target_compile_options(tgi_trtllm_backend_impl PRIVATE "-Wnrvo") + endif () + + cmake_path(GET TRTLLM_NVRTC_WRAPPER_LIBRARY_PATH PARENT_PATH TRTLLM_NVRTC_WRAPPER_PARENT_LIBRARY_PATH) + message(STATUS "Adding linking path: ${TRTLLM_NVRTC_WRAPPER_PARENT_LIBRARY_PATH}") + add_executable(tgi_trtllm_backend_tests tests/test_hardware.cpp tests/test_backend.cpp) + + # 
target_compile_options(tgi_trtllm_backend_tests PRIVATE -Werror) + target_link_directories(tgi_trtllm_backend_tests PRIVATE "${TRTLLM_NVRTC_WRAPPER_PARENT_LIBRARY_PATH}") target_include_directories(tgi_trtllm_backend_tests PUBLIC "${trtllm_SOURCE_DIR}/cpp/include") target_include_directories(tgi_trtllm_backend_tests PUBLIC "csrc/") target_link_libraries(tgi_trtllm_backend_tests PRIVATE ${TRTLLM_LIBS} CUDA::cudart CUDA::nvml) target_link_libraries(tgi_trtllm_backend_tests PUBLIC Catch2::Catch2WithMain nlohmann_json::nlohmann_json spdlog::spdlog tgi_trtllm_backend_impl) + target_link_libraries(tgi_trtllm_backend_tests PRIVATE tensorrt_llm nvinfer_plugin_tensorrt_llm tensorrt_llm_nvrtc_wrapper) - if(${CMAKE_BUILD_TYPE} STREQUAL "Debug") - target_link_libraries(tgi_trtllm_backend_tests PRIVATE tensorrt_llm nvinfer_plugin_tensorrt_llm) - else() - target_link_libraries(tgi_trtllm_backend_tests PRIVATE tensorrt_llm nvinfer_plugin_tensorrt_llm tensorrt_llm_nvrtc_wrapperm) + if (${TGI_TRTLLM_BACKEND_ENABLE_ASAN}) + message(STATUS "Enabled AddressSanitizer") + target_link_options(tgi_trtllm_backend_tests BEFORE PUBLIC -fsanitize=address) endif () - if(CMAKE_BUILD_TYPE MATCHES "Debug") - set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -Werror -fsanitize=undefined -fsanitize=address") - set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -Werror -fsanitize=undefined -fsanitize=address") - target_link_options(tgi_trtllm_backend_tests BEFORE PUBLIC -fsanitize=undefined PUBLIC -fsanitize=address) - endif() + if (${TGI_TRTLLM_BACKEND_ENABLE_UBSAN}) + message(STATUS "Enabled UndefinedSanitizer") + target_link_options(tgi_trtllm_backend_tests BEFORE PUBLIC -fsanitize=undefined) + endif () - list(APPEND CMAKE_MODULE_PATH ${catch2_SOURCE_DIR}/extras) - include(CTest) - include(Catch) - catch_discover_tests(tgi_trtllm_backend_tests) -endif () + install(TARGETS tgi_trtllm_backend_tests) + + # list(APPEND CMAKE_MODULE_PATH ${catch2_SOURCE_DIR}/extras) + # include(CTest) + # include(Catch) + # catch_discover_tests(tgi_trtllm_backend_tests) +endif () \ No newline at end of file diff --git a/backends/trtllm/build.rs b/backends/trtllm/build.rs index d9c1aa15..c18b13a9 100644 --- a/backends/trtllm/build.rs +++ b/backends/trtllm/build.rs @@ -3,6 +3,7 @@ use pkg_config; use std::env; use std::env::consts::ARCH; use std::path::{absolute, PathBuf}; +use std::sync::LazyLock; const ADDITIONAL_BACKEND_LINK_LIBRARIES: [&str; 1] = ["spdlog"]; const CUDA_ARCH_LIST: Option<&str> = option_env!("CUDA_ARCH_LIST"); @@ -12,12 +13,20 @@ const INSTALL_PREFIX: Option<&str> = option_env!("CMAKE_INSTALL_PREFIX"); const TENSORRT_ROOT_DIR: Option<&str> = option_env!("TENSORRT_ROOT_DIR"); const NCCL_ROOT_DIR: Option<&str> = option_env!("NCCL_ROOT_DIR"); +const IS_GHA_BUILD: LazyLock = LazyLock::new(|| { + option_env!("IS_GHA_BUILD").map_or(false, |value| match value.to_lowercase().as_str() { + "on" => true, + "true" => true, + "1" => true, + _ => false, + }) +}); + // Dependencies -const BACKEND_DEPS: [&str; 2] = ["tgi_trtllm_backend_impl", "tgi_trtllm_backend"]; +const BACKEND_DEPS: &str = "tgi_trtllm_backend_impl"; const CUDA_TRANSITIVE_DEPS: [&str; 4] = ["cuda", "cudart", "cublas", "nvidia-ml"]; -const TENSORRT_LLM_TRANSITIVE_DEPS: [(&str, &str); 5] = [ +const TENSORRT_LLM_TRANSITIVE_DEPS: [(&str, &str); 4] = [ ("dylib", "tensorrt_llm"), - ("static", "tensorrt_llm_executor_static"), ("dylib", "tensorrt_llm_nvrtc_wrapper"), ("dylib", "nvinfer_plugin_tensorrt_llm"), ("dylib", "decoder_attention"), @@ -32,6 +41,48 @@ macro_rules! 
probe { }; } +fn get_compiler_flag( + switch: bool, + true_case: &'static str, + false_case: &'static str, +) -> &'static str { + match switch { + true => true_case, + false => false_case, + } +} + +fn get_library_architecture() -> &'static str { + let os = env::var("CARGO_CFG_TARGET_OS").unwrap(); + let arch = env::var("CARGO_CFG_TARGET_ARCH").unwrap(); + let env = env::var("CARGO_CFG_TARGET_ENV").unwrap(); + + match os.as_str() { + "linux" => { + if env != "gnu" { + panic!("unsupported linux ABI {env}, only 'gnu' is supported") + } + + match arch.as_str() { + "x86_64" => "x86_64-linux-gnu", + "aarch64" => "aarch64-linux-gnu", + _ => panic!("unsupported linux architecture {arch}"), + } + } + "windows" => { + if env != "msvc" { + panic!("unsupported windows ABI {env}, only 'msvc' is supported") + } + + match arch.as_str() { + "x86_64" => "x86_64-windows-msvc", + _ => panic!("unsupported windows architecture {arch}"), + } + } + _ => panic!("unsupported OS {os}"), + } +} + fn build_backend(is_debug: bool, opt_level: &str, out_dir: &PathBuf) -> (PathBuf, PathBuf) { // Build the backend implementation through CMake let install_path = INSTALL_PREFIX.unwrap_or("/usr/local/tgi"); @@ -54,10 +105,45 @@ fn build_backend(is_debug: bool, opt_level: &str, out_dir: &PathBuf) -> (PathBuf .env("OPT_LEVEL", opt_level) .define("CMAKE_INSTALL_PREFIX", &install_path) .define("CMAKE_CUDA_COMPILER", "/usr/local/cuda/bin/nvcc") - .define("Python3_ROOT_DIR", "../venv") + .define("CMAKE_LIBRARY_ARCHITECTURE", get_library_architecture()) .define("TGI_TRTLLM_BACKEND_TARGET_CUDA_ARCH_LIST", cuda_arch_list) + .define( + "TGI_TRTLLM_BACKEND_DEBUG", + get_compiler_flag(is_debug, "ON", "OFF"), + ) .define("TGI_TRTLLM_BACKEND_TRT_ROOT", tensorrt_path); + if is_debug || *IS_GHA_BUILD { + config.define("TGI_TRTLLM_BACKEND_BUILD_TESTS", "ON"); + } + + if option_env!("USE_LLD_LINKER").is_some() { + println!("cargo:warning=Using lld linker"); + config.define("TGI_TRTLLM_BACKEND_BUILD_USE_LLD", "ON"); + } + + if (is_debug && option_env!("ENABLE_ASAN").is_some()) || *IS_GHA_BUILD { + println!("cargo:warning=Enabling Address Sanitizer"); + config.define("TGI_TRTLLM_BACKEND_ENABLE_ASAN", "ON"); + } + + if (is_debug && option_env!("ENABLE_UBSAN").is_some()) || *IS_GHA_BUILD { + println!("cargo:warning=Enabling Undefined Sanitizer"); + config.define("TGI_TRTLLM_BACKEND_ENABLE_UBSAN", "ON"); + } + + if let Some(nvcc_host_compiler) = option_env!("CMAKE_CUDA_HOST_COMPILER") { + config.define("CMAKE_CUDA_HOST_COMPILER", nvcc_host_compiler); + } + + if let Some(wrapper) = option_env!("RUSTC_WRAPPER") { + println!("cargo:warning=Using caching tool: {wrapper}"); + + env::set_var("CMAKE_C_COMPILER_LAUNCHER", wrapper); + env::set_var("CMAKE_CXX_COMPILER_LAUNCHER", wrapper); + env::set_var("CMAKE_CUDA_COMPILER_LAUNCHER", wrapper); + } + // Allow to override which Python to use ... 
if let Some(python3) = option_env!("Python3_EXECUTABLE") { config.define("Python3_EXECUTABLE", python3); @@ -78,23 +164,18 @@ fn build_backend(is_debug: bool, opt_level: &str, out_dir: &PathBuf) -> (PathBuf } // Emit linkage information from the artifacts we just built - let install_lib_path = install_path.join("lib"); - - println!( - r"cargo:warning=Adding link search path: {}", - install_lib_path.display() - ); - println!(r"cargo:rustc-link-search={}", install_lib_path.display()); - + for path in ["lib", "lib64"] { + let install_lib_path = install_path.join(path); + println!( + r"cargo:warning=Adding link search path: {}", + install_lib_path.display() + ); + println!(r"cargo:rustc-link-search={}", install_lib_path.display()); + } (PathBuf::from(install_path), deps_folder) } fn build_ffi_layer(deps_folder: &PathBuf, is_debug: bool) { - let ndebug = match is_debug { - true => "1", - false => "0", - }; - CFG.include_prefix = "backends/trtllm"; cxx_build::bridge("src/lib.rs") .static_flag(true) @@ -106,7 +187,10 @@ fn build_ffi_layer(deps_folder: &PathBuf, is_debug: bool) { .include("/usr/local/tensorrt/include") .include("csrc/") .file("csrc/ffi.hpp") - .define("TGI_TRTLLM_BACKEND_DEBUG", ndebug) + .define( + "TGI_TRTLLM_BACKEND_DEBUG", + get_compiler_flag(is_debug, "ON", "OFF"), + ) .compile("tgi_trtllm_backend"); println!("cargo:rerun-if-changed=CMakeLists.txt"); @@ -125,6 +209,7 @@ fn main() { let build_profile = env::var("PROFILE").unwrap(); let (is_debug, opt_level) = match build_profile.as_ref() { "debug" => (true, "0"), + "dev" => (true, "0"), _ => (false, "3"), }; @@ -161,7 +246,5 @@ fn main() { }); // Backend - BACKEND_DEPS.iter().for_each(|name| { - println!("cargo:rustc-link-lib=static={}", name); - }); -} + println!("cargo:rustc-link-lib=static={}", &BACKEND_DEPS); +} \ No newline at end of file diff --git a/backends/trtllm/cmake/spdlog.cmake b/backends/trtllm/cmake/spdlog.cmake index 45e6790a..e7566cd7 100644 --- a/backends/trtllm/cmake/spdlog.cmake +++ b/backends/trtllm/cmake/spdlog.cmake @@ -4,14 +4,14 @@ set(SPDLOG_FMT_EXTERNAL OFF) # Define the level at which SPDLOG_ compilation level is defined if (${CMAKE_BUILD_TYPE} STREQUAL "Debug") - add_compile_definitions(SPDLOG_ACTIVE_LEVEL SPDLOG_LEVEL_DEBUG) + add_compile_definitions(SPDLOG_ACTIVE_LEVEL SPDLOG_LEVEL_TRACE) else () - add_compile_definitions(SPDLOG_ACTIVE_LEVEL SPDLOG_LEVEL_INFO) + add_compile_definitions(SPDLOG_ACTIVE_LEVEL SPDLOG_LEVEL_DEBUG) endif () fetchcontent_declare( spdlog -# DOWNLOAD_EXTRACT_TIMESTAMP - URL https://github.com/gabime/spdlog/archive/refs/tags/v1.14.1.tar.gz + # DOWNLOAD_EXTRACT_TIMESTAMP + URL https://github.com/gabime/spdlog/archive/refs/tags/v1.15.0.tar.gz ) fetchcontent_makeavailable(spdlog) diff --git a/backends/trtllm/cmake/trtllm.cmake b/backends/trtllm/cmake/trtllm.cmake index 4217892b..d789b1eb 100644 --- a/backends/trtllm/cmake/trtllm.cmake +++ b/backends/trtllm/cmake/trtllm.cmake @@ -14,11 +14,13 @@ message(STATUS "Building for CUDA Architectures: ${CMAKE_CUDA_ARCHITECTURES}") set(ENABLE_UCX OFF) if (${CMAKE_BUILD_TYPE} STREQUAL "Debug") set(FAST_BUILD ON) - set(NVTX_DISABLE OFF) + set(NVTX_DISABLE ON) + set(INDEX_RANGE_CHECK ON) else () set(FAST_BUILD OFF) set(FAST_MATH ON) - set(NVTX_DISABLE ON) + set(NVTX_DISABLE OFF) + set(INDEX_RANGE_CHECK OFF) endif () find_package(Python3 REQUIRED Interpreter) diff --git a/backends/trtllm/csrc/backend.cpp b/backends/trtllm/csrc/backend.cpp index b50044d8..2151466b 100644 --- a/backends/trtllm/csrc/backend.cpp +++ 
b/backends/trtllm/csrc/backend.cpp @@ -1,7 +1,6 @@ #include #include -#include #include "backend.hpp" #include "hardware.hpp" @@ -17,7 +16,8 @@ namespace huggingface::tgi::backends::trtllm { if (world_size > 1) { SPDLOG_INFO("Detected sharded engine deployment, using orchestrator mode"); mode = tle::CommunicationMode::kORCHESTRATOR; - orchestratorConfig = std::make_optional(true, executor_worker_path_, nullptr, true); + orchestratorConfig = std::make_optional(true, executor_worker_path_, nullptr, + true); } else { SPDLOG_INFO("Detected single engine deployment, using leader mode"); } @@ -44,21 +44,22 @@ namespace huggingface::tgi::backends::trtllm { } backend_t::backend_t(std::filesystem::path &engines_folder, std::filesystem::path &executor_worker_path) - : workspace(engines_folder, executor_worker_path), executor_(executor_factory_initializer(workspace)) {} + : workspace(engines_folder, executor_worker_path), executor_(executor_factory_initializer(workspace)) {} size_t backend_t::num_tokens_ready() const noexcept { return executor_.getNumResponsesReady(); } std::expected - backend_t::submit(std::span token_ids, const generation_params_t generation_params, const sampling_params_t sampling_params) noexcept { - SPDLOG_DEBUG("Submitting {:d} tokens to the executor for scheduling ({}, {})", token_ids.size(), generation_params, sampling_params); - return executor_.enqueueRequest(tle::Request { + backend_t::submit(std::span token_ids, const generation_params_t g_params, + const sampling_params_t s_params) noexcept { + SPDLOG_DEBUG("Submit {:d} tokens for scheduling ({}, {})", token_ids.size(), g_params, s_params); + return executor_.enqueueRequest(tle::Request{ {token_ids.begin(), token_ids.end()}, // Making actual copy of the tokens - static_cast(generation_params.max_new_tokens), + static_cast(g_params.max_new_tokens), true, - (tle::SamplingConfig) sampling_params, - tle::OutputConfig { /* returnLogProbs= */ true }, + (tle::SamplingConfig) s_params, + tle::OutputConfig{ /* returnLogProbs= */ true}, std::nullopt, std::nullopt, std::nullopt, diff --git a/backends/trtllm/csrc/ffi.hpp b/backends/trtllm/csrc/ffi.hpp index d0342d4b..840614bb 100644 --- a/backends/trtllm/csrc/ffi.hpp +++ b/backends/trtllm/csrc/ffi.hpp @@ -28,20 +28,62 @@ namespace huggingface::tgi::backends::trtllm { #include "backends/trtllm/src/lib.rs.h" + namespace huggingface::tgi::backends::trtllm { std::once_flag backend_initialized_flag; + constexpr finish_reason_t as_finish_reason_t(const tle::FinishReason reason) noexcept { + switch (reason) { + case tle::FinishReason::kNOT_FINISHED: + return finish_reason_t::kNOT_FINISHED; + case tle::FinishReason::kSTOP_WORDS: + return finish_reason_t::kSTOP_WORDS; + case tle::FinishReason::kEND_ID: + return finish_reason_t::kEND_ID; + case tle::FinishReason::kLENGTH: + return finish_reason_t::kLENGTH; + default: + std::unreachable(); + } + } + + static auto as_generation_step = [](const tle::Response &r) { + const auto reqId = r.getRequestId(); + if (!r.hasError()) [[likely]] { + const auto result = r.getResult(); + const auto logits = result.logProbs.value()[0]; + return generation_step_t{ + reqId, + static_cast(result.outputTokenIds[0][0]), + logits.back(), + result.isFinal, + as_finish_reason_t(result.finishReasons[0]), + false, + std::string() + }; + } else { + return generation_step_t{ + reqId, + 0, + 0.0, + true, + finish_reason_t::kNOT_FINISHED, + true, + std::move(r.getErrorMsg()) + }; + } + }; + + class tensorrt_llm_backend_t { private: backend_t inner_; public: 
tensorrt_llm_backend_t(std::filesystem::path &&engine_folder, std::filesystem::path &&executor_worker_path) - : inner_(engine_folder, executor_worker_path) {} + : inner_(engine_folder, executor_worker_path) {} - size_t num_tokens_ready() const noexcept { - return inner_.num_tokens_ready(); - } + size_t num_tokens_ready() const noexcept { return inner_.num_tokens_ready(); } request_id_t submit( rust::Slice tokens, @@ -59,13 +101,13 @@ namespace huggingface::tgi::backends::trtllm { // Submit the request to the executor and get back a potential request_id used to track request status const auto signed_tokens = std::vector(tokens.begin(), tokens.end()); const auto maybe_request_id = inner_.submit( - signed_tokens, - {max_new_tokens}, - {top_k, top_p, repetition_penalty, frequency_penalty, temperature, seed} + signed_tokens, + {max_new_tokens}, + {top_k, top_p, repetition_penalty, frequency_penalty, temperature, seed} ); // If we do have a value, let's return the request_id - if(maybe_request_id.has_value()) [[likely]] { + if (maybe_request_id.has_value()) [[likely]] { return *maybe_request_id; } else { SPDLOG_WARN("[FFI] Failed to submit request to the executor"); @@ -74,61 +116,45 @@ namespace huggingface::tgi::backends::trtllm { } std::unique_ptr> pull_tokens() noexcept { - if(num_tokens_ready() > 0) [[likely]] { + if (num_tokens_ready() > 0) [[likely]] { const auto responses = inner_.pull_tokens(); SPDLOG_TRACE("[FFI] Successfully pulled out {:d} responses from executor", responses.size()); - // Transform tle::Response to GenerationStep - auto steps = std::make_unique>(); - std::ranges::transform(responses.begin(), responses.end(), std::back_inserter(*steps), [](const tle::Response &r) { - const auto reqId = r.getRequestId(); - if (!r.hasError()) [[likely]] { - const auto result = r.getResult(); - return generation_step_t{ - reqId, - static_cast(result.outputTokenIds[0][0]), - result.logProbs.value()[0][0], - result.isFinal, - false, - std::string() - }; - } else { - return generation_step_t{ - reqId, - 0, - 0.0, - true, - true, - std::move(r.getErrorMsg()) - }; - } - }); - return steps; + + // Transform tle::Response to generation_step_t +#ifdef __cpp_lib_ranges_to_container + auto steps = responses | std::views::transform(as_generation_step) | std::ranges::to(); +#else + auto steps = std::vector(); + steps.reserve(responses.size()); + std::transform(responses.begin(), responses.end(), std::back_inserter(steps), as_generation_step); +#endif + return std::make_unique>(steps); } else { return std::make_unique>(); } } - void cancel(request_id_t requestId) noexcept { - SPDLOG_DEBUG("[FFI] cancelling request {:d}", requestId); - inner_.cancel(requestId); + void cancel(request_id_t request_id) noexcept { + SPDLOG_DEBUG("[FFI] cancelling request {:d}", request_id); + inner_.cancel(request_id); } }; void initialize_logging() { #ifndef TGI_TRTLLM_BACKEND_DEBUG if (const auto TRTLLM_LOG_LEVEL_CSTR = std::getenv("TRTLLM_LOG_LEVEL")) { - std::string log_level(TRTLLM_LOG_LEVEL_CSTR); - std::transform(log_level.begin(), log_level.end(), log_level.begin(), [](unsigned char c) { - return std::tolower(c); - }); + std::string log_level(TRTLLM_LOG_LEVEL_CSTR); + std::transform(log_level.begin(), log_level.end(), log_level.begin(), [](unsigned char c) { + return std::tolower(c); + }); - if (log_level == "debug") - spdlog::set_level(spdlog::level::debug); - else - spdlog::set_level(spdlog::level::info); - } + if (log_level == "debug") + spdlog::set_level(spdlog::level::debug); + else + 
spdlog::set_level(spdlog::level::info); + } #else spdlog::set_level(spdlog::level::debug); #endif @@ -151,11 +177,14 @@ namespace huggingface::tgi::backends::trtllm { } } - std::unique_ptr create_backend_from_engine_folder(const rust::Str engines_folder, const rust::Str executor_worker_path) { + std::unique_ptr + create_backend_from_engine_folder(const rust::Str engines_folder, const rust::Str executor_worker_path) { std::call_once(backend_initialized_flag, initialize_tensorrt_llm_backend); return std::make_unique( - std::filesystem::path(std::string_view(engines_folder.begin(), engines_folder.end()), std::filesystem::path::format::auto_format), - std::filesystem::path(std::string_view(executor_worker_path.begin(), executor_worker_path.end()), std::filesystem::path::format::auto_format) + std::filesystem::path(std::string_view(engines_folder.begin(), engines_folder.end()), + std::filesystem::path::format::auto_format), + std::filesystem::path(std::string_view(executor_worker_path.begin(), executor_worker_path.end()), + std::filesystem::path::format::auto_format) ); } } diff --git a/backends/trtllm/scripts/setup_sccache.py b/backends/trtllm/scripts/setup_sccache.py new file mode 100644 index 00000000..982f8c77 --- /dev/null +++ b/backends/trtllm/scripts/setup_sccache.py @@ -0,0 +1,49 @@ +from argparse import ArgumentParser + +AWS_S3_CACHING_VARIABLES = { + "AWS_ACCESS_KEY_ID": "aws_access_key_id", + "AWS_SECRET_ACCESS_KEY": "aws_secret_access_key", + "AWS_SESSION_TOKEN": "aws_session_token", + "SCCACHE_REGION": "s3_region", + "SCCACHE_BUCKET": "s3_bucket_name", +} + +ALL_CACHING_STORAGE_VARIABLES = { + "AWS_S3_CACHING_VARIABLES" +} + + +def setup_sccache_locally(): + from os import environ + + print("Setting up Local Caching Layer") + for target in ALL_CACHING_STORAGE_VARIABLES: + for envvar in globals()[target].keys(): + if envvar in environ: + print(f"Deleted {envvar} from environment variables") + del environ[envvar] + + +def setup_sccache_for_s3(): + from os import environ + + print("Setting up AWS S3 Caching Layer") + for envvar in AWS_S3_CACHING_VARIABLES.keys(): + if not envvar in environ or not environ[envvar] or len(environ[envvar]) == 0: + print(f"Missing definition for environment variable {envvar}") + + +if __name__ == "__main__": + parser = ArgumentParser("TensorRT-LLM Build Caching Setup") + + parser.add_argument("--is-gha-build", type=str, default="FALSE", + help="Indicate if the build is from Github Actions") + + # Parse args + args = parser.parse_args() + args.is_gha_build = args.is_gha_build.lower() in {"on", "true", "1"} + + if args.is_gha_build: + setup_sccache_for_s3() + else: + setup_sccache_locally() diff --git a/backends/trtllm/src/lib.rs b/backends/trtllm/src/lib.rs index d6acafa1..08507256 100644 --- a/backends/trtllm/src/lib.rs +++ b/backends/trtllm/src/lib.rs @@ -6,6 +6,26 @@ mod utils; #[cxx::bridge(namespace = "huggingface::tgi::backends::trtllm")] mod ffi { + #[cxx_name = "finish_reason_t"] + #[derive(Debug, Clone, Copy)] + pub enum FinishReason { + /// The request is not finished. + #[cxx_name = "kNOT_FINISHED"] + NotFinished = 0u8, + + /// The request finished because the end id was generated. + #[cxx_name = "kEND_ID"] + EndTokenId = 1u8, + + /// The request finished because a stop word was generated. + #[cxx_name = "kSTOP_WORDS"] + StopWords = 2u8, + + /// The request finished because the maximum number of tokens was reached. 
+ #[cxx_name = "kLENGTH"] + MaxLength = 3u8, + } + /// Struct used as shared type between rust and C++ to represent the result /// of a single decoding iteration #[cxx_name = "generation_step_t"] @@ -15,6 +35,7 @@ mod ffi { token_id: u32, log_prob: f32, is_final: bool, + finish_reason: FinishReason, has_error: bool, error_msg: String, } @@ -66,3 +87,17 @@ mod ffi { fn cancel(self: Pin<&mut TensorRtLlmBackendImpl>, request_id: u64); } } + +use ffi::FinishReason; +use text_generation_router::FinishReason as InferFinishReason; + +impl From for InferFinishReason { + fn from(reason: FinishReason) -> Self { + match reason { + FinishReason::StopWords => InferFinishReason::StopSequence, + FinishReason::MaxLength => InferFinishReason::Length, + FinishReason::EndTokenId => InferFinishReason::EndOfSequenceToken, + _ => panic!("Cannot convert {reason:?} to text_generation_router::FinishReason"), + } + } +} diff --git a/backends/trtllm/src/main.rs b/backends/trtllm/src/main.rs index af299f7d..5af96ade 100644 --- a/backends/trtllm/src/main.rs +++ b/backends/trtllm/src/main.rs @@ -11,7 +11,7 @@ use text_generation_router::server::{ get_hub_model_info, legacy_tokenizer_handle, py_resolve_tokenizer, }; use text_generation_router::usage_stats::UsageStatsLevel; -use text_generation_router::{server, HubTokenizerConfig, Tokenizer}; +use text_generation_router::{server, Tokenizer}; /// App Configuration #[derive(Parser, Debug)] @@ -69,7 +69,7 @@ struct Args { async fn get_tokenizer( tokenizer_name: &str, - tokenizer_config_path: Option<&str>, + _tokenizer_config_path: Option<&str>, revision: Option<&str>, ) -> Option { // Parse Huggingface hub token diff --git a/backends/trtllm/tests/test_backend.cpp b/backends/trtllm/tests/test_backend.cpp index 14d92b75..f44cc03f 100644 --- a/backends/trtllm/tests/test_backend.cpp +++ b/backends/trtllm/tests/test_backend.cpp @@ -8,13 +8,13 @@ #include "backend.hpp" - - using namespace huggingface::tgi::backends::trtllm; TEST_CASE("parse generation_config.json all set", "[generation_config_t]") { - const json config_j = {{"temperature", 0.6}, {"top_p", 0.95}, {"eos_token_id", {1,2,3}}}; + const json config_j = {{"temperature", 0.6}, + {"top_p", 0.95}, + {"eos_token_id", {1, 2, 3}}}; const auto generation_config = generation_config_t(config_j); REQUIRE_THAT(generation_config.temperature, Catch::Matchers::WithinAbs(0.6, 1e-6)); @@ -24,8 +24,9 @@ TEST_CASE("parse generation_config.json all set", "[generation_config_t]") REQUIRE_FALSE(generation_config.stop_words.empty()); REQUIRE(generation_config.stop_words.size() == config_j["/eos_token_id"_json_pointer].size()); - for (auto [lhs, rhs] : std::views::zip(generation_config.stop_words, std::list>{{1}, {2}, {3}})) - { + for (auto [lhs, rhs]: std::views::zip(generation_config.stop_words, std::list>{{1}, + {2}, + {3}})) { // Currently we do not support multi-tokens stop words REQUIRE(lhs.size() == 1); REQUIRE(rhs.size() == 1); @@ -35,7 +36,7 @@ TEST_CASE("parse generation_config.json all set", "[generation_config_t]") TEST_CASE("parse generation_config.json default", "[generation_config_t]") { - const json config_j = {{"eos_token_id", {1,2,3}}}; + const json config_j = {{"eos_token_id", {1, 2, 3}}}; const auto generation_config = generation_config_t(config_j); REQUIRE_THAT(generation_config.temperature, Catch::Matchers::WithinAbs(1.0, 1e-6)); @@ -44,8 +45,9 @@ TEST_CASE("parse generation_config.json default", "[generation_config_t]") REQUIRE_FALSE(generation_config.stop_words.empty()); REQUIRE(generation_config.stop_words.size() == 
config_j["/eos_token_id"_json_pointer].size()); - for (auto [lhs, rhs] : std::views::zip(generation_config.stop_words, std::list>{{1}, {2}, {3}})) - { + for (auto [lhs, rhs]: std::views::zip(generation_config.stop_words, std::list>{{1}, + {2}, + {3}})) { // Currently we do not support multi-tokens stop words REQUIRE(lhs.size() == 1); REQUIRE(rhs.size() == 1);
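Note on the new finish-reason plumbing: the sketch below is a standalone illustration, not part of the patch. The enum names mirror the cxx-bridge `finish_reason_t` and the router-side conversion added in backends/trtllm/src/lib.rs, but the types here are local stand-ins (the real definitions live in the cxx bridge and in text_generation_router), and the conversion is written as a fallible function instead of the panicking `From` impl in the diff.

// Standalone sketch: `FinishReason` and `InferFinishReason` are local stand-ins for the
// cxx-bridge enum and `text_generation_router::FinishReason` referenced in the diff.
#[derive(Debug, Clone, Copy)]
enum FinishReason {
    NotFinished,
    EndTokenId,
    StopWords,
    MaxLength,
}

#[derive(Debug, PartialEq)]
enum InferFinishReason {
    Length,
    EndOfSequenceToken,
    StopSequence,
}

// Mirrors the mapping of the `From<FinishReason>` impl added in backends/trtllm/src/lib.rs,
// but made total: a not-finished step yields an error instead of a panic.
fn to_infer_finish_reason(reason: FinishReason) -> Result<InferFinishReason, &'static str> {
    match reason {
        FinishReason::StopWords => Ok(InferFinishReason::StopSequence),
        FinishReason::MaxLength => Ok(InferFinishReason::Length),
        FinishReason::EndTokenId => Ok(InferFinishReason::EndOfSequenceToken),
        FinishReason::NotFinished => Err("step is not final, no finish reason to report"),
    }
}

fn main() {
    // A final step returned by the executor with kSTOP_WORDS surfaces as a
    // stop-sequence finish reason on the router side.
    assert_eq!(
        to_infer_finish_reason(FinishReason::StopWords),
        Ok(InferFinishReason::StopSequence)
    );
    println!("{:?}", to_infer_finish_reason(FinishReason::NotFinished));
}

Only terminal steps carry a usable finish reason, which is why the patch converts the reason on the router side solely for final generation steps; the kNOT_FINISHED case is treated as unreachable there.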