feat(tgi_common): add initial set of common functions for reuse

Morgan Funtowicz 2024-09-27 18:53:56 +02:00
parent 31a6065fac
commit 5f9120da9c
10 changed files with 112 additions and 162 deletions


@@ -16,16 +16,16 @@ option(TGI_BUILD_CCL "Flag to enable/disable build of tgiccl collective library"
 # Add some modules
 include(FetchContent)
+include(cmake/fmt.cmake)
 include(cmake/spdlog.cmake)
 
 # Let's find LibTorch
 include(cmake/torch.cmake)
-find_package(Python3 COMPONENTS Interpreter)
-ProbeForPyTorchInstall()
-ConfigurePyTorch()
 find_package(Torch REQUIRED)
-set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} ${TORCH_CXX_FLAGS}")
+find_package(Python3 COMPONENTS Interpreter)
+
+# TGI common
+add_subdirectory(common)
 
 # Include submodules
 if (${TGI_BUILD_CCL})

csrc/cmake/fmt.cmake (new file)

@@ -0,0 +1,6 @@
+FetchContent_Declare(
+        fmt
+        GIT_REPOSITORY https://github.com/fmtlib/fmt
+        GIT_TAG 11.0.1
+)
+FetchContent_MakeAvailable(fmt)
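
fmt is pinned to the 11.0.1 tag and exposed as the fmt::fmt target, which both tgi_common and spdlog (via SPDLOG_FMT_EXTERNAL below) link against. A minimal sanity check for the fetched library could be (a sketch, not part of the commit):

    #include <fmt/format.h>

    int main() {
        // fmt 11 validates format strings at compile time where possible.
        fmt::print("{} + {} = {}\n", 1, 2, 1 + 2);
        return 0;
    }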


@@ -1,6 +1,17 @@
+set(SPDLOG_USE_FMT ON)
+set(SPDLOG_BUILD_SHARED OFF)
+set(SPDLOG_FMT_EXTERNAL ON)
+
+# Define the level at which SPDLOG_ compilation level is defined
+if (${CMAKE_BUILD_TYPE} STREQUAL "Debug")
+    add_compile_definitions(SPDLOG_ACTIVE_LEVEL SPDLOG_LEVEL_DEBUG)
+else ()
+    add_compile_definitions(SPDLOG_ACTIVE_LEVEL SPDLOG_LEVEL_INFO)
+endif ()
+
 fetchcontent_declare(
         spdlog
-        URL https://github.com/gabime/spdlog/archive/refs/tags/v1.14.1.tar.gz
+        GIT_REPOSITORY https://github.com/gabime/spdlog.git
+        GIT_TAG v1.14.1
 )
 fetchcontent_makeavailable(spdlog)
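
SPDLOG_ACTIVE_LEVEL is a compile-time filter: any SPDLOG_* macro below the active level compiles to nothing, so non-Debug builds configured as above drop SPDLOG_DEBUG call sites entirely. (Note that add_compile_definitions normally takes a single NAME=value argument, i.e. SPDLOG_ACTIVE_LEVEL=SPDLOG_LEVEL_DEBUG; two space-separated arguments define two separate empty macros.) A small illustration (a sketch, not part of the commit):

    #include <spdlog/spdlog.h>

    int main() {
        // Compiled out entirely when SPDLOG_ACTIVE_LEVEL is SPDLOG_LEVEL_INFO,
        // i.e. any non-Debug build under the configuration above.
        SPDLOG_DEBUG("probing devices...");
        // Present at either level, still subject to the runtime log level.
        SPDLOG_INFO("tgi_common initialized");
        return 0;
    }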


@@ -1,148 +1,7 @@
-# ProbeForPyTorchInstall
-# Attempts to find a Torch installation and set the Torch_ROOT variable
-# based on introspecting the python environment. This allows a subsequent
-# call to find_package(Torch) to work.
-function(ProbeForPyTorchInstall)
-    if (Torch_ROOT)
-        message(STATUS "Using cached Torch root = ${Torch_ROOT}")
-    else ()
-        message(STATUS "Checking for PyTorch using ${Python3_EXECUTABLE} ...")
-        execute_process(
-                COMMAND ${Python3_EXECUTABLE}
-                -c "import os;import torch;print(torch.utils.cmake_prefix_path, end='')"
-                WORKING_DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR}
-                RESULT_VARIABLE PYTORCH_STATUS
-                OUTPUT_VARIABLE PYTORCH_PACKAGE_DIR)
-        if (NOT PYTORCH_STATUS EQUAL "0")
-            message(STATUS "Unable to 'import torch' with ${Python3_EXECUTABLE} (fallback to explicit config)")
-            return()
-        endif ()
-        message(STATUS "Found PyTorch installation at ${PYTORCH_PACKAGE_DIR}")
-        set(Torch_ROOT "${PYTORCH_PACKAGE_DIR}" CACHE STRING
-                "Torch configure directory" FORCE)
-    endif ()
-endfunction()
-
-# ConfigurePyTorch
-# Extensions compiled against PyTorch must be ABI-compatible with PyTorch.
-# On Linux, there are two components to this:
-#   1) Dual ABI settings for libstdc++
-#      See https://gcc.gnu.org/onlinedocs/libstdc++/manual/using_dual_abi.html
-#      For this, PyTorch helpfully provides a function to check which ABI it was
-#      compiled against.
-#   2) C++ ABI compatibility version
-#      See https://gcc.gnu.org/onlinedocs/libstdc++/manual/abi.html (Sec 5/6)
-# The second is a bit more complicated. GCC has official compatibility strings
-# which can be specified by -fabi-version. Clang has no notion of ABI
-# versioning (https://lists.llvm.org/pipermail/cfe-dev/2015-June/043735.html).
-# Separately, pybind11 keeps an internal variable which records its ABI info
-# (PYBIND11_INTERNALS_ID in include/pybind11/detail/internals.h). Differences
-# in this variable between torch-mlir and PyTorch will cause type errors.
-# Thus, our best option is to:
-#   a) Identify which ABI version PyTorch was compiled with
-#   b) Tell gcc to use that version
-#      or
-#   c) Tell clang to pretend to use it and hope it's ABI-compatible, and
-#      tell pybind to pretend we're gcc.
-#
-# MacOS does not have a dual ABI problem.
-# FIXME: I don't know if MacOS needs ABI compatibility version flags.
-#
-# In the future, we may want to switch away from custom building these
-# extensions and instead rely on the Torch machinery directly (definitely want
-# to do that for official builds).
-function(ConfigurePyTorch)
-    message(STATUS "Checking PyTorch ABI settings...")
-    if (${CMAKE_SYSTEM_NAME} STREQUAL "Linux")
-        # Check dual ABI setting first
-        execute_process(
-                COMMAND ${Python3_EXECUTABLE}
-                -c "import torch; import sys; sys.stdout.write('1' if torch.compiled_with_cxx11_abi() else '0')"
-                RESULT_VARIABLE _result
-                WORKING_DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR}
-                OUTPUT_VARIABLE _use_cxx11_abi)
-        if (_result)
-            message(FATAL_ERROR "Failed to determine C++ Dual ABI: ${Python3_EXECUTABLE} -> ${_result}")
-        endif ()
-        message(STATUS "PyTorch C++ Dual ABI setting: \"${_use_cxx11_abi}\"")
-
-        # Check ABI compatibility version
-        execute_process(
-                COMMAND ${Python3_EXECUTABLE}
-                -c "import torch; import sys; abi=torch._C._PYBIND11_BUILD_ABI; abi.startswith('_cxxabi10') or sys.exit(1); sys.stdout.write(str(abi[-2:]))"
-                RESULT_VARIABLE _result
-                WORKING_DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR}
-                OUTPUT_VARIABLE _cxx_abi_version)
-        if (_result)
-            message(FATAL_ERROR "Failed to determine C++ ABI version")
-        endif ()
-        message(STATUS "PyTorch C++ ABI version: \"${_cxx_abi_version}\"")
-
-        # Specialize compile flags for compiler
-        if (${CMAKE_CXX_COMPILER_ID} STREQUAL "GNU")
-            set(TORCH_CXXFLAGS "-D_GLIBCXX_USE_CXX11_ABI=${_use_cxx11_abi} -fabi-version=${_cxx_abi_version}")
-        elseif (${CMAKE_CXX_COMPILER_ID} STREQUAL "Clang")
-            set(TORCH_CXXFLAGS "-D_GLIBCXX_USE_CXX11_ABI=${_use_cxx11_abi} -U__GXX_ABI_VERSION -D__GXX_ABI_VERSION=10${_cxx_abi_version} '-DPYBIND11_COMPILER_TYPE=\"_gcc\"'")
-        else ()
-            message(WARNING "Unrecognized compiler. Cannot determine ABI flags.")
-            return()
-        endif ()
-        set(TORCH_CXXFLAGS "${TORCH_CXXFLAGS}" PARENT_SCOPE)
-    endif ()
-endfunction()
-
-function(ConfigureLibTorch)
-    message(STATUS "Checking LibTorch ABI settings...")
-    if (${CMAKE_SYSTEM_NAME} STREQUAL "Linux")
-        message(STATUS "libtorch_python is ${TORCH_INSTALL_PREFIX}/lib/libtorch_python.so")
-
-        # Check dual ABI setting first
-        execute_process(
-                COMMAND bash "-c" "cat ${TORCH_INSTALL_PREFIX}/share/cmake/Torch/TorchConfig.cmake | egrep -o '_GLIBCXX_USE_CXX11_ABI=[0-1]' | egrep -o '.$'"
-                RESULT_VARIABLE _result
-                OUTPUT_VARIABLE _use_cxx11_abi
-                OUTPUT_STRIP_TRAILING_WHITESPACE)
-        if (_result)
-            message(FATAL_ERROR "Failed to determine LibTorch C++ Dual ABI")
-        endif ()
-        message(STATUS "LibTorch C++ Dual ABI setting: \"${_use_cxx11_abi}\"")
-
-        # Check ABI compatibility version
-        execute_process(
-                COMMAND bash "-c" "strings ${TORCH_INSTALL_PREFIX}/lib/libtorch_python.so | egrep '^_cxxabi[0-9]{4}' | egrep -o '..$'"
-                RESULT_VARIABLE _result
-                OUTPUT_VARIABLE _cxx_abi_version
-                OUTPUT_STRIP_TRAILING_WHITESPACE)
-        if (_result)
-            message(FATAL_ERROR "Failed to determine LibTorch C++ ABI version")
-        endif ()
-        message(STATUS "LibTorch C++ ABI version: \"${_cxx_abi_version}\"")
-
-        # Specialize compile flags for compiler
-        if (${CMAKE_CXX_COMPILER_ID} STREQUAL "GNU")
-            set(TORCH_CXXFLAGS "-D_GLIBCXX_USE_CXX11_ABI=${_use_cxx11_abi} -fabi-version=${_cxx_abi_version}")
-        elseif (${CMAKE_CXX_COMPILER_ID} STREQUAL "Clang")
-            set(TORCH_CXXFLAGS "-D_GLIBCXX_USE_CXX11_ABI=${_use_cxx11_abi} -U__GXX_ABI_VERSION -D__GXX_ABI_VERSION=10${_cxx_abi_version} '-DPYBIND11_COMPILER_TYPE=\"_gcc\"'")
-        else ()
-            message(WARNING "Unrecognized compiler. Cannot determine ABI flags.")
-            return()
-        endif ()
-        set(TORCH_CXXFLAGS "${TORCH_CXXFLAGS}" PARENT_SCOPE)
-    endif ()
-endfunction()
-
-function(torch_mlir_python_target_compile_options target)
-    target_compile_options(${target} PRIVATE
-            $<$<OR:$<CXX_COMPILER_ID:Clang>,$<CXX_COMPILER_ID:AppleClang>,$<CXX_COMPILER_ID:GNU>>:
-            # Enable RTTI and exceptions.
-            -frtti -fexceptions
-            # Noisy pybind warnings
-            -Wno-unused-value
-            -Wno-covered-switch-default
-            >
-            $<$<CXX_COMPILER_ID:MSVC>:
-            # Enable RTTI and exceptions.
-            /EHsc /GR>
-    )
-endfunction()
+fetchcontent_declare(
+        Torch
+        URL https://download.pytorch.org/libtorch/cu124/libtorch-cxx11-abi-shared-with-deps-2.4.1%2Bcu124.zip
+#        OVERRIDE_FIND_PACKAGE
+)
+FetchContent_MakeAvailable(Torch)
+list(APPEND CMAKE_PREFIX_PATH ${Torch_SOURCE_DIR})
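
This drops the torch-mlir-derived Python introspection entirely: instead of locating Torch through the active Python environment and matching its ABI flags, the build now downloads the CUDA 12.4 libtorch archive directly and lets find_package(Torch REQUIRED) resolve it through CMAKE_PREFIX_PATH. A minimal smoke test for the fetched toolchain could look like this (a sketch, not part of the commit; assumes the target links ${TORCH_LIBRARIES}):

    #include <iostream>
    #include <torch/torch.h>

    int main() {
        // Allocate a tensor through libtorch to confirm the link and ABI work.
        const auto t = torch::rand({2, 2});
        std::cout << t << '\n';
        std::cout << "CUDA available: " << std::boolalpha
                  << torch::cuda::is_available() << '\n';
        return 0;
    }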


@@ -0,0 +1,16 @@
+set(TGI_COMMON_HEADERS include/common/device.hpp)
+set(TGI_COMMON_SOURCES lib/device.cpp)
+
+add_library(tgi_common SHARED ${TGI_COMMON_HEADERS} ${TGI_COMMON_SOURCES})
+target_link_libraries(tgi_common fmt::fmt spdlog::spdlog ${TORCH_LIBRARIES})
+
+target_include_directories(tgi_common PRIVATE
+        $<BUILD_INTERFACE:${CMAKE_CURRENT_SOURCE_DIR}/include/common>
+        $<INSTALL_INTERFACE:include>
+)
+
+target_include_directories(tgi_common PUBLIC
+        $<BUILD_INTERFACE:${CMAKE_CURRENT_SOURCE_DIR}/include>
+        $<INSTALL_INTERFACE:include/>
+)


@@ -0,0 +1,35 @@
+//
+// Created by morgan on 27/09/24.
+//
+
+#ifndef TGI_DEVICE_HPP
+#define TGI_DEVICE_HPP
+
+#include <cstdint>
+#include <nvml.h>
+#include <optional>
+
+namespace huggingface::tgi {
+    using device_index_t = uint8_t;
+
+    /**
+     * Attempt to retrieve the referred GPU by its index on the system
+     * @param device Device index
+     * @return The NVML handle for the device, or std::nullopt if it cannot be retrieved
+     */
+    std::optional<nvmlDevice_t> GetDeviceByIndex(device_index_t device);
+
+    /**
+     * Check whether all the GPUs have direct remote memory access to each other
+     */
+    bool IsP2PComplete();
+
+    /**
+     * Check if GPU "from" has remote memory access to GPU "to"
+     * @param from Originating GPU memory
+     * @param to Destination GPU memory
+     * @return True if p2p is available, false otherwise
+     */
+    bool IsP2PAvailable(device_index_t from, device_index_t to);
+}
+
+#endif // TGI_DEVICE_HPP
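
A hypothetical consumer of this header (a sketch, not part of the commit; assumes linking against tgi_common with NVML already initialized) would combine the handle lookup with the P2P predicates:

    #include <common/device.hpp>
    #include <spdlog/spdlog.h>

    int main() {
        using namespace huggingface::tgi;

        // The optional encodes lookup failure (e.g. index out of range).
        if (const auto device = GetDeviceByIndex(0); !device.has_value())
            SPDLOG_ERROR("GPU 0 not found");

        // Pairwise and global P2P topology queries.
        SPDLOG_INFO("0 <-> 1 P2P available: {}", IsP2PAvailable(0, 1));
        SPDLOG_INFO("all-to-all P2P complete: {}", IsP2PComplete());
        return 0;
    }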


@@ -0,0 +1,20 @@
+//
+// Created by morgan on 27/09/24.
+//
+
+#include "device.hpp"
+
+std::optional<nvmlDevice_t> huggingface::tgi::GetDeviceByIndex(device_index_t device)
+{
+    return std::nullopt;
+}
+
+bool huggingface::tgi::IsP2PComplete()
+{
+    return false;
+}
+
+bool huggingface::tgi::IsP2PAvailable(device_index_t from, device_index_t to)
+{
+    return false;
+}
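
These bodies are stubs: every query currently reports failure. Assuming the same NVML pattern already used in tgiccl below, a real GetDeviceByIndex might look like this (a sketch, not what the commit ships; NVML is presumed initialized via nvmlInit_v2):

    #include "device.hpp"

    std::optional<nvmlDevice_t> huggingface::tgi::GetDeviceByIndex(device_index_t device)
    {
        // Resolve the NVML handle for this index; surface lookup failure
        // through the optional rather than an NVML error code.
        nvmlDevice_t handle;
        if (nvmlDeviceGetHandleByIndex_v2(device, &handle) != NVML_SUCCESS)
            return std::nullopt;
        return handle;
    }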


@@ -6,7 +6,7 @@ set(TGICCL_SOURCES TgiCclBackend.cpp)
 find_package(CUDAToolkit REQUIRED)
 
 add_library(tgiccl SHARED ${TGICCL_HEADERS} ${TGICCL_SOURCES})
-target_link_libraries(tgiccl PUBLIC spdlog::spdlog CUDA::nvml ${TORCH_LIBRARIES})
+target_link_libraries(tgiccl PUBLIC tgi_common fmt::fmt spdlog::spdlog CUDA::nvml ${TORCH_LIBRARIES})
 
 add_executable(test_tgiccl test_tgiccl.cpp)
-target_link_libraries(test_tgiccl tgiccl spdlog::spdlog)
+target_link_libraries(test_tgiccl PUBLIC tgiccl fmt::fmt spdlog::spdlog ${TORCH_LIBRARIES})


@@ -7,5 +7,5 @@
 int main() {
     auto a = huggingface::tgi::tgiccl::IsNvLinkAvailable(0, 1);
     auto b = huggingface::tgi::tgiccl::IsNvLinkAvailable(0, 2);
-    auto c = huggingface::tgi::tgiccl::IsNvLinkAvailable(0, 3);
+    auto d = huggingface::tgi::tgiccl::IsNvLinkAvailable(0, 3);
 }


@@ -13,8 +13,8 @@
 constexpr auto CLL_BACKEND_NAME = "tgiccl";
 
-namespace huggingface::tgi::tgiccl {
+namespace huggingface::tgi::tgiccl
+{
     static std::once_flag NVML_INIT_FLAG;
 
 #define ENSURE_NVML_INIT() std::call_once(NVML_INIT_FLAG, nvmlInit_v2);
@@ -46,7 +46,10 @@ namespace huggingface::tgi::tgiccl {
         // Query link between both
         nvmlGpuP2PStatus_t status;
         if(nvmlDeviceGetP2PStatus(devFrom.value(), devTo.value(), NVML_P2P_CAPS_INDEX_NVLINK, &status) != NVML_SUCCESS)
+        {
             SPDLOG_ERROR(FMT_STRING("Failed to retrieve the p2p status for device {:d} <-> {:d}"), from, to);
+            return false;
+        }
 
         return status == NVML_P2P_STATUS_OK;
     }