From 5f9120da9cc86a62bb6bb145629c53456f2e2168 Mon Sep 17 00:00:00 2001 From: Morgan Funtowicz Date: Fri, 27 Sep 2024 18:53:56 +0200 Subject: [PATCH] feat(tgi_common): add initial set of common functions for reuse --- csrc/CMakeLists.txt | 10 +- csrc/cmake/fmt.cmake | 6 + csrc/cmake/spdlog.cmake | 19 +++- csrc/cmake/torch.cmake | 155 ++------------------------ csrc/common/CMakeLists.txt | 16 +++ csrc/common/include/common/device.hpp | 35 ++++++ csrc/common/lib/device.cpp | 20 ++++ csrc/tgiccl/CMakeLists.txt | 4 +- csrc/tgiccl/test_tgiccl.cpp | 2 +- csrc/tgiccl/tgiccl.hpp | 7 +- 10 files changed, 112 insertions(+), 162 deletions(-) create mode 100644 csrc/cmake/fmt.cmake create mode 100644 csrc/common/CMakeLists.txt create mode 100644 csrc/common/include/common/device.hpp create mode 100644 csrc/common/lib/device.cpp diff --git a/csrc/CMakeLists.txt b/csrc/CMakeLists.txt index 409fc297..2e0f8588 100644 --- a/csrc/CMakeLists.txt +++ b/csrc/CMakeLists.txt @@ -16,16 +16,16 @@ option(TGI_BUILD_CCL "Flag to enable/disable build of tgiccl collective library" # Add some modules include(FetchContent) +include(cmake/fmt.cmake) include(cmake/spdlog.cmake) # Let's find LibTorch include(cmake/torch.cmake) -find_package(Python3 COMPONENTS Interpreter) -ProbeForPyTorchInstall() -ConfigurePyTorch() - find_package(Torch REQUIRED) -set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} ${TORCH_CXX_FLAGS}") +find_package(Python3 COMPONENTS Interpreter) + +# TGI common +add_subdirectory(common) # Include submodules if (${TGI_BUILD_CCL}) diff --git a/csrc/cmake/fmt.cmake b/csrc/cmake/fmt.cmake new file mode 100644 index 00000000..a2105596 --- /dev/null +++ b/csrc/cmake/fmt.cmake @@ -0,0 +1,6 @@ +FetchContent_Declare( + fmt + GIT_REPOSITORY https://github.com/fmtlib/fmt + GIT_TAG 11.0.1 +) +FetchContent_MakeAvailable(fmt) \ No newline at end of file diff --git a/csrc/cmake/spdlog.cmake b/csrc/cmake/spdlog.cmake index d4e0c491..1410bd46 100644 --- a/csrc/cmake/spdlog.cmake +++ b/csrc/cmake/spdlog.cmake @@ -1,6 +1,17 @@ -fetchcontent_declare( - spdlog - URL https://github.com/gabime/spdlog/archive/refs/tags/v1.14.1.tar.gz -) +set(SPDLOG_USE_FMT ON) +set(SPDLOG_BUILD_SHARED OFF) +set(SPDLOG_FMT_EXTERNAL ON) +# Define the level at which SPDLOG_ compilation level is defined +if (${CMAKE_BUILD_TYPE} STREQUAL "Debug") + add_compile_definitions(SPDLOG_ACTIVE_LEVEL SPDLOG_LEVEL_DEBUG) +else () + add_compile_definitions(SPDLOG_ACTIVE_LEVEL SPDLOG_LEVEL_INFO) +endif () + +fetchcontent_declare( + spdlog + GIT_REPOSITORY https://github.com/gabime/spdlog.git + GIT_TAG v1.14.1 +) fetchcontent_makeavailable(spdlog) \ No newline at end of file diff --git a/csrc/cmake/torch.cmake b/csrc/cmake/torch.cmake index aa069d83..a8c2af15 100644 --- a/csrc/cmake/torch.cmake +++ b/csrc/cmake/torch.cmake @@ -1,148 +1,7 @@ -# ProbeForPyTorchInstall -# Attempts to find a Torch installation and set the Torch_ROOT variable -# based on introspecting the python environment. This allows a subsequent -# call to find_package(Torch) to work. -function(ProbeForPyTorchInstall) - if (Torch_ROOT) - message(STATUS "Using cached Torch root = ${Torch_ROOT}") - else () - message(STATUS "Checking for PyTorch using ${Python3_EXECUTABLE} ...") - execute_process( - COMMAND ${Python3_EXECUTABLE} - -c "import os;import torch;print(torch.utils.cmake_prefix_path, end='')" - WORKING_DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR} - RESULT_VARIABLE PYTORCH_STATUS - OUTPUT_VARIABLE PYTORCH_PACKAGE_DIR) - if (NOT PYTORCH_STATUS EQUAL "0") - message(STATUS "Unable to 'import torch' with ${Python3_EXECUTABLE} (fallback to explicit config)") - return() - endif () - message(STATUS "Found PyTorch installation at ${PYTORCH_PACKAGE_DIR}") - - set(Torch_ROOT "${PYTORCH_PACKAGE_DIR}" CACHE STRING - "Torch configure directory" FORCE) - endif () -endfunction() - - -# ConfigurePyTorch -# Extensions compiled against PyTorch must be ABI-compatible with PyTorch. -# On Linux, there are two components to this: -# 1) Dual ABI settings for libstdc++ -# See https://gcc.gnu.org/onlinedocs/libstdc++/manual/using_dual_abi.html -# For this, PyTorch helpfully provides a function to check which ABI it was -# compiled against. -# 2) C++ ABI compatibility version -# See https://gcc.gnu.org/onlinedocs/libstdc++/manual/abi.html (Sec 5/6) -# The second is a bit more complicated. GCC has official compatibility strings -# which can be specified by -fabi-version. Clang has no notion of ABI -# versioning (https://lists.llvm.org/pipermail/cfe-dev/2015-June/043735.html). -# Separately, pybind11 keeps an internal variable which records its ABI info -# (PYBIND11_INTERNALS_ID in include/pybind11/detail/internals.h). Differences -# in this variable between torch-mlir and PyTorch will cause type errors. -# Thus, our best option is to: -# a) Identify which ABI version PyTorch was compiled with -# b) Tell gcc to use that version -# or -# c) Tell clang to pretend to use it and hope it's ABI-compatible, and -# tell pybind to pretend we're gcc. -# -# MacOS does not have a dual ABI problem. -# FIXME: I don't know if MacOS needs ABI compatibility version flags. -# -# In the future, we may want to switch away from custom building these -# extensions and instead rely on the Torch machinery directly (definitely want -# to do that for official builds). -function(ConfigurePyTorch) - message(STATUS "Checking PyTorch ABI settings...") - if (${CMAKE_SYSTEM_NAME} STREQUAL "Linux") - # Check dual ABI setting first - execute_process( - COMMAND ${Python3_EXECUTABLE} - -c "import torch; import sys; sys.stdout.write('1' if torch.compiled_with_cxx11_abi() else '0')" - RESULT_VARIABLE _result - WORKING_DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR} - OUTPUT_VARIABLE _use_cxx11_abi) - if (_result) - message(FATAL_ERROR "Failed to determine C++ Dual ABI: ${Python3_EXECUTABLE} -> ${_result}") - endif () - message(STATUS "PyTorch C++ Dual ABI setting: \"${_use_cxx11_abi}\"") - - # Check ABI compatibility version - execute_process( - COMMAND ${Python3_EXECUTABLE} - -c "import torch; import sys; abi=torch._C._PYBIND11_BUILD_ABI; abi.startswith('_cxxabi10') or sys.exit(1); sys.stdout.write(str(abi[-2:]))" - RESULT_VARIABLE _result - WORKING_DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR} - OUTPUT_VARIABLE _cxx_abi_version) - if (_result) - message(FATAL_ERROR "Failed to determine C++ ABI version") - endif () - message(STATUS "PyTorch C++ ABI version: \"${_cxx_abi_version}\"") - - # Specialize compile flags for compiler - if (${CMAKE_CXX_COMPILER_ID} STREQUAL "GNU") - set(TORCH_CXXFLAGS "-D_GLIBCXX_USE_CXX11_ABI=${_use_cxx11_abi} -fabi-version=${_cxx_abi_version}") - elseif (${CMAKE_CXX_COMPILER_ID} STREQUAL "Clang") - set(TORCH_CXXFLAGS "-D_GLIBCXX_USE_CXX11_ABI=${_use_cxx11_abi} -U__GXX_ABI_VERSION -D__GXX_ABI_VERSION=10${_cxx_abi_version} '-DPYBIND11_COMPILER_TYPE=\"_gcc\"'") - else () - message(WARNING "Unrecognized compiler. Cannot determine ABI flags.") - return() - endif () - set(TORCH_CXXFLAGS "${TORCH_CXXFLAGS}" PARENT_SCOPE) - endif () -endfunction() - -function(ConfigureLibTorch) - message(STATUS "Checking LibTorch ABI settings...") - if (${CMAKE_SYSTEM_NAME} STREQUAL "Linux") - message(STATUS "libtorch_python is ${TORCH_INSTALL_PREFIX}/lib/libtorch_python.so") - # Check dual ABI setting first - execute_process( - COMMAND bash "-c" "cat ${TORCH_INSTALL_PREFIX}/share/cmake/Torch/TorchConfig.cmake | egrep -o '_GLIBCXX_USE_CXX11_ABI=[0-1]' | egrep -o '.$'" - RESULT_VARIABLE _result - OUTPUT_VARIABLE _use_cxx11_abi - OUTPUT_STRIP_TRAILING_WHITESPACE) - if (_result) - message(FATAL_ERROR "Failed to determine LibTorch C++ Dual ABI") - endif () - message(STATUS "LibTorch C++ Dual ABI setting: \"${_use_cxx11_abi}\"") - - # Check ABI compatibility version - execute_process( - COMMAND bash "-c" "strings ${TORCH_INSTALL_PREFIX}/lib/libtorch_python.so | egrep '^_cxxabi[0-9]{4}' | egrep -o '..$'" - RESULT_VARIABLE _result - OUTPUT_VARIABLE _cxx_abi_version - OUTPUT_STRIP_TRAILING_WHITESPACE) - if (_result) - message(FATAL_ERROR "Failed to determine LibTorch C++ ABI version") - endif () - message(STATUS "LibTorch C++ ABI version: \"${_cxx_abi_version}\"") - - # Specialize compile flags for compiler - if (${CMAKE_CXX_COMPILER_ID} STREQUAL "GNU") - set(TORCH_CXXFLAGS "-D_GLIBCXX_USE_CXX11_ABI=${_use_cxx11_abi} -fabi-version=${_cxx_abi_version}") - elseif (${CMAKE_CXX_COMPILER_ID} STREQUAL "Clang") - set(TORCH_CXXFLAGS "-D_GLIBCXX_USE_CXX11_ABI=${_use_cxx11_abi} -U__GXX_ABI_VERSION -D__GXX_ABI_VERSION=10${_cxx_abi_version} '-DPYBIND11_COMPILER_TYPE=\"_gcc\"'") - else () - message(WARNING "Unrecognized compiler. Cannot determine ABI flags.") - return() - endif () - set(TORCH_CXXFLAGS "${TORCH_CXXFLAGS}" PARENT_SCOPE) - endif () -endfunction() - -function(torch_mlir_python_target_compile_options target) - target_compile_options(${target} PRIVATE - $<$,$,$>: - # Enable RTTI and exceptions. - -frtti -fexceptions - # Noisy pybind warnings - -Wno-unused-value - -Wno-covered-switch-default - > - $<$: - # Enable RTTI and exceptions. - /EHsc /GR> - ) -endfunction() \ No newline at end of file +fetchcontent_declare( + Torch + URL https://download.pytorch.org/libtorch/cu124/libtorch-cxx11-abi-shared-with-deps-2.4.1%2Bcu124.zip +# OVERRIDE_FIND_PACKAGE +) +FetchContent_MakeAvailable(Torch) +list(APPEND CMAKE_PREFIX_PATH ${Torch_SOURCE_DIR}) diff --git a/csrc/common/CMakeLists.txt b/csrc/common/CMakeLists.txt new file mode 100644 index 00000000..831c0821 --- /dev/null +++ b/csrc/common/CMakeLists.txt @@ -0,0 +1,16 @@ + +set(TGI_COMMON_HEADERS include/common/device.hpp) +set(TGI_COMMON_SOURCES lib/device.cpp) + +add_library(tgi_common SHARED ${TGI_COMMON_HEADERS} ${TGI_COMMON_SOURCES}) +target_link_libraries(tgi_common fmt::fmt spdlog::spdlog ${TORCH_LIBRARIES}) + +target_include_directories(tgi_common PRIVATE + $ + $ +) + +target_include_directories(tgi_common PUBLIC + $ + $ +) \ No newline at end of file diff --git a/csrc/common/include/common/device.hpp b/csrc/common/include/common/device.hpp new file mode 100644 index 00000000..183e78f8 --- /dev/null +++ b/csrc/common/include/common/device.hpp @@ -0,0 +1,35 @@ +// +// Created by morgan on 27/09/24. +// + +#ifndef TGI_DEVICE_HPP +#define TGI_DEVICE_HPP +#include +#include +#include + +namespace huggingface::tgi { + using device_index_t = uint8_t; + + /** + * Attempt to retrieve the referred GPU by its index on the system + * @param device Device index + * @return + */ + std::optional GetDeviceByIndex(device_index_t device); + + /** + * Check whether all the GPUs have direct remote memory access to each other + */ + bool IsP2PComplete(); + + /** + * Check if GPU "from" has remote memory access to GPU "to" + * @param from Originating GPU memory + * @param to Destination GPU memory + * @return True if p2p is available, false otherwise + */ + bool IsP2PAvailable(device_index_t from, device_index_t to); +} + +#endif // TGI_DEVICE_HPP diff --git a/csrc/common/lib/device.cpp b/csrc/common/lib/device.cpp new file mode 100644 index 00000000..2dcefed8 --- /dev/null +++ b/csrc/common/lib/device.cpp @@ -0,0 +1,20 @@ +// +// Created by morgan on 27/09/24. +// + +#include "device.hpp" + +std::optional huggingface::tgi::GetDeviceByIndex(device_index_t device) +{ + return std::nullopt; +} + +bool huggingface::tgi::IsP2PComplete() +{ + return false; +} + +bool huggingface::tgi::IsP2PAvailable(device_index_t from, device_index_t to) +{ + return false; +} diff --git a/csrc/tgiccl/CMakeLists.txt b/csrc/tgiccl/CMakeLists.txt index cda79d49..1437371a 100644 --- a/csrc/tgiccl/CMakeLists.txt +++ b/csrc/tgiccl/CMakeLists.txt @@ -6,7 +6,7 @@ set(TGICCL_SOURCES TgiCclBackend.cpp) find_package(CUDAToolkit REQUIRED) add_library(tgiccl SHARED ${TGICCL_HEADERS} ${TGICCL_SOURCES}) -target_link_libraries(tgiccl PUBLIC spdlog::spdlog CUDA::nvml ${TORCH_LIBRARIES}) +target_link_libraries(tgiccl PUBLIC tgi_common fmt::fmt spdlog::spdlog CUDA::nvml ${TORCH_LIBRARIES}) add_executable(test_tgiccl test_tgiccl.cpp) -target_link_libraries(test_tgiccl tgiccl spdlog::spdlog) \ No newline at end of file +target_link_libraries(test_tgiccl PUBLIC tgiccl fmt::fmt spdlog::spdlog ${TORCH_LIBRARIES}) \ No newline at end of file diff --git a/csrc/tgiccl/test_tgiccl.cpp b/csrc/tgiccl/test_tgiccl.cpp index 68eee85e..f7a25f6b 100644 --- a/csrc/tgiccl/test_tgiccl.cpp +++ b/csrc/tgiccl/test_tgiccl.cpp @@ -7,5 +7,5 @@ int main() { auto a = huggingface::tgi::tgiccl::IsNvLinkAvailable(0, 1); auto b = huggingface::tgi::tgiccl::IsNvLinkAvailable(0, 2); - auto c = huggingface::tgi::tgiccl::IsNvLinkAvailable(0, 3); + auto d = huggingface::tgi::tgiccl::IsNvLinkAvailable(0, 3); } \ No newline at end of file diff --git a/csrc/tgiccl/tgiccl.hpp b/csrc/tgiccl/tgiccl.hpp index 1bde03b0..782ee116 100644 --- a/csrc/tgiccl/tgiccl.hpp +++ b/csrc/tgiccl/tgiccl.hpp @@ -13,8 +13,8 @@ constexpr auto CLL_BACKEND_NAME = "tgiccl"; -namespace huggingface::tgi::tgiccl { - +namespace huggingface::tgi::tgiccl +{ static std::once_flag NVML_INIT_FLAG; #define ENSURE_NVML_INIT() std::call_once(NVML_INIT_FLAG, nvmlInit_v2); @@ -46,7 +46,10 @@ namespace huggingface::tgi::tgiccl { // Query link between both nvmlGpuP2PStatus_t status; if(nvmlDeviceGetP2PStatus(devFrom.value(), devTo.value(), NVML_P2P_CAPS_INDEX_NVLINK, &status) != NVML_SUCCESS) + { SPDLOG_ERROR(FMT_STRING("Failed to retrieve the p2p status for device {:d} <-> {:d}"), from, to); + return false; + } return status == NVML_P2P_STATUS_OK; }