feat(tgi_common): add initial set of common functions for reuse

Morgan Funtowicz 2024-09-27 18:53:56 +02:00
parent 31a6065fac
commit 5f9120da9c
10 changed files with 112 additions and 162 deletions


@@ -16,16 +16,16 @@ option(TGI_BUILD_CCL "Flag to enable/disable build of tgiccl collective library"
 # Add some modules
 include(FetchContent)
+include(cmake/fmt.cmake)
 include(cmake/spdlog.cmake)
 
 # Let's find LibTorch
 include(cmake/torch.cmake)
-find_package(Python3 COMPONENTS Interpreter)
-ProbeForPyTorchInstall()
-ConfigurePyTorch()
 find_package(Torch REQUIRED)
-set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} ${TORCH_CXX_FLAGS}")
+find_package(Python3 COMPONENTS Interpreter)
+
+# TGI common
+add_subdirectory(common)
 
 # Include submodules
 if (${TGI_BUILD_CCL})

csrc/cmake/fmt.cmake (new file)

@@ -0,0 +1,6 @@
+FetchContent_Declare(
+        fmt
+        GIT_REPOSITORY https://github.com/fmtlib/fmt
+        GIT_TAG 11.0.1
+)
+FetchContent_MakeAvailable(fmt)
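
fmt is pinned to the 11.0.1 tag and exposed as the fmt::fmt target, which both tgi_common and spdlog (via SPDLOG_FMT_EXTERNAL below) link against. A minimal sanity check for the fetched library could be (a sketch, not part of the commit):

    #include <fmt/format.h>

    int main() {
        // fmt 11 validates format strings at compile time where possible.
        fmt::print("{} + {} = {}\n", 1, 2, 1 + 2);
        return 0;
    }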


@@ -1,6 +1,17 @@
+set(SPDLOG_USE_FMT ON)
+set(SPDLOG_BUILD_SHARED OFF)
+set(SPDLOG_FMT_EXTERNAL ON)
+
+# Define the level at which SPDLOG_ compilation level is defined
+if (${CMAKE_BUILD_TYPE} STREQUAL "Debug")
+    add_compile_definitions(SPDLOG_ACTIVE_LEVEL SPDLOG_LEVEL_DEBUG)
+else ()
+    add_compile_definitions(SPDLOG_ACTIVE_LEVEL SPDLOG_LEVEL_INFO)
+endif ()
+
 fetchcontent_declare(
         spdlog
-        URL https://github.com/gabime/spdlog/archive/refs/tags/v1.14.1.tar.gz
+        GIT_REPOSITORY https://github.com/gabime/spdlog.git
+        GIT_TAG v1.14.1
 )
 fetchcontent_makeavailable(spdlog)
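
SPDLOG_ACTIVE_LEVEL is a compile-time filter: any SPDLOG_* macro below the active level compiles to nothing, so non-Debug builds configured as above drop SPDLOG_DEBUG call sites entirely. (Note that add_compile_definitions normally takes a single NAME=value argument, i.e. SPDLOG_ACTIVE_LEVEL=SPDLOG_LEVEL_DEBUG; two space-separated arguments define two separate empty macros.) A small illustration (a sketch, not part of the commit):

    #include <spdlog/spdlog.h>

    int main() {
        // Compiled out entirely when SPDLOG_ACTIVE_LEVEL is SPDLOG_LEVEL_INFO,
        // i.e. any non-Debug build under the configuration above.
        SPDLOG_DEBUG("probing devices...");
        // Present at either level, still subject to the runtime log level.
        SPDLOG_INFO("tgi_common initialized");
        return 0;
    }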


@@ -1,148 +1,7 @@
-# ProbeForPyTorchInstall
-# Attempts to find a Torch installation and set the Torch_ROOT variable
-# based on introspecting the python environment. This allows a subsequent
-# call to find_package(Torch) to work.
-function(ProbeForPyTorchInstall)
-    if (Torch_ROOT)
-        message(STATUS "Using cached Torch root = ${Torch_ROOT}")
-    else ()
-        message(STATUS "Checking for PyTorch using ${Python3_EXECUTABLE} ...")
-        execute_process(
-                COMMAND ${Python3_EXECUTABLE}
-                -c "import os;import torch;print(torch.utils.cmake_prefix_path, end='')"
-                WORKING_DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR}
-                RESULT_VARIABLE PYTORCH_STATUS
-                OUTPUT_VARIABLE PYTORCH_PACKAGE_DIR)
-        if (NOT PYTORCH_STATUS EQUAL "0")
-            message(STATUS "Unable to 'import torch' with ${Python3_EXECUTABLE} (fallback to explicit config)")
-            return()
-        endif ()
-        message(STATUS "Found PyTorch installation at ${PYTORCH_PACKAGE_DIR}")
-        set(Torch_ROOT "${PYTORCH_PACKAGE_DIR}" CACHE STRING
-                "Torch configure directory" FORCE)
-    endif ()
-endfunction()
-
-# ConfigurePyTorch
-# Extensions compiled against PyTorch must be ABI-compatible with PyTorch.
-# On Linux, there are two components to this:
-#   1) Dual ABI settings for libstdc++
-#      See https://gcc.gnu.org/onlinedocs/libstdc++/manual/using_dual_abi.html
-#      For this, PyTorch helpfully provides a function to check which ABI it was
-#      compiled against.
-#   2) C++ ABI compatibility version
-#      See https://gcc.gnu.org/onlinedocs/libstdc++/manual/abi.html (Sec 5/6)
-# The second is a bit more complicated. GCC has official compatibility strings
-# which can be specified by -fabi-version. Clang has no notion of ABI
-# versioning (https://lists.llvm.org/pipermail/cfe-dev/2015-June/043735.html).
-# Separately, pybind11 keeps an internal variable which records its ABI info
-# (PYBIND11_INTERNALS_ID in include/pybind11/detail/internals.h). Differences
-# in this variable between torch-mlir and PyTorch will cause type errors.
-# Thus, our best option is to:
-#   a) Identify which ABI version PyTorch was compiled with
-#   b) Tell gcc to use that version
-#      or
-#   c) Tell clang to pretend to use it and hope it's ABI-compatible, and
-#      tell pybind to pretend we're gcc.
-#
-# MacOS does not have a dual ABI problem.
-# FIXME: I don't know if MacOS needs ABI compatibility version flags.
-#
-# In the future, we may want to switch away from custom building these
-# extensions and instead rely on the Torch machinery directly (definitely want
-# to do that for official builds).
-function(ConfigurePyTorch)
-    message(STATUS "Checking PyTorch ABI settings...")
-    if (${CMAKE_SYSTEM_NAME} STREQUAL "Linux")
-        # Check dual ABI setting first
-        execute_process(
-                COMMAND ${Python3_EXECUTABLE}
-                -c "import torch; import sys; sys.stdout.write('1' if torch.compiled_with_cxx11_abi() else '0')"
-                RESULT_VARIABLE _result
-                WORKING_DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR}
-                OUTPUT_VARIABLE _use_cxx11_abi)
-        if (_result)
-            message(FATAL_ERROR "Failed to determine C++ Dual ABI: ${Python3_EXECUTABLE} -> ${_result}")
-        endif ()
-        message(STATUS "PyTorch C++ Dual ABI setting: \"${_use_cxx11_abi}\"")
-
-        # Check ABI compatibility version
-        execute_process(
-                COMMAND ${Python3_EXECUTABLE}
-                -c "import torch; import sys; abi=torch._C._PYBIND11_BUILD_ABI; abi.startswith('_cxxabi10') or sys.exit(1); sys.stdout.write(str(abi[-2:]))"
-                RESULT_VARIABLE _result
-                WORKING_DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR}
-                OUTPUT_VARIABLE _cxx_abi_version)
-        if (_result)
-            message(FATAL_ERROR "Failed to determine C++ ABI version")
-        endif ()
-        message(STATUS "PyTorch C++ ABI version: \"${_cxx_abi_version}\"")
-
-        # Specialize compile flags for compiler
-        if (${CMAKE_CXX_COMPILER_ID} STREQUAL "GNU")
-            set(TORCH_CXXFLAGS "-D_GLIBCXX_USE_CXX11_ABI=${_use_cxx11_abi} -fabi-version=${_cxx_abi_version}")
-        elseif (${CMAKE_CXX_COMPILER_ID} STREQUAL "Clang")
-            set(TORCH_CXXFLAGS "-D_GLIBCXX_USE_CXX11_ABI=${_use_cxx11_abi} -U__GXX_ABI_VERSION -D__GXX_ABI_VERSION=10${_cxx_abi_version} '-DPYBIND11_COMPILER_TYPE=\"_gcc\"'")
-        else ()
-            message(WARNING "Unrecognized compiler. Cannot determine ABI flags.")
-            return()
-        endif ()
-        set(TORCH_CXXFLAGS "${TORCH_CXXFLAGS}" PARENT_SCOPE)
-    endif ()
-endfunction()
-
-function(ConfigureLibTorch)
-    message(STATUS "Checking LibTorch ABI settings...")
-    if (${CMAKE_SYSTEM_NAME} STREQUAL "Linux")
-        message(STATUS "libtorch_python is ${TORCH_INSTALL_PREFIX}/lib/libtorch_python.so")
-
-        # Check dual ABI setting first
-        execute_process(
-                COMMAND bash "-c" "cat ${TORCH_INSTALL_PREFIX}/share/cmake/Torch/TorchConfig.cmake | egrep -o '_GLIBCXX_USE_CXX11_ABI=[0-1]' | egrep -o '.$'"
-                RESULT_VARIABLE _result
-                OUTPUT_VARIABLE _use_cxx11_abi
-                OUTPUT_STRIP_TRAILING_WHITESPACE)
-        if (_result)
-            message(FATAL_ERROR "Failed to determine LibTorch C++ Dual ABI")
-        endif ()
-        message(STATUS "LibTorch C++ Dual ABI setting: \"${_use_cxx11_abi}\"")
-
-        # Check ABI compatibility version
-        execute_process(
-                COMMAND bash "-c" "strings ${TORCH_INSTALL_PREFIX}/lib/libtorch_python.so | egrep '^_cxxabi[0-9]{4}' | egrep -o '..$'"
-                RESULT_VARIABLE _result
-                OUTPUT_VARIABLE _cxx_abi_version
-                OUTPUT_STRIP_TRAILING_WHITESPACE)
-        if (_result)
-            message(FATAL_ERROR "Failed to determine LibTorch C++ ABI version")
-        endif ()
-        message(STATUS "LibTorch C++ ABI version: \"${_cxx_abi_version}\"")
-
-        # Specialize compile flags for compiler
-        if (${CMAKE_CXX_COMPILER_ID} STREQUAL "GNU")
-            set(TORCH_CXXFLAGS "-D_GLIBCXX_USE_CXX11_ABI=${_use_cxx11_abi} -fabi-version=${_cxx_abi_version}")
-        elseif (${CMAKE_CXX_COMPILER_ID} STREQUAL "Clang")
-            set(TORCH_CXXFLAGS "-D_GLIBCXX_USE_CXX11_ABI=${_use_cxx11_abi} -U__GXX_ABI_VERSION -D__GXX_ABI_VERSION=10${_cxx_abi_version} '-DPYBIND11_COMPILER_TYPE=\"_gcc\"'")
-        else ()
-            message(WARNING "Unrecognized compiler. Cannot determine ABI flags.")
-            return()
-        endif ()
-        set(TORCH_CXXFLAGS "${TORCH_CXXFLAGS}" PARENT_SCOPE)
-    endif ()
-endfunction()
-
-function(torch_mlir_python_target_compile_options target)
-    target_compile_options(${target} PRIVATE
-            $<$<OR:$<CXX_COMPILER_ID:Clang>,$<CXX_COMPILER_ID:AppleClang>,$<CXX_COMPILER_ID:GNU>>:
-            # Enable RTTI and exceptions.
-            -frtti -fexceptions
-            # Noisy pybind warnings
-            -Wno-unused-value
-            -Wno-covered-switch-default
-            >
-            $<$<CXX_COMPILER_ID:MSVC>:
-            # Enable RTTI and exceptions.
-            /EHsc /GR>
-    )
-endfunction()
+fetchcontent_declare(
+        Torch
+        URL https://download.pytorch.org/libtorch/cu124/libtorch-cxx11-abi-shared-with-deps-2.4.1%2Bcu124.zip
+#        OVERRIDE_FIND_PACKAGE
+)
+FetchContent_MakeAvailable(Torch)
+list(APPEND CMAKE_PREFIX_PATH ${Torch_SOURCE_DIR})
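
This drops the torch-mlir-derived Python introspection entirely: instead of locating Torch through the active Python environment and matching its ABI flags, the build now downloads the CUDA 12.4 libtorch archive directly and lets find_package(Torch REQUIRED) resolve it through CMAKE_PREFIX_PATH. A minimal smoke test for the fetched toolchain could look like this (a sketch, not part of the commit; assumes the target links ${TORCH_LIBRARIES}):

    #include <iostream>
    #include <torch/torch.h>

    int main() {
        // Allocate a tensor through libtorch to confirm the link and ABI work.
        const auto t = torch::rand({2, 2});
        std::cout << t << '\n';
        std::cout << "CUDA available: " << std::boolalpha
                  << torch::cuda::is_available() << '\n';
        return 0;
    }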


@@ -0,0 +1,16 @@
+set(TGI_COMMON_HEADERS include/common/device.hpp)
+set(TGI_COMMON_SOURCES lib/device.cpp)
+
+add_library(tgi_common SHARED ${TGI_COMMON_HEADERS} ${TGI_COMMON_SOURCES})
+target_link_libraries(tgi_common fmt::fmt spdlog::spdlog ${TORCH_LIBRARIES})
+
+target_include_directories(tgi_common PRIVATE
+        $<BUILD_INTERFACE:${CMAKE_CURRENT_SOURCE_DIR}/include/common>
+        $<INSTALL_INTERFACE:include>
+)
+
+target_include_directories(tgi_common PUBLIC
+        $<BUILD_INTERFACE:${CMAKE_CURRENT_SOURCE_DIR}/include>
+        $<INSTALL_INTERFACE:include/>
+)


@@ -0,0 +1,35 @@
+//
+// Created by morgan on 27/09/24.
+//
+
+#ifndef TGI_DEVICE_HPP
+#define TGI_DEVICE_HPP
+
+#include <cstdint>
+#include <nvml.h>
+#include <optional>
+
+namespace huggingface::tgi {
+    using device_index_t = uint8_t;
+
+    /**
+     * Attempt to retrieve the referred GPU by its index on the system
+     * @param device Device index
+     * @return The NVML handle for the device, or std::nullopt if it cannot be retrieved
+     */
+    std::optional<nvmlDevice_t> GetDeviceByIndex(device_index_t device);
+
+    /**
+     * Check whether all the GPUs have direct remote memory access to each other
+     */
+    bool IsP2PComplete();
+
+    /**
+     * Check if GPU "from" has remote memory access to GPU "to"
+     * @param from Originating GPU memory
+     * @param to Destination GPU memory
+     * @return True if p2p is available, false otherwise
+     */
+    bool IsP2PAvailable(device_index_t from, device_index_t to);
+}
+
+#endif // TGI_DEVICE_HPP
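
A hypothetical consumer of this header (a sketch, not part of the commit; assumes linking against tgi_common with NVML already initialized) would combine the handle lookup with the P2P predicates:

    #include <common/device.hpp>
    #include <spdlog/spdlog.h>

    int main() {
        using namespace huggingface::tgi;

        // The optional encodes lookup failure (e.g. index out of range).
        if (const auto device = GetDeviceByIndex(0); !device.has_value())
            SPDLOG_ERROR("GPU 0 not found");

        // Pairwise and global P2P topology queries.
        SPDLOG_INFO("0 <-> 1 P2P available: {}", IsP2PAvailable(0, 1));
        SPDLOG_INFO("all-to-all P2P complete: {}", IsP2PComplete());
        return 0;
    }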


@@ -0,0 +1,20 @@
+//
+// Created by morgan on 27/09/24.
+//
+
+#include "device.hpp"
+
+std::optional<nvmlDevice_t> huggingface::tgi::GetDeviceByIndex(device_index_t device)
+{
+    return std::nullopt;
+}
+
+bool huggingface::tgi::IsP2PComplete()
+{
+    return false;
+}
+
+bool huggingface::tgi::IsP2PAvailable(device_index_t from, device_index_t to)
+{
+    return false;
+}
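
These bodies are stubs: every query currently reports failure. Assuming the same NVML pattern already used in tgiccl below, a real GetDeviceByIndex might look like this (a sketch, not what the commit ships; NVML is presumed initialized via nvmlInit_v2):

    #include "device.hpp"

    std::optional<nvmlDevice_t> huggingface::tgi::GetDeviceByIndex(device_index_t device)
    {
        // Resolve the NVML handle for this index; surface lookup failure
        // through the optional rather than an NVML error code.
        nvmlDevice_t handle;
        if (nvmlDeviceGetHandleByIndex_v2(device, &handle) != NVML_SUCCESS)
            return std::nullopt;
        return handle;
    }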


@@ -6,7 +6,7 @@ set(TGICCL_SOURCES TgiCclBackend.cpp)
 find_package(CUDAToolkit REQUIRED)
 
 add_library(tgiccl SHARED ${TGICCL_HEADERS} ${TGICCL_SOURCES})
-target_link_libraries(tgiccl PUBLIC spdlog::spdlog CUDA::nvml ${TORCH_LIBRARIES})
+target_link_libraries(tgiccl PUBLIC tgi_common fmt::fmt spdlog::spdlog CUDA::nvml ${TORCH_LIBRARIES})
 
 add_executable(test_tgiccl test_tgiccl.cpp)
-target_link_libraries(test_tgiccl tgiccl spdlog::spdlog)
+target_link_libraries(test_tgiccl PUBLIC tgiccl fmt::fmt spdlog::spdlog ${TORCH_LIBRARIES})


@@ -7,5 +7,5 @@
 int main() {
     auto a = huggingface::tgi::tgiccl::IsNvLinkAvailable(0, 1);
     auto b = huggingface::tgi::tgiccl::IsNvLinkAvailable(0, 2);
-    auto c = huggingface::tgi::tgiccl::IsNvLinkAvailable(0, 3);
+    auto d = huggingface::tgi::tgiccl::IsNvLinkAvailable(0, 3);
 }


@@ -13,8 +13,8 @@
 constexpr auto CLL_BACKEND_NAME = "tgiccl";
 
-namespace huggingface::tgi::tgiccl {
+namespace huggingface::tgi::tgiccl
+{
     static std::once_flag NVML_INIT_FLAG;
 
 #define ENSURE_NVML_INIT() std::call_once(NVML_INIT_FLAG, nvmlInit_v2);
@@ -46,7 +46,10 @@ namespace huggingface::tgi::tgiccl {
         // Query link between both
         nvmlGpuP2PStatus_t status;
         if(nvmlDeviceGetP2PStatus(devFrom.value(), devTo.value(), NVML_P2P_CAPS_INDEX_NVLINK, &status) != NVML_SUCCESS)
+        {
             SPDLOG_ERROR(FMT_STRING("Failed to retrieve the p2p status for device {:d} <-> {:d}"), from, to);
+            return false;
+        }
 
         return status == NVML_P2P_STATUS_OK;
     }