add readme

This commit is contained in:
parent 116769a5f5
commit 2c446f7bde

my_optims/README.md Normal file (21 lines added)
@@ -0,0 +1,21 @@
## Communication Optimization Notes

### Optimizations

1. allreduce operator

2. allgather_into_tensor operator

### Usage

1. Pull the image: docker pull sakurahua/lm_inference:tgi-dev

2. Start the container: docker run -it --rm --entrypoint /bin/bash --gpus all --net=host --shm-size=4G -v xxx:/code sakurahua/lm_inference:tgi-dev

3. Start the service:

USE_CUSTOM_NCCL=1 CUDA_VISIBLE_DEVICES=0,1 /root/.cargo/bin/text-generation-launcher --model-id /code/models/llama-7b-hf --port 7777 --sharded false

4. Verify the service:

curl localhost:7777/generate -X POST -d '{"inputs":"who are you?","parameters":{"max_new_tokens":100,"details":false}}' -H 'Content-Type: application/json'

### Notes

1. USE_CUSTOM_NCCL=1 enables the custom communication operators; the default is 0 (disabled).

2. USE_TP_EMBEDDING=0 disables tensor-parallel embedding; the default is 1 (enabled).

3. The communication optimizations require the custom launcher at /root/.cargo/bin/text-generation-launcher; if the optimizations are not enabled, use the stock text-generation-launcher instead.

4. When the communication optimizations are enabled, --sharded false is required; all other arguments are the same as stock TGI.
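The notes above boil down to two launch modes. A minimal sketch, assuming the model path and port from the usage example (both are just the sample values shown above, not requirements):

# Optimized launch: custom NCCL operators on (USE_CUSTOM_NCCL=1), tensor-parallel
# embedding left at its default (USE_TP_EMBEDDING=1); the custom launcher and
# --sharded false are both required in this mode.
USE_CUSTOM_NCCL=1 USE_TP_EMBEDDING=1 CUDA_VISIBLE_DEVICES=0,1 \
  /root/.cargo/bin/text-generation-launcher \
  --model-id /code/models/llama-7b-hf --port 7777 --sharded false

# Baseline launch for comparison: optimizations off, stock launcher, standard TGI flags.
# USE_CUSTOM_NCCL=0 CUDA_VISIBLE_DEVICES=0,1 \
#   text-generation-launcher --model-id /code/models/llama-7b-hf --port 7777

Once either server is up, the curl request from step 4 should return a generated completion.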
my_optims/docker/Dockerfile Normal file (14 lines added)
@@ -0,0 +1,14 @@

FROM ghcr.nju.edu.cn/huggingface/text-generation-inference:latest
MAINTAINER ailab

COPY . /code/
RUN cd /code/ && \
    bash install_apt_source.sh && \
    export DEBIAN_FRONTEND=noninteractive && \
    apt update && \
    apt install -y wget git vim ssh libxml2 && \
    bash install_cuda.sh && \
    bash install_mpi.sh && \
    bash install_rust.sh && \
    bash install_tgi.sh && \
    bash bash_env.sh
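The Dockerfile expects its build context to contain the offline artifacts the scripts below unpack (the CUDA 11.8 runfile, the OpenMPI 4.1.6 and Rust 1.72.0 tarballs, the prebuilt libnccl.so.2.17.1, and a text-generation-inference checkout). A minimal build sketch; the tag lm_inference:tgi-dev-local is only a placeholder, not the published image name:

# Run from the directory that holds this Dockerfile, the install_*.sh scripts,
# and the installer artifacts they reference.
docker build -t lm_inference:tgi-dev-local -f Dockerfile .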
my_optims/docker/bash_env.sh Normal file (4 lines added)
@@ -0,0 +1,4 @@

echo "add path"
echo 'export PATH=$PATH:/usr/local/bin' >> ~/.bashrc
echo 'export LD_LIBRARY_PATH=$LD_LIBRARY_PATH:/usr/local/lib' >> ~/.bashrc
echo 'export LD_LIBRARY_PATH=/opt/conda/lib/python3.9/site-packages/torch/lib/:$LD_LIBRARY_PATH' >> ~/.bashrc
my_optims/docker/install_apt_source.sh Normal file (16 lines added)
@@ -0,0 +1,16 @@

cat > /etc/apt/sources.list << EOF
deb https://mirrors.aliyun.com/ubuntu/ focal main restricted universe multiverse
deb-src https://mirrors.aliyun.com/ubuntu/ focal main restricted universe multiverse

deb https://mirrors.aliyun.com/ubuntu/ focal-security main restricted universe multiverse
deb-src https://mirrors.aliyun.com/ubuntu/ focal-security main restricted universe multiverse

deb https://mirrors.aliyun.com/ubuntu/ focal-updates main restricted universe multiverse
deb-src https://mirrors.aliyun.com/ubuntu/ focal-updates main restricted universe multiverse

# deb https://mirrors.aliyun.com/ubuntu/ focal-proposed main restricted universe multiverse
# deb-src https://mirrors.aliyun.com/ubuntu/ focal-proposed main restricted universe multiverse

deb https://mirrors.aliyun.com/ubuntu/ focal-backports main restricted universe multiverse
deb-src https://mirrors.aliyun.com/ubuntu/ focal-backports main restricted universe multiverse
EOF
my_optims/docker/install_cuda.sh Normal file (2 lines added)
@@ -0,0 +1,2 @@

echo "install cuda"
bash cuda_11.8.0_520.61.05_linux.run --silent --toolkit
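An optional check that the runfile installed the toolkit in its default location (/usr/local/cuda); this is a verification sketch, not part of the committed script:

# The --toolkit runfile install defaults to /usr/local/cuda.
/usr/local/cuda/bin/nvcc --version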
my_optims/docker/install_mpi.sh Normal file (8 lines added)
@@ -0,0 +1,8 @@

set -e
echo "install openmpi"
tar -xjvf openmpi-4.1.6.tar.bz2
cd openmpi-4.1.6/
./configure --prefix=/usr/local
make all
make install
cd -
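Likewise, a quick way to confirm the OpenMPI build landed under the --prefix=/usr/local chosen above (again only an optional verification step):

# mpirun should report Open MPI 4.1.6 from the /usr/local prefix.
/usr/local/bin/mpirun --version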
my_optims/docker/install_nccl.sh Normal file (7 lines added)
@@ -0,0 +1,7 @@

set -e
echo "install nccl"
cp libnccl.so.2.17.1 /usr/lib/x86_64-linux-gnu/
cp nccl_net.h /usr/include/
cp nccl.h /usr/include/
ln -s /usr/lib/x86_64-linux-gnu/libnccl.so.2.17.1 /usr/lib/x86_64-linux-gnu/libnccl.so
ln -s /usr/lib/x86_64-linux-gnu/libnccl.so.2.17.1 /usr/lib/x86_64-linux-gnu/libnccl.so.2
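This script drops a prebuilt NCCL 2.17.1 library and matching headers into the system paths. A small sketch for confirming the drop-in (the version macros checked here come from the nccl.h included further below):

# Symlinks libnccl.so and libnccl.so.2 should point at libnccl.so.2.17.1.
ls -l /usr/lib/x86_64-linux-gnu/libnccl.so*
# The installed header should declare MAJOR 2, MINOR 17, PATCH 1.
grep -E '#define NCCL_(MAJOR|MINOR|PATCH)' /usr/include/nccl.h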
my_optims/docker/install_rust.sh Normal file (6 lines added)
@@ -0,0 +1,6 @@

set -e
echo "install rust"
tar -zxvf rust-1.72.0-x86_64-unknown-linux-gnu.tar.gz
cd rust-1.72.0-x86_64-unknown-linux-gnu
./install.sh
cd -
my_optims/docker/install_tgi.sh Normal file (10 lines added)
@@ -0,0 +1,10 @@

set -e
echo "install tgi"
python -m pip install Ninja
cd text-generation-inference/my_optims/nccl_test
python setup.py install
cd -
cd text-generation-inference/
make install-launcher
cp -r server/ /opt/conda/lib/python3.9/site-packages/text_generation_server/
cd -
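After this script runs, the custom launcher the README points at should exist under /root/.cargo/bin (make install-launcher installs it via cargo). A hedged sanity check:

# Confirm the cargo-installed launcher is present and responds to --help.
ls -l /root/.cargo/bin/text-generation-launcher
/root/.cargo/bin/text-generation-launcher --help | head -n 5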
my_optims/docker/nccl.h Normal file (411 lines added)
@@ -0,0 +1,411 @@

/*************************************************************************
 * Copyright (c) 2015-2021, NVIDIA CORPORATION. All rights reserved.
 *
 * See LICENSE.txt for license information
 ************************************************************************/

#ifndef NCCL_H_
#define NCCL_H_

#include <cuda_runtime.h>
#include <cuda_fp16.h>
#if CUDART_VERSION >= 11000
#include <cuda_bf16.h>
#endif

#define NCCL_MAJOR 2
#define NCCL_MINOR 17
#define NCCL_PATCH 1
#define NCCL_SUFFIX ""

#define NCCL_VERSION_CODE 21701
#define NCCL_VERSION(X,Y,Z) (((X) <= 2 && (Y) <= 8) ? (X) * 1000 + (Y) * 100 + (Z) : (X) * 10000 + (Y) * 100 + (Z))

#ifdef __cplusplus
extern "C" {
#endif

#include <limits.h>
/* Opaque handle to communicator */
typedef struct ncclComm* ncclComm_t;
#define NCCL_COMM_NULL NULL

#define NCCL_UNIQUE_ID_BYTES 128
typedef struct { char internal[NCCL_UNIQUE_ID_BYTES]; } ncclUniqueId;

/* Error type */
typedef enum { ncclSuccess = 0,
               ncclUnhandledCudaError = 1,
               ncclSystemError = 2,
               ncclInternalError = 3,
               ncclInvalidArgument = 4,
               ncclInvalidUsage = 5,
               ncclRemoteError = 6,
               ncclInProgress = 7,
               ncclNumResults = 8 } ncclResult_t;

#define NCCL_CONFIG_UNDEF_INT INT_MIN
#define NCCL_CONFIG_UNDEF_PTR NULL

/* Communicator configuration. Users can assign value to attributes to specify the
 * behavior of a communicator. */
typedef struct ncclConfig_v21700 {
  /* attributes that users should never touch. */
  size_t size;
  unsigned int magic;
  unsigned int version;
  /* attributes that users are able to customize. */
  int blocking;
  int cgaClusterSize;
  int minCTAs;
  int maxCTAs;
  const char *netName;
} ncclConfig_t;

/* Config initializer must be assigned to initialize config structure when it is created.
 * Not initialized config will result in NCCL error. */
#define NCCL_CONFIG_INITIALIZER { \
  sizeof(ncclConfig_t), /* size */ \
  0xcafebeef, /* magic */ \
  NCCL_VERSION(NCCL_MAJOR, NCCL_MINOR, NCCL_PATCH), /* version */ \
  NCCL_CONFIG_UNDEF_INT, /* blocking */ \
  NCCL_CONFIG_UNDEF_INT, /* cgaClusterSize */ \
  NCCL_CONFIG_UNDEF_INT, /* minCTAs */ \
  NCCL_CONFIG_UNDEF_INT, /* maxCTAs */ \
  NCCL_CONFIG_UNDEF_PTR /* netName */ \
}

/* Return the NCCL_VERSION_CODE of the NCCL library in the supplied integer.
 * This integer is coded with the MAJOR, MINOR and PATCH level of the
 * NCCL library
 */
ncclResult_t ncclGetVersion(int *version);
ncclResult_t pncclGetVersion(int *version);

/* Generates an Id to be used in ncclCommInitRank. ncclGetUniqueId should be
 * called once and the Id should be distributed to all ranks in the
 * communicator before calling ncclCommInitRank. */
ncclResult_t ncclGetUniqueId(ncclUniqueId* uniqueId);
ncclResult_t pncclGetUniqueId(ncclUniqueId* uniqueId);

/* Create a new communicator (multi thread/process version) with a configuration
 * set by users. */
ncclResult_t ncclCommInitRankConfig(ncclComm_t* comm, int nranks, ncclUniqueId commId, int rank, ncclConfig_t* config);
ncclResult_t pncclCommInitRankConfig(ncclComm_t* comm, int nranks, ncclUniqueId commId, int rank, ncclConfig_t* config);

/* Creates a new communicator (multi thread/process version).
 * rank must be between 0 and nranks-1 and unique within a communicator clique.
 * Each rank is associated to a CUDA device, which has to be set before calling
 * ncclCommInitRank.
 * ncclCommInitRank implicitly syncronizes with other ranks, so it must be
 * called by different threads/processes or use ncclGroupStart/ncclGroupEnd. */
ncclResult_t ncclCommInitRank(ncclComm_t* comm, int nranks, ncclUniqueId commId, int rank);
ncclResult_t pncclCommInitRank(ncclComm_t* comm, int nranks, ncclUniqueId commId, int rank);

/* Creates a clique of communicators (single process version).
 * This is a convenience function to create a single-process communicator clique.
 * Returns an array of ndev newly initialized communicators in comm.
 * comm should be pre-allocated with size at least ndev*sizeof(ncclComm_t).
 * If devlist is NULL, the first ndev CUDA devices are used.
 * Order of devlist defines user-order of processors within the communicator. */
ncclResult_t ncclCommInitAll(ncclComm_t* comm, int ndev, const int* devlist);
ncclResult_t pncclCommInitAll(ncclComm_t* comm, int ndev, const int* devlist);

/* Finalize a communicator. ncclCommFinalize flushes all issued communications,
 * and marks communicator state as ncclInProgress. The state will change to ncclSuccess
 * when the communicator is globally quiescent and related resources are freed; then,
 * calling ncclCommDestroy can locally free the rest of the resources (e.g. communicator
 * itself) without blocking. */
ncclResult_t ncclCommFinalize(ncclComm_t comm);
ncclResult_t pncclCommFinalize(ncclComm_t comm);

/* Frees local resources associated with communicator object. */
ncclResult_t ncclCommDestroy(ncclComm_t comm);
ncclResult_t pncclCommDestroy(ncclComm_t comm);

/* Frees resources associated with communicator object and aborts any operations
 * that might still be running on the device. */
ncclResult_t ncclCommAbort(ncclComm_t comm);
ncclResult_t pncclCommAbort(ncclComm_t comm);

/* Returns a string for each error code. */
const char* ncclGetErrorString(ncclResult_t result);
const char* pncclGetErrorString(ncclResult_t result);

/* Returns a human-readable message of the last error that occurred.
 * comm is currently unused and can be set to NULL
 */
const char* ncclGetLastError(ncclComm_t comm);
const char* pncclGetLastError(ncclComm_t comm);

/* Checks whether the comm has encountered any asynchronous errors */
ncclResult_t ncclCommGetAsyncError(ncclComm_t comm, ncclResult_t *asyncError);
ncclResult_t pncclCommGetAsyncError(ncclComm_t comm, ncclResult_t *asyncError);

/* Gets the number of ranks in the communicator clique. */
ncclResult_t ncclCommCount(const ncclComm_t comm, int* count);
ncclResult_t pncclCommCount(const ncclComm_t comm, int* count);

/* Returns the cuda device number associated with the communicator. */
ncclResult_t ncclCommCuDevice(const ncclComm_t comm, int* device);
ncclResult_t pncclCommCuDevice(const ncclComm_t comm, int* device);

/* Returns the user-ordered "rank" associated with the communicator. */
ncclResult_t ncclCommUserRank(const ncclComm_t comm, int* rank);
ncclResult_t pncclCommUserRank(const ncclComm_t comm, int* rank);

/* Reduction operation selector */
typedef enum { ncclNumOps_dummy = 5 } ncclRedOp_dummy_t;
typedef enum { ncclSum = 0,
               ncclProd = 1,
               ncclMax = 2,
               ncclMin = 3,
               ncclAvg = 4,
               /* ncclNumOps: The number of built-in ncclRedOp_t values. Also
                * serves as the least possible value for dynamic ncclRedOp_t's
                * as constructed by ncclRedOpCreate*** functions. */
               ncclNumOps = 5,
               /* ncclMaxRedOp: The largest valid value for ncclRedOp_t.
                * It is defined to be the largest signed value (since compilers
                * are permitted to use signed enums) that won't grow
                * sizeof(ncclRedOp_t) when compared to previous NCCL versions to
                * maintain ABI compatibility. */
               ncclMaxRedOp = 0x7fffffff>>(32-8*sizeof(ncclRedOp_dummy_t))
} ncclRedOp_t;

/* Data types */
typedef enum { ncclInt8 = 0, ncclChar = 0,
               ncclUint8 = 1,
               ncclInt32 = 2, ncclInt = 2,
               ncclUint32 = 3,
               ncclInt64 = 4,
               ncclUint64 = 5,
               ncclFloat16 = 6, ncclHalf = 6,
               ncclFloat32 = 7, ncclFloat = 7,
               ncclFloat64 = 8, ncclDouble = 8,
#if defined(__CUDA_BF16_TYPES_EXIST__)
               ncclBfloat16 = 9,
               ncclNumTypes = 10
#else
               ncclNumTypes = 9
#endif
} ncclDataType_t;

/* ncclScalarResidence_t: Location and dereferencing logic for scalar arguments. */
typedef enum {
  /* ncclScalarDevice: The scalar is in device-visible memory and will be
   * dereferenced while the collective is running. */
  ncclScalarDevice = 0,

  /* ncclScalarHostImmediate: The scalar is in host-visible memory and will be
   * dereferenced before the ncclRedOpCreate***() function returns. */
  ncclScalarHostImmediate = 1
} ncclScalarResidence_t;

/*
 * ncclRedOpCreatePreMulSum
 *
 * Creates a new reduction operator which pre-multiplies input values by a given
 * scalar locally before reducing them with peer values via summation. For use
 * only with collectives launched against *comm* and *datatype*. The
 * *residence* argument indicates how/when the memory pointed to by *scalar*
 * will be dereferenced. Upon return, the newly created operator's handle
 * is stored in *op*.
 */
ncclResult_t ncclRedOpCreatePreMulSum(ncclRedOp_t *op, void *scalar, ncclDataType_t datatype, ncclScalarResidence_t residence, ncclComm_t comm);
ncclResult_t pncclRedOpCreatePreMulSum(ncclRedOp_t *op, void *scalar, ncclDataType_t datatype, ncclScalarResidence_t residence, ncclComm_t comm);

/*
 * ncclRedOpDestroy
 *
 * Destroys the reduction operator *op*. The operator must have been created by
 * ncclRedOpCreatePreMul with the matching communicator *comm*. An operator may be
 * destroyed as soon as the last NCCL function which is given that operator returns.
 */
ncclResult_t ncclRedOpDestroy(ncclRedOp_t op, ncclComm_t comm);
ncclResult_t pncclRedOpDestroy(ncclRedOp_t op, ncclComm_t comm);

/*
 * Collective communication operations
 *
 * Collective communication operations must be called separately for each
 * communicator in a communicator clique.
 *
 * They return when operations have been enqueued on the CUDA stream.
 *
 * Since they may perform inter-CPU synchronization, each call has to be done
 * from a different thread or process, or need to use Group Semantics (see
 * below).
 */

/*
 * Reduce
 *
 * Reduces data arrays of length count in sendbuff into recvbuff using op
 * operation.
 * recvbuff may be NULL on all calls except for root device.
 * root is the rank (not the CUDA device) where data will reside after the
 * operation is complete.
 *
 * In-place operation will happen if sendbuff == recvbuff.
 */
ncclResult_t ncclReduce(const void* sendbuff, void* recvbuff, size_t count, ncclDataType_t datatype,
    ncclRedOp_t op, int root, ncclComm_t comm, cudaStream_t stream);
ncclResult_t pncclReduce(const void* sendbuff, void* recvbuff, size_t count, ncclDataType_t datatype,
    ncclRedOp_t op, int root, ncclComm_t comm, cudaStream_t stream);

/*
 * (deprecated) Broadcast (in-place)
 *
 * Copies count values from root to all other devices.
 * root is the rank (not the CUDA device) where data resides before the
 * operation is started.
 *
 * This operation is implicitely in place.
 */
ncclResult_t ncclBcast(void* buff, size_t count, ncclDataType_t datatype, int root,
    ncclComm_t comm, cudaStream_t stream);
ncclResult_t pncclBcast(void* buff, size_t count, ncclDataType_t datatype, int root,
    ncclComm_t comm, cudaStream_t stream);

/*
 * Broadcast
 *
 * Copies count values from root to all other devices.
 * root is the rank (not the CUDA device) where data resides before the
 * operation is started.
 *
 * In-place operation will happen if sendbuff == recvbuff.
 */
ncclResult_t ncclBroadcast(const void* sendbuff, void* recvbuff, size_t count, ncclDataType_t datatype, int root,
    ncclComm_t comm, cudaStream_t stream);
ncclResult_t pncclBroadcast(const void* sendbuff, void* recvbuff, size_t count, ncclDataType_t datatype, int root,
    ncclComm_t comm, cudaStream_t stream);

/*
 * All-Reduce
 *
 * Reduces data arrays of length count in sendbuff using op operation, and
 * leaves identical copies of result on each recvbuff.
 *
 * In-place operation will happen if sendbuff == recvbuff.
 */
ncclResult_t ncclAllReduce(const void* sendbuff, void* recvbuff, size_t count,
    ncclDataType_t datatype, ncclRedOp_t op, ncclComm_t comm, cudaStream_t stream);
ncclResult_t pncclAllReduce(const void* sendbuff, void* recvbuff, size_t count,
    ncclDataType_t datatype, ncclRedOp_t op, ncclComm_t comm, cudaStream_t stream);

/*
 * Reduce-Scatter
 *
 * Reduces data in sendbuff using op operation and leaves reduced result
 * scattered over the devices so that recvbuff on rank i will contain the i-th
 * block of the result.
 * Assumes sendcount is equal to nranks*recvcount, which means that sendbuff
 * should have a size of at least nranks*recvcount elements.
 *
 * In-place operations will happen if recvbuff == sendbuff + rank * recvcount.
 */
ncclResult_t ncclReduceScatter(const void* sendbuff, void* recvbuff,
    size_t recvcount, ncclDataType_t datatype, ncclRedOp_t op, ncclComm_t comm,
    cudaStream_t stream);
ncclResult_t pncclReduceScatter(const void* sendbuff, void* recvbuff,
    size_t recvcount, ncclDataType_t datatype, ncclRedOp_t op, ncclComm_t comm,
    cudaStream_t stream);

/*
 * All-Gather
 *
 * Each device gathers sendcount values from other GPUs into recvbuff,
 * receiving data from rank i at offset i*sendcount.
 * Assumes recvcount is equal to nranks*sendcount, which means that recvbuff
 * should have a size of at least nranks*sendcount elements.
 *
 * In-place operations will happen if sendbuff == recvbuff + rank * sendcount.
 */
ncclResult_t ncclAllGather(const void* sendbuff, void* recvbuff, size_t sendcount,
    ncclDataType_t datatype, ncclComm_t comm, cudaStream_t stream);
ncclResult_t pncclAllGather(const void* sendbuff, void* recvbuff, size_t sendcount,
    ncclDataType_t datatype, ncclComm_t comm, cudaStream_t stream);

/*
 * Send
 *
 * Send data from sendbuff to rank peer.
 *
 * Rank peer needs to call ncclRecv with the same datatype and the same count from this
 * rank.
 *
 * This operation is blocking for the GPU. If multiple ncclSend and ncclRecv operations
 * need to progress concurrently to complete, they must be fused within a ncclGroupStart/
 * ncclGroupEnd section.
 */
ncclResult_t ncclSend(const void* sendbuff, size_t count, ncclDataType_t datatype, int peer,
    ncclComm_t comm, cudaStream_t stream);
ncclResult_t pncclSend(const void* sendbuff, size_t count, ncclDataType_t datatype, int peer,
    ncclComm_t comm, cudaStream_t stream);

/*
 * Receive
 *
 * Receive data from rank peer into recvbuff.
 *
 * Rank peer needs to call ncclSend with the same datatype and the same count to this
 * rank.
 *
 * This operation is blocking for the GPU. If multiple ncclSend and ncclRecv operations
 * need to progress concurrently to complete, they must be fused within a ncclGroupStart/
 * ncclGroupEnd section.
 */
ncclResult_t pncclRecv(void* recvbuff, size_t count, ncclDataType_t datatype, int peer,
    ncclComm_t comm, cudaStream_t stream);
ncclResult_t ncclRecv(void* recvbuff, size_t count, ncclDataType_t datatype, int peer,
    ncclComm_t comm, cudaStream_t stream);

/*
 * Group semantics
 *
 * When managing multiple GPUs from a single thread, and since NCCL collective
 * calls may perform inter-CPU synchronization, we need to "group" calls for
 * different ranks/devices into a single call.
 *
 * Grouping NCCL calls as being part of the same collective operation is done
 * using ncclGroupStart and ncclGroupEnd. ncclGroupStart will enqueue all
 * collective calls until the ncclGroupEnd call, which will wait for all calls
 * to be complete. Note that for collective communication, ncclGroupEnd only
 * guarantees that the operations are enqueued on the streams, not that
 * the operation is effectively done.
 *
 * Both collective communication and ncclCommInitRank can be used in conjunction
 * of ncclGroupStart/ncclGroupEnd, but not together.
 *
 * Group semantics also allow to fuse multiple operations on the same device
 * to improve performance (for aggregated collective calls), or to permit
 * concurrent progress of multiple send/receive operations.
 */

/*
 * Group Start
 *
 * Start a group call. All calls to NCCL until ncclGroupEnd will be fused into
 * a single NCCL operation. Nothing will be started on the CUDA stream until
 * ncclGroupEnd.
 */
ncclResult_t ncclGroupStart();
ncclResult_t pncclGroupStart();

/*
 * Group End
 *
 * End a group call. Start a fused NCCL operation consisting of all calls since
 * ncclGroupStart. Operations on the CUDA stream depending on the NCCL operations
 * need to be called after ncclGroupEnd.
 */
ncclResult_t ncclGroupEnd();
ncclResult_t pncclGroupEnd();

#ifdef __cplusplus
} // end extern "C"
#endif

#endif // end include guard
my_optims/docker/nccl_net.h Normal file (313 lines added)
@@ -0,0 +1,313 @@

/*************************************************************************
 * Copyright (c) 2017-2022, NVIDIA CORPORATION. All rights reserved.
 *
 * See LICENSE.txt for license information
 ************************************************************************/

#ifndef NCCL_NET_H_
#define NCCL_NET_H_

#include "nccl.h"
#include <stdint.h>

#define NCCL_NET_HANDLE_MAXSIZE 128

#define NCCL_PTR_HOST 0x1
#define NCCL_PTR_CUDA 0x2
#define NCCL_PTR_DMABUF 0x4

// Maximum number of requests per comm object
#define NCCL_NET_MAX_REQUESTS 8

typedef enum {NCCL_LOG_NONE=0, NCCL_LOG_VERSION=1, NCCL_LOG_WARN=2, NCCL_LOG_INFO=3, NCCL_LOG_ABORT=4, NCCL_LOG_TRACE=5} ncclDebugLogLevel;
typedef enum {NCCL_INIT=1, NCCL_COLL=2, NCCL_P2P=4, NCCL_SHM=8, NCCL_NET=16, NCCL_GRAPH=32, NCCL_TUNING=64, NCCL_ENV=128, NCCL_ALLOC=256, NCCL_CALL=512, NCCL_PROXY=1024, NCCL_NVLS=2048, NCCL_ALL=~0} ncclDebugLogSubSys;

typedef void (*ncclDebugLogger_t)(ncclDebugLogLevel level, unsigned long flags, const char *file, int line, const char *fmt, ...);

typedef struct {
  char* name;     // Used mostly for logging.
  char* pciPath;  // Path to the PCI device in /sys.
  uint64_t guid;  // Unique identifier for the NIC chip. Important for
                  // cards with multiple PCI functions (Physical or virtual).
  int ptrSupport; // [NCCL_PTR_HOST|NCCL_PTR_CUDA|NCCL_PTR_DMABUF]
  int speed;      // Port speed in Mbps.
  int port;       // Port number.
  float latency;  // Network latency
  int maxComms;   // Maximum number of comms we can create
  int maxRecvs;   // Maximum number of grouped receives.
} ncclNetProperties_v6_t;

typedef ncclNetProperties_v6_t ncclNetProperties_t;

typedef struct {
  // Name of the network (mainly for logs)
  const char* name;
  // Initialize the network.
  ncclResult_t (*init)(ncclDebugLogger_t logFunction);
  // Return the number of adapters.
  ncclResult_t (*devices)(int* ndev);
  // Get various device properties.
  ncclResult_t (*getProperties)(int dev, ncclNetProperties_v6_t* props);
  // Create a receiving object and provide a handle to connect to it. The
  // handle can be up to NCCL_NET_HANDLE_MAXSIZE bytes and will be exchanged
  // between ranks to create a connection.
  ncclResult_t (*listen)(int dev, void* handle, void** listenComm);
  // Connect to a handle and return a sending comm object for that peer.
  // This call must not block for the connection to be established, and instead
  // should return successfully with sendComm == NULL with the expectation that
  // it will be called again until sendComm != NULL.
  ncclResult_t (*connect)(int dev, void* handle, void** sendComm);
  // Finalize connection establishment after remote peer has called connect.
  // This call must not block for the connection to be established, and instead
  // should return successfully with recvComm == NULL with the expectation that
  // it will be called again until recvComm != NULL.
  ncclResult_t (*accept)(void* listenComm, void** recvComm);
  // Register/Deregister memory. Comm can be either a sendComm or a recvComm.
  // Type is either NCCL_PTR_HOST or NCCL_PTR_CUDA.
  ncclResult_t (*regMr)(void* comm, void* data, int size, int type, void** mhandle);
  /* DMA-BUF support */
  ncclResult_t (*regMrDmaBuf)(void* comm, void* data, size_t size, int type, uint64_t offset, int fd, void** mhandle);
  ncclResult_t (*deregMr)(void* comm, void* mhandle);
  // Asynchronous send to a peer.
  // May return request == NULL if the call cannot be performed (or would block)
  ncclResult_t (*isend)(void* sendComm, void* data, int size, int tag, void* mhandle, void** request);
  // Asynchronous recv from a peer.
  // May return request == NULL if the call cannot be performed (or would block)
  ncclResult_t (*irecv)(void* recvComm, int n, void** data, int* sizes, int* tags, void** mhandles, void** request);
  // Perform a flush/fence to make sure all data received with NCCL_PTR_CUDA is
  // visible to the GPU
  ncclResult_t (*iflush)(void* recvComm, int n, void** data, int* sizes, void** mhandles, void** request);
  // Test whether a request is complete. If size is not NULL, it returns the
  // number of bytes sent/received.
  ncclResult_t (*test)(void* request, int* done, int* sizes);
  // Close and free send/recv comm objects
  ncclResult_t (*closeSend)(void* sendComm);
  ncclResult_t (*closeRecv)(void* recvComm);
  ncclResult_t (*closeListen)(void* listenComm);
} ncclNet_v6_t;

typedef ncclNet_v6_t ncclNet_t;

#define NCCL_PLUGIN_SYMBOL ncclNetPlugin_v6

typedef struct {
  // Name of the collective network (mainly for logs)
  const char* name;
  // Initialize the collective network.
  ncclResult_t (*init)(ncclDebugLogger_t logFunction);
  // Return the number of adapters capable of doing collective operations.
  // If ndev returns 0, all other functions might be set to NULL.
  ncclResult_t (*devices)(int* ndev);
  // Get various device properties.
  ncclResult_t (*getProperties)(int dev, ncclNetProperties_v6_t* props);
  // Create a receiving object and provide a handle to connect to it. The
  // handle can be up to NCCL_NET_HANDLE_MAXSIZE bytes and will be exchanged
  // between ranks to create connections.
  ncclResult_t (*listen)(int dev, void* handle, void** listenComm);
  // Create a group for collective operations. handles have been created
  // using listen() above. rank indicates caller's rank in the collective network.
  ncclResult_t (*connect)(void* handles[], int nranks, int rank, void* listenComm, void** collComm);
  // Returns whether a reduction operation on a data type is supported.
  // 1 for supported, 0 otherwise.
  ncclResult_t (*reduceSupport)(ncclDataType_t dataType, ncclRedOp_t redOp, int* supported);
  // Register/Deregister memory. Type is either NCCL_PTR_HOST or NCCL_PTR_CUDA.
  ncclResult_t (*regMr)(void* collComm, void* data, int size, int type, void** mhandle);
  /* DMA-BUF support */
  ncclResult_t (*regMrDmaBuf)(void* collComm, void* data, size_t size, int type, uint64_t offset, int fd, void** mhandle);
  ncclResult_t (*deregMr)(void* collComm, void* mhandle);
  // Performs an asynchronous allreduce operation on the collective group.
  // May return request == NULL if the call cannot be performed (or would block).
  ncclResult_t (*iallreduce)(void* collComm, void* sendData, void* recvData, int count,
      ncclDataType_t dataType, ncclRedOp_t redOp, void* sendMhandle, void* recvMhandle, void** request);
  // Perform a flush/fence to make sure all data received with NCCL_PTR_CUDA is
  // visible to the GPU
  ncclResult_t (*iflush)(void* collComm, void* data, int size, void* mhandle, void** request);
  // Test whether a request is complete. If size is not NULL, it returns the
  // number of bytes sent/received.
  ncclResult_t (*test)(void* request, int* done, int* size);
  // Close and free collective comm objects
  ncclResult_t (*closeColl)(void* collComm);
  ncclResult_t (*closeListen)(void* listenComm);
} ncclCollNet_v6_t;

typedef ncclCollNet_v6_t ncclCollNet_t;

#define NCCL_COLLNET_PLUGIN_SYMBOL ncclCollNetPlugin_v6

// v5 struct for backwards compatibility
typedef struct {
  // Name of the network (mainly for logs)
  const char* name;
  // Initialize the network.
  ncclResult_t (*init)(ncclDebugLogger_t logFunction);
  // Return the number of adapters.
  ncclResult_t (*devices)(int* ndev);
  // Get various device properties.
  ncclResult_t (*getProperties)(int dev, ncclNetProperties_v6_t* props);
  // Create a receiving object and provide a handle to connect to it. The
  // handle can be up to NCCL_NET_HANDLE_MAXSIZE bytes and will be exchanged
  // between ranks to create a connection.
  ncclResult_t (*listen)(int dev, void* handle, void** listenComm);
  // Connect to a handle and return a sending comm object for that peer.
  // This call must not block for the connection to be established, and instead
  // should return successfully with sendComm == NULL with the expectation that
  // it will be called again until sendComm != NULL.
  ncclResult_t (*connect)(int dev, void* handle, void** sendComm);
  // Finalize connection establishment after remote peer has called connect.
  // This call must not block for the connection to be established, and instead
  // should return successfully with recvComm == NULL with the expectation that
  // it will be called again until recvComm != NULL.
  ncclResult_t (*accept)(void* listenComm, void** recvComm);
  // Register/Deregister memory. Comm can be either a sendComm or a recvComm.
  // Type is either NCCL_PTR_HOST or NCCL_PTR_CUDA.
  ncclResult_t (*regMr)(void* comm, void* data, int size, int type, void** mhandle);
  ncclResult_t (*deregMr)(void* comm, void* mhandle);
  // Asynchronous send to a peer.
  // May return request == NULL if the call cannot be performed (or would block)
  ncclResult_t (*isend)(void* sendComm, void* data, int size, int tag, void* mhandle, void** request);
  // Asynchronous recv from a peer.
  // May return request == NULL if the call cannot be performed (or would block)
  ncclResult_t (*irecv)(void* recvComm, int n, void** data, int* sizes, int* tags, void** mhandles, void** request);
  // Perform a flush/fence to make sure all data received with NCCL_PTR_CUDA is
  // visible to the GPU
  ncclResult_t (*iflush)(void* recvComm, int n, void** data, int* sizes, void** mhandles, void** request);
  // Test whether a request is complete. If size is not NULL, it returns the
  // number of bytes sent/received.
  ncclResult_t (*test)(void* request, int* done, int* sizes);
  // Close and free send/recv comm objects
  ncclResult_t (*closeSend)(void* sendComm);
  ncclResult_t (*closeRecv)(void* recvComm);
  ncclResult_t (*closeListen)(void* listenComm);
} ncclNet_v5_t;

// v5 struct for backwards compatibility
typedef struct {
  // Name of the collective network (mainly for logs)
  const char* name;
  // Initialize the collective network.
  ncclResult_t (*init)(ncclDebugLogger_t logFunction);
  // Return the number of adapters capable of doing collective operations.
  // If ndev returns 0, all other functions might be set to NULL.
  ncclResult_t (*devices)(int* ndev);
  // Get various device properties.
  ncclResult_t (*getProperties)(int dev, ncclNetProperties_v6_t* props);
  // Create a receiving object and provide a handle to connect to it. The
  // handle can be up to NCCL_NET_HANDLE_MAXSIZE bytes and will be exchanged
  // between ranks to create connections.
  ncclResult_t (*listen)(int dev, void* handle, void** listenComm);
  // Create a group for collective operations. handles have been created
  // using listen() above. rank indicates caller's rank in the collective network.
  ncclResult_t (*connect)(void* handles[], int nranks, int rank, void* listenComm, void** collComm);
  // Returns whether a reduction operation on a data type is supported.
  // 1 for supported, 0 otherwise.
  ncclResult_t (*reduceSupport)(ncclDataType_t dataType, ncclRedOp_t redOp, int* supported);
  // Register/Deregister memory. Type is either NCCL_PTR_HOST or NCCL_PTR_CUDA.
  ncclResult_t (*regMr)(void* collComm, void* data, int size, int type, void** mhandle);
  ncclResult_t (*deregMr)(void* collComm, void* mhandle);
  // Performs an asynchronous allreduce operation on the collective group.
  // May return request == NULL if the call cannot be performed (or would block).
  ncclResult_t (*iallreduce)(void* collComm, void* sendData, void* recvData, int count,
      ncclDataType_t dataType, ncclRedOp_t redOp, void* sendMhandle, void* recvMhandle, void** request);
  // Perform a flush/fence to make sure all data received with NCCL_PTR_CUDA is
  // visible to the GPU
  ncclResult_t (*iflush)(void* collComm, void* data, int size, void* mhandle, void** request);
  // Test whether a request is complete. If size is not NULL, it returns the
  // number of bytes sent/received.
  ncclResult_t (*test)(void* request, int* done, int* size);
  // Close and free collective comm objects
  ncclResult_t (*closeColl)(void* collComm);
  ncclResult_t (*closeListen)(void* listenComm);
} ncclCollNet_v5_t;

// v4 struct for backwards compatibility
typedef struct {
  char* name;     // Used mostly for logging.
  char* pciPath;  // Path to the PCI device in /sys.
  uint64_t guid;  // Unique identifier for the NIC chip. Important for
                  // cards with multiple PCI functions (Physical or virtual).
  int ptrSupport; // NCCL_PTR_HOST or NCCL_PTR_HOST|NCCL_PTR_CUDA
  int speed;      // Port speed in Mbps.
  int port;       // Port number.
  int maxComms;   // Maximum number of comms we can create
} ncclNetProperties_v4_t;

// v4 struct for backwards compatibility
typedef struct {
  // Name of the network (mainly for logs)
  const char* name;
  // Initialize the network.
  ncclResult_t (*init)(ncclDebugLogger_t logFunction);
  // Return the number of adapters.
  ncclResult_t (*devices)(int* ndev);
  // Get various device properties.
  ncclResult_t (*getProperties)(int dev, ncclNetProperties_v4_t* props);
  // Create a receiving object and provide a handle to connect to it. The
  // handle can be up to NCCL_NET_HANDLE_MAXSIZE bytes and will be exchanged
  // between ranks to create a connection.
  ncclResult_t (*listen)(int dev, void* handle, void** listenComm);
  // Connect to a handle and return a sending comm object for that peer.
  ncclResult_t (*connect)(int dev, void* handle, void** sendComm);
  // Finalize connection establishment after remote peer has called connectHandle
  ncclResult_t (*accept)(void* listenComm, void** recvComm);
  // Register/Deregister memory. Comm can be either a sendComm or a recvComm.
  // Type is either NCCL_PTR_HOST or NCCL_PTR_CUDA.
  ncclResult_t (*regMr)(void* comm, void* data, int size, int type, void** mhandle);
  ncclResult_t (*deregMr)(void* comm, void* mhandle);
  // Asynchronous send to a peer.
  // May return request == NULL if the call cannot be performed (or would block)
  ncclResult_t (*isend)(void* sendComm, void* data, int size, void* mhandle, void** request);
  // Asynchronous recv from a peer.
  // May return request == NULL if the call cannot be performed (or would block)
  ncclResult_t (*irecv)(void* recvComm, void* data, int size, void* mhandle, void** request);
  // Perform a flush/fence to make sure all data received with NCCL_PTR_CUDA is
  // visible to the GPU
  ncclResult_t (*iflush)(void* recvComm, void* data, int size, void* mhandle, void** request);
  // Test whether a request is complete. If size is not NULL, it returns the
  // number of bytes sent/received.
  ncclResult_t (*test)(void* request, int* done, int* size);
  // Close and free send/recv comm objects
  ncclResult_t (*closeSend)(void* sendComm);
  ncclResult_t (*closeRecv)(void* recvComm);
  ncclResult_t (*closeListen)(void* listenComm);
} ncclNet_v4_t;

// v4 struct for backwards compatibility
typedef struct {
  // Name of the collective network (mainly for logs)
  const char* name;
  // Initialize the collective network.
  ncclResult_t (*init)(ncclDebugLogger_t logFunction);
  // Return the number of adapters capable of doing collective operations.
  // If ndev returns 0, all other functions might be set to NULL.
  ncclResult_t (*devices)(int* ndev);
  // Get various device properties.
  ncclResult_t (*getProperties)(int dev, ncclNetProperties_v4_t* props);
  // Create a receiving object and provide a handle to connect to it. The
  // handle can be up to NCCL_NET_HANDLE_MAXSIZE bytes and will be exchanged
  // between ranks to create connections.
  ncclResult_t (*listen)(int dev, void* handle, void** listenComm);
  // Create a group for collective operations. handles have been created
  // using listen() above. rank indicates caller's rank in the collective network.
  ncclResult_t (*connect)(void* handles[], int nranks, int rank, void* listenComm, void** collComm);
  // Returns whether a reduction operation on a data type is supported.
  // 1 for supported, 0 otherwise.
  ncclResult_t (*reduceSupport)(ncclDataType_t dataType, ncclRedOp_t redOp, int* supported);
  // Register/Deregister memory. Type is either NCCL_PTR_HOST or NCCL_PTR_CUDA.
  ncclResult_t (*regMr)(void* collComm, void* data, int size, int type, void** mhandle);
  ncclResult_t (*deregMr)(void* collComm, void* mhandle);
  // Performs an asynchronous allreduce operation on the collective group.
  // May return request == NULL if the call cannot be performed (or would block).
  ncclResult_t (*iallreduce)(void* collComm, void* sendData, void* recvData, int count,
      ncclDataType_t dataType, ncclRedOp_t redOp, void* sendMhandle, void* recvMhandle, void** request);
  // Perform a flush/fence to make sure all data received with NCCL_PTR_CUDA is
  // visible to the GPU
  ncclResult_t (*iflush)(void* collComm, void* data, int size, void* mhandle, void** request);
  // Test whether a request is complete. If size is not NULL, it returns the
  // number of bytes sent/received.
  ncclResult_t (*test)(void* request, int* done, int* size);
  // Close and free collective comm objects
  ncclResult_t (*closeColl)(void* collComm);
  ncclResult_t (*closeListen)(void* listenComm);
} ncclCollNet_v4_t;

#endif // end include guard