From 2c446f7bdee1cfff965ffe0a1da1471ffc491c3f Mon Sep 17 00:00:00 2001
From: HuaYZhao <921203405@qq.com>
Date: Mon, 4 Dec 2023 21:03:11 +0800
Subject: [PATCH] add readme

---
 my_optims/README.md                    |  21 ++
 my_optims/docker/Dockerfile            |  14 +
 my_optims/docker/bash_env.sh           |   4 +
 my_optims/docker/install_apt_source.sh |  16 +
 my_optims/docker/install_cuda.sh       |   2 +
 my_optims/docker/install_mpi.sh        |   8 +
 my_optims/docker/install_nccl.sh       |   7 +
 my_optims/docker/install_rust.sh       |   6 +
 my_optims/docker/install_tgi.sh        |  10 +
 my_optims/docker/nccl.h                | 411 +++++++++++++++++++++++++
 my_optims/docker/nccl_net.h            | 313 +++++++++++++++++++
 11 files changed, 812 insertions(+)
 create mode 100644 my_optims/README.md
 create mode 100644 my_optims/docker/Dockerfile
 create mode 100644 my_optims/docker/bash_env.sh
 create mode 100644 my_optims/docker/install_apt_source.sh
 create mode 100644 my_optims/docker/install_cuda.sh
 create mode 100644 my_optims/docker/install_mpi.sh
 create mode 100644 my_optims/docker/install_nccl.sh
 create mode 100644 my_optims/docker/install_rust.sh
 create mode 100644 my_optims/docker/install_tgi.sh
 create mode 100644 my_optims/docker/nccl.h
 create mode 100644 my_optims/docker/nccl_net.h

diff --git a/my_optims/README.md b/my_optims/README.md
new file mode 100644
index 00000000..7321a0b4
--- /dev/null
+++ b/my_optims/README.md
@@ -0,0 +1,21 @@
+## Communication Optimization Notes
+
+### What is optimized
+ 1. The allreduce operator
+ 2. The allgather_into_tensor operator
+
+### Usage
+ 1. Pull the image: docker pull sakurahua/lm_inference:tgi-dev
+ 2. Start a container: docker run -it --rm --entrypoint /bin/bash --gpus all --net=host --shm-size=4G -v xxx:/code sakurahua/lm_inference:tgi-dev
+ 3. Launch the service:
+ USE_CUSTOM_NCCL=1 CUDA_VISIBLE_DEVICES=0,1 /root/.cargo/bin/text-generation-launcher --model-id /code/models/llama-7b-hf --port 7777 --sharded false
+ 4. Verify the service:
+ curl localhost:7777/generate -X POST -d '{"inputs":"who are you?","parameters":{"max_new_tokens":100,"details":false}}' -H 'Content-Type: application/json'
+
+
+### Notes
+ 1. USE_CUSTOM_NCCL=1 enables the custom communication operators; the default is 0 (disabled).
+ 2. USE_TP_EMBEDDING=0 disables tensor-parallel embedding; the default is 1 (enabled).
+ 3. The optimized communication path requires the custom launcher at /root/.cargo/bin/text-generation-launcher; to run without it, use the stock text-generation-launcher.
+ 4. When the optimizations are enabled, --sharded false is required; all other arguments are the same as stock TGI.
+
diff --git a/my_optims/docker/Dockerfile b/my_optims/docker/Dockerfile
new file mode 100644
index 00000000..74748071
--- /dev/null
+++ b/my_optims/docker/Dockerfile
@@ -0,0 +1,14 @@
+FROM ghcr.nju.edu.cn/huggingface/text-generation-inference:latest
+MAINTAINER ailab
+
+COPY . /code/
+RUN cd /code/ && \
+    bash install_apt_source.sh && \
+    export DEBIAN_FRONTEND=noninteractive && \
+    apt update && \
+    apt install -y wget git vim ssh libxml2 && \
+    bash install_cuda.sh && \
+    bash install_mpi.sh && \
+    bash install_rust.sh && \
+    bash install_tgi.sh && \
+    bash bash_env.sh
diff --git a/my_optims/docker/bash_env.sh b/my_optims/docker/bash_env.sh
new file mode 100644
index 00000000..4d275d83
--- /dev/null
+++ b/my_optims/docker/bash_env.sh
@@ -0,0 +1,4 @@
+echo "add path"
+echo 'export PATH=$PATH:/usr/local/bin' >> ~/.bashrc
+echo 'export LD_LIBRARY_PATH=$LD_LIBRARY_PATH:/usr/local/lib' >> ~/.bashrc
+echo 'export LD_LIBRARY_PATH=/opt/conda/lib/python3.9/site-packages/torch/lib/:$LD_LIBRARY_PATH' >> ~/.bashrc
diff --git a/my_optims/docker/install_apt_source.sh b/my_optims/docker/install_apt_source.sh
new file mode 100644
index 00000000..b239f788
--- /dev/null
+++ b/my_optims/docker/install_apt_source.sh
@@ -0,0 +1,16 @@
+cat > /etc/apt/sources.list << EOF
+deb https://mirrors.aliyun.com/ubuntu/ focal main restricted universe multiverse
+deb-src https://mirrors.aliyun.com/ubuntu/ focal main restricted universe multiverse
+
+deb https://mirrors.aliyun.com/ubuntu/ focal-security main restricted universe multiverse
+deb-src https://mirrors.aliyun.com/ubuntu/ focal-security main restricted universe multiverse
+
+deb https://mirrors.aliyun.com/ubuntu/ focal-updates main restricted universe multiverse
+deb-src https://mirrors.aliyun.com/ubuntu/ focal-updates main restricted universe multiverse
+
+# deb https://mirrors.aliyun.com/ubuntu/ focal-proposed main restricted universe multiverse
+# deb-src https://mirrors.aliyun.com/ubuntu/ focal-proposed main restricted universe multiverse
+
+deb https://mirrors.aliyun.com/ubuntu/ focal-backports main restricted universe multiverse
+deb-src https://mirrors.aliyun.com/ubuntu/ focal-backports main restricted universe multiverse
+EOF
diff --git a/my_optims/docker/install_cuda.sh b/my_optims/docker/install_cuda.sh
new file mode 100644
index 00000000..921349a5
--- /dev/null
+++ b/my_optims/docker/install_cuda.sh
@@ -0,0 +1,2 @@
+echo "install cuda"
+bash cuda_11.8.0_520.61.05_linux.run --silent --toolkit
diff --git a/my_optims/docker/install_mpi.sh b/my_optims/docker/install_mpi.sh
new file mode 100644
index 00000000..1ca1caea
--- /dev/null
+++ b/my_optims/docker/install_mpi.sh
@@ -0,0 +1,8 @@
+set -e
+echo "install openmpi"
+tar -xjvf openmpi-4.1.6.tar.bz2
+cd openmpi-4.1.6/
+./configure --prefix=/usr/local
+make all
+make install
+cd -
diff --git a/my_optims/docker/install_nccl.sh b/my_optims/docker/install_nccl.sh
new file mode 100644
index 00000000..98291222
--- /dev/null
+++ b/my_optims/docker/install_nccl.sh
@@ -0,0 +1,7 @@
+set -e
+echo "install nccl"
+cp libnccl.so.2.17.1 /usr/lib/x86_64-linux-gnu/
+cp nccl_net.h /usr/include/
+cp nccl.h /usr/include/
+ln -s /usr/lib/x86_64-linux-gnu/libnccl.so.2.17.1 /usr/lib/x86_64-linux-gnu/libnccl.so
+ln -s /usr/lib/x86_64-linux-gnu/libnccl.so.2.17.1 /usr/lib/x86_64-linux-gnu/libnccl.so.2
diff --git a/my_optims/docker/install_rust.sh b/my_optims/docker/install_rust.sh
new file mode 100644
index 00000000..1a89cddb
--- /dev/null
+++ b/my_optims/docker/install_rust.sh
@@ -0,0 +1,6 @@
+set -e
+echo "install rust"
+tar -zxvf rust-1.72.0-x86_64-unknown-linux-gnu.tar.gz
+cd rust-1.72.0-x86_64-unknown-linux-gnu
+./install.sh
+cd -
diff --git a/my_optims/docker/install_tgi.sh b/my_optims/docker/install_tgi.sh
new file mode 100644
index 00000000..175b0b5a
--- /dev/null
+++ b/my_optims/docker/install_tgi.sh
@@ -0,0 +1,10 @@
+set -e
+echo "install tgi"
+python -m pip install Ninja
+cd text-generation-inference/my_optims/nccl_test
+python setup.py install
+cd -
+cd text-generation-inference/
+make install-launcher
+cp -r server/ /opt/conda/lib/python3.9/site-packages/text_generation_server/
+cd -
diff --git a/my_optims/docker/nccl.h b/my_optims/docker/nccl.h
new file mode 100644
index 00000000..8be5065c
--- /dev/null
+++ b/my_optims/docker/nccl.h
@@ -0,0 +1,411 @@
+/*************************************************************************
+ * Copyright (c) 2015-2021, NVIDIA CORPORATION. All rights reserved.
+ *
+ * See LICENSE.txt for license information
+ ************************************************************************/
+
+#ifndef NCCL_H_
+#define NCCL_H_
+
+#include <cuda_runtime.h>
+#include <cuda_fp16.h>
+#if CUDART_VERSION >= 11000
+#include <cuda_bf16.h>
+#endif
+
+#define NCCL_MAJOR 2
+#define NCCL_MINOR 17
+#define NCCL_PATCH 1
+#define NCCL_SUFFIX ""
+
+#define NCCL_VERSION_CODE 21701
+#define NCCL_VERSION(X,Y,Z) (((X) <= 2 && (Y) <= 8) ? (X) * 1000 + (Y) * 100 + (Z) : (X) * 10000 + (Y) * 100 + (Z))
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+#include <limits.h>
+/* Opaque handle to communicator */
+typedef struct ncclComm* ncclComm_t;
+#define NCCL_COMM_NULL NULL
+
+#define NCCL_UNIQUE_ID_BYTES 128
+typedef struct { char internal[NCCL_UNIQUE_ID_BYTES]; } ncclUniqueId;
+
+/* Error type */
+typedef enum { ncclSuccess = 0,
+               ncclUnhandledCudaError = 1,
+               ncclSystemError = 2,
+               ncclInternalError = 3,
+               ncclInvalidArgument = 4,
+               ncclInvalidUsage = 5,
+               ncclRemoteError = 6,
+               ncclInProgress = 7,
+               ncclNumResults = 8 } ncclResult_t;
+
+#define NCCL_CONFIG_UNDEF_INT INT_MIN
+#define NCCL_CONFIG_UNDEF_PTR NULL
+
+/* Communicator configuration. Users can assign values to the attributes to
+ * specify the behavior of a communicator. */
+typedef struct ncclConfig_v21700 {
+  /* attributes that users should never touch. */
+  size_t size;
+  unsigned int magic;
+  unsigned int version;
+  /* attributes that users are able to customize. */
+  int blocking;
+  int cgaClusterSize;
+  int minCTAs;
+  int maxCTAs;
+  const char *netName;
+} ncclConfig_t;
+
+/* The config initializer must be assigned when a config structure is created.
+ * An uninitialized config will result in an NCCL error. */
+#define NCCL_CONFIG_INITIALIZER {                                       \
+  sizeof(ncclConfig_t),                             /* size */          \
+  0xcafebeef,                                       /* magic */         \
+  NCCL_VERSION(NCCL_MAJOR, NCCL_MINOR, NCCL_PATCH), /* version */       \
+  NCCL_CONFIG_UNDEF_INT,                            /* blocking */      \
+  NCCL_CONFIG_UNDEF_INT,                            /* cgaClusterSize */\
+  NCCL_CONFIG_UNDEF_INT,                            /* minCTAs */       \
+  NCCL_CONFIG_UNDEF_INT,                            /* maxCTAs */       \
+  NCCL_CONFIG_UNDEF_PTR                             /* netName */       \
+}
+
+/* Returns the NCCL_VERSION_CODE of the NCCL library in the supplied integer.
+ * This integer is coded with the MAJOR, MINOR and PATCH level of the
+ * NCCL library.
+ */
+ncclResult_t ncclGetVersion(int *version);
+ncclResult_t pncclGetVersion(int *version);
+
+/* Generates an Id to be used in ncclCommInitRank. ncclGetUniqueId should be
+ * called once, and the Id should be distributed to all ranks in the
+ * communicator before calling ncclCommInitRank. */
+ncclResult_t ncclGetUniqueId(ncclUniqueId* uniqueId);
+ncclResult_t pncclGetUniqueId(ncclUniqueId* uniqueId);
+
+/* Creates a new communicator (multi thread/process version) with a configuration
+ * set by users.
+ */
+ncclResult_t ncclCommInitRankConfig(ncclComm_t* comm, int nranks, ncclUniqueId commId, int rank, ncclConfig_t* config);
+ncclResult_t pncclCommInitRankConfig(ncclComm_t* comm, int nranks, ncclUniqueId commId, int rank, ncclConfig_t* config);
+
+/* Creates a new communicator (multi thread/process version).
+ * rank must be between 0 and nranks-1 and unique within a communicator clique.
+ * Each rank is associated with a CUDA device, which has to be set before calling
+ * ncclCommInitRank.
+ * ncclCommInitRank implicitly synchronizes with other ranks, so it must be
+ * called by different threads/processes or use ncclGroupStart/ncclGroupEnd. */
+ncclResult_t ncclCommInitRank(ncclComm_t* comm, int nranks, ncclUniqueId commId, int rank);
+ncclResult_t pncclCommInitRank(ncclComm_t* comm, int nranks, ncclUniqueId commId, int rank);
+
+/* Creates a clique of communicators (single process version).
+ * This is a convenience function to create a single-process communicator clique.
+ * Returns an array of ndev newly initialized communicators in comm.
+ * comm should be pre-allocated with size at least ndev*sizeof(ncclComm_t).
+ * If devlist is NULL, the first ndev CUDA devices are used.
+ * Order of devlist defines the user-order of devices within the communicator. */
+ncclResult_t ncclCommInitAll(ncclComm_t* comm, int ndev, const int* devlist);
+ncclResult_t pncclCommInitAll(ncclComm_t* comm, int ndev, const int* devlist);
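+
+/* Usage sketch (illustrative, not part of the upstream API documentation;
+ * error handling is reduced to an exit-on-error macro): initializing a clique
+ * over the first two CUDA devices from a single process with ncclCommInitAll,
+ * then tearing it down with ncclCommDestroy.
+ *
+ *   #define NCCLCHECK(cmd) do {                                     \
+ *     ncclResult_t r = (cmd);                                       \
+ *     if (r != ncclSuccess) {                                       \
+ *       fprintf(stderr, "NCCL error: %s\n", ncclGetErrorString(r)); \
+ *       exit(1);                                                    \
+ *     }                                                             \
+ *   } while (0)
+ *
+ *   ncclComm_t comms[2];
+ *   int devs[2] = {0, 1};
+ *   NCCLCHECK(ncclCommInitAll(comms, 2, devs)); // one communicator per device
+ *   // ... enqueue collectives on per-device streams here ...
+ *   for (int i = 0; i < 2; i++)
+ *     NCCLCHECK(ncclCommDestroy(comms[i]));     // free local resources
+ */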
+
+/* Finalize a communicator. ncclCommFinalize flushes all issued communications,
+ * and marks communicator state as ncclInProgress. The state will change to ncclSuccess
+ * when the communicator is globally quiescent and related resources are freed; then,
+ * calling ncclCommDestroy can locally free the rest of the resources (e.g. the
+ * communicator itself) without blocking. */
+ncclResult_t ncclCommFinalize(ncclComm_t comm);
+ncclResult_t pncclCommFinalize(ncclComm_t comm);
+
+/* Frees local resources associated with a communicator object. */
+ncclResult_t ncclCommDestroy(ncclComm_t comm);
+ncclResult_t pncclCommDestroy(ncclComm_t comm);
+
+/* Frees resources associated with a communicator object and aborts any operations
+ * that might still be running on the device. */
+ncclResult_t ncclCommAbort(ncclComm_t comm);
+ncclResult_t pncclCommAbort(ncclComm_t comm);
+
+/* Returns a string for each error code. */
+const char* ncclGetErrorString(ncclResult_t result);
+const char* pncclGetErrorString(ncclResult_t result);
+
+/* Returns a human-readable message describing the last error that occurred.
+ * comm is currently unused and can be set to NULL.
+ */
+const char* ncclGetLastError(ncclComm_t comm);
+const char* pncclGetLastError(ncclComm_t comm);
+
+/* Checks whether the comm has encountered any asynchronous errors. */
+ncclResult_t ncclCommGetAsyncError(ncclComm_t comm, ncclResult_t *asyncError);
+ncclResult_t pncclCommGetAsyncError(ncclComm_t comm, ncclResult_t *asyncError);
+
+/* Gets the number of ranks in the communicator clique. */
+ncclResult_t ncclCommCount(const ncclComm_t comm, int* count);
+ncclResult_t pncclCommCount(const ncclComm_t comm, int* count);
+
+/* Returns the CUDA device number associated with the communicator. */
+ncclResult_t ncclCommCuDevice(const ncclComm_t comm, int* device);
+ncclResult_t pncclCommCuDevice(const ncclComm_t comm, int* device);
+
+/* Returns the user-ordered "rank" associated with the communicator. */
+ncclResult_t ncclCommUserRank(const ncclComm_t comm, int* rank);
+ncclResult_t pncclCommUserRank(const ncclComm_t comm, int* rank);
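+
+/* Usage sketch (illustrative): polling for completion on a non-blocking
+ * communicator. With config.blocking = 0, initialization may return
+ * ncclInProgress, and progress is observed through ncclCommGetAsyncError.
+ * Assumes nranks, rank, and an id obtained from ncclGetUniqueId are already
+ * set up.
+ *
+ *   ncclConfig_t config = NCCL_CONFIG_INITIALIZER;
+ *   config.blocking = 0;                   // opt in to non-blocking init
+ *   ncclComm_t comm;
+ *   ncclResult_t state = ncclCommInitRankConfig(&comm, nranks, id, rank, &config);
+ *   while (state == ncclInProgress)
+ *     ncclCommGetAsyncError(comm, &state); // poll until success or failure
+ *   if (state != ncclSuccess)
+ *     ncclCommAbort(comm);                 // abort on asynchronous error
+ */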
+
+/* Reduction operation selector */
+typedef enum { ncclNumOps_dummy = 5 } ncclRedOp_dummy_t;
+typedef enum { ncclSum = 0,
+               ncclProd = 1,
+               ncclMax = 2,
+               ncclMin = 3,
+               ncclAvg = 4,
+               /* ncclNumOps: The number of built-in ncclRedOp_t values. Also
+                * serves as the least possible value for dynamic ncclRedOp_t's
+                * as constructed by ncclRedOpCreate*** functions. */
+               ncclNumOps = 5,
+               /* ncclMaxRedOp: The largest valid value for ncclRedOp_t.
+                * It is defined to be the largest signed value (since compilers
+                * are permitted to use signed enums) that won't grow
+                * sizeof(ncclRedOp_t) when compared to previous NCCL versions to
+                * maintain ABI compatibility. */
+               ncclMaxRedOp = 0x7fffffff>>(32-8*sizeof(ncclRedOp_dummy_t))
+             } ncclRedOp_t;
+
+/* Data types */
+typedef enum { ncclInt8 = 0, ncclChar = 0,
+               ncclUint8 = 1,
+               ncclInt32 = 2, ncclInt = 2,
+               ncclUint32 = 3,
+               ncclInt64 = 4,
+               ncclUint64 = 5,
+               ncclFloat16 = 6, ncclHalf = 6,
+               ncclFloat32 = 7, ncclFloat = 7,
+               ncclFloat64 = 8, ncclDouble = 8,
+#if defined(__CUDA_BF16_TYPES_EXIST__)
+               ncclBfloat16 = 9,
+               ncclNumTypes = 10
+#else
+               ncclNumTypes = 9
+#endif
+} ncclDataType_t;
+
+/* ncclScalarResidence_t: Location and dereferencing logic for scalar arguments. */
+typedef enum {
+  /* ncclScalarDevice: The scalar is in device-visible memory and will be
+   * dereferenced while the collective is running. */
+  ncclScalarDevice = 0,
+
+  /* ncclScalarHostImmediate: The scalar is in host-visible memory and will be
+   * dereferenced before the ncclRedOpCreate***() function returns. */
+  ncclScalarHostImmediate = 1
+} ncclScalarResidence_t;
+
+/*
+ * ncclRedOpCreatePreMulSum
+ *
+ * Creates a new reduction operator which pre-multiplies input values by a given
+ * scalar locally before reducing them with peer values via summation. For use
+ * only with collectives launched against *comm* and *datatype*. The
+ * *residence* argument indicates how/when the memory pointed to by *scalar*
+ * will be dereferenced. Upon return, the newly created operator's handle
+ * is stored in *op*.
+ */
+ncclResult_t ncclRedOpCreatePreMulSum(ncclRedOp_t *op, void *scalar, ncclDataType_t datatype, ncclScalarResidence_t residence, ncclComm_t comm);
+ncclResult_t pncclRedOpCreatePreMulSum(ncclRedOp_t *op, void *scalar, ncclDataType_t datatype, ncclScalarResidence_t residence, ncclComm_t comm);
+
+/*
+ * ncclRedOpDestroy
+ *
+ * Destroys the reduction operator *op*. The operator must have been created by
+ * ncclRedOpCreatePreMulSum with the matching communicator *comm*. An operator may be
+ * destroyed as soon as the last NCCL function which is given that operator returns.
+ */
+ncclResult_t ncclRedOpDestroy(ncclRedOp_t op, ncclComm_t comm);
+ncclResult_t pncclRedOpDestroy(ncclRedOp_t op, ncclComm_t comm);
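+
+/* Usage sketch (illustrative): building an "average" out of PreMulSum by
+ * pre-multiplying every input with 1/nranks before the summation. The scalar
+ * resides in host memory and is consumed before the create call returns
+ * (ncclScalarHostImmediate), so a stack variable is sufficient. Assumes comm,
+ * nranks, sendbuff, recvbuff, count, and stream are already set up.
+ *
+ *   float scale = 1.0f / (float)nranks;
+ *   ncclRedOp_t avgOp;
+ *   ncclRedOpCreatePreMulSum(&avgOp, &scale, ncclFloat, ncclScalarHostImmediate, comm);
+ *   ncclAllReduce(sendbuff, recvbuff, count, ncclFloat, avgOp, comm, stream);
+ *   ncclRedOpDestroy(avgOp, comm); // legal once the last call using avgOp has returned
+ */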
+
+/*
+ * Collective communication operations
+ *
+ * Collective communication operations must be called separately for each
+ * communicator in a communicator clique.
+ *
+ * They return when operations have been enqueued on the CUDA stream.
+ *
+ * Since they may perform inter-CPU synchronization, each call has to be done
+ * from a different thread or process, or must use Group Semantics (see
+ * below).
+ */
+
+/*
+ * Reduce
+ *
+ * Reduces data arrays of length count in sendbuff into recvbuff using op
+ * operation.
+ * recvbuff may be NULL on all calls except for root device.
+ * root is the rank (not the CUDA device) where data will reside after the
+ * operation is complete.
+ *
+ * In-place operation will happen if sendbuff == recvbuff.
+ */
+ncclResult_t ncclReduce(const void* sendbuff, void* recvbuff, size_t count, ncclDataType_t datatype,
+    ncclRedOp_t op, int root, ncclComm_t comm, cudaStream_t stream);
+ncclResult_t pncclReduce(const void* sendbuff, void* recvbuff, size_t count, ncclDataType_t datatype,
+    ncclRedOp_t op, int root, ncclComm_t comm, cudaStream_t stream);
+
+/*
+ * (deprecated) Broadcast (in-place)
+ *
+ * Copies count values from root to all other devices.
+ * root is the rank (not the CUDA device) where data resides before the
+ * operation is started.
+ *
+ * This operation is implicitly in place.
+ */
+ncclResult_t ncclBcast(void* buff, size_t count, ncclDataType_t datatype, int root,
+    ncclComm_t comm, cudaStream_t stream);
+ncclResult_t pncclBcast(void* buff, size_t count, ncclDataType_t datatype, int root,
+    ncclComm_t comm, cudaStream_t stream);
+
+/*
+ * Broadcast
+ *
+ * Copies count values from root to all other devices.
+ * root is the rank (not the CUDA device) where data resides before the
+ * operation is started.
+ *
+ * In-place operation will happen if sendbuff == recvbuff.
+ */
+ncclResult_t ncclBroadcast(const void* sendbuff, void* recvbuff, size_t count, ncclDataType_t datatype, int root,
+    ncclComm_t comm, cudaStream_t stream);
+ncclResult_t pncclBroadcast(const void* sendbuff, void* recvbuff, size_t count, ncclDataType_t datatype, int root,
+    ncclComm_t comm, cudaStream_t stream);
+
+/*
+ * All-Reduce
+ *
+ * Reduces data arrays of length count in sendbuff using op operation, and
+ * leaves identical copies of the result on each recvbuff.
+ *
+ * In-place operation will happen if sendbuff == recvbuff.
+ */
+ncclResult_t ncclAllReduce(const void* sendbuff, void* recvbuff, size_t count,
+    ncclDataType_t datatype, ncclRedOp_t op, ncclComm_t comm, cudaStream_t stream);
+ncclResult_t pncclAllReduce(const void* sendbuff, void* recvbuff, size_t count,
+    ncclDataType_t datatype, ncclRedOp_t op, ncclComm_t comm, cudaStream_t stream);
+
+/*
+ * Reduce-Scatter
+ *
+ * Reduces data in sendbuff using op operation and leaves the reduced result
+ * scattered over the devices so that recvbuff on rank i will contain the i-th
+ * block of the result.
+ * Assumes sendcount is equal to nranks*recvcount, which means that sendbuff
+ * should have a size of at least nranks*recvcount elements.
+ *
+ * In-place operations will happen if recvbuff == sendbuff + rank * recvcount.
+ */
+ncclResult_t ncclReduceScatter(const void* sendbuff, void* recvbuff,
+    size_t recvcount, ncclDataType_t datatype, ncclRedOp_t op, ncclComm_t comm,
+    cudaStream_t stream);
+ncclResult_t pncclReduceScatter(const void* sendbuff, void* recvbuff,
+    size_t recvcount, ncclDataType_t datatype, ncclRedOp_t op, ncclComm_t comm,
+    cudaStream_t stream);
+
+/*
+ * All-Gather
+ *
+ * Each device gathers sendcount values from other GPUs into recvbuff,
+ * receiving data from rank i at offset i*sendcount.
+ * Assumes recvcount is equal to nranks*sendcount, which means that recvbuff
+ * should have a size of at least nranks*sendcount elements.
+ *
+ * In-place operations will happen if sendbuff == recvbuff + rank * sendcount.
+ */
+ncclResult_t ncclAllGather(const void* sendbuff, void* recvbuff, size_t sendcount,
+    ncclDataType_t datatype, ncclComm_t comm, cudaStream_t stream);
+ncclResult_t pncclAllGather(const void* sendbuff, void* recvbuff, size_t sendcount,
+    ncclDataType_t datatype, ncclComm_t comm, cudaStream_t stream);
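+
+/* Usage sketch (illustrative): the two collectives targeted by the
+ * optimizations in this patch, issued back to back on one stream. Assumes
+ * comm, stream, and device buffers are already set up, with recvbuf holding
+ * at least nranks*sendcount elements for the all-gather.
+ *
+ *   // Sum `count` floats across all ranks, in place (sendbuff == recvbuff).
+ *   ncclAllReduce(buf, buf, count, ncclFloat, ncclSum, comm, stream);
+ *   // Gather each rank's `sendcount` floats; rank i's block lands at
+ *   // offset i*sendcount in recvbuf.
+ *   ncclAllGather(sendbuf, recvbuf, sendcount, ncclFloat, comm, stream);
+ *   cudaStreamSynchronize(stream); // wait for both collectives to complete
+ */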
+
+/*
+ * Send
+ *
+ * Send data from sendbuff to rank peer.
+ *
+ * Rank peer needs to call ncclRecv with the same datatype and the same count from this
+ * rank.
+ *
+ * This operation is blocking for the GPU. If multiple ncclSend and ncclRecv operations
+ * need to progress concurrently to complete, they must be fused within a ncclGroupStart/
+ * ncclGroupEnd section.
+ */
+ncclResult_t ncclSend(const void* sendbuff, size_t count, ncclDataType_t datatype, int peer,
+    ncclComm_t comm, cudaStream_t stream);
+ncclResult_t pncclSend(const void* sendbuff, size_t count, ncclDataType_t datatype, int peer,
+    ncclComm_t comm, cudaStream_t stream);
+
+/*
+ * Receive
+ *
+ * Receive data from rank peer into recvbuff.
+ *
+ * Rank peer needs to call ncclSend with the same datatype and the same count to this
+ * rank.
+ *
+ * This operation is blocking for the GPU. If multiple ncclSend and ncclRecv operations
+ * need to progress concurrently to complete, they must be fused within a ncclGroupStart/
+ * ncclGroupEnd section.
+ */
+ncclResult_t pncclRecv(void* recvbuff, size_t count, ncclDataType_t datatype, int peer,
+    ncclComm_t comm, cudaStream_t stream);
+ncclResult_t ncclRecv(void* recvbuff, size_t count, ncclDataType_t datatype, int peer,
+    ncclComm_t comm, cudaStream_t stream);
+
+/*
+ * Group semantics
+ *
+ * When managing multiple GPUs from a single thread, and since NCCL collective
+ * calls may perform inter-CPU synchronization, we need to "group" calls for
+ * different ranks/devices into a single call.
+ *
+ * Grouping NCCL calls as being part of the same collective operation is done
+ * using ncclGroupStart and ncclGroupEnd. ncclGroupStart will enqueue all
+ * collective calls until the ncclGroupEnd call, which will wait for all calls
+ * to be complete. Note that for collective communication, ncclGroupEnd only
+ * guarantees that the operations are enqueued on the streams, not that
+ * the operation is effectively done.
+ *
+ * Both collective communication and ncclCommInitRank can be used in conjunction
+ * with ncclGroupStart/ncclGroupEnd, but not together.
+ *
+ * Group semantics also allow fusing multiple operations on the same device
+ * to improve performance (for aggregated collective calls), or to permit
+ * concurrent progress of multiple send/receive operations.
+ */
+
+/*
+ * Group Start
+ *
+ * Start a group call. All calls to NCCL until ncclGroupEnd will be fused into
+ * a single NCCL operation. Nothing will be started on the CUDA stream until
+ * ncclGroupEnd.
+ */
+ncclResult_t ncclGroupStart();
+ncclResult_t pncclGroupStart();
+
+/*
+ * Group End
+ *
+ * End a group call. Start a fused NCCL operation consisting of all calls since
+ * ncclGroupStart. Operations on the CUDA stream depending on the NCCL operations
+ * need to be called after ncclGroupEnd.
+ */
+ncclResult_t ncclGroupEnd();
+ncclResult_t pncclGroupEnd();
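+
+/* Usage sketch (illustrative): a ring exchange in which every rank sends to
+ * (rank+1)%nranks and receives from (rank-1+nranks)%nranks. The send and the
+ * receive must be fused in one group so they can progress concurrently.
+ * Assumes comm, stream, rank, nranks, and device buffers are already set up.
+ *
+ *   ncclGroupStart();
+ *   ncclSend(sendbuf, count, ncclFloat, (rank + 1) % nranks, comm, stream);
+ *   ncclRecv(recvbuf, count, ncclFloat, (rank - 1 + nranks) % nranks, comm, stream);
+ *   ncclGroupEnd(); // both operations are enqueued on `stream` here
+ */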
+
+#ifdef __cplusplus
+} // end extern "C"
+#endif
+
+#endif // end include guard
diff --git a/my_optims/docker/nccl_net.h b/my_optims/docker/nccl_net.h
new file mode 100644
index 00000000..a387e66d
--- /dev/null
+++ b/my_optims/docker/nccl_net.h
@@ -0,0 +1,313 @@
+/*************************************************************************
+ * Copyright (c) 2017-2022, NVIDIA CORPORATION. All rights reserved.
+ *
+ * See LICENSE.txt for license information
+ ************************************************************************/
+
+#ifndef NCCL_NET_H_
+#define NCCL_NET_H_
+
+#include "nccl.h"
+#include <stdint.h>
+
+#define NCCL_NET_HANDLE_MAXSIZE 128
+
+#define NCCL_PTR_HOST 0x1
+#define NCCL_PTR_CUDA 0x2
+#define NCCL_PTR_DMABUF 0x4
+
+// Maximum number of requests per comm object
+#define NCCL_NET_MAX_REQUESTS 8
+
+typedef enum {NCCL_LOG_NONE=0, NCCL_LOG_VERSION=1, NCCL_LOG_WARN=2, NCCL_LOG_INFO=3, NCCL_LOG_ABORT=4, NCCL_LOG_TRACE=5} ncclDebugLogLevel;
+typedef enum {NCCL_INIT=1, NCCL_COLL=2, NCCL_P2P=4, NCCL_SHM=8, NCCL_NET=16, NCCL_GRAPH=32, NCCL_TUNING=64, NCCL_ENV=128, NCCL_ALLOC=256, NCCL_CALL=512, NCCL_PROXY=1024, NCCL_NVLS=2048, NCCL_ALL=~0} ncclDebugLogSubSys;
+
+typedef void (*ncclDebugLogger_t)(ncclDebugLogLevel level, unsigned long flags, const char *file, int line, const char *fmt, ...);
+
+typedef struct {
+  char* name;     // Used mostly for logging.
+  char* pciPath;  // Path to the PCI device in /sys.
+  uint64_t guid;  // Unique identifier for the NIC chip. Important for
+                  // cards with multiple PCI functions (Physical or virtual).
+  int ptrSupport; // [NCCL_PTR_HOST|NCCL_PTR_CUDA|NCCL_PTR_DMABUF]
+  int speed;      // Port speed in Mbps.
+  int port;       // Port number.
+  float latency;  // Network latency
+  int maxComms;   // Maximum number of comms we can create
+  int maxRecvs;   // Maximum number of grouped receives.
+} ncclNetProperties_v6_t;
+
+typedef ncclNetProperties_v6_t ncclNetProperties_t;
+
+typedef struct {
+  // Name of the network (mainly for logs)
+  const char* name;
+  // Initialize the network.
+  ncclResult_t (*init)(ncclDebugLogger_t logFunction);
+  // Return the number of adapters.
+  ncclResult_t (*devices)(int* ndev);
+  // Get various device properties.
+  ncclResult_t (*getProperties)(int dev, ncclNetProperties_v6_t* props);
+  // Create a receiving object and provide a handle to connect to it. The
+  // handle can be up to NCCL_NET_HANDLE_MAXSIZE bytes and will be exchanged
+  // between ranks to create a connection.
+  ncclResult_t (*listen)(int dev, void* handle, void** listenComm);
+  // Connect to a handle and return a sending comm object for that peer.
+  // This call must not block for the connection to be established, and instead
+  // should return successfully with sendComm == NULL with the expectation that
+  // it will be called again until sendComm != NULL.
+  ncclResult_t (*connect)(int dev, void* handle, void** sendComm);
+  // Finalize connection establishment after remote peer has called connect.
+  // This call must not block for the connection to be established, and instead
+  // should return successfully with recvComm == NULL with the expectation that
+  // it will be called again until recvComm != NULL.
+  ncclResult_t (*accept)(void* listenComm, void** recvComm);
+  // Register/Deregister memory. Comm can be either a sendComm or a recvComm.
+  // Type is either NCCL_PTR_HOST or NCCL_PTR_CUDA.
+ ncclResult_t (*regMr)(void* comm, void* data, int size, int type, void** mhandle); + /* DMA-BUF support */ + ncclResult_t (*regMrDmaBuf)(void* comm, void* data, size_t size, int type, uint64_t offset, int fd, void** mhandle); + ncclResult_t (*deregMr)(void* comm, void* mhandle); + // Asynchronous send to a peer. + // May return request == NULL if the call cannot be performed (or would block) + ncclResult_t (*isend)(void* sendComm, void* data, int size, int tag, void* mhandle, void** request); + // Asynchronous recv from a peer. + // May return request == NULL if the call cannot be performed (or would block) + ncclResult_t (*irecv)(void* recvComm, int n, void** data, int* sizes, int* tags, void** mhandles, void** request); + // Perform a flush/fence to make sure all data received with NCCL_PTR_CUDA is + // visible to the GPU + ncclResult_t (*iflush)(void* recvComm, int n, void** data, int* sizes, void** mhandles, void** request); + // Test whether a request is complete. If size is not NULL, it returns the + // number of bytes sent/received. + ncclResult_t (*test)(void* request, int* done, int* sizes); + // Close and free send/recv comm objects + ncclResult_t (*closeSend)(void* sendComm); + ncclResult_t (*closeRecv)(void* recvComm); + ncclResult_t (*closeListen)(void* listenComm); +} ncclNet_v6_t; + +typedef ncclNet_v6_t ncclNet_t; + +#define NCCL_PLUGIN_SYMBOL ncclNetPlugin_v6 + +typedef struct { + // Name of the collective network (mainly for logs) + const char* name; + // Initialize the collective network. + ncclResult_t (*init)(ncclDebugLogger_t logFunction); + // Return the number of adapters capable of doing collective operations. + // If ndev returns 0, all other functions might be set to NULL. + ncclResult_t (*devices)(int* ndev); + // Get various device properties. + ncclResult_t (*getProperties)(int dev, ncclNetProperties_v6_t* props); + // Create a receiving object and provide a handle to connect to it. The + // handle can be up to NCCL_NET_HANDLE_MAXSIZE bytes and will be exchanged + // between ranks to create connections. + ncclResult_t (*listen)(int dev, void* handle, void** listenComm); + // Create a group for collective operations. handles have been created + // using listen() above. rank indicates caller's rank in the collective network. + ncclResult_t (*connect)(void* handles[], int nranks, int rank, void* listenComm, void** collComm); + // Returns whether a reduction operation on a data type is supported. + // 1 for supported, 0 otherwise. + ncclResult_t (*reduceSupport)(ncclDataType_t dataType, ncclRedOp_t redOp, int* supported); + // Register/Deregister memory. Type is either NCCL_PTR_HOST or NCCL_PTR_CUDA. + ncclResult_t (*regMr)(void* collComm, void* data, int size, int type, void** mhandle); + /* DMA-BUF support */ + ncclResult_t (*regMrDmaBuf)(void* collComm, void* data, size_t size, int type, uint64_t offset, int fd, void** mhandle); + ncclResult_t (*deregMr)(void* collComm, void* mhandle); + // Performs an asynchronous allreduce operation on the collective group. + // May return request == NULL if the call cannot be performed (or would block). 
+ ncclResult_t (*iallreduce)(void* collComm, void* sendData, void* recvData, int count, + ncclDataType_t dataType, ncclRedOp_t redOp, void* sendMhandle, void* recvMhandle, void** request); + // Perform a flush/fence to make sure all data received with NCCL_PTR_CUDA is + // visible to the GPU + ncclResult_t (*iflush)(void* collComm, void* data, int size, void* mhandle, void** request); + // Test whether a request is complete. If size is not NULL, it returns the + // number of bytes sent/received. + ncclResult_t (*test)(void* request, int* done, int* size); + // Close and free collective comm objects + ncclResult_t (*closeColl)(void* collComm); + ncclResult_t (*closeListen)(void* listenComm); +} ncclCollNet_v6_t; + +typedef ncclCollNet_v6_t ncclCollNet_t; + +#define NCCL_COLLNET_PLUGIN_SYMBOL ncclCollNetPlugin_v6 + +// v5 struct for backwards compatibility +typedef struct { + // Name of the network (mainly for logs) + const char* name; + // Initialize the network. + ncclResult_t (*init)(ncclDebugLogger_t logFunction); + // Return the number of adapters. + ncclResult_t (*devices)(int* ndev); + // Get various device properties. + ncclResult_t (*getProperties)(int dev, ncclNetProperties_v6_t* props); + // Create a receiving object and provide a handle to connect to it. The + // handle can be up to NCCL_NET_HANDLE_MAXSIZE bytes and will be exchanged + // between ranks to create a connection. + ncclResult_t (*listen)(int dev, void* handle, void** listenComm); + // Connect to a handle and return a sending comm object for that peer. + // This call must not block for the connection to be established, and instead + // should return successfully with sendComm == NULL with the expectation that + // it will be called again until sendComm != NULL. + ncclResult_t (*connect)(int dev, void* handle, void** sendComm); + // Finalize connection establishment after remote peer has called connect. + // This call must not block for the connection to be established, and instead + // should return successfully with recvComm == NULL with the expectation that + // it will be called again until recvComm != NULL. + ncclResult_t (*accept)(void* listenComm, void** recvComm); + // Register/Deregister memory. Comm can be either a sendComm or a recvComm. + // Type is either NCCL_PTR_HOST or NCCL_PTR_CUDA. + ncclResult_t (*regMr)(void* comm, void* data, int size, int type, void** mhandle); + ncclResult_t (*deregMr)(void* comm, void* mhandle); + // Asynchronous send to a peer. + // May return request == NULL if the call cannot be performed (or would block) + ncclResult_t (*isend)(void* sendComm, void* data, int size, int tag, void* mhandle, void** request); + // Asynchronous recv from a peer. + // May return request == NULL if the call cannot be performed (or would block) + ncclResult_t (*irecv)(void* recvComm, int n, void** data, int* sizes, int* tags, void** mhandles, void** request); + // Perform a flush/fence to make sure all data received with NCCL_PTR_CUDA is + // visible to the GPU + ncclResult_t (*iflush)(void* recvComm, int n, void** data, int* sizes, void** mhandles, void** request); + // Test whether a request is complete. If size is not NULL, it returns the + // number of bytes sent/received. 
+ ncclResult_t (*test)(void* request, int* done, int* sizes); + // Close and free send/recv comm objects + ncclResult_t (*closeSend)(void* sendComm); + ncclResult_t (*closeRecv)(void* recvComm); + ncclResult_t (*closeListen)(void* listenComm); +} ncclNet_v5_t; + +// v5 struct for backwards compatibility +typedef struct { + // Name of the collective network (mainly for logs) + const char* name; + // Initialize the collective network. + ncclResult_t (*init)(ncclDebugLogger_t logFunction); + // Return the number of adapters capable of doing collective operations. + // If ndev returns 0, all other functions might be set to NULL. + ncclResult_t (*devices)(int* ndev); + // Get various device properties. + ncclResult_t (*getProperties)(int dev, ncclNetProperties_v6_t* props); + // Create a receiving object and provide a handle to connect to it. The + // handle can be up to NCCL_NET_HANDLE_MAXSIZE bytes and will be exchanged + // between ranks to create connections. + ncclResult_t (*listen)(int dev, void* handle, void** listenComm); + // Create a group for collective operations. handles have been created + // using listen() above. rank indicates caller's rank in the collective network. + ncclResult_t (*connect)(void* handles[], int nranks, int rank, void* listenComm, void** collComm); + // Returns whether a reduction operation on a data type is supported. + // 1 for supported, 0 otherwise. + ncclResult_t (*reduceSupport)(ncclDataType_t dataType, ncclRedOp_t redOp, int* supported); + // Register/Deregister memory. Type is either NCCL_PTR_HOST or NCCL_PTR_CUDA. + ncclResult_t (*regMr)(void* collComm, void* data, int size, int type, void** mhandle); + ncclResult_t (*deregMr)(void* collComm, void* mhandle); + // Performs an asynchronous allreduce operation on the collective group. + // May return request == NULL if the call cannot be performed (or would block). + ncclResult_t (*iallreduce)(void* collComm, void* sendData, void* recvData, int count, + ncclDataType_t dataType, ncclRedOp_t redOp, void* sendMhandle, void* recvMhandle, void** request); + // Perform a flush/fence to make sure all data received with NCCL_PTR_CUDA is + // visible to the GPU + ncclResult_t (*iflush)(void* collComm, void* data, int size, void* mhandle, void** request); + // Test whether a request is complete. If size is not NULL, it returns the + // number of bytes sent/received. + ncclResult_t (*test)(void* request, int* done, int* size); + // Close and free collective comm objects + ncclResult_t (*closeColl)(void* collComm); + ncclResult_t (*closeListen)(void* listenComm); +} ncclCollNet_v5_t; + +// v4 struct for backwards compatibility +typedef struct { + char* name; // Used mostly for logging. + char* pciPath; // Path to the PCI device in /sys. + uint64_t guid; // Unique identifier for the NIC chip. Important for + // cards with multiple PCI functions (Physical or virtual). + int ptrSupport; // NCCL_PTR_HOST or NCCL_PTR_HOST|NCCL_PTR_CUDA + int speed; // Port speed in Mbps. + int port; // Port number. + int maxComms; // Maximum number of comms we can create +} ncclNetProperties_v4_t; + +// v4 struct for backwards compatibility +typedef struct { + // Name of the network (mainly for logs) + const char* name; + // Initialize the network. + ncclResult_t (*init)(ncclDebugLogger_t logFunction); + // Return the number of adapters. + ncclResult_t (*devices)(int* ndev); + // Get various device properties. 
+ ncclResult_t (*getProperties)(int dev, ncclNetProperties_v4_t* props); + // Create a receiving object and provide a handle to connect to it. The + // handle can be up to NCCL_NET_HANDLE_MAXSIZE bytes and will be exchanged + // between ranks to create a connection. + ncclResult_t (*listen)(int dev, void* handle, void** listenComm); + // Connect to a handle and return a sending comm object for that peer. + ncclResult_t (*connect)(int dev, void* handle, void** sendComm); + // Finalize connection establishment after remote peer has called connectHandle + ncclResult_t (*accept)(void* listenComm, void** recvComm); + // Register/Deregister memory. Comm can be either a sendComm or a recvComm. + // Type is either NCCL_PTR_HOST or NCCL_PTR_CUDA. + ncclResult_t (*regMr)(void* comm, void* data, int size, int type, void** mhandle); + ncclResult_t (*deregMr)(void* comm, void* mhandle); + // Asynchronous send to a peer. + // May return request == NULL if the call cannot be performed (or would block) + ncclResult_t (*isend)(void* sendComm, void* data, int size, void* mhandle, void** request); + // Asynchronous recv from a peer. + // May return request == NULL if the call cannot be performed (or would block) + ncclResult_t (*irecv)(void* recvComm, void* data, int size, void* mhandle, void** request); + // Perform a flush/fence to make sure all data received with NCCL_PTR_CUDA is + // visible to the GPU + ncclResult_t (*iflush)(void* recvComm, void* data, int size, void* mhandle, void** request); + // Test whether a request is complete. If size is not NULL, it returns the + // number of bytes sent/received. + ncclResult_t (*test)(void* request, int* done, int* size); + // Close and free send/recv comm objects + ncclResult_t (*closeSend)(void* sendComm); + ncclResult_t (*closeRecv)(void* recvComm); + ncclResult_t (*closeListen)(void* listenComm); +} ncclNet_v4_t; + +// v4 struct for backwards compatibility +typedef struct { + // Name of the collective network (mainly for logs) + const char* name; + // Initialize the collective network. + ncclResult_t (*init)(ncclDebugLogger_t logFunction); + // Return the number of adapters capable of doing collective operations. + // If ndev returns 0, all other functions might be set to NULL. + ncclResult_t (*devices)(int* ndev); + // Get various device properties. + ncclResult_t (*getProperties)(int dev, ncclNetProperties_v4_t* props); + // Create a receiving object and provide a handle to connect to it. The + // handle can be up to NCCL_NET_HANDLE_MAXSIZE bytes and will be exchanged + // between ranks to create connections. + ncclResult_t (*listen)(int dev, void* handle, void** listenComm); + // Create a group for collective operations. handles have been created + // using listen() above. rank indicates caller's rank in the collective network. + ncclResult_t (*connect)(void* handles[], int nranks, int rank, void* listenComm, void** collComm); + // Returns whether a reduction operation on a data type is supported. + // 1 for supported, 0 otherwise. + ncclResult_t (*reduceSupport)(ncclDataType_t dataType, ncclRedOp_t redOp, int* supported); + // Register/Deregister memory. Type is either NCCL_PTR_HOST or NCCL_PTR_CUDA. + ncclResult_t (*regMr)(void* collComm, void* data, int size, int type, void** mhandle); + ncclResult_t (*deregMr)(void* collComm, void* mhandle); + // Performs an asynchronous allreduce operation on the collective group. + // May return request == NULL if the call cannot be performed (or would block). 
+ ncclResult_t (*iallreduce)(void* collComm, void* sendData, void* recvData, int count, + ncclDataType_t dataType, ncclRedOp_t redOp, void* sendMhandle, void* recvMhandle, void** request); + // Perform a flush/fence to make sure all data received with NCCL_PTR_CUDA is + // visible to the GPU + ncclResult_t (*iflush)(void* collComm, void* data, int size, void* mhandle, void** request); + // Test whether a request is complete. If size is not NULL, it returns the + // number of bytes sent/received. + ncclResult_t (*test)(void* request, int* done, int* size); + // Close and free collective comm objects + ncclResult_t (*closeColl)(void* collComm); + ncclResult_t (*closeListen)(void* listenComm); +} ncclCollNet_v4_t; + +#endif // end include guard