tp optims

2025-09-11 04:14:52 +00:00 · 2023-11-22 19:58:40 +08:00 · 2023-11-22 19:58:40 +08:00 · 2fd1156d5a
commit 2fd1156d5a
parent dad29f7299
26 changed files with 7255 additions and 0 deletions
--- a/my_optims/nccl_test/CMakeLists.txt
+++ b/my_optims/nccl_test/CMakeLists.txt
@ -0,0 +1,17 @@
 # Copyright (c) 2019-2023, NVIDIA CORPORATION.  All rights reserved.
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
 # You may obtain a copy of the License at
 #
 #     http://www.apache.org/licenses/LICENSE-2.0
 #
 # Unless required by applicable law or agreed to in writing, software
 # distributed under the License is distributed on an "AS IS" BASIS,
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
 add_executable(test_nccl test_nccl.cc)
 target_link_libraries(test_nccl PUBLIC -lcublas -lcublasLt -lcudart
                      nvtx_utils mpi_utils nccl_utils memory_utils)
--- a/my_optims/nccl_test/my_custom_comm.cc
+++ b/my_optims/nccl_test/my_custom_comm.cc
@ -0,0 +1,171 @@
 #include <ATen/ATen.h>
 #include <c10/cuda/CUDAStream.h>
 #include <cuda_runtime.h>
 #include <mpi.h>
 #include <nccl.h>
 #include <string>
 #include <torch/extension.h>
 struct NcclParam {
    int          rank_{0};
    int          world_size_{1};
    ncclUniqueId nccl_uid_;
    ncclComm_t   nccl_comm_ = nullptr;
    cudaStream_t stream_;
    NcclParam(): rank_(0), world_size_(1), nccl_comm_(nullptr){};
    NcclParam(int rank, int world_size): rank_(rank), world_size_(world_size){};
    NcclParam(NcclParam const& param):
        rank_(param.rank_), world_size_(param.world_size_), nccl_uid_(param.nccl_uid_), nccl_comm_(param.nccl_comm_), stream_(param.stream_){};
 };
 #define NCCLCHECK(cmd)                                                                                                 \
    do {                                                                                                               \
        ncclResult_t r = cmd;                                                                                          \
        if (r != ncclSuccess) {                                                                                        \
            printf("Failed, NCCL error %s:%d '%s'\n", __FILE__, __LINE__, ncclGetErrorString(r));                      \
            exit(EXIT_FAILURE);                                                                                        \
        }                                                                                                              \
    } while (0)
 std::tuple<NcclParam*, NcclParam*> init_nccl(int tensor_para_size, int pipeline_para_size)
 {
    // int    argc = 0;
    // char** argv = nullptr;
    // // 初始化 MPI
    // MPI_Init(&argc, &argv);
    int rank, world_size;
    MPI_Comm_rank(MPI_COMM_WORLD, &rank);        // 获取当前进程的 rank
    MPI_Comm_size(MPI_COMM_WORLD, &world_size);  // 获取进程总数
    // printf("rank:%d, world_size:%d\n", rank, world_size);
    // 设定gpu
    int device, device_count;
    cudaGetDeviceCount(&device_count);
    cudaSetDevice(rank % device_count);
    cudaGetDevice(&device);
    struct cudaDeviceProp prop;
    cudaGetDeviceProperties(&prop, device);
    int mpi_initialized;
    MPI_Initialized(&mpi_initialized);
    static NcclParam tensor_para;
    static NcclParam pipeline_para;
    // Convert WORLD communicator into 2D grid (k * n) communicator.
    // row = a tensor parallel group, col = a pipeline parallel group.
    MPI_Comm grid_comm, tp_comm, pp_comm;
    int dims[2]    = {pipeline_para_size, tensor_para_size};
    int periods[2] = {0, 0};
    MPI_Cart_create(MPI_COMM_WORLD, 2, dims, periods, 0, &grid_comm);
    // Split 2D communicator into rows and cols.
    int tp_remain_dims[2] = {false, true};
    int pp_remain_dims[2] = {true, false};
    MPI_Cart_sub(grid_comm, tp_remain_dims, &tp_comm);
    MPI_Cart_sub(grid_comm, pp_remain_dims, &pp_comm);
    int tp_rank, pp_rank;
    MPI_Comm_rank(tp_comm, &tp_rank);
    MPI_Comm_rank(pp_comm, &pp_rank);
    printf("tp_rank:%d, pp_rank:%d\n", tp_rank, pp_rank);
    ncclUniqueId tp_uid;
    ncclUniqueId pp_uid;
    // The root of each group creates a nccl uid.
    if (tp_rank == 0) {
        NCCLCHECK(ncclGetUniqueId(&tp_uid));
    }
    if (pp_rank == 0) {
        NCCLCHECK(ncclGetUniqueId(&pp_uid));
    }
    // Broadcast nccl uid to share the same nccl uid across gpus in the same group.
    MPI_Bcast(&tp_uid, sizeof(tp_uid), MPI_BYTE, 0, tp_comm);
    MPI_Bcast(&pp_uid, sizeof(pp_uid), MPI_BYTE, 0, pp_comm);
    ncclComm_t tp_nccl_comm, pp_nccl_comm;
    NCCLCHECK(ncclCommInitRank(&tp_nccl_comm, tensor_para_size, tp_uid, tp_rank));
    NCCLCHECK(ncclCommInitRank(&pp_nccl_comm, pipeline_para_size, pp_uid, pp_rank));
    tensor_para.world_size_   = tensor_para_size;
    tensor_para.rank_         = tp_rank;
    tensor_para.nccl_uid_     = tp_uid;
    tensor_para.nccl_comm_    = tp_nccl_comm;
    cudaStreamCreate(&tensor_para.stream_);
    pipeline_para.world_size_ = pipeline_para_size;
    pipeline_para.rank_       = pp_rank;
    pipeline_para.nccl_uid_   = pp_uid;
    pipeline_para.nccl_comm_  = pp_nccl_comm;
    NcclParam* tensor_para_ptr   = &tensor_para;
    NcclParam* pipeline_para_ptr = &pipeline_para;
    return std::make_tuple(tensor_para_ptr, pipeline_para_ptr);
 }
 void finalize_nccl(NcclParam* tensor_para_ptr, NcclParam* pipeline_para_ptr)
 {
    // 销毁nccl
    NcclParam tensor_para   = *tensor_para_ptr;
    NcclParam pipeline_para = *pipeline_para_ptr;
    if (tensor_para.nccl_comm_ != nullptr) {
        ncclCommDestroy(tensor_para.nccl_comm_);
    }
    if (pipeline_para.nccl_comm_ != nullptr) {
        ncclCommDestroy(pipeline_para.nccl_comm_);
    }
    // MPI_Finalize();
 }
 ncclDataType_t getNcclDataType(torch::ScalarType torch_type)
 {
    ncclDataType_t nccl_type;
    if (torch_type == torch::kFloat16) {
        nccl_type = ncclHalf;
    }
    else if (torch_type == torch::kFloat32) {
        nccl_type = ncclFloat;
    }
    else if (torch_type == torch::kFloat64) {
        nccl_type = ncclDouble;
    }
    else if (torch_type == torch::kInt32) {
        nccl_type = ncclInt32;
    }
    else if (torch_type == torch::kInt64) {
        nccl_type = ncclInt64;
    }
    else if (torch_type == torch::kInt8) {
        nccl_type = ncclInt8;
    }
    else {
        printf("[ERROR] NCCL only support float, half, int \n");
        exit(-1);
    }
    return nccl_type;
 }
 void custom_allreduce(torch::Tensor tensor, NcclParam* nccl_param_ptr)
 {
    void*             data_ptr   = tensor.data_ptr();
    size_t            count      = tensor.numel();
    torch::ScalarType torch_type = tensor.scalar_type();
    NcclParam      nccl_param     = *nccl_param_ptr;
    // cudaStream_t   stream         = at::cuda::getCurrentCUDAStream();
    ncclDataType_t nccl_data_type = getNcclDataType(torch_type);
    NCCLCHECK(ncclGroupStart());
    NCCLCHECK(ncclAllReduce(
        (const void*)data_ptr, (void*)data_ptr, count, nccl_data_type, ncclSum, nccl_param.nccl_comm_, nccl_param.stream_));
    NCCLCHECK(ncclGroupEnd());
 }
 PYBIND11_MODULE(TORCH_EXTENSION_NAME, m)
 {
    py::class_<NcclParam>(m, "NcclParam").def(py::init<>());
    m.def("init_nccl", &init_nccl, py::return_value_policy::reference, "");
    m.def("finalize_nccl", &finalize_nccl, "");
    m.def("custom_allreduce", &custom_allreduce, "");
 }
--- a/my_optims/nccl_test/setup.py
+++ b/my_optims/nccl_test/setup.py
@ -0,0 +1,12 @@
 from setuptools import setup
 from torch.utils.cpp_extension import BuildExtension, CUDAExtension
 setup(name='my_custom_comm',
      include_dirs=["/usr/local/cuda/targets/x86_64-linux/include/",
                    ],
      ext_modules=[CUDAExtension('my_custom_comm',
                                 ['my_custom_comm.cc'],
                                 libraries=["mpi"]
                                 ),],
      cmdclass={'build_ext': BuildExtension},
      )
--- a/my_optims/nccl_test/test_extension.py
+++ b/my_optims/nccl_test/test_extension.py
@ -0,0 +1,22 @@
 import torch
 import my_custom_comm
 from mpi4py import MPI
 COMM = MPI.COMM_WORLD
 rank = COMM.Get_rank()
 world_size = COMM.Get_size()
 tp_ptr, pp_ptr = my_custom_comm.init_nccl(2, 1)
 print(tp_ptr)
 device = rank % torch.cuda.device_count()
 torch.cuda.set_device(device)
 torch.cuda.set_per_process_memory_fraction(1., device)
 print(rank, world_size)
 t = torch.tensor([[1, 2, 3, 4], [3, 3, 3, 3.1]],
                 dtype=torch.float16).to('cuda')
 print(my_custom_comm.custom_allreduce(t, tp_ptr))
 print(t)
 torch.cuda.synchronize()
 my_custom_comm.finalize_nccl(tp_ptr, pp_ptr)
--- a/my_optims/nccl_test/test_extension_torch.py
+++ b/my_optims/nccl_test/test_extension_torch.py
@ -0,0 +1,32 @@
 import torch
 import torch.distributed as dist
 import my_custom_comm
 import os
 assert dist.is_mpi_available()
 # rank, world_size = int(os.getenv('RANK')), int(os.getenv('WORLD_SIZE'))
 print(os.environ)
 dist.init_process_group(backend=dist.Backend.MPI, 
                        # rank=rank, 
                        # world_size=2
                        )
 assert dist.is_initialized()
 rank = dist.get_rank()
 world_size = dist.get_world_size()
 print(f'{rank=},{world_size=}')
 # tp_ptr, pp_ptr = my_custom_comm.init_nccl(2, 1)
 # print(tp_ptr)
 # device = rank % torch.cuda.device_count()
 # torch.cuda.set_device(device)
 # torch.cuda.set_per_process_memory_fraction(1., device)
 # print(rank, world_size)
 # t = torch.tensor([[1, 2, 3, 4], [3, 3, 3, 3.1]],
 #                  dtype=torch.float16).to('cuda')
 # print(my_custom_comm.custom_allreduce(t, tp_ptr))
 # print(t)
 # torch.cuda.synchronize()
 # my_custom_comm.finalize_nccl(tp_ptr, pp_ptr)
--- a/my_optims/nccl_test/test_nccl.cc
+++ b/my_optims/nccl_test/test_nccl.cc
@ -0,0 +1,88 @@
 #include "src/fastertransformer/utils/mpi_utils.h"
 #include "src/fastertransformer/utils/nccl_utils.h"
 #include "src/fastertransformer/utils/nvtx_utils.h"
 #include "src/fastertransformer/utils/memory_utils.h"
 #include <string>
 #include <cuda_profiler_api.h>
 using namespace fastertransformer;
 template<typename T>
 void test_nccl();
 int main(int argc, char **argv){
    mpi::initialize(&argc, &argv);
    test_nccl<half>();
    mpi::finalize();
    return 0;
 }
 template<typename T>
 void test_nccl()
 {
    // int rank       = 0;
    // int world_size = 1;
    int rank       = mpi::getCommWorldRank();
    int world_size = mpi::getCommWorldSize();
    if (rank == 0) {
        printf("Total ranks: %d.\n", world_size);
    }
    int device, device_count;
    check_cuda_error(cudaGetDeviceCount(&device_count));
    check_cuda_error(cudaSetDevice(rank % device_count));
    check_cuda_error(cudaGetDevice(&device));
    struct cudaDeviceProp prop;
    check_cuda_error(cudaGetDeviceProperties(&prop, device));
    printf("Device %s\n", prop.name);
    printf("P%d is running with GPU #%d.\n", rank, device);
    print_mem_usage();
    // 声明数据指针
    std::vector<T*> weights_ptr        = std::vector<T*>(1);
    size_t shape = 4096 * 2048;
    deviceMalloc(&weights_ptr[0], shape, true); // 从gpu中开辟空间，并随即初始化
    // deviceFill(weights_ptr[0], (size_t)shape, (T)2.0); // 给gpu空间赋值
    // 初始化nccl
    int tensor_para_size   = 2;
    int pipeline_para_size = 1;
    NcclParam tensor_para;
    NcclParam pipeline_para;
    ftNcclInitialize(tensor_para, pipeline_para, tensor_para_size, pipeline_para_size);
    std::cout << "tensor_para info:" << tensor_para.rank_ << std::endl;
    cudaStream_t     stream;
    cudaStreamCreate(&stream);
    mpi::barrier();
    cudaProfilerStart();
    ft_nvtx::setScope("run_time");
    PUSH_RANGE("run time")
    for (int i = 0; i < 32; i++) {
        cudaDeviceSynchronize();
        ftNcclAllReduceSum(weights_ptr[0], weights_ptr[0], shape, tensor_para, stream);
        cudaDeviceSynchronize();
        deviceMalloc(&weights_ptr[0], shape, true);
    }
    mpi::barrier();
    POP_RANGE;
    ft_nvtx::resetScope();
    // T* hBuf = new T[shape]; // 开辟CPU空间
    // cudaD2Hcpy(hBuf, weights_ptr[0], shape);
    // { // 从cpu打印，查看数据
    //     for (size_t i = 0; i < shape; i++) {
    //         printf("%f ", (float)hBuf[i]);
    //     }
    //     std::cout << std::endl;
    // }
    // delete[] hBuf;
    return;
 }
--- a/my_optims/nccl_test/torch_nccl.py
+++ b/my_optims/nccl_test/torch_nccl.py
@ -0,0 +1,30 @@
 import torch
 import torch.distributed as dist
 import os
 import time
 def test_nccl():
    rank, world_size = int(os.getenv('RANK')), int(os.getenv('WORLD_SIZE'))
    dist.init_process_group(backend='nccl', 
                            rank=rank, 
                            world_size=world_size)
    process_group = dist.group.WORLD
    torch.cuda.set_device(rank % world_size)
    shape = 4096 * 2048
    weight = torch.randn([shape], dtype=torch.float16).to("cuda")
    # nccl test
    dist.barrier(process_group)
    for i in range(32):
        torch.cuda.synchronize()
        dist.all_reduce(weight, group=process_group)
        torch.cuda.synchronize()
        weight = torch.randn([shape], dtype=torch.float16).to("cuda")
    dist.barrier(process_group)
 if __name__ == '__main__':
    test_nccl()
--- a/my_optims/optims/test_llama_fa.py
+++ b/my_optims/optims/test_llama_fa.py
@ -0,0 +1,709 @@
 # encoding:utf-8
 # -------------------------------------------#
 # Filename: optims -- test_llama_fa.py
 #
 # Description:
 # Version:       1.0
 # Created:       2023/9/18-20:50
 # Last modified by:
 # Author:        'zhaohuayang@myhexin.com'
 # Company:       同花顺网络信息股份有限公司
 # -------------------------------------------#
 import math
 import time
 from pathlib import Path
 from typing import List, Dict, Optional
 from typing import Tuple
 # Flash attention imports
 import dropout_layer_norm
 import flash_attn_2_cuda
 import numpy as np
 import rotary_emb
 import torch
 import torch.distributed
 import transformers
 from safetensors import safe_open
 from torch import nn
 from torch.nn import functional as F
 from transformers.activations import ACT2FN
 from vllm import attention_ops, cache_ops
 # vllm imports
 BLOCK_SIZE = 16
 class FastLinear(nn.Module):
    def __init__(
            self,
            weight,
            bias,
    ) -> None:
        super().__init__()
        self.weight = nn.Parameter(weight)
        if bias is not None:
            self.bias = nn.Parameter(bias)
        else:
            self.bias = None
    @classmethod
    def load(cls, config, prefix: str, weights, bias: bool):
        weight = weights.get_tensor(f"{prefix}.weight")
        if bias:
            bias = weights.get_tensor(f"{prefix}.bias")
        else:
            bias = None
        return cls(weight, bias)
    def forward(self, input: torch.Tensor) -> torch.Tensor:
        return F.linear(input, self.weight, self.bias)
 class PositionRotaryEmbedding(nn.Module):
    def __init__(self, inv_freq, scaling_factor):
        super().__init__()
        self.inv_freq = inv_freq
        self._seq_len_cached = 0
        self._cos_cached = None
        self._sin_cached = None
        self._cos_k_cached = None
        self._sin_k_cached = None
        self.scaling_factor = scaling_factor
        self.dynamic_args = None
    @staticmethod
    def _create_inv_freq(dim, base, device):
        inv_freq = 1.0 / (
                base
                ** (torch.arange(0, dim, 2, device=device, dtype=torch.float32) / dim)
        )
        return inv_freq
    @classmethod
    def static(cls, config, dim, base, device):
        inv_freq = cls._create_inv_freq(dim, base, device)
        scaling_factor = None
        return cls(inv_freq, scaling_factor)
    def _update_cos_sin_cache(self, dtype, device, seqlen):
        # Reset the tables if the sequence length has changed,
        # or if we're on a new device (possibly due to tracing for instance)
        if (
                seqlen > self._seq_len_cached
                or self._cos_cached.device != device
                or self._cos_cached.dtype != dtype
        ):
            self._seq_len_cached = seqlen
            t = torch.arange(seqlen, device=device, dtype=self.inv_freq.dtype)
            if self.scaling_factor is not None:
                t /= self.scaling_factor
            # Don't do einsum, it converts fp32 to fp16
            # freqs = torch.einsum("i,j->ij", t, self.inv_freq)
            freqs = torch.outer(t, self.inv_freq.to(device=t.device))
            self._cos_cached = torch.cos(freqs).to(dtype)
            self._sin_cached = torch.sin(freqs).to(dtype)
    def get_cos_sin(
            self, position_ids: torch.Tensor, max_s: int, dtype: torch.dtype
    ):
        """
        Return cos and sin for the asked position ids
        """
        self._update_cos_sin_cache(dtype, position_ids.device, max_s)
        cos = torch.index_select(self._cos_cached, 0, position_ids)
        sin = torch.index_select(self._sin_cached, 0, position_ids)
        return cos.unsqueeze(1), sin.unsqueeze(1)
    def forward(self, x: torch.Tensor, cos: torch.Tensor, sin: torch.Tensor):
        rotary_dim = cos.shape[-1]
        x1 = x[..., :rotary_dim]
        x2 = x[..., rotary_dim: 2 * rotary_dim]
        rotary_emb.apply_rotary(x1, x2, cos, sin, x1, x2, False)
        return x
 class Weights:
    def __init__(
            self,
            filenames: List[Path],
            device,
            dtype,
            process_group,
            aliases: Optional[Dict[str, List[str]]] = None,
    ):
        routing = {}
        for filename in filenames:
            with safe_open(filename, framework="pytorch") as f:
                for k in f.keys():
                    if k in routing:
                        raise RuntimeError(
                            f"Key {k} was found in multiple files: {filename} and {routing[k]}"
                        )
                    routing[k] = filename
        if aliases is None:
            aliases = {}
        self.aliases = aliases
        self.routing = routing
        self.device = device
        self.dtype = dtype
        self.process_group = process_group
        self._handles = {}
    def _get_handle(self, filename):
        if filename not in self._handles:
            f = safe_open(filename, framework="pytorch")
            self._handles[filename] = f
        return self._handles[filename]
    def get_filename(self, tensor_name: str) -> (str, str):
        filename = self.routing.get(tensor_name, None)
        if filename is None:
            aliases = self.aliases.get(tensor_name, [])
            for alias in aliases:
                filename = self.routing.get(alias, None)
                if filename is not None:
                    return str(filename), alias
            raise RuntimeError(f"weight {tensor_name} does not exist")
        return str(filename), tensor_name
    def get_tensor(self, tensor_name: str, to_device=True):
        filename, tensor_name = self.get_filename(tensor_name)
        f = self._get_handle(filename)
        tensor = f.get_tensor(tensor_name)
        # Special case for gptq which shouldn't convert
        # u4 which are disguised as int32
        if tensor.dtype not in [torch.int32, torch.int64]:
            tensor = tensor.to(dtype=self.dtype)
        if to_device:
            tensor = tensor.to(device=self.device)
        return tensor
    def load_multi_linear(self, config, prefixes):
        weight = torch.cat([self.get_tensor(f"{p}.weight") for p in prefixes], dim=0)
        return FastLinear(weight, bias=None)
 class LlamaRMSNorm(nn.Module):
    def __init__(self, prefix, weights, eps=1e-6):
        """
        LlamaRMSNorm is equivalent to T5LayerNorm
        """
        super().__init__()
        weight = weights.get_tensor(f"{prefix}.weight")
        self.weight = nn.Parameter(weight)
        self.variance_epsilon = eps
    def forward(self, hidden_states, residual=None):
        # faster post attention rms norm
        normed_hidden_states, res, *rest = dropout_layer_norm.dropout_add_ln_fwd(
            hidden_states,
            residual,
            self.weight,
            None,
            None,
            None,
            None,
            None,
            0.0,
            self.variance_epsilon,
            1.0,
            0,
            None,
            False,
            True,  # Activate RMSNorm
        )
        if res is None:
            res = hidden_states
        return normed_hidden_states, res
 class FlashLlamaAttention(torch.nn.Module):
    def __init__(
            self,
            prefix: str,
            config,
            weights,
    ):
        super().__init__()
        self.num_heads = config.num_attention_heads
        self.num_key_value_heads = config.num_key_value_heads
        self.hidden_size = config.hidden_size
        self.head_size = self.hidden_size // self.num_heads
        self.softmax_scale = self.head_size ** -0.5
        self.num_groups = self.num_heads // self.num_key_value_heads
        self.kv_head_mapping = torch.arange(
            0, self.num_key_value_heads, dtype=torch.int32, device=weights.device
        ).repeat_interleave(self.num_groups)
        # q,k,v,o and rotary
        self.query_key_value = weights.load_multi_linear(config,
                                                         [f"{prefix}.q_proj", f"{prefix}.k_proj", f"{prefix}.v_proj"])
        self.o_proj = FastLinear.load(config, f"{prefix}.o_proj", weights, bias=False)
        self.rotary_emb = PositionRotaryEmbedding.static(
            config=config, dim=self.head_size, base=config.rope_theta, device=weights.device
        )
    def forward(
            self,
            hidden_states,
            cos,
            sin,
            cu_seqlen_prefill,
            kv_cache,
            block_tables,
            slots,
            input_lengths,
            max_s,
    ):
        qkv = self.query_key_value(hidden_states)
        query, kv = qkv.split(
            [
                self.head_size * self.num_heads,
                2 * self.head_size * self.num_key_value_heads,
            ],
            dim=1,
        )
        query = query.view(-1, self.num_heads, self.head_size)
        kv = kv.view(-1, 2, self.num_key_value_heads, self.head_size)
        self.rotary_emb(query, cos, sin)
        self.rotary_emb(torch.select(kv, dim=1, index=0), cos, sin)
        cache_ops.reshape_and_cache(
            kv[:, 0], kv[:, 1], kv_cache[0], kv_cache[1], slots
        )
        # output tensor
        attn_output = torch.empty_like(query)
        # Prefill
        if cu_seqlen_prefill is not None:
            # flash attention
            flash_attn_2_cuda.varlen_fwd(
                query,
                torch.select(kv, dim=1, index=0),
                torch.select(kv, dim=1, index=1),
                attn_output,
                cu_seqlen_prefill,
                cu_seqlen_prefill,
                max_s,
                max_s,
                0.0,
                self.softmax_scale,
                False,
                True,
                -1,
                0,
                False,
                None,
            )
        # Decode
        else:
            # kv_cache[1] => [num_blocks, num_heads, head_size, block_size]
            block_size = kv_cache[1].shape[3]
            attention_ops.paged_attention_v1(
                attn_output,
                query,
                kv_cache[0],
                kv_cache[1],
                self.kv_head_mapping,
                self.softmax_scale,
                block_tables,
                input_lengths,
                block_size,
                max_s,
                None
            )
        return self.o_proj(attn_output.view(-1, self.num_heads * self.head_size))
 class LlamaMLP(nn.Module):
    def __init__(self, prefix, config, weights):
        super().__init__()
        act = config.hidden_act
        self.intermediate_size = config.intermediate_size
        self.act = ACT2FN[act]
        # Fuse gate and up proj
        self.gate_up_proj = weights.load_multi_linear(config, [f"{prefix}.gate_proj", f"{prefix}.up_proj"])
        self.down_proj = FastLinear.load(config, f"{prefix}.down_proj", weights, bias=False)
    def forward(self, hidden_states):
        gate_up_states = self.gate_up_proj(hidden_states)
        gate_up_states = gate_up_states.view(-1, 2, self.intermediate_size)
        return self.down_proj(self.act(gate_up_states[:, 0]) * gate_up_states[:, 1])
 class FlashLlamaLayer(nn.Module):
    def __init__(self, layer_id, config, weights):
        super().__init__()
        prefix = f"model.layers.{layer_id}"
        self.input_layernorm = LlamaRMSNorm(
            prefix=f"{prefix}.input_layernorm", weights=weights, eps=config.rms_norm_eps
        )
        self.self_attn = FlashLlamaAttention(
            prefix=f"{prefix}.self_attn", config=config, weights=weights
        )
        self.post_attention_layernorm = LlamaRMSNorm(
            prefix=f"{prefix}.post_attention_layernorm",
            weights=weights,
            eps=config.rms_norm_eps,
        )
        self.mlp = LlamaMLP(prefix=f"{prefix}.mlp", config=config, weights=weights)
    def forward(
            self,
            hidden_states,
            residual,
            cos,
            sin,
            cu_seqlen_prefill,
            kv_cache,
            block_tables,
            slots,
            input_lengths,
            max_s,
    ):
        normed_hidden_states, res = self.input_layernorm(hidden_states, residual)
        # Self Attention
        attn_output = self.self_attn(
            normed_hidden_states,
            cos,
            sin,
            cu_seqlen_prefill,
            kv_cache,
            block_tables,
            slots,
            input_lengths,
            max_s,
        )
        # faster post attention rms norm
        normed_attn_res_output, attn_res = self.post_attention_layernorm(
            attn_output, res
        )
        mlp_output = self.mlp(normed_attn_res_output)
        return mlp_output, attn_res
 class FlashLlamaModel(torch.nn.Module):
    def __init__(self, config, weights):
        super().__init__()
        embeddings = weights.get_tensor(f"model.embed_tokens.weight")
        self.embed_tokens = nn.Embedding.from_pretrained(F.pad(embeddings, (0, 0, 0, 1)),
                                                         padding_idx=config.pad_token_id)
        self.layers = nn.ModuleList(
            [
                FlashLlamaLayer(
                    layer_id,
                    config,
                    weights,
                )
                for layer_id in range(config.num_hidden_layers)
                # for layer_id in range(1)
            ]
        )
        self.norm = LlamaRMSNorm(
            prefix="model.norm", weights=weights, eps=config.rms_norm_eps
        )
        self.gradient_checkpointing = False
        self.head_size = self.layers[0].self_attn.head_size
        self.num_heads = self.layers[0].self_attn.num_heads
        self.num_key_value_heads = self.layers[0].self_attn.num_key_value_heads
    def forward(
            self,
            input_ids: torch.Tensor,
            position_ids: torch.Tensor,
            cu_seqlen_prefill: Optional[torch.Tensor],
            kv_cache: List[Tuple[torch.Tensor, torch.Tensor]],
            block_tables: torch.Tensor,
            slots: torch.Tensor,
            input_lengths: torch.Tensor,
            max_s: int,
    ) -> torch.Tensor:
        hidden_states = self.embed_tokens(input_ids)
        # Get rotary cos and sin for this forward
        # Avoid to index in each layer
        cos, sin = self.layers[0].self_attn.rotary_emb.get_cos_sin(
            position_ids, max_s, hidden_states.dtype
        )
        residual = None
        for i, layer in enumerate(self.layers):
            hidden_states, residual = layer(
                hidden_states,
                residual,
                cos,
                sin,
                cu_seqlen_prefill,
                kv_cache[i],
                block_tables,
                slots,
                input_lengths,
                max_s,
            )
        hidden_states, _ = self.norm(hidden_states, residual)
        return hidden_states
 class FlashLlamaForCausalLM(torch.nn.Module):
    def __init__(self, config, weights):
        super().__init__()
        self.config = config
        self.model = FlashLlamaModel(config, weights)
        self.lm_head = FastLinear.load(config, prefix="lm_head", weights=weights, bias=False)
    def forward(
            self,
            input_ids: torch.Tensor,
            position_ids: torch.Tensor,
            cu_seqlen_prefill: Optional[torch.Tensor],
            kv_cache: List[Tuple[torch.Tensor, torch.Tensor]],
            block_tables: torch.Tensor,
            slots: torch.Tensor,
            input_lengths: torch.Tensor,
            max_s: int,
            lm_head_indices: Optional[torch.Tensor] = None,
    ) -> torch.Tensor:
        hidden_states = self.model(
            input_ids,
            position_ids,
            cu_seqlen_prefill,
            kv_cache,
            block_tables,
            slots,
            input_lengths,
            max_s,
        )
        if lm_head_indices is not None:
            hidden_states = hidden_states[lm_head_indices]
        logits = self.lm_head(hidden_states)
        return logits
 class CacheManager:
    def __init__(
            self,
            num_blocks: int,
            num_layers: int,
            num_heads: int,
            head_size: int,
            dtype: torch.dtype,
            device: torch.device,
    ):
        self.block_size = BLOCK_SIZE
        self.num_blocks = num_blocks
        self.device = device
        element_size = torch.tensor([], dtype=dtype).element_size()
        x = self.block_size // element_size
        self.kv_cache = [
            (
                torch.empty(
                    (num_blocks, num_heads, head_size // x, self.block_size, x),
                    dtype=dtype,
                    device=device,
                ),
                torch.empty(
                    (num_blocks, num_heads, head_size, self.block_size),
                    dtype=dtype,
                    device=device,
                ),
            )
            for _ in range(num_layers)
        ]
        self.free_block_mask = torch.ones(num_blocks, dtype=torch.int32, device="cpu")
        self.slots = torch.arange(
            0, num_blocks * self.block_size, dtype=torch.int32
        ).view(num_blocks, self.block_size)
    def allocate(self, blocks, max_blocks, needed_blocks_slots):
        """
        blocks: 总共需要的blocks数量
        max_blocks: 最大的blocks数量大小
        needed_blocks_slots: 每个序列所需的blocks及其对应的序列长度
        """
        # Get free blocks indices by finding values in mask that are not set to 0
        free_block_indices = self.free_block_mask.nonzero()
        assert (
                len(free_block_indices) >= blocks
        ), f"Out of available cache blocks: asked {blocks}, only {len(free_block_indices)} free blocks"
        # Slice by the number of required blocks
        block_indices = free_block_indices[: blocks]
        block_indices = block_indices.flatten()
        # Padded block tables
        block_tables_tensor = torch.zeros(
            (len(needed_blocks_slots), max_blocks), dtype=torch.int32
        )
        # Allocate paged attention blocks
        cumulative_blocks = 0
        slots = []
        block_tables = []
        for i, (needed_blocks, needed_slots) in enumerate(needed_blocks_slots):
            # Get allocated blocks for this sequence
            allocated_blocks = block_indices[
                               cumulative_blocks: cumulative_blocks + needed_blocks
                               ]
            # Get slots for the allocated blocks
            allocated_slots = self.slots[allocated_blocks].flatten()[:needed_slots]
            slots.append(allocated_slots)
            block_tables.append(allocated_blocks.tolist())
            block_tables_tensor[i, :needed_blocks] = allocated_blocks
            cumulative_blocks += needed_blocks
        # Allocate the required number of blocks by setting the mask to 0
        self.free_block_mask[block_indices] = 0
        return block_tables, block_tables_tensor.to(self.device), torch.concat(slots).to(self.device)
    def free(self, block_indices: Optional[List[int]]):
        if block_indices is not None and block_indices:
            # Reset mask
            self.free_block_mask[block_indices] = 1
 def generate(tokenizer, model, prompt, max_new_tokens=10):
    input_ids = tokenizer(prompt).input_ids
    def warmup():
        print("start warmup...")
        global CACHE_MANAGER
        blocks = 260
        CACHE_MANAGER = CacheManager(blocks,
                                     model.config.num_hidden_layers,
                                     model.config.num_key_value_heads,
                                     model.config.hidden_size // model.config.num_attention_heads,
                                     torch.float16,
                                     device)
        input_length = 1024
        bs = 4
        warmup_inputs = {
            'input_ids': torch.arange(1, input_length + 1, dtype=torch.int64, device=device).repeat(bs),
            'position_ids': torch.arange(0, input_length, dtype=torch.int32, device=device).repeat(bs),
            'cu_seqlen_prefill': torch.tensor([i * input_length for i in range(bs + 1)], dtype=torch.int32,
                                              device=device),
            'block_tables': torch.arange(0, blocks, dtype=torch.int32, device=device).split(blocks // bs),
            'slots': torch.arange(0, 4144, dtype=torch.int32, device=device),
            'input_lengths': torch.tensor([input_length] * 4, dtype=torch.int32, device=device),
            'max_s': 1024,
            'lm_head_indices': None
        }
        model.forward(**warmup_inputs, kv_cache=CACHE_MANAGER.kv_cache)
        del CACHE_MANAGER
        torch.cuda.empty_cache()
    # 预热
    warmup()
    print("start speed test running")
    # 申请缓存空间
    global CACHE_MANAGER
    CACHE_MANAGER = CacheManager(100,
                                 model.config.num_hidden_layers,
                                 model.config.num_key_value_heads,
                                 model.config.hidden_size // model.config.num_attention_heads,
                                 torch.float16,
                                 device)
    total_tokens = len(input_ids) + max_new_tokens - 1
    needed_blocks = math.ceil(total_tokens / BLOCK_SIZE)
    needed_blocks_slots = [(needed_blocks, total_tokens)]
    _, block_tables_tensor, slots = CACHE_MANAGER.allocate(needed_blocks, needed_blocks, needed_blocks_slots)
    # forward循环
    loops = 10
    for loop in range(loops):
        print(f"loop {loop}...")
        times = []
        new_tokens = []
        for step in range(max_new_tokens):
            if step == 0:
                # prefill step
                slot_indices = torch.arange(0, 0 + len(input_ids), dtype=torch.int64)
                inputs = {
                    'input_ids': torch.tensor(input_ids, dtype=torch.int64, device=device),
                    'position_ids': torch.arange(0, len(input_ids), dtype=torch.int32, device=device),
                    'cu_seqlen_prefill': torch.tensor([0, len(input_ids)], dtype=torch.int32, device=device),
                    'block_tables': block_tables_tensor,
                    'slots': slots[slot_indices],
                    'input_lengths': torch.tensor([len(input_ids)], dtype=torch.int32, device=device),
                    'max_s': len(input_ids),
                    'lm_head_indices': torch.tensor([0 + len(input_ids) - 1], dtype=torch.int32, device=device)
                }
            else:
                # incremental step
                current_length = len(input_ids) + step
                inputs = {
                    'input_ids': new_tokens[-1],
                    'position_ids': torch.tensor([current_length - 1], dtype=torch.int32, device=device),
                    'cu_seqlen_prefill': None,
                    'block_tables': block_tables_tensor,
                    'slots': torch.tensor([current_length - 1], dtype=torch.int32, device=device),
                    'input_lengths': torch.tensor([current_length], dtype=torch.int32, device=device),
                    'max_s': current_length,
                    'lm_head_indices': None
                }
            torch.cuda.synchronize()
            s_time = time.time()
            logits = model.forward(**inputs, kv_cache=CACHE_MANAGER.kv_cache)
            torch.cuda.synchronize()
            cost_time = time.time() - s_time
            next_token_id = logits.argmax(dim=-1)
            new_tokens.append(next_token_id)
            times.append(round(cost_time, 6))
        if loop == 0:
            new_tokens = torch.concat(new_tokens)
            print(tokenizer.decode(new_tokens, skip_special_tokens=True))
        elapsed_time = np.mean(times)
        print(f"total new tokens: {max_new_tokens}, cost time: {sum(times):.6f} s\n"
              f"time_per_token: {elapsed_time * 1000:.3f} ms, tps: {1 / elapsed_time:.2f} tokens/s")
 def main(model_path):
    # step 0: 定义路径与属性
    model_path = Path(model_path)
    config = transformers.AutoConfig.from_pretrained(model_path)
    model_files = list(model_path.glob('*.safetensors'))
    # step 1: 定义tokenizer与权重
    tokenizer = transformers.AutoTokenizer.from_pretrained(model_path, padding_side="left", truncation_side="left")
    weights = Weights(model_files, device, torch.float16, process_group=None)
    # step2: 定义模型
    model = FlashLlamaForCausalLM(config, weights).eval()
    print(model)
    # step3: 推理
    with torch.no_grad():
        prompt = "who are you?"
        generate(tokenizer, model, prompt, max_new_tokens=100)
 if __name__ == '__main__':
    CACHE_MANAGER: Optional[CacheManager] = None
    device = torch.device("cuda")
    main('/code/models/llama-7b-hf')
--- a/my_optims/optims/test_llama_hf.py
+++ b/my_optims/optims/test_llama_hf.py
@ -0,0 +1,42 @@
 # encoding:utf-8
 # -------------------------------------------#
 # Filename: aime-local-inference-server -- test_llama_hf.py
 #
 # Description:   
 # Version:       1.0
 # Created:       2023/9/7-15:19
 # Last modified by: 
 # Author:        'zhaohuayang@myhexin.com'
 # Company:       同花顺网络信息股份有限公司
 # -------------------------------------------#
 import time
 import torch
 import transformers
 def test_llama():
    tokenizer = transformers.AutoTokenizer.from_pretrained("/code/models/llama-7b-hf")
    model = transformers.AutoModelForCausalLM.from_pretrained("/code/models/llama-7b-hf", torch_dtype=torch.float16,
                                                              device_map="auto").half().eval()
    prompt = "who are you?"
    input_ids = tokenizer(prompt, return_tensors='pt').input_ids.to('cuda')
    # warm up
    output_ids = model.generate(input_ids, max_new_tokens=100)
    i_l = len(input_ids[0])
    o_l = len(output_ids[0])
    print(f'input length: {i_l}\t'
          f'output length: {o_l}')
    print(tokenizer.decode(output_ids[0], skip_special_tokens=True))
    # tps
    loop = 10
    s_time = time.time()
    for _ in range(loop):
        model.generate(input_ids, max_new_tokens=100)
    mean_ = (time.time() - s_time) / loop
    print(f"tps: {(o_l - i_l) / mean_:.4f}")
 if __name__ == '__main__':
    test_llama()
--- a/my_optims/optims/test_tp/flash_llama_modeling.py
+++ b/my_optims/optims/test_tp/flash_llama_modeling.py
@ -0,0 +1,467 @@
 # coding=utf-8
 # Copyright 2022 EleutherAI and the HuggingFace Inc. team. All rights reserved.
 #
 # This code is based on EleutherAI's GPT-NeoX library and the GPT-NeoX
 # and OPT implementations in this library. It has been modified from its
 # original forms to accommodate minor architectural differences compared
 # to GPT-NeoX and OPT used by the Meta AI team that trained the model.
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
 # You may obtain a copy of the License at
 #
 #     http://www.apache.org/licenses/LICENSE-2.0
 #
 # Unless required by applicable law or agreed to in writing, software
 # distributed under the License is distributed on an "AS IS" BASIS,
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
 from typing import Optional, List, Tuple
 # Flash attention imports
 import dropout_layer_norm
 import flash_attn_2_cuda
 import torch
 import torch.distributed
 from torch import nn
 from transformers.activations import ACT2FN
 # vllm imports
 from vllm import attention_ops, cache_ops
 from torch.nn import functional as F
 from layers import (
    TensorParallelRowLinear,
    TensorParallelColumnLinear,
    TensorParallelEmbedding,
    PositionRotaryEmbedding,
    TensorParallelHead,
    get_linear,
 )
 class LlamaRMSNorm(nn.Module):
    def __init__(self, prefix, weights, eps=1e-6):
        """
        LlamaRMSNorm is equivalent to T5LayerNorm
        """
        super().__init__()
        weight = weights.get_tensor(f"{prefix}.weight")
        self.weight = nn.Parameter(weight)
        self.variance_epsilon = eps
    def forward(self, hidden_states, residual=None):
        if hidden_states.shape[-1] > 8192:
            if residual is not None:
                hidden_states += residual
            residual = hidden_states
            hidden_states = hidden_states.to(torch.float32)
            variance = hidden_states.pow(2).mean(-1, keepdim=True)
            hidden_states = hidden_states * torch.rsqrt(
                variance + self.variance_epsilon
            )
            # convert into half-precision if necessary
            if self.weight.dtype in [torch.float16, torch.bfloat16]:
                hidden_states = hidden_states.to(self.weight.dtype)
            return self.weight * hidden_states, residual
        else:
            # faster post attention rms norm
            normed_hidden_states, res, *rest = dropout_layer_norm.dropout_add_ln_fwd(
                hidden_states,
                residual,
                self.weight,
                None,
                None,
                None,
                None,
                None,
                0.0,
                self.variance_epsilon,
                1.0,
                0,
                None,
                False,
                True,  # Activate RMSNorm
            )
            if res is None:
                res = hidden_states
            return normed_hidden_states, res
 def load_attention(config, prefix, weights):
    if config.num_attention_heads != config.num_key_value_heads:
        return _load_gqa(config, prefix, weights)
    else:
        if config.model_type == "baichuan":
            return TensorParallelColumnLinear.load_qkv(
                config,
                prefix=f"{prefix}.W_pack",
                weights=weights,
                bias=False,
            )
        else:
            return TensorParallelColumnLinear.load_multi(
                config,
                prefixes=[f"{prefix}.q_proj", f"{prefix}.k_proj", f"{prefix}.v_proj"],
                dim=0,
                weights=weights,
                bias=False,
            )
 def _load_gqa(config, prefix: str, weights):
    assert config.hidden_size % config.num_attention_heads == 0
    assert config.num_attention_heads % weights.process_group.size() == 0
    weight = weights.get_multi_weights_col(
        prefixes=[f"{prefix}.q_proj", f"{prefix}.k_proj", f"{prefix}.v_proj"],
        quantize=config.quantize,
        dim=0,
    )
    if config.quantize != "gptq":
        weight = weight.to(dtype=weights.dtype).to(device=weights.device)
        head_size = config.hidden_size // config.num_attention_heads
        num_heads = config.num_attention_heads // weights.process_group.size()
        num_key_value_heads = config.num_key_value_heads // weights.process_group.size()
        assert list(weight.shape) == [
            (num_heads + 2 * num_key_value_heads) * head_size,
            config.hidden_size,
        ], f"{list(weight.shape)} != {[(num_heads + 2 * config.num_key_value_heads) * head_size, config.hidden_size]}"
    return TensorParallelColumnLinear(
        get_linear(weight, bias=None, quantize=config.quantize)
    )
 class FlashLlamaAttention(torch.nn.Module):
    def __init__(
            self,
            prefix: str,
            config,
            weights,
    ):
        super().__init__()
        self.num_heads = config.num_attention_heads
        self.hidden_size = config.hidden_size
        self.head_size = self.hidden_size // self.num_heads
        # self.rotary_emb = PositionRotaryEmbedding.load(
        #     config=config, prefix=f"{prefix}.rotary_emb", weights=weights
        # )
        self.rotary_emb = PositionRotaryEmbedding.static(
            config=config, dim=self.head_size, base=config.rope_theta, device=weights.device
        )
        self.softmax_scale = self.head_size ** -0.5
        if self.num_heads % weights.process_group.size() != 0:
            raise ValueError(
                f"`num_heads` must be divisible by `num_shards` (got `num_heads`: {self.num_heads} "
                f"and `num_shards`: {weights.process_group.size()}"
            )
        self.num_heads = self.num_heads // weights.process_group.size()
        self.num_key_value_heads = (
                config.num_key_value_heads // weights.process_group.size()
        )
        self.query_key_value = load_attention(config, prefix, weights)
        self.o_proj = TensorParallelRowLinear.load(
            config,
            prefix=f"{prefix}.o_proj",
            weights=weights,
            bias=False,
        )
        self.num_groups = self.num_heads // self.num_key_value_heads
        self.kv_head_mapping = torch.arange(
            0, self.num_key_value_heads, dtype=torch.int32, device=weights.device
        ).repeat_interleave(self.num_groups)
    def forward(
            self,
            hidden_states,
            cos,
            sin,
            cu_seqlen_prefill,
            kv_cache,
            block_tables,
            slots,
            input_lengths,
            max_s,
    ):
        qkv = self.query_key_value(hidden_states)
        query, kv = qkv.split(
            [
                self.head_size * self.num_heads,
                2 * self.head_size * self.num_key_value_heads,
            ],
            dim=1,
        )
        query = query.view(-1, self.num_heads, self.head_size)
        kv = kv.view(-1, 2, self.num_key_value_heads, self.head_size)
        self.rotary_emb(query, cos, sin)
        self.rotary_emb(torch.select(kv, dim=1, index=0), cos, sin)
        cache_ops.reshape_and_cache(
            kv[:, 0], kv[:, 1], kv_cache[0], kv_cache[1], slots
        )
        # output tensor
        attn_output = torch.empty_like(query)
        # Prefill
        if cu_seqlen_prefill is not None:
            # flash attention
            flash_attn_2_cuda.varlen_fwd(
                query,
                torch.select(kv, dim=1, index=0),
                torch.select(kv, dim=1, index=1),
                attn_output,
                cu_seqlen_prefill,
                cu_seqlen_prefill,
                max_s,
                max_s,
                0.0,
                self.softmax_scale,
                False,
                True,
                -1,
                0,
                False,
                None,
            )
        # Decode
        else:
            # kv_cache[1] => [num_blocks, num_heads, head_size, block_size]
            block_size = kv_cache[1].shape[3]
            attention_ops.paged_attention_v1(
                attn_output,
                query,
                kv_cache[0],
                kv_cache[1],
                self.kv_head_mapping,
                self.softmax_scale,
                block_tables,
                input_lengths,
                block_size,
                max_s,
                None
            )
        return self.o_proj(attn_output.view(-1, self.num_heads * self.head_size))
 class LlamaMLP(nn.Module):
    def __init__(self, prefix, config, weights):
        super().__init__()
        act = config.hidden_act
        self.act = (
            ACT2FN[act]
            if "gelu" not in act
            else lambda x: torch.nn.functional.gelu(
                x,
                approximate="tanh"
                if act in ["gelu_fast", "gelu_pytorch_tanh"]
                else "none",
            )
        )
        # Fuse gate and up proj
        self.gate_up_proj = TensorParallelColumnLinear.load_multi(
            config,
            prefixes=[f"{prefix}.gate_proj", f"{prefix}.up_proj"],
            weights=weights,
            dim=0,
            bias=False,
        )
        self.down_proj = TensorParallelRowLinear.load(
            config,
            prefix=f"{prefix}.down_proj",
            weights=weights,
            bias=False,
        )
        self.intermediate_size = (
                config.intermediate_size // weights.process_group.size()
        )
    def forward(self, hidden_states):
        gate_up_states = self.gate_up_proj(hidden_states)
        gate_up_states = gate_up_states.view(-1, 2, self.intermediate_size)
        return self.down_proj(self.act(gate_up_states[:, 0]) * gate_up_states[:, 1])
 class FlashLlamaLayer(nn.Module):
    def __init__(self, layer_id, config, weights):
        super().__init__()
        prefix = f"model.layers.{layer_id}"
        self.self_attn = FlashLlamaAttention(
            prefix=f"{prefix}.self_attn", config=config, weights=weights
        )
        self.mlp = LlamaMLP(prefix=f"{prefix}.mlp", config=config, weights=weights)
        self.input_layernorm = LlamaRMSNorm(
            prefix=f"{prefix}.input_layernorm", weights=weights, eps=config.rms_norm_eps
        )
        self.post_attention_layernorm = LlamaRMSNorm(
            prefix=f"{prefix}.post_attention_layernorm",
            weights=weights,
            eps=config.rms_norm_eps,
        )
    def forward(
            self,
            hidden_states,
            residual,
            cos,
            sin,
            cu_seqlen_prefill,
            kv_cache,
            block_tables,
            slots,
            input_lengths,
            max_s,
    ):
        normed_hidden_states, res = self.input_layernorm(hidden_states, residual)
        # Self Attention
        attn_output = self.self_attn(
            normed_hidden_states,
            cos,
            sin,
            cu_seqlen_prefill,
            kv_cache,
            block_tables,
            slots,
            input_lengths,
            max_s,
        )
        # faster post attention rms norm
        normed_attn_res_output, attn_res = self.post_attention_layernorm(
            attn_output, res
        )
        mlp_output = self.mlp(normed_attn_res_output)
        return mlp_output, attn_res
 class FlashLlamaModel(torch.nn.Module):
    def __init__(self, config, weights):
        super().__init__()
        process_group = weights.process_group
        self.tp_rank = process_group.rank()
        self.tp_world_size = process_group.size()
        # self.embed_tokens = TensorParallelEmbedding(
        #     prefix="model.embed_tokens", weights=weights
        # )
        embeddings = weights.get_tensor(f"model.embed_tokens.weight")
        self.embed_tokens = nn.Embedding.from_pretrained(F.pad(embeddings, (0, 0, 0, 1)),
                                                         padding_idx=config.pad_token_id)
        self.layers = nn.ModuleList(
            [
                FlashLlamaLayer(
                    layer_id,
                    config,
                    weights,
                )
                for layer_id in range(config.num_hidden_layers)
                # for layer_id in range(1)
            ]
        )
        self.norm = LlamaRMSNorm(
            prefix="model.norm", weights=weights, eps=config.rms_norm_eps
        )
        self.gradient_checkpointing = False
        self.head_size = self.layers[0].self_attn.head_size
        self.num_heads = self.layers[0].self_attn.num_heads
        self.num_key_value_heads = self.layers[0].self_attn.num_key_value_heads
    def forward(
            self,
            input_ids: torch.Tensor,
            position_ids: torch.Tensor,
            cu_seqlen_prefill: Optional[torch.Tensor],
            kv_cache: List[Tuple[torch.Tensor, torch.Tensor]],
            block_tables: torch.Tensor,
            slots: torch.Tensor,
            input_lengths: torch.Tensor,
            max_s: int,
    ) -> torch.Tensor:
        hidden_states = self.embed_tokens(input_ids)
        # Get rotary cos and sin for this forward
        # Avoid to index in each layer
        cos, sin = self.layers[0].self_attn.rotary_emb.get_cos_sin(
            position_ids, max_s, hidden_states.dtype
        )
        residual = None
        for i, layer in enumerate(self.layers):
            hidden_states, residual = layer(
                hidden_states,
                residual,
                cos,
                sin,
                cu_seqlen_prefill,
                kv_cache[i],
                block_tables,
                slots,
                input_lengths,
                max_s,
            )
        hidden_states, _ = self.norm(hidden_states, residual)
        return hidden_states
 class FlashLlamaForCausalLM(torch.nn.Module):
    def __init__(self, config, weights):
        super().__init__()
        self.model = FlashLlamaModel(config, weights)
        self.lm_head = TensorParallelHead.load(
            config,
            prefix="lm_head",
            weights=weights,
        )
    def forward(
            self,
            input_ids: torch.Tensor,
            position_ids: torch.Tensor,
            cu_seqlen_prefill: Optional[torch.Tensor],
            kv_cache: List[Tuple[torch.Tensor, torch.Tensor]],
            block_tables: torch.Tensor,
            slots: torch.Tensor,
            input_lengths: torch.Tensor,
            max_s: int,
            lm_head_indices: Optional[torch.Tensor] = None,
    ) -> torch.Tensor:
        hidden_states = self.model(
            input_ids,
            position_ids,
            cu_seqlen_prefill,
            kv_cache,
            block_tables,
            slots,
            input_lengths,
            max_s,
        )
        if lm_head_indices is not None:
            hidden_states = hidden_states[lm_head_indices]
        logits = self.lm_head(hidden_states)
        return logits
--- a/my_optims/optims/test_tp/layers.py
+++ b/my_optims/optims/test_tp/layers.py
@ -0,0 +1,399 @@
 import os
 from typing import List
 import torch
 import torch.distributed
 from torch import nn
 from torch.nn import functional as F
 class FastLinear(nn.Module):
    def __init__(
            self,
            weight,
            bias,
    ) -> None:
        super().__init__()
        self.weight = nn.Parameter(weight)
        if bias is not None:
            self.bias = nn.Parameter(bias)
        else:
            self.bias = None
    @classmethod
    def load(cls, config, prefix: str, weights, bias: bool):
        weight = weights.get_tensor(f"{prefix}.weight")
        if bias:
            bias = weights.get_tensor(f"{prefix}.bias")
        else:
            bias = None
        return cls(weight, bias)
    def forward(self, input: torch.Tensor) -> torch.Tensor:
        return F.linear(input, self.weight, self.bias)
 def get_linear(weight, bias, quantize):
    linear = FastLinear(weight, bias)
    return linear
 class SuperLayer(nn.Module):
    def __init__(self, linear):
        super().__init__()
        self.linear = linear
    def forward(self, x):
        return self.linear.forward(x)
 class TensorParallelHead(SuperLayer):
    def __init__(self, linear, process_group, should_gather: bool):
        super().__init__(linear)
        self.process_group = process_group
        self.should_gather = should_gather
    @staticmethod
    def load(config, prefix: str, weights):
        if weights.process_group.size() > 1:
            try:
                assert False
                weight = weights.get_sharded(f"{prefix}.weight", dim=0)
                should_gather = True
            except AssertionError:
                # If the vocab size is not divisible by number of shards
                # just load the entire thing.
                weight = weights.get_tensor(f"{prefix}.weight")
                should_gather = False
        else:
            weight = weights.get_tensor(f"{prefix}.weight")
            should_gather = False
        # GPTQ doesn't quantize heads (nor embeddings)
        if config.quantize == "gptq":
            quantize = None
        else:
            quantize = config.quantize
        return TensorParallelHead(
            get_linear(weight, bias=None, quantize=quantize),
            process_group=weights.process_group,
            should_gather=should_gather,
        )
    def forward(self, input: torch.Tensor) -> torch.Tensor:
        if not self.should_gather:
            return super().forward(input)
        world_size = self.process_group.size()
        if len(input.shape) == 2 and isinstance(self.linear, FastLinear):
            out_dim = self.linear.weight.shape[0]
            if input.shape[0] == 1:
                world_out = input.new_empty(1, out_dim * world_size)
                local_out = input.new_empty(1, out_dim)
                gather_input = local_out
            else:
                world_out = input.new_empty(out_dim * world_size, input.shape[0])
                gather_input = input.new_empty(out_dim, input.shape[0])
                local_out = gather_input.T
            torch.mm(input, self.linear.weight.T, out=local_out)
            torch.distributed.all_gather_into_tensor(
                world_out, gather_input, group=self.process_group
            )
            if input.shape[0] == 1:
                return world_out
            return world_out.T
        output = super().forward(input)
        world_output = [
            torch.empty_like(output) for _ in range(self.process_group.size())
        ]
        torch.distributed.all_gather(world_output, output, group=self.process_group)
        world_output = torch.cat(world_output, dim=-1)
        return world_output
 class TensorParallelColumnLinear(SuperLayer):
    @classmethod
    def load_qkv(cls, config, prefix: str, weights, bias: bool):
        """Specific method when the QKV was joined after the fact"""
        weight = weights.get_weights_col_packed_qkv(
            prefix, quantize=config.quantize
        )
        if bias:
            raise NotImplementedError("packed_qkv only implemented for baichuan")
        else:
            bias = None
        linear = get_linear(weight, bias, config.quantize)
        return cls(linear)
    @classmethod
    def load(cls, config, prefix: str, weights, bias: bool):
        return cls.load_multi(config, [prefix], weights, bias, dim=0)
    @classmethod
    def load_multi(cls, config, prefixes: List[str], weights, bias: bool, dim: int):
        weight = weights.get_multi_weights_col(
            prefixes, quantize=config.quantize, dim=dim
        )
        if bias:
            b = [weights.get_sharded(f"{p}.bias", dim=0) for p in prefixes]
            bias = torch.cat(b, dim=dim)
        else:
            bias = None
        linear = get_linear(weight, bias, config.quantize)
        return cls(linear)
 class TensorParallelRowLinear(SuperLayer):
    def __init__(self, linear, process_group):
        super().__init__(linear)
        self.process_group = process_group
    @classmethod
    def load(cls, config, prefix: str, weights, bias: bool):
        weight = weights.get_multi_weights_row(prefix, quantize=config.quantize)
        if bias and weights.process_group.rank() == 0:
            # Rank is only on the first rank process
            bias = weights.get_tensor(f"{prefix}.bias")
        else:
            bias = None
        return cls(
            get_linear(weight, bias, config.quantize),
            process_group=weights.process_group,
        )
    def forward(self, input: torch.Tensor) -> torch.Tensor:
        out = super().forward(input)
        if self.process_group.size() > 1:
            torch.distributed.all_reduce(out, group=self.process_group)
        return out
 class TensorParallelEmbedding(nn.Module):
    def __init__(self, prefix: str, weights, reduce=True):
        super().__init__()
        weight = weights.get_partial_sharded(f"{prefix}.weight", dim=0)
        num_embeddings = weights.get_shape(f"{prefix}.weight")[0]
        process_group = weights.process_group
        world_size = process_group.size()
        rank = process_group.rank()
        block_size = num_embeddings // world_size
        self.min_id = rank * block_size
        self.max_id = min(num_embeddings, (rank + 1) * block_size)
        self.null_idx = block_size
        self.process_group = weights.process_group
        self.reduce = reduce
        """Additional 0 entry used for masking"""
        self.weight = nn.Parameter(F.pad(weight, (0, 0, 0, 1)))
    def forward(self, input: torch.Tensor) -> torch.Tensor:
        # default all out of bounds values to `self.null_idx` that will then be mapped to 0
        # translate for [0, self.max_id - self.min_id[
        input = torch.where(
            (self.min_id > input) | (input >= self.max_id),
            self.null_idx,
            input - self.min_id,
        )
        out = torch.nn.functional.embedding(input, self.weight)
        if self.reduce and self.process_group.size() > 1:
            torch.distributed.all_reduce(out, group=self.process_group)
        return out
 try:
    import dropout_layer_norm
    class FastLayerNorm(nn.LayerNorm):
        def forward(self, hidden_states, residual=None):
            if hidden_states.shape[-1] > 8192:
                if residual is not None:
                    hidden_states += residual
                residual = hidden_states
                return super(FastLayerNorm, self).forward(hidden_states), residual
            else:
                (
                    normed_hidden_states,
                    residual,
                    *rest,
                ) = dropout_layer_norm.dropout_add_ln_fwd(
                    hidden_states,
                    residual,
                    self.weight,
                    self.bias,
                    None,
                    None,
                    None,
                    None,
                    0.0,
                    self.eps,
                    1.0,
                    0,
                    None,
                    False,
                    False,
                )
                if residual is None:
                    residual = hidden_states
                return normed_hidden_states, residual
 except ImportError:
    pass
 try:
    from flash_attn.layers.rotary import RotaryEmbedding
    import rotary_emb
    def _create_inv_freq(dim, base, device):
        inv_freq = 1.0 / (
                base
                ** (torch.arange(0, dim, 2, device=device, dtype=torch.float32) / dim)
        )
        return inv_freq
    def _get_rope_config(config):
        if os.getenv("ROPE_SCALING", None) is not None:
            rope_scaling = {"type": os.environ["ROPE_SCALING"], "factor": float(os.environ["ROPE_FACTOR"])}
            return rope_scaling
        return getattr(config, "rope_scaling", None)
    class PositionRotaryEmbedding(nn.Module):
        def __init__(self, inv_freq, scaling_factor):
            super().__init__()
            self.inv_freq = inv_freq
            self._seq_len_cached = 0
            self._cos_cached = None
            self._sin_cached = None
            self._cos_k_cached = None
            self._sin_k_cached = None
            self.scaling_factor = scaling_factor
            self.dynamic_args = None
        @classmethod
        def static(cls, config, dim, base, device):
            inv_freq = _create_inv_freq(dim, base, device)
            scaling_factor = None
            rope_scaling = _get_rope_config(config)
            if rope_scaling is not None:
                scaling_factor = rope_scaling["factor"]
                if rope_scaling["type"] == "linear":
                    pass
                elif rope_scaling["type"] == "dynamic":
                    return DynamicPositionRotaryEmbedding(dim=dim,
                                                          max_position_embeddings=config.max_position_embeddings,
                                                          base=base, device=inv_freq.device,
                                                          scaling_factor=scaling_factor)
                else:
                    raise NotImplementedError(f"rope scaling type {rope_scaling['type']} is not implemented or invalid")
            return cls(inv_freq, scaling_factor)
        @classmethod
        def load(cls, config, prefix, weights):
            # XXX: Always load this in float32 !
            dtype = weights.dtype
            weights.dtype = torch.float32
            inv_freq = weights.get_tensor(f"{prefix}.inv_freq")
            weights.dtype = dtype
            scaling_factor = None
            rope_scaling = _get_rope_config(config)
            if rope_scaling is not None:
                scaling_factor = rope_scaling["factor"]
                if rope_scaling["type"] == "linear":
                    pass
                elif rope_scaling["type"] == "dynamic":
                    return DynamicPositionRotaryEmbedding(dim=2 * inv_freq.shape[0],
                                                          max_position_embeddings=config.max_position_embeddings,
                                                          base=10000.0, device=inv_freq.device,
                                                          scaling_factor=scaling_factor)
                else:
                    raise NotImplementedError(f"rope scaling type {rope_scaling['type']} is not implemented or invalid")
            return cls(inv_freq, scaling_factor)
        def _update_cos_sin_cache(self, dtype, device, seqlen):
            # Reset the tables if the sequence length has changed,
            # or if we're on a new device (possibly due to tracing for instance)
            if (
                    seqlen > self._seq_len_cached
                    or self._cos_cached.device != device
                    or self._cos_cached.dtype != dtype
            ):
                self._seq_len_cached = seqlen
                t = torch.arange(seqlen, device=device, dtype=self.inv_freq.dtype)
                if self.scaling_factor is not None:
                    t /= self.scaling_factor
                # Don't do einsum, it converts fp32 to fp16
                # freqs = torch.einsum("i,j->ij", t, self.inv_freq)
                freqs = torch.outer(t, self.inv_freq.to(device=t.device))
                self._cos_cached = torch.cos(freqs).to(dtype)
                self._sin_cached = torch.sin(freqs).to(dtype)
        def get_cos_sin(
                self, position_ids: torch.Tensor, max_s: int, dtype: torch.dtype
        ):
            """
            Return cos and sin for the asked position ids
            """
            self._update_cos_sin_cache(dtype, position_ids.device, max_s)
            cos = torch.index_select(self._cos_cached, 0, position_ids)
            sin = torch.index_select(self._sin_cached, 0, position_ids)
            return cos.unsqueeze(1), sin.unsqueeze(1)
        def forward(self, x: torch.Tensor, cos: torch.Tensor, sin: torch.Tensor):
            rotary_dim = cos.shape[-1]
            x1 = x[..., :rotary_dim]
            x2 = x[..., rotary_dim: 2 * rotary_dim]
            rotary_emb.apply_rotary(x1, x2, cos, sin, x1, x2, False)
            return x
    class DynamicPositionRotaryEmbedding(PositionRotaryEmbedding):
        def __init__(self, dim, max_position_embeddings, base, device, scaling_factor):
            inv_freq = _create_inv_freq(dim, base, device)
            super().__init__(inv_freq, scaling_factor)
            self.dim = dim
            self.max_position_embeddings = max_position_embeddings
            self.base = base
        def _update_cos_sin_cache(self, dtype, device, seqlen):
            # Reset the tables if the sequence length has changed,
            # or if we're on a new device (possibly due to tracing for instance)
            if (
                    seqlen > self._seq_len_cached
                    or self._cos_cached.device != device
                    or self._cos_cached.dtype != dtype
            ):
                if seqlen > self.max_position_embeddings:
                    newbase = self.base * ((self.scaling_factor * seqlen / self.max_position_embeddings) - (
                                self.scaling_factor - 1)) ** (self.dim / (self.dim - 2))
                    self.inv_freq = _create_inv_freq(self.dim, newbase, self.inv_freq.device)
                self._seq_len_cached = seqlen
                t = torch.arange(seqlen, device=device, dtype=self.inv_freq.dtype)
                # Don't do einsum, it converts fp32 to fp16
                # freqs = torch.einsum("i,j->ij", t, self.inv_freq)
                freqs = torch.outer(t, self.inv_freq.to(device=t.device))
                self._cos_cached = torch.cos(freqs).to(dtype)
                self._sin_cached = torch.sin(freqs).to(dtype)
 except ImportError:
    pass
--- a/my_optims/optims/test_tp/test_llama_fa_tp.py
+++ b/my_optims/optims/test_tp/test_llama_fa_tp.py
@ -0,0 +1,247 @@
 # encoding:utf-8
 # -------------------------------------------#
 # Filename: optims -- test_llama_fa.py
 #
 # Description:
 # Version:       1.0
 # Created:       2023/9/18-20:50
 # Last modified by:
 # Author:        'zhaohuayang@myhexin.com'
 # Company:       同花顺网络信息股份有限公司
 # -------------------------------------------#
 import math
 import time
 from pathlib import Path
 from typing import List, Optional
 # Flash attention imports
 import numpy as np
 import torch
 import torch.distributed
 import transformers
 from flash_llama_modeling import FlashLlamaForCausalLM
 from utils import initialize_torch_distributed
 from weights import Weights
 BLOCK_SIZE = 16
 class CacheManager:
    def __init__(
            self,
            num_blocks: int,
            num_layers: int,
            num_heads: int,
            head_size: int,
            dtype: torch.dtype,
            device: torch.device,
    ):
        self.block_size = BLOCK_SIZE
        self.num_blocks = num_blocks
        self.device = device
        element_size = torch.tensor([], dtype=dtype).element_size()
        x = self.block_size // element_size
        self.kv_cache = [
            (
                torch.empty(
                    (num_blocks, num_heads, head_size // x, self.block_size, x),
                    dtype=dtype,
                    device=device,
                ),
                torch.empty(
                    (num_blocks, num_heads, head_size, self.block_size),
                    dtype=dtype,
                    device=device,
                ),
            )
            for _ in range(num_layers)
        ]
        self.free_block_mask = torch.ones(num_blocks, dtype=torch.int32, device="cpu")
        self.slots = torch.arange(
            0, num_blocks * self.block_size, dtype=torch.int32
        ).view(num_blocks, self.block_size)
    def allocate(self, blocks, max_blocks, needed_blocks_slots):
        """
        blocks: 总共需要的blocks数量
        max_blocks: 最大的blocks数量大小
        needed_blocks_slots: 每个序列所需的blocks及其对应的序列长度
        """
        # Get free blocks indices by finding values in mask that are not set to 0
        free_block_indices = self.free_block_mask.nonzero()
        assert (
                len(free_block_indices) >= blocks
        ), f"Out of available cache blocks: asked {blocks}, only {len(free_block_indices)} free blocks"
        # Slice by the number of required blocks
        block_indices = free_block_indices[: blocks]
        block_indices = block_indices.flatten()
        # Padded block tables
        block_tables_tensor = torch.zeros(
            (len(needed_blocks_slots), max_blocks), dtype=torch.int32
        )
        # Allocate paged attention blocks
        cumulative_blocks = 0
        slots = []
        block_tables = []
        for i, (needed_blocks, needed_slots) in enumerate(needed_blocks_slots):
            # Get allocated blocks for this sequence
            allocated_blocks = block_indices[
                               cumulative_blocks: cumulative_blocks + needed_blocks
                               ]
            # Get slots for the allocated blocks
            allocated_slots = self.slots[allocated_blocks].flatten()[:needed_slots]
            slots.append(allocated_slots)
            block_tables.append(allocated_blocks.tolist())
            block_tables_tensor[i, :needed_blocks] = allocated_blocks
            cumulative_blocks += needed_blocks
        # Allocate the required number of blocks by setting the mask to 0
        self.free_block_mask[block_indices] = 0
        return block_tables, block_tables_tensor.to(self.device), torch.concat(slots).to(self.device)
    def free(self, block_indices: Optional[List[int]]):
        if block_indices is not None and block_indices:
            # Reset mask
            self.free_block_mask[block_indices] = 1
 def generate(tokenizer, model, config, device, prompt, max_new_tokens=10):
    input_ids = tokenizer(prompt).input_ids
    def warmup():
        print("start warmup...")
        global CACHE_MANAGER
        blocks = 260
        CACHE_MANAGER = CacheManager(blocks,
                                     len(model.model.layers),
                                     model.model.num_key_value_heads,
                                     model.model.head_size,
                                     torch.float16,
                                     device)
        input_length = 1024
        bs = 4
        warmup_inputs = {
            'input_ids': torch.arange(1, input_length + 1, dtype=torch.int64, device=device).repeat(bs),
            'position_ids': torch.arange(0, input_length, dtype=torch.int32, device=device).repeat(bs),
            'cu_seqlen_prefill': torch.tensor([i * input_length for i in range(bs + 1)], dtype=torch.int32,
                                              device=device),
            'block_tables': torch.arange(0, blocks, dtype=torch.int32, device=device).split(blocks // bs),
            'slots': torch.arange(0, 4144, dtype=torch.int32, device=device),
            'input_lengths': torch.tensor([input_length] * 4, dtype=torch.int32, device=device),
            'max_s': 1024,
            'lm_head_indices': None
        }
        model.forward(**warmup_inputs, kv_cache=CACHE_MANAGER.kv_cache)
        del CACHE_MANAGER
        torch.cuda.empty_cache()
    # 预热
    warmup()
    print("start speed test running")
    # 申请缓存空间
    global CACHE_MANAGER
    CACHE_MANAGER = CacheManager(100,
                                 len(model.model.layers),
                                 model.model.num_key_value_heads,
                                 model.model.head_size,
                                 torch.float16,
                                 device)
    total_tokens = len(input_ids) + max_new_tokens - 1
    needed_blocks = math.ceil(total_tokens / BLOCK_SIZE)
    needed_blocks_slots = [(needed_blocks, total_tokens)]
    _, block_tables_tensor, slots = CACHE_MANAGER.allocate(needed_blocks, needed_blocks, needed_blocks_slots)
    # forward循环
    loops = 10
    tpss = []
    for loop in range(loops):
        print(f"loop {loop}...")
        times = []
        new_tokens = []
        for step in range(max_new_tokens):
            if step == 0:
                # prefill step
                slot_indices = torch.arange(0, 0 + len(input_ids), dtype=torch.int64)
                inputs = {
                    'input_ids': torch.tensor(input_ids, dtype=torch.int64, device=device),
                    'position_ids': torch.arange(0, len(input_ids), dtype=torch.int32, device=device),
                    'cu_seqlen_prefill': torch.tensor([0, len(input_ids)], dtype=torch.int32, device=device),
                    'block_tables': block_tables_tensor,
                    'slots': slots[slot_indices],
                    'input_lengths': torch.tensor([len(input_ids)], dtype=torch.int32, device=device),
                    'max_s': len(input_ids),
                    'lm_head_indices': torch.tensor([0 + len(input_ids) - 1], dtype=torch.int32, device=device)
                }
            else:
                # incremental step
                current_length = len(input_ids) + step
                inputs = {
                    'input_ids': new_tokens[-1],
                    'position_ids': torch.tensor([current_length - 1], dtype=torch.int32, device=device),
                    'cu_seqlen_prefill': None,
                    'block_tables': block_tables_tensor,
                    'slots': torch.tensor([current_length - 1], dtype=torch.int32, device=device),
                    'input_lengths': torch.tensor([current_length], dtype=torch.int32, device=device),
                    'max_s': current_length,
                    'lm_head_indices': None
                }
            torch.cuda.synchronize()
            s_time = time.time()
            logits = model.forward(**inputs, kv_cache=CACHE_MANAGER.kv_cache)
            torch.cuda.synchronize()
            cost_time = time.time() - s_time
            next_token_id = logits.argmax(dim=-1)
            new_tokens.append(next_token_id)
            times.append(round(cost_time, 6))
        if loop == 0:
            new_tokens = torch.concat(new_tokens)
            print(tokenizer.decode(new_tokens, skip_special_tokens=True))
        elapsed_time = np.mean(times)
        tps = 1 / elapsed_time
        tpss.append(tps)
        print(times)
        print(f"total new tokens: {max_new_tokens}, cost time: {sum(times):.6f} s\n"
              f"time_per_token: {elapsed_time * 1000:.3f} ms, tps: {tps:.2f} tokens/s")
    print(f'mean tps: {np.mean(tpss):.2f} tokens/s')
 def main(model_path):
    # init env
    process_group, rank, world_size = initialize_torch_distributed()
    # step 0: 定义路径与属性
    model_path = Path(model_path)
    config = transformers.AutoConfig.from_pretrained(model_path)
    config.quantize = None
    model_files = list(model_path.glob('*.safetensors'))
    # step 1: 定义tokenizer与权重
    tokenizer = transformers.AutoTokenizer.from_pretrained(model_path, padding_side="left", truncation_side="left")
    device = torch.device(f"cuda:{rank}")
    weights = Weights(model_files, device, torch.float16, process_group=process_group)
    # step2: 定义模型
    torch.distributed.barrier(group=process_group)
    model = FlashLlamaForCausalLM(config, weights).eval()
    torch.distributed.barrier(group=process_group)
    print(model)
    # step3: 推理
    with torch.no_grad():
        prompt = "who are you?"
        generate(tokenizer, model, config, device, prompt, max_new_tokens=100)
 if __name__ == '__main__':
    CACHE_MANAGER: Optional[CacheManager] = None
    main('/code/models/llama-7b-hf')
--- a/my_optims/optims/test_tp/utils.py
+++ b/my_optims/optims/test_tp/utils.py
@ -0,0 +1,100 @@
 # encoding:utf-8
 # -------------------------------------------#
 # Filename: optims -- utils.py
 #
 # Description:   
 # Version:       1.0
 # Created:       2023/9/27-14:43
 # Last modified by: 
 # Author:        'zhaohuayang@myhexin.com'
 # Company:       同花顺网络信息股份有限公司
 # -------------------------------------------#
 import os
 import time
 from datetime import timedelta
 import torch
 from loguru import logger
 from torch.distributed import ProcessGroupNCCL
 RANK = int(os.getenv("LOCAL_RANK", "0"))
 WORLD_SIZE = int(os.getenv("WORLD_SIZE", "2"))
 NCCL_PORT = int(os.getenv("NCCL_PORT", "29500"))
 MEMORY_FRACTION = float(os.getenv("CUDA_MEMORY_FRACTION", "1.0"))
 class FakeBarrier:
    def wait(self):
        pass
 class FakeGroup:
    def __init__(self, rank, size):
        self._rank = rank
        self._size = size
    def allreduce(self, *args, **kwargs):
        return FakeBarrier()
    def allgather(self, inputs, local_tensor, **kwargs):
        assert (
                len(inputs[0]) == len(local_tensor) == 1
        ), f"{len(inputs[0])} != {len(local_tensor)} != 1, and the FakeGroup is supposed to join on simple tensors"
        for input_ in inputs:
            input_[0].data = local_tensor[0].data
        return FakeBarrier()
    def barrier(self, *args, **kwargs):
        return FakeBarrier()
    def size(self):
        return self._size
    def rank(self):
        return self._rank
 def initialize_torch_distributed():
    # Set the device id.
    assert WORLD_SIZE <= torch.cuda.device_count(), "Each process is one gpu"
    device = RANK % torch.cuda.device_count()
    torch.cuda.set_device(device)
    torch.cuda.set_per_process_memory_fraction(MEMORY_FRACTION, device)
    backend = "nccl"
    options = ProcessGroupNCCL.Options()
    options.is_high_priority_stream = True
    options._timeout = timedelta(seconds=60)
    if not torch.distributed.is_initialized():
        # Call the init process.
        torch.distributed.init_process_group(
            backend=backend,
            init_method=f"tcp://localhost:{NCCL_PORT}",
            world_size=WORLD_SIZE,
            rank=RANK,
            timeout=timedelta(seconds=60),
            pg_options=options,
        )
        logger.info(f"torch.distributed is initialized on rank {RANK} of {WORLD_SIZE}.")
    else:
        logger.warning("torch.distributed is already initialized.")
    return torch.distributed.group.WORLD, RANK, WORLD_SIZE
 class Timer:
    def __init__(self):
        self.times = []
        self.time = None
    def start(self):
        torch.cuda.synchronize()
        self.time = time.time()
    def end(self):
        torch.cuda.synchronize()
        self.times.append(time.time() - self.time)
    @property
    def elapsed(self):
        self.times.pop(0)
        return round(sum(self.times) / len(self.times) * 1000, 2)
--- a/my_optims/optims/test_tp/weights.py
+++ b/my_optims/optims/test_tp/weights.py
@ -0,0 +1,336 @@
 # encoding:utf-8
 # -------------------------------------------#
 # Filename: optims -- weights.py
 #
 # Description:   
 # Version:       1.0
 # Created:       2023/9/27-15:05
 # Last modified by: 
 # Author:        'zhaohuayang@myhexin.com'
 # Company:       同花顺网络信息股份有限公司
 # -------------------------------------------#
 import json
 import os
 from pathlib import Path
 from typing import List, Dict, Optional, Tuple
 import torch
 from huggingface_hub import hf_hub_download
 from loguru import logger
 from safetensors import safe_open, SafetensorError
 class Weights:
    def __init__(
            self,
            filenames: List[Path],
            device,
            dtype,
            process_group,
            aliases: Optional[Dict[str, List[str]]] = None,
    ):
        routing = {}
        for filename in filenames:
            with safe_open(filename, framework="pytorch") as f:
                for k in f.keys():
                    if k in routing:
                        raise RuntimeError(
                            f"Key {k} was found in multiple files: {filename} and {routing[k]}"
                        )
                    routing[k] = filename
        if aliases is None:
            aliases = {}
        self.aliases = aliases
        self.routing = routing
        self.device = device
        self.dtype = dtype
        self.process_group = process_group
        self._handles = {}
    def _get_handle(self, filename):
        if filename not in self._handles:
            f = safe_open(filename, framework="pytorch")
            self._handles[filename] = f
        return self._handles[filename]
    def get_filename(self, tensor_name: str) -> (str, str):
        filename = self.routing.get(tensor_name, None)
        if filename is None:
            aliases = self.aliases.get(tensor_name, [])
            for alias in aliases:
                filename = self.routing.get(alias, None)
                if filename is not None:
                    return str(filename), alias
            raise RuntimeError(f"weight {tensor_name} does not exist")
        return str(filename), tensor_name
    def _get_slice(self, tensor_name: str):
        filename, tensor_name = self.get_filename(tensor_name)
        f = self._get_handle(filename)
        slice_ = f.get_slice(tensor_name)
        return slice_
    def get_shape(self, tensor_name: str):
        return self._get_slice(tensor_name).get_shape()
    def get_tensor(self, tensor_name: str, to_device=True):
        filename, tensor_name = self.get_filename(tensor_name)
        f = self._get_handle(filename)
        tensor = f.get_tensor(tensor_name)
        # Special case for gptq which shouldn't convert
        # u4 which are disguised as int32
        if tensor.dtype not in [torch.int32, torch.int64]:
            tensor = tensor.to(dtype=self.dtype)
        if to_device:
            tensor = tensor.to(device=self.device)
        return tensor
    def get_partial_sharded(self, tensor_name: str, dim: int):
        filename, tensor_name = self.get_filename(tensor_name)
        f = self._get_handle(filename)
        slice_ = f.get_slice(tensor_name)
        world_size = self.process_group.size()
        rank = self.process_group.rank()
        size = slice_.get_shape()[dim]
        block_size = size // world_size
        start = rank * block_size
        stop = (rank + 1) * block_size
        if dim == 0:
            tensor = slice_[start:stop]
        elif dim == 1:
            tensor = slice_[:, start:stop]
        else:
            raise NotImplementedError("Let's make that generic when needed")
        # Special case for gptq which shouldn't convert
        # u4 which are disguised as int32
        if tensor.dtype != torch.int32:
            tensor = tensor.to(dtype=self.dtype)
        tensor = tensor.to(device=self.device)
        return tensor
    def get_sharded(self, tensor_name: str, dim: int):
        filename, tensor_name = self.get_filename(tensor_name)
        f = self._get_handle(filename)
        slice_ = f.get_slice(tensor_name)
        world_size = self.process_group.size()
        size = slice_.get_shape()[dim]
        assert (
                size % world_size == 0
        ), f"The choosen size {size} is not compatible with sharding on {world_size} shards"
        return self.get_partial_sharded(tensor_name, dim)
    def _get_qweight(self, name: str):
        slice_ = self._get_slice(name)
        total_size = slice_.get_shape()[1]
        assert total_size % 3 == 0, "Prepacked quantized qkv is not divisible by 3"
        single_size = total_size // 3
        world_size = self.process_group.size()
        rank = self.process_group.rank()
        assert single_size % world_size == 0, f"Prepacked quantized qkv cannot be sharded across {world_size} shards"
        block_size = single_size // world_size
        start = rank * block_size
        stop = (rank + 1) * block_size
        q = slice_[:, start:stop]
        k = slice_[:, start + single_size:stop + single_size]
        v = slice_[:, start + 2 * single_size:stop + 2 * single_size]
        weight = torch.cat([q, k, v], dim=1)
        weight = weight.to(device=self.device)
        return weight
    def get_weights_col_packed_qkv(self, prefix: str, quantize: str):
        """
        Highly specific when the underlying tensor is a simple cat of Q,K,V instead of being
        already alternating Q,K,V within the main tensor
        """
        if quantize == "gptq":
            try:
                qweight = self._get_qweight(f"{prefix}.qweight")
            except RuntimeError:
                raise RuntimeError(
                    "Cannot load `gptq` weight, make sure the model is already quantized, or quantize it with `text-generation-server quantize ORIGINAL_MODEL_ID NEW_MODEL_ID`"
                )
            qzeros = self._get_qweight(f"{prefix}.qzeros")
            scales = self._get_qweight(f"{prefix}.scales")
            scales = scales.to(dtype=self.dtype)
            g_idx = self.get_tensor(f"{prefix}.g_idx")
            bits, groupsize = self._get_gptq_params()
            weight = (qweight, qzeros, scales, g_idx, bits, groupsize, False)
        else:
            slice_ = self._get_slice(f"{prefix}.weight")
            total_size = slice_.get_shape()[0]
            assert total_size % 3 == 0, "Prepacked qkv is not divisible by 3"
            single_size = total_size // 3
            world_size = self.process_group.size()
            rank = self.process_group.rank()
            assert single_size % world_size == 0, f"Prepacked qkv cannot be sharded across {world_size} shards"
            block_size = single_size // world_size
            start = rank * block_size
            stop = (rank + 1) * block_size
            q = slice_[start:stop]
            k = slice_[start + single_size:stop + single_size]
            v = slice_[start + 2 * single_size:stop + 2 * single_size]
            weight = torch.cat([q, k, v], dim=0)
            weight = weight.to(device=self.device)
            weight = weight.to(dtype=self.dtype)
        return weight
    def get_multi_weights_col(self, prefixes: List[str], quantize: str, dim: int):
        if quantize == "gptq":
            try:
                qweight = torch.cat(
                    [self.get_sharded(f"{p}.qweight", dim=1) for p in prefixes], dim=1
                )
            except RuntimeError:
                raise RuntimeError(
                    "Cannot load `gptq` weight, make sure the model is already quantized, or quantize it with `text-generation-server quantize ORIGINAL_MODEL_ID NEW_MODEL_ID`"
                )
            qzeros = torch.cat(
                [self.get_sharded(f"{p}.qzeros", dim=1) for p in prefixes], dim=1
            )
            scales = torch.cat(
                [self.get_sharded(f"{p}.scales", dim=1) for p in prefixes], dim=1
            )
            w = [self.get_tensor(f"{p}.g_idx") for p in prefixes]
            for w2 in w[1:]:
                torch.testing.assert_close(w2, w[0])
            g_idx = w[0]
            bits, groupsize = self._get_gptq_params()
            weight = (qweight, qzeros, scales, g_idx, bits, groupsize, False)
        else:
            w = [self.get_sharded(f"{p}.weight", dim=0) for p in prefixes]
            weight = torch.cat(w, dim=dim)
        return weight
    def get_tensor_shard(self, var, dim):
        world_size = self.process_group.size()
        rank = self.process_group.rank()
        block_size = var.size()[dim] // world_size
        start = rank * block_size
        stop = (rank + 1) * block_size
        if dim == 0:
            tensor = var[start:stop]
        elif dim == 1:
            tensor = var[:, start:stop]
        else:
            raise NotImplementedError("Let's make that generic when needed")
        tensor = tensor.to(dtype=self.dtype)
        tensor = tensor.to(device=self.device)
        return tensor
    def get_multi_weights_row(self, prefix: str, quantize: str):
        if quantize == "gptq":
            use_exllama = True
            bits, groupsize = self._get_gptq_params()
            if bits != 4:
                use_exllama = False
            if self.process_group.size() > 1:
                g_idx = self.get_tensor(f"{prefix}.g_idx")
                if g_idx is not None:
                    if (
                            not torch.equal(
                                g_idx.cpu(),
                                torch.tensor(
                                    [i // groupsize for i in range(g_idx.shape[0])],
                                    dtype=torch.int32,
                                ),
                            )
                            and not (g_idx == 0).all()
                    ):
                        # Exllama implementation does not support row tensor parallelism with act-order, as
                        # it would require to reorder input activations that are split unto several GPUs
                        use_exllama = False
            try:
                qweight = self.get_sharded(f"{prefix}.qweight", dim=0)
            except RuntimeError:
                raise RuntimeError(
                    "Cannot load `gptq` weight, make sure the model is already quantized, or quantize it with `text-generation-server quantize ORIGINAL_MODEL_ID NEW_MODEL_ID`"
                )
            from text_generation_server.utils.layers import HAS_EXLLAMA, CAN_EXLLAMA
            if use_exllama:
                if not HAS_EXLLAMA:
                    if CAN_EXLLAMA:
                        logger.warning(
                            "Exllama GPTQ cuda kernels (which are faster) could have been used, but are not currently installed, try using BUILD_EXTENSIONS=True"
                        )
                    use_exllama = False
                else:
                    logger.info("Using exllama kernels")
            if use_exllama:
                if groupsize >= 0:
                    # Exllama reorders the weights in advance and the activations on the fly, thus
                    # the scales and zero-points do not need to be reordered.
                    qzeros = self.get_sharded(f"{prefix}.qzeros", dim=0)
                    scales = self.get_sharded(f"{prefix}.scales", dim=0)
                else:
                    qzeros = self.get_tensor(f"{prefix}.qzeros")
                    scales = self.get_tensor(f"{prefix}.scales")
                # For tp > 1, at this point we know we do not use act-order
                if self.process_group.size() == 1:
                    g_idx = self.get_tensor(f"{prefix}.g_idx")
                else:
                    g_idx = None
            else:
                # The triton kernel reorders the scales/zero points instead of the weight/activation.
                # Thus, each rank needs the full qzeros/scales.
                qzeros = self.get_tensor(f"{prefix}.qzeros")
                scales = self.get_tensor(f"{prefix}.scales")
                g_idx = self.get_sharded(f"{prefix}.g_idx", dim=0)
            weight = (qweight, qzeros, scales, g_idx, bits, groupsize, use_exllama)
        else:
            weight = self.get_sharded(f"{prefix}.weight", dim=1)
        return weight
    def _get_gptq_params(self) -> Tuple[int, int]:
        try:
            bits = self.get_tensor("gptq_bits").item()
            groupsize = self.get_tensor("gptq_groupsize").item()
        except (SafetensorError, RuntimeError) as e:
            try:
                bits = self.gptq_bits
                groupsize = self.gptq_groupsize
            except Exception:
                raise e
        return bits, groupsize
    def _set_gptq_params(self, model_id):
        filename = "config.json"
        try:
            if os.path.exists(os.path.join(model_id, filename)):
                filename = os.path.join(model_id, filename)
            else:
                filename = hf_hub_download(model_id, filename=filename)
            with open(filename, "r") as f:
                data = json.load(f)
            self.gptq_bits = data["quantization_config"]["bits"]
            self.gptq_groupsize = data["quantization_config"]["group_size"]
        except Exception:
            filename = "quantize_config.json"
            try:
                if os.path.exists(os.path.join(model_id, filename)):
                    filename = os.path.join(model_id, filename)
                else:
                    filename = hf_hub_download(model_id, filename=filename)
                with open(filename, "r") as f:
                    data = json.load(f)
                self.gptq_bits = data["bits"]
                self.gptq_groupsize = data["group_size"]
            except Exception:
                pass
--- a/my_optims/tgi_update/flash_llama.py
+++ b/my_optims/tgi_update/flash_llama.py
@ -0,0 +1,94 @@
 import torch
 import torch.distributed
 from opentelemetry import trace
 from transformers import AutoConfig, AutoTokenizer
 from transformers.models.llama import LlamaTokenizer
 from typing import Optional
 from text_generation_server.models import FlashCausalLM
 from text_generation_server.models.custom_modeling.flash_llama_modeling import (
    FlashLlamaForCausalLM,
    LlamaConfig,
 )
 from text_generation_server.utils import (
    initialize_torch_distributed,
    weight_files,
    Weights,
 )
 import os
 tracer = trace.get_tracer(__name__)
 USE_CUSTOM_NCCL = int(os.getenv("OMPI_COMM_WORLD_SIZE", "1")) > 1 and int(os.getenv("USE_CUSTOM_NCCL", "0")) == 1
 if USE_CUSTOM_NCCL:
    from text_generation_server.utils.my_dist import initialize_mpi_distributed
 class FlashLlama(FlashCausalLM):
    def __init__(
        self,
        model_id: str,
        revision: Optional[str] = None,
        quantize: Optional[str] = None,
        dtype: Optional[torch.dtype] = None,
        trust_remote_code: bool = False,
    ):
        if USE_CUSTOM_NCCL:
            self.process_group, rank, world_size, COMM = initialize_mpi_distributed()
        else:
            self.process_group, rank, world_size = initialize_torch_distributed()
        if torch.cuda.is_available():
            device = torch.device(f"cuda:{rank}")
            dtype = torch.float16 if dtype is None else dtype
        else:
            raise NotImplementedError("FlashLlama is only available on GPU")
        try:
            raise
            tokenizer = LlamaTokenizer.from_pretrained(
                model_id,
                revision=revision,
                padding_side="left",
                truncation_side="left",
                trust_remote_code=trust_remote_code,
            )
        except Exception:
            tokenizer = AutoTokenizer.from_pretrained(
                model_id,
                revision=revision,
                padding_side="left",
                truncation_side="left",
                trust_remote_code=trust_remote_code,
            )
        config = LlamaConfig.from_pretrained(
            model_id, revision=revision, trust_remote_code=trust_remote_code
        )
        config.quantize = quantize
        if USE_CUSTOM_NCCL:
            COMM.barrier()
        else:
            torch.distributed.barrier(group=self.process_group)
        filenames = weight_files(model_id, revision=revision, extension=".safetensors")
        weights = Weights(filenames, device, dtype, process_group=self.process_group)
        if config.quantize in ["gptq", "awq"]:
            weights._set_gptq_params(model_id)
        model = FlashLlamaForCausalLM(config, weights)
        if USE_CUSTOM_NCCL:
            COMM.barrier()
        else:
            torch.distributed.barrier(group=self.process_group)
        super(FlashLlama, self).__init__(
            model=model,
            tokenizer=tokenizer,
            num_layers=len(model.model.layers),
            num_kv_heads=model.model.num_key_value_heads,
            head_size=model.model.head_size,
            dtype=dtype,
            device=device,
            rank=rank,
            world_size=world_size,
        )
--- a/my_optims/tgi_update/flash_llama_modeling.py
+++ b/my_optims/tgi_update/flash_llama_modeling.py
@ -0,0 +1,513 @@
 # coding=utf-8
 # Copyright 2022 EleutherAI and the HuggingFace Inc. team. All rights reserved.
 #
 # This code is based on EleutherAI's GPT-NeoX library and the GPT-NeoX
 # and OPT implementations in this library. It has been modified from its
 # original forms to accommodate minor architectural differences compared
 # to GPT-NeoX and OPT used by the Meta AI team that trained the model.
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
 # You may obtain a copy of the License at
 #
 #     http://www.apache.org/licenses/LICENSE-2.0
 #
 # Unless required by applicable law or agreed to in writing, software
 # distributed under the License is distributed on an "AS IS" BASIS,
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
 import torch
 import torch.distributed
 from torch import nn
 from transformers.activations import ACT2FN
 from transformers.configuration_utils import PretrainedConfig
 from typing import Optional, List, Tuple
 # Flash attention imports
 import dropout_layer_norm
 from text_generation_server.utils import paged_attention, flash_attn
 from text_generation_server.utils.layers import (
    TensorParallelRowLinear,
    TensorParallelColumnLinear,
    TensorParallelEmbedding,
    PositionRotaryEmbedding,
    TensorParallelHead,
    get_linear,
 )
 class LlamaConfig(PretrainedConfig):
    def __init__(
        self,
        vocab_size=32000,
        hidden_size=4096,
        intermediate_size=11008,
        num_hidden_layers=32,
        num_attention_heads=32,
        num_key_value_heads=None,
        hidden_act="silu",
        max_position_embeddings=2048,
        initializer_range=0.02,
        rms_norm_eps=1e-6,
        use_cache=True,
        pad_token_id=0,
        bos_token_id=1,
        eos_token_id=2,
        pretraining_tp=1,
        tie_word_embeddings=False,
        rope_scaling=None,
        rope_theta=10000.0,
        **kwargs,
    ):
        self.vocab_size = vocab_size
        self.max_position_embeddings = max_position_embeddings
        self.hidden_size = hidden_size
        self.intermediate_size = intermediate_size
        self.num_hidden_layers = num_hidden_layers
        self.num_attention_heads = num_attention_heads
        # for backward compatibility
        if num_key_value_heads is None:
            num_key_value_heads = num_attention_heads
        self.num_key_value_heads = num_key_value_heads
        self.hidden_act = hidden_act
        self.initializer_range = initializer_range
        self.rms_norm_eps = rms_norm_eps
        self.pretraining_tp = pretraining_tp
        self.use_cache = use_cache
        self.rope_scaling = rope_scaling
        self.rope_theta = rope_theta
        super().__init__(
            pad_token_id=pad_token_id,
            bos_token_id=bos_token_id,
            eos_token_id=eos_token_id,
            tie_word_embeddings=tie_word_embeddings,
            **kwargs,
        )
 class LlamaRMSNorm(nn.Module):
    def __init__(self, prefix, weights, eps=1e-6):
        """
        LlamaRMSNorm is equivalent to T5LayerNorm
        """
        super().__init__()
        weight = weights.get_tensor(f"{prefix}.weight")
        self.weight = nn.Parameter(weight)
        self.variance_epsilon = eps
    def forward(self, hidden_states, residual=None):
        if hidden_states.shape[-1] > 8192:
            if residual is not None:
                hidden_states += residual
            residual = hidden_states
            hidden_states = hidden_states.to(torch.float32)
            variance = hidden_states.pow(2).mean(-1, keepdim=True)
            hidden_states = hidden_states * torch.rsqrt(
                variance + self.variance_epsilon
            )
            # convert into half-precision if necessary
            if self.weight.dtype in [torch.float16, torch.bfloat16]:
                hidden_states = hidden_states.to(self.weight.dtype)
            return self.weight * hidden_states, residual
        else:
            # faster post attention rms norm
            normed_hidden_states, res, *rest = dropout_layer_norm.dropout_add_ln_fwd(
                hidden_states,
                residual,
                self.weight,
                None,
                None,
                None,
                None,
                None,
                0.0,
                self.variance_epsilon,
                1.0,
                0,
                None,
                False,
                True,  # Activate RMSNorm
            )
            if res is None:
                res = hidden_states
            return normed_hidden_states, res
 def load_attention(config, prefix, weights):
    if config.num_attention_heads != config.num_key_value_heads:
        return _load_gqa(config, prefix, weights)
    else:
        if config.model_type == "baichuan":
            return TensorParallelColumnLinear.load_qkv(
                config,
                prefix=f"{prefix}.W_pack",
                weights=weights,
                bias=False,
            )
        else:
            return TensorParallelColumnLinear.load_multi(
                config,
                prefixes=[f"{prefix}.q_proj", f"{prefix}.k_proj", f"{prefix}.v_proj"],
                dim=0,
                weights=weights,
                bias=False,
            )
 def _load_gqa(config, prefix: str, weights):
    assert config.hidden_size % config.num_attention_heads == 0
    assert config.num_attention_heads % weights.process_group.size() == 0
    weight = weights.get_multi_weights_col(
        prefixes=[f"{prefix}.q_proj", f"{prefix}.k_proj", f"{prefix}.v_proj"],
        quantize=config.quantize,
        dim=0,
    )
    if config.quantize not in ["gptq", "awq"]:
        weight = weight.to(dtype=weights.dtype).to(device=weights.device)
        head_size = config.hidden_size // config.num_attention_heads
        num_heads = config.num_attention_heads // weights.process_group.size()
        num_key_value_heads = config.num_key_value_heads // weights.process_group.size()
        assert list(weight.shape) == [
            (num_heads + 2 * num_key_value_heads) * head_size,
            config.hidden_size,
        ], f"{list(weight.shape)} != {[(num_heads + 2 * config.num_key_value_heads) * head_size, config.hidden_size]}"
    return TensorParallelColumnLinear(
        get_linear(weight, bias=None, quantize=config.quantize)
    )
 class FlashLlamaAttention(torch.nn.Module):
    def __init__(
        self,
        prefix: str,
        config,
        weights,
    ):
        super().__init__()
        self.num_heads = config.num_attention_heads
        self.hidden_size = config.hidden_size
        self.head_size = self.hidden_size // self.num_heads
        # self.rotary_emb = PositionRotaryEmbedding.load(
        #     config=config, prefix=f"{prefix}.rotary_emb", weights=weights
        # )
        self.rotary_emb = PositionRotaryEmbedding.static(
            config=config,
            dim=self.head_size,
            base=config.rope_theta,
            device=weights.device,
        )
        self.softmax_scale = self.head_size**-0.5
        if self.num_heads % weights.process_group.size() != 0:
            raise ValueError(
                f"`num_heads` must be divisible by `num_shards` (got `num_heads`: {self.num_heads} "
                f"and `num_shards`: {weights.process_group.size()}"
            )
        self.num_heads = self.num_heads // weights.process_group.size()
        self.num_key_value_heads = (
            config.num_key_value_heads // weights.process_group.size()
        )
        self.query_key_value = load_attention(config, prefix, weights)
        self.o_proj = TensorParallelRowLinear.load(
            config,
            prefix=f"{prefix}.o_proj",
            weights=weights,
            bias=False,
        )
        self.num_groups = self.num_heads // self.num_key_value_heads
        self.kv_head_mapping = torch.arange(
            0, self.num_key_value_heads, dtype=torch.int32, device=weights.device
        ).repeat_interleave(self.num_groups)
    def forward(
        self,
        hidden_states,
        cos,
        sin,
        cu_seqlen_prefill,
        kv_cache,
        block_tables,
        slots,
        input_lengths,
        max_s,
    ):
        qkv = self.query_key_value(hidden_states)
        query, kv = qkv.split(
            [
                self.head_size * self.num_heads,
                2 * self.head_size * self.num_key_value_heads,
            ],
            dim=1,
        )
        query = query.view(-1, self.num_heads, self.head_size)
        kv = kv.view(-1, 2, self.num_key_value_heads, self.head_size)
        self.rotary_emb(query, cos, sin)
        self.rotary_emb(torch.select(kv, dim=1, index=0), cos, sin)
        paged_attention.reshape_and_cache(
            kv[:, 0], kv[:, 1], kv_cache[0], kv_cache[1], slots
        )
        # output tensor
        attn_output = torch.empty_like(query)
        # Prefill
        if cu_seqlen_prefill is not None:
            # flash attention
            flash_attn.attention(
                query,
                torch.select(kv, dim=1, index=0),
                torch.select(kv, dim=1, index=1),
                attn_output,
                cu_seqlen_prefill,
                max_s,
                self.softmax_scale,
            )
        # Decode
        else:
            paged_attention.attention(
                attn_output,
                query,
                kv_cache[0],
                kv_cache[1],
                self.kv_head_mapping,
                self.softmax_scale,
                block_tables,
                input_lengths,
                max_s,
            )
        return self.o_proj(attn_output.view(-1, self.num_heads * self.head_size))
 class LlamaMLP(nn.Module):
    def __init__(self, prefix, config, weights):
        super().__init__()
        act = config.hidden_act
        self.act = (
            ACT2FN[act]
            if "gelu" not in act
            else lambda x: torch.nn.functional.gelu(
                x,
                approximate="tanh"
                if act in ["gelu_fast", "gelu_pytorch_tanh"]
                else "none",
            )
        )
        # Fuse gate and up proj
        self.gate_up_proj = TensorParallelColumnLinear.load_multi(
            config,
            prefixes=[f"{prefix}.gate_proj", f"{prefix}.up_proj"],
            weights=weights,
            dim=0,
            bias=False,
        )
        self.down_proj = TensorParallelRowLinear.load(
            config,
            prefix=f"{prefix}.down_proj",
            weights=weights,
            bias=False,
        )
        self.intermediate_size = (
            config.intermediate_size // weights.process_group.size()
        )
    def forward(self, hidden_states):
        gate_up_states = self.gate_up_proj(hidden_states)
        gate_up_states = gate_up_states.view(-1, 2, self.intermediate_size)
        return self.down_proj(self.act(gate_up_states[:, 0]) * gate_up_states[:, 1])
 class FlashLlamaLayer(nn.Module):
    def __init__(self, layer_id, config, weights):
        super().__init__()
        prefix = f"model.layers.{layer_id}"
        self.self_attn = FlashLlamaAttention(
            prefix=f"{prefix}.self_attn", config=config, weights=weights
        )
        self.mlp = LlamaMLP(prefix=f"{prefix}.mlp", config=config, weights=weights)
        self.input_layernorm = LlamaRMSNorm(
            prefix=f"{prefix}.input_layernorm", weights=weights, eps=config.rms_norm_eps
        )
        self.post_attention_layernorm = LlamaRMSNorm(
            prefix=f"{prefix}.post_attention_layernorm",
            weights=weights,
            eps=config.rms_norm_eps,
        )
    def forward(
        self,
        hidden_states,
        residual,
        cos,
        sin,
        cu_seqlen_prefill,
        kv_cache,
        block_tables,
        slots,
        input_lengths,
        max_s,
    ):
        normed_hidden_states, res = self.input_layernorm(hidden_states, residual)
        # Self Attention
        attn_output = self.self_attn(
            normed_hidden_states,
            cos,
            sin,
            cu_seqlen_prefill,
            kv_cache,
            block_tables,
            slots,
            input_lengths,
            max_s,
        )
        # faster post attention rms norm
        normed_attn_res_output, attn_res = self.post_attention_layernorm(
            attn_output, res
        )
        mlp_output = self.mlp(normed_attn_res_output)
        return mlp_output, attn_res
 class FlashLlamaModel(torch.nn.Module):
    def __init__(self, config, weights):
        super().__init__()
        process_group = weights.process_group
        self.tp_rank = process_group.rank()
        self.tp_world_size = process_group.size()
        import os
        if int(os.getenv("USE_TP_EMBEDDING", "1")) == 1:
            self.embed_tokens = TensorParallelEmbedding(
                prefix="model.embed_tokens", weights=weights
            )
        else:
            from torch.nn import functional as F
            from loguru import logger
            embeddings = weights.get_tensor(f"model.embed_tokens.weight")
            self.embed_tokens = nn.Embedding.from_pretrained(F.pad(embeddings, (0, 0, 0, 1)),
                                                            padding_idx=config.pad_token_id)
            logger.info("Disabled embedding tensor parallel! ")
        self.layers = nn.ModuleList(
            [
                FlashLlamaLayer(
                    layer_id,
                    config,
                    weights,
                )
                for layer_id in range(config.num_hidden_layers)
            ]
        )
        self.norm = LlamaRMSNorm(
            prefix="model.norm", weights=weights, eps=config.rms_norm_eps
        )
        self.gradient_checkpointing = False
        self.head_size = self.layers[0].self_attn.head_size
        self.num_heads = self.layers[0].self_attn.num_heads
        self.num_key_value_heads = self.layers[0].self_attn.num_key_value_heads
    def forward(
        self,
        input_ids: torch.Tensor,
        position_ids: torch.Tensor,
        cu_seqlen_prefill: Optional[torch.Tensor],
        kv_cache: List[Tuple[torch.Tensor, torch.Tensor]],
        block_tables: torch.Tensor,
        slots: torch.Tensor,
        input_lengths: torch.Tensor,
        max_s: int,
    ) -> torch.Tensor:
        hidden_states = self.embed_tokens(input_ids)
        # Get rotary cos and sin for this forward
        # Avoid to index in each layer
        cos, sin = self.layers[0].self_attn.rotary_emb.get_cos_sin(
            position_ids, max_s, hidden_states.dtype
        )
        residual = None
        for i, layer in enumerate(self.layers):
            hidden_states, residual = layer(
                hidden_states,
                residual,
                cos,
                sin,
                cu_seqlen_prefill,
                kv_cache[i],
                block_tables,
                slots,
                input_lengths,
                max_s,
            )
        hidden_states, _ = self.norm(hidden_states, residual)
        return hidden_states
 class FlashLlamaForCausalLM(torch.nn.Module):
    def __init__(self, config, weights):
        super().__init__()
        self.model = FlashLlamaModel(config, weights)
        self.lm_head = TensorParallelHead.load(
            config,
            prefix="lm_head",
            weights=weights,
        )
    def forward(
        self,
        input_ids: torch.Tensor,
        position_ids: torch.Tensor,
        cu_seqlen_prefill: Optional[torch.Tensor],
        kv_cache: List[Tuple[torch.Tensor, torch.Tensor]],
        block_tables: torch.Tensor,
        slots: torch.Tensor,
        input_lengths: torch.Tensor,
        max_s: int,
        lm_head_indices: Optional[torch.Tensor] = None,
    ) -> torch.Tensor:
        hidden_states = self.model(
            input_ids,
            position_ids,
            cu_seqlen_prefill,
            kv_cache,
            block_tables,
            slots,
            input_lengths,
            max_s,
        )
        if lm_head_indices is not None:
            hidden_states = hidden_states[lm_head_indices]
        logits = self.lm_head(hidden_states)
        return logits
--- a/my_optims/tgi_update/layers.py
+++ b/my_optims/tgi_update/layers.py
@ -0,0 +1,815 @@
 import os
 import torch
 from torch import nn
 from torch.nn import functional as F
 from typing import List
 from loguru import logger
 from functools import lru_cache
 HAS_BITS_AND_BYTES = True
 try:
    import bitsandbytes as bnb
    from bitsandbytes.nn import Int8Params, Params4bit
 except ImportError:
    HAS_BITS_AND_BYTES = False
 from accelerate import init_empty_weights
 from text_generation_server.utils.gptq.quant_linear import QuantLinear
 HAS_AWQ = True
 try:
    from text_generation_server.utils.awq.quantize.qmodule import WQLinear
 except ImportError:
    HAS_AWQ = False
 try:
    major, _minor = torch.cuda.get_device_capability()
 except Exception:
    major = 1
 HAS_EXLLAMA = False
 CAN_EXLLAMA = major >= 8
 if os.getenv("DISABLE_EXLLAMA") == "True":
    HAS_EXLLAMA = False
 elif CAN_EXLLAMA:
    try:
        from text_generation_server.utils.gptq.exllama import Ex4bitLinear
        HAS_EXLLAMA = True
    except ImportError:
        pass
 from typing import Optional
 HAS_EETQ = False
 try:
    from EETQ import quant_weights, w8_a16_gemm
    HAS_EETQ = True
 except ImportError:
    pass
 import my_custom_comm
 USE_CUSTOM_NCCL = int(os.getenv("OMPI_COMM_WORLD_SIZE", "1")) > 1 and int(os.getenv("USE_CUSTOM_NCCL", "0")) == 1
 USE_LM_HEAD_PARALLEL = int(os.getenv("USE_LM_HEAD_PARALLEL", "1"))
 # Monkey patching
@classmethod
 def load_layer_norm(cls, prefix, weights, eps):
    weight = weights.get_tensor(f"{prefix}.weight")
    bias = weights.get_tensor(f"{prefix}.bias")
    with init_empty_weights():
        ln = cls(weight.shape, eps=eps)
    ln.weight = nn.Parameter(weight)
    ln.bias = nn.Parameter(bias)
    return ln
@classmethod
 def load_layer_norm_no_bias(cls, prefix, weights, eps):
    weight = weights.get_tensor(f"{prefix}.weight")
    with init_empty_weights():
        ln = cls(weight.shape, eps=eps)
    ln.weight = nn.Parameter(weight)
    ln.bias = None
    return ln
@classmethod
 def load_conv2d(cls, prefix, weights, in_channels, out_channels, kernel_size, stride):
    weight = weights.get_tensor(f"{prefix}.weight")
    bias = weights.get_tensor(f"{prefix}.bias")
    with init_empty_weights():
        conv2d = cls(
            in_channels=in_channels,
            out_channels=out_channels,
            kernel_size=kernel_size,
            stride=stride,
        )
    conv2d.weight = nn.Parameter(weight)
    conv2d.bias = nn.Parameter(bias)
    return conv2d
@classmethod
 def load_conv2d_no_bias(
    cls, prefix, weights, in_channels, out_channels, kernel_size, stride
 ):
    weight = weights.get_tensor(f"{prefix}.weight")
    with init_empty_weights():
        conv2d = cls(
            in_channels=in_channels,
            out_channels=out_channels,
            kernel_size=kernel_size,
            stride=stride,
        )
    conv2d.weight = nn.Parameter(weight)
    conv2d.bias = None
    return conv2d
 torch.nn.Conv2d.load = load_conv2d
 torch.nn.Conv2d.load_no_bias = load_conv2d_no_bias
 torch.nn.LayerNorm.load = load_layer_norm
 torch.nn.LayerNorm.load_no_bias = load_layer_norm_no_bias
 class FastLinear(nn.Module):
    def __init__(
        self,
        weight,
        bias,
    ) -> None:
        super().__init__()
        self.weight = nn.Parameter(weight)
        if bias is not None:
            self.bias = nn.Parameter(bias)
        else:
            self.bias = None
    @classmethod
    def load(cls, config, prefix: str, weights, bias: bool):
        weight = weights.get_tensor(f"{prefix}.weight")
        if bias:
            bias = weights.get_tensor(f"{prefix}.bias")
        else:
            bias = None
        return cls(weight, bias)
    def forward(self, input: torch.Tensor) -> torch.Tensor:
        return F.linear(input, self.weight, self.bias)
 class EETQLinear(nn.Module):
    def __init__(
        self,
        weight,
        bias,
    ) -> None:
        super().__init__()
        device = weight.device
        weight = torch.t(weight).contiguous().cpu()
        weight, scale = quant_weights(weight, torch.int8, False)
        self.weight = weight.cuda(device)
        self.scale = scale.cuda(device)
        self.bias = bias.cuda(device) if bias is not None else None
    def forward(self, input: torch.Tensor) -> torch.Tensor:
        output = w8_a16_gemm(input, self.weight, self.scale)
        output = output + self.bias if self.bias is not None else output
        return output
 class Linear8bitLt(nn.Module):
    def __init__(
        self,
        weight,
        bias,
        has_fp16_weights=True,
        memory_efficient_backward=False,
        threshold=0.0,
        index=None,
    ):
        super().__init__()
        assert (
            not memory_efficient_backward
        ), "memory_efficient_backward is no longer required and the argument is deprecated in 0.37.0 and will be removed in 0.39.0"
        self.state = bnb.MatmulLtState()
        self.index = index
        # Necessary for stacked layers
        self.state.threshold = threshold
        self.state.has_fp16_weights = has_fp16_weights
        self.state.memory_efficient_backward = memory_efficient_backward
        if threshold > 0.0 and not has_fp16_weights:
            self.state.use_pool = True
        self.weight = Int8Params(
            weight.data,
            has_fp16_weights=has_fp16_weights,
            requires_grad=has_fp16_weights,
        )
        self.weight.cuda(weight.device)
        self.bias = bias
    def init_8bit_state(self):
        self.state.CB = self.weight.CB
        self.state.SCB = self.weight.SCB
        self.weight.CB = None
        self.weight.SCB = None
    def forward(self, x: torch.Tensor):
        self.state.is_training = self.training
        if self.weight.CB is not None:
            self.init_8bit_state()
        # weights are cast automatically as Int8Params, but the bias has to be cast manually
        if self.bias is not None and self.bias.dtype != x.dtype:
            self.bias.data = self.bias.data.to(x.dtype)
        out = bnb.matmul(x, self.weight, bias=self.bias, state=self.state)
        if not self.state.has_fp16_weights:
            if self.state.CB is not None and self.state.CxB is not None:
                # we converted 8-bit row major to turing/ampere format in the first inference pass
                # we no longer need the row-major weight
                del self.state.CB
                self.weight.data = self.state.CxB
        return out
 class Linear4bit(nn.Module):
    def __init__(self, weight, bias, quant_type):
        super().__init__()
        self.weight = Params4bit(
            weight.data,
            requires_grad=False,
            compress_statistics=True,
            quant_type=quant_type,
        )
        self.compute_dtype = None
        self.weight.cuda(weight.device)
        self.bias = bias
    def forward(self, x: torch.Tensor):
        # weights are cast automatically as Int8Params, but the bias has to be cast manually
        if self.bias is not None and self.bias.dtype != x.dtype:
            self.bias.data = self.bias.data.to(x.dtype)
        if getattr(self.weight, "quant_state", None) is None:
            print(
                "FP4 quantization state not initialized. Please call .cuda() or .to(device) on the LinearFP4 layer first."
            )
        inp_dtype = x.dtype
        if self.compute_dtype is not None:
            x = x.to(self.compute_dtype)
        bias = None if self.bias is None else self.bias.to(self.compute_dtype)
        out = bnb.matmul_4bit(
            x, self.weight.t(), bias=bias, quant_state=self.weight.quant_state
        )
        out = out.to(inp_dtype)
        return out
@lru_cache(1)
 def warn_deprecate_bnb():
    logger.warning(
        "Bitsandbytes 8bit is deprecated, using `eetq` is a drop-in replacement, and has much better performnce"
    )
 def get_linear(weight, bias, quantize):
    if quantize is None:
        linear = FastLinear(weight, bias)
    elif quantize == "eetq":
        if HAS_EETQ:
            linear = EETQLinear(weight, bias)
        else:
            raise ImportError(
                "Please install EETQ from https://github.com/NetEase-FuXi/EETQ"
            )
    elif quantize == "bitsandbytes":
        warn_deprecate_bnb()
        linear = Linear8bitLt(
            weight,
            bias,
            has_fp16_weights=False,
            threshold=6.0,
        )
        if bias is not None:
            linear.bias = nn.Parameter(bias)
    elif quantize == "bitsandbytes-fp4":
        linear = Linear4bit(
            weight,
            bias,
            quant_type="fp4",
        )
    elif quantize == "bitsandbytes-nf4":
        linear = Linear4bit(
            weight,
            bias,
            quant_type="nf4",
        )
    elif quantize == "gptq":
        try:
            qweight, qzeros, scales, g_idx, bits, groupsize, use_exllama = weight
        except Exception:
            raise NotImplementedError(
                f"The passed weight is not `gptq` compatible, loader needs to be updated."
            )
        if use_exllama:
            linear = Ex4bitLinear(qweight, qzeros, scales, g_idx, bias, bits, groupsize)
        else:
            linear = QuantLinear(
                qweight,
                qzeros,
                scales,
                g_idx,
                bias,
                bits,
                groupsize,
            )
    elif quantize == "awq":
        try:
            qweight, qzeros, scales, _, bits, groupsize, _ = weight
        except Exception:
            raise NotImplementedError(
                f"The passed weight is not `awq` compatible, loader needs to be updated."
            )
        linear = WQLinear(
            w_bit=bits,
            group_size=groupsize,
            qweight=qweight,
            qzeros=qzeros,
            scales=scales,
            bias=bias is not None,
        )
    else:
        raise NotImplementedError(f"Quantization `{quantize}` is not implemented yet.")
    return linear
 class SuperLayer(nn.Module):
    def __init__(self, linear):
        super().__init__()
        self.linear = linear
    def forward(self, x):
        return self.linear.forward(x)
 class TensorParallelHead(SuperLayer):
    def __init__(self, linear, process_group, should_gather: bool):
        super().__init__(linear)
        self.process_group = process_group
        self.should_gather = should_gather
    @staticmethod
    def load(config, prefix: str, weights):
        if weights.process_group.size() > 1:
            try:
                assert USE_CUSTOM_NCCL == 0 and USE_LM_HEAD_PARALLEL == 1
                weight = weights.get_sharded(f"{prefix}.weight", dim=0)
                should_gather = True
            except AssertionError:
                # If the vocab size is not divisible by number of shards
                # just load the entire thing.
                weight = weights.get_tensor(f"{prefix}.weight")
                should_gather = False
                logger.info("Disabled lm head parallel! ")
        else:
            weight = weights.get_tensor(f"{prefix}.weight")
            should_gather = False
        # GPTQ,AWQ,EETQ don't quantize heads (nor embeddings)
        if config.quantize in ["gptq", "awq", "eetq"]:
            quantize = None
        else:
            quantize = config.quantize
        return TensorParallelHead(
            get_linear(weight, bias=None, quantize=quantize),
            process_group=weights.process_group,
            should_gather=should_gather,
        )
    def forward(self, input: torch.Tensor) -> torch.Tensor:
        if not self.should_gather:
            return super().forward(input)
        world_size = self.process_group.size()
        if len(input.shape) == 2 and isinstance(self.linear, FastLinear):
            out_dim = self.linear.weight.shape[0]
            if input.shape[0] == 1:
                world_out = input.new_empty(1, out_dim * world_size)
                local_out = input.new_empty(1, out_dim)
                gather_input = local_out
            else:
                world_out = input.new_empty(out_dim * world_size, input.shape[0])
                gather_input = input.new_empty(out_dim, input.shape[0])
                local_out = gather_input.T
            torch.mm(input, self.linear.weight.T, out=local_out)
            torch.distributed.all_gather_into_tensor(
                world_out, gather_input, group=self.process_group
            )
            if input.shape[0] == 1:
                return world_out
            return world_out.T
        output = super().forward(input)
        world_output = [
            torch.empty_like(output) for _ in range(self.process_group.size())
        ]
        torch.distributed.all_gather(world_output, output, group=self.process_group)
        world_output = torch.cat(world_output, dim=-1)
        return world_output
 class TensorParallelColumnLinear(SuperLayer):
    @classmethod
    def load_qkv(cls, config, prefix: str, weights, bias: bool):
        """Specific method when the QKV was joined after the fact"""
        weight = weights.get_weights_col_packed_qkv(prefix, quantize=config.quantize)
        if bias:
            raise NotImplementedError("packed_qkv only implemented for baichuan")
        else:
            bias = None
        linear = get_linear(weight, bias, config.quantize)
        return cls(linear)
    @classmethod
    def load(cls, config, prefix: str, weights, bias: bool):
        return cls.load_multi(config, [prefix], weights, bias, dim=0)
    @classmethod
    def load_multi(cls, config, prefixes: List[str], weights, bias: bool, dim: int):
        weight = weights.get_multi_weights_col(
            prefixes, quantize=config.quantize, dim=dim
        )
        if bias:
            b = [weights.get_sharded(f"{p}.bias", dim=0) for p in prefixes]
            bias = torch.cat(b, dim=dim)
        else:
            bias = None
        linear = get_linear(weight, bias, config.quantize)
        return cls(linear)
 class TensorParallelRowLinear(SuperLayer):
    def __init__(self, linear, process_group):
        super().__init__(linear)
        self.process_group = process_group
    @classmethod
    def load(cls, config, prefix: str, weights, bias: bool):
        weight = weights.get_multi_weights_row(prefix, quantize=config.quantize)
        if bias and weights.process_group.rank() == 0:
            # Rank is only on the first rank process
            bias = weights.get_tensor(f"{prefix}.bias")
        else:
            bias = None
        return cls(
            get_linear(weight, bias, config.quantize),
            process_group=weights.process_group,
        )
    def forward(self, input: torch.Tensor) -> torch.Tensor:
        out = super().forward(input)
        if self.process_group.size() > 1:
            if USE_CUSTOM_NCCL:
                my_custom_comm.custom_allreduce(out, self.process_group.tp_comm)
            else:
                torch.distributed.all_reduce(out, group=self.process_group)
        return out
 class TensorParallelEmbedding(nn.Module):
    def __init__(self, prefix: str, weights, reduce=True):
        super().__init__()
        weight = weights.get_partial_sharded(f"{prefix}.weight", dim=0)
        num_embeddings = weights.get_shape(f"{prefix}.weight")[0]
        process_group = weights.process_group
        world_size = process_group.size()
        rank = process_group.rank()
        block_size = num_embeddings // world_size
        self.min_id = rank * block_size
        self.max_id = min(num_embeddings, (rank + 1) * block_size)
        self.null_idx = block_size
        self.process_group = weights.process_group
        self.reduce = reduce
        """Additional 0 entry used for masking"""
        self.weight = nn.Parameter(F.pad(weight, (0, 0, 0, 1)))
    def forward(self, input: torch.Tensor) -> torch.Tensor:
        # default all out of bounds values to `self.null_idx` that will then be mapped to 0
        # translate for [0, self.max_id - self.min_id[
        input = torch.where(
            (self.min_id > input) | (input >= self.max_id),
            self.null_idx,
            input - self.min_id,
        )
        out = torch.nn.functional.embedding(input, self.weight)
        if self.reduce and self.process_group.size() > 1:
            if USE_CUSTOM_NCCL:
                my_custom_comm.custom_allreduce(out, self.process_group.tp_comm)
            else:
                torch.distributed.all_reduce(out, group=self.process_group)
        return out
 try:
    import dropout_layer_norm
    class FastLayerNorm(nn.LayerNorm):
        def forward(self, hidden_states, residual=None):
            if hidden_states.shape[-1] > 8192:
                if residual is not None:
                    hidden_states += residual
                residual = hidden_states
                return super(FastLayerNorm, self).forward(hidden_states), residual
            else:
                (
                    normed_hidden_states,
                    residual,
                    *rest,
                ) = dropout_layer_norm.dropout_add_ln_fwd(
                    hidden_states,
                    residual,
                    self.weight,
                    self.bias,
                    None,
                    None,
                    None,
                    None,
                    0.0,
                    self.eps,
                    1.0,
                    0,
                    None,
                    False,
                    False,
                )
                if residual is None:
                    residual = hidden_states
                return normed_hidden_states, residual
 except ImportError:
    pass
 try:
    from flash_attn.layers.rotary import RotaryEmbedding
    import rotary_emb
    def _create_inv_freq(dim, base, device):
        inv_freq = 1.0 / (
            base ** (torch.arange(0, dim, 2, device=device, dtype=torch.float32) / dim)
        )
        return inv_freq
    def _get_rope_config(config):
        if os.getenv("ROPE_SCALING", None) is not None:
            rope_scaling = {
                "type": os.environ["ROPE_SCALING"],
                "factor": float(os.environ["ROPE_FACTOR"]),
            }
            return rope_scaling
        return getattr(config, "rope_scaling", None)
    class PositionRotaryEmbedding(nn.Module):
        def __init__(self, inv_freq, scaling_factor):
            super().__init__()
            self.inv_freq = inv_freq
            self._seq_len_cached = 0
            self._cos_cached = None
            self._sin_cached = None
            self._cos_k_cached = None
            self._sin_k_cached = None
            self.scaling_factor = scaling_factor
            self.dynamic_args = None
        @classmethod
        def static(cls, config, dim, base, device):
            inv_freq = _create_inv_freq(dim, base, device)
            scaling_factor = None
            rope_scaling = _get_rope_config(config)
            if rope_scaling is not None:
                scaling_factor = rope_scaling["factor"]
                if rope_scaling["type"] == "linear":
                    pass
                elif rope_scaling["type"] == "dynamic":
                    return DynamicPositionRotaryEmbedding(
                        dim=dim,
                        max_position_embeddings=config.max_position_embeddings,
                        base=base,
                        device=inv_freq.device,
                        scaling_factor=scaling_factor,
                    )
                elif rope_scaling["type"] == "yarn":
                    return YarnPositionRotaryEmbedding(
                        dim=2 * inv_freq.shape[0],
                        max_position_embeddings=rope_scaling["original_max_position_embeddings"],
                        base=10000.0,
                        device=inv_freq.device,
                        scaling_factor=scaling_factor,
                        extrapolation_factor=1,
                        attn_factor=1,
                        beta_fast=32,
                        beta_slow=1
                    )
                else:
                    raise NotImplementedError(
                        f"rope scaling type {rope_scaling['type']} is not implemented or invalid"
                    )
            return cls(inv_freq, scaling_factor)
        @classmethod
        def load(cls, config, prefix, weights):
            # XXX: Always load this in float32 !
            dtype = weights.dtype
            weights.dtype = torch.float32
            inv_freq = weights.get_tensor(f"{prefix}.inv_freq")
            weights.dtype = dtype
            scaling_factor = None
            rope_scaling = _get_rope_config(config)
            if rope_scaling is not None:
                scaling_factor = rope_scaling["factor"]
                if rope_scaling["type"] == "linear":
                    pass
                elif rope_scaling["type"] == "dynamic":
                    return DynamicPositionRotaryEmbedding(
                        dim=2 * inv_freq.shape[0],
                        max_position_embeddings=config.max_position_embeddings,
                        base=10000.0,
                        device=inv_freq.device,
                        scaling_factor=scaling_factor,
                    )
                elif rope_scaling["type"] == "yarn":
                    return YarnPositionRotaryEmbedding(
                        dim=2 * inv_freq.shape[0],
                        max_position_embeddings=rope_scaling["original_max_position_embeddings"],
                        base=10000.0,
                        device=inv_freq.device,
                        scaling_factor=scaling_factor,
                        extrapolation_factor=1,
                        attn_factor=1,
                        beta_fast=32,
                        beta_slow=1
                    )
                else:
                    raise NotImplementedError(
                        f"rope scaling type {rope_scaling['type']} is not implemented or invalid"
                    )
            return cls(inv_freq, scaling_factor)
        def _update_cos_sin_cache(self, dtype, device, seqlen):
            # Reset the tables if the sequence length has changed,
            # or if we're on a new device (possibly due to tracing for instance)
            if (
                seqlen > self._seq_len_cached
                or self._cos_cached.device != device
                or self._cos_cached.dtype != dtype
            ):
                self._seq_len_cached = seqlen
                t = torch.arange(seqlen, device=device, dtype=self.inv_freq.dtype)
                if self.scaling_factor is not None:
                    t /= self.scaling_factor
                # Don't do einsum, it converts fp32 to fp16
                # freqs = torch.einsum("i,j->ij", t, self.inv_freq)
                freqs = torch.outer(t, self.inv_freq.to(device=t.device))
                self._cos_cached = torch.cos(freqs).to(dtype)
                self._sin_cached = torch.sin(freqs).to(dtype)
        def get_cos_sin(
            self, position_ids: torch.Tensor, max_s: int, dtype: torch.dtype
        ):
            """
            Return cos and sin for the asked position ids
            """
            self._update_cos_sin_cache(dtype, position_ids.device, max_s)
            cos = torch.index_select(self._cos_cached, 0, position_ids)
            sin = torch.index_select(self._sin_cached, 0, position_ids)
            return cos.unsqueeze(1), sin.unsqueeze(1)
        def forward(self, x: torch.Tensor, cos: torch.Tensor, sin: torch.Tensor):
            rotary_dim = cos.shape[-1]
            x1 = x[..., :rotary_dim]
            x2 = x[..., rotary_dim : 2 * rotary_dim]
            rotary_emb.apply_rotary(x1, x2, cos, sin, x1, x2, False)
            return x
    class DynamicPositionRotaryEmbedding(PositionRotaryEmbedding):
        def __init__(self, dim, max_position_embeddings, base, device, scaling_factor):
            inv_freq = _create_inv_freq(dim, base, device)
            super().__init__(inv_freq, scaling_factor)
            self.dim = dim
            self.max_position_embeddings = max_position_embeddings
            self.base = base
        def _update_cos_sin_cache(self, dtype, device, seqlen):
            # Reset the tables if the sequence length has changed,
            # or if we're on a new device (possibly due to tracing for instance)
            if (
                seqlen > self._seq_len_cached
                or self._cos_cached.device != device
                or self._cos_cached.dtype != dtype
            ):
                if seqlen > self.max_position_embeddings:
                    newbase = self.base * (
                        (self.scaling_factor * seqlen / self.max_position_embeddings)
                        - (self.scaling_factor - 1)
                    ) ** (self.dim / (self.dim - 2))
                    self.inv_freq = _create_inv_freq(
                        self.dim, newbase, self.inv_freq.device
                    )
                self._seq_len_cached = seqlen
                t = torch.arange(seqlen, device=device, dtype=self.inv_freq.dtype)
                # Don't do einsum, it converts fp32 to fp16
                # freqs = torch.einsum("i,j->ij", t, self.inv_freq)
                freqs = torch.outer(t, self.inv_freq.to(device=t.device))
                self._cos_cached = torch.cos(freqs).to(dtype)
                self._sin_cached = torch.sin(freqs).to(dtype)
    # Inverse dim formula to find dim based on number of rotations
    import math
    def find_correction_dim(num_rotations, dim, base=10000, max_position_embeddings=2048):
        return (dim * math.log(max_position_embeddings/(num_rotations * 2 * math.pi)))/(2 * math.log(base))
    # Find dim range bounds based on rotations
    def find_correction_range(low_rot, high_rot, dim, base=10000, max_position_embeddings=2048):
        low = math.floor(find_correction_dim(
            low_rot, dim, base, max_position_embeddings))
        high = math.ceil(find_correction_dim(
            high_rot, dim, base, max_position_embeddings))
        return max(low, 0), min(high, dim-1)  # Clamp values just in case
    def linear_ramp_mask(min, max, dim):
        if min == max:
            max += 0.001  # Prevent singularity
        linear_func = (torch.arange(dim, dtype=torch.float32) - min) / (max - min)
        ramp_func = torch.clamp(linear_func, 0, 1)
        return ramp_func
    def get_mscale(scale=1):
        if scale <= 1:
            return 1.0
        return 0.1 * math.log(scale) + 1.0
    class YarnPositionRotaryEmbedding(PositionRotaryEmbedding):
        def __init__(self, dim, max_position_embeddings, base, device, scaling_factor,*, extrapolation_factor, attn_factor, beta_fast, beta_slow):
            inv_freq = _create_inv_freq(dim, base, device)
            super().__init__(inv_freq, scaling_factor)
            self.dim = dim
            self.max_position_embeddings = max_position_embeddings
            self.base = base
            self.extrapolation_factor = extrapolation_factor
            self.attn_factor = attn_factor
            self.beta_fast = beta_fast
            self.beta_slow = beta_slow
            self.mscale = float(get_mscale(self.scaling_factor) * self.attn_factor) # Get n-d magnitude scaling corrected for interpolation
        def _update_cos_sin_cache(self, dtype, device, seqlen):
            # Reset the tables if the sequence length has changed,
            # or if we're on a new device (possibly due to tracing for instance)
            if (
                seqlen > self._seq_len_cached
                or self._cos_cached.device != device
                or self._cos_cached.dtype != dtype
            ):
                if seqlen > self.max_position_embeddings:
                    inv_freq_extrapolation = _create_inv_freq(
                        self.dim, self.base, self.inv_freq.device
                    )
                    freqs = 1.0 / inv_freq_extrapolation
                    inv_freq_interpolation = 1.0 / (self.scaling_factor * freqs)
                    low, high = find_correction_range(self.beta_fast, self.beta_slow, self.dim, self.base, self.max_position_embeddings)
                    inv_freq_mask = (1 - linear_ramp_mask(low, high, self.dim // 2).float().to(device)) * self.extrapolation_factor # Get n-d rotational scaling corrected for extrapolation
                    inv_freq = inv_freq_interpolation * (1 - inv_freq_mask) + inv_freq_extrapolation * inv_freq_mask
                    self.inv_freq = inv_freq
                    self.mscale = float(get_mscale(self.scaling_factor) * self.attn_factor) # Get n-d magnitude scaling corrected for interpolation
                self._seq_len_cached = seqlen
                t = torch.arange(seqlen, device=device, dtype=self.inv_freq.dtype)
                # Don't do einsum, it converts fp32 to fp16
                # freqs = torch.einsum("i,j->ij", t, self.inv_freq)
                freqs = torch.outer(t, self.inv_freq.to(device=t.device))
                self._cos_cached = (torch.cos(freqs) * self.mscale).to(dtype)
                self._sin_cached = (torch.sin(freqs) * self.mscale).to(dtype)
 except ImportError:
    pass
--- a/my_optims/tgi_update/main.rs
+++ b/my_optims/tgi_update/main.rs
--- a/my_optims/tgi_update/my_dist.py
+++ b/my_optims/tgi_update/my_dist.py
@ -0,0 +1,47 @@
 import os
 import torch
 from datetime import timedelta
 from loguru import logger
 from mpi4py import MPI
 import my_custom_comm
 # Tensor Parallelism settings
 RANK = int(os.getenv("OMPI_COMM_WORLD_RANK", "0"))
 WORLD_SIZE = int(os.getenv("OMPI_COMM_WORLD_SIZE", "1"))
 # CUDA memory fraction
 MEMORY_FRACTION = float(os.getenv("CUDA_MEMORY_FRACTION", "1.0"))
 class MyCommGroup:
    def __init__(self, rank, size, tp_comm, pp_comm):
        self._rank = rank
        self._size = size
        self.tp_comm = tp_comm
        self.pp_comm = pp_comm
    def size(self):
        return self._size
    def rank(self):
        return self._rank
 def initialize_mpi_distributed():
    assert torch.cuda.is_available()
    # mpi initialize
    COMM = MPI.COMM_WORLD
    assert COMM.Get_size() == WORLD_SIZE, f"{COMM.Get_size()},{WORLD_SIZE}"
    # Set the device id.
    assert WORLD_SIZE <= torch.cuda.device_count(), "Each process is one gpu"
    device = RANK % torch.cuda.device_count()
    torch.cuda.set_device(device)
    torch.cuda.set_per_process_memory_fraction(MEMORY_FRACTION, device)
    # nccl initialize
    tp_comm, pp_comm = my_custom_comm.init_nccl(WORLD_SIZE, 1)
    process_group = MyCommGroup(RANK, WORLD_SIZE, tp_comm, pp_comm)
    logger.warning("custom mpi and nccl is already initialized.")
    return process_group, RANK, WORLD_SIZE, COMM
--- a/my_optims/tgi_update/server.py
+++ b/my_optims/tgi_update/server.py
@ -0,0 +1,217 @@
 import asyncio
 import os
 import torch
 from grpc import aio
 from loguru import logger
 from grpc_reflection.v1alpha import reflection
 from pathlib import Path
 from typing import List, Optional
 from text_generation_server.cache import Cache
 from text_generation_server.interceptor import ExceptionInterceptor
 from text_generation_server.models import Model, get_model
 from text_generation_server.pb import generate_pb2_grpc, generate_pb2
 from text_generation_server.tracing import UDSOpenTelemetryAioServerInterceptor
 from text_generation_server.models.idefics_causal_lm import IdeficsCausalLMBatch
 class TextGenerationService(generate_pb2_grpc.TextGenerationServiceServicer):
    def __init__(self, model: Model, cache: Cache, server_urls: List[str]):
        self.cache = cache
        self.model = model
        self.server_urls = server_urls
        # For some reason, inference_mode does not work well with GLOO which we use on CPU
        if model.device.type == "cuda":
            # Force inference mode for the lifetime of TextGenerationService
            self._inference_mode_raii_guard = torch._C._InferenceMode(True)
    async def Info(self, request, context):
        return self.model.info
    async def Health(self, request, context):
        if self.model.device.type == "cuda":
            torch.zeros((2, 2)).cuda()
        return generate_pb2.HealthResponse()
    async def ServiceDiscovery(self, request, context):
        return generate_pb2.ServiceDiscoveryResponse(urls=self.server_urls)
    async def ClearCache(self, request, context):
        if request.HasField("id"):
            self.cache.delete(request.id)
        else:
            self.cache.clear()
        return generate_pb2.ClearCacheResponse()
    async def FilterBatch(self, request, context):
        batch = self.cache.pop(request.batch_id)
        if batch is None:
            raise ValueError(f"Batch ID {request.batch_id} not found in cache.")
        filtered_batch = batch.filter(request.request_ids)
        self.cache.set(filtered_batch)
        return generate_pb2.FilterBatchResponse(batch=filtered_batch.to_pb())
    async def Warmup(self, request, context):
        if (
            self.model.batch_type == IdeficsCausalLMBatch
        ):  # Hack, i would rather use kwargs in the `from_pb` call
            batch = self.model.batch_type.from_pb(
                request.batch,
                self.model.tokenizer,
                self.model.processor,
                self.model.dtype,
                self.model.device,
            )
        else:
            batch = self.model.batch_type.from_pb(
                request.batch, self.model.tokenizer, self.model.dtype, self.model.device
            )
        max_supported_total_tokens = self.model.warmup(batch)
        return generate_pb2.WarmupResponse(
            max_supported_total_tokens=max_supported_total_tokens
        )
    async def Prefill(self, request, context):
        if (
            self.model.batch_type == IdeficsCausalLMBatch
        ):  # Hack, i would rather use kwargs in the `from_pb` call
            batch = self.model.batch_type.from_pb(
                request.batch,
                self.model.tokenizer,
                self.model.processor,
                self.model.dtype,
                self.model.device,
            )
        else:
            batch = self.model.batch_type.from_pb(
                request.batch, self.model.tokenizer, self.model.dtype, self.model.device
            )
        generations, next_batch = self.model.generate_token(batch)
        self.cache.set(next_batch)
        return generate_pb2.PrefillResponse(
            generations=[generation.to_pb() for generation in generations],
            batch=next_batch.to_pb() if next_batch else None,
        )
    async def Decode(self, request, context):
        if len(request.batches) == 0:
            raise ValueError("Must provide at least one batch")
        batches = []
        for batch_pb in request.batches:
            batch = self.cache.pop(batch_pb.id)
            if batch is None:
                raise ValueError(f"Batch ID {batch_pb.id} not found in cache.")
            batches.append(batch)
        if len(batches) == 0:
            raise ValueError("All batches are empty")
        if len(batches) > 1:
            batch = self.model.batch_type.concatenate(batches)
        else:
            batch = batches[0]
        generations, next_batch = self.model.generate_token(batch)
        self.cache.set(next_batch)
        return generate_pb2.DecodeResponse(
            generations=[generation.to_pb() for generation in generations],
            batch=next_batch.to_pb() if next_batch else None,
        )
 def serve(
    model_id: str,
    revision: Optional[str],
    sharded: bool,
    quantize: Optional[str],
    dtype: Optional[str],
    trust_remote_code: bool,
    uds_path: Path,
 ):
    async def serve_inner(
        model_id: str,
        revision: Optional[str],
        sharded: bool = False,
        quantize: Optional[str] = None,
        dtype: Optional[str] = None,
        trust_remote_code: bool = False,
    ):
        logger.info(os.environ)
        unix_socket_template = "unix://{}-{}"
        if sharded:
            server_urls = [
                unix_socket_template.format(uds_path, rank)
                for rank in range(int(os.environ["WORLD_SIZE"]))
            ]
            local_url = server_urls[int(os.environ["RANK"])]
        else:
            local_url = unix_socket_template.format(uds_path, 0)
            server_urls = [local_url]
        if int(os.environ.get("USE_CUSTOM_NCCL", 0)):
            server_urls = [
                unix_socket_template.format(uds_path, rank)
                for rank in range(int(os.environ["OMPI_COMM_WORLD_SIZE"]))
            ]
            local_url = server_urls[int(os.environ["OMPI_COMM_WORLD_RANK"])]
        try:
            model = get_model(
                model_id, revision, sharded, quantize, dtype, trust_remote_code
            )
        except Exception:
            logger.exception("Error when initializing model")
            raise
        if quantize == "gptq":
            try:
                # When using GPTQ, Exllama kernels need some global kernels
                # For which we have the finale shapes only after the model has loaded
                # This will allocate those buffers.
                from text_generation_server.utils.gptq.exllama import (
                    create_exllama_buffers,
                    set_device,
                )
                set_device(model.device)
                create_exllama_buffers()
            except ImportError:
                pass
        server = aio.server(
            interceptors=[
                ExceptionInterceptor(),
                UDSOpenTelemetryAioServerInterceptor(),
            ]
        )
        generate_pb2_grpc.add_TextGenerationServiceServicer_to_server(
            TextGenerationService(model, Cache(), server_urls), server
        )
        SERVICE_NAMES = (
            generate_pb2.DESCRIPTOR.services_by_name["TextGenerationService"].full_name,
            reflection.SERVICE_NAME,
        )
        reflection.enable_server_reflection(SERVICE_NAMES, server)
        server.add_insecure_port(local_url)
        await server.start()
        logger.info("Server started at {}".format(local_url))
        try:
            await server.wait_for_termination()
        except KeyboardInterrupt:
            logger.info("Signal received. Shutting down")
            await server.stop(0)
    asyncio.run(
        serve_inner(model_id, revision, sharded, quantize, dtype, trust_remote_code)
    )
--- a/my_optims/tp_optims/flash_llama_modeling.py
+++ b/my_optims/tp_optims/flash_llama_modeling.py
@ -0,0 +1,467 @@
 # coding=utf-8
 # Copyright 2022 EleutherAI and the HuggingFace Inc. team. All rights reserved.
 #
 # This code is based on EleutherAI's GPT-NeoX library and the GPT-NeoX
 # and OPT implementations in this library. It has been modified from its
 # original forms to accommodate minor architectural differences compared
 # to GPT-NeoX and OPT used by the Meta AI team that trained the model.
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
 # You may obtain a copy of the License at
 #
 #     http://www.apache.org/licenses/LICENSE-2.0
 #
 # Unless required by applicable law or agreed to in writing, software
 # distributed under the License is distributed on an "AS IS" BASIS,
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
 from typing import Optional, List, Tuple
 # Flash attention imports
 import dropout_layer_norm
 import flash_attn_2_cuda
 import torch
 import torch.distributed
 from torch import nn
 from transformers.activations import ACT2FN
 # vllm imports
 from vllm import attention_ops, cache_ops
 from torch.nn import functional as F
 from layers import (
    TensorParallelRowLinear,
    TensorParallelColumnLinear,
    TensorParallelEmbedding,
    PositionRotaryEmbedding,
    TensorParallelHead,
    get_linear,
 )
 class LlamaRMSNorm(nn.Module):
    def __init__(self, prefix, weights, eps=1e-6):
        """
        LlamaRMSNorm is equivalent to T5LayerNorm
        """
        super().__init__()
        weight = weights.get_tensor(f"{prefix}.weight")
        self.weight = nn.Parameter(weight)
        self.variance_epsilon = eps
    def forward(self, hidden_states, residual=None):
        if hidden_states.shape[-1] > 8192:
            if residual is not None:
                hidden_states += residual
            residual = hidden_states
            hidden_states = hidden_states.to(torch.float32)
            variance = hidden_states.pow(2).mean(-1, keepdim=True)
            hidden_states = hidden_states * torch.rsqrt(
                variance + self.variance_epsilon
            )
            # convert into half-precision if necessary
            if self.weight.dtype in [torch.float16, torch.bfloat16]:
                hidden_states = hidden_states.to(self.weight.dtype)
            return self.weight * hidden_states, residual
        else:
            # faster post attention rms norm
            normed_hidden_states, res, *rest = dropout_layer_norm.dropout_add_ln_fwd(
                hidden_states,
                residual,
                self.weight,
                None,
                None,
                None,
                None,
                None,
                0.0,
                self.variance_epsilon,
                1.0,
                0,
                None,
                False,
                True,  # Activate RMSNorm
            )
            if res is None:
                res = hidden_states
            return normed_hidden_states, res
 def load_attention(config, prefix, weights):
    if config.num_attention_heads != config.num_key_value_heads:
        return _load_gqa(config, prefix, weights)
    else:
        if config.model_type == "baichuan":
            return TensorParallelColumnLinear.load_qkv(
                config,
                prefix=f"{prefix}.W_pack",
                weights=weights,
                bias=False,
            )
        else:
            return TensorParallelColumnLinear.load_multi(
                config,
                prefixes=[f"{prefix}.q_proj", f"{prefix}.k_proj", f"{prefix}.v_proj"],
                dim=0,
                weights=weights,
                bias=False,
            )
 def _load_gqa(config, prefix: str, weights):
    assert config.hidden_size % config.num_attention_heads == 0
    assert config.num_attention_heads % weights.process_group.size() == 0
    weight = weights.get_multi_weights_col(
        prefixes=[f"{prefix}.q_proj", f"{prefix}.k_proj", f"{prefix}.v_proj"],
        quantize=config.quantize,
        dim=0,
    )
    if config.quantize != "gptq":
        weight = weight.to(dtype=weights.dtype).to(device=weights.device)
        head_size = config.hidden_size // config.num_attention_heads
        num_heads = config.num_attention_heads // weights.process_group.size()
        num_key_value_heads = config.num_key_value_heads // weights.process_group.size()
        assert list(weight.shape) == [
            (num_heads + 2 * num_key_value_heads) * head_size,
            config.hidden_size,
        ], f"{list(weight.shape)} != {[(num_heads + 2 * config.num_key_value_heads) * head_size, config.hidden_size]}"
    return TensorParallelColumnLinear(
        get_linear(weight, bias=None, quantize=config.quantize)
    )
 class FlashLlamaAttention(torch.nn.Module):
    def __init__(
            self,
            prefix: str,
            config,
            weights,
    ):
        super().__init__()
        self.num_heads = config.num_attention_heads
        self.hidden_size = config.hidden_size
        self.head_size = self.hidden_size // self.num_heads
        # self.rotary_emb = PositionRotaryEmbedding.load(
        #     config=config, prefix=f"{prefix}.rotary_emb", weights=weights
        # )
        self.rotary_emb = PositionRotaryEmbedding.static(
            config=config, dim=self.head_size, base=config.rope_theta, device=weights.device
        )
        self.softmax_scale = self.head_size ** -0.5
        if self.num_heads % weights.process_group.size() != 0:
            raise ValueError(
                f"`num_heads` must be divisible by `num_shards` (got `num_heads`: {self.num_heads} "
                f"and `num_shards`: {weights.process_group.size()}"
            )
        self.num_heads = self.num_heads // weights.process_group.size()
        self.num_key_value_heads = (
                config.num_key_value_heads // weights.process_group.size()
        )
        self.query_key_value = load_attention(config, prefix, weights)
        self.o_proj = TensorParallelRowLinear.load(
            config,
            prefix=f"{prefix}.o_proj",
            weights=weights,
            bias=False,
        )
        self.num_groups = self.num_heads // self.num_key_value_heads
        self.kv_head_mapping = torch.arange(
            0, self.num_key_value_heads, dtype=torch.int32, device=weights.device
        ).repeat_interleave(self.num_groups)
    def forward(
            self,
            hidden_states,
            cos,
            sin,
            cu_seqlen_prefill,
            kv_cache,
            block_tables,
            slots,
            input_lengths,
            max_s,
    ):
        qkv = self.query_key_value(hidden_states)
        query, kv = qkv.split(
            [
                self.head_size * self.num_heads,
                2 * self.head_size * self.num_key_value_heads,
            ],
            dim=1,
        )
        query = query.view(-1, self.num_heads, self.head_size)
        kv = kv.view(-1, 2, self.num_key_value_heads, self.head_size)
        self.rotary_emb(query, cos, sin)
        self.rotary_emb(torch.select(kv, dim=1, index=0), cos, sin)
        cache_ops.reshape_and_cache(
            kv[:, 0], kv[:, 1], kv_cache[0], kv_cache[1], slots
        )
        # output tensor
        attn_output = torch.empty_like(query)
        # Prefill
        if cu_seqlen_prefill is not None:
            # flash attention
            flash_attn_2_cuda.varlen_fwd(
                query,
                torch.select(kv, dim=1, index=0),
                torch.select(kv, dim=1, index=1),
                attn_output,
                cu_seqlen_prefill,
                cu_seqlen_prefill,
                max_s,
                max_s,
                0.0,
                self.softmax_scale,
                False,
                True,
                -1,
                0,
                False,
                None,
            )
        # Decode
        else:
            # kv_cache[1] => [num_blocks, num_heads, head_size, block_size]
            block_size = kv_cache[1].shape[3]
            attention_ops.paged_attention_v1(
                attn_output,
                query,
                kv_cache[0],
                kv_cache[1],
                self.kv_head_mapping,
                self.softmax_scale,
                block_tables,
                input_lengths,
                block_size,
                max_s,
                None
            )
        return self.o_proj(attn_output.view(-1, self.num_heads * self.head_size))
 class LlamaMLP(nn.Module):
    def __init__(self, prefix, config, weights):
        super().__init__()
        act = config.hidden_act
        self.act = (
            ACT2FN[act]
            if "gelu" not in act
            else lambda x: torch.nn.functional.gelu(
                x,
                approximate="tanh"
                if act in ["gelu_fast", "gelu_pytorch_tanh"]
                else "none",
            )
        )
        # Fuse gate and up proj
        self.gate_up_proj = TensorParallelColumnLinear.load_multi(
            config,
            prefixes=[f"{prefix}.gate_proj", f"{prefix}.up_proj"],
            weights=weights,
            dim=0,
            bias=False,
        )
        self.down_proj = TensorParallelRowLinear.load(
            config,
            prefix=f"{prefix}.down_proj",
            weights=weights,
            bias=False,
        )
        self.intermediate_size = (
                config.intermediate_size // weights.process_group.size()
        )
    def forward(self, hidden_states):
        gate_up_states = self.gate_up_proj(hidden_states)
        gate_up_states = gate_up_states.view(-1, 2, self.intermediate_size)
        return self.down_proj(self.act(gate_up_states[:, 0]) * gate_up_states[:, 1])
 class FlashLlamaLayer(nn.Module):
    def __init__(self, layer_id, config, weights):
        super().__init__()
        prefix = f"model.layers.{layer_id}"
        self.self_attn = FlashLlamaAttention(
            prefix=f"{prefix}.self_attn", config=config, weights=weights
        )
        self.mlp = LlamaMLP(prefix=f"{prefix}.mlp", config=config, weights=weights)
        self.input_layernorm = LlamaRMSNorm(
            prefix=f"{prefix}.input_layernorm", weights=weights, eps=config.rms_norm_eps
        )
        self.post_attention_layernorm = LlamaRMSNorm(
            prefix=f"{prefix}.post_attention_layernorm",
            weights=weights,
            eps=config.rms_norm_eps,
        )
    def forward(
            self,
            hidden_states,
            residual,
            cos,
            sin,
            cu_seqlen_prefill,
            kv_cache,
            block_tables,
            slots,
            input_lengths,
            max_s,
    ):
        normed_hidden_states, res = self.input_layernorm(hidden_states, residual)
        # Self Attention
        attn_output = self.self_attn(
            normed_hidden_states,
            cos,
            sin,
            cu_seqlen_prefill,
            kv_cache,
            block_tables,
            slots,
            input_lengths,
            max_s,
        )
        # faster post attention rms norm
        normed_attn_res_output, attn_res = self.post_attention_layernorm(
            attn_output, res
        )
        mlp_output = self.mlp(normed_attn_res_output)
        return mlp_output, attn_res
 class FlashLlamaModel(torch.nn.Module):
    def __init__(self, config, weights):
        super().__init__()
        process_group = weights.process_group
        self.tp_rank = process_group.rank()
        self.tp_world_size = process_group.size()
        # self.embed_tokens = TensorParallelEmbedding(
        #     prefix="model.embed_tokens", weights=weights
        # )
        embeddings = weights.get_tensor(f"model.embed_tokens.weight")
        self.embed_tokens = nn.Embedding.from_pretrained(F.pad(embeddings, (0, 0, 0, 1)),
                                                         padding_idx=config.pad_token_id)
        self.layers = nn.ModuleList(
            [
                FlashLlamaLayer(
                    layer_id,
                    config,
                    weights,
                )
                for layer_id in range(config.num_hidden_layers)
                # for layer_id in range(1)
            ]
        )
        self.norm = LlamaRMSNorm(
            prefix="model.norm", weights=weights, eps=config.rms_norm_eps
        )
        self.gradient_checkpointing = False
        self.head_size = self.layers[0].self_attn.head_size
        self.num_heads = self.layers[0].self_attn.num_heads
        self.num_key_value_heads = self.layers[0].self_attn.num_key_value_heads
    def forward(
            self,
            input_ids: torch.Tensor,
            position_ids: torch.Tensor,
            cu_seqlen_prefill: Optional[torch.Tensor],
            kv_cache: List[Tuple[torch.Tensor, torch.Tensor]],
            block_tables: torch.Tensor,
            slots: torch.Tensor,
            input_lengths: torch.Tensor,
            max_s: int,
    ) -> torch.Tensor:
        hidden_states = self.embed_tokens(input_ids)
        # Get rotary cos and sin for this forward
        # Avoid to index in each layer
        cos, sin = self.layers[0].self_attn.rotary_emb.get_cos_sin(
            position_ids, max_s, hidden_states.dtype
        )
        residual = None
        for i, layer in enumerate(self.layers):
            hidden_states, residual = layer(
                hidden_states,
                residual,
                cos,
                sin,
                cu_seqlen_prefill,
                kv_cache[i],
                block_tables,
                slots,
                input_lengths,
                max_s,
            )
        hidden_states, _ = self.norm(hidden_states, residual)
        return hidden_states
 class FlashLlamaForCausalLM(torch.nn.Module):
    def __init__(self, config, weights):
        super().__init__()
        self.model = FlashLlamaModel(config, weights)
        self.lm_head = TensorParallelHead.load(
            config,
            prefix="lm_head",
            weights=weights,
        )
    def forward(
            self,
            input_ids: torch.Tensor,
            position_ids: torch.Tensor,
            cu_seqlen_prefill: Optional[torch.Tensor],
            kv_cache: List[Tuple[torch.Tensor, torch.Tensor]],
            block_tables: torch.Tensor,
            slots: torch.Tensor,
            input_lengths: torch.Tensor,
            max_s: int,
            lm_head_indices: Optional[torch.Tensor] = None,
    ) -> torch.Tensor:
        hidden_states = self.model(
            input_ids,
            position_ids,
            cu_seqlen_prefill,
            kv_cache,
            block_tables,
            slots,
            input_lengths,
            max_s,
        )
        if lm_head_indices is not None:
            hidden_states = hidden_states[lm_head_indices]
        logits = self.lm_head(hidden_states)
        return logits
--- a/my_optims/tp_optims/layers.py
+++ b/my_optims/tp_optims/layers.py
@ -0,0 +1,402 @@
 import os
 from typing import List
 import torch
 # import torch.distributed
 from torch import nn
 from torch.nn import functional as F
 import my_custom_comm
 class FastLinear(nn.Module):
    def __init__(
            self,
            weight,
            bias,
    ) -> None:
        super().__init__()
        self.weight = nn.Parameter(weight)
        if bias is not None:
            self.bias = nn.Parameter(bias)
        else:
            self.bias = None
    @classmethod
    def load(cls, config, prefix: str, weights, bias: bool):
        weight = weights.get_tensor(f"{prefix}.weight")
        if bias:
            bias = weights.get_tensor(f"{prefix}.bias")
        else:
            bias = None
        return cls(weight, bias)
    def forward(self, input: torch.Tensor) -> torch.Tensor:
        return F.linear(input, self.weight, self.bias)
 def get_linear(weight, bias, quantize):
    linear = FastLinear(weight, bias)
    return linear
 class SuperLayer(nn.Module):
    def __init__(self, linear):
        super().__init__()
        self.linear = linear
    def forward(self, x):
        return self.linear.forward(x)
 class TensorParallelHead(SuperLayer):
    def __init__(self, linear, process_group, should_gather: bool):
        super().__init__(linear)
        self.process_group = process_group
        self.should_gather = should_gather
    @staticmethod
    def load(config, prefix: str, weights):
        if weights.process_group.size() > 1:
            try:
                assert False
                weight = weights.get_sharded(f"{prefix}.weight", dim=0)
                should_gather = True
            except AssertionError:
                # If the vocab size is not divisible by number of shards
                # just load the entire thing.
                weight = weights.get_tensor(f"{prefix}.weight")
                should_gather = False
        else:
            weight = weights.get_tensor(f"{prefix}.weight")
            should_gather = False
        # GPTQ doesn't quantize heads (nor embeddings)
        if config.quantize == "gptq":
            quantize = None
        else:
            quantize = config.quantize
        return TensorParallelHead(
            get_linear(weight, bias=None, quantize=quantize),
            process_group=weights.process_group,
            should_gather=should_gather,
        )
    def forward(self, input: torch.Tensor) -> torch.Tensor:
        if not self.should_gather:
            return super().forward(input)
        # world_size = self.process_group.size()
        # if len(input.shape) == 2 and isinstance(self.linear, FastLinear):
        #     out_dim = self.linear.weight.shape[0]
        #     if input.shape[0] == 1:
        #         world_out = input.new_empty(1, out_dim * world_size)
        #         local_out = input.new_empty(1, out_dim)
        #         gather_input = local_out
        #     else:
        #         world_out = input.new_empty(out_dim * world_size, input.shape[0])
        #         gather_input = input.new_empty(out_dim, input.shape[0])
        #         local_out = gather_input.T
        #     torch.mm(input, self.linear.weight.T, out=local_out)
        #     torch.distributed.all_gather_into_tensor(
        #         world_out, gather_input, group=self.process_group
        #     )
        #     if input.shape[0] == 1:
        #         return world_out
        #     return world_out.T
        # output = super().forward(input)
        # world_output = [
        #     torch.empty_like(output) for _ in range(self.process_group.size())
        # ]
        # torch.distributed.all_gather(world_output, output, group=self.process_group)
        # world_output = torch.cat(world_output, dim=-1)
        # return world_output
 class TensorParallelColumnLinear(SuperLayer):
    @classmethod
    def load_qkv(cls, config, prefix: str, weights, bias: bool):
        """Specific method when the QKV was joined after the fact"""
        weight = weights.get_weights_col_packed_qkv(
            prefix, quantize=config.quantize
        )
        if bias:
            raise NotImplementedError("packed_qkv only implemented for baichuan")
        else:
            bias = None
        linear = get_linear(weight, bias, config.quantize)
        return cls(linear)
    @classmethod
    def load(cls, config, prefix: str, weights, bias: bool):
        return cls.load_multi(config, [prefix], weights, bias, dim=0)
    @classmethod
    def load_multi(cls, config, prefixes: List[str], weights, bias: bool, dim: int):
        weight = weights.get_multi_weights_col(
            prefixes, quantize=config.quantize, dim=dim
        )
        if bias:
            b = [weights.get_sharded(f"{p}.bias", dim=0) for p in prefixes]
            bias = torch.cat(b, dim=dim)
        else:
            bias = None
        linear = get_linear(weight, bias, config.quantize)
        return cls(linear)
 class TensorParallelRowLinear(SuperLayer):
    def __init__(self, linear, process_group):
        super().__init__(linear)
        self.process_group = process_group
    @classmethod
    def load(cls, config, prefix: str, weights, bias: bool):
        weight = weights.get_multi_weights_row(prefix, quantize=config.quantize)
        if bias and weights.process_group.rank() == 0:
            # Rank is only on the first rank process
            bias = weights.get_tensor(f"{prefix}.bias")
        else:
            bias = None
        return cls(
            get_linear(weight, bias, config.quantize),
            process_group=weights.process_group,
        )
    def forward(self, input: torch.Tensor) -> torch.Tensor:
        out = super().forward(input)
        if self.process_group.size() > 1:
            # torch.distributed.all_reduce(out, group=self.process_group)
            my_custom_comm.custom_allreduce(out, self.process_group.tp_comm)
        return out
 class TensorParallelEmbedding(nn.Module):
    def __init__(self, prefix: str, weights, reduce=True):
        super().__init__()
        weight = weights.get_partial_sharded(f"{prefix}.weight", dim=0)
        num_embeddings = weights.get_shape(f"{prefix}.weight")[0]
        process_group = weights.process_group
        world_size = process_group.size()
        rank = process_group.rank()
        block_size = num_embeddings // world_size
        self.min_id = rank * block_size
        self.max_id = min(num_embeddings, (rank + 1) * block_size)
        self.null_idx = block_size
        self.process_group = weights.process_group
        self.reduce = reduce
        """Additional 0 entry used for masking"""
        self.weight = nn.Parameter(F.pad(weight, (0, 0, 0, 1)))
    def forward(self, input: torch.Tensor) -> torch.Tensor:
        # default all out of bounds values to `self.null_idx` that will then be mapped to 0
        # translate for [0, self.max_id - self.min_id[
        input = torch.where(
            (self.min_id > input) | (input >= self.max_id),
            self.null_idx,
            input - self.min_id,
        )
        out = torch.nn.functional.embedding(input, self.weight)
        if self.reduce and self.process_group.size() > 1:
            # torch.distributed.all_reduce(out, group=self.process_group)
            my_custom_comm.custom_allreduce(out, self.process_group.tp_comm)
        return out
 try:
    import dropout_layer_norm
    class FastLayerNorm(nn.LayerNorm):
        def forward(self, hidden_states, residual=None):
            if hidden_states.shape[-1] > 8192:
                if residual is not None:
                    hidden_states += residual
                residual = hidden_states
                return super(FastLayerNorm, self).forward(hidden_states), residual
            else:
                (
                    normed_hidden_states,
                    residual,
                    *rest,
                ) = dropout_layer_norm.dropout_add_ln_fwd(
                    hidden_states,
                    residual,
                    self.weight,
                    self.bias,
                    None,
                    None,
                    None,
                    None,
                    0.0,
                    self.eps,
                    1.0,
                    0,
                    None,
                    False,
                    False,
                )
                if residual is None:
                    residual = hidden_states
                return normed_hidden_states, residual
 except ImportError:
    pass
 try:
    from flash_attn.layers.rotary import RotaryEmbedding
    import rotary_emb
    def _create_inv_freq(dim, base, device):
        inv_freq = 1.0 / (
                base
                ** (torch.arange(0, dim, 2, device=device, dtype=torch.float32) / dim)
        )
        return inv_freq
    def _get_rope_config(config):
        if os.getenv("ROPE_SCALING", None) is not None:
            rope_scaling = {"type": os.environ["ROPE_SCALING"], "factor": float(os.environ["ROPE_FACTOR"])}
            return rope_scaling
        return getattr(config, "rope_scaling", None)
    class PositionRotaryEmbedding(nn.Module):
        def __init__(self, inv_freq, scaling_factor):
            super().__init__()
            self.inv_freq = inv_freq
            self._seq_len_cached = 0
            self._cos_cached = None
            self._sin_cached = None
            self._cos_k_cached = None
            self._sin_k_cached = None
            self.scaling_factor = scaling_factor
            self.dynamic_args = None
        @classmethod
        def static(cls, config, dim, base, device):
            inv_freq = _create_inv_freq(dim, base, device)
            scaling_factor = None
            rope_scaling = _get_rope_config(config)
            if rope_scaling is not None:
                scaling_factor = rope_scaling["factor"]
                if rope_scaling["type"] == "linear":
                    pass
                elif rope_scaling["type"] == "dynamic":
                    return DynamicPositionRotaryEmbedding(dim=dim,
                                                          max_position_embeddings=config.max_position_embeddings,
                                                          base=base, device=inv_freq.device,
                                                          scaling_factor=scaling_factor)
                else:
                    raise NotImplementedError(f"rope scaling type {rope_scaling['type']} is not implemented or invalid")
            return cls(inv_freq, scaling_factor)
        @classmethod
        def load(cls, config, prefix, weights):
            # XXX: Always load this in float32 !
            dtype = weights.dtype
            weights.dtype = torch.float32
            inv_freq = weights.get_tensor(f"{prefix}.inv_freq")
            weights.dtype = dtype
            scaling_factor = None
            rope_scaling = _get_rope_config(config)
            if rope_scaling is not None:
                scaling_factor = rope_scaling["factor"]
                if rope_scaling["type"] == "linear":
                    pass
                elif rope_scaling["type"] == "dynamic":
                    return DynamicPositionRotaryEmbedding(dim=2 * inv_freq.shape[0],
                                                          max_position_embeddings=config.max_position_embeddings,
                                                          base=10000.0, device=inv_freq.device,
                                                          scaling_factor=scaling_factor)
                else:
                    raise NotImplementedError(f"rope scaling type {rope_scaling['type']} is not implemented or invalid")
            return cls(inv_freq, scaling_factor)
        def _update_cos_sin_cache(self, dtype, device, seqlen):
            # Reset the tables if the sequence length has changed,
            # or if we're on a new device (possibly due to tracing for instance)
            if (
                    seqlen > self._seq_len_cached
                    or self._cos_cached.device != device
                    or self._cos_cached.dtype != dtype
            ):
                self._seq_len_cached = seqlen
                t = torch.arange(seqlen, device=device, dtype=self.inv_freq.dtype)
                if self.scaling_factor is not None:
                    t /= self.scaling_factor
                # Don't do einsum, it converts fp32 to fp16
                # freqs = torch.einsum("i,j->ij", t, self.inv_freq)
                freqs = torch.outer(t, self.inv_freq.to(device=t.device))
                self._cos_cached = torch.cos(freqs).to(dtype)
                self._sin_cached = torch.sin(freqs).to(dtype)
        def get_cos_sin(
                self, position_ids: torch.Tensor, max_s: int, dtype: torch.dtype
        ):
            """
            Return cos and sin for the asked position ids
            """
            self._update_cos_sin_cache(dtype, position_ids.device, max_s)
            cos = torch.index_select(self._cos_cached, 0, position_ids)
            sin = torch.index_select(self._sin_cached, 0, position_ids)
            return cos.unsqueeze(1), sin.unsqueeze(1)
        def forward(self, x: torch.Tensor, cos: torch.Tensor, sin: torch.Tensor):
            rotary_dim = cos.shape[-1]
            x1 = x[..., :rotary_dim]
            x2 = x[..., rotary_dim: 2 * rotary_dim]
            rotary_emb.apply_rotary(x1, x2, cos, sin, x1, x2, False)
            return x
    class DynamicPositionRotaryEmbedding(PositionRotaryEmbedding):
        def __init__(self, dim, max_position_embeddings, base, device, scaling_factor):
            inv_freq = _create_inv_freq(dim, base, device)
            super().__init__(inv_freq, scaling_factor)
            self.dim = dim
            self.max_position_embeddings = max_position_embeddings
            self.base = base
        def _update_cos_sin_cache(self, dtype, device, seqlen):
            # Reset the tables if the sequence length has changed,
            # or if we're on a new device (possibly due to tracing for instance)
            if (
                    seqlen > self._seq_len_cached
                    or self._cos_cached.device != device
                    or self._cos_cached.dtype != dtype
            ):
                if seqlen > self.max_position_embeddings:
                    newbase = self.base * ((self.scaling_factor * seqlen / self.max_position_embeddings) - (
                                self.scaling_factor - 1)) ** (self.dim / (self.dim - 2))
                    self.inv_freq = _create_inv_freq(self.dim, newbase, self.inv_freq.device)
                self._seq_len_cached = seqlen
                t = torch.arange(seqlen, device=device, dtype=self.inv_freq.dtype)
                # Don't do einsum, it converts fp32 to fp16
                # freqs = torch.einsum("i,j->ij", t, self.inv_freq)
                freqs = torch.outer(t, self.inv_freq.to(device=t.device))
                self._cos_cached = torch.cos(freqs).to(dtype)
                self._sin_cached = torch.sin(freqs).to(dtype)
 except ImportError:
    pass
--- a/my_optims/tp_optims/test_llama_fa_tp.py
+++ b/my_optims/tp_optims/test_llama_fa_tp.py
@ -0,0 +1,277 @@
 # encoding:utf-8
 # -------------------------------------------#
 # Filename: optims -- test_llama_fa.py
 #
 # Description:
 # Version:       1.0
 # Created:       2023/9/18-20:50
 # Last modified by:
 # Author:        'zhaohuayang@myhexin.com'
 # Company:       同花顺网络信息股份有限公司
 # -------------------------------------------#
 import math
 import time
 from pathlib import Path
 from typing import List, Optional
 import numpy as np
 import torch
 import transformers
 from flash_llama_modeling import FlashLlamaForCausalLM
 from weights import Weights
 from mpi4py import MPI
 import my_custom_comm
 COMM = None
 BLOCK_SIZE = 16
 class FakeGroup:
    def __init__(self, rank, size, tp_comm, pp_comm):
        self._rank = rank
        self._size = size
        self.tp_comm = tp_comm
        self.pp_comm = pp_comm
    def size(self):
        return self._size
    def rank(self):
        return self._rank
 class CacheManager:
    def __init__(
            self,
            num_blocks: int,
            num_layers: int,
            num_heads: int,
            head_size: int,
            dtype: torch.dtype,
            device: torch.device,
    ):
        self.block_size = BLOCK_SIZE
        self.num_blocks = num_blocks
        self.device = device
        element_size = torch.tensor([], dtype=dtype).element_size()
        x = self.block_size // element_size
        self.kv_cache = [
            (
                torch.empty(
                    (num_blocks, num_heads, head_size // x, self.block_size, x),
                    dtype=dtype,
                    device=device,
                ),
                torch.empty(
                    (num_blocks, num_heads, head_size, self.block_size),
                    dtype=dtype,
                    device=device,
                ),
            )
            for _ in range(num_layers)
        ]
        self.free_block_mask = torch.ones(num_blocks, dtype=torch.int32, device="cpu")
        self.slots = torch.arange(
            0, num_blocks * self.block_size, dtype=torch.int32
        ).view(num_blocks, self.block_size)
    def allocate(self, blocks, max_blocks, needed_blocks_slots):
        """
        blocks: 总共需要的blocks数量
        max_blocks: 最大的blocks数量大小
        needed_blocks_slots: 每个序列所需的blocks及其对应的序列长度
        """
        # Get free blocks indices by finding values in mask that are not set to 0
        free_block_indices = self.free_block_mask.nonzero()
        assert (
                len(free_block_indices) >= blocks
        ), f"Out of available cache blocks: asked {blocks}, only {len(free_block_indices)} free blocks"
        # Slice by the number of required blocks
        block_indices = free_block_indices[: blocks]
        block_indices = block_indices.flatten()
        # Padded block tables
        block_tables_tensor = torch.zeros(
            (len(needed_blocks_slots), max_blocks), dtype=torch.int32
        )
        # Allocate paged attention blocks
        cumulative_blocks = 0
        slots = []
        block_tables = []
        for i, (needed_blocks, needed_slots) in enumerate(needed_blocks_slots):
            # Get allocated blocks for this sequence
            allocated_blocks = block_indices[
                               cumulative_blocks: cumulative_blocks + needed_blocks
                               ]
            # Get slots for the allocated blocks
            allocated_slots = self.slots[allocated_blocks].flatten()[:needed_slots]
            slots.append(allocated_slots)
            block_tables.append(allocated_blocks.tolist())
            block_tables_tensor[i, :needed_blocks] = allocated_blocks
            cumulative_blocks += needed_blocks
        # Allocate the required number of blocks by setting the mask to 0
        self.free_block_mask[block_indices] = 0
        return block_tables, block_tables_tensor.to(self.device), torch.concat(slots).to(self.device)
    def free(self, block_indices: Optional[List[int]]):
        if block_indices is not None and block_indices:
            # Reset mask
            self.free_block_mask[block_indices] = 1
 def generate(tokenizer, model, config, device, prompt, max_new_tokens=10):
    input_ids = tokenizer(prompt).input_ids
    def warmup():
        print("start warmup...")
        global CACHE_MANAGER
        blocks = 260
        CACHE_MANAGER = CacheManager(blocks,
                                     len(model.model.layers),
                                     model.model.num_key_value_heads,
                                     model.model.head_size,
                                     torch.float16,
                                     device)
        input_length = 1024
        bs = 4
        warmup_inputs = {
            'input_ids': torch.arange(1, input_length + 1, dtype=torch.int64, device=device).repeat(bs),
            'position_ids': torch.arange(0, input_length, dtype=torch.int32, device=device).repeat(bs),
            'cu_seqlen_prefill': torch.tensor([i * input_length for i in range(bs + 1)], dtype=torch.int32,
                                              device=device),
            'block_tables': torch.arange(0, blocks, dtype=torch.int32, device=device).split(blocks // bs),
            'slots': torch.arange(0, 4144, dtype=torch.int32, device=device),
            'input_lengths': torch.tensor([input_length] * 4, dtype=torch.int32, device=device),
            'max_s': 1024,
            'lm_head_indices': None
        }
        model.forward(**warmup_inputs, kv_cache=CACHE_MANAGER.kv_cache)
        del CACHE_MANAGER
        torch.cuda.empty_cache()
    # 预热
    warmup()
    print("start speed test running")
    # 申请缓存空间
    global CACHE_MANAGER
    CACHE_MANAGER = CacheManager(100,
                                 len(model.model.layers),
                                 model.model.num_key_value_heads,
                                 model.model.head_size,
                                 torch.float16,
                                 device)
    total_tokens = len(input_ids) + max_new_tokens - 1
    needed_blocks = math.ceil(total_tokens / BLOCK_SIZE)
    needed_blocks_slots = [(needed_blocks, total_tokens)]
    _, block_tables_tensor, slots = CACHE_MANAGER.allocate(needed_blocks, needed_blocks, needed_blocks_slots)
    # forward循环
    tpss = []
    loops = 10
    for loop in range(loops):
        print(f"loop {loop}...")
        times = []
        new_tokens = []
        for step in range(max_new_tokens):
            if step == 0:
                # prefill step
                slot_indices = torch.arange(0, 0 + len(input_ids), dtype=torch.int64)
                inputs = {
                    'input_ids': torch.tensor(input_ids, dtype=torch.int64, device=device),
                    'position_ids': torch.arange(0, len(input_ids), dtype=torch.int32, device=device),
                    'cu_seqlen_prefill': torch.tensor([0, len(input_ids)], dtype=torch.int32, device=device),
                    'block_tables': block_tables_tensor,
                    'slots': slots[slot_indices],
                    'input_lengths': torch.tensor([len(input_ids)], dtype=torch.int32, device=device),
                    'max_s': len(input_ids),
                    'lm_head_indices': torch.tensor([0 + len(input_ids) - 1], dtype=torch.int32, device=device)
                }
            else:
                # incremental step
                current_length = len(input_ids) + step
                inputs = {
                    'input_ids': new_tokens[-1],
                    'position_ids': torch.tensor([current_length - 1], dtype=torch.int32, device=device),
                    'cu_seqlen_prefill': None,
                    'block_tables': block_tables_tensor,
                    'slots': torch.tensor([current_length - 1], dtype=torch.int32, device=device),
                    'input_lengths': torch.tensor([current_length], dtype=torch.int32, device=device),
                    'max_s': current_length,
                    'lm_head_indices': None
                }
            torch.cuda.synchronize()
            s_time = time.time()
            logits = model.forward(**inputs, kv_cache=CACHE_MANAGER.kv_cache)
            torch.cuda.synchronize()
            cost_time = time.time() - s_time
            next_token_id = logits.argmax(dim=-1)
            new_tokens.append(next_token_id)
            times.append(round(cost_time, 6))
        if loop == 0:
            new_tokens = torch.concat(new_tokens)
            print(tokenizer.decode(new_tokens, skip_special_tokens=True))
        elapsed_time = np.mean(times)
        tps = 1 / elapsed_time
        tpss.append(tps)
        print(times)
        print(f"total new tokens: {max_new_tokens}, cost time: {sum(times):.6f} s\n"
              f"time_per_token: {elapsed_time * 1000:.3f} ms, tps: {tps:.2f} tokens/s")
    print(f'mean tps: {np.mean(tpss):.2f} tokens/s')
 def init_dist_env():
    global COMM
    COMM = MPI.COMM_WORLD
    rank = COMM.Get_rank()
    world_size = COMM.Get_size()
    tp_ptr, pp_ptr = my_custom_comm.init_nccl(2, 1)
    device = rank % torch.cuda.device_count()
    torch.cuda.set_device(device)
    torch.cuda.set_per_process_memory_fraction(1., device)
    process_group = FakeGroup(rank, world_size, tp_ptr, pp_ptr)
    return process_group, rank, world_size
 def main(model_path):
    # init env
    process_group, rank, world_size = init_dist_env()
    print(f'{rank=},{world_size=},{process_group.__dict__=}')
    # step 0: 定义路径与属性
    model_path = Path(model_path)
    config = transformers.AutoConfig.from_pretrained(model_path)
    config.quantize = None
    model_files = list(model_path.glob('*.safetensors'))
    # step 1: 定义tokenizer与权重
    tokenizer = transformers.AutoTokenizer.from_pretrained(model_path, padding_side="left", truncation_side="left")
    device = torch.device(f"cuda:{rank}")
    weights = Weights(model_files, device, torch.float16, process_group=process_group)
    # step2: 定义模型
    COMM.barrier()
    model = FlashLlamaForCausalLM(config, weights).eval()
    COMM.barrier()
    print(model)
    # step3: 推理
    with torch.no_grad():
        prompt = "who are you?"
        generate(tokenizer, model, config, device, prompt, max_new_tokens=100)
    my_custom_comm.finalize_nccl(process_group.tp_comm, process_group.pp_comm)
 if __name__ == '__main__':
    CACHE_MANAGER: Optional[CacheManager] = None
    main('/code/models/llama-7b-hf')
--- a/my_optims/tp_optims/utils.py
+++ b/my_optims/tp_optims/utils.py
@ -0,0 +1,100 @@
 # encoding:utf-8
 # -------------------------------------------#
 # Filename: optims -- utils.py
 #
 # Description:   
 # Version:       1.0
 # Created:       2023/9/27-14:43
 # Last modified by: 
 # Author:        'zhaohuayang@myhexin.com'
 # Company:       同花顺网络信息股份有限公司
 # -------------------------------------------#
 import os
 import time
 from datetime import timedelta
 import torch
 from loguru import logger
 from torch.distributed import ProcessGroupNCCL
 RANK = int(os.getenv("LOCAL_RANK", "0"))
 WORLD_SIZE = int(os.getenv("WORLD_SIZE", "2"))
 NCCL_PORT = int(os.getenv("NCCL_PORT", "29500"))
 MEMORY_FRACTION = float(os.getenv("CUDA_MEMORY_FRACTION", "1.0"))
 class FakeBarrier:
    def wait(self):
        pass
 class FakeGroup:
    def __init__(self, rank, size):
        self._rank = rank
        self._size = size
    def allreduce(self, *args, **kwargs):
        return FakeBarrier()
    def allgather(self, inputs, local_tensor, **kwargs):
        assert (
                len(inputs[0]) == len(local_tensor) == 1
        ), f"{len(inputs[0])} != {len(local_tensor)} != 1, and the FakeGroup is supposed to join on simple tensors"
        for input_ in inputs:
            input_[0].data = local_tensor[0].data
        return FakeBarrier()
    def barrier(self, *args, **kwargs):
        return FakeBarrier()
    def size(self):
        return self._size
    def rank(self):
        return self._rank
 def initialize_torch_distributed():
    # Set the device id.
    assert WORLD_SIZE <= torch.cuda.device_count(), "Each process is one gpu"
    device = RANK % torch.cuda.device_count()
    torch.cuda.set_device(device)
    torch.cuda.set_per_process_memory_fraction(MEMORY_FRACTION, device)
    backend = "nccl"
    options = ProcessGroupNCCL.Options()
    options.is_high_priority_stream = True
    options._timeout = timedelta(seconds=60)
    if not torch.distributed.is_initialized():
        # Call the init process.
        torch.distributed.init_process_group(
            backend=backend,
            init_method=f"tcp://localhost:{NCCL_PORT}",
            world_size=WORLD_SIZE,
            rank=RANK,
            timeout=timedelta(seconds=60),
            pg_options=options,
        )
        logger.info(f"torch.distributed is initialized on rank {RANK} of {WORLD_SIZE}.")
    else:
        logger.warning("torch.distributed is already initialized.")
    return torch.distributed.group.WORLD, RANK, WORLD_SIZE
 class Timer:
    def __init__(self):
        self.times = []
        self.time = None
    def start(self):
        torch.cuda.synchronize()
        self.time = time.time()
    def end(self):
        torch.cuda.synchronize()
        self.times.append(time.time() - self.time)
    @property
    def elapsed(self):
        self.times.pop(0)
        return round(sum(self.times) / len(self.times) * 1000, 2)
--- a/my_optims/tp_optims/weights.py
+++ b/my_optims/tp_optims/weights.py
@ -0,0 +1,336 @@
 # encoding:utf-8
 # -------------------------------------------#
 # Filename: optims -- weights.py
 #
 # Description:   
 # Version:       1.0
 # Created:       2023/9/27-15:05
 # Last modified by: 
 # Author:        'zhaohuayang@myhexin.com'
 # Company:       同花顺网络信息股份有限公司
 # -------------------------------------------#
 import json
 import os
 from pathlib import Path
 from typing import List, Dict, Optional, Tuple
 import torch
 from huggingface_hub import hf_hub_download
 from loguru import logger
 from safetensors import safe_open, SafetensorError
 class Weights:
    def __init__(
            self,
            filenames: List[Path],
            device,
            dtype,
            process_group,
            aliases: Optional[Dict[str, List[str]]] = None,
    ):
        routing = {}
        for filename in filenames:
            with safe_open(filename, framework="pytorch") as f:
                for k in f.keys():
                    if k in routing:
                        raise RuntimeError(
                            f"Key {k} was found in multiple files: {filename} and {routing[k]}"
                        )
                    routing[k] = filename
        if aliases is None:
            aliases = {}
        self.aliases = aliases
        self.routing = routing
        self.device = device
        self.dtype = dtype
        self.process_group = process_group
        self._handles = {}
    def _get_handle(self, filename):
        if filename not in self._handles:
            f = safe_open(filename, framework="pytorch")
            self._handles[filename] = f
        return self._handles[filename]
    def get_filename(self, tensor_name: str) -> (str, str):
        filename = self.routing.get(tensor_name, None)
        if filename is None:
            aliases = self.aliases.get(tensor_name, [])
            for alias in aliases:
                filename = self.routing.get(alias, None)
                if filename is not None:
                    return str(filename), alias
            raise RuntimeError(f"weight {tensor_name} does not exist")
        return str(filename), tensor_name
    def _get_slice(self, tensor_name: str):
        filename, tensor_name = self.get_filename(tensor_name)
        f = self._get_handle(filename)
        slice_ = f.get_slice(tensor_name)
        return slice_
    def get_shape(self, tensor_name: str):
        return self._get_slice(tensor_name).get_shape()
    def get_tensor(self, tensor_name: str, to_device=True):
        filename, tensor_name = self.get_filename(tensor_name)
        f = self._get_handle(filename)
        tensor = f.get_tensor(tensor_name)
        # Special case for gptq which shouldn't convert
        # u4 which are disguised as int32
        if tensor.dtype not in [torch.int32, torch.int64]:
            tensor = tensor.to(dtype=self.dtype)
        if to_device:
            tensor = tensor.to(device=self.device)
        return tensor
    def get_partial_sharded(self, tensor_name: str, dim: int):
        filename, tensor_name = self.get_filename(tensor_name)
        f = self._get_handle(filename)
        slice_ = f.get_slice(tensor_name)
        world_size = self.process_group.size()
        rank = self.process_group.rank()
        size = slice_.get_shape()[dim]
        block_size = size // world_size
        start = rank * block_size
        stop = (rank + 1) * block_size
        if dim == 0:
            tensor = slice_[start:stop]
        elif dim == 1:
            tensor = slice_[:, start:stop]
        else:
            raise NotImplementedError("Let's make that generic when needed")
        # Special case for gptq which shouldn't convert
        # u4 which are disguised as int32
        if tensor.dtype != torch.int32:
            tensor = tensor.to(dtype=self.dtype)
        tensor = tensor.to(device=self.device)
        return tensor
    def get_sharded(self, tensor_name: str, dim: int):
        filename, tensor_name = self.get_filename(tensor_name)
        f = self._get_handle(filename)
        slice_ = f.get_slice(tensor_name)
        world_size = self.process_group.size()
        size = slice_.get_shape()[dim]
        assert (
                size % world_size == 0
        ), f"The choosen size {size} is not compatible with sharding on {world_size} shards"
        return self.get_partial_sharded(tensor_name, dim)
    def _get_qweight(self, name: str):
        slice_ = self._get_slice(name)
        total_size = slice_.get_shape()[1]
        assert total_size % 3 == 0, "Prepacked quantized qkv is not divisible by 3"
        single_size = total_size // 3
        world_size = self.process_group.size()
        rank = self.process_group.rank()
        assert single_size % world_size == 0, f"Prepacked quantized qkv cannot be sharded across {world_size} shards"
        block_size = single_size // world_size
        start = rank * block_size
        stop = (rank + 1) * block_size
        q = slice_[:, start:stop]
        k = slice_[:, start + single_size:stop + single_size]
        v = slice_[:, start + 2 * single_size:stop + 2 * single_size]
        weight = torch.cat([q, k, v], dim=1)
        weight = weight.to(device=self.device)
        return weight
    def get_weights_col_packed_qkv(self, prefix: str, quantize: str):
        """
        Highly specific when the underlying tensor is a simple cat of Q,K,V instead of being
        already alternating Q,K,V within the main tensor
        """
        if quantize == "gptq":
            try:
                qweight = self._get_qweight(f"{prefix}.qweight")
            except RuntimeError:
                raise RuntimeError(
                    "Cannot load `gptq` weight, make sure the model is already quantized, or quantize it with `text-generation-server quantize ORIGINAL_MODEL_ID NEW_MODEL_ID`"
                )
            qzeros = self._get_qweight(f"{prefix}.qzeros")
            scales = self._get_qweight(f"{prefix}.scales")
            scales = scales.to(dtype=self.dtype)
            g_idx = self.get_tensor(f"{prefix}.g_idx")
            bits, groupsize = self._get_gptq_params()
            weight = (qweight, qzeros, scales, g_idx, bits, groupsize, False)
        else:
            slice_ = self._get_slice(f"{prefix}.weight")
            total_size = slice_.get_shape()[0]
            assert total_size % 3 == 0, "Prepacked qkv is not divisible by 3"
            single_size = total_size // 3
            world_size = self.process_group.size()
            rank = self.process_group.rank()
            assert single_size % world_size == 0, f"Prepacked qkv cannot be sharded across {world_size} shards"
            block_size = single_size // world_size
            start = rank * block_size
            stop = (rank + 1) * block_size
            q = slice_[start:stop]
            k = slice_[start + single_size:stop + single_size]
            v = slice_[start + 2 * single_size:stop + 2 * single_size]
            weight = torch.cat([q, k, v], dim=0)
            weight = weight.to(device=self.device)
            weight = weight.to(dtype=self.dtype)
        return weight
    def get_multi_weights_col(self, prefixes: List[str], quantize: str, dim: int):
        if quantize == "gptq":
            try:
                qweight = torch.cat(
                    [self.get_sharded(f"{p}.qweight", dim=1) for p in prefixes], dim=1
                )
            except RuntimeError:
                raise RuntimeError(
                    "Cannot load `gptq` weight, make sure the model is already quantized, or quantize it with `text-generation-server quantize ORIGINAL_MODEL_ID NEW_MODEL_ID`"
                )
            qzeros = torch.cat(
                [self.get_sharded(f"{p}.qzeros", dim=1) for p in prefixes], dim=1
            )
            scales = torch.cat(
                [self.get_sharded(f"{p}.scales", dim=1) for p in prefixes], dim=1
            )
            w = [self.get_tensor(f"{p}.g_idx") for p in prefixes]
            for w2 in w[1:]:
                torch.testing.assert_close(w2, w[0])
            g_idx = w[0]
            bits, groupsize = self._get_gptq_params()
            weight = (qweight, qzeros, scales, g_idx, bits, groupsize, False)
        else:
            w = [self.get_sharded(f"{p}.weight", dim=0) for p in prefixes]
            weight = torch.cat(w, dim=dim)
        return weight
    def get_tensor_shard(self, var, dim):
        world_size = self.process_group.size()
        rank = self.process_group.rank()
        block_size = var.size()[dim] // world_size
        start = rank * block_size
        stop = (rank + 1) * block_size
        if dim == 0:
            tensor = var[start:stop]
        elif dim == 1:
            tensor = var[:, start:stop]
        else:
            raise NotImplementedError("Let's make that generic when needed")
        tensor = tensor.to(dtype=self.dtype)
        tensor = tensor.to(device=self.device)
        return tensor
    def get_multi_weights_row(self, prefix: str, quantize: str):
        if quantize == "gptq":
            use_exllama = True
            bits, groupsize = self._get_gptq_params()
            if bits != 4:
                use_exllama = False
            if self.process_group.size() > 1:
                g_idx = self.get_tensor(f"{prefix}.g_idx")
                if g_idx is not None:
                    if (
                            not torch.equal(
                                g_idx.cpu(),
                                torch.tensor(
                                    [i // groupsize for i in range(g_idx.shape[0])],
                                    dtype=torch.int32,
                                ),
                            )
                            and not (g_idx == 0).all()
                    ):
                        # Exllama implementation does not support row tensor parallelism with act-order, as
                        # it would require to reorder input activations that are split unto several GPUs
                        use_exllama = False
            try:
                qweight = self.get_sharded(f"{prefix}.qweight", dim=0)
            except RuntimeError:
                raise RuntimeError(
                    "Cannot load `gptq` weight, make sure the model is already quantized, or quantize it with `text-generation-server quantize ORIGINAL_MODEL_ID NEW_MODEL_ID`"
                )
            from text_generation_server.utils.layers import HAS_EXLLAMA, CAN_EXLLAMA
            if use_exllama:
                if not HAS_EXLLAMA:
                    if CAN_EXLLAMA:
                        logger.warning(
                            "Exllama GPTQ cuda kernels (which are faster) could have been used, but are not currently installed, try using BUILD_EXTENSIONS=True"
                        )
                    use_exllama = False
                else:
                    logger.info("Using exllama kernels")
            if use_exllama:
                if groupsize >= 0:
                    # Exllama reorders the weights in advance and the activations on the fly, thus
                    # the scales and zero-points do not need to be reordered.
                    qzeros = self.get_sharded(f"{prefix}.qzeros", dim=0)
                    scales = self.get_sharded(f"{prefix}.scales", dim=0)
                else:
                    qzeros = self.get_tensor(f"{prefix}.qzeros")
                    scales = self.get_tensor(f"{prefix}.scales")
                # For tp > 1, at this point we know we do not use act-order
                if self.process_group.size() == 1:
                    g_idx = self.get_tensor(f"{prefix}.g_idx")
                else:
                    g_idx = None
            else:
                # The triton kernel reorders the scales/zero points instead of the weight/activation.
                # Thus, each rank needs the full qzeros/scales.
                qzeros = self.get_tensor(f"{prefix}.qzeros")
                scales = self.get_tensor(f"{prefix}.scales")
                g_idx = self.get_sharded(f"{prefix}.g_idx", dim=0)
            weight = (qweight, qzeros, scales, g_idx, bits, groupsize, use_exllama)
        else:
            weight = self.get_sharded(f"{prefix}.weight", dim=1)
        return weight
    def _get_gptq_params(self) -> Tuple[int, int]:
        try:
            bits = self.get_tensor("gptq_bits").item()
            groupsize = self.get_tensor("gptq_groupsize").item()
        except (SafetensorError, RuntimeError) as e:
            try:
                bits = self.gptq_bits
                groupsize = self.gptq_groupsize
            except Exception:
                raise e
        return bits, groupsize
    def _set_gptq_params(self, model_id):
        filename = "config.json"
        try:
            if os.path.exists(os.path.join(model_id, filename)):
                filename = os.path.join(model_id, filename)
            else:
                filename = hf_hub_download(model_id, filename=filename)
            with open(filename, "r") as f:
                data = json.load(f)
            self.gptq_bits = data["quantization_config"]["bits"]
            self.gptq_groupsize = data["quantization_config"]["group_size"]
        except Exception:
            filename = "quantize_config.json"
            try:
                if os.path.exists(os.path.join(model_id, filename)):
                    filename = os.path.join(model_id, filename)
                else:
                    filename = hf_hub_download(model_id, filename=filename)
                with open(filename, "r") as f:
                    data = json.load(f)
                self.gptq_bits = data["bits"]
                self.gptq_groupsize = data["group_size"]
            except Exception:
                pass
--- a/server/text_generation_server/utils/my_dist.py
+++ b/server/text_generation_server/utils/my_dist.py
@ -0,0 +1,47 @@
 import os
 import torch
 from datetime import timedelta
 from loguru import logger
 from mpi4py import MPI
 import my_custom_comm
 # Tensor Parallelism settings
 RANK = int(os.getenv("OMPI_COMM_WORLD_RANK", "0"))
 WORLD_SIZE = int(os.getenv("OMPI_COMM_WORLD_SIZE", "1"))
 # CUDA memory fraction
 MEMORY_FRACTION = float(os.getenv("CUDA_MEMORY_FRACTION", "1.0"))
 class MyCommGroup:
    def __init__(self, rank, size, tp_comm, pp_comm):
        self._rank = rank
        self._size = size
        self.tp_comm = tp_comm
        self.pp_comm = pp_comm
    def size(self):
        return self._size
    def rank(self):
        return self._rank
 def initialize_mpi_distributed():
    assert torch.cuda.is_available()
    # mpi initialize
    COMM = MPI.COMM_WORLD
    assert COMM.Get_size() == WORLD_SIZE, f"{COMM.Get_size()},{WORLD_SIZE}"
    # Set the device id.
    assert WORLD_SIZE <= torch.cuda.device_count(), "Each process is one gpu"
    device = RANK % torch.cuda.device_count()
    torch.cuda.set_device(device)
    torch.cuda.set_per_process_memory_fraction(MEMORY_FRACTION, device)
    # nccl initialize
    tp_comm, pp_comm = my_custom_comm.init_nccl(WORLD_SIZE, 1)
    process_group = MyCommGroup(RANK, WORLD_SIZE, tp_comm, pp_comm)
    logger.warning("custom mpi and nccl is already initialized.")
    return process_group, RANK, WORLD_SIZE, COMM