From 57a6cbff82126e2197e6e73bf210d0c303dff043 Mon Sep 17 00:00:00 2001
From: Ubuntu
Date: Mon, 8 May 2023 09:00:54 +0000
Subject: [PATCH] Tmp work for sharding to work properly.

---
 Dockerfile                                   | 24 ++++++++++++-------
 .../custom_modeling/flash_llama_modeling.py  | 14 +++++------
 2 files changed, 23 insertions(+), 15 deletions(-)

diff --git a/Dockerfile b/Dockerfile
index 50145395..f2a7f9a1 100644
--- a/Dockerfile
+++ b/Dockerfile
@@ -157,13 +157,6 @@ COPY --from=transformers-builder /usr/src/transformers/build/lib.linux-x86_64-cp
 RUN cd /usr/src/transformers && pip install -e . --no-cache-dir && pip install einops --no-cache-dir
 
 # Install server
-COPY proto proto
-COPY server server
-COPY server/Makefile server/Makefile
-RUN cd server && \
-    make gen-server && \
-    pip install -r requirements.txt && \
-    pip install ".[bnb, accelerate]" --no-cache-dir
 
 # Install benchmarker
 COPY --from=builder /usr/src/target/release/text-generation-benchmark /usr/local/bin/text-generation-benchmark
@@ -172,16 +165,31 @@ COPY --from=builder /usr/src/target/release/text-generation-router /usr/local/bi
 # Install launcher
 COPY --from=builder /usr/src/target/release/text-generation-launcher /usr/local/bin/text-generation-launcher
 
+COPY proto proto
+COPY server/requirements.txt server/requirements.txt
+COPY server/pyproject.toml server/pyproject.toml
+RUN cd server && pip install -r requirements.txt
+COPY server/text_generation_server server/text_generation_server
+COPY server/Makefile server/Makefile
+COPY server/Makefile-flash-att server/Makefile-flash-att
+COPY server/Makefile-transformers server/Makefile-transformers
+RUN cd server && \
+    make gen-server && \
+    pip install ".[bnb, accelerate]" --no-cache-dir
+RUN apt update && apt install build-essential g++ -y
+
 # AWS Sagemaker compatbile image
 FROM base as sagemaker
 
 COPY sagemaker-entrypoint.sh entrypoint.sh
 RUN chmod +x entrypoint.sh
+RUN pip install triton
 
 ENTRYPOINT ["./entrypoint.sh"]
 
 # Final image
 FROM base
 
+
 ENTRYPOINT ["text-generation-launcher"]
-CMD ["--json-output"]
\ No newline at end of file
+CMD ["--json-output"]
diff --git a/server/text_generation_server/models/custom_modeling/flash_llama_modeling.py b/server/text_generation_server/models/custom_modeling/flash_llama_modeling.py
index 2f259b67..56aa6b0d 100644
--- a/server/text_generation_server/models/custom_modeling/flash_llama_modeling.py
+++ b/server/text_generation_server/models/custom_modeling/flash_llama_modeling.py
@@ -184,7 +184,6 @@ class FastLinear(nn.Linear):
             return tensor.contiguous()
 
         if isinstance(self, TensorParallelRowLinear):
-            raise ValueError("This is currently not functionning")
            get_slice = get_row_slice
         elif isinstance(self, TensorParallelColumnLinear):
             get_slice = get_col_slice
@@ -216,10 +215,11 @@ class FastLinear(nn.Linear):
         elif name == "self_attn.o_proj":
-            self.qlinear.qweight[:] = get_slice(f, f"model.layers.{layer}.self_attn.o_proj.qweight")
-            self.qlinear.qzeros[:] = get_slice(f, f"model.layers.{layer}.self_attn.o_proj.qzeros")
-            self.qlinear.scales[:] = get_slice(f, f"model.layers.{layer}.self_attn.o_proj.scales")
+            self.qlinear.qweight = f.get_tensor(f"model.layers.{layer}.self_attn.o_proj.qweight")
+            self.qlinear.qzeros = f.get_tensor(f"model.layers.{layer}.self_attn.o_proj.qzeros")
+            self.qlinear.scales = f.get_tensor(f"model.layers.{layer}.self_attn.o_proj.scales")
             self.qlinear.g_idx[:] = get_slice(f, f"model.layers.{layer}.self_attn.o_proj.g_idx")
+            import ipdb;ipdb.set_trace()
         elif name == "mlp.gate_up_proj":
             N = self.qlinear.qweight.shape[1] // 2
@@ -237,9 +237,9 @@ class FastLinear(nn.Linear):
             self.qlinear.qzeros[:, :N] = get_slice(f, f"model.layers.{layer}.mlp.gate_proj.qzeros")
         elif name == "mlp.down_proj":
-            self.qlinear.qweight[:] = get_slice(f, f"model.layers.{layer}.mlp.down_proj.qweight")
-            self.qlinear.qzeros[:] = get_slice(f, f"model.layers.{layer}.mlp.down_proj.qzeros")
-            self.qlinear.scales[:] = get_slice(f, f"model.layers.{layer}.mlp.down_proj.scales")
+            self.qlinear.qweight = f.get_tensor(f"model.layers.{layer}.mlp.down_proj.qweight")
+            self.qlinear.qzeros = f.get_tensor(f"model.layers.{layer}.mlp.down_proj.qzeros")
+            self.qlinear.scales = f.get_tensor(f"model.layers.{layer}.mlp.down_proj.scales")
             self.qlinear.g_idx[:] = get_slice(f, f"model.layers.{layer}.mlp.down_proj.g_idx")
         else:
             raise ValueError("Not handled")