Temporary work to make sharding work properly.

This commit is contained in:
Ubuntu 2023-05-08 09:00:54 +00:00
parent c5846ee73a
commit 57a6cbff82
2 changed files with 23 additions and 15 deletions

View File

@ -157,13 +157,6 @@ COPY --from=transformers-builder /usr/src/transformers/build/lib.linux-x86_64-cp
RUN cd /usr/src/transformers && pip install -e . --no-cache-dir && pip install einops --no-cache-dir
# Install server
COPY proto proto
COPY server server
COPY server/Makefile server/Makefile
RUN cd server && \
make gen-server && \
pip install -r requirements.txt && \
pip install ".[bnb, accelerate]" --no-cache-dir
# Install benchmarker
COPY --from=builder /usr/src/target/release/text-generation-benchmark /usr/local/bin/text-generation-benchmark
@ -172,16 +165,31 @@ COPY --from=builder /usr/src/target/release/text-generation-router /usr/local/bi
# Install launcher
COPY --from=builder /usr/src/target/release/text-generation-launcher /usr/local/bin/text-generation-launcher
COPY proto proto
COPY server/requirements.txt server/requirements.txt
COPY server/pyproject.toml server/pyproject.toml
RUN cd server && pip install -r requirements.txt
COPY server/text_generation_server server/text_generation_server
COPY server/Makefile server/Makefile
COPY server/Makefile-flash-att server/Makefile-flash-att
COPY server/Makefile-transformers server/Makefile-transformers
RUN cd server && \
make gen-server && \
pip install ".[bnb, accelerate]" --no-cache-dir
RUN apt update && apt install build-essential g++ -y
# AWS Sagemaker compatible image
FROM base as sagemaker
COPY sagemaker-entrypoint.sh entrypoint.sh
RUN chmod +x entrypoint.sh
RUN pip install triton
ENTRYPOINT ["./entrypoint.sh"]
# Final image
FROM base
ENTRYPOINT ["text-generation-launcher"]
CMD ["--json-output"]

View File

@ -184,7 +184,6 @@ class FastLinear(nn.Linear):
return tensor.contiguous()
if isinstance(self, TensorParallelRowLinear):
raise ValueError("This is currently not functionning")
get_slice = get_row_slice
elif isinstance(self, TensorParallelColumnLinear):
get_slice = get_col_slice
@ -216,10 +215,11 @@ class FastLinear(nn.Linear):
elif name == "self_attn.o_proj":
self.qlinear.qweight[:] = get_slice(f, f"model.layers.{layer}.self_attn.o_proj.qweight")
self.qlinear.qzeros[:] = get_slice(f, f"model.layers.{layer}.self_attn.o_proj.qzeros")
self.qlinear.scales[:] = get_slice(f, f"model.layers.{layer}.self_attn.o_proj.scales")
self.qlinear.qweight = f.get_tensor(f"model.layers.{layer}.self_attn.o_proj.qweight")
self.qlinear.qzeros = f.get_tensor(f"model.layers.{layer}.self_attn.o_proj.qzeros")
self.qlinear.scales = f.get_tensor(f"model.layers.{layer}.self_attn.o_proj.scales")
self.qlinear.g_idx[:] = get_slice(f, f"model.layers.{layer}.self_attn.o_proj.g_idx")
import ipdb;ipdb.set_trace()
elif name == "mlp.gate_up_proj":
N = self.qlinear.qweight.shape[1] // 2
@ -237,9 +237,9 @@ class FastLinear(nn.Linear):
self.qlinear.qzeros[:, :N] = get_slice(f, f"model.layers.{layer}.mlp.gate_proj.qzeros")
elif name == "mlp.down_proj":
self.qlinear.qweight[:] = get_slice(f, f"model.layers.{layer}.mlp.down_proj.qweight")
self.qlinear.qzeros[:] = get_slice(f, f"model.layers.{layer}.mlp.down_proj.qzeros")
self.qlinear.scales[:] = get_slice(f, f"model.layers.{layer}.mlp.down_proj.scales")
self.qlinear.qweight = f.get_tensor(f"model.layers.{layer}.mlp.down_proj.qweight")
self.qlinear.qzeros = f.get_tensor(f"model.layers.{layer}.mlp.down_proj.qzeros")
self.qlinear.scales = f.get_tensor(f"model.layers.{layer}.mlp.down_proj.scales")
self.qlinear.g_idx[:] = get_slice(f, f"model.layers.{layer}.mlp.down_proj.g_idx")
else:
raise ValueError("Not handled")