Mirror of https://github.com/huggingface/text-generation-inference.git
Tmp work for sharding to work properly.
This commit is contained in:
parent c5846ee73a
commit 57a6cbff82

Dockerfile: 22 lines changed
@@ -157,13 +157,6 @@ COPY --from=transformers-builder /usr/src/transformers/build/lib.linux-x86_64-cp
 RUN cd /usr/src/transformers && pip install -e . --no-cache-dir && pip install einops --no-cache-dir
 
 # Install server
-COPY proto proto
-COPY server server
-COPY server/Makefile server/Makefile
-RUN cd server && \
-    make gen-server && \
-    pip install -r requirements.txt && \
-    pip install ".[bnb, accelerate]" --no-cache-dir
 
 # Install benchmarker
 COPY --from=builder /usr/src/target/release/text-generation-benchmark /usr/local/bin/text-generation-benchmark
@@ -172,16 +165,31 @@ COPY --from=builder /usr/src/target/release/text-generation-router /usr/local/bi
 # Install launcher
 COPY --from=builder /usr/src/target/release/text-generation-launcher /usr/local/bin/text-generation-launcher
 
+COPY proto proto
+COPY server/requirements.txt server/requirements.txt
+COPY server/pyproject.toml server/pyproject.toml
+RUN cd server && pip install -r requirements.txt
+COPY server/text_generation_server server/text_generation_server
+COPY server/Makefile server/Makefile
+COPY server/Makefile-flash-att server/Makefile-flash-att
+COPY server/Makefile-transformers server/Makefile-transformers
+RUN cd server && \
+    make gen-server && \
+    pip install ".[bnb, accelerate]" --no-cache-dir
+RUN apt update && apt install build-essential g++ -y
+
 # AWS Sagemaker compatible image
 FROM base as sagemaker
 
 COPY sagemaker-entrypoint.sh entrypoint.sh
 RUN chmod +x entrypoint.sh
+RUN pip install triton
+
 ENTRYPOINT ["./entrypoint.sh"]
 
 # Final image
 FROM base
 
 ENTRYPOINT ["text-generation-launcher"]
 CMD ["--json-output"]
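Taken together, the Dockerfile hunks appear to be a build-caching and completeness fix: instead of one coarse COPY server server, the proto definitions, requirements.txt, and pyproject.toml are copied and dependencies installed before the text_generation_server sources, so source edits no longer invalidate the pip install layers. The Makefile-flash-att and Makefile-transformers files are presumably copied because the server Makefile references them, and the build-essential/g++ and triton installs presumably support compiling custom kernels inside the image.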
@@ -184,7 +184,6 @@ class FastLinear(nn.Linear):
             return tensor.contiguous()
 
         if isinstance(self, TensorParallelRowLinear):
-            raise ValueError("This is currently not functionning")
             get_slice = get_row_slice
         elif isinstance(self, TensorParallelColumnLinear):
             get_slice = get_col_slice
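For context on the sharding machinery this hunk touches: get_row_slice and get_col_slice select which shard of a checkpoint tensor each rank loads. Below is a minimal sketch of that idea against safetensors' lazy-slicing API; the file name, tensor name, and rank/world_size values are placeholders, not values from this commit.

# Sketch of row/column sharded loading with safetensors' slicing API.
# File name, tensor name, rank, and world_size are illustrative only.
from safetensors import safe_open

def shard_bounds(dim_size: int, rank: int, world_size: int) -> tuple[int, int]:
    """Contiguous [start, stop) range of one dimension owned by this rank."""
    block = dim_size // world_size
    return rank * block, (rank + 1) * block

rank, world_size = 0, 2

with safe_open("model.safetensors", framework="pt") as f:
    sl = f.get_slice("model.layers.0.self_attn.o_proj.qweight")
    rows, cols = sl.get_shape()

    # Column-parallel layers (e.g. gate/up projections) split the output
    # dimension: each rank reads only its slice of the columns.
    start, stop = shard_bounds(cols, rank, world_size)
    col_shard = sl[:, start:stop]

    # Row-parallel layers (o_proj, down_proj) split the input dimension:
    # each rank reads only its slice of the rows.
    start, stop = shard_bounds(rows, rank, world_size)
    row_shard = sl[start:stop]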
@@ -216,10 +215,11 @@ class FastLinear(nn.Linear):
 
 
         elif name == "self_attn.o_proj":
-            self.qlinear.qweight[:] = get_slice(f, f"model.layers.{layer}.self_attn.o_proj.qweight")
-            self.qlinear.qzeros[:] = get_slice(f, f"model.layers.{layer}.self_attn.o_proj.qzeros")
-            self.qlinear.scales[:] = get_slice(f, f"model.layers.{layer}.self_attn.o_proj.scales")
+            self.qlinear.qweight = f.get_tensor(f"model.layers.{layer}.self_attn.o_proj.qweight")
+            self.qlinear.qzeros = f.get_tensor(f"model.layers.{layer}.self_attn.o_proj.qzeros")
+            self.qlinear.scales = f.get_tensor(f"model.layers.{layer}.self_attn.o_proj.scales")
             self.qlinear.g_idx[:] = get_slice(f, f"model.layers.{layer}.self_attn.o_proj.g_idx")
+            import ipdb;ipdb.set_trace()
 
         elif name == "mlp.gate_up_proj":
             N = self.qlinear.qweight.shape[1] // 2
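Read against the hunk above: the row-parallel o_proj branch stops copying shards into the preallocated qlinear buffers (qweight[:] = get_slice(...)) and instead rebinds the attributes to the full, unsharded tensors via f.get_tensor(...), leaving only g_idx sliced. The full tensors will not match the preallocated shard shapes, which, together with the inline ipdb breakpoint, marks this path as unfinished debugging, consistent with the "Tmp work" commit message.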
@@ -237,9 +237,9 @@ class FastLinear(nn.Linear):
             self.qlinear.qzeros[:, :N] = get_slice(f, f"model.layers.{layer}.mlp.gate_proj.qzeros")
 
         elif name == "mlp.down_proj":
-            self.qlinear.qweight[:] = get_slice(f, f"model.layers.{layer}.mlp.down_proj.qweight")
-            self.qlinear.qzeros[:] = get_slice(f, f"model.layers.{layer}.mlp.down_proj.qzeros")
-            self.qlinear.scales[:] = get_slice(f, f"model.layers.{layer}.mlp.down_proj.scales")
+            self.qlinear.qweight = f.get_tensor(f"model.layers.{layer}.mlp.down_proj.qweight")
+            self.qlinear.qzeros = f.get_tensor(f"model.layers.{layer}.mlp.down_proj.qzeros")
+            self.qlinear.scales = f.get_tensor(f"model.layers.{layer}.mlp.down_proj.scales")
             self.qlinear.g_idx[:] = get_slice(f, f"model.layers.{layer}.mlp.down_proj.g_idx")
         else:
             raise ValueError("Not handled")
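down_proj is the other row-parallel layer, and it receives the same full-tensor fallback as o_proj. For reference, the reason row-parallel shards need care at load time is that each rank's matmul output is only a partial sum; below is a generic sketch of how a row-parallel linear combines shards at runtime, not this repo's code, and it assumes torch.distributed has been initialized elsewhere.

# Generic row-parallel matmul: each rank holds a slice of the input
# dimension of W; partial outputs are summed across ranks.
import torch
import torch.distributed as dist

def row_parallel_forward(x_shard: torch.Tensor, w_shard: torch.Tensor) -> torch.Tensor:
    # x_shard: [batch, in_features // world_size]
    # w_shard: [out_features, in_features // world_size]
    partial = x_shard @ w_shard.t()                 # [batch, out_features]
    dist.all_reduce(partial, op=dist.ReduceOp.SUM)  # sum partials across ranks
    return partial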