Mirror of https://github.com/huggingface/text-generation-inference.git (synced 2025-04-24 16:32:12 +00:00)
Tmp work for sharding to work properly.

commit 57a6cbff82, parent c5846ee73a

Dockerfile: 22 lines changed
@@ -157,13 +157,6 @@ COPY --from=transformers-builder /usr/src/transformers/build/lib.linux-x86_64-cp
 RUN cd /usr/src/transformers && pip install -e . --no-cache-dir && pip install einops --no-cache-dir
 
 # Install server
-COPY proto proto
-COPY server server
-COPY server/Makefile server/Makefile
-RUN cd server && \
-    make gen-server && \
-    pip install -r requirements.txt && \
-    pip install ".[bnb, accelerate]" --no-cache-dir
 
 # Install benchmarker
 COPY --from=builder /usr/src/target/release/text-generation-benchmark /usr/local/bin/text-generation-benchmark
@@ -172,16 +165,31 @@ COPY --from=builder /usr/src/target/release/text-generation-router /usr/local/bi
 # Install launcher
 COPY --from=builder /usr/src/target/release/text-generation-launcher /usr/local/bin/text-generation-launcher
 
+COPY proto proto
+COPY server/requirements.txt server/requirements.txt
+COPY server/pyproject.toml server/pyproject.toml
+RUN cd server && pip install -r requirements.txt
+COPY server/text_generation_server server/text_generation_server
+COPY server/Makefile server/Makefile
+COPY server/Makefile-flash-att server/Makefile-flash-att
+COPY server/Makefile-transformers server/Makefile-transformers
+RUN cd server && \
+    make gen-server && \
+    pip install ".[bnb, accelerate]" --no-cache-dir
+RUN apt update && apt install build-essential g++ -y
 
 # AWS Sagemaker compatbile image
 FROM base as sagemaker
 
 COPY sagemaker-entrypoint.sh entrypoint.sh
 RUN chmod +x entrypoint.sh
+RUN pip install triton
 
 ENTRYPOINT ["./entrypoint.sh"]
 
 # Final image
 FROM base
 
 
 ENTRYPOINT ["text-generation-launcher"]
 CMD ["--json-output"]
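The Dockerfile restructuring above splits the old monolithic server install: requirements.txt and pyproject.toml are now copied and pip-installed before the text_generation_server sources, which presumably lets Docker cache the heavy dependency layer when only source files change. It also adds the flash-attention and transformers Makefiles, a build-essential/g++ toolchain, and, for the sagemaker target, triton. The remaining hunks are from the quantized FastLinear loading code; the file path is not shown in this view.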
@@ -184,7 +184,6 @@ class FastLinear(nn.Linear):
             return tensor.contiguous()
 
         if isinstance(self, TensorParallelRowLinear):
-            raise ValueError("This is currently not functionning")
             get_slice = get_row_slice
         elif isinstance(self, TensorParallelColumnLinear):
             get_slice = get_col_slice
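The deleted ValueError means the row-parallel path is no longer rejected outright. The bodies of get_row_slice and get_col_slice are not part of this diff; as a minimal sketch (the rank/world-size plumbing and the even-division assumption are mine, not the repo's), they would shard a safetensors tensor along dim 0 or dim 1:

    import torch.distributed as dist

    def get_row_slice(f, name):
        # f is assumed to be a safetensors safe_open(..., framework="pt") handle.
        rank, world_size = dist.get_rank(), dist.get_world_size()
        slice_ = f.get_slice(name)        # lazy handle; nothing is read yet
        rows = slice_.get_shape()[0]
        block = rows // world_size        # assumes rows divide evenly across ranks
        return slice_[rank * block : (rank + 1) * block]

    def get_col_slice(f, name):
        # Same idea along the column dimension, for column-parallel layers.
        rank, world_size = dist.get_rank(), dist.get_world_size()
        slice_ = f.get_slice(name)
        cols = slice_.get_shape()[1]
        block = cols // world_size
        return slice_[:, rank * block : (rank + 1) * block]

Loaded this way, each rank holds only 1/world_size of every weight, which is the sharding behaviour the commit message says is still being worked toward.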
@@ -216,10 +215,11 @@ class FastLinear(nn.Linear):
 
 
         elif name == "self_attn.o_proj":
-            self.qlinear.qweight[:] = get_slice(f, f"model.layers.{layer}.self_attn.o_proj.qweight")
-            self.qlinear.qzeros[:] = get_slice(f, f"model.layers.{layer}.self_attn.o_proj.qzeros")
-            self.qlinear.scales[:] = get_slice(f, f"model.layers.{layer}.self_attn.o_proj.scales")
+            self.qlinear.qweight = f.get_tensor(f"model.layers.{layer}.self_attn.o_proj.qweight")
+            self.qlinear.qzeros = f.get_tensor(f"model.layers.{layer}.self_attn.o_proj.qzeros")
+            self.qlinear.scales = f.get_tensor(f"model.layers.{layer}.self_attn.o_proj.scales")
             self.qlinear.g_idx[:] = get_slice(f, f"model.layers.{layer}.self_attn.o_proj.g_idx")
+            import ipdb;ipdb.set_trace()
 
         elif name == "mlp.gate_up_proj":
             N = self.qlinear.qweight.shape[1] // 2
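This hunk switches the packed GPTQ buffers (qweight, qzeros, scales) of the row-parallel o_proj from sharded reads to full reads, while g_idx keeps the sliced path, and it leaves an ipdb breakpoint behind, consistent with the commit's "tmp work" status. The two safetensors access patterns it toggles between differ like this (file name and indices are placeholders):

    from safetensors import safe_open

    with safe_open("model.safetensors", framework="pt") as f:
        name = "model.layers.0.self_attn.o_proj.qweight"
        full = f.get_tensor(name)         # materializes the whole packed tensor on every rank
        shard = f.get_slice(name)[0:128]  # lazy handle; reads only the requested rows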
@@ -237,9 +237,9 @@ class FastLinear(nn.Linear):
             self.qlinear.qzeros[:, :N] = get_slice(f, f"model.layers.{layer}.mlp.gate_proj.qzeros")
 
         elif name == "mlp.down_proj":
-            self.qlinear.qweight[:] = get_slice(f, f"model.layers.{layer}.mlp.down_proj.qweight")
-            self.qlinear.qzeros[:] = get_slice(f, f"model.layers.{layer}.mlp.down_proj.qzeros")
-            self.qlinear.scales[:] = get_slice(f, f"model.layers.{layer}.mlp.down_proj.scales")
+            self.qlinear.qweight = f.get_tensor(f"model.layers.{layer}.mlp.down_proj.qweight")
+            self.qlinear.qzeros = f.get_tensor(f"model.layers.{layer}.mlp.down_proj.qzeros")
+            self.qlinear.scales = f.get_tensor(f"model.layers.{layer}.mlp.down_proj.scales")
             self.qlinear.g_idx[:] = get_slice(f, f"model.layers.{layer}.mlp.down_proj.g_idx")
         else:
             raise ValueError("Not handled")
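One subtlety reviewers may want to flag in these hunks: qweight[:] = ... copies into the preallocated (sharded) buffer and enforces a compatible shape, whereas qweight = ... silently rebinds the attribute to the full, unsharded tensor. A toy torch illustration (shapes invented):

    import torch

    buf = torch.zeros(2, 4)
    src = torch.ones(8, 4)
    # buf[:] = src    # would raise RuntimeError: in-place copy needs matching shapes
    buf = src         # rebinds the name; no shape check, old buffer is discarded
    print(buf.shape)  # torch.Size([8, 4])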