text-generation-inference/nix/server.nix
Daniël de Kok 571ac9b507
Use kernels from the kernel hub (#2988)
* Use Hub kernels for Marlin and cutlass quantization kernels

* Use hub kernels for MoE/GPTQ-Marlin MoE

* Use attention kernels from the Hub

* Cache the kernels in the Docker image

* Update moe kernels

* Support loading local kernels for development

* Support latest moe kernels

* Update to moe 0.1.1

* CI: download locked kernels for server tests

* Fixup some imports

* CI: activate venv

* Fix unused imports

* Nix: add attention/moe/quantization kernels

* Update hf-kernels to 0.1.5

* Update kernels

* Update tgi-nix flake for hf-kernels

* Fix EOF

* Take `load_kernel` out of a frequently-called function

* Hoist another case of kernel loading out of a somewhat hot function

* marlin-kernels -> quantization

* attention -> paged-attention

* EOF fix

* Update hf-kernels, fixup Docker

* ipex fix

* Remove outdated TODO
2025-02-10 19:19:25 +01:00

129 lines
2.4 KiB
Nix

# Nix package expression for the `text-generation-server` Python package.
#
# Every argument below is expected to be supplied by the caller (typically via
# `callPackage`). The expression returns the result of `buildPythonPackage`.
{
  # Source filtering and build tooling.
  nix-filter,
  buildPythonPackage,
  poetry-core,
  mypy-protobuf,

  # Python packages; see `dependencies` below for the ones that are used.
  awq-inference-engine,
  causal-conv1d,
  compressed-tensors,
  eetq,
  einops,
  exllamav2,
  flashinfer,
  flash-attn,
  flash-attn-layer-norm,
  flash-attn-rotary,
  # NOTE(review): accepted but not referenced anywhere in this expression.
  flash-attn-v1,
  grpc-interceptor,
  grpcio-reflection,
  grpcio-status,
  grpcio-tools,
  hf-kernels,
  hf-transfer,
  loguru,
  mamba-ssm,
  moe,
  opentelemetry-api,
  opentelemetry-exporter-otlp,
  opentelemetry-instrumentation-grpc,
  opentelemetry-semantic-conventions,
  outlines,
  paged-attention,
  peft,
  pillow,
  prometheus-client,
  punica-kernels,
  py-cpuinfo,
  pydantic,
  quantization,
  safetensors,
  tokenizers,
  # NOTE(review): accepted but not referenced anywhere in this expression.
  torch,
  sentencepiece,
  transformers,
  typer,
}:
let
  # Short alias for the nix-filter library; it is called as a function to build
  # `src` and also provides the predicate helpers used in `include`.
  sourceFilter = nix-filter.lib;
in
buildPythonPackage {
  name = "text-generation-server";

  # Only copy what the server build needs from the directory above this file.
  src = sourceFilter {
    root = ../.;
    include = with sourceFilter; [
      # Directories themselves.
      isDirectory
      # Python sources and type stubs under server/.
      (and (inDirectory "server") (or_ (matchExt "py") (matchExt "pyi")))
      # Project metadata for the build backend.
      "server/pyproject.toml"
      # Protocol definitions that prePatch compiles.
      (and (inDirectory "proto/v3") (matchExt "proto"))
    ];
  };

  pyproject = true;
  build-system = [ poetry-core ];

  # Needed at build time for the `--mypy_out` protoc output in prePatch
  # (presumably provides the protoc mypy plugin).
  nativeBuildInputs = [ mypy-protobuf ];

  # Requirements whose version constraints are relaxed.
  pythonRelaxDeps = [
    "einops"
    "huggingface-hub"
    "loguru"
    "opentelemetry-instrumentation-grpc"
    "pillow"
    "sentencepiece"
    "typer"
  ];

  # Requirement dropped from the package's declared dependencies.
  pythonRemoveDeps = [ "scipy" ];

  dependencies = [
    # Kernels, quantization and attention backends.
    awq-inference-engine
    eetq
    causal-conv1d
    compressed-tensors
    einops
    exllamav2
    flashinfer
    flash-attn
    flash-attn-layer-norm
    flash-attn-rotary

    # gRPC stack.
    grpc-interceptor
    grpcio-reflection
    grpcio-status
    grpcio-tools

    hf-kernels
    hf-transfer
    loguru
    mamba-ssm
    moe

    # Telemetry.
    opentelemetry-api
    opentelemetry-exporter-otlp
    opentelemetry-instrumentation-grpc
    opentelemetry-semantic-conventions

    outlines
    paged-attention
    peft
    pillow
    prometheus-client
    punica-kernels
    py-cpuinfo
    pydantic
    quantization
    safetensors
    sentencepiece
    tokenizers
    transformers
    typer
  ];

  # Steps, in order:
  #   1. Compile proto/v3/generate.proto into Python, gRPC and mypy outputs
  #      under server/text_generation_server/pb.
  #   2. Rewrite lines starting with `import ... pb2` in the generated *.py
  #      files into `from . import ... pb2` (package-relative imports).
  #   3. Create pb/__init__.py.
  #   4. Change into server/, where pyproject.toml lives.
  # NOTE(review): `-print0` in the find call also writes each matched path
  # (NUL-terminated) to the build output; this looks unintentional but is
  # harmless, so the command is left unchanged.
  prePatch = ''
    python -m grpc_tools.protoc -Iproto/v3 --python_out=server/text_generation_server/pb \
    --grpc_python_out=server/text_generation_server/pb --mypy_out=server/text_generation_server/pb proto/v3/generate.proto
    find server/text_generation_server/pb/ -type f -name "*.py" -print0 -exec sed -i -e 's/^\(import.*pb2\)/from . \1/g' {} \;
    touch server/text_generation_server/pb/__init__.py
    cd server
  '';
}