mirror of
https://github.com/huggingface/text-generation-inference.git
synced 2025-10-20 12:25:23 +00:00
Deepseek V2 is a MoE model from Deepseek. Relevant variations
compared to other models:
- Grouped top-K in expert selection.
- mscale in yarn is calculated using the `mscale` and `mscale_all_dim`
configuration options.
- `mscale_all_dim` is also used in scaling attention softmax.
- Permuting of the query/key representations before applying rotary
embeddings.
- Some projections cannot be sharded (`q_a_proj`, `kv_a_proj_with_mqa`).
So, we need weight loading that supports quantized weights. To this
end, `{Weights,WeightLoader}.get_weight` was added.
- The query/key head dimensionality differs from that of the value,
so we need to pad during attention.
- Heads of size 192 need an extension to our paged attention
fork, and we need to ensure that the KV cache is allocated with the
correct size.
- Shared experts.
24 lines
898 B
Plaintext
24 lines
898 B
Plaintext
# Pinned commit that the CUDA targets check out inside ./vllm before building or installing.
commit_cuda := d243e9dc7e2c9c2e36a4150ec8e64809cb55c01b
|
|
# Pinned commit that the ROCm targets check out inside ./vllm before building or installing.
commit_rocm := c6ee53b1be97e3bbc791b95f22827501297f8921
|
|
# Build vLLM for CUDA.
# On first use (no ./vllm directory yet) this upgrades the `ninja` and
# `packaging` pip packages and clones the Narsil/vllm fork into ./vllm.
# It then fetches from origin, checks out $(commit_cuda) and runs
# `python setup.py build`.
# The clone is skipped whenever ./vllm already exists, whatever it contains.
# Declared phony because the target names a command, not a file: a stray file
# or directory called `build-vllm-cuda` must never make it look up to date.
.PHONY: build-vllm-cuda
build-vllm-cuda:
	if [ ! -d 'vllm' ]; then \
		pip install -U ninja packaging --no-cache-dir && \
		git clone https://github.com/Narsil/vllm.git vllm; \
	fi
	cd vllm && git fetch origin && git checkout $(commit_cuda) && python setup.py build
|
|
|
|
# Install the CUDA build of vLLM in editable mode.
# Depends on build-vllm-cuda, so ./vllm exists and has been built first; the
# recipe then re-fetches, re-checks-out $(commit_cuda) and runs
# `pip install -e .` from inside ./vllm.
# Declared phony because the target names a command, not a file.
.PHONY: install-vllm-cuda
install-vllm-cuda: build-vllm-cuda
	cd vllm && git fetch origin && git checkout $(commit_cuda) && pip install -e .
|
|
|
|
# Build vLLM for ROCm.
# On first use (no ./vllm directory yet) this upgrades the `ninja` and
# `packaging` pip packages and clones the fxmarty/rocm-vllm fork into ./vllm.
# It then fetches, checks out $(commit_rocm) and runs `python setup.py build`
# with PYTORCH_ROCM_ARCH set to "gfx90a;gfx942" (presumably the GPU
# architectures to compile for -- confirm against the fork's setup.py).
# The clone is skipped whenever ./vllm already exists, whatever it contains.
# Declared phony because the target names a command, not a file: a stray file
# or directory called `build-vllm-rocm` must never make it look up to date.
.PHONY: build-vllm-rocm
build-vllm-rocm:
	if [ ! -d 'vllm' ]; then \
		pip install -U ninja packaging --no-cache-dir && \
		git clone https://github.com/fxmarty/rocm-vllm.git vllm; \
	fi
	cd vllm && git fetch && git checkout $(commit_rocm) && \
		PYTORCH_ROCM_ARCH="gfx90a;gfx942" python setup.py build
|
|
|
|
# Install the ROCm build of vLLM in editable mode.
# Depends on build-vllm-rocm, so ./vllm exists and has been built first; the
# recipe then re-fetches, re-checks-out $(commit_rocm) and runs
# `pip install -e .` from inside ./vllm with the same PYTORCH_ROCM_ARCH value
# used for the build.
# Declared phony because the target names a command, not a file.
.PHONY: install-vllm-rocm
install-vllm-rocm: build-vllm-rocm
	cd vllm && git fetch && git checkout $(commit_rocm) && \
		PYTORCH_ROCM_ARCH="gfx90a;gfx942" pip install -e .
|