text-generation-inference/integration-tests/models/test_flash_llama_fp8_kv_cache.py

import pytest


@pytest.fixture(scope="module")
def flash_llama_fp8_kv_cache_handle(launcher):
    """Yield the handle produced by ``launcher`` for the FP8-KV Llama checkpoint.

    The ``launcher`` fixture is called with the model id plus ``num_shard=2``
    and ``kv_cache_dtype="fp8_e4m3fn"``; its context stays open for as long
    as the module-scoped fixture is in use.
    """
    model_id = "neuralmagic/Meta-Llama-3-8B-Instruct-FP8-KV"
    launch_options = {"num_shard": 2, "kv_cache_dtype": "fp8_e4m3fn"}

    with launcher(model_id, **launch_options) as server:
        yield server


@pytest.fixture(scope="module")
async def flash_llama_fp8_kv_cache(flash_llama_fp8_kv_cache_handle):
    """Return the handle's client after awaiting its health check.

    ``health(300)`` is awaited before the client is handed out.
    NOTE(review): 300 is presumably a timeout in seconds -- confirm against
    the handle implementation.
    """
    handle = flash_llama_fp8_kv_cache_handle
    await handle.health(300)
    return handle.client


@pytest.mark.release
@pytest.mark.asyncio
@pytest.mark.private
async def test_flash_llama_fp8_kv_cache(flash_llama_fp8_kv_cache, response_snapshot):
    """Request ten new tokens for a fixed prompt and check the result.

    Checks, in order: the generated text, the reported number of generated
    tokens, and equality of the whole response with the stored snapshot.
    """
    expected_text = " Deep learning is a subset of machine learning that involves"

    result = await flash_llama_fp8_kv_cache.generate(
        "What is deep learning?",
        max_new_tokens=10,
        decoder_input_details=True,
    )

    assert result.generated_text == expected_text
    assert result.details.generated_tokens == 10
    assert result == response_snapshot


@pytest.mark.release
@pytest.mark.asyncio
@pytest.mark.private
async def test_flash_llama_fp8_kv_cache_all_params(
    flash_llama_fp8_kv_cache, response_snapshot
):
    """Call ``generate`` with every sampling option set and compare to the snapshot."""
    # A fixed seed is passed along with the sampling options; the only check
    # is equality with the stored snapshot.
    generation_options = {
        "max_new_tokens": 10,
        "repetition_penalty": 1.2,
        "return_full_text": True,
        "stop_sequences": ["test"],
        "temperature": 0.5,
        "top_p": 0.9,
        "top_k": 10,
        "truncate": 5,
        "typical_p": 0.9,
        "watermark": True,
        "decoder_input_details": True,
        "seed": 0,
    }

    result = await flash_llama_fp8_kv_cache.generate(
        "What is deep learning?", **generation_options
    )

    assert result == response_snapshot


@pytest.mark.release
@pytest.mark.asyncio
@pytest.mark.private
async def test_flash_llama_fp8_kv_cache_load(
    flash_llama_fp8_kv_cache, generate_load, response_snapshot
):
    """Issue the same prompt four times via ``generate_load`` and compare results.

    Checks, in order: four responses came back, the first one has the
    expected text, every response has the same text as the first, and the
    full list equals the stored snapshot.
    """
    expected_text = " Deep learning is a subset of machine learning that involves"

    responses = await generate_load(
        flash_llama_fp8_kv_cache,
        "What is deep learning?",
        max_new_tokens=10,
        n=4,
    )

    assert len(responses) == 4

    reference_text = responses[0].generated_text
    assert reference_text == expected_text
    assert all(
        r.generated_text == reference_text for r in responses
    ), f"Different messages : {[r.generated_text for r in responses]}"
    assert responses == response_snapshot
Add basic FP8 KV cache support (#2603) * Add basic FP8 KV cache support This change adds rudimentary FP8 KV cache support. The support is enabled by passing `--kv-cache-dtype fp8_e5m2` to the launcher. Doing so uses this type for the KV cache. However support is still limited: * Only the `fp8_e5m2` type is supported. * The KV cache layout is the same as `float16`/`bfloat16` (HND). * The FP8 KV cache is only supported for FlashInfer. * Loading of scales is not yet supported. * Fix Cargo.toml 2024-10-04 15:51:48 +00:00			`import pytest`


			`@pytest.fixture(scope="module")`
			`def flash_llama_fp8_kv_cache_handle(launcher):`
			`with launcher(`
Add support for FP8 KV cache scales (#2628) * Add support for FP8 KV cache scales Since FP8 only has limited dynamic range, we can scale keys/values before storing them into the cache (and unscale them in attention). To avoid rescaling the cache as the absmax values change, good scales are usually determined per layer using calibration data and stored in the checkpoint. This change adds support for using key-value scales and loading them from checkpoints in the two most common formats: - Separate per-layer `k_scale` and `v_scale` scalars. - Per-layer `kv_scale` scalar (older format). Currently, scales are only used with a `float8_e4m3fn` cache. Besides adding support for key/value scales, the `fp8_quantize` function is also extended to support quantization with a kernel vendored from vLLM. This is slightly faster than the PyTorch implementation, but also scales in FP32, potentially improving accuracy. * Update FP8 KV cache test to use checkpoint with scales * `can_scale`: check that the attention is flashinfer 2024-10-24 14:36:18 +00:00			`"neuralmagic/Meta-Llama-3-8B-Instruct-FP8-KV",`
			`num_shard=2,`
			`kv_cache_dtype="fp8_e4m3fn",`
Add basic FP8 KV cache support (#2603) * Add basic FP8 KV cache support This change adds rudimentary FP8 KV cache support. The support is enabled by passing `--kv-cache-dtype fp8_e5m2` to the launcher. Doing so uses this type for the KV cache. However support is still limited: * Only the `fp8_e5m2` type is supported. * The KV cache layout is the same as `float16`/`bfloat16` (HND). * The FP8 KV cache is only supported for FlashInfer. * Loading of scales is not yet supported. * Fix Cargo.toml 2024-10-04 15:51:48 +00:00			`) as handle:`
			`yield handle`


			`@pytest.fixture(scope="module")`
			`async def flash_llama_fp8_kv_cache(flash_llama_fp8_kv_cache_handle):`
			`await flash_llama_fp8_kv_cache_handle.health(300)`
			`return flash_llama_fp8_kv_cache_handle.client`


			`@pytest.mark.release`
			`@pytest.mark.asyncio`
			`@pytest.mark.private`
			`async def test_flash_llama_fp8_kv_cache(flash_llama_fp8_kv_cache, response_snapshot):`
			`response = await flash_llama_fp8_kv_cache.generate(`
			`"What is deep learning?", max_new_tokens=10, decoder_input_details=True`
			`)`

			`assert (`
			`response.generated_text`
Add support for FP8 KV cache scales (#2628) * Add support for FP8 KV cache scales Since FP8 only has limited dynamic range, we can scale keys/values before storing them into the cache (and unscale them in attention). To avoid rescaling the cache as the absmax values change, good scales are usually determined per layer using calibration data and stored in the checkpoint. This change adds support for using key-value scales and loading them from checkpoints in the two most common formats: - Separate per-layer `k_scale` and `v_scale` scalars. - Per-layer `kv_scale` scalar (older format). Currently, scales are only used with a `float8_e4m3fn` cache. Besides adding support for key/value scales, the `fp8_quantize` function is also extended to support quantization with a kernel vendored from vLLM. This is slightly faster than the PyTorch implementation, but also scales in FP32, potentially improving accuracy. * Update FP8 KV cache test to use checkpoint with scales * `can_scale`: check that the attention is flashinfer 2024-10-24 14:36:18 +00:00			`== " Deep learning is a subset of machine learning that involves"`
Add basic FP8 KV cache support (#2603) * Add basic FP8 KV cache support This change adds rudimentary FP8 KV cache support. The support is enabled by passing `--kv-cache-dtype fp8_e5m2` to the launcher. Doing so uses this type for the KV cache. However support is still limited: * Only the `fp8_e5m2` type is supported. * The KV cache layout is the same as `float16`/`bfloat16` (HND). * The FP8 KV cache is only supported for FlashInfer. * Loading of scales is not yet supported. * Fix Cargo.toml 2024-10-04 15:51:48 +00:00			`)`
			`assert response.details.generated_tokens == 10`
			`assert response == response_snapshot`


			`@pytest.mark.release`
			`@pytest.mark.asyncio`
			`@pytest.mark.private`
			`async def test_flash_llama_fp8_kv_cache_all_params(`
			`flash_llama_fp8_kv_cache, response_snapshot`
			`):`
			`response = await flash_llama_fp8_kv_cache.generate(`
			`"What is deep learning?",`
			`max_new_tokens=10,`
			`repetition_penalty=1.2,`
			`return_full_text=True,`
			`stop_sequences=["test"],`
			`temperature=0.5,`
			`top_p=0.9,`
			`top_k=10,`
			`truncate=5,`
			`typical_p=0.9,`
			`watermark=True,`
			`decoder_input_details=True,`
			`seed=0,`
			`)`

			`assert response == response_snapshot`


			`@pytest.mark.release`
			`@pytest.mark.asyncio`
			`@pytest.mark.private`
			`async def test_flash_llama_fp8_kv_cache_load(`
			`flash_llama_fp8_kv_cache, generate_load, response_snapshot`
			`):`
			`responses = await generate_load(`
			`flash_llama_fp8_kv_cache, "What is deep learning?", max_new_tokens=10, n=4`
			`)`

			`assert len(responses) == 4`
			`assert (`
			`responses[0].generated_text`
Add support for FP8 KV cache scales (#2628) * Add support for FP8 KV cache scales Since FP8 only has limited dynamic range, we can scale keys/values before storing them into the cache (and unscale them in attention). To avoid rescaling the cache as the absmax values change, good scales are usually determined per layer using calibration data and stored in the checkpoint. This change adds support for using key-value scales and loading them from checkpoints in the two most common formats: - Separate per-layer `k_scale` and `v_scale` scalars. - Per-layer `kv_scale` scalar (older format). Currently, scales are only used with a `float8_e4m3fn` cache. Besides adding support for key/value scales, the `fp8_quantize` function is also extended to support quantization with a kernel vendored from vLLM. This is slightly faster than the PyTorch implementation, but also scales in FP32, potentially improving accuracy. * Update FP8 KV cache test to use checkpoint with scales * `can_scale`: check that the attention is flashinfer 2024-10-24 14:36:18 +00:00			`== " Deep learning is a subset of machine learning that involves"`
Add basic FP8 KV cache support (#2603) * Add basic FP8 KV cache support This change adds rudimentary FP8 KV cache support. The support is enabled by passing `--kv-cache-dtype fp8_e5m2` to the launcher. Doing so uses this type for the KV cache. However support is still limited: * Only the `fp8_e5m2` type is supported. * The KV cache layout is the same as `float16`/`bfloat16` (HND). * The FP8 KV cache is only supported for FlashInfer. * Loading of scales is not yet supported. * Fix Cargo.toml 2024-10-04 15:51:48 +00:00			`)`
			`assert all(`
			`[r.generated_text == responses[0].generated_text for r in responses]`
			`), f"Different messages : {[r.generated_text for r in responses]}"`
			`assert responses == response_snapshot`