text-generation-inference/server/text_generation_server/layers/moe/__init__.py

from typing import Optional, Protocol, runtime_checkable

import torch
import torch.nn as nn
from loguru import logger
from transformers.activations import ACT2FN

from text_generation_server.layers import (
    TensorParallelColumnLinear,
    TensorParallelRowLinear,
)
from text_generation_server.layers.fp8 import HybridFP8UnquantLoader
from text_generation_server.layers.marlin import GPTQMarlinWeightsLoader
from text_generation_server.layers.moe.gptq_marlin import (
    GPTQMarlinSparseMoELayer,
    can_use_marlin_moe_gemm,
)
from text_generation_server.layers.moe.unquantized import UnquantizedSparseMoELayer
from text_generation_server.layers.moe.fp8 import FP8SparseMoELayer
from text_generation_server.utils.import_utils import SYSTEM
from text_generation_server.utils.log import log_once
from text_generation_server.utils.weights import (
    DefaultWeightsLoader,
    Weights,
    UnquantizedWeight,
)

if SYSTEM == "ipex":
    from .fused_moe_ipex import fused_topk, grouped_topk
else:
    from moe_kernels.fused_moe import fused_topk, grouped_topk


# NOTE: we are using a protocol here, because multiple inherance is not nice.
#       We need `Module`, and `Module` -> some abstract class -> some concrete
#       class inheritance is whacky.


@runtime_checkable
class MoELayer(Protocol):
    def __init__(
        self,
        *,
        n_expert_group: Optional[int],
        n_experts: int,
        prefix: str,
        renormalize: bool,
        topk: int,
        topk_group: Optional[int],
        weights: Weights,
        gate_proj_name: str = "gate_proj",
        up_proj_name: str = "up_proj",
        down_proj_name: str = "down_proj",
        hidden_act: str = "silu",
        scoring_func: Optional[str] = None,
        e_score_correction_bias: Optional[float] = None,
    ): ...

    def forward(
        self, x: torch.Tensor, *, gating_output: torch.Tensor
    ) -> torch.Tensor: ...


class DenseMoELayer(nn.Module):
    """
    Layer for MoE that applies *all* experts to each tokens and then weights
    their outputs based on the calculated routing. This layer is much slower
    than `SparseMoELayer` and should only be used when no fused kernels are
    available (e.g. for unsupported quantizers).
    """

    def __init__(
        self,
        *,
        n_expert_group: Optional[int],
        n_experts: int,
        prefix: str,
        renormalize: bool,
        topk: int,
        topk_group: Optional[int],
        weights: Weights,
        gate_proj_name: str = "gate_proj",
        up_proj_name: str = "up_proj",
        down_proj_name: str = "down_proj",
        hidden_act: str = "silu",
        scoring_func: Optional[str] = None,
        e_score_correction_bias: Optional[float] = None,
    ):
        super().__init__()

        assert scoring_func is None, "scoring func is not handled"
        assert e_score_correction_bias is None, "scoring correction bias is not handled"

        log_once(
            logger.info,
            "No fused layers are available for this model type, using (slower) dense MoE layer",
        )

        assert (n_expert_group is None) == (
            topk_group is None
        ), "n_expert_group and topk_group must both be None or have some value"

        self.n_expert_group = n_expert_group
        self.n_experts = n_experts
        self.renormalize = renormalize
        self.topk = topk
        self.topk_group = topk_group

        if "gelu" in hidden_act:
            self.act = lambda x: torch.nn.functional.gelu(
                x,
                approximate=(
                    "tanh"
                    if hidden_act in ["gelu_fast", "gelu_pytorch_tanh"]
                    else "none"
                ),
            )
        elif "silu" in hidden_act:
            self.act = torch.nn.functional.silu
        else:
            self.act = ACT2FN[hidden_act]

        self.gate_proj = [
            TensorParallelColumnLinear.load(
                None,
                prefix=f"{prefix}.{i}.{gate_proj_name}",
                weights=weights,
                bias=False,
            )
            for i in range(self.n_experts)
        ]
        self.up_proj = [
            TensorParallelColumnLinear.load(
                None,
                prefix=f"{prefix}.{i}.{up_proj_name}",
                weights=weights,
                bias=False,
            )
            for i in range(self.n_experts)
        ]
        self.down_proj = [
            TensorParallelRowLinear.load(
                None,
                prefix=f"{prefix}.{i}.{down_proj_name}",
                weights=weights,
                bias=False,
            )
            for i in range(self.n_experts)
        ]

        self.process_group = weights.process_group

    def forward(self, x: torch.Tensor, *, gating_output: torch.Tensor) -> torch.Tensor:
        """
        x: (sequence_length, model_dim)
        gating_output: (sequence_length, n_experts)
        """
        # optional reshape
        input_shape = x.shape
        x = x.view(-1, input_shape[-1])

        if self.n_expert_group is not None and self.topk_group is not None:
            topk_weights, topk_ids = grouped_topk(
                x,
                gating_output,
                self.topk,
                renormalize=self.renormalize,
                num_expert_group=self.n_expert_group,
                topk_group=self.topk_group,
            )
        else:
            topk_weights, topk_ids = fused_topk(
                x, gating_output, self.topk, self.renormalize
            )
            topk_weights = topk_weights.to(x.dtype)

        weights = torch.zeros(
            topk_ids.shape[0], self.n_experts, dtype=x.dtype, device=x.device
        )

        weights.scatter_(1, topk_ids.long(), topk_weights.to(weights.dtype))

        out = torch.zeros_like(x)
        for i in range(self.n_experts):
            h = self.act(self.gate_proj[i](x)) * self.up_proj[i](x)
            h = self.down_proj[i](h, reduce=False)
            out += h * weights[:, i].view(-1, 1)

        return out


class SparseMoELayer(nn.Module):
    """
    Layer for MoE that uses fused kernels to only apply the active experts
    for each token (rather than applying all experts and selecting the
    outputs of active experts).
    """

    def __init__(
        self,
        *,
        n_expert_group: Optional[int],
        n_experts: int,
        prefix: str,
        renormalize: bool,
        topk: int,
        topk_group: Optional[int],
        weights: Weights,
        scoring_func: Optional[str] = "softmax",
        e_score_correction_bias: Optional[float] = None,
        gate_proj_name: str = "gate_proj",
        up_proj_name: str = "up_proj",
        down_proj_name: str = "down_proj",
    ):
        super().__init__()
        if (
            isinstance(weights.loader, DefaultWeightsLoader)
            and isinstance(weights.loader.weight_class, UnquantizedWeight)
        ) or isinstance(weights.loader, HybridFP8UnquantLoader):
            if (
                isinstance(weights.loader, HybridFP8UnquantLoader)
                and weights.loader.to_fp8
            ):
                cls = FP8SparseMoELayer
            else:
                cls = UnquantizedSparseMoELayer
        elif isinstance(
            weights.loader, GPTQMarlinWeightsLoader
        ) and can_use_marlin_moe_gemm(
            quant_method=weights.loader.quant_method,
            quantize=weights.loader.quantize,
            sym=weights.loader.sym,
        ):
            cls = GPTQMarlinSparseMoELayer
        else:
            raise ValueError(
                f"Unsupported weights loader: {type(weights.loader)}, sparse MoE is only supported for unquantized, AWQ, and GPTQ weights"
            )

        log_once(
            logger.info,
            "Using MoE layer wih fused gemm",
        )

        self.moe = cls(
            n_expert_group=n_expert_group,
            n_experts=n_experts,
            prefix=prefix,
            renormalize=renormalize,
            topk=topk,
            topk_group=topk_group,
            weights=weights,
            scoring_func=scoring_func,
            e_score_correction_bias=e_score_correction_bias,
            gate_proj_name=gate_proj_name,
            up_proj_name=up_proj_name,
            down_proj_name=down_proj_name,
        )

    def forward(self, x: torch.Tensor, *, gating_output: torch.Tensor) -> torch.Tensor:
        return self.moe(x, gating_output=gating_output)

    @staticmethod
    def is_supported(weights: Weights) -> bool:
        return (
            (
                isinstance(weights.loader, DefaultWeightsLoader)
                and isinstance(weights.loader.weight_class, UnquantizedWeight)
            )
            or isinstance(weights.loader, HybridFP8UnquantLoader)
            or (
                isinstance(weights.loader, GPTQMarlinWeightsLoader)
                and can_use_marlin_moe_gemm(
                    quant_method=weights.loader.quant_method,
                    quantize=weights.loader.quantize,
                    sym=weights.loader.sym,
                )
            )
        )
Add `DenseMoELayer` and wire it up in Mixtral/Deepseek V2 (#2537) This replaces the custom layers in both models. 2024-09-24 12:27:06 +00:00			`from typing import Optional, Protocol, runtime_checkable`
Move to moe-kernels package and switch to common MoE layer (#2511) * Move to moe-kernels package and switch to common MoE layer This change introduces the new `moe-kernels` package: - Add `moe-kernels` as a dependency. - Introduce a `SparseMoELayer` module that can be used by MoE models. - Port over Mixtral and Deepseek. * Make `cargo check` pass * Update runner 2024-09-17 16:08:58 +00:00
			`import torch`
			`import torch.nn as nn`
Add `DenseMoELayer` and wire it up in Mixtral/Deepseek V2 (#2537) This replaces the custom layers in both models. 2024-09-24 12:27:06 +00:00			`from loguru import logger`
			`from transformers.activations import ACT2FN`

			`from text_generation_server.layers import (`
			`TensorParallelColumnLinear,`
			`TensorParallelRowLinear,`
			`)`
Move to moe-kernels package and switch to common MoE layer (#2511) * Move to moe-kernels package and switch to common MoE layer This change introduces the new `moe-kernels` package: - Add `moe-kernels` as a dependency. - Introduce a `SparseMoELayer` module that can be used by MoE models. - Port over Mixtral and Deepseek. * Make `cargo check` pass * Update runner 2024-09-17 16:08:58 +00:00			`from text_generation_server.layers.fp8 import HybridFP8UnquantLoader`
Add support for GPTQ-quantized MoE models using MoE Marlin (#2557) This change add support for MoE models that use GPTQ quantization. Currently only models with the following properties are supported: - No `desc_act` with tensor parallelism, unless `group_size=-1`. - No asymmetric quantization. - No AWQ. 2024-09-30 09:14:32 +00:00			`from text_generation_server.layers.marlin import GPTQMarlinWeightsLoader`
			`from text_generation_server.layers.moe.gptq_marlin import (`
			`GPTQMarlinSparseMoELayer,`
			`can_use_marlin_moe_gemm,`
			`)`
Move to moe-kernels package and switch to common MoE layer (#2511) * Move to moe-kernels package and switch to common MoE layer This change introduces the new `moe-kernels` package: - Add `moe-kernels` as a dependency. - Introduce a `SparseMoELayer` module that can be used by MoE models. - Port over Mixtral and Deepseek. * Make `cargo check` pass * Update runner 2024-09-17 16:08:58 +00:00			`from text_generation_server.layers.moe.unquantized import UnquantizedSparseMoELayer`
Add fp8 support moe models (#2928) * Add fp8 support moe models * flatten condition 2025-01-29 12:56:32 +00:00			`from text_generation_server.layers.moe.fp8 import FP8SparseMoELayer`
Add `DenseMoELayer` and wire it up in Mixtral/Deepseek V2 (#2537) This replaces the custom layers in both models. 2024-09-24 12:27:06 +00:00			`from text_generation_server.utils.import_utils import SYSTEM`
			`from text_generation_server.utils.log import log_once`
Move to moe-kernels package and switch to common MoE layer (#2511) * Move to moe-kernels package and switch to common MoE layer This change introduces the new `moe-kernels` package: - Add `moe-kernels` as a dependency. - Introduce a `SparseMoELayer` module that can be used by MoE models. - Port over Mixtral and Deepseek. * Make `cargo check` pass * Update runner 2024-09-17 16:08:58 +00:00			`from text_generation_server.utils.weights import (`
			`DefaultWeightsLoader,`
			`Weights,`
Add support for GPTQ-quantized MoE models using MoE Marlin (#2557) This change add support for MoE models that use GPTQ quantization. Currently only models with the following properties are supported: - No `desc_act` with tensor parallelism, unless `group_size=-1`. - No asymmetric quantization. - No AWQ. 2024-09-30 09:14:32 +00:00			`UnquantizedWeight,`
Move to moe-kernels package and switch to common MoE layer (#2511) * Move to moe-kernels package and switch to common MoE layer This change introduces the new `moe-kernels` package: - Add `moe-kernels` as a dependency. - Introduce a `SparseMoELayer` module that can be used by MoE models. - Port over Mixtral and Deepseek. * Make `cargo check` pass * Update runner 2024-09-17 16:08:58 +00:00			`)`

Update vllm kernels for ROCM (#2826) * (vllm) updated vllm rocm kernels * revert silu * update partition size * remove grouped_topk * (nit) remove log * update moe-kernels commit 2024-12-18 11:44:42 +00:00			`if SYSTEM == "ipex":`
fix moe in quantization path (#2935) update ipex xpu to support moe for mixtral Signed-off-by: Wang, Yi A <yi.a.wang@intel.com> 2025-01-22 13:36:15 +00:00			`from .fused_moe_ipex import fused_topk, grouped_topk`
add ipex moe implementation to support Mixtral and PhiMoe (#2707) * add ipex moe implementation to support Mixtral and PhiMoe Signed-off-by: Wang, Yi A <yi.a.wang@intel.com> * update to ipex xpu 2.5 Signed-off-by: Wang, Yi A <yi.a.wang@intel.com> * torch has xpu support in 2.5 Signed-off-by: Wang, Yi A <yi.a.wang@intel.com> * fix oneapi basekit version Signed-off-by: Wang, Yi A <yi.a.wang@intel.com> * Apply suggestions from code review Co-authored-by: Daniël de Kok <me@github.danieldk.eu> --------- Signed-off-by: Wang, Yi A <yi.a.wang@intel.com> Co-authored-by: Daniël de Kok <me@github.danieldk.eu> 2024-11-18 16:16:55 +00:00			`else:`
Add `DenseMoELayer` and wire it up in Mixtral/Deepseek V2 (#2537) This replaces the custom layers in both models. 2024-09-24 12:27:06 +00:00			`from moe_kernels.fused_moe import fused_topk, grouped_topk`


			`# NOTE: we are using a protocol here, because multiple inherance is not nice.`
			# We need `Module`, and `Module` -> some abstract class -> some concrete
			`# class inheritance is whacky.`


			`@runtime_checkable`
			`class MoELayer(Protocol):`
			`def __init__(`
			`self,`
			`*,`
			`n_expert_group: Optional[int],`
			`n_experts: int,`
			`prefix: str,`
			`renormalize: bool,`
			`topk: int,`
			`topk_group: Optional[int],`
			`weights: Weights,`
			`gate_proj_name: str = "gate_proj",`
			`up_proj_name: str = "up_proj",`
			`down_proj_name: str = "down_proj",`
			`hidden_act: str = "silu",`
Add deepseekv3 (#2968) * Add fp8 support moe models add deepseekv3 format codfe' update dockerfile update doc * Small modifications. * Moe kernels 0.8.1 * Upgrade to 0.8.1 * Fixing moe import. * Black. * Apply suggestions from code review Co-authored-by: Mohit Sharma <mohit21sharma.ms@gmail.com> * Fixing Mixtral + Nits. * Put link to ref. * Fix other call locations. * Scoring func `softmax` is the only one that works. --------- Co-authored-by: Mohit Sharma <mohit21sharma.ms@gmail.com> 2025-01-30 15:40:25 +00:00			`scoring_func: Optional[str] = None,`
			`e_score_correction_bias: Optional[float] = None,`
Add `DenseMoELayer` and wire it up in Mixtral/Deepseek V2 (#2537) This replaces the custom layers in both models. 2024-09-24 12:27:06 +00:00			`): ...`

			`def forward(`
			`self, x: torch.Tensor, *, gating_output: torch.Tensor`
			`) -> torch.Tensor: ...`


			`class DenseMoELayer(nn.Module):`
			`"""`
			`Layer for MoE that applies all experts to each tokens and then weights`
			`their outputs based on the calculated routing. This layer is much slower`
			than `SparseMoELayer` and should only be used when no fused kernels are
			`available (e.g. for unsupported quantizers).`
			`"""`

			`def __init__(`
			`self,`
			`*,`
			`n_expert_group: Optional[int],`
			`n_experts: int,`
			`prefix: str,`
			`renormalize: bool,`
			`topk: int,`
			`topk_group: Optional[int],`
			`weights: Weights,`
			`gate_proj_name: str = "gate_proj",`
			`up_proj_name: str = "up_proj",`
			`down_proj_name: str = "down_proj",`
			`hidden_act: str = "silu",`
Add deepseekv3 (#2968) * Add fp8 support moe models add deepseekv3 format codfe' update dockerfile update doc * Small modifications. * Moe kernels 0.8.1 * Upgrade to 0.8.1 * Fixing moe import. * Black. * Apply suggestions from code review Co-authored-by: Mohit Sharma <mohit21sharma.ms@gmail.com> * Fixing Mixtral + Nits. * Put link to ref. * Fix other call locations. * Scoring func `softmax` is the only one that works. --------- Co-authored-by: Mohit Sharma <mohit21sharma.ms@gmail.com> 2025-01-30 15:40:25 +00:00			`scoring_func: Optional[str] = None,`
			`e_score_correction_bias: Optional[float] = None,`
Add `DenseMoELayer` and wire it up in Mixtral/Deepseek V2 (#2537) This replaces the custom layers in both models. 2024-09-24 12:27:06 +00:00			`):`
			`super().__init__()`

Add deepseekv3 (#2968) * Add fp8 support moe models add deepseekv3 format codfe' update dockerfile update doc * Small modifications. * Moe kernels 0.8.1 * Upgrade to 0.8.1 * Fixing moe import. * Black. * Apply suggestions from code review Co-authored-by: Mohit Sharma <mohit21sharma.ms@gmail.com> * Fixing Mixtral + Nits. * Put link to ref. * Fix other call locations. * Scoring func `softmax` is the only one that works. --------- Co-authored-by: Mohit Sharma <mohit21sharma.ms@gmail.com> 2025-01-30 15:40:25 +00:00			`assert scoring_func is None, "scoring func is not handled"`
			`assert e_score_correction_bias is None, "scoring correction bias is not handled"`

Add `DenseMoELayer` and wire it up in Mixtral/Deepseek V2 (#2537) This replaces the custom layers in both models. 2024-09-24 12:27:06 +00:00			`log_once(`
			`logger.info,`
			`"No fused layers are available for this model type, using (slower) dense MoE layer",`
			`)`

			`assert (n_expert_group is None) == (`
			`topk_group is None`
			`), "n_expert_group and topk_group must both be None or have some value"`

			`self.n_expert_group = n_expert_group`
			`self.n_experts = n_experts`
			`self.renormalize = renormalize`
			`self.topk = topk`
			`self.topk_group = topk_group`

			`if "gelu" in hidden_act:`
			`self.act = lambda x: torch.nn.functional.gelu(`
			`x,`
			`approximate=(`
			`"tanh"`
			`if hidden_act in ["gelu_fast", "gelu_pytorch_tanh"]`
			`else "none"`
			`),`
			`)`
			`elif "silu" in hidden_act:`
			`self.act = torch.nn.functional.silu`
			`else:`
			`self.act = ACT2FN[hidden_act]`

			`self.gate_proj = [`
			`TensorParallelColumnLinear.load(`
			`None,`
			`prefix=f"{prefix}.{i}.{gate_proj_name}",`
			`weights=weights,`
			`bias=False,`
			`)`
			`for i in range(self.n_experts)`
			`]`
			`self.up_proj = [`
			`TensorParallelColumnLinear.load(`
			`None,`
			`prefix=f"{prefix}.{i}.{up_proj_name}",`
			`weights=weights,`
			`bias=False,`
			`)`
			`for i in range(self.n_experts)`
			`]`
			`self.down_proj = [`
			`TensorParallelRowLinear.load(`
			`None,`
			`prefix=f"{prefix}.{i}.{down_proj_name}",`
			`weights=weights,`
			`bias=False,`
			`)`
			`for i in range(self.n_experts)`
			`]`

			`self.process_group = weights.process_group`

			`def forward(self, x: torch.Tensor, *, gating_output: torch.Tensor) -> torch.Tensor:`
			`"""`
			`x: (sequence_length, model_dim)`
			`gating_output: (sequence_length, n_experts)`
			`"""`
			`# optional reshape`
			`input_shape = x.shape`
			`x = x.view(-1, input_shape[-1])`

			`if self.n_expert_group is not None and self.topk_group is not None:`
			`topk_weights, topk_ids = grouped_topk(`
			`x,`
			`gating_output,`
			`self.topk,`
			`renormalize=self.renormalize,`
			`num_expert_group=self.n_expert_group,`
			`topk_group=self.topk_group,`
			`)`
			`else:`
			`topk_weights, topk_ids = fused_topk(`
			`x, gating_output, self.topk, self.renormalize`
			`)`
			`topk_weights = topk_weights.to(x.dtype)`

			`weights = torch.zeros(`
			`topk_ids.shape[0], self.n_experts, dtype=x.dtype, device=x.device`
			`)`

			`weights.scatter_(1, topk_ids.long(), topk_weights.to(weights.dtype))`

			`out = torch.zeros_like(x)`
			`for i in range(self.n_experts):`
			`h = self.act(self.gate_proj[i](x)) * self.up_proj[i](x)`
			`h = self.down_proj[i](h, reduce=False)`
			`out += h * weights[:, i].view(-1, 1)`

			`return out`

Move to moe-kernels package and switch to common MoE layer (#2511) * Move to moe-kernels package and switch to common MoE layer This change introduces the new `moe-kernels` package: - Add `moe-kernels` as a dependency. - Introduce a `SparseMoELayer` module that can be used by MoE models. - Port over Mixtral and Deepseek. * Make `cargo check` pass * Update runner 2024-09-17 16:08:58 +00:00
			`class SparseMoELayer(nn.Module):`
			`"""`
			`Layer for MoE that uses fused kernels to only apply the active experts`
			`for each token (rather than applying all experts and selecting the`
			`outputs of active experts).`
			`"""`

			`def __init__(`
			`self,`
			`*,`
			`n_expert_group: Optional[int],`
			`n_experts: int,`
			`prefix: str,`
			`renormalize: bool,`
			`topk: int,`
			`topk_group: Optional[int],`
			`weights: Weights,`
Add deepseekv3 (#2968) * Add fp8 support moe models add deepseekv3 format codfe' update dockerfile update doc * Small modifications. * Moe kernels 0.8.1 * Upgrade to 0.8.1 * Fixing moe import. * Black. * Apply suggestions from code review Co-authored-by: Mohit Sharma <mohit21sharma.ms@gmail.com> * Fixing Mixtral + Nits. * Put link to ref. * Fix other call locations. * Scoring func `softmax` is the only one that works. --------- Co-authored-by: Mohit Sharma <mohit21sharma.ms@gmail.com> 2025-01-30 15:40:25 +00:00			`scoring_func: Optional[str] = "softmax",`
			`e_score_correction_bias: Optional[float] = None,`
Move to moe-kernels package and switch to common MoE layer (#2511) * Move to moe-kernels package and switch to common MoE layer This change introduces the new `moe-kernels` package: - Add `moe-kernels` as a dependency. - Introduce a `SparseMoELayer` module that can be used by MoE models. - Port over Mixtral and Deepseek. * Make `cargo check` pass * Update runner 2024-09-17 16:08:58 +00:00			`gate_proj_name: str = "gate_proj",`
			`up_proj_name: str = "up_proj",`
			`down_proj_name: str = "down_proj",`
			`):`
			`super().__init__()`
Add deepseekv3 (#2968) * Add fp8 support moe models add deepseekv3 format codfe' update dockerfile update doc * Small modifications. * Moe kernels 0.8.1 * Upgrade to 0.8.1 * Fixing moe import. * Black. * Apply suggestions from code review Co-authored-by: Mohit Sharma <mohit21sharma.ms@gmail.com> * Fixing Mixtral + Nits. * Put link to ref. * Fix other call locations. * Scoring func `softmax` is the only one that works. --------- Co-authored-by: Mohit Sharma <mohit21sharma.ms@gmail.com> 2025-01-30 15:40:25 +00:00			`if (`
			`isinstance(weights.loader, DefaultWeightsLoader)`
			`and isinstance(weights.loader.weight_class, UnquantizedWeight)`
			`) or isinstance(weights.loader, HybridFP8UnquantLoader):`
			`if (`
			`isinstance(weights.loader, HybridFP8UnquantLoader)`
			`and weights.loader.to_fp8`
			`):`
			`cls = FP8SparseMoELayer`
			`else:`
			`cls = UnquantizedSparseMoELayer`
Add support for fused MoE Marlin for AWQ (#2616) * Add support for fused MoE Marlin for AWQ This uses the updated MoE Marlin kernels from vLLM. * Add integration test for AWQ MoE 2024-10-08 09:56:41 +00:00			`elif isinstance(`
			`weights.loader, GPTQMarlinWeightsLoader`
			`) and can_use_marlin_moe_gemm(`
			`quant_method=weights.loader.quant_method,`
			`quantize=weights.loader.quantize,`
			`sym=weights.loader.sym,`
			`):`
Add support for GPTQ-quantized MoE models using MoE Marlin (#2557) This change add support for MoE models that use GPTQ quantization. Currently only models with the following properties are supported: - No `desc_act` with tensor parallelism, unless `group_size=-1`. - No asymmetric quantization. - No AWQ. 2024-09-30 09:14:32 +00:00			`cls = GPTQMarlinSparseMoELayer`
Move to moe-kernels package and switch to common MoE layer (#2511) * Move to moe-kernels package and switch to common MoE layer This change introduces the new `moe-kernels` package: - Add `moe-kernels` as a dependency. - Introduce a `SparseMoELayer` module that can be used by MoE models. - Port over Mixtral and Deepseek. * Make `cargo check` pass * Update runner 2024-09-17 16:08:58 +00:00			`else:`
			`raise ValueError(`
Add support for fused MoE Marlin for AWQ (#2616) * Add support for fused MoE Marlin for AWQ This uses the updated MoE Marlin kernels from vLLM. * Add integration test for AWQ MoE 2024-10-08 09:56:41 +00:00			`f"Unsupported weights loader: {type(weights.loader)}, sparse MoE is only supported for unquantized, AWQ, and GPTQ weights"`
Move to moe-kernels package and switch to common MoE layer (#2511) * Move to moe-kernels package and switch to common MoE layer This change introduces the new `moe-kernels` package: - Add `moe-kernels` as a dependency. - Introduce a `SparseMoELayer` module that can be used by MoE models. - Port over Mixtral and Deepseek. * Make `cargo check` pass * Update runner 2024-09-17 16:08:58 +00:00			`)`

Add support for GPTQ-quantized MoE models using MoE Marlin (#2557) This change add support for MoE models that use GPTQ quantization. Currently only models with the following properties are supported: - No `desc_act` with tensor parallelism, unless `group_size=-1`. - No asymmetric quantization. - No AWQ. 2024-09-30 09:14:32 +00:00			`log_once(`
			`logger.info,`
			`"Using MoE layer wih fused gemm",`
			`)`

Move to moe-kernels package and switch to common MoE layer (#2511) * Move to moe-kernels package and switch to common MoE layer This change introduces the new `moe-kernels` package: - Add `moe-kernels` as a dependency. - Introduce a `SparseMoELayer` module that can be used by MoE models. - Port over Mixtral and Deepseek. * Make `cargo check` pass * Update runner 2024-09-17 16:08:58 +00:00			`self.moe = cls(`
			`n_expert_group=n_expert_group,`
			`n_experts=n_experts,`
			`prefix=prefix,`
			`renormalize=renormalize,`
			`topk=topk,`
			`topk_group=topk_group,`
			`weights=weights,`
Add deepseekv3 (#2968) * Add fp8 support moe models add deepseekv3 format codfe' update dockerfile update doc * Small modifications. * Moe kernels 0.8.1 * Upgrade to 0.8.1 * Fixing moe import. * Black. * Apply suggestions from code review Co-authored-by: Mohit Sharma <mohit21sharma.ms@gmail.com> * Fixing Mixtral + Nits. * Put link to ref. * Fix other call locations. * Scoring func `softmax` is the only one that works. --------- Co-authored-by: Mohit Sharma <mohit21sharma.ms@gmail.com> 2025-01-30 15:40:25 +00:00			`scoring_func=scoring_func,`
			`e_score_correction_bias=e_score_correction_bias,`
Move to moe-kernels package and switch to common MoE layer (#2511) * Move to moe-kernels package and switch to common MoE layer This change introduces the new `moe-kernels` package: - Add `moe-kernels` as a dependency. - Introduce a `SparseMoELayer` module that can be used by MoE models. - Port over Mixtral and Deepseek. * Make `cargo check` pass * Update runner 2024-09-17 16:08:58 +00:00			`gate_proj_name=gate_proj_name,`
			`up_proj_name=up_proj_name,`
			`down_proj_name=down_proj_name,`
			`)`

			`def forward(self, x: torch.Tensor, *, gating_output: torch.Tensor) -> torch.Tensor:`
			`return self.moe(x, gating_output=gating_output)`

			`@staticmethod`
			`def is_supported(weights: Weights) -> bool:`
			`return (`
			`(`
			`isinstance(weights.loader, DefaultWeightsLoader)`
			`and isinstance(weights.loader.weight_class, UnquantizedWeight)`
			`)`
			`or isinstance(weights.loader, HybridFP8UnquantLoader)`
Add support for GPTQ-quantized MoE models using MoE Marlin (#2557) This change add support for MoE models that use GPTQ quantization. Currently only models with the following properties are supported: - No `desc_act` with tensor parallelism, unless `group_size=-1`. - No asymmetric quantization. - No AWQ. 2024-09-30 09:14:32 +00:00			`or (`
			`isinstance(weights.loader, GPTQMarlinWeightsLoader)`
			`and can_use_marlin_moe_gemm(`
			`quant_method=weights.loader.quant_method,`
			`quantize=weights.loader.quantize,`
			`sym=weights.loader.sym,`
			`)`
			`)`
Move to moe-kernels package and switch to common MoE layer (#2511) * Move to moe-kernels package and switch to common MoE layer This change introduces the new `moe-kernels` package: - Add `moe-kernels` as a dependency. - Introduce a `SparseMoELayer` module that can be used by MoE models. - Port over Mixtral and Deepseek. * Make `cargo check` pass * Update runner 2024-09-17 16:08:58 +00:00			`)`