# text-generation-inference/server/text_generation_server/models/model.py

import inspect
import torch

from abc import ABC, abstractmethod
from typing import List, Tuple, Optional, TypeVar, Type, Dict, DefaultDict
from collections import defaultdict
from transformers import PreTrainedTokenizerBase, PretrainedConfig

from text_generation_server.models.types import Batch, Generation
from text_generation_server.utils.speculate import get_speculate
from text_generation_server.pb.generate_pb2 import InfoResponse
from text_generation_server.adapters.weights import LayerAdapterWeights
from text_generation_server.utils.adapter import (
    load_and_merge_adapters,
    AdapterParameters,
    AdapterSource,
)
from loguru import logger


# Adapter id that stands for the base model itself. Per the `Model.load_adapter`
# docstring, this is the id expected when an adapter is statically loaded
# (merged) into the model rather than applied during the forward pass.
BASE_MODEL_ADAPTER_ID = "__base_model__"


def get_start_stop_idxs_for_rank(offset, size, rank, world_size):
    """Return the ``(start, stop)`` index pair owned by ``rank``.

    ``size`` is floor-divided into ``world_size`` equally sized chunks; any
    remainder is not assigned to a rank.  The returned bounds are shifted by
    ``offset`` and ``stop`` is exclusive.
    """
    chunk = size // world_size
    bounds = (offset + rank * chunk, offset + (rank + 1) * chunk)
    return bounds


def shard_on_dim(
    t: torch.Tensor, dim: int, process_group: torch.distributed.ProcessGroup
):
    """Return the slice of ``t`` along ``dim`` that belongs to this rank.

    The rank and world size are read from ``process_group`` and the slice
    bounds come from ``get_start_stop_idxs_for_rank``.  Only ``dim`` 0 and 1
    are handled.

    Raises:
        NotImplementedError: If ``dim`` is neither 0 nor 1.
    """
    group_size = process_group.size()
    group_rank = process_group.rank()

    # The bounds are computed before the dim check, so an out-of-range ``dim``
    # still fails on the shape lookup first.
    lo, hi = get_start_stop_idxs_for_rank(0, t.shape[dim], group_rank, group_size)

    if dim == 0:
        return t[lo:hi]
    if dim == 1:
        return t[:, lo:hi]
    raise NotImplementedError("Let's make that generic when needed")


# Type variable bound to `Batch`; it links the batch type exposed by
# `Model.batch_type` with the batches accepted and returned by
# `Model.generate_token` / `Model.warmup`.
B = TypeVar("B", bound=Batch)


class Model(ABC):
    """Abstract base class for a served text-generation model.

    Wraps a ``torch.nn.Module`` together with its tokenizer and runtime
    settings (dtype, device, rank / world size, sliding window, speculation),
    provides incremental token decoding, and keeps the bookkeeping for
    loading and offloading adapters.  Subclasses must implement
    :attr:`batch_type` and :meth:`generate_token`.

    NOTE(review): the adapter helpers read ``self.tokenizers``,
    ``self.process_group``, ``self.dynamic_adapter_loading_enabled`` and
    ``self.static_adapter_id``.  None of them is assigned in ``__init__``
    here -- presumably subclasses provide them; confirm before relying on
    adapter loading with a new subclass.
    """

    def __init__(
        self,
        model_id: str,
        model: torch.nn.Module,
        tokenizer: PreTrainedTokenizerBase,
        requires_padding: bool,
        dtype: torch.dtype,
        device: torch.device,
        rank: int = 0,
        world_size: int = 1,
        sliding_window: Optional[int] = None,
        speculate: Optional[int] = None,
    ):
        """Store the model, tokenizer and runtime configuration.

        Args:
            model_id: Identifier of the model; forwarded to
                ``load_and_merge_adapters`` when an adapter is loaded.
            model: Module to serve; it is switched to eval mode.
            tokenizer: Tokenizer used by :meth:`decode_token`.
            requires_padding: Reported through :attr:`info`.
            dtype: Reported (as a string) through :attr:`info`.
            device: Its ``type`` is reported through :attr:`info`.
            rank: Stored as ``self.rank``.
            world_size: Stored as ``self.world_size``.
            sliding_window: Window size; ``-1`` is treated as "no window"
                and stored as ``None``.
            speculate: Speculation setting; when ``None`` the value returned
                by ``get_speculate()`` is used instead.

        Raises:
            RuntimeError: If a model parameter is still on the ``meta``
                device (see :meth:`check_initialized`).
        """
        self.model_id = model_id
        self.model = model.eval()
        self.tokenizer = tokenizer

        # all_special_ids is not set correctly if the rust tokenizer is unpacked
        # TODO report this to transformers.
        other_special_ids = {
            token_id
            for token_id, token in tokenizer.added_tokens_decoder.items()
            if token.special
        }
        self.all_special_ids = set(tokenizer.all_special_ids)
        self.all_special_ids.update(other_special_ids)
        self.requires_padding = requires_padding
        self.dtype = dtype
        self.device = device
        self.rank = rank
        self.world_size = world_size
        # -1 is accepted as an alias for "no sliding window".
        self.sliding_window = sliding_window if sliding_window != -1 else None

        # Per-layer adapter weights, created lazily on first access.
        self.layer_to_adapter_weights: DefaultDict[str, LayerAdapterWeights] = (
            defaultdict(LayerAdapterWeights)
        )
        self.target_to_layer = self.adapter_target_to_layer()
        # Indices of the adapters currently loaded.
        self.loaded_adapters = set()

        if speculate is None:
            speculate = get_speculate()
        self.speculate = speculate

        # Whether the wrapped module's forward() accepts ``position_ids``.
        self.has_position_ids = (
            "position_ids" in inspect.signature(model.forward).parameters
        )

        self.check_initialized()

    @property
    def info(self) -> InfoResponse:
        """Describe this model as an ``InfoResponse``.

        Raises:
            NotImplementedError: If the model both requires padding and has a
                sliding window configured.
        """
        if self.requires_padding and self.sliding_window is not None:
            raise NotImplementedError("sliding_window is not implemented with padding")

        return InfoResponse(
            requires_padding=self.requires_padding,
            dtype=str(self.dtype),
            device_type=self.device.type,
            window_size=self.sliding_window,
            speculate=self.speculate,
        )

    @property
    @abstractmethod
    def batch_type(self) -> Type[B]:
        """Batch class this model operates on; must be provided by subclasses."""
        raise NotImplementedError

    @abstractmethod
    def generate_token(
        self, batch: B
    ) -> Tuple[List[Generation], Optional[B], Tuple[int, int]]:
        """Run one generation step on ``batch``; must be provided by subclasses.

        Returns a tuple of the generations, an optional batch and a pair of
        integers (their meaning is defined by the implementations, which are
        not visible here).
        """
        raise NotImplementedError

    def warmup(self, batch: B) -> Optional[int]:
        """Run a single :meth:`generate_token` call on ``batch`` and return ``None``."""
        self.generate_token(batch)
        return None

    def decode_token(
        self,
        all_input_ids: List[int],
        prefix_offset: int = 0,
        read_offset: int = 0,
        skip_special_tokens: bool = False,
    ) -> Tuple[str, int, int]:
        """Hack to hopefully support generate_stream for the maximum number of tokenizers

        Decodes ``all_input_ids[prefix_offset:]`` and returns only the text
        that was not already covered by ``all_input_ids[prefix_offset:read_offset]``.

        Returns:
            ``(new_text, new_prefix_offset, new_read_offset)``.  When new text
            is emitted the offsets advance to ``(read_offset,
            len(all_input_ids))``.  Otherwise the text is ``""`` and the
            offsets passed in are returned unchanged so the caller can retry
            once more ids are available.
        """

        # The prefix text is necessary only to defeat cleanup algorithms in the decode
        # which decide to add a space or not depending on the surrounding ids.
        prefix_text = self.tokenizer.decode(
            all_input_ids[prefix_offset:read_offset],
            skip_special_tokens=skip_special_tokens,
        )
        new_text = self.tokenizer.decode(
            all_input_ids[prefix_offset:], skip_special_tokens=skip_special_tokens
        )

        # "\ufffd" is the Unicode replacement character.  It is written as an
        # escape so the check cannot be corrupted by a re-encoding of this file.
        if len(new_text) > len(prefix_text) and not new_text.endswith("\ufffd"):
            # utf-8 char at the end means it's a potential unfinished byte sequence
            # from byte fallback tokenization.
            # If it's in the middle, it's probably a real invalid id generated
            # by the model
            new_text = new_text[len(prefix_text) :]
            return new_text, read_offset, len(all_input_ids)
        else:
            return "", prefix_offset, read_offset

    def check_initialized(self):
        """Ensure no model parameter is left on the ``meta`` device.

        Raises:
            RuntimeError: Listing every parameter name still on ``meta``.
        """
        meta_device = torch.device("meta")
        uninitialized_parameters = [
            name
            for name, param in self.model.named_parameters()
            if param.data.device == meta_device
        ]
        if uninitialized_parameters:
            raise RuntimeError(
                f"found uninitialized parameters in model {self.__class__.__name__}: {uninitialized_parameters}"
            )

    @property
    def supports_adapter_loading(self) -> bool:
        """Whether adapters can be loaded; ``False`` unless overridden."""
        return False

    def adapter_target_to_layer(self) -> Dict[str, Tuple[str, torch.Tensor]]:
        """Mapping whose values start with a weight name; empty unless overridden.

        The first element of each value is passed to
        ``load_and_merge_adapters`` as a weight name by :meth:`load_adapter`.
        """
        return {}

    @property
    def adapter_layers(self) -> List[str]:
        """Layer names that adapters may target; empty unless overridden."""
        return []

    @property
    def default_traced_adapter_layers(self) -> List[str]:
        """Default traced adapter layer names; empty unless overridden."""
        return []

    def get_num_layers_for_type(self, layer_type: str) -> int:
        """Number of layers of ``layer_type``; ``0`` unless overridden."""
        return 0

    def is_row_parallel(self, layer_type: str) -> bool:
        """Whether ``layer_type`` is row-parallel; ``False`` unless overridden."""
        return False

    @property
    def max_speculative_tokens(self) -> int:
        """Largest ``max_speculative_tokens`` over all layer adapter weights (0 if none)."""
        return max(
            (
                weights.max_speculative_tokens
                for weights in self.layer_to_adapter_weights.values()
            ),
            default=0,
        )

    def load_adapter(
        self,
        adapter_parameters: AdapterParameters,
        adapter_source: AdapterSource,
        adapter_index: int,
        api_token: str,
        dynamic: bool = True,
    ):
        """Loads adapter weights from disk / host memory on the GPU.

        adapter_id must be `BASE_MODEL_ADAPTER_ID` if adapter statically loaded
        into model. Otherwise, the adapter weights are applied during the forward
        pass and stored separately from the base model parameters.

        Does nothing when ``adapter_index`` is already loaded.

        Args:
            adapter_parameters: Describes the adapter(s); its ``adapter_ids``
                are used for logging and it is forwarded to
                ``load_and_merge_adapters``.
            adapter_source: Forwarded to ``load_and_merge_adapters``.
            adapter_index: Key under which the adapter is registered.
            api_token: Forwarded to ``load_and_merge_adapters``.
            dynamic: Whether this is a dynamic load; forwarded to
                ``load_batched_adapter_weights``.

        Raises:
            ValueError: If the model does not support adapter loading, or if
                ``dynamic`` is set while dynamic adapter loading is disabled.
        """
        if adapter_index in self.loaded_adapters:
            # Adapter already loaded
            return

        if not self.supports_adapter_loading:
            raise ValueError("This model does not support adapter loading.")

        if dynamic and not self.dynamic_adapter_loading_enabled:
            raise ValueError(
                f"This model was initialized with the adapter {self.static_adapter_id} "
                f"and therefore does not support dynamic adapter loading. "
                f"Please initialize a new model instance from the base model in "
                f"order to use the dynamic adapter loading feature."
            )

        joined_adapter_ids = ",".join(adapter_parameters.adapter_ids)
        logger.info(f"Loading adapter weights into model: {joined_adapter_ids}")
        weight_names = tuple(v[0] for v in self.target_to_layer.values())
        (
            module_map,
            adapter_config,
            adapter_weight_names,
            adapter_tokenizer,
        ) = load_and_merge_adapters(
            self.model_id,
            adapter_parameters,
            adapter_source,
            adapter_index,
            weight_names,
            api_token,
            # NOTE(review): positional flag whose meaning is defined by
            # load_and_merge_adapters (presumably trust_remote_code) -- confirm.
            False,
        )

        # Passed to every load_batched_adapter_weights call; whatever is
        # still in it after the loop is reported as unused below.
        unused_weight_names = adapter_weight_names.copy()
        for layer_name in self.adapter_layers:
            adapter_weights = adapter_config.load_batched_adapter_weights(
                self,
                module_map,
                layer_name,
                unused_weight_names,
                dynamic,
            )

            if adapter_weights is None:
                continue

            layer_weights = self.layer_to_adapter_weights[layer_name]
            layer_weights.add_adapter(adapter_index, adapter_weights)

        if len(unused_weight_names) > 0:
            logger.warning(
                f"{joined_adapter_ids} unused adapter weights: {unused_weight_names}"
            )

        if adapter_tokenizer is not None:
            # NOTE(review): ``self.tokenizers`` is not created in __init__;
            # presumably a subclass sets it -- confirm.
            self.tokenizers.add_tokenizer(adapter_index, adapter_tokenizer)

        self.loaded_adapters.add(adapter_index)

    def shard_lora_weights(
        self,
        weights_a: List[torch.Tensor],
        weights_b: List[torch.Tensor],
        layer_type: str,
    ) -> Tuple[List[torch.Tensor], List[torch.Tensor]]:
        """Shard LoRA A/B weight lists across ``self.process_group``.

        ``weights_a`` tensors are split on dim 0 when ``layer_type`` is
        row-parallel and on dim 1 otherwise; ``weights_b`` tensors are always
        split on dim 1.

        Returns:
            The sharded ``(weights_a, weights_b)`` lists.
        """
        # [hidden_size, r]
        split_dim = 0 if self.is_row_parallel(layer_type) else 1
        weights_a = [
            shard_on_dim(w, dim=split_dim, process_group=self.process_group)
            for w in weights_a
        ]

        # [r, hidden_size]
        weights_b = [
            shard_on_dim(w, dim=1, process_group=self.process_group) for w in weights_b
        ]

        return weights_a, weights_b

    def offload_adapter(
        self,
        adapter_parameters: AdapterParameters,
        adapter_source: AdapterSource,
        adapter_index: int,
    ):
        """Offloads the adapter weights from GPU to CPU or disk.

        Concretely, removes ``adapter_index`` from every adapter layer's
        weights and from ``self.loaded_adapters``.  Does nothing when the
        adapter is not currently loaded.  ``adapter_parameters`` and
        ``adapter_source`` are accepted but not used here.

        Raises:
            ValueError: If the model does not support adapter loading or
                dynamic adapter loading is disabled.
        """
        if adapter_index not in self.loaded_adapters:
            # Adapter already offloaded
            return

        if not self.supports_adapter_loading:
            raise ValueError("This model does not support adapter loading.")

        if not self.dynamic_adapter_loading_enabled:
            raise ValueError(
                f"This model was initialized with the adapter {self.static_adapter_id} "
                f"and therefore does not support dynamic adapter loading. "
                f"Please initialize a new model instance from the base model in "
                f"order to use the dynamic adapter loading feature."
            )

        for layer_name in self.adapter_layers:
            # Membership test first: indexing the defaultdict would create
            # an empty entry for layers that never received an adapter.
            if layer_name in self.layer_to_adapter_weights:
                self.layer_to_adapter_weights[layer_name].remove_adapter(adapter_index)

        self.loaded_adapters.remove(adapter_index)
-												fix(server): fix has_position_ids (#395)

Fix #389
											
										
										
											2023-06-01 09:41:35 +00:00
+								import inspect
-												feat(server): Support generic AutoModelForCausalLM

											
										
										
											2022-11-04 13:22:47 +00:00
+								import torch
-												fix(models): Revert buggy support for AutoModel

											
										
										
											2022-11-03 15:07:54 +00:00
+								from abc import ABC, abstractmethod
-												feat: prefer lorax implementation and port loading logic

											
										
										
											2024-06-05 23:56:04 +00:00
+								from typing import List, Tuple, Optional, TypeVar, Type, Dict, DefaultDict
 								from collections import defaultdict
-												feat(server): Add exllama GPTQ CUDA kernel support #553 (#666)

Just trying to get the integration tests to pass.


# What does this PR do?

<!--
Congratulations! You've made it this far! You're not quite done yet
though.

Once merged, your PR is going to appear in the release notes with the
title you set, so make sure it's a great title that fully reflects the
extent of your awesome contribution.

Then, please replace this with a description of the change and which
issue is fixed (if applicable). Please also include relevant motivation
and context. List any dependencies (if any) that are required for this
change.

Once you're done, someone will review your PR shortly (see the section
"Who can review?" below to tag some potential reviewers). They may
suggest changes to make the code even better. If no one reviewed your PR
after a week has passed, don't hesitate to post a new comment
@-mentioning the same persons---sometimes notifications get lost.
-->

<!-- Remove if not applicable -->

Fixes # (issue)


## Before submitting
- [ ] This PR fixes a typo or improves the docs (you can dismiss the
other checks if that's the case).
- [ ] Did you read the [contributor
guideline](https://github.com/huggingface/transformers/blob/main/CONTRIBUTING.md#start-contributing-pull-requests),
      Pull Request section?
- [ ] Was this discussed/approved via a Github issue or the
[forum](https://discuss.huggingface.co/)? Please add a link
      to it if that's the case.
- [ ] Did you make sure to update the documentation with your changes?
Here are the
[documentation
guidelines](https://github.com/huggingface/transformers/tree/main/docs),
and
[here are tips on formatting
docstrings](https://github.com/huggingface/transformers/tree/main/docs#writing-source-documentation).
- [ ] Did you write any new necessary tests?


## Who can review?

Anyone in the community is free to review the PR once the tests have
passed. Feel free to tag
members/contributors who may be interested in your PR.

<!-- Your PR will be replied to more quickly if you can figure out the
right person to tag with @


@OlivierDehaene OR @Narsil

 -->

---------

Co-authored-by: Felix Marty <9808326+fxmarty@users.noreply.github.com>
											
										
										
											2023-07-21 08:59:00 +00:00
+								from transformers import PreTrainedTokenizerBase, PretrainedConfig
-												feat(server): Support all AutoModelForCausalLM on a best effort basis

											
										
										
											2022-10-28 17:24:00 +00:00
-												Fix typing in `Model.generate_token` (#733)

## What does this PR do?

This PR fixes a minor type annotation issue in the signature of
`Model.generate_token`.

All existing overrides of `Model.generate_token` return
`Tuple[List[Generation], Optional[B]]`:

https://github.com/huggingface/text-generation-inference/blob/3ef5ffbc6400370ff2e1546550a6bad3ac61b079/server/text_generation_server/models/causal_lm.py#L535-L537

https://github.com/huggingface/text-generation-inference/blob/3ef5ffbc6400370ff2e1546550a6bad3ac61b079/server/text_generation_server/models/flash_causal_lm.py#L802-L804

https://github.com/huggingface/text-generation-inference/blob/3ef5ffbc6400370ff2e1546550a6bad3ac61b079/server/text_generation_server/models/seq2seq_lm.py#L589-L591

I suspect that back in 017a2a8c when `GeneratedText` and `Generation`
were separated, the function signature was not updated.

## Before submitting
- [ ] This PR fixes a typo or improves the docs (you can dismiss the
other checks if that's the case).
- [x] Did you read the [contributor
guideline](https://github.com/huggingface/transformers/blob/main/CONTRIBUTING.md#start-contributing-pull-requests),
      Pull Request section?
- [ ] Was this discussed/approved via a Github issue or the
[forum](https://discuss.huggingface.co/)? Please add a link
      to it if that's the case.
- [ ] Did you make sure to update the documentation with your changes?
Here are the
[documentation
guidelines](https://github.com/huggingface/transformers/tree/main/docs),
and
[here are tips on formatting
docstrings](https://github.com/huggingface/transformers/tree/main/docs#writing-source-documentation).
- [ ] Did you write any new necessary tests?

CC @OlivierDehaene
											
										
										
											2023-07-31 12:35:14 +00:00
+								from text_generation_server.models.types import Batch, Generation
-												Speculative (#1308)


											
										
										
											2023-12-11 11:46:30 +00:00
+								from text_generation_server.utils.speculate import get_speculate
-												feat(router): add device and dtype info (#215)


											
										
										
											2023-04-21 13:36:29 +00:00
+								from text_generation_server.pb.generate_pb2 import InfoResponse
-												feat: prefer lorax implementation and port loading logic

											
										
										
											2024-06-05 23:56:04 +00:00
+								from text_generation_server.adapters.weights import LayerAdapterWeights
 								from text_generation_server.utils.adapter import (
 								    load_and_merge_adapters,
 								    AdapterParameters,
 								    AdapterSource,
 								)
 								from loguru import logger
 								BASE_MODEL_ADAPTER_ID = "__base_model__"
 								def get_start_stop_idxs_for_rank(offset, size, rank, world_size):
 								    block_size = size // world_size
 								    start = offset + rank * block_size
 								    stop = offset + (rank + 1) * block_size
 								    return start, stop
 								def shard_on_dim(
 								    t: torch.Tensor, dim: int, process_group: torch.distributed.ProcessGroup
 								):
 								    world_size = process_group.size()
 								    rank = process_group.rank()
 								    size = t.shape[dim]
 								    start, stop = get_start_stop_idxs_for_rank(0, size, rank, world_size)
 								    if dim == 0:
 								        tensor = t[start:stop]
 								    elif dim == 1:
 								        tensor = t[:, start:stop]
 								    else:
 								        raise NotImplementedError("Let's make that generic when needed")
 								    return tensor
-												feat(server): Support all AutoModelForCausalLM on a best effort basis

											
										
										
											2022-10-28 17:24:00 +00:00
-												feat(server): Support AutoModelForSeq2SeqLM

											
										
										
											2022-11-04 17:03:04 +00:00
+								B = TypeVar("B", bound=Batch)
-												feat: add cuda memory fraction (#659)

Close #673
											
										
										
											2023-07-24 09:43:58 +00:00
-												fix(models): Revert buggy support for AutoModel

											
										
										
											2022-11-03 15:07:54 +00:00
+								class Model(ABC):
-												feat(server): optimize decode for sane tokenizers (#170)


											
										
										
											2023-04-12 10:03:10 +00:00
+								    def __init__(
 								        self,
-												feat: prefer lorax implementation and port loading logic

											
										
										
											2024-06-05 23:56:04 +00:00
+								        model_id: str,
-												fix(server): fix decode token (#334)

Fixes #333

---------

Co-authored-by: Nicolas Patry <patry.nicolas@protonmail.com>
											
										
										
											2023-05-16 21:23:27 +00:00
+								        model: torch.nn.Module,
-												feat(server): optimize decode for sane tokenizers (#170)


											
										
										
											2023-04-12 10:03:10 +00:00
+								        tokenizer: PreTrainedTokenizerBase,
-												feat(router): add device and dtype info (#215)


											
										
										
											2023-04-21 13:36:29 +00:00
+								        requires_padding: bool,
 								        dtype: torch.dtype,
-												feat(server): optimize decode for sane tokenizers (#170)


											
										
										
											2023-04-12 10:03:10 +00:00
+								        device: torch.device,
-												feat(server): shard token decode (#303)


											
										
										
											2023-05-10 13:48:21 +00:00
+								        rank: int = 0,
 								        world_size: int = 1,
-												feat: add mistral model (#1071)


											
										
										
											2023-09-28 07:55:47 +00:00
+								        sliding_window: Optional[int] = None,
-												Speculative (#1308)


											
										
										
											2023-12-11 11:46:30 +00:00
+								        speculate: Optional[int] = None,
-												feat(server): optimize decode for sane tokenizers (#170)


											
										
										
											2023-04-12 10:03:10 +00:00
+								    ):
-												feat: prefer lorax implementation and port loading logic

											
										
										
											2024-06-05 23:56:04 +00:00
+								        self.model_id = model_id
-												fix(server): fix decode token (#334)

Fixes #333

---------

Co-authored-by: Nicolas Patry <patry.nicolas@protonmail.com>
											
										
										
											2023-05-16 21:23:27 +00:00
+								        self.model = model.eval()
-												feat(server): Support generic AutoModelForCausalLM

											
										
										
											2022-11-04 13:22:47 +00:00
+								        self.tokenizer = tokenizer
-												Use the generation config. (#1808)

# What does this PR do?

<!--
Congratulations! You've made it this far! You're not quite done yet
though.

Once merged, your PR is going to appear in the release notes with the
title you set, so make sure it's a great title that fully reflects the
extent of your awesome contribution.

Then, please replace this with a description of the change and which
issue is fixed (if applicable). Please also include relevant motivation
and context. List any dependencies (if any) that are required for this
change.

Once you're done, someone will review your PR shortly (see the section
"Who can review?" below to tag some potential reviewers). They may
suggest changes to make the code even better. If no one reviewed your PR
after a week has passed, don't hesitate to post a new comment
@-mentioning the same persons---sometimes notifications get lost.
-->

<!-- Remove if not applicable -->

Fixes # (issue)


## Before submitting
- [ ] This PR fixes a typo or improves the docs (you can dismiss the
other checks if that's the case).
- [ ] Did you read the [contributor
guideline](https://github.com/huggingface/transformers/blob/main/CONTRIBUTING.md#start-contributing-pull-requests),
      Pull Request section?
- [ ] Was this discussed/approved via a Github issue or the
[forum](https://discuss.huggingface.co/)? Please add a link
      to it if that's the case.
- [ ] Did you make sure to update the documentation with your changes?
Here are the
[documentation
guidelines](https://github.com/huggingface/transformers/tree/main/docs),
and
[here are tips on formatting
docstrings](https://github.com/huggingface/transformers/tree/main/docs#writing-source-documentation).
- [ ] Did you write any new necessary tests?


## Who can review?

Anyone in the community is free to review the PR once the tests have
passed. Feel free to tag
members/contributors who may be interested in your PR.

<!-- Your PR will be replied to more quickly if you can figure out the
right person to tag with @


@OlivierDehaene OR @Narsil

 -->
											
										
										
											2024-04-25 17:41:50 +00:00
 								        # all_special_ids is not set correctly if the rust tokenizer is unpacked
 								        # TODO report this to transformers.
 								        other_special_ids = {
 								            id for id, token in tokenizer.added_tokens_decoder.items() if token.special
 								        }
-												feat(server): add special token bool (#85)


											
										
										
											2023-02-24 14:55:57 +00:00
+								        self.all_special_ids = set(tokenizer.all_special_ids)
-												Use the generation config. (#1808)

# What does this PR do?

<!--
Congratulations! You've made it this far! You're not quite done yet
though.

Once merged, your PR is going to appear in the release notes with the
title you set, so make sure it's a great title that fully reflects the
extent of your awesome contribution.

Then, please replace this with a description of the change and which
issue is fixed (if applicable). Please also include relevant motivation
and context. List any dependencies (if any) that are required for this
change.

Once you're done, someone will review your PR shortly (see the section
"Who can review?" below to tag some potential reviewers). They may
suggest changes to make the code even better. If no one reviewed your PR
after a week has passed, don't hesitate to post a new comment
@-mentioning the same persons---sometimes notifications get lost.
-->

<!-- Remove if not applicable -->

Fixes # (issue)


## Before submitting
- [ ] This PR fixes a typo or improves the docs (you can dismiss the
other checks if that's the case).
- [ ] Did you read the [contributor
guideline](https://github.com/huggingface/transformers/blob/main/CONTRIBUTING.md#start-contributing-pull-requests),
      Pull Request section?
- [ ] Was this discussed/approved via a Github issue or the
[forum](https://discuss.huggingface.co/)? Please add a link
      to it if that's the case.
- [ ] Did you make sure to update the documentation with your changes?
Here are the
[documentation
guidelines](https://github.com/huggingface/transformers/tree/main/docs),
and
[here are tips on formatting
docstrings](https://github.com/huggingface/transformers/tree/main/docs#writing-source-documentation).
- [ ] Did you write any new necessary tests?


## Who can review?

Anyone in the community is free to review the PR once the tests have
passed. Feel free to tag
members/contributors who may be interested in your PR.

<!-- Your PR will be replied to more quickly if you can figure out the
right person to tag with @


@OlivierDehaene OR @Narsil

 -->
											
										
										
											2024-04-25 17:41:50 +00:00
+								        self.all_special_ids.update(other_special_ids)
-												feat(router): add device and dtype info (#215)


											
										
										
											2023-04-21 13:36:29 +00:00
+								        self.requires_padding = requires_padding
 								        self.dtype = dtype
-												feat(server): Support generic AutoModelForCausalLM

											
										
										
											2022-11-04 13:22:47 +00:00
+								        self.device = device
-												feat(server): shard token decode (#303)


											
										
										
											2023-05-10 13:48:21 +00:00
+								        self.rank = rank
 								        self.world_size = world_size
-												fix: fix logic if sliding window key is not present in config (#1352)


											
										
										
											2023-12-15 13:56:17 +00:00
+								        self.sliding_window = sliding_window if sliding_window != -1 else None
-												fix(server): fix has_position_ids (#395)

Fix #389
											
										
										
											2023-06-01 09:41:35 +00:00
-												feat: prefer lorax implementation and port loading logic

											
										
										
											2024-06-05 23:56:04 +00:00
+								        self.layer_to_adapter_weights: Dict[str, LayerAdapterWeights] = defaultdict(
 								            LayerAdapterWeights
 								        )
 								        self.target_to_layer = self.adapter_target_to_layer()
 								        self.loaded_adapters = set()
-												Speculative (#1308)


											
										
										
											2023-12-11 11:46:30 +00:00
+								        if speculate is None:
 								            speculate = get_speculate()
 								        self.speculate = speculate
-												fix(server): fix has_position_ids (#395)

Fix #389
											
										
										
											2023-06-01 09:41:35 +00:00
+								        self.has_position_ids = (
 								            inspect.signature(model.forward).parameters.get("position_ids", None)
 								            is not None
 								        )
-												Lifting check_unitialized. (#325)

# What does this PR do?

Lifting check_unitialized.

<!--
Congratulations! You've made it this far! You're not quite done yet
though.

Once merged, your PR is going to appear in the release notes with the
title you set, so make sure it's a great title that fully reflects the
extent of your awesome contribution.

Then, please replace this with a description of the change and which
issue is fixed (if applicable). Please also include relevant motivation
and context. List any dependencies (if any) that are required for this
change.

Once you're done, someone will review your PR shortly (see the section
"Who can review?" below to tag some potential reviewers). They may
suggest changes to make the code even better. If no one reviewed your PR
after a week has passed, don't hesitate to post a new comment
@-mentioning the same persons---sometimes notifications get lost.
-->

<!-- Remove if not applicable -->

Fixes # (issue)


## Before submitting
- [ ] This PR fixes a typo or improves the docs (you can dismiss the
other checks if that's the case).
- [ ] Did you read the [contributor
guideline](https://github.com/huggingface/transformers/blob/main/CONTRIBUTING.md#start-contributing-pull-requests),
      Pull Request section?
- [ ] Was this discussed/approved via a Github issue or the
[forum](https://discuss.huggingface.co/)? Please add a link
      to it if that's the case.
- [ ] Did you make sure to update the documentation with your changes?
Here are the
[documentation
guidelines](https://github.com/huggingface/transformers/tree/main/docs),
and
[here are tips on formatting
docstrings](https://github.com/huggingface/transformers/tree/main/docs#writing-source-documentation).
- [ ] Did you write any new necessary tests?


## Who can review?

Anyone in the community is free to review the PR once the tests have
passed. Feel free to tag
members/contributors who may be interested in your PR.

<!-- Your PR will be replied to more quickly if you can figure out the
right person to tag with @


@OlivierDehaene OR @Narsil

 -->
											
										
										
											2023-05-15 09:32:25 +00:00
+								        self.check_initialized()
-												feat(server): Support generic AutoModelForCausalLM

											
										
										
											2022-11-04 13:22:47 +00:00
-												feat(router): add device and dtype info (#215)


											
										
										
											2023-04-21 13:36:29 +00:00
+								    @property
 								    def info(self) -> InfoResponse:
-												feat: add mistral model (#1071)


											
										
										
											2023-09-28 07:55:47 +00:00
+								        if self.requires_padding and self.sliding_window is not None:
 								            raise NotImplementedError("sliding_window is not implemented with padding")
-												feat(router): add device and dtype info (#215)


											
										
										
											2023-04-21 13:36:29 +00:00
+								        return InfoResponse(
 								            requires_padding=self.requires_padding,
 								            dtype=str(self.dtype),
 								            device_type=self.device.type,
-												feat: add mistral model (#1071)


											
										
										
											2023-09-28 07:55:47 +00:00
+								            window_size=self.sliding_window,
-												chore: formatting

											
										
										
											2023-12-11 13:49:52 +00:00
+								            speculate=self.speculate,
-												feat(router): add device and dtype info (#215)


											
										
										
											2023-04-21 13:36:29 +00:00
+								        )
-												feat(server): Support AutoModelForSeq2SeqLM

											
										
										
											2022-11-04 17:03:04 +00:00
+								    @property
-												fix(models): Revert buggy support for AutoModel

											
										
										
											2022-11-03 15:07:54 +00:00
+								    @abstractmethod
-												feat(server): Support AutoModelForSeq2SeqLM

											
										
										
											2022-11-04 17:03:04 +00:00
+								    def batch_type(self) -> Type[B]:
-												fix(models): Revert buggy support for AutoModel

											
										
										
											2022-11-03 15:07:54 +00:00
+								        raise NotImplementedError
-												feat(server): Support all AutoModelForCausalLM on a best effort basis

											
										
										
											2022-10-28 17:24:00 +00:00
-												feat(server): Support AutoModelForSeq2SeqLM

											
										
										
											2022-11-04 17:03:04 +00:00
+								    @abstractmethod
-												feat: add more latency metrics in forward (#1346)


											
										
										
											2023-12-14 14:59:38 +00:00
+								    def generate_token(
 								        self, batch: B
 								    ) -> Tuple[List[Generation], Optional[B], Tuple[int, int]]:
-												feat(server): Support AutoModelForSeq2SeqLM

											
										
										
											2022-11-04 17:03:04 +00:00
+								        raise NotImplementedError
-												fix(server): fix generate_stream by forcing tokens to be decoded correctly (#100)


											
										
										
											2023-03-06 12:22:58 +00:00
-												feat(server): auto max_batch_total_tokens for flash att models (#630)


											
										
										
											2023-07-19 07:31:25 +00:00
+								    def warmup(self, batch: B) -> Optional[int]:
-												feat(server): add paged attention to flash models (#516)

Closes #478
											
										
										
											2023-06-30 17:09:59 +00:00
+								        self.generate_token(batch)
-												feat(server): auto max_batch_total_tokens for flash att models (#630)


											
										
										
											2023-07-19 07:31:25 +00:00
+								        return None
-												feat(server): add paged attention to flash models (#516)

Closes #478
											
										
										
											2023-06-30 17:09:59 +00:00
-												feat(server): add flash attention llama (#144)


											
										
										
											2023-04-11 14:38:22 +00:00
+								    def decode_token(
 								        self,
 								        all_input_ids: List[int],
-												fix(server): fix decode token (#334)

Fixes #333

---------

Co-authored-by: Nicolas Patry <patry.nicolas@protonmail.com>
											
										
										
											2023-05-16 21:23:27 +00:00
+								        prefix_offset: int = 0,
 								        read_offset: int = 0,
-												Remove the stripping of the prefix space (and any other mangling that tokenizers might do). (#1065)

Supersedes #1024


# What does this PR do?

<!--
Congratulations! You've made it this far! You're not quite done yet
though.

Once merged, your PR is going to appear in the release notes with the
title you set, so make sure it's a great title that fully reflects the
extent of your awesome contribution.

Then, please replace this with a description of the change and which
issue is fixed (if applicable). Please also include relevant motivation
and context. List any dependencies (if any) that are required for this
change.

Once you're done, someone will review your PR shortly (see the section
"Who can review?" below to tag some potential reviewers). They may
suggest changes to make the code even better. If no one reviewed your PR
after a week has passed, don't hesitate to post a new comment
@-mentioning the same persons---sometimes notifications get lost.
-->

<!-- Remove if not applicable -->

Fixes # (issue)


## Before submitting
- [ ] This PR fixes a typo or improves the docs (you can dismiss the
other checks if that's the case).
- [ ] Did you read the [contributor
guideline](https://github.com/huggingface/transformers/blob/main/CONTRIBUTING.md#start-contributing-pull-requests),
      Pull Request section?
- [ ] Was this discussed/approved via a Github issue or the
[forum](https://discuss.huggingface.co/)? Please add a link
      to it if that's the case.
- [ ] Did you make sure to update the documentation with your changes?
Here are the
[documentation
guidelines](https://github.com/huggingface/transformers/tree/main/docs),
and
[here are tips on formatting
docstrings](https://github.com/huggingface/transformers/tree/main/docs#writing-source-documentation).
- [ ] Did you write any new necessary tests?


## Who can review?

Anyone in the community is free to review the PR once the tests have
passed. Feel free to tag
members/contributors who may be interested in your PR.

<!-- Your PR will be replied to more quickly if you can figure out the
right person to tag with @


@OlivierDehaene OR @Narsil

 -->

---------

Co-authored-by: bangoz <ch_xie@pku.edu.cn>
											
										
										
											2023-09-27 10:13:45 +00:00
+								        skip_special_tokens: bool = False,
-												fix(server): fix decode token (#334)

Fixes #333

---------

Co-authored-by: Nicolas Patry <patry.nicolas@protonmail.com>
											
										
										
											2023-05-16 21:23:27 +00:00
+								    ) -> Tuple[str, int, int]:
-												fix(server): fix generate_stream by forcing tokens to be decoded correctly (#100)


											
										
										
											2023-03-06 12:22:58 +00:00
+								        """Hack to hopefully support generate_stream for the maximum number of tokenizers"""
-												feat(server): add flash attention llama (#144)


											
										
										
											2023-04-11 14:38:22 +00:00
-												fix(server): fix decode token (#334)

Fixes #333

---------

Co-authored-by: Nicolas Patry <patry.nicolas@protonmail.com>
											
										
										
											2023-05-16 21:23:27 +00:00
+								        # The prefix text is necessary only to defeat cleanup algorithms in the decode
 								        # which decide to add a space or not depending on the surrounding ids.
 								        prefix_text = self.tokenizer.decode(
-												feat: format code (#1070)


											
										
										
											2023-09-27 10:22:09 +00:00
+								            all_input_ids[prefix_offset:read_offset],
 								            skip_special_tokens=skip_special_tokens,
-												fix(server): fix decode token (#334)

Fixes #333

---------

Co-authored-by: Nicolas Patry <patry.nicolas@protonmail.com>
											
										
										
											2023-05-16 21:23:27 +00:00
+								        )
 								        new_text = self.tokenizer.decode(
-												Remove the stripping of the prefix space (and any other mangling that tokenizers might do). (#1065)

Supersedes #1024


# What does this PR do?

<!--
Congratulations! You've made it this far! You're not quite done yet
though.

Once merged, your PR is going to appear in the release notes with the
title you set, so make sure it's a great title that fully reflects the
extent of your awesome contribution.

Then, please replace this with a description of the change and which
issue is fixed (if applicable). Please also include relevant motivation
and context. List any dependencies (if any) that are required for this
change.

Once you're done, someone will review your PR shortly (see the section
"Who can review?" below to tag some potential reviewers). They may
suggest changes to make the code even better. If no one reviewed your PR
after a week has passed, don't hesitate to post a new comment
@-mentioning the same persons---sometimes notifications get lost.
-->

<!-- Remove if not applicable -->

Fixes # (issue)


## Before submitting
- [ ] This PR fixes a typo or improves the docs (you can dismiss the
other checks if that's the case).
- [ ] Did you read the [contributor
guideline](https://github.com/huggingface/transformers/blob/main/CONTRIBUTING.md#start-contributing-pull-requests),
      Pull Request section?
- [ ] Was this discussed/approved via a Github issue or the
[forum](https://discuss.huggingface.co/)? Please add a link
      to it if that's the case.
- [ ] Did you make sure to update the documentation with your changes?
Here are the
[documentation
guidelines](https://github.com/huggingface/transformers/tree/main/docs),
and
[here are tips on formatting
docstrings](https://github.com/huggingface/transformers/tree/main/docs#writing-source-documentation).
- [ ] Did you write any new necessary tests?


## Who can review?

Anyone in the community is free to review the PR once the tests have
passed. Feel free to tag
members/contributors who may be interested in your PR.

<!-- Your PR will be replied to more quickly if you can figure out the
right person to tag with @


@OlivierDehaene OR @Narsil

 -->

---------

Co-authored-by: bangoz <ch_xie@pku.edu.cn>
											
										
										
											2023-09-27 10:13:45 +00:00
+								            all_input_ids[prefix_offset:], skip_special_tokens=skip_special_tokens
-												fix(server): fix decode token (#334)

Fixes #333

---------

Co-authored-by: Nicolas Patry <patry.nicolas@protonmail.com>
											
										
										
											2023-05-16 21:23:27 +00:00
+								        )
-												feat(server): add flash attention llama (#144)


											
										
										
											2023-04-11 14:38:22 +00:00
-												fix(server): fix decode token (#334)

Fixes #333

---------

Co-authored-by: Nicolas Patry <patry.nicolas@protonmail.com>
											
										
										
											2023-05-16 21:23:27 +00:00
+								        if len(new_text) > len(prefix_text) and not new_text.endswith("<EFBFBD>"):
 								            # utf-8 char at the end means it's a potential unfinished byte sequence
 								            # from byte fallback tokenization.
 								            # If it's in the middle, it's probably a real invalid id generated
 								            # by the model
 								            new_text = new_text[len(prefix_text) :]
 								            return new_text, read_offset, len(all_input_ids)
-												feat(server): add flash attention llama (#144)


											
										
										
											2023-04-11 14:38:22 +00:00
+								        else:
-												fix(server): fix decode token (#334)

Fixes #333

---------

Co-authored-by: Nicolas Patry <patry.nicolas@protonmail.com>
											
										
										
											2023-05-16 21:23:27 +00:00
+								            return "", prefix_offset, read_offset
-												Lifting check_unitialized. (#325)

# What does this PR do?

Lifting the check for uninitialized parameters (`check_initialized`).

<!--
Congratulations! You've made it this far! You're not quite done yet
though.

Once merged, your PR is going to appear in the release notes with the
title you set, so make sure it's a great title that fully reflects the
extent of your awesome contribution.

Then, please replace this with a description of the change and which
issue is fixed (if applicable). Please also include relevant motivation
and context. List any dependencies (if any) that are required for this
change.

Once you're done, someone will review your PR shortly (see the section
"Who can review?" below to tag some potential reviewers). They may
suggest changes to make the code even better. If no one reviewed your PR
after a week has passed, don't hesitate to post a new comment
@-mentioning the same persons---sometimes notifications get lost.
-->

<!-- Remove if not applicable -->

Fixes # (issue)


## Before submitting
- [ ] This PR fixes a typo or improves the docs (you can dismiss the
other checks if that's the case).
- [ ] Did you read the [contributor
guideline](https://github.com/huggingface/transformers/blob/main/CONTRIBUTING.md#start-contributing-pull-requests),
      Pull Request section?
- [ ] Was this discussed/approved via a Github issue or the
[forum](https://discuss.huggingface.co/)? Please add a link
      to it if that's the case.
- [ ] Did you make sure to update the documentation with your changes?
Here are the
[documentation
guidelines](https://github.com/huggingface/transformers/tree/main/docs),
and
[here are tips on formatting
docstrings](https://github.com/huggingface/transformers/tree/main/docs#writing-source-documentation).
- [ ] Did you write any new necessary tests?


## Who can review?

Anyone in the community is free to review the PR once the tests have
passed. Feel free to tag
members/contributors who may be interested in your PR.

<!-- Your PR will be replied to more quickly if you can figure out the
right person to tag with @


@OlivierDehaene OR @Narsil

 -->
											
										
										
											2023-05-15 09:32:25 +00:00
 								    def check_initialized(self):
 								        """Verify that no parameter of `self.model` is still on the meta device.
 								
 								        Raises:
 								            RuntimeError: listing the names of every parameter whose data is
 								                on `torch.device("meta")`.
 								        """
 								        meta_device = torch.device("meta")
 								        still_on_meta = [
 								            name
 								            for name, param in self.model.named_parameters()
 								            if param.data.device == meta_device
 								        ]
 								        if not still_on_meta:
 								            return
 								        raise RuntimeError(
 								            f"found uninitialized parameters in model {self.__class__.__name__}: {still_on_meta}"
 								        )
-												feat: prefer lorax implementation and port loading logic

											
										
										
											2024-06-05 23:56:04 +00:00
 								    @property
 								    def supports_adapter_loading(self) -> bool:
 								        """Whether adapters can be loaded into this model.
 								
 								        This base implementation always returns False; `load_adapter` and
 								        `offload_adapter` raise `ValueError` when this is False.
 								        """
 								        return False
 								    def adapter_target_to_layer(self) -> Dict[str, Tuple[str, torch.Tensor]]:
 								        """Return the adapter target -> (name, tensor) mapping; empty in this base implementation.
 								
 								        NOTE(review): `load_adapter` reads `self.target_to_layer` rather than
 								        calling this method -- presumably that attribute is populated from
 								        this mapping elsewhere; confirm.
 								        """
 								        return {}
 								    @property
 								    def adapter_layers(self) -> List[str]:
 								        """Layer names that `load_adapter` and `offload_adapter` iterate over.
 								
 								        This base implementation returns an empty list.
 								        """
 								        return []
 								    @property
 								    def default_traced_adapter_layers(self) -> List[str]:
 								        """Return the default traced adapter layers; empty in this base implementation.
 								
 								        NOTE(review): presumably a subset of `adapter_layers` chosen by
 								        subclasses -- not used in the visible code; confirm against callers.
 								        """
 								        return []
 								    def get_num_layers_for_type(self, layer_type: str) -> int:
 								        """Return the number of layers of `layer_type`; this base implementation returns 0."""
 								        return 0
 								    def is_row_parallel(self, layer_type: str) -> bool:
 								        """Return whether layers of `layer_type` are row-parallel.
 								
 								        `shard_lora_weights` splits the A weights on dim 0 when this is True
 								        and on dim 1 otherwise. This base implementation returns False.
 								        """
 								        return False
 								    @property
 								    def max_speculative_tokens(self) -> int:
 								        return max(
 								            [
 								                weights.max_speculative_tokens
 								                for weights in self.layer_to_adapter_weights.values()
 								            ],
 								            default=0,
 								        )
 								    def load_adapter(
 								        self,
 								        adapter_parameters: AdapterParameters,
 								        adapter_source: AdapterSource,
 								        adapter_index: int,
 								        api_token: str,
 								        dynamic: bool = True,
 								    ):
 								        """Loads adapter weights from disk / host memory on the GPU.
 								        adapter_id must be `BASE_MODEL_ADAPTER_ID` if adapter statically loaded
 								        into model. Otherwise, the adapter weights are applied during the forward
 								        pass and stored separately from the base model parameters.
 								        """
 								        if adapter_index in self.loaded_adapters:
 								            # Adapter already loaded
 								            return
 								        if not self.supports_adapter_loading:
 								            raise ValueError("This model does not support adapter loading.")
 								        if dynamic and not self.dynamic_adapter_loading_enabled:
 								            raise ValueError(
 								                f"This model was initialized with the adapter {self.static_adapter_id} "
 								                f"and therefore does not support dynamic adapter loading. "
 								                f"Please initialize a new model instance from the base model in "
 								                f"order to use the dynamic adapter loading feature."
 								            )
 								        logger.info(
 								            f"Loading adapter weights into model: {','.join(adapter_parameters.adapter_ids)}"
 								        )
 								        weight_names = tuple([v[0] for v in self.target_to_layer.values()])
 								        (
 								            module_map,
 								            adapter_config,
 								            adapter_weight_names,
 								            adapter_tokenizer,
 								        ) = load_and_merge_adapters(
 								            self.model_id,
 								            adapter_parameters,
 								            adapter_source,
 								            adapter_index,
 								            weight_names,
 								            api_token,
 								            False,
 								        )
 								        unused_weight_names = adapter_weight_names.copy()
 								        for layer_name in self.adapter_layers:
 								            adapter_weights = adapter_config.load_batched_adapter_weights(
 								                self,
 								                module_map,
 								                layer_name,
 								                unused_weight_names,
 								                dynamic,
 								            )
 								            if adapter_weights is None:
 								                continue
 								            layer_weights = self.layer_to_adapter_weights[layer_name]
 								            layer_weights.add_adapter(adapter_index, adapter_weights)
 								        if len(unused_weight_names) > 0:
 								            logger.warning(
 								                f"{','.join(adapter_parameters.adapter_ids)} unused adapter weights: {unused_weight_names}"
 								            )
 								        if adapter_tokenizer is not None:
 								            self.tokenizers.add_tokenizer(adapter_index, adapter_tokenizer)
 								        self.loaded_adapters.add(adapter_index)
 								    def shard_lora_weights(
 								        self,
 								        weights_a: List[torch.Tensor],
 								        weights_b: List[torch.Tensor],
 								        layer_type: str,
 								    ) -> Tuple[List[torch.Tensor], List[torch.Tensor]]:
 								        """Shard LoRA A/B weight lists across `self.process_group`.
 								
 								        A weights are split on dim 0 when `layer_type` is row-parallel and on
 								        dim 1 otherwise; B weights are always split on dim 1.
 								
 								        Returns:
 								            The sharded `(weights_a, weights_b)` lists, in input order.
 								        """
 								
 								        def shard_each(tensors, dim):
 								            # Slice out this rank's share of every tensor along `dim`.
 								            return [
 								                shard_on_dim(tensor, dim=dim, process_group=self.process_group)
 								                for tensor in tensors
 								            ]
 								
 								        # A: [hidden_size, r]
 								        dim_for_a = 0 if self.is_row_parallel(layer_type) else 1
 								        sharded_a = shard_each(weights_a, dim_for_a)
 								        # B: [r, hidden_size]
 								        sharded_b = shard_each(weights_b, 1)
 								        return sharded_a, sharded_b
 								    def offload_adapter(
 								        self,
 								        adapter_parameters: AdapterParameters,
 								        adapter_source: AdapterSource,
 								        adapter_index: int,
 								    ):
 								        """Offloads the adapter weights from GPU to CPU or disk."""
 								        if adapter_index not in self.loaded_adapters:
 								            # Adapter already offloaded
 								            return
 								        if not self.supports_adapter_loading:
 								            raise ValueError("This model does not support adapter loading.")
 								        if not self.dynamic_adapter_loading_enabled:
 								            raise ValueError(
 								                f"This model was initialized with the adapter {self.static_adapter_id} "
 								                f"and therefore does not support dynamic adapter loading. "
 								                f"Please initialize a new model instance from the base model in "
 								                f"order to use the dynamic adapter loading feature."
 								            )
 								        for layer_name in self.adapter_layers:
 								            if layer_name in self.layer_to_adapter_weights:
 								                self.layer_to_adapter_weights[layer_name].remove_adapter(adapter_index)
 								        self.loaded_adapters.remove(adapter_index)