import inspect
import torch

from abc import ABC, abstractmethod
from typing import List, Tuple, Optional, TypeVar, Type, Dict, DefaultDict
from collections import defaultdict
from transformers import PreTrainedTokenizerBase, PretrainedConfig

from text_generation_server.models.types import Batch, Generation
from text_generation_server.utils.speculate import get_speculate
from text_generation_server.pb.generate_pb2 import InfoResponse
from text_generation_server.adapters.weights import LayerAdapterWeights
from text_generation_server.utils.adapter import (
    load_and_merge_adapters,
    AdapterParameters,
    AdapterSource,
)
from text_generation_server.utils.import_utils import SYSTEM
from text_generation_server.models.globals import CUDA_GRAPHS
import os

from loguru import logger

BASE_MODEL_ADAPTER_ID = "__base_model__"

B = TypeVar("B", bound=Batch)
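
# A concrete model pins `B` to its own `Batch` subclass so `batch_type` and
# `generate_token` agree on types. Illustrative sketch (hypothetical `MyBatch`
# and `MyModel` names, not part of this module):
#
#     class MyModel(Model):
#         @property
#         def batch_type(self) -> Type[MyBatch]:
#             return MyBatch
#
#         def generate_token(
#             self, batch: MyBatch
#         ) -> Tuple[List[Generation], Optional[MyBatch], Tuple[int, int]]:
#             ...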


class Model(ABC):
    def __init__(
        self,
        model_id: str,
        model: torch.nn.Module,
        tokenizer: PreTrainedTokenizerBase,
        requires_padding: bool,
        dtype: torch.dtype,
        device: torch.device,
        rank: int = 0,
        world_size: int = 1,
        sliding_window: Optional[int] = None,
        speculate: Optional[int] = None,
        adapter_id: str = BASE_MODEL_ADAPTER_ID,
    ):
        self.model_id = model_id
        self.model = model.eval()
        self.tokenizer = tokenizer

        # all_special_ids is not set correctly if the rust tokenizer is unpacked
        # TODO report this to transformers.
        other_special_ids = {
            id for id, token in tokenizer.added_tokens_decoder.items() if token.special
        }
        self.all_special_ids = set(tokenizer.all_special_ids)
        self.all_special_ids.update(other_special_ids)
        self.requires_padding = requires_padding
        self.dtype = dtype
        self.device = device
        self.rank = rank
        self.world_size = world_size
        # A sliding_window of -1 is treated as "no sliding window".
        self.sliding_window = sliding_window if sliding_window != -1 else None

        self.layer_to_adapter_weights: Dict[str, LayerAdapterWeights] = defaultdict(
            LayerAdapterWeights
        )
        self.target_to_layer = None
        self.loaded_adapters = set()
        self.static_adapter_id = adapter_id

        if speculate is None:
            speculate = get_speculate()
        self.speculate = speculate

        # Inspect the forward signature once, so that `position_ids` are only
        # passed to models whose forward actually accepts them.
        self.has_position_ids = (
            inspect.signature(model.forward).parameters.get("position_ids", None)
            is not None
        )

        self.check_initialized()

    @property
    def info(self) -> InfoResponse:
        if self.requires_padding and self.sliding_window is not None:
            raise NotImplementedError("sliding_window is not implemented with padding")

        return InfoResponse(
            requires_padding=self.requires_padding,
            dtype=str(self.dtype),
            device_type=self.device.type,
            window_size=self.sliding_window,
            speculate=self.speculate,
        )

    @property
    @abstractmethod
    def batch_type(self) -> Type[B]:
        raise NotImplementedError

    @abstractmethod
    def generate_token(
        self, batch: B
    ) -> Tuple[List[Generation], Optional[B], Tuple[int, int]]:
        raise NotImplementedError

    def warmup(self, batch: B) -> Optional[int]:
        if SYSTEM == "rocm" and (
            os.environ.get("PYTORCH_TUNABLEOP_ENABLED") is None
            or os.environ.get("PYTORCH_TUNABLEOP_ENABLED") == "1"
        ):
            logger.info(
                f"ROCm: TunableOp is enabled (PYTORCH_TUNABLEOP_ENABLED is unset or set to 1), but it is not supported for {self.model_id} (instance of {self.__class__.__name__}). Disabling TunableOp."
            )
            torch.cuda.tunable.tuning_enable(False)
            torch.cuda.tunable.enable(False)

        self.generate_token(batch)

        if CUDA_GRAPHS:
            logger.info(
                f"Got CUDA_GRAPHS={CUDA_GRAPHS} but CUDA graphs are not supported for {self.model_id} (instance of {self.__class__.__name__}). CUDA graphs will not be used."
            )

        return None
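
    # Subclasses that do support these features typically override `warmup` and
    # return the maximum number of total tokens they can handle. Hedged sketch
    # (hypothetical attribute name, illustrative only):
    #
    #     def warmup(self, batch: MyBatch) -> Optional[int]:
    #         self.generate_token(batch)
    #         return self.max_supported_total_tokens  # hypothetical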

    def decode_token(
        self,
        all_input_ids: List[int],
        prefix_offset: int = 0,
        read_offset: int = 0,
        skip_special_tokens: bool = False,
    ) -> Tuple[str, int, int]:
        """Hack to hopefully support generate_stream for the maximum number of tokenizers"""

        # The prefix text is necessary only to defeat cleanup algorithms in the decode
        # which decide to add a space or not depending on the surrounding ids.
        prefix_text = self.tokenizer.decode(
            all_input_ids[prefix_offset:read_offset],
            skip_special_tokens=skip_special_tokens,
        )
        new_text = self.tokenizer.decode(
            all_input_ids[prefix_offset:], skip_special_tokens=skip_special_tokens
        )

        if len(new_text) > len(prefix_text) and not new_text.endswith("�"):
            # A replacement character ("�") at the end means it's a potential
            # unfinished byte sequence from byte fallback tokenization.
            # If it's in the middle, it's probably a real invalid id generated
            # by the model.
            new_text = new_text[len(prefix_text) :]
            return new_text, read_offset, len(all_input_ids)
        else:
            return "", prefix_offset, read_offset

    def check_initialized(self):
        uninitialized_parameters = []
        for n, p in self.model.named_parameters():
            if p.data.device == torch.device("meta"):
                uninitialized_parameters.append(n)
        if uninitialized_parameters:
            raise RuntimeError(
                f"found uninitialized parameters in model {self.__class__.__name__}: {uninitialized_parameters}"
            )
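
    # A parameter can remain on the "meta" device when the model skeleton is
    # built without storage, e.g. (illustrative):
    #
    #     with torch.device("meta"):
    #         model = SomeModel(config)  # weights must be materialized later
    #
    # Checking here fails fast instead of erroring mid-forward.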

    @property
    def supports_adapter_loading(self) -> bool:
        return False

    def adapter_target_to_layer(self) -> Dict[str, Tuple[str, torch.Tensor]]:
        return {}

    @property
    def adapter_layers(self) -> List[str]:
        return []

    @property
    def default_traced_adapter_layers(self) -> List[str]:
        return []

    def get_num_layers_for_type(self, layer_type: str) -> int:
        return 0

    def is_row_parallel(self, layer_type: str) -> bool:
        return False
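
    # Adapter-capable models override the hooks above. Hedged sketch
    # (hypothetical layer names, illustrative only):
    #
    #     @property
    #     def supports_adapter_loading(self) -> bool:
    #         return True
    #
    #     @property
    #     def adapter_layers(self) -> List[str]:
    #         return ["q_proj", "k_proj", "v_proj", "o_proj"]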

    @property
    def max_speculative_tokens(self) -> int:
        return max(
            [
                weights.max_speculative_tokens
                for weights in self.layer_to_adapter_weights.values()
            ],
            default=0,
        )

    def load_adapter(
        self,
        adapter_parameters: AdapterParameters,
        adapter_source: AdapterSource,
        adapter_index: int,
        api_token: str,
        dynamic: bool = True,
    ):
        """Loads adapter weights from disk / host memory on the GPU.

        adapter_id must be `BASE_MODEL_ADAPTER_ID` if the adapter is statically
        loaded into the model. Otherwise, the adapter weights are applied during
        the forward pass and stored separately from the base model parameters.
        """
        if self.target_to_layer is None:
            self.target_to_layer = self.adapter_target_to_layer()
        if adapter_index in self.loaded_adapters:
            # Adapter already loaded
            return

        if not self.supports_adapter_loading:
            raise ValueError("This model does not support adapter loading.")

        # `dynamic_adapter_loading_enabled` is expected to be set by
        # adapter-capable models; it is not defined in this base class.
        if dynamic and not self.dynamic_adapter_loading_enabled:
            raise ValueError(
                f"This model was initialized with the adapter {self.static_adapter_id} "
                f"and therefore does not support dynamic adapter loading. "
                f"Please initialize a new model instance from the base model in "
                f"order to use the dynamic adapter loading feature."
            )

        logger.info(
            f"Loading adapter weights into model: {','.join(adapter_parameters.adapter_ids)}"
        )
        weight_names = tuple([v[0] for v in self.target_to_layer.values()])
        (
            module_map,
            adapter_config,
            adapter_weight_names,
            adapter_tokenizer,
        ) = load_and_merge_adapters(
            self.model_id,
            adapter_parameters,
            adapter_source,
            adapter_index,
            weight_names,
            api_token,
            False,
        )

        unused_weight_names = adapter_weight_names.copy()
        for layer_name in self.adapter_layers:
            adapter_weights = adapter_config.load_batched_adapter_weights(
                self,
                module_map,
                layer_name,
                unused_weight_names,
                dynamic,
            )

            if adapter_weights is None:
                continue

            layer_weights = self.layer_to_adapter_weights[layer_name]
            layer_weights.add_adapter(adapter_index, adapter_weights)

        if len(unused_weight_names) > 0:
            logger.warning(
                f"{','.join(adapter_parameters.adapter_ids)} unused adapter weights: {unused_weight_names}"
            )

        if adapter_tokenizer is not None:
            # `tokenizers` is assumed to be provided by models that track
            # per-adapter tokenizers.
            self.tokenizers.add_tokenizer(adapter_index, adapter_tokenizer)

        self.loaded_adapters.add(adapter_index)
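
    # Hedged usage sketch (hypothetical adapter id and token; only
    # `adapter_ids` is used from AdapterParameters in this module):
    #
    #     model.load_adapter(
    #         adapter_parameters=AdapterParameters(adapter_ids=["my-org/my-lora"]),
    #         adapter_source=adapter_source,
    #         adapter_index=0,
    #         api_token=api_token,
    #         dynamic=True,
    #     )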

    def offload_adapter(
        self,
        adapter_parameters: AdapterParameters,
        adapter_source: AdapterSource,
        adapter_index: int,
    ):
        """Offloads the adapter weights from GPU to CPU or disk."""
        if adapter_index not in self.loaded_adapters:
            # Adapter already offloaded
            return

        if not self.supports_adapter_loading:
            raise ValueError("This model does not support adapter loading.")

        if not self.dynamic_adapter_loading_enabled:
            raise ValueError(
                f"This model was initialized with the adapter {self.static_adapter_id} "
                f"and therefore does not support dynamic adapter loading. "
                f"Please initialize a new model instance from the base model in "
                f"order to use the dynamic adapter loading feature."
            )

        for layer_name in self.adapter_layers:
            if layer_name in self.layer_to_adapter_weights:
                self.layer_to_adapter_weights[layer_name].remove_adapter(adapter_index)

        self.loaded_adapters.remove(adapter_index)
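
    # Offloading mirrors the sketch above (hypothetical values):
    #
    #     model.offload_adapter(
    #         adapter_parameters=AdapterParameters(adapter_ids=["my-org/my-lora"]),
    #         adapter_source=adapter_source,
    #         adapter_index=0,
    #     )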