diff --git a/.github/workflows/autodocs.yml b/.github/workflows/autodocs.yaml
similarity index 100%
rename from .github/workflows/autodocs.yml
rename to .github/workflows/autodocs.yaml
diff --git a/.github/workflows/build.yaml b/.github/workflows/build.yaml
index 991cd76d..22fa06e3 100644
--- a/.github/workflows/build.yaml
+++ b/.github/workflows/build.yaml
@@ -1,46 +1,29 @@
 name: Build and push docker image to internal registry
 on:
-  workflow_dispatch:
-  push:
-    branches:
-      - 'main'
-    tags:
-      - 'v*'
-  pull_request:
-    paths:
-      - ".github/workflows/build.yaml"
-      - "integration-tests/**"
-      - "server/**"
-      - "proto/**"
-      - "router/**"
-      - "launcher/**"
-      - "Cargo.lock"
-      - "rust-toolchain.toml"
-      - "Dockerfile"
-      - "Dockerfile_amd"
-      - "Dockerfile_intel"
-    branches:
-      - 'main'
+  workflow_call:
+    inputs:
+      hardware:
+        type: string
+        description: Hardware
+        # options:
+        # - cuda
+        # - rocm
+        # - intel
+        required: true
 jobs:
-  build-and-push-image:
+  build-and-push:
+    outputs:
+      docker_image: ${{ steps.final.outputs.docker_image }}
+      docker_devices: ${{ steps.final.outputs.docker_devices }}
+      runs_on: ${{ steps.final.outputs.runs_on }}
+      label: ${{ steps.final.outputs.label }}
     concurrency:
-      group: ${{ github.workflow }}-build-and-push-image-${{ matrix.name }}-${{ github.head_ref || github.run_id }}
+      group: ${{ github.workflow }}-build-and-push-image-${{ inputs.hardware }}-${{ github.head_ref || github.run_id }}
       cancel-in-progress: true
+    # TODO see with @Glegendre to get CPU runner here instead
     runs-on: [self-hosted, nvidia-gpu , multi-gpu, 4-a10, ci]
-    strategy:
-      matrix:
-        include:
-          - name: "cuda"
-            label: ""
-            dockerfile: "Dockerfile"
-          - name: "amd"
-            label: "-rocm"
-            dockerfile: "Dockerfile_amd"
-          - name: "intel"
-            label: "-intel"
-            dockerfile: "Dockerfile_intel"
     permissions:
       contents: write
       packages: write
@@ -50,10 +33,43 @@ jobs:
       security-events: write
     steps:
       - name: Checkout repository
-        uses: actions/checkout@v3
-
+        uses: actions/checkout@v4
       - name: Inject slug/short variables
        uses: rlespinasse/github-slug-action@v4.4.1
+      - name: Construct hardware variables
+        shell: bash
+        run: |
+          case ${{ inputs.hardware }} in
+          cuda)
+            export dockerfile="Dockerfile"
+            export label_extension=""
+            export docker_devices=""
+            export runs_on="nvidia-gpu"
+            ;;
+          rocm)
+            export dockerfile="Dockerfile_amd"
+            export label_extension="-rocm"
+            export docker_devices="/dev/kfd,/dev/dri"
+            # TODO Re-enable when they pass.
+            # export runs_on="amd-gpu-tgi"
+            export runs_on="ubuntu-latest"
+            ;;
+          intel)
+            export dockerfile="Dockerfile_intel"
+            export label_extension="-intel"
+            export docker_devices=""
+            export runs_on="ubuntu-latest"
+            ;;
+          esac
+          echo $dockerfile
+          echo "Dockerfile=${dockerfile}"
+          echo $label_extension
+          echo $docker_devices
+          echo $runs_on
+          echo "DOCKERFILE=${dockerfile}" >> $GITHUB_ENV
+          echo "LABEL=${label_extension}" >> $GITHUB_ENV
+          echo "DOCKER_DEVICES=${docker_devices}" >> $GITHUB_ENV
+          echo "RUNS_ON=${runs_on}" >> $GITHUB_ENV
       - name: Tailscale
         uses: huggingface/tailscale-action@main
         with:
@@ -61,25 +77,25 @@ jobs:
           slackChannel: ${{ secrets.SLACK_CIFEEDBACK_CHANNEL }}
           slackToken: ${{ secrets.SLACK_CIFEEDBACK_BOT_TOKEN }}
       - name: Initialize Docker Buildx
-        uses: docker/setup-buildx-action@v2.0.0
+        uses: docker/setup-buildx-action@v3
         with:
           install: true
       - name: Login to GitHub Container Registry
         if: github.event_name != 'pull_request'
-        uses: docker/login-action@v2
+        uses: docker/login-action@v3
         with:
           registry: ghcr.io
           username: ${{ github.actor }}
           password: ${{ secrets.GITHUB_TOKEN }}
       - name: Login to internal Container Registry
-        uses: docker/login-action@v2.1.0
+        uses: docker/login-action@v3
         with:
           username: ${{ secrets.TAILSCALE_DOCKER_USERNAME }}
           password: ${{ secrets.TAILSCALE_DOCKER_PASSWORD }}
           registry: registry.internal.huggingface.tech
       - name: Login to Azure Container Registry
         if: github.event_name != 'pull_request'
-        uses: docker/login-action@v2.1.0
+        uses: docker/login-action@v3
         with:
           username: ${{ secrets.AZURE_DOCKER_USERNAME }}
           password: ${{ secrets.AZURE_DOCKER_PASSWORD }}
@@ -88,12 +104,12 @@ jobs:
       - name: Extract metadata (tags, labels) for Docker
         if: ${{ github.event_name == 'pull_request' }}
         id: meta-pr
-        uses: docker/metadata-action@v4.3.0
+        uses: docker/metadata-action@v5
        with:
           images: |
             registry.internal.huggingface.tech/api-inference/community/text-generation-inference
           tags: |
-            type=raw,value=sha-${{ env.GITHUB_SHA_SHORT }}${{ matrix.label }}
+            type=raw,value=sha-${{ env.GITHUB_SHA_SHORT }}${{ env.LABEL }}
       # If main, release or tag
       - name: Extract metadata (tags, labels) for Docker
         if: ${{ github.event_name != 'pull_request' }}
@@ -107,44 +123,61 @@ jobs:
             ghcr.io/huggingface/text-generation-inference
             db4c2190dd824d1f950f5d1555fbadf0.azurecr.io/text-generation-inference
           tags: |
-            type=semver,pattern={{version}}${{ matrix.label }}
-            type=semver,pattern={{major}}.{{minor}}${{ matrix.label }}
-            type=raw,value=latest${{ matrix.label }},enable=${{ github.ref == format('refs/heads/{0}', github.event.repository.default_branch) }}
-            type=raw,value=sha-${{ env.GITHUB_SHA_SHORT }}${{ matrix.label }}
+            type=semver,pattern={{version}}${{ env.LABEL }}
+            type=semver,pattern={{major}}.{{minor}}${{ env.LABEL }}
+            type=raw,value=latest${{ env.LABEL }},enable=${{ github.ref == format('refs/heads/{0}', github.event.repository.default_branch) }}
+            type=raw,value=sha-${{ env.GITHUB_SHA_SHORT }}${{ env.LABEL }}
       - name: Build and push Docker image
         id: build-and-push
         uses: docker/build-push-action@v4
         with:
           context: .
-          file: ${{ matrix.dockerfile }}
+          file: ${{ env.DOCKERFILE }}
           push: true
           platforms: 'linux/amd64'
           build-args: |
             GIT_SHA=${{ env.GITHUB_SHA }}
-            DOCKER_LABEL=sha-${{ env.GITHUB_SHA_SHORT }}${{ matrix.label }}
+            DOCKER_LABEL=sha-${{ env.GITHUB_SHA_SHORT }}${{ env.LABEL }}
           tags: ${{ steps.meta.outputs.tags || steps.meta-pr.outputs.tags }}
           labels: ${{ steps.meta.outputs.labels || steps.meta-pr.outputs.labels }}
-          network: host
-          cache-from: type=registry,ref=registry.internal.huggingface.tech/api-inference/community/text-generation-inference:cache${{ matrix.label }},mode=min
-          cache-to: type=registry,ref=registry.internal.huggingface.tech/api-inference/community/text-generation-inference:cache${{ matrix.label }},mode=min
+          cache-from: type=registry,ref=registry.internal.huggingface.tech/api-inference/community/text-generation-inference:cache${{ env.LABEL }},mode=min
+          cache-to: type=registry,ref=registry.internal.huggingface.tech/api-inference/community/text-generation-inference:cache${{ env.LABEL }},mode=min
+      - name: Final
+        id: final
+        run: |
+          echo "docker_image=registry.internal.huggingface.tech/api-inference/community/text-generation-inference:sha-${{ env.GITHUB_SHA_SHORT }}${{ env.LABEL }}" >> "$GITHUB_OUTPUT"
+          echo "docker_devices=${{ env.DOCKER_DEVICES }}" >> "$GITHUB_OUTPUT"
+          echo "runs_on=${{ env.RUNS_ON }}" >> "$GITHUB_OUTPUT"
+          echo "label=${{ env.LABEL }}" >> "$GITHUB_OUTPUT"
+  integration_tests:
+    concurrency:
+      group: ${{ github.workflow }}-${{ github.job }}-${{ needs.build-and-push.outputs.label }}-${{ github.head_ref || github.run_id }}
+      cancel-in-progress: true
+    needs: build-and-push
+    runs-on: ["self-hosted", "${{ needs.build-and-push.outputs.runs_on }}", "multi-gpu"]
+    if: needs.build-and-push.outputs.runs_on != 'ubuntu-latest'
+    steps:
+      - name: Checkout repository
+        uses: actions/checkout@v4
+      - name: Inject slug/short variables
+        uses: rlespinasse/github-slug-action@v4.4.1
       - name: Set up Python
-        if: matrix.name == 'cuda'
         uses: actions/setup-python@v4
         with:
-          python-version: 3.9
+          python-version: "3.10"
       - name: Install
-        if: matrix.name == 'cuda'
         run: |
           make install-integration-tests
+      - name: Tailscale
+        uses: huggingface/tailscale-action@main
+        if: needs.build-and-push.outputs.runs_on != 'amd-gpu-tgi'
+        with:
+          authkey: ${{ secrets.TAILSCALE_AUTHKEY }}
       - name: Run tests
-        if: matrix.name == 'cuda'
         run: |
           export DOCKER_VOLUME=/mnt/cache
-          export DOCKER_IMAGE=registry.internal.huggingface.tech/api-inference/community/text-generation-inference:sha-${{ env.GITHUB_SHA_SHORT }}
-          export HUGGING_FACE_HUB_TOKEN=${{ secrets.HF_TOKEN }}
+          export DOCKER_IMAGE=${{ needs.build-and-push.outputs.docker_image }}
+          export DOCKER_DEVICES=${{ needs.build-and-push.outputs.docker_devices }}
+          export HUGGING_FACE_HUB_TOKEN=${{ secrets.HUGGING_FACE_HUB_TOKEN }}
+          echo $DOCKER_IMAGE
           pytest -s -vv integration-tests
-      - name: Tailscale Wait
-        if: ${{ failure() || runner.debug == '1' }}
-        uses: huggingface/tailscale-action@main
-        with:
-          waitForSSH: true
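Note on the plumbing above: the `Construct hardware variables` step works by appending `KEY=value` lines to the file named by `$GITHUB_ENV` (making them environment variables for later steps), and the `Final` step writes to `$GITHUB_OUTPUT` to feed the job-level `outputs` consumed by `integration_tests`. A minimal Python sketch of the same mapping, with a hypothetical `HARDWARE` variable standing in for `inputs.hardware`:

```python
import os

# Hypothetical mirror of the shell `case` block:
# hardware -> (dockerfile, label, docker devices, runner label).
CONFIGS = {
    "cuda": ("Dockerfile", "", "", "nvidia-gpu"),
    "rocm": ("Dockerfile_amd", "-rocm", "/dev/kfd,/dev/dri", "ubuntu-latest"),
    "intel": ("Dockerfile_intel", "-intel", "", "ubuntu-latest"),
}

dockerfile, label, devices, runs_on = CONFIGS[os.environ["HARDWARE"]]

# Appending KEY=value lines to $GITHUB_ENV exposes them to later steps;
# the same format written to $GITHUB_OUTPUT becomes `steps.<id>.outputs.*`.
with open(os.environ["GITHUB_ENV"], "a") as fh:
    fh.write(f"DOCKERFILE={dockerfile}\n")
    fh.write(f"LABEL={label}\n")
    fh.write(f"DOCKER_DEVICES={devices}\n")
    fh.write(f"RUNS_ON={runs_on}\n")
```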
diff --git a/.github/workflows/build_documentation.yml b/.github/workflows/build_documentation.yaml
similarity index 100%
rename from .github/workflows/build_documentation.yml
rename to .github/workflows/build_documentation.yaml
diff --git a/.github/workflows/build_pr_documentation.yml b/.github/workflows/build_pr_documentation.yaml
similarity index 95%
rename from .github/workflows/build_pr_documentation.yml
rename to .github/workflows/build_pr_documentation.yaml
index a5ce39a5..bf03bfdf 100644
--- a/.github/workflows/build_pr_documentation.yml
+++ b/.github/workflows/build_pr_documentation.yaml
@@ -11,7 +11,7 @@ concurrency:

 jobs:
   build:
-    uses: huggingface/doc-builder/.github/workflows/build_pr_documentation.yml@main
+    uses: huggingface/doc-builder/.github/workflows/build_pr_documentation.yaml@main
     with:
       commit_sha: ${{ github.event.pull_request.head.sha }}
       pr_number: ${{ github.event.number }}
diff --git a/.github/workflows/ci_build.yaml b/.github/workflows/ci_build.yaml
new file mode 100644
index 00000000..754c4850
--- /dev/null
+++ b/.github/workflows/ci_build.yaml
@@ -0,0 +1,36 @@
+name: CI build
+
+on:
+  push:
+    branches:
+      - 'main'
+    tags:
+      - 'v*'
+  pull_request:
+    paths:
+      - ".github/workflows/build.yaml"
+      - "integration-tests/**"
+      - "server/**"
+      - "proto/**"
+      - "router/**"
+      - "launcher/**"
+      - "Cargo.lock"
+      - "rust-toolchain.toml"
+      - "Dockerfile"
+      - "Dockerfile_amd"
+      - "Dockerfile_intel"
+    branches:
+      - 'main'
+
+jobs:
+  build:
+    strategy:
+      # super important if you want to see all results, even if one fails
+      # fail-fast is true by default
+      fail-fast: false
+      matrix:
+        hardware: ["cuda", "rocm", "intel"]
+    uses: ./.github/workflows/build.yaml # calls the one above ^
+    with:
+      hardware: ${{ matrix.hardware }}
+    secrets: inherit
diff --git a/.github/workflows/integration_tests.yaml b/.github/workflows/integration_tests.yaml
new file mode 100644
index 00000000..4e111afe
--- /dev/null
+++ b/.github/workflows/integration_tests.yaml
@@ -0,0 +1,41 @@
+name: Integration tests
+
+on:
+  workflow_call:
+    inputs:
+      docker_image:
+        type: string
+        description: Docker image to run the tests against
+        required: true
+      docker_devices:
+        type: string
+        description: Device files to expose to the container
+      runs_on:
+        type: string
+        required: true
+        description: Hardware to run integration tests on
+jobs:
+  integration_tests:
+    concurrency:
+      group: ${{ github.workflow }}-${{ github.job }}-${{ github.head_ref || github.run_id }}
+      cancel-in-progress: true
+    runs-on: ${{ inputs.runs_on }}
+    steps:
+      - name: Checkout repository
+        uses: actions/checkout@v4
+      - name: Inject slug/short variables
+        uses: rlespinasse/github-slug-action@v4.4.1
+      - name: Set up Python
+        uses: actions/setup-python@v4
+        with:
+          python-version: 3.9
+      - name: Install
+        run: |
+          make install-integration-tests
+      - name: Run tests
+        run: |
+          export DOCKER_VOLUME=/mnt/cache
+          export DOCKER_IMAGE=${{ inputs.docker_image }}
+          export DOCKER_DEVICES=${{ inputs.docker_devices }}
+          export HUGGING_FACE_HUB_TOKEN=${{ secrets.HUGGING_FACE_HUB_TOKEN }}
+          pytest -s -vv integration-tests
diff --git a/.github/workflows/stale.yml b/.github/workflows/stale.yaml
similarity index 100%
rename from .github/workflows/stale.yml
rename to .github/workflows/stale.yaml
diff --git a/.github/workflows/tests.yaml b/.github/workflows/tests.yaml
index d5ad9da3..eb5a4657 100644
--- a/.github/workflows/tests.yaml
+++ b/.github/workflows/tests.yaml
@@ -33,8 +33,8 @@ jobs:
       - name: Install Rust
         uses: actions-rs/toolchain@v1
         with:
-          # Released on: June 13, 2024
-          # https://releases.rs/docs/1.79.0/
-          toolchain: 1.79.0
+          # Released on: 02 May, 2024
+          # https://releases.rs/docs/1.78.0/
+          toolchain: 1.78.0
           override: true
           components: rustfmt, clippy
diff --git a/.github/workflows/trufflehog.yml b/.github/workflows/trufflehog.yaml
similarity index 100%
rename from .github/workflows/trufflehog.yml
rename to .github/workflows/trufflehog.yaml
diff --git a/.github/workflows/upload_pr_documentation.yml b/.github/workflows/upload_pr_documentation.yaml
similarity index 100%
rename from .github/workflows/upload_pr_documentation.yml
rename to .github/workflows/upload_pr_documentation.yaml
diff --git a/Dockerfile b/Dockerfile
index 1444396e..58df06e9 100644
--- a/Dockerfile
+++ b/Dockerfile
@@ -5,6 +5,7 @@ WORKDIR /usr/src
 ARG CARGO_REGISTRIES_CRATES_IO_PROTOCOL=sparse

 FROM chef as planner
+COPY Cargo.lock Cargo.lock
 COPY Cargo.toml Cargo.toml
 COPY rust-toolchain.toml rust-toolchain.toml
 COPY proto proto
diff --git a/Dockerfile_amd b/Dockerfile_amd
index 55da9204..e325bb65 100644
--- a/Dockerfile_amd
+++ b/Dockerfile_amd
@@ -5,6 +5,7 @@ WORKDIR /usr/src
 ARG CARGO_REGISTRIES_CRATES_IO_PROTOCOL=sparse

 FROM chef as planner
+COPY Cargo.lock Cargo.lock
 COPY Cargo.toml Cargo.toml
 COPY rust-toolchain.toml rust-toolchain.toml
 COPY proto proto
diff --git a/Dockerfile_intel b/Dockerfile_intel
index 35362fc9..131f49ba 100644
--- a/Dockerfile_intel
+++ b/Dockerfile_intel
@@ -4,6 +4,7 @@ WORKDIR /usr/src
 ARG CARGO_REGISTRIES_CRATES_IO_PROTOCOL=sparse

 FROM chef as planner
+COPY Cargo.lock Cargo.lock
 COPY Cargo.toml Cargo.toml
 COPY rust-toolchain.toml rust-toolchain.toml
 COPY proto proto
diff --git a/integration-tests/conftest.py b/integration-tests/conftest.py
index 2ef85da6..0b239484 100644
--- a/integration-tests/conftest.py
+++ b/integration-tests/conftest.py
@@ -34,6 +34,7 @@ from text_generation.types import (
 DOCKER_IMAGE = os.getenv("DOCKER_IMAGE", None)
 HUGGING_FACE_HUB_TOKEN = os.getenv("HUGGING_FACE_HUB_TOKEN", None)
 DOCKER_VOLUME = os.getenv("DOCKER_VOLUME", "/data")
+DOCKER_DEVICES = os.getenv("DOCKER_DEVICES")


 class ResponseComparator(JSONSnapshotExtension):
@@ -453,6 +454,18 @@ def launcher(event_loop):
         if DOCKER_VOLUME:
             volumes = [f"{DOCKER_VOLUME}:/data"]

+        if DOCKER_DEVICES:
+            devices = DOCKER_DEVICES.split(",")
+            visible = os.getenv("ROCR_VISIBLE_DEVICES")
+            if visible:
+                env["ROCR_VISIBLE_DEVICES"] = visible
+            device_requests = []
+        else:
+            devices = []
+            device_requests = [
+                docker.types.DeviceRequest(count=gpu_count, capabilities=[["gpu"]])
+            ]
+
         container = client.containers.run(
             DOCKER_IMAGE,
             command=args,
@@ -460,9 +473,8 @@ def launcher(event_loop):
             environment=env,
             auto_remove=False,
             detach=True,
-            device_requests=[
-                docker.types.DeviceRequest(count=gpu_count, capabilities=[["gpu"]])
-            ],
+            device_requests=device_requests,
+            devices=devices,
             volumes=volumes,
             ports={"80/tcp": port},
             shm_size="1G",
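The conftest change above switches between two docker-py mechanisms: ROCm exposes plain device files via `devices=`, while NVIDIA GPUs are requested from the runtime via `device_requests=`. A minimal standalone sketch of the same branching, assuming docker-py is installed and using a placeholder image and command:

```python
import os
import docker

client = docker.from_env()
docker_devices = os.getenv("DOCKER_DEVICES")  # e.g. "/dev/kfd,/dev/dri" on ROCm

if docker_devices:
    # ROCm path: bind the kernel device files directly; no GPU device request.
    devices = docker_devices.split(",")
    device_requests = []
else:
    # CUDA path: ask the NVIDIA runtime for a GPU via a device request.
    devices = []
    device_requests = [docker.types.DeviceRequest(count=1, capabilities=[["gpu"]])]

container = client.containers.run(
    "ubuntu:22.04",        # placeholder image for illustration
    command="sleep infinity",
    devices=devices,
    device_requests=device_requests,
    detach=True,
)
```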
diff --git a/server/text_generation_server/cli.py b/server/text_generation_server/cli.py
index 892943ba..41744a4d 100644
--- a/server/text_generation_server/cli.py
+++ b/server/text_generation_server/cli.py
@@ -340,7 +340,7 @@ def quantize(
         logger_level=logger_level,
         json_output=json_output,
     )
-    from text_generation_server.utils.gptq.quantize import quantize
+    from text_generation_server.layers.gptq.quantize import quantize

     quantize(
         model_id=model_id,
diff --git a/server/text_generation_server/layers/gptq/quantize.py b/server/text_generation_server/layers/gptq/quantize.py
index ca113d8f..8d029817 100644
--- a/server/text_generation_server/layers/gptq/quantize.py
+++ b/server/text_generation_server/layers/gptq/quantize.py
@@ -12,7 +12,7 @@ from huggingface_hub import HfApi
 from accelerate import init_empty_weights
 from text_generation_server.utils import initialize_torch_distributed, Weights
 from text_generation_server.utils.hub import weight_files
-from text_generation_server.utils.gptq.quant_linear import QuantLinear
+from text_generation_server.layers.gptq.quant_linear import QuantLinear
 from loguru import logger
 from typing import Optional

diff --git a/server/text_generation_server/models/custom_modeling/flash_qwen2_modeling.py b/server/text_generation_server/models/custom_modeling/flash_qwen2_modeling.py
index 5bddb69d..69f38c3a 100644
--- a/server/text_generation_server/models/custom_modeling/flash_qwen2_modeling.py
+++ b/server/text_generation_server/models/custom_modeling/flash_qwen2_modeling.py
@@ -40,31 +40,12 @@ def _load_gqa(config, prefix: str, weights):
     assert config.hidden_size % config.num_attention_heads == 0
     assert config.num_attention_heads % weights.process_group.size() == 0

-    weight = weights.get_multi_weights_col(
+    return TensorParallelColumnLinear.load_multi(
+        config,
         prefixes=[f"{prefix}.q_proj", f"{prefix}.k_proj", f"{prefix}.v_proj"],
-        quantize=config.quantize,
         dim=0,
-    )
-
-    if config.quantize not in ["gptq", "awq", "marlin"]:
-        weight = weight.to(dtype=weights.dtype).to(device=weights.device)
-
-    head_size = config.hidden_size // config.num_attention_heads
-    num_heads = config.num_attention_heads // weights.process_group.size()
-    num_key_value_heads = config.num_key_value_heads // weights.process_group.size()
-    assert list(weight.shape) == [
-        (num_heads + 2 * num_key_value_heads) * head_size,
-        config.hidden_size,
-    ], f"{list(weight.shape)} != {[(num_heads + 2 * config.num_key_value_heads) * head_size, config.hidden_size]}"
-
-    w = [
-        weights.get_sharded(f"{p}.bias", dim=0)
-        for p in [f"{prefix}.q_proj", f"{prefix}.k_proj", f"{prefix}.v_proj"]
-    ]
-    bias = torch.cat(w, dim=0).to(dtype=weights.dtype).to(device=weights.device)
-
-    return TensorParallelColumnLinear(
-        get_linear(weight, bias=bias, quantize=config.quantize)
+        weights=weights,
+        bias=True,
     )
diff --git a/server/text_generation_server/models/globals.py b/server/text_generation_server/models/globals.py
index 11eece48..cc2f172a 100644
--- a/server/text_generation_server/models/globals.py
+++ b/server/text_generation_server/models/globals.py
@@ -16,6 +16,13 @@ if cuda_graphs is not None:
 else:
     cuda_graphs = None

+
+# Sorting the cuda graphs in descending order helps reduce their
+# memory impact, resulting in lower overall memory usage.
+if cuda_graphs is not None:
+    cuda_graphs.sort(reverse=True)
+
+
 CUDA_GRAPHS = cuda_graphs

 # This is overridden at model loading.
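For context, `cuda_graphs` in `globals.py` comes from parsing the `CUDA_GRAPHS` environment variable earlier in the module; sorting it in descending order means the largest batch-size graph is captured first. A rough, self-contained sketch of that flow (the parsing shown is an assumption for illustration, not the exact upstream code):

```python
import os

# Assumed format: CUDA_GRAPHS is a comma-separated list of batch sizes.
raw = os.getenv("CUDA_GRAPHS", "1,2,4,8")
cuda_graphs = [int(x) for x in raw.split(",") if x.strip()]

# Capture the largest graph first (descending order), per the change above.
cuda_graphs.sort(reverse=True)

CUDA_GRAPHS = cuda_graphs
print(CUDA_GRAPHS)  # e.g. [8, 4, 2, 1]
```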
diff --git a/server/text_generation_server/utils/weights.py b/server/text_generation_server/utils/weights.py
index 45cfc073..e6142525 100644
--- a/server/text_generation_server/utils/weights.py
+++ b/server/text_generation_server/utils/weights.py
@@ -130,29 +130,57 @@ class Weights:
             ), f"The choosen size {size} is not compatible with sharding on {world_size} shards"
         return self.get_partial_sharded(tensor_name, dim)

-    def _get_qweight(self, name: str, block_sizes: Union[int, List[int]]):
-        slice_ = self._get_slice(name)
-        total_size = slice_.get_shape()[1]
+    def get_packed_sharded(
+        self, tensor_name: str, dim: int, block_sizes: Union[int, List[int]]
+    ) -> torch.Tensor:
+        """
+        Get a shard from a tensor that packs multiple tensors.
+
+        When a tensor packs multiple tensors (such as QKV or an up
+        projection + gate projection), sharding with `get_sharded` is not
+        safe since it would not split the packed tensors across shards.
+
+        This method shards a tensor, such that the packed tensors are
+        split across shards.
+
+        The columns are split into equally sized blocks when `block_sizes` is
+        an `int`, or into blocks proportional to the given sizes. For instance,
+        `[2, 1, 1]` will divide an input with dimensionality `1024` into
+        `[512, 256, 256]`. This is convenient for e.g. splitting QKV without
+        knowing the storage details of quantized weights.
+        """
+        slice_ = self._get_slice(tensor_name)
+        total_size = slice_.get_shape()[dim]
         block_sizes = _blocks_to_block_sizes(total_size=total_size, blocks=block_sizes)

         world_size = self.process_group.size()
         rank = self.process_group.rank()

-        weights = []
+        tensors = []
         block_offset = 0
         for block_size in block_sizes:
             assert (
                 block_size % world_size == 0
-            ), f"Prepacked qkv cannot be sharded across {world_size} shards"
+            ), f"Prepacked tensor cannot be sharded across {world_size} shards"
             shard_block_size = block_size // world_size
             start = rank * shard_block_size
             stop = (rank + 1) * shard_block_size
-            weights.append(slice_[:, block_offset + start : block_offset + stop])
+            if dim == 0:
+                tensor = slice_[block_offset + start : block_offset + stop]
+            elif dim == 1:
+                tensor = slice_[:, block_offset + start : block_offset + stop]
+            else:
+                raise NotImplementedError("Currently only dim=0 or dim=1 is supported")
+            tensors.append(tensor)
             block_offset += block_size
+        tensor = torch.cat(tensors, dim=dim)
+        tensor = tensor.to(device=self.device)

-        weight = torch.cat(weights, dim=1)
-        weight = weight.to(device=self.device)
-        return weight
+        # Avoid casting quantizer dtypes.
+        if tensor.dtype not in [torch.int16, torch.int32, torch.int64]:
+            tensor = tensor.to(dtype=self.dtype)
+
+        return tensor

     def get_weights_col_packed_qkv(
         self,
@@ -185,7 +213,9 @@ class Weights:
         from text_generation_server.layers.gptq import GPTQWeight

         try:
-            qweight = self._get_qweight(f"{prefix}.qweight", block_sizes)
+            qweight = self.get_packed_sharded(
+                f"{prefix}.qweight", dim=1, block_sizes=block_sizes
+            )
         except RuntimeError:
             raise RuntimeError(
                 f"Cannot load `{quantize}` weight, make sure the model is already quantized."
@@ -193,8 +223,12 @@ class Weights:

         gptq_params = self._get_gptq_params()

-        qzeros = self._get_qweight(f"{prefix}.qzeros", block_sizes)
-        scales = self._get_qweight(f"{prefix}.scales", block_sizes)
+        qzeros = self.get_packed_sharded(
+            f"{prefix}.qzeros", dim=1, block_sizes=block_sizes
+        )
+        scales = self.get_packed_sharded(
+            f"{prefix}.scales", dim=1, block_sizes=block_sizes
+        )
         scales = scales.to(dtype=self.dtype)

         if quantize == "gptq" and gptq_params.quant_method == "gptq":
@@ -237,13 +271,17 @@ class Weights:
             if quant_method == "gptq":
                 gptq_params = self._get_gptq_params()
                 try:
-                    qweight = self._get_qweight(f"{prefix}.qweight", block_sizes)
+                    qweight = self.get_packed_sharded(
+                        f"{prefix}.qweight", dim=1, block_sizes=block_sizes
+                    )
                 except RuntimeError:
                     raise RuntimeError(
                         f"Cannot load `{quantize}` weight for GPTQ -> Marlin repacking, make sure the model is already quantized"
                     )

-                scales = self._get_qweight(f"{prefix}.scales", block_sizes)
+                scales = self.get_packed_sharded(
+                    f"{prefix}.scales", dim=1, block_sizes=block_sizes
+                )
                 g_idx = self.get_tensor(f"{prefix}.g_idx")
                 weight = repack_gptq_for_marlin(
                     qweight=qweight,
@@ -257,34 +295,17 @@ class Weights:
                 )

             else:
-                B = self._get_qweight(f"{prefix}.B", block_sizes)
-                s = self._get_qweight(f"{prefix}.s", block_sizes)
+                B = self.get_packed_sharded(
+                    f"{prefix}.B", dim=1, block_sizes=block_sizes
+                )
+                s = self.get_packed_sharded(
+                    f"{prefix}.s", dim=1, block_sizes=block_sizes
+                )
                 weight = MarlinWeight(B=B, s=s)
         else:
-            slice_ = self._get_slice(f"{prefix}.weight")
-            total_size = slice_.get_shape()[0]
-            block_sizes = _blocks_to_block_sizes(
-                total_size=total_size, blocks=block_sizes
+            weight = self.get_packed_sharded(
+                f"{prefix}.weight", dim=0, block_sizes=block_sizes
             )
-
-            world_size = self.process_group.size()
-            rank = self.process_group.rank()
-
-            tensors = []
-            block_offset = 0
-            for block_size in block_sizes:
-                assert (
-                    block_size % world_size == 0
-                ), f"Prepacked weights cannot be sharded across {world_size} shards"
-                shard_block_size = block_size // world_size
-                start = rank * shard_block_size
-                stop = (rank + 1) * shard_block_size
-                tensor = slice_[block_offset + start : block_offset + stop]
-                tensors.append(tensor)
-                block_offset += block_size
-            weight = torch.cat(tensors, dim=0)
-            weight = weight.to(device=self.device)
-            weight = weight.to(dtype=self.dtype)
         return weight

     def get_weights_col(self, prefix: str, quantize: str):
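To make the block arithmetic in `get_packed_sharded` concrete, here is a plain-`torch` sketch of the same slicing. The `blocks_to_block_sizes` helper below is an illustrative reimplementation under stated assumptions, not the module's actual `_blocks_to_block_sizes`:

```python
import torch

def blocks_to_block_sizes(total_size, blocks):
    # Illustrative stand-in for `_blocks_to_block_sizes`: an int means
    # `blocks` equal chunks; a list means chunks proportional to the sizes.
    if isinstance(blocks, int):
        assert total_size % blocks == 0
        return [total_size // blocks] * blocks
    unit = total_size // sum(blocks)
    return [unit * block for block in blocks]

def packed_shard(tensor, dim, block_sizes, world_size, rank):
    # Plain-tensor version of the sharding above: slice each packed block
    # independently so every rank keeps its own Q, K and V columns.
    sizes = blocks_to_block_sizes(tensor.shape[dim], block_sizes)
    pieces, offset = [], 0
    for size in sizes:
        assert size % world_size == 0
        shard = size // world_size
        pieces.append(tensor.narrow(dim, offset + rank * shard, shard))
        offset += size
    return torch.cat(pieces, dim=dim)

# A fused QKV weight: 512 query columns plus 256 key and 256 value columns.
qkv = torch.randn(4, 1024)
shard0 = packed_shard(qkv, dim=1, block_sizes=[2, 1, 1], world_size=2, rank=0)
assert shard0.shape == (4, 512)  # 256 (Q) + 128 (K) + 128 (V) per rank
```

Naive sharding of the same tensor with a plain `get_sharded` split would give rank 0 all of Q and half of K while rank 1 got the rest, which is why the per-block slicing above is required for packed tensors.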