Patch summary (free text before the first "diff --git" header is skipped by patch tools):
  * .github/workflows/build.yaml: adds "flavor: latest=auto", adds the ghcr.io and
    azurecr.io image names next to the internal registry, replaces the
    "llama-sha-<short sha>" raw tag with semver tags, a "latest" tag enabled only on
    the default branch, and a "sha-<short sha>" tag; pushes only when the event is
    not a pull_request.
  * flash_llama_modeling.py: removes two comment lines about einsum; no code change.
NOTE(review): this patch was recovered from a single-line copy; leading whitespace
inside the hunks was reconstructed from YAML/Python nesting. Verify the indentation
against the target files before applying.

diff --git a/.github/workflows/build.yaml b/.github/workflows/build.yaml
index eb8d4103..a94a33dc 100644
--- a/.github/workflows/build.yaml
+++ b/.github/workflows/build.yaml
@@ -62,16 +62,23 @@ jobs:
         id: meta
         uses: docker/metadata-action@v4.3.0
         with:
+          flavor: |
+            latest=auto
           images: |
+            ghcr.io/huggingface/text-generation-inference
             registry.internal.huggingface.tech/api-inference/community/text-generation-inference
+            db4c2190dd824d1f950f5d1555fbadf0.azurecr.io/text-generation-inference
           tags: |
-            type=raw,value=llama-sha-${{ env.GITHUB_SHA_SHORT }}
+            type=semver,pattern={{version}}
+            type=semver,pattern={{major}}.{{minor}}
+            type=raw,value=latest,enable=${{ github.ref == format('refs/heads/{0}', github.event.repository.default_branch) }}
+            type=raw,value=sha-${{ env.GITHUB_SHA_SHORT }}
       - name: Build and push Docker image
         uses: docker/build-push-action@v2
         with:
           context: .
           file: Dockerfile
-          push: true
+          push: ${{ github.event_name != 'pull_request' }}
           platforms: 'linux/amd64'
           tags: ${{ steps.meta.outputs.tags }}
           labels: ${{ steps.meta.outputs.labels }}
diff --git a/server/text_generation_server/models/custom_modeling/flash_llama_modeling.py b/server/text_generation_server/models/custom_modeling/flash_llama_modeling.py
index e5c09cbe..228529cc 100644
--- a/server/text_generation_server/models/custom_modeling/flash_llama_modeling.py
+++ b/server/text_generation_server/models/custom_modeling/flash_llama_modeling.py
@@ -233,8 +233,6 @@ class PositionRotaryEmbedding(RotaryEmbedding):
         ):
             self._seq_len_cached = seqlen
             t = torch.arange(seqlen, device=device, dtype=self.inv_freq.dtype)
-            # Don't do einsum, it converts fp32 to fp16
-            # freqs = torch.einsum("i,j->ij", t, self.inv_freq)
             freqs = torch.outer(t, self.inv_freq.to(device=t.device))
             self._cos_cached = torch.cos(freqs).to(dtype)
             self._sin_cached = torch.sin(freqs).to(dtype)