mirror of https://github.com/huggingface/text-generation-inference.git
synced 2025-09-11 04:14:52 +00:00

Merge branch 'main' into tgi-rocm
commit 7052827be4

.github/workflows/build.yaml (vendored): 38 changed lines
@@ -79,11 +79,6 @@ jobs:
           install: true
       - name: Inject slug/short variables
         uses: rlespinasse/github-slug-action@v4.4.1
-      - name: Install cosign
-        if: github.event_name != 'pull_request'
-        uses: sigstore/cosign-installer@f3c664df7af409cb4873aa5068053ba9d61a57b6 #v2.6.0
-        with:
-          cosign-release: 'v1.13.1'
       - name: Tailscale
         uses: tailscale/github-action@7bd8039bf25c23c4ab1b8d6e2cc2da2280601966
         with:
@@ -150,39 +145,6 @@ jobs:
           labels: ${{ steps.meta.outputs.labels || steps.meta-pr.outputs.labels }}
           cache-from: type=registry,ref=registry.internal.huggingface.tech/api-inference/community/text-generation-inference:cache,mode=min
           cache-to: type=registry,ref=registry.internal.huggingface.tech/api-inference/community/text-generation-inference:cache,mode=min
-      # Sign the resulting Docker image digest except on PRs.
-      # This will only write to the public Rekor transparency log when the Docker
-      # repository is public to avoid leaking data.
-      - name: Sign the published Docker image
-        if: ${{ github.event_name != 'pull_request' }}
-        env:
-          COSIGN_EXPERIMENTAL: "true"
-        # This step uses the identity token to provision an ephemeral certificate
-        # against the sigstore community Fulcio instance.
-        run: echo "${{ steps.meta.outputs.tags }}" | xargs -I {} cosign sign {}@${{ steps.build-and-push.outputs.digest }}
-      - name: Run Trivy in GitHub SBOM mode and submit results to Dependency Graph
-        uses: aquasecurity/trivy-action@master
-        if: ${{ github.event_name != 'pull_request' }}
-        with:
-          image-ref: 'ghcr.io/huggingface/text-generation-inference:sha-${{ env.GITHUB_SHA_SHORT }}'
-          format: 'github'
-          output: 'dependency-results.sbom.json'
-          github-pat: ${{ secrets.GITHUB_TOKEN }}
-          scanners: 'vuln'
-      - name: Run Trivy vulnerability scanner
-        uses: aquasecurity/trivy-action@master
-        if: ${{ github.event_name != 'pull_request' }}
-        with:
-          image-ref: 'ghcr.io/huggingface/text-generation-inference:sha-${{ env.GITHUB_SHA_SHORT }}'
-          format: 'sarif'
-          output: 'trivy-results.sarif'
-          severity: 'CRITICAL'
-          scanners: 'vuln'
-      - name: Upload Trivy scan results to GitHub Security tab
-        uses: github/codeql-action/upload-sarif@v2
-        if: ${{ github.event_name != 'pull_request' }}
-        with:
-          sarif_file: 'trivy-results.sarif'

   build-and-push-image-rocm:
     concurrency:
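The dropped signing step is the keyless-signing tail of the build job: as its comments note, cosign trades the workflow's OIDC identity token for a short-lived certificate from the sigstore Fulcio CA and records the signature in the Rekor transparency log, which is why it is gated off for pull requests. Removing the cosign-installer step in the first hunk keeps the two in sync. A minimal sketch of that flow outside CI, assuming cosign 1.x (the image name and digest are placeholders, not values from this workflow):

```
# Keyless signing sketch for cosign 1.x; IMAGE and DIGEST are hypothetical.
export COSIGN_EXPERIMENTAL=true    # 1.x keeps keyless mode behind this flag
IMAGE=ghcr.io/example/app
DIGEST=sha256:0123abcd...          # digest reported by the image build
# Signing by digest rather than by tag pins the signature to exact content;
# cosign fetches an ephemeral cert (Fulcio) and uploads to the log (Rekor).
cosign sign "${IMAGE}@${DIGEST}"
```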
@@ -62,7 +62,7 @@ For a detailed starting guide, please see the [Quick Tour](https://huggingface.c
 model=tiiuae/falcon-7b-instruct
 volume=$PWD/data # share a volume with the Docker container to avoid downloading weights every run

-docker run --gpus all --shm-size 1g -p 8080:80 -v $volume:/data ghcr.io/huggingface/text-generation-inference:1.1.0 --model-id $model
+docker run --gpus all --shm-size 1g -p 8080:80 -v $volume:/data ghcr.io/huggingface/text-generation-inference:1.1.1 --model-id $model
 ```

 And then you can make requests like
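(The README continues at this point with a sample request. A sketch of such a call against the container started above, assuming TGI's standard `/generate` route; the exact prompt and parameters in the README may differ:)

```
curl 127.0.0.1:8080/generate \
    -X POST \
    -d '{"inputs":"What is Deep Learning?","parameters":{"max_new_tokens":20}}' \
    -H 'Content-Type: application/json'
```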
@@ -106,7 +106,7 @@ model=meta-llama/Llama-2-7b-chat-hf
 volume=$PWD/data # share a volume with the Docker container to avoid downloading weights every run
 token=<your cli READ token>

-docker run --gpus all --shm-size 1g -e HUGGING_FACE_HUB_TOKEN=$token -p 8080:80 -v $volume:/data ghcr.io/huggingface/text-generation-inference:1.1.0 --model-id $model
+docker run --gpus all --shm-size 1g -e HUGGING_FACE_HUB_TOKEN=$token -p 8080:80 -v $volume:/data ghcr.io/huggingface/text-generation-inference:1.1.1 --model-id $model
 ```

 ### A note on Shared Memory (shm)
@@ -82,7 +82,7 @@ impl Infer {
     }

     /// Add a new request to the queue and return a stream of InferStreamResponse
-    #[instrument(skip(self))]
+    #[instrument(skip_all)]
     pub(crate) async fn generate_stream(
         &self,
         request: GenerateRequest,
@@ -133,7 +133,7 @@ impl Infer {
     }

     /// Add a new request to the queue and return a InferResponse
-    #[instrument(skip(self))]
+    #[instrument(skip_all)]
     pub(crate) async fn generate(
         &self,
         request: GenerateRequest,
@@ -214,7 +214,7 @@ impl Infer {
     }
     /// Add best_of new requests to the queue and return a InferResponse of the sequence with
     /// the highest log probability per token
-    #[instrument(skip(self))]
+    #[instrument(skip(self, request))]
     pub(crate) async fn generate_best_of(
         &self,
         request: GenerateRequest,
@@ -69,7 +69,7 @@ impl Validation {
         }
     }

-    #[instrument(skip_all)]
+    #[instrument(skip(self, inputs))]
     async fn validate_input(
         &self,
         inputs: String,
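All four router hunks adjust the `tracing` crate's `#[instrument]` attribute, which by default records every function argument as a span field; `skip(...)` and `skip_all` opt arguments out. Listing arguments explicitly (as in `skip(self, request)` and `skip(self, inputs)`) keeps any argument added later recorded by default, whereas `skip_all` suppresses everything. A minimal standalone sketch of the difference, with hypothetical functions rather than the router's own:

```rust
use tracing::instrument;

// `skip_all`: no arguments become span fields, useful when they are
// large (full prompts) or sensitive.
#[instrument(skip_all)]
fn generate(req: &str) -> usize {
    req.len()
}

// `skip(...)`: only the listed arguments are omitted; `max_len` is
// still recorded as a field on the `validate` span.
#[instrument(skip(req))]
fn validate(req: &str, max_len: usize) -> bool {
    req.len() <= max_len
}

fn main() {
    // A subscriber must be installed for spans to go anywhere
    // (assumes the `tracing-subscriber` crate).
    tracing_subscriber::fmt().init();
    generate("What is Deep Learning?");
    validate("What is Deep Learning?", 64);
}
```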