Remove the useless modifications
Signed-off-by: yuanwu <yuan.wu@intel.com>
commit eaeef6e7a4 (parent 15de6c9195)

README.md (10 changed lines)
@@ -22,22 +22,12 @@ limitations under the License.

- [Table of contents](#table-of-contents)
- [Tested Models and Configurations](#tested-models-and-configurations)
- [Running TGI on Gaudi](#running-tgi-on-gaudi)
- [TGI-Gaudi Benchmark](#tgi-gaudi-benchmark)
- [Static Batching Benchmark](#static-batching-benchmark)
- [Continuous Batching Benchmark](#continuous-batching-benchmark)
- [Tested Models and Configurations](#tested-models-and-configurations)
- [Running TGI with BF16 Precision](#running-tgi-with-bf16-precision)
- [Llama2-7B on 1 Card](#llama2-7b-on-1-card)
- [Llama2-70B on 8 cards](#llama2-70b-on-8-cards)
- [Llama3.1-8B on 1 card](#llama31-8b-on-1-card)
- [Llama3.1-70B 8 cards](#llama31-70b-8-cards)
- [Llava-v1.6-Mistral-7B on 1 card](#llava-v16-mistral-7b-on-1-card)
- [Running TGI with FP8 Precision](#running-tgi-with-fp8-precision)
- [TGI-Gaudi Benchmark](#tgi-gaudi-benchmark)
- [Adjusting TGI Parameters](#adjusting-tgi-parameters)
- [Environment Variables](#environment-variables)
- [Profiler](#profiler)
- [License](#license)

## Tested Models and Configurations
@@ -104,14 +104,6 @@ class TextGenerationService(generate_pb2_grpc.TextGenerationServiceServicer):

        max_supported_total_tokens = self.model.warmup(request)
        return generate_pb2.WarmupResponse(max_supported_total_tokens=max_supported_total_tokens)
        # else:
        #     batch = self.model.batch_type.from_pb(
        #         request.batch, self.model.tokenizer, self.model.dtype, self.model.device
        #     )
        #     max_supported_total_tokens = self.model.warmup(batch)
        #     return generate_pb2.WarmupResponse(max_supported_total_tokens=max_supported_total_tokens)

    async def Prefill(self, request, context):
        start = time.time_ns()
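For context, the hunk above sits inside TGI's gRPC servicer, where `Warmup` hands the incoming request to the model and reports back the largest total-token budget the device can support. The sketch below is a minimal, runnable approximation of that shape only, not the actual server: `DummyModel`, the plain `WarmupResponse` dataclass, and the `4096` token value are invented stand-ins for illustration; only the names visible in the diff (`TextGenerationService`, `Warmup`, `Prefill`, `max_supported_total_tokens`) are taken from it.

```python
# Minimal sketch of the servicer shape shown in the diff (assumptions marked below).
import asyncio
import time
from dataclasses import dataclass


@dataclass
class WarmupResponse:
    # Stand-in for generate_pb2.WarmupResponse; the real one is a protobuf message.
    max_supported_total_tokens: int


class DummyModel:
    """Hypothetical placeholder for the backend model.

    The real warmup() probes the device with dummy batches and returns how many
    total tokens it can hold; here we just return a fixed illustrative value.
    """

    def warmup(self, request) -> int:
        return 4096  # illustrative value only


class TextGenerationService:
    # The real class extends generate_pb2_grpc.TextGenerationServiceServicer.
    def __init__(self, model):
        self.model = model

    async def Warmup(self, request, context=None):
        # Mirrors the post-commit path: pass the request straight to model.warmup().
        max_supported_total_tokens = self.model.warmup(request)
        return WarmupResponse(max_supported_total_tokens=max_supported_total_tokens)

    async def Prefill(self, request, context=None):
        # The diff only shows the timing prologue of Prefill; the body is elided here.
        start = time.time_ns()
        return {"elapsed_ns": time.time_ns() - start}


async def main():
    service = TextGenerationService(DummyModel())
    resp = await service.Warmup(request=None)
    print("max_supported_total_tokens:", resp.max_supported_total_tokens)


if __name__ == "__main__":
    asyncio.run(main())
```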