inference:
  greedy: False # whether to use greedy decoding instead of sampling
  top_k: 0 # the number of highest-probability vocabulary tokens to keep for top-k filtering; 0 disables it
  top_p: 0.9 # if set to a float < 1, only the most probable tokens with probabilities that add up to top_p or higher are kept for generation
  temperature: 1.0 # sampling temperature
  add_BOS: True # add the BOS token at the beginning of the prompt
  tokens_to_generate: 30 # the maximum number of tokens to generate
  all_probs: False # whether to return the log probs for all tokens in the vocab
  repetition_penalty: 1.2 # the parameter for repetition penalty; 1.0 means no penalty
  min_tokens_to_generate: 0 # the minimum number of tokens to generate
  compute_logprob: False # compute the logprob of all the input text; a very special case of running inference, default False
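
# With the defaults above, decoding samples from the smallest set of tokens whose
# cumulative probability reaches top_p (nucleus sampling) at temperature 1.0.
# A minimal usage sketch, assuming this file is consumed by a Hydra-driven NeMo
# eval script such as megatron_gpt_eval.py; any field can be overridden on the CLI:
#   python megatron_gpt_eval.py \
#     gpt_model_file=/path/to/model.nemo \
#     inference.greedy=True \
#     inference.tokens_to_generate=64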

trainer:
  devices: 1
  num_nodes: 1
  accelerator: gpu
  logger: False # logger provided by exp_manager
  precision: 16 # 16, 32, or bf16

tensor_model_parallel_size: 1
pipeline_model_parallel_size: 1
pipeline_model_parallel_split_rank: 0 # only used for encoder-decoder models
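
# Note (an assumption, matching NeMo's model-parallel eval scripts): the launch
# size must cover the model-parallel grid, i.e.
#   trainer.devices * trainer.num_nodes == tensor_model_parallel_size * pipeline_model_parallel_size
# e.g. a sketch for a checkpoint trained with 2-way tensor parallelism on one node:
#   python megatron_gpt_eval.py trainer.devices=2 tensor_model_parallel_size=2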

gpt_model_file: null # path to a packaged .nemo GPT checkpoint
checkpoint_dir: null # checkpoint file dir; used to load the PTL checkpoint generated during GPT training
checkpoint_name: null # PTL checkpoint file name, only used for PTL checkpoint loading
hparams_file: null # model configuration file, only used for PTL checkpoint loading
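
# Loading modes (an assumption based on NeMo's megatron_gpt_eval.py): set either
# gpt_model_file for a .nemo checkpoint, or checkpoint_dir + checkpoint_name +
# hparams_file for a raw PTL checkpoint; the paths below are placeholders:
#   checkpoint_dir=/results/checkpoints checkpoint_name=megatron_gpt-last.ckpt hparams_file=/results/hparams.yaml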

prompts: # prompts for GPT inference
  - "Q: How are you?"
  - "Q: How big is the universe?"

server: False # whether to launch the API server
port: 5555 # the port number for the inference server
web_server: False # whether to launch the web inference server
share: False # whether to create a public URL for the web server
username: test # username for the web client
password: test2 # password for the web client
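
# A sketch of querying the API server once server: True (an assumption: the
# server exposes a PUT /generate endpoint accepting JSON, as in NeMo's text
# generation server; adjust to the actual API):
#   curl -X PUT http://localhost:5555/generate \
#     -H "Content-Type: application/json" \
#     -d '{"sentences": ["Q: How are you?"], "tokens_to_generate": 30}'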