inference:
  greedy: False # whether to use greedy decoding instead of sampling
  top_k: 0 # the number of highest-probability vocabulary tokens to keep for top-k filtering; 0 disables it
  top_p: 0.9 # if set to a float < 1, only the most probable tokens with probabilities that add up to top_p or higher are kept for generation
  temperature: 1.0 # sampling temperature
  add_BOS: True # add the BOS token at the beginning of the prompt
  tokens_to_generate: 30 # the maximum number of tokens to generate
  all_probs: False # whether to return the log probs for all tokens in the vocab
  repetition_penalty: 1.2 # the parameter for repetition penalty; 1.0 means no penalty
  min_tokens_to_generate: 0 # the minimum number of tokens to generate
  compute_logprob: False # compute the logprob of all the input text; a very special case of running inference, default False
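
# With the defaults above, decoding samples from the smallest set of tokens whose
# cumulative probability reaches top_p (nucleus sampling) at temperature 1.0.
# A minimal usage sketch, assuming this file is consumed by a Hydra-driven NeMo
# eval script such as megatron_gpt_eval.py; any field can be overridden on the CLI:
#   python megatron_gpt_eval.py \
#     gpt_model_file=/path/to/model.nemo \
#     inference.greedy=True \
#     inference.tokens_to_generate=64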

trainer:
  devices: 1
  num_nodes: 1
  accelerator: gpu
  logger: False # logger provided by exp_manager
  precision: 16 # 16, 32, or bf16

tensor_model_parallel_size: 1
pipeline_model_parallel_size: 1
pipeline_model_parallel_split_rank: 0 # only used for encoder-decoder models
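
# Note (an assumption, matching NeMo's model-parallel eval scripts): the launch
# size must cover the model-parallel grid, i.e.
#   trainer.devices * trainer.num_nodes == tensor_model_parallel_size * pipeline_model_parallel_size
# e.g. a sketch for a checkpoint trained with 2-way tensor parallelism on one node:
#   python megatron_gpt_eval.py trainer.devices=2 tensor_model_parallel_size=2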

gpt_model_file: null # path to a packaged .nemo GPT checkpoint
checkpoint_dir: null # checkpoint file dir; used to load the PTL checkpoint generated during GPT training
checkpoint_name: null # PTL checkpoint file name, only used for PTL checkpoint loading
hparams_file: null # model configuration file, only used for PTL checkpoint loading
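
# Loading modes (an assumption based on NeMo's megatron_gpt_eval.py): set either
# gpt_model_file for a .nemo checkpoint, or checkpoint_dir + checkpoint_name +
# hparams_file for a raw PTL checkpoint; the paths below are placeholders:
#   checkpoint_dir=/results/checkpoints checkpoint_name=megatron_gpt-last.ckpt hparams_file=/results/hparams.yaml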

prompts: # prompts for GPT inference
  - "Q: How are you?"
  - "Q: How big is the universe?"

server: False # whether to launch the API server
port: 5555 # the port number for the inference server
web_server: False # whether to launch the web inference server
share: False # whether to create a public URL for the web server
username: test # username for the web client
password: test2 # password for the web client
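
# A sketch of querying the API server once server: True (an assumption: the
# server exposes a PUT /generate endpoint accepting JSON, as in NeMo's text
# generation server; adjust to the actual API):
#   curl -X PUT http://localhost:5555/generate \
#     -H "Content-Type: application/json" \
#     -d '{"sentences": ["Q: How are you?"], "tokens_to_generate": 30}'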