fix(docker): increase shm size (#60)
Commit 1ad3250b89 (parent c503a639b1)
@@ -30,9 +30,7 @@ ENV LANG=C.UTF-8 \
     MODEL_ID=bigscience/bloom-560m \
     QUANTIZE=false \
     NUM_SHARD=1 \
-    SAFETENSORS_FAST_GPU=1 \
     PORT=80 \
-    NCCL_ASYNC_ERROR_HANDLING=1 \
     CUDA_HOME=/usr/local/cuda \
     LD_LIBRARY_PATH="/opt/miniconda/envs/text-generation/lib:/usr/local/cuda/lib64:/usr/local/cuda/extras/CUPTI/lib64:$LD_LIBRARY_PATH" \
     CONDA_DEFAULT_ENV=text-generation \
README.md (37 lines changed)
@@ -25,8 +25,9 @@ to power LLMs api-inference widgets.
 - [Officially Supported Models](#officially-supported-models)
 - [Get Started](#get-started)
 - [Docker](#docker)
+- [API Documentation](#api-documentation)
+- [A note on Shared Memory](#a-note-on-shared-memory-shm)
 - [Local Install](#local-install)
-- [OpenAPI](#api-documentation)
 - [CUDA Kernels](#cuda-kernels)
 - [Run BLOOM](#run-bloom)
 - [Download](#download)
@@ -54,7 +55,7 @@ to power LLMs api-inference widgets.
 - ~~[Galactica](https://huggingface.co/facebook/galactica-120b)~~ (deactivated)
 - [SantaCoder](https://huggingface.co/bigcode/santacoder)
 - [GPT-Neox 20B](https://huggingface.co/EleutherAI/gpt-neox-20b)
-- [FLAN-T5-XXL](https://huggingface.co/google/flan-t5-xxl): use `--revision pr/26`
+- [FLAN-T5-XXL](https://huggingface.co/google/flan-t5-xxl)
 
 Other models are supported on a best effort basis using:
 
@@ -75,7 +76,7 @@ model=bigscience/bloom-560m
 num_shard=2
 volume=$PWD/data # share a volume with the Docker container to avoid downloading weights every run
 
-docker run --gpus all -p 8080:80 -v $volume:/data ghcr.io/huggingface/text-generation-inference:latest --model-id $model --num-shard $num_shard
+docker run --gpus all --shm-size 1g -p 8080:80 -v $volume:/data ghcr.io/huggingface/text-generation-inference:latest --model-id $model --num-shard $num_shard
 ```
 
 You can then query the model using either the `/generate` or `/generate_stream` routes:
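
With the container started as above (note the new `--shm-size 1g` flag), the `/generate` route can also be exercised from Python. This is a minimal sketch, not part of the commit, assuming the server listens on 127.0.0.1:8080 and accepts the request body used in the README's curl examples (an `inputs` string plus a `parameters` object):

```python
# Minimal sketch: call /generate on a locally running container.
# Assumes the docker command above is serving on 127.0.0.1:8080 and the
# request schema from the README's curl examples (inputs + parameters).
import json
import urllib.request

payload = json.dumps(
    {"inputs": "What is Deep Learning?", "parameters": {"max_new_tokens": 20}}
).encode("utf-8")

request = urllib.request.Request(
    "http://127.0.0.1:8080/generate",
    data=payload,
    headers={"Content-Type": "application/json"},
)

with urllib.request.urlopen(request) as response:
    # The response body is JSON containing the generated text.
    print(json.loads(response.read()))
```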
@@ -101,6 +102,32 @@ curl 127.0.0.1:8080/generate_stream \
 You can consult the OpenAPI documentation of the `text-generation-inference` REST API using the `/docs` route.
 The Swagger UI is also available at: [https://huggingface.github.io/text-generation-inference](https://huggingface.github.io/text-generation-inference).
 
+### A note on Shared Memory (shm)
+
+[`NCCL`](https://docs.nvidia.com/deeplearning/nccl/user-guide/docs/index.html) is a communication framework used by
+`PyTorch` to do distributed training/inference. `text-generation-inference` makes
+use of `NCCL` to enable Tensor Parallelism to dramatically speed up inference for large language models.
+
+In order to share data between the different devices of a `NCCL` group, `NCCL` might fall back to using the host memory if
+peer-to-peer using NVLink or PCI is not possible.
+
+To allow the container to use 1G of Shared Memory and support SHM sharing, we add `--shm-size 1g` to the above command.
+
+If you are running `text-generation-inference` inside `Kubernetes`, you can also add Shared Memory to the container by
+creating a volume with:
+
+```yaml
+- name: shm
+  emptyDir:
+    medium: Memory
+    sizeLimit: 1Gi
+```
+
+and mounting it to `/dev/shm`.
+
+Finally, you can also disable SHM sharing by using the `NCCL_SHM_DISABLE=1` environment variable. However, note that
+this will impact performance.
+
 ### Local install
 
 You can also opt to install `text-generation-inference` locally.
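
The note above is ultimately about the size of the `/dev/shm` tmpfs, which is what `--shm-size 1g` (or the Kubernetes `emptyDir` with `medium: Memory` mounted at `/dev/shm`) controls. A small illustrative sketch for checking it from inside a container, assuming a Linux environment:

```python
# Illustrative sketch: inspect the shared-memory tmpfs NCCL falls back to.
# Assumes a Linux container where /dev/shm is the segment sized by
# --shm-size (or by the Kubernetes emptyDir volume shown above).
import os
import shutil

total, _used, free = shutil.disk_usage("/dev/shm")
print(f"/dev/shm: {total / 2**30:.2f} GiB total, {free / 2**30:.2f} GiB free")

# NCCL_SHM_DISABLE=1 turns the SHM transport off entirely (slower, as noted above).
print("NCCL_SHM_DISABLE =", os.environ.get("NCCL_SHM_DISABLE", "<unset>"))
```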
@@ -122,10 +149,10 @@ BUILD_EXTENSIONS=True make install # Install repository and HF/transformer fork
 make run-bloom-560m
 ```
 
-**Note:** on some machines, you may also need the OpenSSL libraries. On Linux machines, run:
+**Note:** on some machines, you may also need the OpenSSL libraries and gcc. On Linux machines, run:
 
 ```shell
-sudo apt-get install libssl-dev
+sudo apt-get install libssl-dev gcc -y
 ```
 
 ### CUDA Kernels
@@ -38,9 +38,9 @@ struct Args {
     port: u16,
     #[clap(default_value = "/tmp/text-generation-server", long, env)]
     shard_uds_path: String,
-    #[clap(default_value = "0.0.0.0", long, env)]
+    #[clap(default_value = "localhost", long, env)]
     master_addr: String,
-    #[clap(default_value = "6000", long, env)]
+    #[clap(default_value = "29500", long, env)]
     master_port: usize,
     #[clap(long, env)]
     json_output: bool,
@@ -305,6 +305,7 @@ fn shard_manager(
         ("MASTER_ADDR".into(), master_addr.into()),
         ("MASTER_PORT".into(), master_port.to_string().into()),
         ("SAFETENSORS_FAST_GPU".into(), "1".into()),
+        ("NCCL_ASYNC_ERROR_HANDLING".into(), "1".into()),
     ];
 
     // If the HUGGINGFACE_HUB_CACHE env var is set, pass it to the shard
@@ -322,6 +323,12 @@ fn shard_manager(
         ));
     };
 
+    // If the NCCL_SHM_DISABLE env var is set, pass it to the shard
+    // needed when running NCCL inside a docker container and when you can't increase shm size
+    if let Ok(nccl_shm_disable) = env::var("NCCL_SHM_DISABLE") {
+        env.push(("NCCL_SHM_DISABLE".into(), nccl_shm_disable.into()));
+    };
+
     // If the CUDA_VISIBLE_DEVICES env var is set, pass it to the shard
     if let Ok(cuda_visible_devices) = env::var("CUDA_VISIBLE_DEVICES") {
         env.push(("CUDA_VISIBLE_DEVICES".into(), cuda_visible_devices.into()));
@@ -162,29 +162,29 @@ def initialize_torch_distributed():
     world_size = int(os.getenv("WORLD_SIZE", "1"))
 
     if torch.cuda.is_available():
-        # initialized `torch.distributed`
+        from torch.distributed import ProcessGroupNCCL
         # Set the device id.
         assert world_size <= torch.cuda.device_count(), "Each process is one gpu"
         device = rank % torch.cuda.device_count()
         torch.cuda.set_device(device)
         backend = "nccl"
+        options = ProcessGroupNCCL.Options()
+        options.is_high_priority_stream = True
+        options._timeout = timedelta(seconds=60)
     else:
         backend = "gloo"
+        options = None
-    master_ip = os.getenv("MASTER_ADDR", "0.0.0.0")
-    master_port = os.getenv("MASTER_PORT", "6000")
-    init_method = f"tcp://{master_ip}:{master_port}"
 
     # Call the init process.
     torch.distributed.init_process_group(
         backend=backend,
-        init_method=init_method,
         world_size=world_size,
         rank=rank,
         timeout=timedelta(seconds=60),
+        pg_options=options
     )
 
-    return torch.distributed.distributed_c10d._get_default_group(), rank, world_size
+    return torch.distributed.group.WORLD, rank, world_size
 
 
 def weight_hub_files(model_id, revision=None, extension=".safetensors"):
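
Read together with the launcher changes above (MASTER_ADDR now defaults to `localhost`, MASTER_PORT to `29500`, and `NCCL_ASYNC_ERROR_HANDLING=1` is set per shard), the server initializes its process group roughly as in the following standalone sketch. It is illustrative only, covers a single process on one machine, and is not the repository's actual code:

```python
# Standalone sketch of the post-change initialization path for one process
# (RANK=0, WORLD_SIZE=1). Env defaults mirror the launcher's new values.
import os
from datetime import timedelta

import torch

os.environ.setdefault("MASTER_ADDR", "localhost")  # launcher default after this commit
os.environ.setdefault("MASTER_PORT", "29500")      # launcher default after this commit
os.environ.setdefault("RANK", "0")
os.environ.setdefault("WORLD_SIZE", "1")

if torch.cuda.is_available():
    from torch.distributed import ProcessGroupNCCL

    torch.cuda.set_device(int(os.environ["RANK"]) % torch.cuda.device_count())
    backend = "nccl"
    options = ProcessGroupNCCL.Options()
    options.is_high_priority_stream = True  # prioritize NCCL communication streams
else:
    backend = "gloo"
    options = None

# Without an explicit init_method, init_process_group uses the env:// rendezvous,
# i.e. it reads MASTER_ADDR/MASTER_PORT/RANK/WORLD_SIZE from the environment.
torch.distributed.init_process_group(
    backend=backend,
    timeout=timedelta(seconds=60),
    pg_options=options,
)

print("initialized:", torch.distributed.is_initialized())
torch.distributed.destroy_process_group()
```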