diff --git a/Dockerfile b/Dockerfile index 65e2a6f2..73892494 100644 --- a/Dockerfile +++ b/Dockerfile @@ -39,7 +39,7 @@ RUN cargo build --release # Adapted from: https://github.com/pytorch/pytorch/blob/master/Dockerfile FROM nvidia/cuda:12.1.0-devel-ubuntu22.04 as pytorch-install -ARG PYTORCH_VERSION=2.1.1 +ARG PYTORCH_VERSION=2.2.0 ARG PYTHON_VERSION=3.10 # Keep in sync with `server/pyproject.toml ARG CUDA_VERSION=12.1 @@ -225,7 +225,7 @@ COPY server/Makefile server/Makefile RUN cd server && \ make gen-server && \ pip install -r requirements_cuda.txt && \ - pip install ".[bnb, accelerate, quantize, peft]" --no-cache-dir + pip install ".[bnb, accelerate, quantize, peft, outlines]" --no-cache-dir # Install benchmarker COPY --from=builder /usr/src/target/release/text-generation-benchmark /usr/local/bin/text-generation-benchmark diff --git a/Dockerfile_amd b/Dockerfile_amd index 9a5e3568..fb820116 100644 --- a/Dockerfile_amd +++ b/Dockerfile_amd @@ -150,7 +150,7 @@ COPY server/Makefile server/Makefile RUN cd server && \ make gen-server && \ pip install -r requirements_rocm.txt && \ - pip install ".[accelerate, peft]" --no-cache-dir + pip install ".[accelerate, peft, outlines]" --no-cache-dir # Install benchmarker COPY --from=builder /usr/src/target/release/text-generation-benchmark /usr/local/bin/text-generation-benchmark diff --git a/server/Makefile b/server/Makefile index 31d55c41..dbc9b7ef 100644 --- a/server/Makefile +++ b/server/Makefile @@ -23,7 +23,7 @@ install-megablocks: install: gen-server pip install pip --upgrade pip install -r requirements_cuda.txt - pip install -e ".[bnb, accelerate, quantize, peft]" + pip install -e ".[bnb, accelerate, quantize, peft, outlines]" run-dev: SAFETENSORS_FAST_GPU=1 python -m torch.distributed.run --nproc_per_node=2 text_generation_server/cli.py serve bigscience/bloom-560m --sharded diff --git a/server/poetry.lock b/server/poetry.lock index b2fb7441..fab13a57 100644 --- a/server/poetry.lock +++ b/server/poetry.lock @@ -144,7 +144,7 @@ frozenlist = ">=1.1.0" name = "annotated-types" version = "0.6.0" description = "Reusable constraint types to use with typing.Annotated" -optional = false +optional = true python-versions = ">=3.8" files = [ {file = "annotated_types-0.6.0-py3-none-any.whl", hash = "sha256:0641064de18ba7a25dee8f96403ebc39113d0cb953a01429249d5c7564666a43"}, @@ -166,7 +166,7 @@ files = [ name = "attrs" version = "23.2.0" description = "Classes Without Boilerplate" -optional = false +optional = true python-versions = ">=3.7" files = [ {file = "attrs-23.2.0-py3-none-any.whl", hash = "sha256:99b87a485a5820b23b879f04c2305b44b951b502fd64be915879d77a7e8fc6f1"}, @@ -331,7 +331,7 @@ colorama = {version = "*", markers = "platform_system == \"Windows\""} name = "cloudpickle" version = "3.0.0" description = "Pickler class to extend the standard pickle.Pickler functionality" -optional = false +optional = true python-versions = ">=3.8" files = [ {file = "cloudpickle-3.0.0-py3-none-any.whl", hash = "sha256:246ee7d0c295602a036e86369c77fecda4ab17b506496730f2f576d9016fd9c7"}, @@ -429,7 +429,7 @@ profile = ["gprof2dot (>=2022.7.29)"] name = "diskcache" version = "5.6.3" description = "Disk Cache -- Disk and file backed persistent cache." -optional = false +optional = true python-versions = ">=3" files = [ {file = "diskcache-5.6.3-py3-none-any.whl", hash = "sha256:5e31b2d5fbad117cc363ebaf6b689474db18a1f6438bc82358b024abd4c2ca19"}, @@ -952,7 +952,7 @@ files = [ name = "interegular" version = "0.3.3" description = "a regex intersection checker" -optional = false +optional = true python-versions = ">=3.7" files = [ {file = "interegular-0.3.3-py37-none-any.whl", hash = "sha256:b0c07007d48c89d6d19f7204972d369b2a77222722e126b6aa63aa721dc3b19c"}, @@ -963,7 +963,7 @@ files = [ name = "jinja2" version = "3.1.3" description = "A very fast and expressive template engine." -optional = false +optional = true python-versions = ">=3.7" files = [ {file = "Jinja2-3.1.3-py3-none-any.whl", hash = "sha256:7d6d50dd97d52cbc355597bd845fabfbac3f551e1f99619e39a35ce8c370b5fa"}, @@ -980,7 +980,7 @@ i18n = ["Babel (>=2.7)"] name = "joblib" version = "1.3.2" description = "Lightweight pipelining with Python functions" -optional = false +optional = true python-versions = ">=3.7" files = [ {file = "joblib-1.3.2-py3-none-any.whl", hash = "sha256:ef4331c65f239985f3f2220ecc87db222f08fd22097a3dd5698f693875f8cbb9"}, @@ -991,7 +991,7 @@ files = [ name = "jsonschema" version = "4.21.1" description = "An implementation of JSON Schema validation for Python" -optional = false +optional = true python-versions = ">=3.8" files = [ {file = "jsonschema-4.21.1-py3-none-any.whl", hash = "sha256:7996507afae316306f9e2290407761157c6f78002dcf7419acb99822143d1c6f"}, @@ -1012,7 +1012,7 @@ format-nongpl = ["fqdn", "idna", "isoduration", "jsonpointer (>1.13)", "rfc3339- name = "jsonschema-specifications" version = "2023.12.1" description = "The JSON Schema meta-schemas and vocabularies, exposed as a Registry" -optional = false +optional = true python-versions = ">=3.8" files = [ {file = "jsonschema_specifications-2023.12.1-py3-none-any.whl", hash = "sha256:87e4fdf3a94858b8a2ba2778d9ba57d8a9cafca7c7489c46ba0d30a8bc6a9c3c"}, @@ -1026,7 +1026,7 @@ referencing = ">=0.31.0" name = "lark" version = "1.1.9" description = "a modern parsing library" -optional = false +optional = true python-versions = ">=3.6" files = [ {file = "lark-1.1.9-py3-none-any.whl", hash = "sha256:a0dd3a87289f8ccbb325901e4222e723e7d745dbfc1803eaf5f3d2ace19cf2db"}, @@ -1043,7 +1043,7 @@ regex = ["regex"] name = "llvmlite" version = "0.42.0" description = "lightweight wrapper around basic LLVM functionality" -optional = false +optional = true python-versions = ">=3.9" files = [ {file = "llvmlite-0.42.0-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:3366938e1bf63d26c34fbfb4c8e8d2ded57d11e0567d5bb243d89aab1eb56098"}, @@ -1091,7 +1091,7 @@ dev = ["Sphinx (>=4.1.1)", "black (>=19.10b0)", "colorama (>=0.3.4)", "docutils name = "markupsafe" version = "2.1.5" description = "Safely add untrusted strings to HTML/XML markup." -optional = false +optional = true python-versions = ">=3.7" files = [ {file = "MarkupSafe-2.1.5-cp310-cp310-macosx_10_9_universal2.whl", hash = "sha256:a17a92de5231666cfbe003f0e4b9b3a7ae3afb1ec2845aadc2bacc93ff85febc"}, @@ -1160,7 +1160,7 @@ files = [ name = "mpmath" version = "1.3.0" description = "Python library for arbitrary-precision floating-point arithmetic" -optional = false +optional = true python-versions = "*" files = [ {file = "mpmath-1.3.0-py3-none-any.whl", hash = "sha256:a0b2b9fe80bbcd81a6647ff13108738cfb482d481d826cc0e02f5b35e5c88d2c"}, @@ -1300,7 +1300,7 @@ dill = ">=0.3.8" name = "nest-asyncio" version = "1.6.0" description = "Patch asyncio to allow nested event loops" -optional = false +optional = true python-versions = ">=3.5" files = [ {file = "nest_asyncio-1.6.0-py3-none-any.whl", hash = "sha256:87af6efd6b5e897c81050477ef65c62e2b2f35d51703cae01aff2905b1852e1c"}, @@ -1311,7 +1311,7 @@ files = [ name = "networkx" version = "3.2.1" description = "Python package for creating and manipulating graphs and networks" -optional = false +optional = true python-versions = ">=3.9" files = [ {file = "networkx-3.2.1-py3-none-any.whl", hash = "sha256:f18c69adc97877c42332c170849c96cefa91881c99a7cb3e95b7c659ebdc1ec2"}, @@ -1329,7 +1329,7 @@ test = ["pytest (>=7.2)", "pytest-cov (>=4.0)"] name = "numba" version = "0.59.0" description = "compiling Python code using LLVM" -optional = false +optional = true python-versions = ">=3.9" files = [ {file = "numba-0.59.0-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:8d061d800473fb8fef76a455221f4ad649a53f5e0f96e3f6c8b8553ee6fa98fa"}, @@ -1408,7 +1408,7 @@ files = [ name = "nvidia-cublas-cu12" version = "12.1.3.1" description = "CUBLAS native runtime libraries" -optional = false +optional = true python-versions = ">=3" files = [ {file = "nvidia_cublas_cu12-12.1.3.1-py3-none-manylinux1_x86_64.whl", hash = "sha256:ee53ccca76a6fc08fb9701aa95b6ceb242cdaab118c3bb152af4e579af792728"}, @@ -1419,7 +1419,7 @@ files = [ name = "nvidia-cuda-cupti-cu12" version = "12.1.105" description = "CUDA profiling tools runtime libs." -optional = false +optional = true python-versions = ">=3" files = [ {file = "nvidia_cuda_cupti_cu12-12.1.105-py3-none-manylinux1_x86_64.whl", hash = "sha256:e54fde3983165c624cb79254ae9818a456eb6e87a7fd4d56a2352c24ee542d7e"}, @@ -1430,7 +1430,7 @@ files = [ name = "nvidia-cuda-nvrtc-cu12" version = "12.1.105" description = "NVRTC native runtime libraries" -optional = false +optional = true python-versions = ">=3" files = [ {file = "nvidia_cuda_nvrtc_cu12-12.1.105-py3-none-manylinux1_x86_64.whl", hash = "sha256:339b385f50c309763ca65456ec75e17bbefcbbf2893f462cb8b90584cd27a1c2"}, @@ -1441,7 +1441,7 @@ files = [ name = "nvidia-cuda-runtime-cu12" version = "12.1.105" description = "CUDA Runtime native Libraries" -optional = false +optional = true python-versions = ">=3" files = [ {file = "nvidia_cuda_runtime_cu12-12.1.105-py3-none-manylinux1_x86_64.whl", hash = "sha256:6e258468ddf5796e25f1dc591a31029fa317d97a0a94ed93468fc86301d61e40"}, @@ -1452,7 +1452,7 @@ files = [ name = "nvidia-cudnn-cu12" version = "8.9.2.26" description = "cuDNN runtime libraries" -optional = false +optional = true python-versions = ">=3" files = [ {file = "nvidia_cudnn_cu12-8.9.2.26-py3-none-manylinux1_x86_64.whl", hash = "sha256:5ccb288774fdfb07a7e7025ffec286971c06d8d7b4fb162525334616d7629ff9"}, @@ -1465,7 +1465,7 @@ nvidia-cublas-cu12 = "*" name = "nvidia-cufft-cu12" version = "11.0.2.54" description = "CUFFT native runtime libraries" -optional = false +optional = true python-versions = ">=3" files = [ {file = "nvidia_cufft_cu12-11.0.2.54-py3-none-manylinux1_x86_64.whl", hash = "sha256:794e3948a1aa71fd817c3775866943936774d1c14e7628c74f6f7417224cdf56"}, @@ -1476,7 +1476,7 @@ files = [ name = "nvidia-curand-cu12" version = "10.3.2.106" description = "CURAND native runtime libraries" -optional = false +optional = true python-versions = ">=3" files = [ {file = "nvidia_curand_cu12-10.3.2.106-py3-none-manylinux1_x86_64.whl", hash = "sha256:9d264c5036dde4e64f1de8c50ae753237c12e0b1348738169cd0f8a536c0e1e0"}, @@ -1487,7 +1487,7 @@ files = [ name = "nvidia-cusolver-cu12" version = "11.4.5.107" description = "CUDA solver native runtime libraries" -optional = false +optional = true python-versions = ">=3" files = [ {file = "nvidia_cusolver_cu12-11.4.5.107-py3-none-manylinux1_x86_64.whl", hash = "sha256:8a7ec542f0412294b15072fa7dab71d31334014a69f953004ea7a118206fe0dd"}, @@ -1503,7 +1503,7 @@ nvidia-nvjitlink-cu12 = "*" name = "nvidia-cusparse-cu12" version = "12.1.0.106" description = "CUSPARSE native runtime libraries" -optional = false +optional = true python-versions = ">=3" files = [ {file = "nvidia_cusparse_cu12-12.1.0.106-py3-none-manylinux1_x86_64.whl", hash = "sha256:f3b50f42cf363f86ab21f720998517a659a48131e8d538dc02f8768237bd884c"}, @@ -1517,7 +1517,7 @@ nvidia-nvjitlink-cu12 = "*" name = "nvidia-nccl-cu12" version = "2.19.3" description = "NVIDIA Collective Communication Library (NCCL) Runtime" -optional = false +optional = true python-versions = ">=3" files = [ {file = "nvidia_nccl_cu12-2.19.3-py3-none-manylinux1_x86_64.whl", hash = "sha256:a9734707a2c96443331c1e48c717024aa6678a0e2a4cb66b2c364d18cee6b48d"}, @@ -1527,7 +1527,7 @@ files = [ name = "nvidia-nvjitlink-cu12" version = "12.3.101" description = "Nvidia JIT LTO Library" -optional = false +optional = true python-versions = ">=3" files = [ {file = "nvidia_nvjitlink_cu12-12.3.101-py3-none-manylinux1_x86_64.whl", hash = "sha256:64335a8088e2b9d196ae8665430bc6a2b7e6ef2eb877a9c735c804bd4ff6467c"}, @@ -1538,7 +1538,7 @@ files = [ name = "nvidia-nvtx-cu12" version = "12.1.105" description = "NVIDIA Tools Extension" -optional = false +optional = true python-versions = ">=3" files = [ {file = "nvidia_nvtx_cu12-12.1.105-py3-none-manylinux1_x86_64.whl", hash = "sha256:dc21cf308ca5691e7c04d962e213f8a4aa9bbfa23d95412f452254c2caeb09e5"}, @@ -1703,7 +1703,7 @@ files = [ name = "outlines" version = "0.0.27" description = "Probabilistic Generative Model Programming" -optional = false +optional = true python-versions = ">=3.8" files = [ {file = "outlines-0.0.27-py3-none-any.whl", hash = "sha256:dd614f49760ff8870a5d491fad4a372d7b7d4da5c1646f1b42f12a9d5e34db4b"}, @@ -2055,7 +2055,7 @@ files = [ name = "pydantic" version = "2.6.1" description = "Data validation using Python type hints" -optional = false +optional = true python-versions = ">=3.8" files = [ {file = "pydantic-2.6.1-py3-none-any.whl", hash = "sha256:0b6a909df3192245cb736509a92ff69e4fef76116feffec68e93a567347bae6f"}, @@ -2074,7 +2074,7 @@ email = ["email-validator (>=2.0.0)"] name = "pydantic-core" version = "2.16.2" description = "" -optional = false +optional = true python-versions = ">=3.8" files = [ {file = "pydantic_core-2.16.2-cp310-cp310-macosx_10_12_x86_64.whl", hash = "sha256:3fab4e75b8c525a4776e7630b9ee48aea50107fea6ca9f593c98da3f4d11bf7c"}, @@ -2271,7 +2271,7 @@ files = [ name = "referencing" version = "0.33.0" description = "JSON Referencing + Python" -optional = false +optional = true python-versions = ">=3.8" files = [ {file = "referencing-0.33.0-py3-none-any.whl", hash = "sha256:39240f2ecc770258f28b642dd47fd74bc8b02484de54e1882b74b35ebd779bd5"}, @@ -2409,7 +2409,7 @@ use-chardet-on-py3 = ["chardet (>=3.0.2,<6)"] name = "rpds-py" version = "0.18.0" description = "Python bindings to Rust's persistent data structures (rpds)" -optional = false +optional = true python-versions = ">=3.8" files = [ {file = "rpds_py-0.18.0-cp310-cp310-macosx_10_12_x86_64.whl", hash = "sha256:5b4e7d8d6c9b2e8ee2d55c90b59c707ca59bc30058269b3db7b1f8df5763557e"}, @@ -2719,7 +2719,7 @@ files = [ name = "sympy" version = "1.12" description = "Computer algebra system (CAS) in Python" -optional = false +optional = true python-versions = ">=3.8" files = [ {file = "sympy-1.12-py3-none-any.whl", hash = "sha256:c3588cd4295d0c0f603d0f2ae780587e64e2efeedb3521e46b9bb1d08d184fa5"}, @@ -2882,7 +2882,7 @@ files = [ name = "torch" version = "2.2.0" description = "Tensors and Dynamic neural networks in Python with strong GPU acceleration" -optional = false +optional = true python-versions = ">=3.8.0" files = [ {file = "torch-2.2.0-cp310-cp310-manylinux1_x86_64.whl", hash = "sha256:d366158d6503a3447e67f8c0ad1328d54e6c181d88572d688a625fac61b13a97"}, @@ -3028,7 +3028,7 @@ vision = ["Pillow (>=10.0.1,<=15.0)"] name = "triton" version = "2.2.0" description = "A language and compiler for custom Deep Learning operations" -optional = false +optional = true python-versions = "*" files = [ {file = "triton-2.2.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:a2294514340cfe4e8f4f9e5c66c702744c4a117d25e618bd08469d0bfed1e2e5"}, @@ -3422,6 +3422,7 @@ multidict = ">=4.0" [extras] accelerate = ["accelerate"] bnb = ["bitsandbytes"] +outlines = ["outlines"] peft = ["peft"] quantize = ["accelerate", "datasets", "texttable"] torch = ["torch"] @@ -3429,4 +3430,4 @@ torch = ["torch"] [metadata] lock-version = "2.0" python-versions = ">=3.9,<3.13" -content-hash = "c2990763ec38a0249cbb2140f0fb87d2ff4633c4810e4ec6f979e821ecb5442e" +content-hash = "47696ea72017636437a1bb73aa6463f064cae02620cbf93027fad8f2ebecd014" diff --git a/server/pyproject.toml b/server/pyproject.toml index e9443034..ad67745c 100644 --- a/server/pyproject.toml +++ b/server/pyproject.toml @@ -34,7 +34,7 @@ peft = { version = "^0.8.2", optional = true } torch = { version = "^2.1.1", optional = true } scipy = "^1.11.1" pillow = "^10.0.0" -outlines="^0.0.27" +outlines= { version = "^0.0.27", optional = true } [tool.poetry.extras] torch = ["torch"] @@ -42,6 +42,7 @@ accelerate = ["accelerate"] bnb = ["bitsandbytes"] peft = ["peft"] quantize = ["texttable", "datasets", "accelerate"] +outlines = ["outlines"] [tool.poetry.group.dev.dependencies] grpcio-tools = "^1.51.1" diff --git a/server/requirements_cuda.txt b/server/requirements_cuda.txt index 3152646f..1e3477bf 100644 --- a/server/requirements_cuda.txt +++ b/server/requirements_cuda.txt @@ -1,14 +1,10 @@ -annotated-types==0.6.0 ; python_version >= "3.9" and python_version < "3.13" -attrs==23.2.0 ; python_version >= "3.9" and python_version < "3.13" backoff==2.2.1 ; python_version >= "3.9" and python_version < "3.13" bitsandbytes==0.41.3.post2 ; python_version >= "3.9" and python_version < "3.13" certifi==2024.2.2 ; python_version >= "3.9" and python_version < "3.13" charset-normalizer==3.3.2 ; python_version >= "3.9" and python_version < "3.13" click==8.1.7 ; python_version >= "3.9" and python_version < "3.13" -cloudpickle==3.0.0 ; python_version >= "3.9" and python_version < "3.13" colorama==0.4.6 ; python_version >= "3.9" and python_version < "3.13" and (sys_platform == "win32" or platform_system == "Windows") deprecated==1.2.14 ; python_version >= "3.9" and python_version < "3.13" -diskcache==5.6.3 ; python_version >= "3.9" and python_version < "3.13" einops==0.6.1 ; python_version >= "3.9" and python_version < "3.13" filelock==3.13.1 ; python_version >= "3.9" and python_version < "3.13" fsspec==2023.10.0 ; python_version >= "3.9" and python_version < "3.13" @@ -20,32 +16,8 @@ grpcio==1.60.1 ; python_version >= "3.9" and python_version < "3.13" hf-transfer==0.1.5 ; python_version >= "3.9" and python_version < "3.13" huggingface-hub==0.19.4 ; python_version >= "3.9" and python_version < "3.13" idna==3.6 ; python_version >= "3.9" and python_version < "3.13" -interegular==0.3.3 ; python_version >= "3.9" and python_version < "3.13" -jinja2==3.1.3 ; python_version >= "3.9" and python_version < "3.13" -joblib==1.3.2 ; python_version >= "3.9" and python_version < "3.13" -jsonschema-specifications==2023.12.1 ; python_version >= "3.9" and python_version < "3.13" -jsonschema==4.21.1 ; python_version >= "3.9" and python_version < "3.13" -lark==1.1.9 ; python_version >= "3.9" and python_version < "3.13" -llvmlite==0.42.0 ; python_version >= "3.9" and python_version < "3.13" loguru==0.6.0 ; python_version >= "3.9" and python_version < "3.13" -markupsafe==2.1.5 ; python_version >= "3.9" and python_version < "3.13" -mpmath==1.3.0 ; python_version >= "3.9" and python_version < "3.13" -nest-asyncio==1.6.0 ; python_version >= "3.9" and python_version < "3.13" -networkx==3.2.1 ; python_version >= "3.9" and python_version < "3.13" -numba==0.59.0 ; python_version >= "3.9" and python_version < "3.13" numpy==1.26.4 ; python_version >= "3.9" and python_version < "3.13" -nvidia-cublas-cu12==12.1.3.1 ; platform_system == "Linux" and platform_machine == "x86_64" and python_version >= "3.9" and python_version < "3.13" -nvidia-cuda-cupti-cu12==12.1.105 ; platform_system == "Linux" and platform_machine == "x86_64" and python_version >= "3.9" and python_version < "3.13" -nvidia-cuda-nvrtc-cu12==12.1.105 ; platform_system == "Linux" and platform_machine == "x86_64" and python_version >= "3.9" and python_version < "3.13" -nvidia-cuda-runtime-cu12==12.1.105 ; platform_system == "Linux" and platform_machine == "x86_64" and python_version >= "3.9" and python_version < "3.13" -nvidia-cudnn-cu12==8.9.2.26 ; platform_system == "Linux" and platform_machine == "x86_64" and python_version >= "3.9" and python_version < "3.13" -nvidia-cufft-cu12==11.0.2.54 ; platform_system == "Linux" and platform_machine == "x86_64" and python_version >= "3.9" and python_version < "3.13" -nvidia-curand-cu12==10.3.2.106 ; platform_system == "Linux" and platform_machine == "x86_64" and python_version >= "3.9" and python_version < "3.13" -nvidia-cusolver-cu12==11.4.5.107 ; platform_system == "Linux" and platform_machine == "x86_64" and python_version >= "3.9" and python_version < "3.13" -nvidia-cusparse-cu12==12.1.0.106 ; platform_system == "Linux" and platform_machine == "x86_64" and python_version >= "3.9" and python_version < "3.13" -nvidia-nccl-cu12==2.19.3 ; platform_system == "Linux" and platform_machine == "x86_64" and python_version >= "3.9" and python_version < "3.13" -nvidia-nvjitlink-cu12==12.3.101 ; platform_system == "Linux" and platform_machine == "x86_64" and python_version >= "3.9" and python_version < "3.13" -nvidia-nvtx-cu12==12.1.105 ; platform_system == "Linux" and platform_machine == "x86_64" and python_version >= "3.9" and python_version < "3.13" opentelemetry-api==1.15.0 ; python_version >= "3.9" and python_version < "3.13" opentelemetry-exporter-otlp-proto-grpc==1.15.0 ; python_version >= "3.9" and python_version < "3.13" opentelemetry-exporter-otlp-proto-http==1.15.0 ; python_version >= "3.9" and python_version < "3.13" @@ -55,27 +27,19 @@ opentelemetry-instrumentation==0.36b0 ; python_version >= "3.9" and python_versi opentelemetry-proto==1.15.0 ; python_version >= "3.9" and python_version < "3.13" opentelemetry-sdk==1.15.0 ; python_version >= "3.9" and python_version < "3.13" opentelemetry-semantic-conventions==0.36b0 ; python_version >= "3.9" and python_version < "3.13" -outlines==0.0.27 ; python_version >= "3.9" and python_version < "3.13" packaging==23.2 ; python_version >= "3.9" and python_version < "3.13" pillow==10.2.0 ; python_version >= "3.9" and python_version < "3.13" protobuf==4.25.3 ; python_version >= "3.9" and python_version < "3.13" -pydantic-core==2.16.2 ; python_version >= "3.9" and python_version < "3.13" -pydantic==2.6.1 ; python_version >= "3.9" and python_version < "3.13" pyyaml==6.0.1 ; python_version >= "3.9" and python_version < "3.13" -referencing==0.33.0 ; python_version >= "3.9" and python_version < "3.13" regex==2023.12.25 ; python_version >= "3.9" and python_version < "3.13" requests==2.31.0 ; python_version >= "3.9" and python_version < "3.13" -rpds-py==0.18.0 ; python_version >= "3.9" and python_version < "3.13" safetensors==0.3.3 ; python_version >= "3.9" and python_version < "3.13" scipy==1.12.0 ; python_version >= "3.9" and python_version < "3.13" sentencepiece==0.1.99 ; python_version >= "3.9" and python_version < "3.13" setuptools==69.1.0 ; python_version >= "3.9" and python_version < "3.13" -sympy==1.12 ; python_version >= "3.9" and python_version < "3.13" tokenizers==0.15.2 ; python_version >= "3.9" and python_version < "3.13" -torch==2.2.0 ; python_version >= "3.9" and python_version < "3.13" tqdm==4.66.2 ; python_version >= "3.9" and python_version < "3.13" transformers==4.37.1 ; python_version >= "3.9" and python_version < "3.13" -triton==2.2.0 ; platform_system == "Linux" and platform_machine == "x86_64" and python_version >= "3.9" and python_version < "3.13" typer==0.6.1 ; python_version >= "3.9" and python_version < "3.13" typing-extensions==4.9.0 ; python_version >= "3.9" and python_version < "3.13" urllib3==2.2.0 ; python_version >= "3.9" and python_version < "3.13" diff --git a/server/requirements_rocm.txt b/server/requirements_rocm.txt index dd17ba18..3912abd8 100644 --- a/server/requirements_rocm.txt +++ b/server/requirements_rocm.txt @@ -1,13 +1,9 @@ -annotated-types==0.6.0 ; python_version >= "3.9" and python_version < "3.13" -attrs==23.2.0 ; python_version >= "3.9" and python_version < "3.13" backoff==2.2.1 ; python_version >= "3.9" and python_version < "3.13" certifi==2024.2.2 ; python_version >= "3.9" and python_version < "3.13" charset-normalizer==3.3.2 ; python_version >= "3.9" and python_version < "3.13" click==8.1.7 ; python_version >= "3.9" and python_version < "3.13" -cloudpickle==3.0.0 ; python_version >= "3.9" and python_version < "3.13" colorama==0.4.6 ; python_version >= "3.9" and python_version < "3.13" and (sys_platform == "win32" or platform_system == "Windows") deprecated==1.2.14 ; python_version >= "3.9" and python_version < "3.13" -diskcache==5.6.3 ; python_version >= "3.9" and python_version < "3.13" einops==0.6.1 ; python_version >= "3.9" and python_version < "3.13" filelock==3.13.1 ; python_version >= "3.9" and python_version < "3.13" fsspec==2023.10.0 ; python_version >= "3.9" and python_version < "3.13" @@ -19,32 +15,8 @@ grpcio==1.60.1 ; python_version >= "3.9" and python_version < "3.13" hf-transfer==0.1.5 ; python_version >= "3.9" and python_version < "3.13" huggingface-hub==0.19.4 ; python_version >= "3.9" and python_version < "3.13" idna==3.6 ; python_version >= "3.9" and python_version < "3.13" -interegular==0.3.3 ; python_version >= "3.9" and python_version < "3.13" -jinja2==3.1.3 ; python_version >= "3.9" and python_version < "3.13" -joblib==1.3.2 ; python_version >= "3.9" and python_version < "3.13" -jsonschema-specifications==2023.12.1 ; python_version >= "3.9" and python_version < "3.13" -jsonschema==4.21.1 ; python_version >= "3.9" and python_version < "3.13" -lark==1.1.9 ; python_version >= "3.9" and python_version < "3.13" -llvmlite==0.42.0 ; python_version >= "3.9" and python_version < "3.13" loguru==0.6.0 ; python_version >= "3.9" and python_version < "3.13" -markupsafe==2.1.5 ; python_version >= "3.9" and python_version < "3.13" -mpmath==1.3.0 ; python_version >= "3.9" and python_version < "3.13" -nest-asyncio==1.6.0 ; python_version >= "3.9" and python_version < "3.13" -networkx==3.2.1 ; python_version >= "3.9" and python_version < "3.13" -numba==0.59.0 ; python_version >= "3.9" and python_version < "3.13" numpy==1.26.4 ; python_version >= "3.9" and python_version < "3.13" -nvidia-cublas-cu12==12.1.3.1 ; platform_system == "Linux" and platform_machine == "x86_64" and python_version >= "3.9" and python_version < "3.13" -nvidia-cuda-cupti-cu12==12.1.105 ; platform_system == "Linux" and platform_machine == "x86_64" and python_version >= "3.9" and python_version < "3.13" -nvidia-cuda-nvrtc-cu12==12.1.105 ; platform_system == "Linux" and platform_machine == "x86_64" and python_version >= "3.9" and python_version < "3.13" -nvidia-cuda-runtime-cu12==12.1.105 ; platform_system == "Linux" and platform_machine == "x86_64" and python_version >= "3.9" and python_version < "3.13" -nvidia-cudnn-cu12==8.9.2.26 ; platform_system == "Linux" and platform_machine == "x86_64" and python_version >= "3.9" and python_version < "3.13" -nvidia-cufft-cu12==11.0.2.54 ; platform_system == "Linux" and platform_machine == "x86_64" and python_version >= "3.9" and python_version < "3.13" -nvidia-curand-cu12==10.3.2.106 ; platform_system == "Linux" and platform_machine == "x86_64" and python_version >= "3.9" and python_version < "3.13" -nvidia-cusolver-cu12==11.4.5.107 ; platform_system == "Linux" and platform_machine == "x86_64" and python_version >= "3.9" and python_version < "3.13" -nvidia-cusparse-cu12==12.1.0.106 ; platform_system == "Linux" and platform_machine == "x86_64" and python_version >= "3.9" and python_version < "3.13" -nvidia-nccl-cu12==2.19.3 ; platform_system == "Linux" and platform_machine == "x86_64" and python_version >= "3.9" and python_version < "3.13" -nvidia-nvjitlink-cu12==12.3.101 ; platform_system == "Linux" and platform_machine == "x86_64" and python_version >= "3.9" and python_version < "3.13" -nvidia-nvtx-cu12==12.1.105 ; platform_system == "Linux" and platform_machine == "x86_64" and python_version >= "3.9" and python_version < "3.13" opentelemetry-api==1.15.0 ; python_version >= "3.9" and python_version < "3.13" opentelemetry-exporter-otlp-proto-grpc==1.15.0 ; python_version >= "3.9" and python_version < "3.13" opentelemetry-exporter-otlp-proto-http==1.15.0 ; python_version >= "3.9" and python_version < "3.13" @@ -54,27 +26,19 @@ opentelemetry-instrumentation==0.36b0 ; python_version >= "3.9" and python_versi opentelemetry-proto==1.15.0 ; python_version >= "3.9" and python_version < "3.13" opentelemetry-sdk==1.15.0 ; python_version >= "3.9" and python_version < "3.13" opentelemetry-semantic-conventions==0.36b0 ; python_version >= "3.9" and python_version < "3.13" -outlines==0.0.27 ; python_version >= "3.9" and python_version < "3.13" packaging==23.2 ; python_version >= "3.9" and python_version < "3.13" pillow==10.2.0 ; python_version >= "3.9" and python_version < "3.13" protobuf==4.25.3 ; python_version >= "3.9" and python_version < "3.13" -pydantic-core==2.16.2 ; python_version >= "3.9" and python_version < "3.13" -pydantic==2.6.1 ; python_version >= "3.9" and python_version < "3.13" pyyaml==6.0.1 ; python_version >= "3.9" and python_version < "3.13" -referencing==0.33.0 ; python_version >= "3.9" and python_version < "3.13" regex==2023.12.25 ; python_version >= "3.9" and python_version < "3.13" requests==2.31.0 ; python_version >= "3.9" and python_version < "3.13" -rpds-py==0.18.0 ; python_version >= "3.9" and python_version < "3.13" safetensors==0.3.3 ; python_version >= "3.9" and python_version < "3.13" scipy==1.12.0 ; python_version >= "3.9" and python_version < "3.13" sentencepiece==0.1.99 ; python_version >= "3.9" and python_version < "3.13" setuptools==69.1.0 ; python_version >= "3.9" and python_version < "3.13" -sympy==1.12 ; python_version >= "3.9" and python_version < "3.13" tokenizers==0.15.2 ; python_version >= "3.9" and python_version < "3.13" -torch==2.2.0 ; python_version >= "3.9" and python_version < "3.13" tqdm==4.66.2 ; python_version >= "3.9" and python_version < "3.13" transformers==4.37.1 ; python_version >= "3.9" and python_version < "3.13" -triton==2.2.0 ; platform_system == "Linux" and platform_machine == "x86_64" and python_version >= "3.9" and python_version < "3.13" typer==0.6.1 ; python_version >= "3.9" and python_version < "3.13" typing-extensions==4.9.0 ; python_version >= "3.9" and python_version < "3.13" urllib3==2.2.0 ; python_version >= "3.9" and python_version < "3.13" diff --git a/server/text_generation_server/models/causal_lm.py b/server/text_generation_server/models/causal_lm.py index a0f0c9e8..3de45921 100644 --- a/server/text_generation_server/models/causal_lm.py +++ b/server/text_generation_server/models/causal_lm.py @@ -415,14 +415,14 @@ class CausalLMBatch(Batch): # We slice the keys to remove the padding from previous batches past_seq_len = batch.max_input_length - 1 if batch.keys_head_dim_last: - padded_past_keys[start_index:end_index, :, -past_seq_len:, :] = ( - past_keys[:, :, -past_seq_len:, :] - ) + padded_past_keys[ + start_index:end_index, :, -past_seq_len:, : + ] = past_keys[:, :, -past_seq_len:, :] else: # BLOOM case - padded_past_keys[start_index:end_index, :, :, -past_seq_len:] = ( - past_keys[:, :, :, -past_seq_len:] - ) + padded_past_keys[ + start_index:end_index, :, :, -past_seq_len: + ] = past_keys[:, :, :, -past_seq_len:] del past_keys start_index = end_index @@ -440,9 +440,9 @@ class CausalLMBatch(Batch): end_index = start_index + len(batch) # We slice the past values to remove the padding from previous batches past_seq_len = batch.max_input_length - 1 - padded_past_values[start_index:end_index, :, -past_seq_len:, :] = ( - past_values[:, :, -past_seq_len:, :] - ) + padded_past_values[ + start_index:end_index, :, -past_seq_len:, : + ] = past_values[:, :, -past_seq_len:, :] del past_values # Update values diff --git a/server/text_generation_server/models/flash_causal_lm.py b/server/text_generation_server/models/flash_causal_lm.py index b8d0be22..5168a33d 100644 --- a/server/text_generation_server/models/flash_causal_lm.py +++ b/server/text_generation_server/models/flash_causal_lm.py @@ -1017,9 +1017,9 @@ class FlashCausalLM(Model): # Copy batch.input_ids to prefill_token_indices if prefill_logprobs: if len(batch) > 1: - prefill_tokens_indices[out_start_index : out_end_index - 1] = ( - batch.input_ids[start_index + 1 : start_index + out_length] - ) + prefill_tokens_indices[ + out_start_index : out_end_index - 1 + ] = batch.input_ids[start_index + 1 : start_index + out_length] else: # Set prefill_tokens_indices to the correct slice prefill_tokens_indices = batch.input_ids[ diff --git a/server/text_generation_server/utils/logits_process.py b/server/text_generation_server/utils/logits_process.py index 950c074d..40f31ce2 100644 --- a/server/text_generation_server/utils/logits_process.py +++ b/server/text_generation_server/utils/logits_process.py @@ -1,10 +1,8 @@ import math import torch -import json from loguru import logger -from functools import lru_cache -from typing import Optional, List, Dict, Union +from typing import Dict, Union from text_generation_server.pb.generate_pb2 import GrammarType from outlines.fsm.fsm import RegexFSM @@ -492,7 +490,7 @@ class GrammarLogitProcessor(LogitsProcessor): if fsm_grammar_state == -1 or self.fsm is None: return logits allowed_tokens = self.fsm.allowed_token_ids(fsm_grammar_state) - mask = torch.full((logits.shape[-1],), -math.inf, device=self.device) + mask = torch.full_like(logits, -math.inf) mask[allowed_tokens] = 0 biased_scores = logits + mask return biased_scores @@ -550,22 +548,15 @@ class GrammarLogitProcessor(LogitsProcessor): logger.debug(f"Adapted tokenizer in {time.time() - start_time:.2f}s") return tokenizer - def filter(self, indices): - new_fsms = [] - for i in indices: - new_fsms.append(self.fsms[i]) - self.fsms = new_fsms - return self - class HeterogeneousGrammarLogitProcessor(LogitsProcessor): - def __init__(self, tokenizer, device, grammars, grammar_type): + def __init__(self, tokenizer, device, grammars, grammar_types): self.device = device self.tokenizer = GrammarLogitProcessor._cached_adapt_tokenizer(tokenizer) self.fsms = [] - for i in range(len(grammars)): + for grammar, grammar_type in zip(grammars, grammar_types): fsm = GrammarLogitProcessor._cached_compile_fsm( - grammar_type[i], grammars[i], self.tokenizer + grammar_type, grammar, self.tokenizer ) self.fsms.append(fsm) @@ -573,7 +564,6 @@ class HeterogeneousGrammarLogitProcessor(LogitsProcessor): self, logits: torch.Tensor, fsm_grammar_states: List[int], - mask: torch.Tensor, ): mask = torch.full_like(logits, -math.inf) for i in range(logits.shape[0]): @@ -585,7 +575,7 @@ class HeterogeneousGrammarLogitProcessor(LogitsProcessor): logits += mask return logits - def advance_batch(self, next_token_ids, fsm_grammar_states, grammars): + def advance_batch(self, next_token_ids, fsm_grammar_states): return [ GrammarLogitProcessor._advance( next_token_ids[i], fsm_grammar_states[i], self.fsms[i] @@ -599,4 +589,8 @@ class HeterogeneousGrammarLogitProcessor(LogitsProcessor): ) def filter(self, indices): - return GrammarLogitProcessor.filter(self, indices) + new_fsms = [] + for i in indices: + new_fsms.append(self.fsms[i]) + self.fsms = new_fsms + return self