From 5788c942a53e98bfb3c63d04d9d111c1c53465ed Mon Sep 17 00:00:00 2001
From: Mohit Sharma
Date: Tue, 6 Aug 2024 10:29:46 +0000
Subject: [PATCH] fix issues

---
 Dockerfile_amd                                 | 35 +++++++++----------
 .../layers/attention/rocm.py                   |  2 +-
 .../models/flash_causal_lm.py                  |  2 +-
 3 files changed, 19 insertions(+), 20 deletions(-)

diff --git a/Dockerfile_amd b/Dockerfile_amd
index 399bc8695..efc80234b 100644
--- a/Dockerfile_amd
+++ b/Dockerfile_amd
@@ -98,26 +98,25 @@ RUN pip uninstall -y triton && \
     cd triton/python && \
     pip install .
 
-# RUN git clone --depth 1 --recursive --single-branch --branch 2.3-patched https://github.com/fxmarty/pytorch.git pytorch && cd pytorch && pip install -r requirements.txt --no-cache-dir
+RUN git clone --depth 1 --recursive --single-branch --branch main https://github.com/pytorch/pytorch.git pytorch && cd pytorch && pip install -r requirements.txt --no-cache-dir && \
+    git checkout da320214e66b5af0f7db8fd18a64dbb519d17b27
 
-# ARG _GLIBCXX_USE_CXX11_ABI="1"
-# ARG CMAKE_PREFIX_PATH="/opt/conda"
-# ARG PYTORCH_ROCM_ARCH="gfx90a;gfx942"
-# ARG BUILD_CAFFE2="0" \
-#     BUILD_CAFFE2_OPS="0" \
-#     USE_CUDA="0" \
-#     USE_ROCM="1" \
-#     BUILD_TEST="0" \
-#     USE_FBGEMM="0" \
-#     USE_NNPACK="0" \
-#     USE_QNNPACK="0" \
-#     USE_XNNPACK="0" \
-#     USE_FLASH_ATTENTION="1" \
-#     USE_MEM_EFF_ATTENTION="0"
+ARG _GLIBCXX_USE_CXX11_ABI="1"
+ARG CMAKE_PREFIX_PATH="/opt/conda"
+ARG PYTORCH_ROCM_ARCH="gfx90a;gfx942"
+ARG BUILD_CAFFE2="0" \
+    BUILD_CAFFE2_OPS="0" \
+    USE_CUDA="0" \
+    USE_ROCM="1" \
+    BUILD_TEST="0" \
+    USE_FBGEMM="0" \
+    USE_NNPACK="0" \
+    USE_QNNPACK="0" \
+    USE_XNNPACK="0" \
+    USE_FLASH_ATTENTION="1" \
+    USE_MEM_EFF_ATTENTION="0"
 
-# RUN cd pytorch && python tools/amd_build/build_amd.py && python setup.py install
-
-RUN pip install --pre torch torchvision torchaudio --index-url https://download.pytorch.org/whl/nightly/rocm6.1
+RUN cd pytorch && python tools/amd_build/build_amd.py && python setup.py install
 
 # Set AS recommended: https://github.com/ROCm/triton/wiki/A-script-to-set-program-execution-environment-in-ROCm
 ENV HIP_FORCE_DEV_KERNARG=1
diff --git a/server/text_generation_server/layers/attention/rocm.py b/server/text_generation_server/layers/attention/rocm.py
index 77ba4c925..da8a4bcd2 100644
--- a/server/text_generation_server/layers/attention/rocm.py
+++ b/server/text_generation_server/layers/attention/rocm.py
@@ -208,7 +208,7 @@ if ENGINE == "ck":
             softcap,
             False,
             None,
-        )
+        )[0]
 
 elif ENGINE == "triton":
     from .flash_attn_triton import triton_attention
diff --git a/server/text_generation_server/models/flash_causal_lm.py b/server/text_generation_server/models/flash_causal_lm.py
index 9b8704478..174bba65f 100644
--- a/server/text_generation_server/models/flash_causal_lm.py
+++ b/server/text_generation_server/models/flash_causal_lm.py
@@ -1159,7 +1159,7 @@ class FlashCausalLM(Model):
 
             log_master(
                 logger.info,
-                f"PyTorch TunableOp (https://github.com/fxmarty/pytorch/tree/2.3-patched/aten/src/ATen/cuda/tunable) is enabled. The warmup may take several minutes, picking the ROCm optimal matrix multiplication kernel for the target lengths {', '.join([str(seqlen) for seqlen in tuning_sequences])}, with typical 5-8% latency improvement for small sequence lengths. The picked GEMMs are saved in the file {tunableop_filepath}. To disable TunableOp, please launch TGI with `PYTORCH_TUNABLEOP_ENABLED=0`.",
+                f"PyTorch TunableOp is enabled. The warmup may take several minutes, picking the ROCm optimal matrix multiplication kernel for the target lengths {', '.join([str(seqlen) for seqlen in tuning_sequences])}, with typical 5-8% latency improvement for small sequence lengths. The picked GEMMs are saved in the file {tunableop_filepath}. To disable TunableOp, please launch TGI with `PYTORCH_TUNABLEOP_ENABLED=0`.",
             )
 
             if os.path.isfile(tunableop_filepath):