// Adapted from turboderp exllama: https://github.com/turboderp/exllama

#ifndef _util_cuh
#define _util_cuh

#include <cuda_runtime.h>
#include <cuda_fp16.h>
#include <cstdint>
#include <cstdio>

#if defined(USE_ROCM)
#define cudaUnspecified hipErrorUnknown
#else
#define cudaUnspecified cudaErrorApiFailureBase
#endif

// React to failure on return code != cudaSuccess.
// Assumes the enclosing function declares a cudaError_t named _cuda_err
// and provides a _cuda_fail: label for error handling.

#define _cuda_check(fn) \
do { \
    {_cuda_err = fn;} \
    if (_cuda_err != cudaSuccess) goto _cuda_fail; \
} while(false)

// React to failure on return code == 0 (e.g. a NULL pointer from an allocator).

#define _alloc_check(fn) \
do { \
    if (!(fn)) { _cuda_err = cudaUnspecified; goto _cuda_fail; } \
    else _cuda_err = cudaSuccess; \
} while(false)

#endif
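
// --------------------------------------------------------------------------
// Usage sketch (hypothetical, not part of util.cuh): the macros above rely on
// a naming convention rather than on arguments. The enclosing function must
// declare a cudaError_t named _cuda_err and provide a _cuda_fail: cleanup
// label. The function below is illustrative only; in a real translation unit
// you would #include "util.cuh" rather than paste the macros inline.

static cudaError_t alloc_and_zero(void** out, size_t bytes)
{
    cudaError_t _cuda_err = cudaSuccess;  // name required by the macros
    void* buf = NULL;

    _alloc_check(out);                     // fails with cudaUnspecified if out is NULL
    _cuda_check(cudaMalloc(&buf, bytes));  // jumps to _cuda_fail on any CUDA error
    _cuda_check(cudaMemset(buf, 0, bytes));

    *out = buf;
    return cudaSuccess;

_cuda_fail:
    // Centralized cleanup path targeted by both macros.
    if (buf) cudaFree(buf);
    fprintf(stderr, "CUDA error: %s\n", cudaGetErrorString(_cuda_err));
    return _cuda_err;
}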