diff --git a/.gitignore b/.gitignore index 2ac2f6b4..e9ad1808 100644 --- a/.gitignore +++ b/.gitignore @@ -13,3 +13,4 @@ server/exllama_kernels/exllama_kernels/hip_buffers.cuh server/exllama_kernels/exllama_kernels/exllama_ext_hip.cpp data/ +load_tests/*.json diff --git a/load_tests/Makefile b/load_tests/Makefile new file mode 100644 index 00000000..81d16b87 --- /dev/null +++ b/load_tests/Makefile @@ -0,0 +1,6 @@ + +ShareGPT_V3_unfiltered_cleaned_split.json: + wget https://huggingface.co/datasets/anon8231489123/ShareGPT_Vicuna_unfiltered/resolve/main/ShareGPT_V3_unfiltered_cleaned_split.json + +prepare_data: ShareGPT_V3_unfiltered_cleaned_split.json + python filter.py diff --git a/load_tests/filter.py b/load_tests/filter.py new file mode 100644 index 00000000..a00226ed --- /dev/null +++ b/load_tests/filter.py @@ -0,0 +1,26 @@ +import json + + +def main(): + with open("./ShareGPT_V3_unfiltered_cleaned_split.json", "r") as f: + data = json.load(f) + + # Select only the first 2k conversations that start with a human. + max = 2000 + conversations = [] + for conversation in data: + conv = conversation.get("conversations") + if conv and conv[0]["from"] == "human": + # Trim the rest of the output + conversation["conversations"] = conversation["conversations"][:1] + conversations.append(conversation) + + if len(conversation) >= max: + break + + with open("./small.json", "w") as f: + data = json.dump(conversations, f, indent=4) + + +if __name__ == "__main__": + main()