Adding scripts to prepare load data.

This commit is contained in:
Nicolas Patry 2024-05-01 09:11:57 +00:00
parent 6073ece4fc
commit ab156adc0f
3 changed files with 33 additions and 0 deletions

1
.gitignore vendored
View File

@@ -13,3 +13,4 @@ server/exllama_kernels/exllama_kernels/hip_buffers.cuh
server/exllama_kernels/exllama_kernels/exllama_ext_hip.cpp
data/
load_tests/*.json

6
load_tests/Makefile Normal file
View File

@@ -0,0 +1,6 @@
# Download the full ShareGPT dataset from the Hugging Face Hub.
# NOTE(review): recipe lines must be tab-indented in a real Makefile;
# the indentation was lost in extraction and is restored here.
ShareGPT_V3_unfiltered_cleaned_split.json:
	wget https://huggingface.co/datasets/anon8231489123/ShareGPT_Vicuna_unfiltered/resolve/main/ShareGPT_V3_unfiltered_cleaned_split.json

# Produce the trimmed load-test dataset (small.json) from the raw download.
prepare_data: ShareGPT_V3_unfiltered_cleaned_split.json
	python filter.py

26
load_tests/filter.py Normal file
View File

@@ -0,0 +1,26 @@
import json


def main(
    input_path: str = "./ShareGPT_V3_unfiltered_cleaned_split.json",
    output_path: str = "./small.json",
    limit: int = 2000,
) -> None:
    """Filter a ShareGPT dump down to a small, prompt-only subset.

    Reads the conversations in ``input_path``, keeps up to ``limit``
    conversations whose first turn comes from a human, trims each kept
    conversation to that opening human message, and writes the result
    to ``output_path`` as indented JSON.

    Args:
        input_path: Path to the full ShareGPT JSON file.
        output_path: Path the filtered subset is written to.
        limit: Maximum number of conversations to keep (default 2000,
            matching the original script's hard-coded cap).
    """
    with open(input_path, "r") as f:
        data = json.load(f)

    # Select only the first `limit` conversations that start with a human.
    conversations = []
    for conversation in data:
        conv = conversation.get("conversations")
        if conv and conv[0]["from"] == "human":
            # Keep only the opening human prompt; drop the remaining turns.
            conversation["conversations"] = conversation["conversations"][:1]
            conversations.append(conversation)
        # BUG FIX: the original tested `len(conversation) >= max`, i.e. the
        # key count of a single conversation dict, so the cap never fired.
        # The intended check is on the number of collected conversations.
        # (`max` was also renamed to `limit` to stop shadowing the builtin.)
        if len(conversations) >= limit:
            break

    with open(output_path, "w") as f:
        # json.dump returns None; the original pointlessly assigned it.
        json.dump(conversations, f, indent=4)


if __name__ == "__main__":
    main()