Mirror of https://github.com/huggingface/text-generation-inference.git (synced 2025-09-11 12:24:53 +00:00)
Adding scripts to prepare load data.
commit ab156adc0f
parent 6073ece4fc
.gitignore (vendored) | 1 +
@@ -13,3 +13,4 @@ server/exllama_kernels/exllama_kernels/hip_buffers.cuh
 server/exllama_kernels/exllama_kernels/exllama_ext_hip.cpp
 
 data/
+load_tests/*.json
load_tests/Makefile (new file) | 6 +
@@ -0,0 +1,6 @@
+ShareGPT_V3_unfiltered_cleaned_split.json:
+	wget https://huggingface.co/datasets/anon8231489123/ShareGPT_Vicuna_unfiltered/resolve/main/ShareGPT_V3_unfiltered_cleaned_split.json
+
+prepare_data: ShareGPT_V3_unfiltered_cleaned_split.json
+	python filter.py
+
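With this Makefile, running `make prepare_data` from `load_tests/` downloads the ShareGPT dump only if it is not already present (make's file-based dependency check) and then runs `filter.py`. A minimal sketch of the same two steps in plain Python, for environments without make or wget (the helper script below is an illustration, not part of the commit):

# Sketch of the Makefile's two steps, assuming the same file names as the commit.
import subprocess
import urllib.request
from pathlib import Path

DATASET = "ShareGPT_V3_unfiltered_cleaned_split.json"
URL = (
    "https://huggingface.co/datasets/anon8231489123/"
    "ShareGPT_Vicuna_unfiltered/resolve/main/" + DATASET
)

if not Path(DATASET).exists():  # mirrors make's "target already built" check
    urllib.request.urlretrieve(URL, DATASET)
subprocess.run(["python", "filter.py"], check=True)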
load_tests/filter.py (new file) | 26 +
@@ -0,0 +1,26 @@
+import json
+
+
+def main():
+    with open("./ShareGPT_V3_unfiltered_cleaned_split.json", "r") as f:
+        data = json.load(f)
+
+    # Select only the first 2k conversations that start with a human.
+    max_conversations = 2000
+    conversations = []
+    for conversation in data:
+        conv = conversation.get("conversations")
+        if conv and conv[0]["from"] == "human":
+            # Keep only the first (human) message; drop the rest of the conversation.
+            conversation["conversations"] = conversation["conversations"][:1]
+            conversations.append(conversation)
+
+        if len(conversations) >= max_conversations:
+            break
+
+    with open("./small.json", "w") as f:
+        json.dump(conversations, f, indent=4)
+
+
+if __name__ == "__main__":
+    main()
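As a quick sanity check on the script's output (an illustration, not part of the commit), the generated small.json can be reloaded to confirm it holds at most 2000 single-message conversations, each opened by a human turn:

# Hypothetical verification of filter.py's output.
import json

with open("./small.json", "r") as f:
    small = json.load(f)

assert len(small) <= 2000
for conversation in small:
    conv = conversation["conversations"]
    assert len(conv) == 1  # filter.py keeps only the first message
    assert conv[0]["from"] == "human"
print(f"{len(small)} conversations ready for load testing")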