text-generation-inference/load_tests/filter.py

import json


def main():
    with open("./ShareGPT_V3_unfiltered_cleaned_split.json", "r") as f:
        data = json.load(f)

    # Select only the first 2k conversations that start with a human.
    max = 2000
    conversations = []
    for conversation in data:
        conv = conversation.get("conversations")
        if conv and conv[0]["from"] == "human":
            # Trim the rest of the output
            conversation["conversations"] = conversation["conversations"][:1]
            conversations.append(conversation)

            if len(conversation) >= max:
                break

    with open("./small.json", "w") as f:
        data = json.dump(conversations, f, indent=4)


if __name__ == "__main__":
    main()