Skip to content

Commit 14bce17

Browse files
committed
add xtuner2sharegpt.py
1 parent a3ead4f commit 14bce17

File tree

1 file changed

+59
-0
lines changed

1 file changed

+59
-0
lines changed

scripts/xtuner2sharegpt.py

+59
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,59 @@
1+
import json
2+
3+
# Given JSON data in string format
4+
# original_json_data = """
5+
# [
6+
# {
7+
# "conversation": [
8+
# {"system": "system", "input": "input", "output": "output"},
9+
# {"input": "input", "output": "output"},
10+
# {"input": "input", "output": "output"}
11+
# ]
12+
# },
13+
# {
14+
# "conversation": [
15+
# {"system": "system", "input": "input", "output": "output"},
16+
# {"input": "input", "output": "output"},
17+
# {"input": "input", "output": "output"}
18+
# ]
19+
# }
20+
# ]
21+
# """
22+
23+
# Parse the original JSON data into Python objects
24+
def convert_xtuner_to_sharegpt(input_path, output_path):
    """Convert an XTuner-style conversation file into ShareGPT-style JSON.

    The input file must contain a JSON list of objects, each holding a
    ``"conversation"`` list of turns with ``"input"`` and ``"output"`` keys.
    The first turn may additionally carry a ``"system"`` prompt.

    Every input object becomes one output object with two keys:
    ``"conversations"`` (a flat list of alternating ``human``/``gpt``
    messages) and ``"system"`` (the prompt of the first turn, or ``""`` when
    the first turn has none or the conversation is empty).

    Args:
        input_path: Path of the UTF-8 encoded JSON file to read.
        output_path: Path of the UTF-8 encoded JSON file to write; an
            existing file is overwritten.

    Raises:
        KeyError: If an object has no ``"conversation"`` key, or a turn has
            no ``"input"`` or ``"output"`` key.
    """
    with open(input_path, "r", encoding="utf-8") as input_file:
        data = json.load(input_file)

    transformed_conversations = []
    for conversation_group in data:
        turns = conversation_group["conversation"]

        # The system prompt is optional: fall back to an empty string instead
        # of raising KeyError (first turn without "system") or IndexError
        # (conversation with no turns at all).
        system = turns[0].get("system", "") if turns else ""

        # Flatten each input/output pair into two consecutive messages.
        transformed_pairs = []
        for pair in turns:
            transformed_pairs.append({"from": "human", "value": pair["input"]})
            transformed_pairs.append({"from": "gpt", "value": pair["output"]})

        transformed_conversations.append(
            {
                "conversations": transformed_pairs,
                "system": system,
            }
        )

    # ensure_ascii=False keeps non-ASCII text readable in the output file.
    with open(output_path, "w", encoding="utf-8") as output_file:
        json.dump(
            transformed_conversations,
            output_file,
            ensure_ascii=False,
            indent=4,
        )
53+
54+
55+
if __name__ == "__main__":
    # Script entry point: convert the scientist dataset from XTuner format
    # to ShareGPT format. Both paths are relative to the working directory.
    convert_xtuner_to_sharegpt(
        input_path="../datasets/scientist.json",
        output_path="../datasets/scientist_sharegpt.json",
    )
59+

0 commit comments

Comments
 (0)