1
+ import json
2
+
3
+ # Given JSON data in string format
4
+ # original_json_data = """
5
+ # [
6
+ # {
7
+ # "conversation": [
8
+ # {"system": "system", "input": "input", "output": "output"},
9
+ # {"input": "input", "output": "output"},
10
+ # {"input": "input", "output": "output"}
11
+ # ]
12
+ # },
13
+ # {
14
+ # "conversation": [
15
+ # {"system": "system", "input": "input", "output": "output"},
16
+ # {"input": "input", "output": "output"},
17
+ # {"input": "input", "output": "output"}
18
+ # ]
19
+ # }
20
+ # ]
21
+ # """
22
+
23
+ # Convert an XTuner-format conversation dataset (JSON file) into ShareGPT format
24
def _convert_conversation_group(conversation_group):
    """Convert one XTuner conversation group into a ShareGPT-style record.

    Args:
        conversation_group: A dict with a "conversation" key holding a list
            of turns. Every turn must provide "input" and "output"; the
            first turn may additionally provide "system".

    Returns:
        A dict with "conversations" (alternating "human"/"gpt" messages, in
        turn order) and "system" (the system prompt, or "" when absent).

    Raises:
        KeyError: If "conversation" is missing, or a turn lacks "input" or
            "output".
    """
    turns = conversation_group["conversation"]

    # The system prompt is read from the first turn only. It is treated as
    # optional so that groups without one (or with no turns at all) do not
    # abort the whole conversion; the key is still emitted to keep the
    # output schema uniform.
    system = turns[0].get("system", "") if turns else ""

    messages = []
    for turn in turns:
        messages.append({"from": "human", "value": turn["input"]})
        messages.append({"from": "gpt", "value": turn["output"]})

    return {
        "conversations": messages,
        "system": system,
    }


def convert_xtuner_to_sharegpt(input_path, output_path):
    """Convert an XTuner-format JSON dataset file into a ShareGPT-format file.

    The input file must contain a JSON list of conversation groups, each of
    the form ``{"conversation": [{"system": ..., "input": ..., "output": ...},
    {"input": ..., "output": ...}, ...]}``. Each group becomes one record
    ``{"conversations": [{"from": "human", ...}, {"from": "gpt", ...}, ...],
    "system": ...}`` in the output list.

    Args:
        input_path: Path of the UTF-8 encoded XTuner JSON file to read.
        output_path: Path of the JSON file to write (overwritten if it
            exists). Non-ASCII text is written as-is, indented by 4 spaces.

    Raises:
        OSError: If either file cannot be opened.
        json.JSONDecodeError: If the input file is not valid JSON.
        KeyError: If a group lacks "conversation", or a turn lacks "input"
            or "output".
    """
    with open(input_path, "r", encoding="utf-8") as input_file:
        data = json.load(input_file)

    transformed_conversations = [
        _convert_conversation_group(conversation_group)
        for conversation_group in data
    ]

    with open(output_path, "w", encoding="utf-8") as output_file:
        json.dump(
            transformed_conversations,
            output_file,
            ensure_ascii=False,
            indent=4,
        )
53
+
54
+
55
if __name__ == "__main__":
    # Script entry point: convert the bundled "scientist" dataset. Both
    # paths are relative to the current working directory.
    convert_xtuner_to_sharegpt(
        "../datasets/scientist.json",
        "../datasets/scientist_sharegpt.json",
    )
59
+
0 commit comments