Commit bd12e8b

Author: Steven I Reeves
Merge pull request huggingface#7 from ROCmSoftwarePlatform/gpt2-tf2
Updating GPT2-TF2 Scripts
2 parents ee5302e + 95080f2 commit bd12e8b

File tree

5 files changed: +155 -4 lines changed


scripts/gpt2-tf2/gpt2_1step.py

Lines changed: 58 additions & 0 deletions
@@ -0,0 +1,58 @@
import sys

import numpy as np
import jsonlines as jsonl
from transformers import GPT2TokenizerFast, TFGPT2LMHeadModel
import tensorflow as tf
from tensorflow.keras import metrics

BATCH_SIZE = 1

def get_dataset(fil):
    data = []
    with jsonl.open(fil) as reader:
        for line in reader:
            data.append(line['text'])
    return data

if len(sys.argv) == 1:
    model_size = "Small"
    data_dir = '/dockerx/data/'
else:
    model_size = sys.argv[1]
    data_dir = sys.argv[2]

if model_size == "Small":
    model_name = "gpt2"
    train_file = data_dir + 'small-117M.train.jsonl'
    test_file = data_dir + 'small-117M.test.jsonl'
elif model_size == "Medium":
    model_name = "gpt2-medium"
    train_file = data_dir + 'medium-345M.train.jsonl'
    test_file = data_dir + 'medium-345M.test.jsonl'
elif model_size == "Large":
    model_name = "gpt2-large"
    train_file = data_dir + 'large-762M.train.jsonl'
    test_file = data_dir + 'large-762M.test.jsonl'
elif model_size == "XL":
    model_name = 'gpt2-xl'
    train_file = data_dir + 'xl-1542M.train.jsonl'
    test_file = data_dir + 'xl-1542M.test.jsonl'
print("Profiling model " + model_name)

tokenizer = GPT2TokenizerFast.from_pretrained(model_name)
tokenizer.pad_token = tokenizer.eos_token

def tokenize(data):
    # Tokenize only the first sample; a single step is enough for profiling.
    data = tokenizer(data[0], return_tensors='tf', padding=True, truncation=True)
    return tf.data.Dataset.from_tensor_slices((dict(data), data['input_ids']))

train_dataset = tokenize(get_dataset(train_file)).batch(BATCH_SIZE)
model = TFGPT2LMHeadModel.from_pretrained(model_name)
# Suppresses the past_key_values from being expressed in the progress bar
model.config.use_cache = False
optimizer = tf.keras.optimizers.Adam(learning_rate=3e-5)
loss = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True)
metric = metrics.SparseCategoricalAccuracy(name='Accuracy')
model.compile(optimizer=optimizer, loss=[loss, *[None] * model.config.n_layer])
model.fit(train_dataset, batch_size=1, epochs=1)
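The script takes the model size and data directory as positional arguments, defaulting to the small model with data under /dockerx/data/. A typical invocation (assuming the GPT-2 output .jsonl datasets are already in place) would be:

python3 gpt2_1step.py Small /dockerx/data/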

scripts/gpt2-tf2/gpt2_profile.py

Lines changed: 5 additions & 0 deletions
@@ -0,0 +1,5 @@
1+
import sys
2+
import pandas as pd
3+
profile_dir = sys.argv[1]
4+
df = pd.read_csv(profile_dir+'results.stats.csv')
5+
print('Total time for one step GPT2', sum(df["TotalDurationNs"])*1e-9, 's')
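This assumes rocprof was run with --stats so that results.stats.csv exists in the given directory; with rocprof's default output location (the working directory), a run would look like:

rocprof --stats python3 gpt2_1step.py Small /dockerx/data/
python3 gpt2_profile.py ./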

scripts/gpt2-tf2/gpt2_train.py

Lines changed: 3 additions & 4 deletions
@@ -1,9 +1,9 @@
 import sys
 import numpy as np
+import jsonlines as jsonl
 from transformers import GPT2TokenizerFast, TFGPT2LMHeadModel
 import tensorflow as tf
 from tensorflow.keras import metrics
-import jsonlines as jsonl
 
 BATCH_SIZE=1
 
@@ -69,8 +69,7 @@ def tokenize(data, truncate=False):
 print("========================= Compiling Model ============================")
 model.compile(optimizer=optimizer, loss=[loss, *[None] * model.config.n_layer], metrics=[metric])
 print("========================= Finetuning Model ==================================")
-model.fit(train_dataset, batch_size=64, epochs=num_epochs)#, testation_data=test_dataset)
+model.fit(train_dataset, batch_size=64, epochs=num_epochs)
 print("========================= Evaluating Model ==================================")
 info = model.evaluate(test_dataset, verbose=2)
-#print("========================= Saving Model ======================================")
-#model.save(model_name+'finetuned')
+
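The compile call retained in this hunk pairs the real loss with n_layer None entries. A minimal self-contained sketch of that pattern, assuming the TF2 transformers behavior where TFGPT2LMHeadModel returns the logits plus one past_key_values output per layer and Keras matches the loss list to outputs positionally:

import tensorflow as tf
from transformers import TFGPT2LMHeadModel

model = TFGPT2LMHeadModel.from_pretrained("gpt2")
loss = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True)
# The real loss lands on the logits; the None entries tell Keras to skip
# the n_layer cached key/value outputs instead of computing a loss on them.
model.compile(
    optimizer=tf.keras.optimizers.Adam(learning_rate=3e-5),
    loss=[loss, *[None] * model.config.n_layer],
)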
Lines changed: 82 additions & 0 deletions
@@ -0,0 +1,82 @@
import sys
import numpy as np
import jsonlines as jsonl
from transformers import GPT2TokenizerFast, TFGPT2LMHeadModel
import tensorflow as tf
from tensorflow.keras import metrics


def get_dataset(fil):
    data = []
    with jsonl.open(fil) as reader:
        for line in reader:
            data.append(line['text'])
    return data

if len(sys.argv) == 1:
    model_size = "Small"
    data_dir = '/dockerx/data/tf-gpt-2/data/'
    num_epochs = 1
    num_gpus = len(tf.config.list_physical_devices(device_type='GPU'))
    truncate = True
else:
    model_size = sys.argv[1]
    data_dir = sys.argv[2]
    num_epochs = int(sys.argv[3])
    num_gpus = int(sys.argv[4])
    if int(sys.argv[5]) == 1:
        truncate = True
    else:
        truncate = False

if model_size == "Small":
    model_name = "gpt2"
    train_file = data_dir + 'small-117M-k40.train.jsonl'
    valid_file = data_dir + 'small-117M-k40.valid.jsonl'
elif model_size == "Medium":
    model_name = "gpt2-medium"
    train_file = data_dir + 'medium-345M-k40.train.jsonl'
    valid_file = data_dir + 'medium-345M-k40.valid.jsonl'
elif model_size == "Large":
    model_name = "gpt2-large"
    train_file = data_dir + 'large-762M-k40.train.jsonl'
    valid_file = data_dir + 'large-762M-k40.valid.jsonl'
elif model_size == "XL":
    model_name = 'gpt2-xl'
    train_file = data_dir + 'xl-1542M-k40.train.jsonl'
    valid_file = data_dir + 'xl-1542M-k40.valid.jsonl'
print("Finetuning model " + model_name)
print("With dataset " + train_file)

def tokenize(data, tokenizer, truncate=False):
    if truncate:
        # Tokenize only the first 1000 samples to shorten the run.
        data = tokenizer(data[:1000], return_tensors='tf', padding=True, truncation=True)
    else:
        data = tokenizer(data, return_tensors='tf', padding=True, truncation=True)
    return tf.data.Dataset.from_tensor_slices((dict(data), data['input_ids']))

print("============================ Creating Distributed Strategy ===========================")
devices = []
for i in range(num_gpus):
    devices.append("GPU:" + str(i))
strategy = tf.distribute.MirroredStrategy(devices=devices)
print('Number of devices: {}'.format(strategy.num_replicas_in_sync))
print("============================ Loading model from pretrained and compiling ===========================")
with strategy.scope():
    tokenizer = GPT2TokenizerFast.from_pretrained(model_name)
    tokenizer.pad_token = tokenizer.eos_token
    print("========================= Loading dataset ========================")
    train_dataset = tokenize(get_dataset(train_file), tokenizer, truncate).batch(num_gpus)
    valid_dataset = tokenize(get_dataset(valid_file), tokenizer, truncate).batch(num_gpus)
    model = TFGPT2LMHeadModel.from_pretrained(model_name)
    # Disable past_key_values
    model.config.use_cache = False
    optimizer = tf.keras.optimizers.Adam(learning_rate=3e-5)
    loss = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True)
    metric = metrics.SparseCategoricalAccuracy(name='Accuracy')
    model.compile(optimizer=optimizer, loss=[loss, *[None] * model.config.n_layer], metrics=[metric])
    print("========================= Finetuning Model ==================================")
    model.fit(train_dataset, batch_size=64, epochs=num_epochs)
    print("========================= Evaluating Model ==================================")
    model.evaluate(valid_dataset)
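The distributed script takes five positional arguments: model size, data directory, number of epochs, number of GPUs, and a 0/1 truncation flag. Its file path is not visible in this extract, so the name below is a placeholder:

# gpt2_train_dist.py is a stand-in name; the actual path is not shown in this diff.
python3 gpt2_train_dist.py Small /dockerx/data/tf-gpt-2/data/ 1 4 1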
Lines changed: 7 additions & 0 deletions
@@ -0,0 +1,7 @@
@@ -0,0 +1,7 @@
#!/bin/bash
model_size=$1
echo $model_size
model_dir=$2
profile_dir=$3
rocprof --stats python3 gpt2_1step.py $model_size $model_dir
python3 gpt2_profile.py $profile_dir
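The wrapper forwards the model size and data directory to gpt2_1step.py under rocprof, then points gpt2_profile.py at the stats output. Its own file name is not shown in this extract; profile_gpt2.sh below is a placeholder:

# profile_gpt2.sh is a stand-in name; with rocprof's default output in the
# working directory, the profile directory argument is ./
bash profile_gpt2.sh Small /dockerx/data/ ./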
