Skip to content
This repository was archived by the owner on Jul 7, 2023. It is now read-only.

Commit 7909c69

Browse files
nshazeer authored and Ryan Sepassi committed
Fix bug with bpe32k: prepend <pad> and <EOS> to the vocab.
PiperOrigin-RevId: 177271941
1 parent 2ca3232 commit 7909c69

File tree

1 file changed

+4
-2
lines changed

1 file changed

+4
-2
lines changed

tensor2tensor/data_generators/translate_ende.py

+4-2
Original file line numberDiff line numberDiff line change
@@ -99,8 +99,10 @@ def generator(self, data_dir, tmp_dir, train):
9999
token_tmp_path = os.path.join(tmp_dir, self.vocab_file)
100100
token_path = os.path.join(data_dir, self.vocab_file)
101101
tf.gfile.Copy(token_tmp_path, token_path, overwrite=True)
102-
with tf.gfile.GFile(token_path, mode="a") as f:
103-
f.write("UNK\n") # Add UNK to the vocab.
102+
with tf.gfile.GFile(token_path, mode="r") as f:
103+
vocab_data = "<pad>\n<EOS>\n" + f.read() + "UNK\n"
104+
with tf.gfile.GFile(token_path, mode="w") as f:
105+
f.write(vocab_data)
104106
token_vocab = text_encoder.TokenTextEncoder(token_path, replace_oov="UNK")
105107
return translate.token_generator(train_path + ".en", train_path + ".de",
106108
token_vocab, EOS)

0 commit comments

Comments
 (0)