# This is far from being the real WMT17 task - only a toy set here.
# You need to register to get the UN data and CWMT data. Also, by convention,
# this is EN to ZH - use translate_enzh_wmt8k_rev for the ZH to EN task.
- _ENZH_TRAIN_DATASETS = [[("http://data.statmt.org/wmt17/translation-task/"
-                           "training-parallel-nc-v12.tgz"),
-                          ("training/news-commentary-v12.zh-en.en",
-                           "training/news-commentary-v12.zh-en.zh")]]
+ #
+ # News Commentary, around 220k lines.
+ # This dataset is only a small fraction of the full WMT17 task.
+ _NC_TRAIN_DATASETS = [[
+     "http://data.statmt.org/wmt17/translation-task/training-parallel-nc-v12.tgz",
+     ["training/news-commentary-v12.zh-en.en",
+      "training/news-commentary-v12.zh-en.zh"]]]

- _ENZH_TEST_DATASETS = [[
+ # Test set from News Commentary. 2000 lines.
+ _NC_TEST_DATASETS = [[
    "http://data.statmt.org/wmt17/translation-task/dev.tgz",
    ("dev/newsdev2017-enzh-src.en.sgm", "dev/newsdev2017-enzh-ref.zh.sgm")
]]

+ # UN parallel corpus. 15,886,041 lines.
+ # Visit the source website to download manually:
+ # https://conferences.unite.un.org/UNCorpus
+ #
+ # NOTE: You need to register to download the dataset from the official source.
+ # Place the archive into the tmp directory, e.g. /tmp/t2t_datagen/dataset.tgz
+ _UN_TRAIN_DATASETS = [[
+     "https://s3-us-west-2.amazonaws.com/twairball.wmt17.zh-en/UNv1.0.en-zh.tar.gz",
+     ["en-zh/UNv1.0.en-zh.en",
+      "en-zh/UNv1.0.en-zh.zh"]]]
+
+ # CWMT corpus.
+ # Visit the source website to download manually:
+ # http://nlp.nju.edu.cn/cwmt-wmt/
+ #
+ # casia2015: 1,050,000 lines
+ # casict2015: 2,036,833 lines
+ # datum2015: 1,000,003 lines
+ # datum2017: 1,999,968 lines
+ # NEU2017: 2,000,000 lines
+ #
+ # NOTE: You need to register to download the dataset from the official source.
+ # Place the archive into the tmp directory, e.g. /tmp/t2t_datagen/dataset.tgz
+
+ _CWMT_TRAIN_DATASETS = [
+     ["https://s3-us-west-2.amazonaws.com/twairball.wmt17.zh-en/cwmt.tgz",
+      ["cwmt/casia2015/casia2015_en.txt",
+       "cwmt/casia2015/casia2015_ch.txt"]],
+     ["https://s3-us-west-2.amazonaws.com/twairball.wmt17.zh-en/cwmt.tgz",
+      ["cwmt/casict2015/casict2015_en.txt",
+       "cwmt/casict2015/casict2015_ch.txt"]],
+     ["https://s3-us-west-2.amazonaws.com/twairball.wmt17.zh-en/cwmt.tgz",
+      ["cwmt/neu2017/NEU_en.txt",
+       "cwmt/neu2017/NEU_cn.txt"]],
+     ["https://s3-us-west-2.amazonaws.com/twairball.wmt17.zh-en/cwmt.tgz",
+      ["cwmt/datum2015/datum_en.txt",
+       "cwmt/datum2015/datum_ch.txt"]],
+     ["https://s3-us-west-2.amazonaws.com/twairball.wmt17.zh-en/cwmt.tgz",
+      ["cwmt/datum2017/Book1_en.txt",
+       "cwmt/datum2017/Book1_cn.txt"]],
+     ["https://s3-us-west-2.amazonaws.com/twairball.wmt17.zh-en/cwmt.tgz",
+      ["cwmt/datum2017/Book2_en.txt",
+       "cwmt/datum2017/Book2_cn.txt"]],
+     ["https://s3-us-west-2.amazonaws.com/twairball.wmt17.zh-en/cwmt.tgz",
+      ["cwmt/datum2017/Book3_en.txt",
+       "cwmt/datum2017/Book3_cn.txt"]],
+     ["https://s3-us-west-2.amazonaws.com/twairball.wmt17.zh-en/cwmt.tgz",
+      ["cwmt/datum2017/Book4_en.txt",
+       "cwmt/datum2017/Book4_cn.txt"]],
+     ["https://s3-us-west-2.amazonaws.com/twairball.wmt17.zh-en/cwmt.tgz",
+      ["cwmt/datum2017/Book5_en.txt",
+       "cwmt/datum2017/Book5_cn.txt"]],
+     ["https://s3-us-west-2.amazonaws.com/twairball.wmt17.zh-en/cwmt.tgz",
+      ["cwmt/datum2017/Book6_en.txt",
+       "cwmt/datum2017/Book6_cn.txt"]],
+     ["https://s3-us-west-2.amazonaws.com/twairball.wmt17.zh-en/cwmt.tgz",
+      ["cwmt/datum2017/Book7_en.txt",
+       "cwmt/datum2017/Book7_cn.txt"]],
+     ["https://s3-us-west-2.amazonaws.com/twairball.wmt17.zh-en/cwmt.tgz",
+      ["cwmt/datum2017/Book8_en.txt",
+       "cwmt/datum2017/Book8_cn.txt"]],
+     ["https://s3-us-west-2.amazonaws.com/twairball.wmt17.zh-en/cwmt.tgz",
+      ["cwmt/datum2017/Book9_en.txt",
+       "cwmt/datum2017/Book9_cn.txt"]],
+     ["https://s3-us-west-2.amazonaws.com/twairball.wmt17.zh-en/cwmt.tgz",
+      ["cwmt/datum2017/Book10_en.txt",
+       "cwmt/datum2017/Book10_cn.txt"]],
+     ["https://s3-us-west-2.amazonaws.com/twairball.wmt17.zh-en/cwmt.tgz",
+      ["cwmt/datum2017/Book11_en.txt",
+       "cwmt/datum2017/Book11_cn.txt"]],
+     ["https://s3-us-west-2.amazonaws.com/twairball.wmt17.zh-en/cwmt.tgz",
+      ["cwmt/datum2017/Book12_en.txt",
+       "cwmt/datum2017/Book12_cn.txt"]],
+     ["https://s3-us-west-2.amazonaws.com/twairball.wmt17.zh-en/cwmt.tgz",
+      ["cwmt/datum2017/Book13_en.txt",
+       "cwmt/datum2017/Book13_cn.txt"]],
+     ["https://s3-us-west-2.amazonaws.com/twairball.wmt17.zh-en/cwmt.tgz",
+      ["cwmt/datum2017/Book14_en.txt",
+       "cwmt/datum2017/Book14_cn.txt"]],
+     ["https://s3-us-west-2.amazonaws.com/twairball.wmt17.zh-en/cwmt.tgz",
+      ["cwmt/datum2017/Book15_en.txt",
+       "cwmt/datum2017/Book15_cn.txt"]],
+     ["https://s3-us-west-2.amazonaws.com/twairball.wmt17.zh-en/cwmt.tgz",
+      ["cwmt/datum2017/Book16_en.txt",
+       "cwmt/datum2017/Book16_cn.txt"]],
+     ["https://s3-us-west-2.amazonaws.com/twairball.wmt17.zh-en/cwmt.tgz",
+      ["cwmt/datum2017/Book17_en.txt",
+       "cwmt/datum2017/Book17_cn.txt"]],
+     ["https://s3-us-west-2.amazonaws.com/twairball.wmt17.zh-en/cwmt.tgz",
+      ["cwmt/datum2017/Book18_en.txt",
+       "cwmt/datum2017/Book18_cn.txt"]],
+     ["https://s3-us-west-2.amazonaws.com/twairball.wmt17.zh-en/cwmt.tgz",
+      ["cwmt/datum2017/Book19_en.txt",
+       "cwmt/datum2017/Book19_cn.txt"]],
+     ["https://s3-us-west-2.amazonaws.com/twairball.wmt17.zh-en/cwmt.tgz",
+      ["cwmt/datum2017/Book20_en.txt",
+       "cwmt/datum2017/Book20_cn.txt"]]
+ ]
+
+
+ def get_filename(dataset):
+   return dataset[0][0].split('/')[-1]


@registry.register_problem
- class TranslateEnzhWmt8k(translate.TranslateProblem):
-   """Problem spec for WMT En-Zh translation."""
+ class TranslateEnzhWmt32k(translate.TranslateProblem):
+   """Problem spec for WMT En-Zh translation.
+   Attempts to use the full training dataset, which requires website
+   registration and must be downloaded manually from the official sources:

-   @property
-   def targeted_vocab_size(self):
-     return 2**13  # 8192
+   CWMT:
+     - http://nlp.nju.edu.cn/cwmt-wmt/
+     - Website contains instructions for FTP server access.
+     - You'll need to download the CASIA, CASICT, DATUM2015, DATUM2017 and
+       NEU datasets.
+
+   UN Parallel Corpus:
+     - https://conferences.unite.un.org/UNCorpus
+     - You'll need to register to download the dataset.
+
+   NOTE: place the downloaded archives into the tmp directory,
+   e.g. /tmp/t2t_datagen/dataset.tgz
+   """

  @property
-   def num_shards(self):
-     return 10  # This is a small dataset.
+   def targeted_vocab_size(self):
+     return 2**15  # 32k

  @property
  def source_vocab_name(self):
@@ -72,20 +189,35 @@ def source_vocab_name(self):
  @property
  def target_vocab_name(self):
    return "vocab.enzh-zh.%d" % self.targeted_vocab_size
+
+   def get_training_dataset(self, tmp_dir):
+     """UN Parallel Corpus and CWMT Corpus need to be downloaded manually.
+     They are appended to the training dataset when present in tmp_dir.
+     """
+     full_dataset = _NC_TRAIN_DATASETS
+     for dataset in [_CWMT_TRAIN_DATASETS, _UN_TRAIN_DATASETS]:
+       filename = get_filename(dataset)
+       tmp_filepath = os.path.join(tmp_dir, filename)
+       if tf.gfile.Exists(tmp_filepath):
+         full_dataset = full_dataset + dataset
+       else:
+         tf.logging.info(
+             "[TranslateEnzhWmt] dataset incomplete, you need to manually download %s"
+             % filename)
+     return full_dataset

  def generator(self, data_dir, tmp_dir, train):
-     datasets = _ENZH_TRAIN_DATASETS if train else _ENZH_TEST_DATASETS
-     source_datasets = [[item[0], [item[1][0]]] for item in _ENZH_TRAIN_DATASETS]
-     target_datasets = [[item[0], [item[1][1]]] for item in _ENZH_TRAIN_DATASETS]
+     TRAIN_DATASET = self.get_training_dataset(tmp_dir)
+     datasets = TRAIN_DATASET if train else _NC_TEST_DATASETS
+     source_datasets = [[item[0], [item[1][0]]] for item in TRAIN_DATASET]
+     target_datasets = [[item[0], [item[1][1]]] for item in TRAIN_DATASET]
    source_vocab = generator_utils.get_or_generate_vocab(
        data_dir, tmp_dir, self.source_vocab_name, self.targeted_vocab_size,
-         source_datasets)
+         source_datasets, _file_byte_budget=1e8)
    target_vocab = generator_utils.get_or_generate_vocab(
        data_dir, tmp_dir, self.target_vocab_name, self.targeted_vocab_size,
-         target_datasets)
+         target_datasets, _file_byte_budget=1e8)
    tag = "train" if train else "dev"
-     data_path = translate.compile_data(tmp_dir, datasets,
-                                        "wmt_enzh_tok_%s" % tag)
+     filename_base = "wmt_enzh_%sk_tok_%s" % (self.targeted_vocab_size, tag)
+     data_path = translate.compile_data(tmp_dir, datasets, filename_base)
    return translate.bi_vocabs_token_generator(data_path + ".lang1",
                                               data_path + ".lang2",
                                               source_vocab, target_vocab, EOS)
@@ -107,3 +239,22 @@ def feature_encoders(self, data_dir):
        "inputs": source_token,
        "targets": target_token,
    }
+
+
+ @registry.register_problem
+ class TranslateEnzhWmt8k(TranslateEnzhWmt32k):
+   """Problem spec for WMT En-Zh translation.
+   This is far from being the real WMT17 task - only a toy set here.
+   """
+
+   @property
+   def targeted_vocab_size(self):
+     return 2**13  # 8192
+
+   @property
+   def num_shards(self):
+     return 10  # This is a small dataset.
+
+   def get_training_dataset(self, tmp_dir):
+     """Uses only the News Commentary dataset for training."""
+     return _NC_TRAIN_DATASETS
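
Below the diff, a minimal sketch (not part of this PR) of how the manual-download fallback in `get_training_dataset` might be checked locally. It assumes tensor2tensor is installed, that this file is importable as `tensor2tensor.data_generators.translate_enzh`, and that any manually downloaded archives (`cwmt.tgz`, `UNv1.0.en-zh.tar.gz`) sit in the tmp directory; with `t2t-datagen` the new problem would typically be selected via `--problem=translate_enzh_wmt32k`.

```python
# Sketch only: list which corpora the 32k problem will actually train on.
# Assumes this module is importable as tensor2tensor.data_generators.translate_enzh.
from tensor2tensor.data_generators import translate_enzh

tmp_dir = "/tmp/t2t_datagen"  # where cwmt.tgz / UNv1.0.en-zh.tar.gz would be placed

problem = translate_enzh.TranslateEnzhWmt32k()
datasets = problem.get_training_dataset(tmp_dir)

# With no manual downloads this is News Commentary only (1 entry); with both
# archives present it grows to 26 entries (1 NC + 24 CWMT + 1 UN).
print(len(datasets))
for url, files in datasets:
  print(url.split("/")[-1], files)
```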