This repository was archived by the owner on Jul 7, 2023. It is now read-only.

Commit c10e016

nshazeer authored and Ryan Sepassi committed
This change breaks previous checkpoints. Make Transformer fast on TPU.
PiperOrigin-RevId: 176747359
1 parent cc80721 commit c10e016

File tree: 7 files changed, +88 −30 lines changed


tensor2tensor/layers/common_attention.py

+13 −15

@@ -801,7 +801,7 @@ def combine_first_two_dimensions(x):
 
 @expert_utils.add_name_scope()
 def split_heads(x, num_heads):
-  """Split channels (dimension 3) into multiple heads (becomes dimension 1).
+  """Split channels (dimension 2) into multiple heads (becomes dimension 1).
 
   Args:
     x: a Tensor with shape [batch, length, channels]
@@ -815,7 +815,7 @@ def split_heads(x, num_heads):
 
 @expert_utils.add_name_scope()
 def split_heads_2d(x, num_heads):
-  """Split channels (dimension 4) into multiple heads (becomes dimension 1).
+  """Split channels (dimension 3) into multiple heads (becomes dimension 1).
 
   Args:
     x: a Tensor with shape [batch, height, width, channels]
@@ -2191,10 +2191,10 @@ def compute_qkv(query_antecedent,
   """
   if memory_antecedent is None and q_filter_width == kv_filter_width == 1:
     # self attention with single position q, k, and v
-    combined = common_layers.conv1d(
+    combined = tf.layers.dense(
        query_antecedent,
        total_key_depth * 2 + total_value_depth,
-        1,
+        use_bias=False,
        name="qkv_transform")
    q, k, v = tf.split(
        combined, [total_key_depth, total_key_depth, total_value_depth], axis=2)
@@ -2250,22 +2250,19 @@ def compute_qkv_2d(query_antecedent, memory_antecedent, total_key_depth,
   """
   # self attention with single position q, k, and v
   if memory_antecedent is None:
-    combined = tf.layers.conv2d(
-        query_antecedent,
-        total_key_depth * 2 + total_value_depth, (1, 1),
-        name="qkv_transform")
+    combined = tf.layers.dense(
+        query_antecedent, total_key_depth * 2 + total_value_depth,
+        use_bias=False, name="qkv_transform")
    q, k, v = tf.split(
        combined, [total_key_depth, total_key_depth, total_value_depth],
        axis=-1)
    return q, k, v
 
   # Encoder decoder attention
-  q = common_layers.conv1d(
-      query_antecedent, total_key_depth, 1, name="q_transform")
-  combined = common_layers.conv1d(
-      memory_antecedent,
-      total_key_depth + total_value_depth,
-      1,
+  q = tf.layers.dense(
+      query_antecedent, total_key_depth, use_bias=False, name="q_transform")
+  combined = tf.layers.dense(
+      memory_antecedent, total_key_depth + total_value_depth, use_bias=False,
      name="kv_transform")
   k, v = tf.split(combined, [total_key_depth, total_value_depth], axis=2)
 
@@ -2410,7 +2407,8 @@ def multihead_attention(query_antecedent,
       x = dilated_self_attention_1d(q, k, v, block_length, block_width,
                                     gap_size, num_memory_blocks)
    x = combine_heads(x)
-    x = common_layers.conv1d(x, output_depth, 1, name="output_transform")
+    x = tf.layers.dense(
+        x, output_depth, use_bias=False, name="output_transform")
    if additional_returned_value is not None:
      return x, additional_returned_value
    return x
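
The recurring edit in this file swaps kernel-size-1 convolutions for tf.layers.dense. As a minimal sketch, not part of the commit, the snippet below shows why the two compute the same per-position projection while creating differently shaped kernel variables ([1, in, out] vs. [in, out]), which is why this change breaks previous checkpoints. The layer names, shapes, and the use of tf.layers.conv1d here are illustrative assumptions.

# Illustrative sketch, not part of the commit.
import numpy as np
import tensorflow as tf  # TF 1.x API, as used by tensor2tensor at the time

x = tf.placeholder(tf.float32, [None, 10, 64])        # [batch, length, channels]
y_conv = tf.layers.conv1d(x, 128, 1, use_bias=False, name="conv_proj")
y_dense = tf.layers.dense(x, 128, use_bias=False, name="dense_proj")

conv_kernel = [v for v in tf.trainable_variables()
               if v.name == "conv_proj/kernel:0"][0]    # shape [1, 64, 128]
dense_kernel = [v for v in tf.trainable_variables()
                if v.name == "dense_proj/kernel:0"][0]  # shape [64, 128]
copy_weights = tf.assign(dense_kernel, tf.squeeze(conv_kernel, 0))

with tf.Session() as sess:
  sess.run(tf.global_variables_initializer())
  sess.run(copy_weights)
  feed = {x: np.random.randn(2, 10, 64).astype(np.float32)}
  out_conv, out_dense = sess.run([y_conv, y_dense], feed)
  print(np.allclose(out_conv, out_dense, atol=1e-5))    # True: same math, different variables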

tensor2tensor/layers/common_hparams.py

+3

@@ -179,6 +179,9 @@ def basic_params1():
       # This is the actual batch size, *not* tokens per batch (i.e. for
       # language models this is the number of sentences in the batch)
       tpu_batch_size_per_shard=24,
+      # Set by tpu_trainer to let the model know whether we are on TPU.
+      # Switching on/off tpu should not invalidate checkpoints.
+      use_tpu=False,
   )
 
 

tensor2tensor/layers/common_layers.py

+21 −4

@@ -1229,6 +1229,15 @@ def relu_density_logit(x, reduce_dims):
   return scaled
 
 
+def conv_hidden_relu_simple(inputs, hidden_size, output_size, dropout=0.0):
+  h = tf.layers.dense(
+      inputs, hidden_size, use_bias=False, activation=tf.nn.relu, name="conv1")
+  if dropout != 0.0:
+    h = tf.nn.dropout(h, 1.0 - dropout)
+  o = tf.layers.dense(h, output_size, use_bias=False, name="conv2")
+  return o
+
+
 def conv_hidden_relu(inputs,
                      hidden_size,
                      output_size,
@@ -1239,6 +1248,9 @@ def conv_hidden_relu(inputs,
   """Hidden layer with RELU activation followed by linear projection."""
   name = kwargs.pop("name") if "name" in kwargs else None
   with tf.variable_scope(name, "conv_hidden_relu", [inputs]):
+    if kernel_size == (1, 1) and second_kernel_size == (1, 1):
+      return conv_hidden_relu_simple(
+          inputs, hidden_size, output_size, dropout=dropout)
    if inputs.get_shape().ndims == 3:
      is_3d = True
      inputs = tf.expand_dims(inputs, 2)
@@ -1487,10 +1499,15 @@ def padded_cross_entropy(logits,
   confidence = 1.0 - label_smoothing
   vocab_size = shape_list(logits)[-1]
   with tf.name_scope("padded_cross_entropy", [logits, labels]):
-    pad_logits, pad_labels = pad_with_zeros(logits, labels)
-    xent = smoothing_cross_entropy(pad_logits, pad_labels, vocab_size,
-                                   confidence)
-    weights = weights_fn(pad_labels)
+    if len(logits.get_shape().as_list()) == 2:
+      # Deal with the case where we did not insert extra dimensions due to
+      # TPU issues. No pad-to-same-length happens in this case.
+      # TODO(noam): remove this logic once TPU can handle extra dimensions.
+      labels = tf.reshape(labels, [-1])
+    else:
+      logits, labels = pad_with_zeros(logits, labels)
+    xent = smoothing_cross_entropy(logits, labels, vocab_size, confidence)
+    weights = weights_fn(labels)
    if not reduce_sum:
      return xent * weights, weights
    return tf.reduce_sum(xent * weights), tf.reduce_sum(weights)
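
conv_hidden_relu_simple is the dense-only fast path that conv_hidden_relu now takes when both kernel sizes are (1, 1). Below is a minimal usage sketch of the same two-dense-layer feed-forward shape flow; all sizes and names are made-up example values, not taken from the commit.

# Illustrative sketch, not part of the commit.
import tensorflow as tf  # TF 1.x API

def ffn_sketch(x, filter_size=2048, hidden_size=512, dropout=0.1):
  # x: [batch, length, hidden_size]. tf.layers.dense projects the last axis,
  # so every position is transformed independently, exactly what the old
  # (1, 1)-kernel convolution did.
  h = tf.layers.dense(x, filter_size, use_bias=False, activation=tf.nn.relu,
                      name="ffn_in")
  h = tf.nn.dropout(h, 1.0 - dropout)  # TF 1.x keep_prob convention
  return tf.layers.dense(h, hidden_size, use_bias=False, name="ffn_out")

x = tf.random_normal([8, 16, 512])   # [batch, length, hidden_size]
y = ffn_sketch(x)                    # shape [8, 16, 512]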

tensor2tensor/layers/modalities.py

+20 −6

@@ -30,6 +30,15 @@
 import tensorflow as tf
 
 
+# TODO(noam): remove this function after TPUs do gather faster.
+def tpu_gather(params, indices):
+  vocab_size = params.get_shape().as_list()[0]
+  indices_flat = tf.reshape(indices, [-1])
+  out = tf.matmul(tf.one_hot(indices_flat, vocab_size), params)
+  out = eu.reshape_like(out, tf.expand_dims(indices, -1))
+  return out
+
+
 @registry.register_symbol_modality("default")
 class SymbolModality(modality.Modality):
   """Modality for sets of discrete symbols.
@@ -94,7 +103,8 @@ def bottom_simple(self, x, name, reuse):
       # Squeeze out the channels dimension.
       x = tf.squeeze(x, axis=3)
       var = self._get_weights()
-      ret = tf.gather(var, x)
+      ret = (tpu_gather(var, x) if self._model_hparams.use_tpu
+             else tf.gather(var, x))
      if self._model_hparams.multiply_embedding_mode == "sqrt_depth":
        ret *= self._body_input_depth**0.5
      ret *= tf.expand_dims(tf.to_float(tf.not_equal(x, 0)), -1)
@@ -142,14 +152,18 @@ def top(self, body_output, _):
         self._model_hparams.mode == tf.estimator.ModeKeys.TRAIN):
       # insert channels dimension
       body_output = tf.expand_dims(body_output, 3)
-      logits = common_layers.FactoredTensor(body_output, var)
+      return common_layers.FactoredTensor(body_output, var)
    else:
      body_output = tf.reshape(body_output, [-1, body_output_shape[-1]])
      logits = tf.matmul(body_output, var, transpose_b=True)
-
-    out_shape = body_output_shape[:-1] + [1, self._vocab_size]
-    logits = tf.reshape(logits, out_shape)
-    return logits
+      if (self._model_hparams.use_tpu and
+          self._model_hparams.mode == tf.estimator.ModeKeys.TRAIN):
+        # TPU does not react kindly to extra dimensions.
+        # TODO(noam): remove this once TPU is more forgiving of extra dims.
+        return logits
+      else:
+        return tf.reshape(
+            logits, body_output_shape[:-1] + [1, self._vocab_size])
 
 
 @registry.register_symbol_modality("ctc")
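
tpu_gather re-expresses an embedding lookup as a one-hot matrix multiply, which at the time was faster on TPU than tf.gather. The minimal sketch below shows that equivalence with made-up vocabulary and depth sizes; the real function additionally restores the indices' shape via eu.reshape_like.

# Illustrative sketch, not part of the commit.
import numpy as np
import tensorflow as tf  # TF 1.x API

vocab_size, depth = 16, 4
params = tf.constant(np.random.randn(vocab_size, depth).astype(np.float32))
indices = tf.constant([[1, 3], [0, 15]])                      # [batch, length]

gathered = tf.gather(params, indices)                         # [2, 2, depth]
one_hot = tf.one_hot(tf.reshape(indices, [-1]), vocab_size)   # [4, vocab_size]
matmul = tf.reshape(tf.matmul(one_hot, params), [2, 2, depth])

with tf.Session() as sess:
  a, b = sess.run([gathered, matmul])
  print(np.allclose(a, b))                                     # True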

tensor2tensor/layers/modalities_test.py

+6 −3

@@ -42,7 +42,8 @@ def testSymbolModalityInputs(self):
         multiply_embedding_mode="sqrt_depth",
         symbol_modality_skip_top=0,
         shared_embedding_and_softmax_weights=0,
-        prepend_mode="none")
+        prepend_mode="none",
+        use_tpu=False)
    x = -1 + np.random.random_integers(
        vocab_size, size=(batch_size, length, 1, 1))
    m = modalities.SymbolModality(model_hparams, vocab_size)
@@ -71,7 +72,8 @@ def testSymbolModalityTargets(self):
         shared_embedding_and_softmax_weights=0,
         factored_logits=0,
         mode=tf.estimator.ModeKeys.TRAIN,
-        prepend_mode="none")
+        prepend_mode="none",
+        use_tpu=False)
    body_output = -1 + np.random.random_integers(
        100, size=(batch_size, length, height, hidden_size))
    targets = -1 + np.random.random_integers(
@@ -107,7 +109,8 @@ def testSymbolModalityTargetsFactored(self):
         shared_embedding_and_softmax_weights=0,
         factored_logits=1,
         mode=tf.estimator.ModeKeys.TRAIN,
-        prepend_mode="none")
+        prepend_mode="none",
+        use_tpu=False)
    body_output = -1 + np.random.random_integers(
        100, size=(batch_size, length, height, hidden_size))
    targets = -1 + np.random.random_integers(

tensor2tensor/models/transformer.py

+24 −2

@@ -108,8 +108,13 @@ def decode(self,
         hparams,
         cache=cache)
 
-    # Expand since t2t expects 4d tensors.
-    return tf.expand_dims(decoder_output, axis=2)
+    if hparams.use_tpu and hparams.mode == tf.estimator.ModeKeys.TRAIN:
+      # TPU does not react kindly to extra dimensions.
+      # TODO(noam): remove this once TPU is more forgiving of extra dims.
+      return decoder_output
+    else:
+      # Expand since t2t expects 4d tensors.
+      return tf.expand_dims(decoder_output, axis=2)
 
   def model_fn_body(self, features):
     """Transformer main model_fn.
@@ -1113,3 +1118,20 @@ def transformer_clean_big():
   hparams.hidden_size = 1024
   hparams.filter_size = 4096
   return hparams
+
+
+@registry.register_hparams
+def transformer_tpu_lm1b():
+  """Hparams for training languagemodel_lm1b8k_concat on tpu."""
+  hparams = transformer_clean()
+  update_hparams_for_tpu(hparams)
+  hparams.max_length = 512
+  hparams.tpu_batch_size_per_shard = 8
+  hparams.hidden_size = 1024
+  hparams.filter_size = 4096
+  hparams.num_heads = 4
+  hparams.label_smoothing = 0.0
+  hparams.layer_prepostprocess_dropout = 0.0
+  hparams.attention_dropout = 0.0
+  hparams.relu_dropout = 0.0
+  return hparams
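
The new transformer_tpu_lm1b hparams set layers TPU-friendly settings on top of transformer_clean. A minimal sketch of inspecting it directly in Python follows; in practice the trainer looks the set up by its registered name, but calling the function is enough to see the values. This usage is an assumption for illustration, not code from the commit.

# Illustrative sketch, not part of the commit: build the registered hparams set
# directly and read a few of the values this commit sets.
from tensor2tensor.models import transformer

hparams = transformer.transformer_tpu_lm1b()
print(hparams.hidden_size)        # 1024
print(hparams.num_heads)          # 4
print(hparams.label_smoothing)    # 0.0
print(hparams.attention_dropout)  # 0.0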

tensor2tensor/tpu/tpu_trainer_lib.py

+1

@@ -212,6 +212,7 @@ def t2t_model_fn(model_name,
   hparams = copy.deepcopy(hparams)
   problem = hparams.problem_instances[0]
   problem_hp = hparams.problems[0]
+  hparams.use_tpu = use_tpu
 
   features["problem_choice"] = tf.constant(0)
   features["input_space_id"] = tf.constant(problem_hp.input_space_id)
