
Commit c868bb6

This is the attempt that will pierce the heavens!
1 parent d54b581 commit c868bb6

64 files changed (+10176, -152 lines)


src/transformers/models/albert/modeling_tf_albert.py

+182 -4
@@ -146,7 +146,7 @@ def __init__(self, config: AlbertConfig, **kwargs):
         self.LayerNorm = tf.keras.layers.LayerNormalization(epsilon=config.layer_norm_eps, name="LayerNorm")
         self.dropout = tf.keras.layers.Dropout(rate=config.hidden_dropout_prob)

-    def build(self, input_shape: tf.TensorShape):
+    def build(self, input_shape=None):
         with tf.name_scope("word_embeddings"):
             self.weight = self.add_weight(
                 name="weight",
@@ -168,7 +168,12 @@ def build(self, input_shape: tf.TensorShape):
                 initializer=get_initializer(self.initializer_range),
             )

-        super().build(input_shape)
+        if self.built:
+            return
+        self.built = True
+        if getattr(self, "LayerNorm", None) is not None:
+            with tf.name_scope(self.LayerNorm.name):
+                self.LayerNorm.build([None, None, self.config.embedding_size])

     # Copied from transformers.models.bert.modeling_tf_bert.TFBertEmbeddings.call
     def call(
@@ -246,6 +251,7 @@ def __init__(self, config: AlbertConfig, **kwargs):
         # Two different dropout probabilities; see https://github.com/google-research/albert/blob/master/modeling.py#L971-L993
         self.attention_dropout = tf.keras.layers.Dropout(rate=config.attention_probs_dropout_prob)
         self.output_dropout = tf.keras.layers.Dropout(rate=config.hidden_dropout_prob)
+        self.config = config

     def transpose_for_scores(self, tensor: tf.Tensor, batch_size: int) -> tf.Tensor:
         # Reshape from [batch_size, seq_length, all_head_size] to [batch_size, seq_length, num_attention_heads, attention_head_size]
@@ -307,6 +313,26 @@ def call(

         return outputs

+    def build(self, input_shape=None):
+        if self.built:
+            return
+        self.built = True
+        if getattr(self, "query", None) is not None:
+            with tf.name_scope(self.query.name):
+                self.query.build(self.config.hidden_size)
+        if getattr(self, "key", None) is not None:
+            with tf.name_scope(self.key.name):
+                self.key.build(self.config.hidden_size)
+        if getattr(self, "value", None) is not None:
+            with tf.name_scope(self.value.name):
+                self.value.build(self.config.hidden_size)
+        if getattr(self, "dense", None) is not None:
+            with tf.name_scope(self.dense.name):
+                self.dense.build(self.config.hidden_size)
+        if getattr(self, "LayerNorm", None) is not None:
+            with tf.name_scope(self.LayerNorm.name):
+                self.LayerNorm.build([None, None, self.config.hidden_size])
+

 class TFAlbertLayer(tf.keras.layers.Layer):
     def __init__(self, config: AlbertConfig, **kwargs):
@@ -329,6 +355,7 @@ def __init__(self, config: AlbertConfig, **kwargs):
             epsilon=config.layer_norm_eps, name="full_layer_layer_norm"
         )
         self.dropout = tf.keras.layers.Dropout(rate=config.hidden_dropout_prob)
+        self.config = config

     def call(
         self,
@@ -356,6 +383,23 @@ def call(

         return outputs

+    def build(self, input_shape=None):
+        if self.built:
+            return
+        self.built = True
+        if getattr(self, "attention", None) is not None:
+            with tf.name_scope(self.attention.name):
+                self.attention.build(None)
+        if getattr(self, "ffn", None) is not None:
+            with tf.name_scope(self.ffn.name):
+                self.ffn.build(self.config.hidden_size)
+        if getattr(self, "ffn_output", None) is not None:
+            with tf.name_scope(self.ffn_output.name):
+                self.ffn_output.build(self.config.intermediate_size)
+        if getattr(self, "full_layer_layer_norm", None) is not None:
+            with tf.name_scope(self.full_layer_layer_norm.name):
+                self.full_layer_layer_norm.build([None, None, self.config.hidden_size])
+

 class TFAlbertLayerGroup(tf.keras.layers.Layer):
     def __init__(self, config: AlbertConfig, **kwargs):
@@ -399,6 +443,15 @@ def call(

         return tuple(v for v in [hidden_states, layer_hidden_states, layer_attentions] if v is not None)

+    def build(self, input_shape=None):
+        if self.built:
+            return
+        self.built = True
+        if getattr(self, "albert_layers", None) is not None:
+            for layer in self.albert_layers:
+                with tf.name_scope(layer.name):
+                    layer.build(None)
+

 class TFAlbertTransformer(tf.keras.layers.Layer):
     def __init__(self, config: AlbertConfig, **kwargs):
@@ -416,6 +469,7 @@ def __init__(self, config: AlbertConfig, **kwargs):
         self.albert_layer_groups = [
             TFAlbertLayerGroup(config, name=f"albert_layer_groups_._{i}") for i in range(config.num_hidden_groups)
         ]
+        self.config = config

     def call(
         self,
@@ -457,6 +511,18 @@ def call(
             last_hidden_state=hidden_states, hidden_states=all_hidden_states, attentions=all_attentions
         )

+    def build(self, input_shape=None):
+        if self.built:
+            return
+        self.built = True
+        if getattr(self, "embedding_hidden_mapping_in", None) is not None:
+            with tf.name_scope(self.embedding_hidden_mapping_in.name):
+                self.embedding_hidden_mapping_in.build(self.config.embedding_size)
+        if getattr(self, "albert_layer_groups", None) is not None:
+            for layer in self.albert_layer_groups:
+                with tf.name_scope(layer.name):
+                    layer.build(None)
+

 class TFAlbertPreTrainedModel(TFPreTrainedModel):
     """
@@ -488,13 +554,21 @@ def __init__(self, config: AlbertConfig, input_embeddings: tf.keras.layers.Layer
         # an output-only bias for each token.
         self.decoder = input_embeddings

-    def build(self, input_shape: tf.TensorShape):
+    def build(self, input_shape=None):
         self.bias = self.add_weight(shape=(self.config.vocab_size,), initializer="zeros", trainable=True, name="bias")
         self.decoder_bias = self.add_weight(
             shape=(self.config.vocab_size,), initializer="zeros", trainable=True, name="decoder/bias"
         )

-        super().build(input_shape)
+        if self.built:
+            return
+        self.built = True
+        if getattr(self, "dense", None) is not None:
+            with tf.name_scope(self.dense.name):
+                self.dense.build(self.config.hidden_size)
+        if getattr(self, "LayerNorm", None) is not None:
+            with tf.name_scope(self.LayerNorm.name):
+                self.LayerNorm.build([None, None, self.config.embedding_size])

     def get_output_embeddings(self) -> tf.keras.layers.Layer:
         return self.decoder
@@ -650,6 +724,20 @@ def call(
             attentions=encoder_outputs.attentions,
         )

+    def build(self, input_shape=None):
+        if self.built:
+            return
+        self.built = True
+        if getattr(self, "embeddings", None) is not None:
+            with tf.name_scope(self.embeddings.name):
+                self.embeddings.build(None)
+        if getattr(self, "encoder", None) is not None:
+            with tf.name_scope(self.encoder.name):
+                self.encoder.build(None)
+        if getattr(self, "pooler", None) is not None:
+            with tf.name_scope(self.pooler.name):
+                self.pooler.build(None)  # TODO Matt might be wrong
+

 @dataclass
 class TFAlbertForPreTrainingOutput(ModelOutput):
@@ -825,6 +913,14 @@ def call(

         return outputs

+    def build(self, input_shape=None):
+        if self.built:
+            return
+        self.built = True
+        if getattr(self, "albert", None) is not None:
+            with tf.name_scope(self.albert.name):
+                self.albert.build(None)
+

 @add_start_docstrings(
     """
@@ -921,6 +1017,20 @@ def call(
             attentions=outputs.attentions,
         )

+    def build(self, input_shape=None):
+        if self.built:
+            return
+        self.built = True
+        if getattr(self, "albert", None) is not None:
+            with tf.name_scope(self.albert.name):
+                self.albert.build(None)
+        if getattr(self, "predictions", None) is not None:
+            with tf.name_scope(self.predictions.name):
+                self.predictions.build(None)
+        if getattr(self, "sop_classifier", None) is not None:
+            with tf.name_scope(self.sop_classifier.name):
+                self.sop_classifier.build(None)
+

 class TFAlbertSOPHead(tf.keras.layers.Layer):
     def __init__(self, config: AlbertConfig, **kwargs):
@@ -932,13 +1042,22 @@ def __init__(self, config: AlbertConfig, **kwargs):
             kernel_initializer=get_initializer(config.initializer_range),
             name="classifier",
         )
+        self.config = config

     def call(self, pooled_output: tf.Tensor, training: bool) -> tf.Tensor:
         dropout_pooled_output = self.dropout(inputs=pooled_output, training=training)
         logits = self.classifier(inputs=dropout_pooled_output)

         return logits

+    def build(self, input_shape=None):
+        if self.built:
+            return
+        self.built = True
+        if getattr(self, "classifier", None) is not None:
+            with tf.name_scope(self.classifier.name):
+                self.classifier.build(self.config.hidden_size)
+

 @add_start_docstrings("""Albert Model with a `language modeling` head on top.""", ALBERT_START_DOCSTRING)
 class TFAlbertForMaskedLM(TFAlbertPreTrainedModel, TFMaskedLanguageModelingLoss):
@@ -1035,6 +1154,17 @@ def call(
             attentions=outputs.attentions,
         )

+    def build(self, input_shape=None):
+        if self.built:
+            return
+        self.built = True
+        if getattr(self, "albert", None) is not None:
+            with tf.name_scope(self.albert.name):
+                self.albert.build(None)
+        if getattr(self, "predictions", None) is not None:
+            with tf.name_scope(self.predictions.name):
+                self.predictions.build(None)
+

 @add_start_docstrings(
     """
@@ -1058,6 +1188,7 @@ def __init__(self, config: AlbertConfig, *inputs, **kwargs):
         self.classifier = tf.keras.layers.Dense(
             units=config.num_labels, kernel_initializer=get_initializer(config.initializer_range), name="classifier"
         )
+        self.config = config

     @unpack_inputs
     @add_start_docstrings_to_model_forward(ALBERT_INPUTS_DOCSTRING.format("batch_size, sequence_length"))
@@ -1117,6 +1248,17 @@ def call(
             attentions=outputs.attentions,
         )

+    def build(self, input_shape=None):
+        if self.built:
+            return
+        self.built = True
+        if getattr(self, "albert", None) is not None:
+            with tf.name_scope(self.albert.name):
+                self.albert.build(None)
+        if getattr(self, "classifier", None) is not None:
+            with tf.name_scope(self.classifier.name):
+                self.classifier.build(self.config.hidden_size)
+

 @add_start_docstrings(
     """
@@ -1145,6 +1287,7 @@ def __init__(self, config: AlbertConfig, *inputs, **kwargs):
         self.classifier = tf.keras.layers.Dense(
             units=config.num_labels, kernel_initializer=get_initializer(config.initializer_range), name="classifier"
         )
+        self.config = config

     @unpack_inputs
     @add_start_docstrings_to_model_forward(ALBERT_INPUTS_DOCSTRING.format("batch_size, sequence_length"))
@@ -1200,6 +1343,17 @@ def call(
             attentions=outputs.attentions,
         )

+    def build(self, input_shape=None):
+        if self.built:
+            return
+        self.built = True
+        if getattr(self, "albert", None) is not None:
+            with tf.name_scope(self.albert.name):
+                self.albert.build(None)
+        if getattr(self, "classifier", None) is not None:
+            with tf.name_scope(self.classifier.name):
+                self.classifier.build(self.config.hidden_size)
+

 @add_start_docstrings(
     """
@@ -1221,6 +1375,7 @@ def __init__(self, config: AlbertConfig, *inputs, **kwargs):
         self.qa_outputs = tf.keras.layers.Dense(
             units=config.num_labels, kernel_initializer=get_initializer(config.initializer_range), name="qa_outputs"
         )
+        self.config = config

     @unpack_inputs
     @add_start_docstrings_to_model_forward(ALBERT_INPUTS_DOCSTRING.format("batch_size, sequence_length"))
@@ -1295,6 +1450,17 @@ def call(
             attentions=outputs.attentions,
         )

+    def build(self, input_shape=None):
+        if self.built:
+            return
+        self.built = True
+        if getattr(self, "albert", None) is not None:
+            with tf.name_scope(self.albert.name):
+                self.albert.build(None)
+        if getattr(self, "qa_outputs", None) is not None:
+            with tf.name_scope(self.qa_outputs.name):
+                self.qa_outputs.build(self.config.hidden_size)
+

 @add_start_docstrings(
     """
@@ -1316,6 +1482,7 @@ def __init__(self, config: AlbertConfig, *inputs, **kwargs):
         self.classifier = tf.keras.layers.Dense(
             units=1, kernel_initializer=get_initializer(config.initializer_range), name="classifier"
         )
+        self.config = config

     @unpack_inputs
     @add_start_docstrings_to_model_forward(ALBERT_INPUTS_DOCSTRING.format("batch_size, num_choices, sequence_length"))
@@ -1394,3 +1561,14 @@ def call(
             hidden_states=outputs.hidden_states,
             attentions=outputs.attentions,
         )
+
+    def build(self, input_shape=None):
+        if self.built:
+            return
+        self.built = True
+        if getattr(self, "albert", None) is not None:
+            with tf.name_scope(self.albert.name):
+                self.albert.build(None)
+        if getattr(self, "classifier", None) is not None:
+            with tf.name_scope(self.classifier.name):
+                self.classifier.build(self.config.hidden_size)
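
The pattern repeated throughout this diff can be reproduced in isolation. The following is a minimal sketch, assuming TensorFlow 2.x with tf.keras; the ExplicitBuildBlock layer and its hidden_size argument are illustrative stand-ins, not part of the ALBERT code. Each build() is made idempotent with a built guard, and every sublayer is built explicitly from statically known dimensions inside a tf.name_scope matching the sublayer's name, so the resulting variable names match what a shape-inferring first call would have produced.

import tensorflow as tf


class ExplicitBuildBlock(tf.keras.layers.Layer):
    # Illustrative layer (not from the ALBERT code): a Dense projection
    # followed by LayerNormalization, with an explicit build() written in
    # the same style as the methods added in this commit.
    def __init__(self, hidden_size, **kwargs):
        super().__init__(**kwargs)
        self.hidden_size = hidden_size
        self.dense = tf.keras.layers.Dense(units=hidden_size, name="dense")
        self.layer_norm = tf.keras.layers.LayerNormalization(name="layer_norm")

    def call(self, hidden_states):
        return self.layer_norm(self.dense(hidden_states))

    def build(self, input_shape=None):
        # Idempotence guard, as in the build() methods above.
        if self.built:
            return
        self.built = True
        # Build each sublayer under a name scope matching its layer name so
        # the variable names come out as they would from a real forward pass.
        if getattr(self, "dense", None) is not None:
            with tf.name_scope(self.dense.name):
                self.dense.build([None, None, self.hidden_size])
        if getattr(self, "layer_norm", None) is not None:
            with tf.name_scope(self.layer_norm.name):
                self.layer_norm.build([None, None, self.hidden_size])


# Every weight exists after build(None); no dummy forward pass is needed.
block = ExplicitBuildBlock(hidden_size=64)
block.build(None)
print([w.name for w in block.weights])

Under these assumptions, knowing the relevant dimensions from the config is what allows each sublayer to be built directly, while passing None, as the diff does for sublayers that determine their own shapes, simply defers to that sublayer's own build().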
