
Commit b9df7a0

Attempt poedator#3
1 parent f59e72c commit b9df7a0

66 files changed: +10434 −178 lines


src/transformers/models/albert/modeling_tf_albert.py (+191 −4)
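Every hunk in this file follows the same recipe: `build()` now accepts `input_shape=None`, guards against double-building with `self.built`, and explicitly builds each sub-layer under `tf.name_scope(sublayer.name)` using shapes derived from the config, so all weights can be created without tracing dummy inputs through the model. A minimal sketch of the pattern, not taken from the commit (the `ProbeBlock` name and its `hidden_size` argument are illustrative):

    import tensorflow as tf

    class ProbeBlock(tf.keras.layers.Layer):
        """Illustrative layer mirroring the commit's explicit-build pattern."""

        def __init__(self, hidden_size: int, **kwargs):
            super().__init__(**kwargs)
            self.hidden_size = hidden_size
            self.dense = tf.keras.layers.Dense(units=hidden_size, name="dense")

        def build(self, input_shape=None):
            if self.built:
                return  # idempotent: a second call must not re-create weights
            self.built = True
            if getattr(self, "dense", None) is not None:
                # Building under the sub-layer's name scope keeps weight names
                # consistent with what a real forward pass would have produced.
                with tf.name_scope(self.dense.name):
                    self.dense.build((None, self.hidden_size))

    block = ProbeBlock(hidden_size=64)
    block.build(None)  # weights materialize without a dummy forward pass
    print([w.shape for w in block.weights])  # [(64, 64), (64,)]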
@@ -146,7 +146,7 @@ def __init__(self, config: AlbertConfig, **kwargs):
         self.LayerNorm = tf.keras.layers.LayerNormalization(epsilon=config.layer_norm_eps, name="LayerNorm")
         self.dropout = tf.keras.layers.Dropout(rate=config.hidden_dropout_prob)

-    def build(self, input_shape: tf.TensorShape):
+    def build(self, input_shape=None):
         with tf.name_scope("word_embeddings"):
             self.weight = self.add_weight(
                 name="weight",
@@ -168,7 +168,12 @@ def build(self, input_shape: tf.TensorShape):
                 initializer=get_initializer(self.initializer_range),
             )

-        super().build(input_shape)
+        if self.built:
+            return
+        self.built = True
+        if getattr(self, "LayerNorm", None) is not None:
+            with tf.name_scope(self.LayerNorm.name):
+                self.LayerNorm.build([None, None, self.config.embedding_size])

     # Copied from transformers.models.bert.modeling_tf_bert.TFBertEmbeddings.call
     def call(
@@ -246,6 +251,8 @@ def __init__(self, config: AlbertConfig, **kwargs):
         # Two different dropout probabilities; see https://github.com/google-research/albert/blob/master/modeling.py#L971-L993
         self.attention_dropout = tf.keras.layers.Dropout(rate=config.attention_probs_dropout_prob)
         self.output_dropout = tf.keras.layers.Dropout(rate=config.hidden_dropout_prob)
+        self.hidden_size = config.hidden_size
+        self.config = config

     def transpose_for_scores(self, tensor: tf.Tensor, batch_size: int) -> tf.Tensor:
         # Reshape from [batch_size, seq_length, all_head_size] to [batch_size, seq_length, num_attention_heads, attention_head_size]
@@ -307,6 +314,26 @@ def call(

         return outputs

+    def build(self, input_shape=None):
+        if self.built:
+            return
+        self.built = True
+        if getattr(self, "query", None) is not None:
+            with tf.name_scope(self.query.name):
+                self.query.build(self.config.hidden_size)
+        if getattr(self, "key", None) is not None:
+            with tf.name_scope(self.key.name):
+                self.key.build(self.config.hidden_size)
+        if getattr(self, "value", None) is not None:
+            with tf.name_scope(self.value.name):
+                self.value.build(self.config.hidden_size)
+        if getattr(self, "dense", None) is not None:
+            with tf.name_scope(self.dense.name):
+                self.dense.build(self.config.hidden_size)
+        if getattr(self, "LayerNorm", None) is not None:
+            with tf.name_scope(self.LayerNorm.name):
+                self.LayerNorm.build([None, None, self.config.hidden_size])
+

 class TFAlbertLayer(tf.keras.layers.Layer):
     def __init__(self, config: AlbertConfig, **kwargs):
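Note that several of these calls pass a bare int (`self.config.hidden_size`) rather than a full shape. With the tf.keras (Keras 2) versions this commit targets, `tf.TensorShape` treats an int as a rank-1 shape and `Dense.build` only reads the last dimension, so the kernel comes out the same either way. A quick probe, not from the commit (the layer name is illustrative):

    import tensorflow as tf

    dense = tf.keras.layers.Dense(units=128, name="probe")
    dense.build(64)  # treated like the shape [64]; only the last dim sizes the kernel
    print(dense.kernel.shape)  # (64, 128)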
@@ -329,6 +356,9 @@ def __init__(self, config: AlbertConfig, **kwargs):
             epsilon=config.layer_norm_eps, name="full_layer_layer_norm"
         )
         self.dropout = tf.keras.layers.Dropout(rate=config.hidden_dropout_prob)
+        self.intermediate_size = config.intermediate_size
+        self.hidden_size = config.hidden_size
+        self.config = config

     def call(
         self,
@@ -356,6 +386,23 @@ def call(

         return outputs

+    def build(self, input_shape=None):
+        if self.built:
+            return
+        self.built = True
+        if getattr(self, "attention", None) is not None:
+            with tf.name_scope(self.attention.name):
+                self.attention.build(None)
+        if getattr(self, "ffn", None) is not None:
+            with tf.name_scope(self.ffn.name):
+                self.ffn.build(self.config.hidden_size)
+        if getattr(self, "ffn_output", None) is not None:
+            with tf.name_scope(self.ffn_output.name):
+                self.ffn_output.build(self.config.intermediate_size)
+        if getattr(self, "full_layer_layer_norm", None) is not None:
+            with tf.name_scope(self.full_layer_layer_norm.name):
+                self.full_layer_layer_norm.build([None, None, self.config.hidden_size])
+

 class TFAlbertLayerGroup(tf.keras.layers.Layer):
     def __init__(self, config: AlbertConfig, **kwargs):
@@ -399,6 +446,15 @@ def call(

         return tuple(v for v in [hidden_states, layer_hidden_states, layer_attentions] if v is not None)

+    def build(self, input_shape=None):
+        if self.built:
+            return
+        self.built = True
+        if getattr(self, "albert_layers", None) is not None:
+            for layer in self.albert_layers:
+                with tf.name_scope(layer.name):
+                    layer.build(None)
+

 class TFAlbertTransformer(tf.keras.layers.Layer):
     def __init__(self, config: AlbertConfig, **kwargs):
@@ -416,6 +472,7 @@ def __init__(self, config: AlbertConfig, **kwargs):
         self.albert_layer_groups = [
             TFAlbertLayerGroup(config, name=f"albert_layer_groups_._{i}") for i in range(config.num_hidden_groups)
         ]
+        self.config = config

     def call(
         self,
@@ -457,6 +514,18 @@ def call(
             last_hidden_state=hidden_states, hidden_states=all_hidden_states, attentions=all_attentions
         )

+    def build(self, input_shape=None):
+        if self.built:
+            return
+        self.built = True
+        if getattr(self, "embedding_hidden_mapping_in", None) is not None:
+            with tf.name_scope(self.embedding_hidden_mapping_in.name):
+                self.embedding_hidden_mapping_in.build(self.config.embedding_size)
+        if getattr(self, "albert_layer_groups", None) is not None:
+            for layer in self.albert_layer_groups:
+                with tf.name_scope(layer.name):
+                    layer.build(None)
+

 class TFAlbertPreTrainedModel(TFPreTrainedModel):
     """
@@ -487,14 +556,23 @@ def __init__(self, config: AlbertConfig, input_embeddings: tf.keras.layers.Layer
         # The output weights are the same as the input embeddings, but there is
         # an output-only bias for each token.
         self.decoder = input_embeddings
+        self.hidden_size = config.hidden_size

-    def build(self, input_shape: tf.TensorShape):
+    def build(self, input_shape=None):
         self.bias = self.add_weight(shape=(self.config.vocab_size,), initializer="zeros", trainable=True, name="bias")
         self.decoder_bias = self.add_weight(
             shape=(self.config.vocab_size,), initializer="zeros", trainable=True, name="decoder/bias"
         )

-        super().build(input_shape)
+        if self.built:
+            return
+        self.built = True
+        if getattr(self, "dense", None) is not None:
+            with tf.name_scope(self.dense.name):
+                self.dense.build(self.config.hidden_size)
+        if getattr(self, "LayerNorm", None) is not None:
+            with tf.name_scope(self.LayerNorm.name):
+                self.LayerNorm.build([None, None, self.config.embedding_size])

     def get_output_embeddings(self) -> tf.keras.layers.Layer:
         return self.decoder
@@ -650,6 +728,20 @@ def call(
             attentions=encoder_outputs.attentions,
         )

+    def build(self, input_shape=None):
+        if self.built:
+            return
+        self.built = True
+        if getattr(self, "embeddings", None) is not None:
+            with tf.name_scope(self.embeddings.name):
+                self.embeddings.build(None)
+        if getattr(self, "encoder", None) is not None:
+            with tf.name_scope(self.encoder.name):
+                self.encoder.build(None)
+        if getattr(self, "pooler", None) is not None:
+            with tf.name_scope(self.pooler.name):
+                self.pooler.build(None)  # TODO Matt might be wrong
+

 @dataclass
 class TFAlbertForPreTrainingOutput(ModelOutput):
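The `getattr(self, ..., None)` checks let one build body cover layers whose sub-layers are optional (e.g. the pooler), and composite layers simply forward `build(None)` down the tree; only leaf layers ever need a concrete shape, which they derive from the config. A toy sketch of that delegation, not from the commit (both class names are illustrative):

    import tensorflow as tf

    class Leaf(tf.keras.layers.Layer):  # illustrative leaf that knows its own shape
        def build(self, input_shape=None):
            if self.built:
                return
            self.built = True
            self.w = self.add_weight(name="w", shape=(8, 8))

    class Parent(tf.keras.layers.Layer):  # illustrative composite layer
        def __init__(self, **kwargs):
            super().__init__(**kwargs)
            self.leaf = Leaf(name="leaf")

        def build(self, input_shape=None):
            if self.built:
                return
            self.built = True
            if getattr(self, "leaf", None) is not None:
                with tf.name_scope(self.leaf.name):
                    self.leaf.build(None)  # the leaf derives its own shape

    parent = Parent()
    parent.build(None)
    print(len(parent.weights))  # 1 -- the leaf's weight, created without inputs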
@@ -825,6 +917,14 @@ def call(

         return outputs

+    def build(self, input_shape=None):
+        if self.built:
+            return
+        self.built = True
+        if getattr(self, "albert", None) is not None:
+            with tf.name_scope(self.albert.name):
+                self.albert.build(None)
+

 @add_start_docstrings(
     """
@@ -921,6 +1021,20 @@ def call(
             attentions=outputs.attentions,
         )

+    def build(self, input_shape=None):
+        if self.built:
+            return
+        self.built = True
+        if getattr(self, "albert", None) is not None:
+            with tf.name_scope(self.albert.name):
+                self.albert.build(None)
+        if getattr(self, "predictions", None) is not None:
+            with tf.name_scope(self.predictions.name):
+                self.predictions.build(None)
+        if getattr(self, "sop_classifier", None) is not None:
+            with tf.name_scope(self.sop_classifier.name):
+                self.sop_classifier.build(None)
+

 class TFAlbertSOPHead(tf.keras.layers.Layer):
     def __init__(self, config: AlbertConfig, **kwargs):
@@ -932,13 +1046,23 @@ def __init__(self, config: AlbertConfig, **kwargs):
             kernel_initializer=get_initializer(config.initializer_range),
             name="classifier",
         )
+        self.hidden_size = config.hidden_size
+        self.config = config

     def call(self, pooled_output: tf.Tensor, training: bool) -> tf.Tensor:
         dropout_pooled_output = self.dropout(inputs=pooled_output, training=training)
         logits = self.classifier(inputs=dropout_pooled_output)

         return logits

+    def build(self, input_shape=None):
+        if self.built:
+            return
+        self.built = True
+        if getattr(self, "classifier", None) is not None:
+            with tf.name_scope(self.classifier.name):
+                self.classifier.build(self.config.hidden_size)
+

 @add_start_docstrings("""Albert Model with a `language modeling` head on top.""", ALBERT_START_DOCSTRING)
 class TFAlbertForMaskedLM(TFAlbertPreTrainedModel, TFMaskedLanguageModelingLoss):
@@ -1035,6 +1159,17 @@ def call(
             attentions=outputs.attentions,
         )

+    def build(self, input_shape=None):
+        if self.built:
+            return
+        self.built = True
+        if getattr(self, "albert", None) is not None:
+            with tf.name_scope(self.albert.name):
+                self.albert.build(None)
+        if getattr(self, "predictions", None) is not None:
+            with tf.name_scope(self.predictions.name):
+                self.predictions.build(None)
+

 @add_start_docstrings(
     """
@@ -1058,6 +1193,8 @@ def __init__(self, config: AlbertConfig, *inputs, **kwargs):
         self.classifier = tf.keras.layers.Dense(
             units=config.num_labels, kernel_initializer=get_initializer(config.initializer_range), name="classifier"
         )
+        self.hidden_size = config.hidden_size
+        self.config = config

     @unpack_inputs
     @add_start_docstrings_to_model_forward(ALBERT_INPUTS_DOCSTRING.format("batch_size, sequence_length"))
@@ -1117,6 +1254,17 @@ def call(
             attentions=outputs.attentions,
         )

+    def build(self, input_shape=None):
+        if self.built:
+            return
+        self.built = True
+        if getattr(self, "albert", None) is not None:
+            with tf.name_scope(self.albert.name):
+                self.albert.build(None)
+        if getattr(self, "classifier", None) is not None:
+            with tf.name_scope(self.classifier.name):
+                self.classifier.build(self.config.hidden_size)
+

 @add_start_docstrings(
     """
@@ -1145,6 +1293,8 @@ def __init__(self, config: AlbertConfig, *inputs, **kwargs):
         self.classifier = tf.keras.layers.Dense(
             units=config.num_labels, kernel_initializer=get_initializer(config.initializer_range), name="classifier"
         )
+        self.hidden_size = config.hidden_size
+        self.config = config

     @unpack_inputs
     @add_start_docstrings_to_model_forward(ALBERT_INPUTS_DOCSTRING.format("batch_size, sequence_length"))
@@ -1200,6 +1350,17 @@ def call(
             attentions=outputs.attentions,
         )

+    def build(self, input_shape=None):
+        if self.built:
+            return
+        self.built = True
+        if getattr(self, "albert", None) is not None:
+            with tf.name_scope(self.albert.name):
+                self.albert.build(None)
+        if getattr(self, "classifier", None) is not None:
+            with tf.name_scope(self.classifier.name):
+                self.classifier.build(self.config.hidden_size)
+

 @add_start_docstrings(
     """
@@ -1221,6 +1382,8 @@ def __init__(self, config: AlbertConfig, *inputs, **kwargs):
         self.qa_outputs = tf.keras.layers.Dense(
             units=config.num_labels, kernel_initializer=get_initializer(config.initializer_range), name="qa_outputs"
         )
+        self.hidden_size = config.hidden_size
+        self.config = config

     @unpack_inputs
     @add_start_docstrings_to_model_forward(ALBERT_INPUTS_DOCSTRING.format("batch_size, sequence_length"))
@@ -1295,6 +1458,17 @@ def call(
             attentions=outputs.attentions,
         )

+    def build(self, input_shape=None):
+        if self.built:
+            return
+        self.built = True
+        if getattr(self, "albert", None) is not None:
+            with tf.name_scope(self.albert.name):
+                self.albert.build(None)
+        if getattr(self, "qa_outputs", None) is not None:
+            with tf.name_scope(self.qa_outputs.name):
+                self.qa_outputs.build(self.config.hidden_size)
+

 @add_start_docstrings(
     """
@@ -1316,6 +1490,8 @@ def __init__(self, config: AlbertConfig, *inputs, **kwargs):
         self.classifier = tf.keras.layers.Dense(
             units=1, kernel_initializer=get_initializer(config.initializer_range), name="classifier"
         )
+        self.hidden_size = config.hidden_size
+        self.config = config

     @unpack_inputs
     @add_start_docstrings_to_model_forward(ALBERT_INPUTS_DOCSTRING.format("batch_size, num_choices, sequence_length"))
@@ -1394,3 +1570,14 @@ def call(
             hidden_states=outputs.hidden_states,
             attentions=outputs.attentions,
         )
+
+    def build(self, input_shape=None):
+        if self.built:
+            return
+        self.built = True
+        if getattr(self, "albert", None) is not None:
+            with tf.name_scope(self.albert.name):
+                self.albert.build(None)
+        if getattr(self, "classifier", None) is not None:
+            with tf.name_scope(self.classifier.name):
+                self.classifier.build(self.config.hidden_size)
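End to end, the payoff is that a model instantiated from a config can be built symbolically. A sketch of the intended usage, assuming a transformers checkout that includes this commit (since `TFAlbertModel.build` above overrides the default Keras `Model.build`):

    from transformers import AlbertConfig, TFAlbertModel

    config = AlbertConfig()
    model = TFAlbertModel(config)
    model.build(None)  # the new build() methods create every weight without dummy inputs
    print(len(model.weights) > 0)  # True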
