
Commit 0996a10

Revert low cpu mem tie weights (#29135)
* Revert "Add tie_weights() to LM heads and set bias in set_output_embeddings() (#28948)"
  This reverts commit 725f4ad.

* Revert "Patch to skip failing `test_save_load_low_cpu_mem_usage` tests (#29043)"
  This reverts commit 4156f51.
1 parent 15cfe38 commit 0996a10

26 files changed (+0, -144 lines)
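
Context for the revert: the prediction heads touched below keep a standalone `bias` Parameter and alias the decoder's bias to it so that `resize_token_embeddings` keeps the two in sync (see the `# Need a link ...` comment in every hunk). The reverted `_tie_weights()` methods simply repeated that assignment so the weight-tying machinery could re-establish the alias. A minimal, hypothetical PyTorch sketch of the aliasing pattern itself (a made-up `ToyLMHead`, not the transformers classes):

# Illustrative sketch only: shows the bias-aliasing pattern the diff removes,
# not the library implementation.
import torch
from torch import nn

class ToyLMHead(nn.Module):
    def __init__(self, hidden_size: int, vocab_size: int):
        super().__init__()
        self.decoder = nn.Linear(hidden_size, vocab_size, bias=False)
        self.bias = nn.Parameter(torch.zeros(vocab_size))
        # Link the two attributes so they reference the same Parameter;
        # anything that resizes or reloads one through this link keeps both consistent.
        self.decoder.bias = self.bias

    def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
        return self.decoder(hidden_states)

head = ToyLMHead(hidden_size=16, vocab_size=100)
assert head.decoder.bias is head.bias  # one tensor, two names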

src/transformers/models/bert/modeling_bert.py (-6 lines)

@@ -692,9 +692,6 @@ def __init__(self, config):
         # Need a link between the two variables so that the bias is correctly resized with `resize_token_embeddings`
         self.decoder.bias = self.bias

-    def _tie_weights(self):
-        self.decoder.bias = self.bias
-
     def forward(self, hidden_states):
         hidden_states = self.transform(hidden_states)
         hidden_states = self.decoder(hidden_states)
@@ -1065,7 +1062,6 @@ def get_output_embeddings(self):

     def set_output_embeddings(self, new_embeddings):
         self.cls.predictions.decoder = new_embeddings
-        self.cls.predictions.bias = new_embeddings.bias

     @add_start_docstrings_to_model_forward(BERT_INPUTS_DOCSTRING.format("batch_size, sequence_length"))
     @replace_return_docstrings(output_type=BertForPreTrainingOutput, config_class=_CONFIG_FOR_DOC)
@@ -1175,7 +1171,6 @@ def get_output_embeddings(self):

     def set_output_embeddings(self, new_embeddings):
         self.cls.predictions.decoder = new_embeddings
-        self.cls.predictions.bias = new_embeddings.bias

     @add_start_docstrings_to_model_forward(BERT_INPUTS_DOCSTRING.format("batch_size, sequence_length"))
     @add_code_sample_docstrings(
@@ -1329,7 +1324,6 @@ def get_output_embeddings(self):

     def set_output_embeddings(self, new_embeddings):
         self.cls.predictions.decoder = new_embeddings
-        self.cls.predictions.bias = new_embeddings.bias

     @add_start_docstrings_to_model_forward(BERT_INPUTS_DOCSTRING.format("batch_size, sequence_length"))
     @add_code_sample_docstrings(
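
The other removed line (in `set_output_embeddings`) undoes the explicit bias re-link that #28948 had added for the case where the decoder module is swapped out. Continuing the hypothetical `ToyLMHead` sketch above (illustrative only, not the transformers implementation):

# Swapping the decoder, as set_output_embeddings() does, breaks the alias set
# up in __init__: head.bias still points at the old Parameter.
from torch import nn

head = ToyLMHead(hidden_size=16, vocab_size=100)
new_embeddings = nn.Linear(16, 100)          # stand-in for the new output embeddings
head.decoder = new_embeddings                # what set_output_embeddings() still does
assert head.decoder.bias is not head.bias    # the __init__ link is now stale
head.bias = new_embeddings.bias              # the explicit re-link removed by this revert
assert head.decoder.bias is head.bias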

src/transformers/models/big_bird/modeling_big_bird.py (-6 lines)

@@ -1707,9 +1707,6 @@ def __init__(self, config):
         # Need a link between the two variables so that the bias is correctly resized with `resize_token_embeddings`
         self.decoder.bias = self.bias

-    def _tie_weights(self):
-        self.decoder.bias = self.bias
-
     def forward(self, hidden_states):
         hidden_states = self.transform(hidden_states)
         hidden_states = self.decoder(hidden_states)
@@ -2269,7 +2266,6 @@ def get_output_embeddings(self):

     def set_output_embeddings(self, new_embeddings):
         self.cls.predictions.decoder = new_embeddings
-        self.cls.predictions.bias = new_embeddings.bias

     @add_start_docstrings_to_model_forward(BIG_BIRD_INPUTS_DOCSTRING.format("batch_size, sequence_length"))
     @replace_return_docstrings(output_type=BigBirdForPreTrainingOutput, config_class=_CONFIG_FOR_DOC)
@@ -2382,7 +2378,6 @@ def get_output_embeddings(self):

     def set_output_embeddings(self, new_embeddings):
         self.cls.predictions.decoder = new_embeddings
-        self.cls.predictions.bias = new_embeddings.bias

     @add_start_docstrings_to_model_forward(BIG_BIRD_INPUTS_DOCSTRING.format("batch_size, sequence_length"))
     @replace_return_docstrings(output_type=MaskedLMOutput, config_class=_CONFIG_FOR_DOC)
@@ -2524,7 +2519,6 @@ def get_output_embeddings(self):

     def set_output_embeddings(self, new_embeddings):
         self.cls.predictions.decoder = new_embeddings
-        self.cls.predictions.bias = new_embeddings.bias

     @add_start_docstrings_to_model_forward(BIG_BIRD_INPUTS_DOCSTRING.format("batch_size, sequence_length"))
     @add_code_sample_docstrings(

src/transformers/models/blip/modeling_blip_text.py (-4 lines)

@@ -523,9 +523,6 @@ def __init__(self, config):
         # Need a link between the two variables so that the bias is correctly resized with `resize_token_embeddings`
         self.decoder.bias = self.bias

-    def _tie_weights(self):
-        self.decoder.bias = self.bias
-
     def forward(self, hidden_states):
         hidden_states = self.transform(hidden_states)
         hidden_states = self.decoder(hidden_states)
@@ -820,7 +817,6 @@ def get_output_embeddings(self):

     def set_output_embeddings(self, new_embeddings):
         self.cls.predictions.decoder = new_embeddings
-        self.cls.predictions.bias = new_embeddings.bias

     def forward(
         self,

src/transformers/models/ernie/modeling_ernie.py (-6 lines)

@@ -608,9 +608,6 @@ def __init__(self, config):
         # Need a link between the two variables so that the bias is correctly resized with `resize_token_embeddings`
         self.decoder.bias = self.bias

-    def _tie_weights(self):
-        self.decoder.bias = self.bias
-
     def forward(self, hidden_states):
         hidden_states = self.transform(hidden_states)
         hidden_states = self.decoder(hidden_states)
@@ -998,7 +995,6 @@ def get_output_embeddings(self):
     # Copied from transformers.models.bert.modeling_bert.BertForPreTraining.set_output_embeddings
     def set_output_embeddings(self, new_embeddings):
         self.cls.predictions.decoder = new_embeddings
-        self.cls.predictions.bias = new_embeddings.bias

     @add_start_docstrings_to_model_forward(ERNIE_INPUTS_DOCSTRING.format("batch_size, sequence_length"))
     @replace_return_docstrings(output_type=ErnieForPreTrainingOutput, config_class=_CONFIG_FOR_DOC)
@@ -1113,7 +1109,6 @@ def get_output_embeddings(self):
     # Copied from transformers.models.bert.modeling_bert.BertLMHeadModel.set_output_embeddings
     def set_output_embeddings(self, new_embeddings):
         self.cls.predictions.decoder = new_embeddings
-        self.cls.predictions.bias = new_embeddings.bias

     @add_start_docstrings_to_model_forward(ERNIE_INPUTS_DOCSTRING.format("batch_size, sequence_length"))
     @add_code_sample_docstrings(
@@ -1274,7 +1269,6 @@ def get_output_embeddings(self):
     # Copied from transformers.models.bert.modeling_bert.BertForMaskedLM.set_output_embeddings
     def set_output_embeddings(self, new_embeddings):
         self.cls.predictions.decoder = new_embeddings
-        self.cls.predictions.bias = new_embeddings.bias

     @add_start_docstrings_to_model_forward(ERNIE_INPUTS_DOCSTRING.format("batch_size, sequence_length"))
     @add_code_sample_docstrings(

src/transformers/models/layoutlm/modeling_layoutlm.py (-4 lines)

@@ -589,9 +589,6 @@ def __init__(self, config):
         # Need a link between the two variables so that the bias is correctly resized with `resize_token_embeddings`
         self.decoder.bias = self.bias

-    def _tie_weights(self):
-        self.decoder.bias = self.bias
-
     def forward(self, hidden_states):
         hidden_states = self.transform(hidden_states)
         hidden_states = self.decoder(hidden_states)
@@ -872,7 +869,6 @@ def get_output_embeddings(self):

     def set_output_embeddings(self, new_embeddings):
         self.cls.predictions.decoder = new_embeddings
-        self.cls.predictions.bias = new_embeddings.bias

     @add_start_docstrings_to_model_forward(LAYOUTLM_INPUTS_DOCSTRING.format("batch_size, sequence_length"))
     @replace_return_docstrings(output_type=MaskedLMOutput, config_class=_CONFIG_FOR_DOC)

src/transformers/models/markuplm/modeling_markuplm.py (-3 lines)

@@ -318,9 +318,6 @@ def __init__(self, config):
         # Need a link between the two variables so that the bias is correctly resized with `resize_token_embeddings`
         self.decoder.bias = self.bias

-    def _tie_weights(self):
-        self.decoder.bias = self.bias
-
     def forward(self, hidden_states):
         hidden_states = self.transform(hidden_states)
         hidden_states = self.decoder(hidden_states)

src/transformers/models/megatron_bert/modeling_megatron_bert.py (-6 lines)

@@ -659,9 +659,6 @@ def __init__(self, config):
         # Need a link between the two variables so that the bias is correctly resized with `resize_token_embeddings`
         self.decoder.bias = self.bias

-    def _tie_weights(self):
-        self.decoder.bias = self.bias
-
     def forward(self, hidden_states):
         hidden_states = self.transform(hidden_states)
         hidden_states = self.decoder(hidden_states)
@@ -1026,7 +1023,6 @@ def get_output_embeddings(self):

     def set_output_embeddings(self, new_embeddings):
         self.cls.predictions.decoder = new_embeddings
-        self.cls.predictions.bias = new_embeddings.bias

     @add_start_docstrings_to_model_forward(MEGATRON_BERT_INPUTS_DOCSTRING.format("batch_size, sequence_length"))
     @replace_return_docstrings(output_type=MegatronBertForPreTrainingOutput, config_class=_CONFIG_FOR_DOC)
@@ -1136,7 +1132,6 @@ def get_output_embeddings(self):

     def set_output_embeddings(self, new_embeddings):
         self.cls.predictions.decoder = new_embeddings
-        self.cls.predictions.bias = new_embeddings.bias

     @add_start_docstrings_to_model_forward(MEGATRON_BERT_INPUTS_DOCSTRING.format("batch_size, sequence_length"))
     @replace_return_docstrings(output_type=CausalLMOutputWithCrossAttentions, config_class=_CONFIG_FOR_DOC)
@@ -1295,7 +1290,6 @@ def get_output_embeddings(self):

     def set_output_embeddings(self, new_embeddings):
         self.cls.predictions.decoder = new_embeddings
-        self.cls.predictions.bias = new_embeddings.bias

     @add_start_docstrings_to_model_forward(MEGATRON_BERT_INPUTS_DOCSTRING.format("batch_size, sequence_length"))
     @add_code_sample_docstrings(

src/transformers/models/mpnet/modeling_mpnet.py (-4 lines)

@@ -587,7 +587,6 @@ def get_output_embeddings(self):

     def set_output_embeddings(self, new_embeddings):
         self.lm_head.decoder = new_embeddings
-        self.lm_head.bias = new_embeddings.bias

     @add_start_docstrings_to_model_forward(MPNET_INPUTS_DOCSTRING.format("batch_size, sequence_length"))
     @add_code_sample_docstrings(
@@ -660,9 +659,6 @@ def __init__(self, config):
         # Need a link between the two variables so that the bias is correctly resized with `resize_token_embeddings`
         self.decoder.bias = self.bias

-    def _tie_weights(self):
-        self.decoder.bias = self.bias
-
     def forward(self, features, **kwargs):
         x = self.dense(features)
         x = gelu(x)

src/transformers/models/mra/modeling_mra.py (-4 lines)

@@ -810,9 +810,6 @@ def __init__(self, config):
         # Need a link between the two variables so that the bias is correctly resized with `resize_token_embeddings`
         self.decoder.bias = self.bias

-    def _tie_weights(self):
-        self.decoder.bias = self.bias
-
     def forward(self, hidden_states):
         hidden_states = self.transform(hidden_states)
         hidden_states = self.decoder(hidden_states)
@@ -1046,7 +1043,6 @@ def get_output_embeddings(self):

     def set_output_embeddings(self, new_embeddings):
         self.cls.predictions.decoder = new_embeddings
-        self.cls.predictions.bias = new_embeddings.bias

     @add_start_docstrings_to_model_forward(MRA_INPUTS_DOCSTRING.format("batch_size, sequence_length"))
     @add_code_sample_docstrings(

src/transformers/models/nezha/modeling_nezha.py (-5 lines)

@@ -679,9 +679,6 @@ def __init__(self, config):
         # Need a link between the two variables so that the bias is correctly resized with `resize_token_embeddings`
         self.decoder.bias = self.bias

-    def _tie_weights(self):
-        self.decoder.bias = self.bias
-
     def forward(self, hidden_states):
         hidden_states = self.transform(hidden_states)
         hidden_states = self.decoder(hidden_states)
@@ -1047,7 +1044,6 @@ def get_output_embeddings(self):

     def set_output_embeddings(self, new_embeddings):
         self.cls.predictions.decoder = new_embeddings
-        self.cls.predictions.bias = new_embeddings.bias

     @add_start_docstrings_to_model_forward(NEZHA_INPUTS_DOCSTRING.format("batch_size, sequence_length"))
     @replace_return_docstrings(output_type=NezhaForPreTrainingOutput, config_class=_CONFIG_FOR_DOC)
@@ -1156,7 +1152,6 @@ def get_output_embeddings(self):

     def set_output_embeddings(self, new_embeddings):
         self.cls.predictions.decoder = new_embeddings
-        self.cls.predictions.bias = new_embeddings.bias

     @add_start_docstrings_to_model_forward(NEZHA_INPUTS_DOCSTRING.format("batch_size, sequence_length"))
     @add_code_sample_docstrings(

src/transformers/models/nystromformer/modeling_nystromformer.py (-4 lines)

@@ -428,9 +428,6 @@ def __init__(self, config):
         # Need a link between the two variables so that the bias is correctly resized with `resize_token_embeddings`
         self.decoder.bias = self.bias

-    def _tie_weights(self):
-        self.decoder.bias = self.bias
-
     def forward(self, hidden_states):
         hidden_states = self.transform(hidden_states)
         hidden_states = self.decoder(hidden_states)
@@ -669,7 +666,6 @@ def get_output_embeddings(self):

     def set_output_embeddings(self, new_embeddings):
         self.cls.predictions.decoder = new_embeddings
-        self.cls.predictions.bias = new_embeddings.bias

     @add_start_docstrings_to_model_forward(NYSTROMFORMER_INPUTS_DOCSTRING.format("batch_size, sequence_length"))
     @add_code_sample_docstrings(

src/transformers/models/qdqbert/modeling_qdqbert.py (-5 lines)

@@ -683,9 +683,6 @@ def __init__(self, config):
         # Need a link between the two variables so that the bias is correctly resized with `resize_token_embeddings`
         self.decoder.bias = self.bias

-    def _tie_weights(self):
-        self.decoder.bias = self.bias
-
     def forward(self, hidden_states):
         hidden_states = self.transform(hidden_states)
         hidden_states = self.decoder(hidden_states)
@@ -1027,7 +1024,6 @@ def get_output_embeddings(self):

     def set_output_embeddings(self, new_embeddings):
         self.cls.predictions.decoder = new_embeddings
-        self.cls.predictions.bias = new_embeddings.bias

     @add_start_docstrings_to_model_forward(QDQBERT_INPUTS_DOCSTRING.format("batch_size, sequence_length"))
     @replace_return_docstrings(output_type=CausalLMOutputWithCrossAttentions, config_class=_CONFIG_FOR_DOC)
@@ -1194,7 +1190,6 @@ def get_output_embeddings(self):

     def set_output_embeddings(self, new_embeddings):
         self.cls.predictions.decoder = new_embeddings
-        self.cls.predictions.bias = new_embeddings.bias

     @add_start_docstrings_to_model_forward(QDQBERT_INPUTS_DOCSTRING.format("batch_size, sequence_length"))
     @add_code_sample_docstrings(

src/transformers/models/roc_bert/modeling_roc_bert.py (-6 lines)

@@ -744,9 +744,6 @@ def __init__(self, config):
         # Need a link between the two variables so that the bias is correctly resized with `resize_token_embeddings`
         self.decoder.bias = self.bias

-    def _tie_weights(self):
-        self.decoder.bias = self.bias
-
     def forward(self, hidden_states):
         hidden_states = self.transform(hidden_states)
         hidden_states = self.decoder(hidden_states)
@@ -1093,7 +1090,6 @@ def get_output_embeddings(self):
     # Copied from transformers.models.bert.modeling_bert.BertForPreTraining.set_output_embeddings
     def set_output_embeddings(self, new_embeddings):
         self.cls.predictions.decoder = new_embeddings
-        self.cls.predictions.bias = new_embeddings.bias

     @add_start_docstrings_to_model_forward(ROC_BERT_INPUTS_DOCSTRING.format("batch_size, sequence_length"))
     @replace_return_docstrings(output_type=MaskedLMOutput, config_class=_CONFIG_FOR_DOC)
@@ -1286,7 +1282,6 @@ def get_output_embeddings(self):
     # Copied from transformers.models.bert.modeling_bert.BertForMaskedLM.set_output_embeddings
     def set_output_embeddings(self, new_embeddings):
         self.cls.predictions.decoder = new_embeddings
-        self.cls.predictions.bias = new_embeddings.bias

     @add_start_docstrings_to_model_forward(ROC_BERT_INPUTS_DOCSTRING.format("batch_size, sequence_length"))
     def forward(
@@ -1424,7 +1419,6 @@ def get_output_embeddings(self):
     # Copied from transformers.models.bert.modeling_bert.BertLMHeadModel.set_output_embeddings
     def set_output_embeddings(self, new_embeddings):
         self.cls.predictions.decoder = new_embeddings
-        self.cls.predictions.bias = new_embeddings.bias

     @add_start_docstrings_to_model_forward(ROC_BERT_INPUTS_DOCSTRING.format("batch_size, sequence_length"))
     @replace_return_docstrings(output_type=CausalLMOutputWithCrossAttentions, config_class=_CONFIG_FOR_DOC)

src/transformers/models/tapas/modeling_tapas.py (-4 lines)

@@ -729,9 +729,6 @@ def __init__(self, config):
         # Need a link between the two variables so that the bias is correctly resized with `resize_token_embeddings`
         self.decoder.bias = self.bias

-    def _tie_weights(self):
-        self.decoder.bias = self.bias
-
     def forward(self, hidden_states):
         hidden_states = self.transform(hidden_states)
         hidden_states = self.decoder(hidden_states)
@@ -1011,7 +1008,6 @@ def get_output_embeddings(self):

     def set_output_embeddings(self, new_embeddings):
         self.cls.predictions.decoder = new_embeddings
-        self.cls.predictions.bias = new_embeddings.bias

     @add_start_docstrings_to_model_forward(TAPAS_INPUTS_DOCSTRING.format("batch_size, sequence_length"))
     @replace_return_docstrings(output_type=MaskedLMOutput, config_class=_CONFIG_FOR_DOC)

src/transformers/models/vilt/modeling_vilt.py (-4 lines)

@@ -896,7 +896,6 @@ def get_output_embeddings(self):

     def set_output_embeddings(self, new_embeddings):
         self.mlm_score.decoder = new_embeddings
-        self.mlm_score.bias = new_embeddings.bias

     @add_start_docstrings_to_model_forward(VILT_INPUTS_DOCSTRING.format("batch_size, sequence_length"))
     @replace_return_docstrings(output_type=MaskedLMOutput, config_class=_CONFIG_FOR_DOC)
@@ -1043,9 +1042,6 @@ def __init__(self, config, weight=None):
         # Need a link between the two variables so that the bias is correctly resized with `resize_token_embeddings`
         self.decoder.bias = self.bias

-    def _tie_weights(self):
-        self.decoder.bias = self.bias
-
     def forward(self, x):
         x = self.transform(x)
         x = self.decoder(x)

src/transformers/models/visual_bert/modeling_visual_bert.py (-4 lines)

@@ -499,9 +499,6 @@ def __init__(self, config):
         # Need a link between the two variables so that the bias is correctly resized with `resize_token_embeddings`
         self.decoder.bias = self.bias

-    def _tie_weights(self):
-        self.decoder.bias = self.bias
-
     def forward(self, hidden_states):
         hidden_states = self.transform(hidden_states)
         hidden_states = self.decoder(hidden_states)
@@ -882,7 +879,6 @@ def get_output_embeddings(self):

     def set_output_embeddings(self, new_embeddings):
         self.cls.predictions.decoder = new_embeddings
-        self.cls.predictions.bias = new_embeddings.bias

     @add_start_docstrings_to_model_forward(VISUAL_BERT_INPUTS_DOCSTRING.format("batch_size, sequence_length"))
     @replace_return_docstrings(output_type=VisualBertForPreTrainingOutput, config_class=_CONFIG_FOR_DOC)

src/transformers/models/yoso/modeling_yoso.py (-4 lines)

@@ -626,9 +626,6 @@ def __init__(self, config):
         # Need a link between the two variables so that the bias is correctly resized with `resize_token_embeddings`
         self.decoder.bias = self.bias

-    def _tie_weights(self):
-        self.decoder.bias = self.bias
-
     def forward(self, hidden_states):
         hidden_states = self.transform(hidden_states)
         hidden_states = self.decoder(hidden_states)
@@ -867,7 +864,6 @@ def get_output_embeddings(self):

     def set_output_embeddings(self, new_embeddings):
         self.cls.predictions.decoder = new_embeddings
-        self.cls.predictions.bias = new_embeddings.bias

     @add_start_docstrings_to_model_forward(YOSO_INPUTS_DOCSTRING.format("batch_size, sequence_length"))
     @add_code_sample_docstrings(
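
The companion revert removes the skip patch from #29043, which had disabled the failing `test_save_load_low_cpu_mem_usage` tests for these models (the test-file hunks are not shown in this truncated diff view). Roughly, that kind of test does a save/reload round trip with `low_cpu_mem_usage=True` (a real `from_pretrained` flag, which requires `accelerate`) and compares weights. The sketch below is only an approximation of that idea with arbitrary tiny config values, not the actual test from the transformers test suite:

# Rough, hypothetical sketch of a save/load round-trip check with
# low_cpu_mem_usage=True; the real test lives in the transformers test suite.
import tempfile
import torch
from transformers import BertConfig, BertForMaskedLM

config = BertConfig(
    hidden_size=32, num_hidden_layers=2, num_attention_heads=2,
    intermediate_size=64, vocab_size=128,
)
model = BertForMaskedLM(config)

with tempfile.TemporaryDirectory() as tmp:
    model.save_pretrained(tmp)
    # low_cpu_mem_usage=True loads via meta-device initialization,
    # the code path the reverted PRs were concerned with.
    reloaded = BertForMaskedLM.from_pretrained(tmp, low_cpu_mem_usage=True)

for name, param in model.state_dict().items():
    assert torch.equal(param, reloaded.state_dict()[name]), name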
