Commit d99ad13

Delete lots of transformers.AdamW references
1 parent c839be7

19 files changed: 25 additions, 39 deletions

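Every hunk below makes the same substitution: the `AdamW` import is dropped from the `from transformers import (...)` block and the optimizer is built with `torch.optim.AdamW` instead, with the call arguments left untouched. A minimal, self-contained sketch of the pattern; the tiny linear model and the hyperparameter values are illustrative stand-ins for the `model` and `args` objects used in the scripts:

import torch

# The grouped-parameter setup is unchanged; only the optimizer class moves
# from transformers.AdamW to torch.optim.AdamW.
model = torch.nn.Linear(16, 4)
no_decay = ["bias", "LayerNorm.weight"]
optimizer_grouped_parameters = [
    {
        "params": [p for n, p in model.named_parameters() if not any(nd in n for nd in no_decay)],
        "weight_decay": 0.01,  # illustrative value; the scripts use args.weight_decay
    },
    {
        "params": [p for n, p in model.named_parameters() if any(nd in n for nd in no_decay)],
        "weight_decay": 0.0,
    },
]

# Before this commit: from transformers import AdamW; optimizer = AdamW(...)
optimizer = torch.optim.AdamW(optimizer_grouped_parameters, lr=5e-5, eps=1e-8)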

examples/legacy/question-answering/run_squad.py (1 addition, 2 deletions)

@@ -32,7 +32,6 @@
 from transformers import (
     MODEL_FOR_QUESTION_ANSWERING_MAPPING,
     WEIGHTS_NAME,
-    AdamW,
     AutoConfig,
     AutoModelForQuestionAnswering,
     AutoTokenizer,
@@ -96,7 +95,7 @@ def train(args, train_dataset, model, tokenizer):
         },
         {"params": [p for n, p in model.named_parameters() if any(nd in n for nd in no_decay)], "weight_decay": 0.0},
     ]
-    optimizer = AdamW(optimizer_grouped_parameters, lr=args.learning_rate, eps=args.adam_epsilon)
+    optimizer = torch.optim.AdamW(optimizer_grouped_parameters, lr=args.learning_rate, eps=args.adam_epsilon)
     scheduler = get_linear_schedule_with_warmup(
         optimizer, num_warmup_steps=args.warmup_steps, num_training_steps=t_total
     )

examples/legacy/run_openai_gpt.py (1 addition, 2 deletions)

@@ -43,7 +43,6 @@
 from transformers import (
     CONFIG_NAME,
     WEIGHTS_NAME,
-    AdamW,
     OpenAIGPTDoubleHeadsModel,
     OpenAIGPTTokenizer,
     get_linear_schedule_with_warmup,
@@ -236,7 +235,7 @@ def tokenize_and_encode(obj):
         },
         {"params": [p for n, p in param_optimizer if any(nd in n for nd in no_decay)], "weight_decay": 0.0},
     ]
-    optimizer = AdamW(optimizer_grouped_parameters, lr=args.learning_rate, eps=args.adam_epsilon)
+    optimizer = torch.optim.AdamW(optimizer_grouped_parameters, lr=args.learning_rate, eps=args.adam_epsilon)
     scheduler = get_linear_schedule_with_warmup(
         optimizer, num_warmup_steps=args.warmup_steps, num_training_steps=t_total
     )

examples/legacy/run_swag.py (1 addition, 2 deletions)

@@ -34,7 +34,6 @@
 import transformers
 from transformers import (
     WEIGHTS_NAME,
-    AdamW,
     AutoConfig,
     AutoModelForMultipleChoice,
     AutoTokenizer,
@@ -298,7 +297,7 @@ def train(args, train_dataset, model, tokenizer):
         },
         {"params": [p for n, p in model.named_parameters() if any(nd in n for nd in no_decay)], "weight_decay": 0.0},
     ]
-    optimizer = AdamW(optimizer_grouped_parameters, lr=args.learning_rate, eps=args.adam_epsilon)
+    optimizer = torch.optim.AdamW(optimizer_grouped_parameters, lr=args.learning_rate, eps=args.adam_epsilon)
     scheduler = get_linear_schedule_with_warmup(
         optimizer, num_warmup_steps=args.warmup_steps, num_training_steps=t_total
     )

examples/pytorch/question-answering/run_qa_beam_search_no_trainer.py (1 addition, 2 deletions)

@@ -41,7 +41,6 @@

 import transformers
 from transformers import (
-    AdamW,
     DataCollatorWithPadding,
     EvalPrediction,
     SchedulerType,
@@ -767,7 +766,7 @@ def create_and_fill_np_array(start_or_end_logits, dataset, max_len):
             "weight_decay": 0.0,
         },
     ]
-    optimizer = AdamW(optimizer_grouped_parameters, lr=args.learning_rate)
+    optimizer = torch.optim.AdamW(optimizer_grouped_parameters, lr=args.learning_rate)

     # Scheduler and math around the number of training steps.
     overrode_max_train_steps = False

examples/pytorch/speech-pretraining/run_wav2vec2_pretraining_no_trainer.py (1 addition, 2 deletions)

@@ -33,7 +33,6 @@

 import transformers
 from transformers import (
-    AdamW,
     SchedulerType,
     Wav2Vec2Config,
     Wav2Vec2FeatureExtractor,
@@ -583,7 +582,7 @@ def prepare_dataset(batch):
     )

     # Optimizer
-    optimizer = AdamW(
+    optimizer = torch.optim.AdamW(
         list(model.parameters()),
         lr=args.learning_rate,
         betas=[args.adam_beta1, args.adam_beta2],
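As the hunk above suggests, the replacement is a drop-in even when extra keyword arguments are passed: `torch.optim.AdamW` accepts the `lr`, `betas`, and `eps` arguments used in these scripts (as well as `weight_decay`). A small runnable check with illustrative values; the real script reads them from `args`:

import torch

# torch.optim.AdamW takes the same keyword arguments the example scripts pass.
model = torch.nn.Linear(4, 2)
optimizer = torch.optim.AdamW(
    list(model.parameters()),
    lr=5e-5,
    betas=[0.9, 0.999],  # a list works here, as in the wav2vec2 script
    eps=1e-8,
    weight_decay=0.01,
)

loss = model(torch.randn(8, 4)).sum()
loss.backward()
optimizer.step()
optimizer.zero_grad()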

examples/research_projects/bert-loses-patience/run_glue_with_pabee.py (1 addition, 2 deletions)

@@ -34,7 +34,6 @@
 import transformers
 from transformers import (
     WEIGHTS_NAME,
-    AdamW,
     AlbertConfig,
     AlbertTokenizer,
     BertConfig,
@@ -95,7 +94,7 @@ def train(args, train_dataset, model, tokenizer):
         {"params": [p for n, p in model.named_parameters() if any(nd in n for nd in no_decay)], "weight_decay": 0.0},
     ]

-    optimizer = AdamW(optimizer_grouped_parameters, lr=args.learning_rate, eps=args.adam_epsilon)
+    optimizer = torch.optim.AdamW(optimizer_grouped_parameters, lr=args.learning_rate, eps=args.adam_epsilon)
     scheduler = get_linear_schedule_with_warmup(
         optimizer, num_warmup_steps=args.warmup_steps, num_training_steps=t_total
     )

examples/research_projects/deebert/run_glue_deebert.py (1 addition, 2 deletions)

@@ -19,7 +19,6 @@
 from src.modeling_highway_roberta import DeeRobertaForSequenceClassification
 from transformers import (
     WEIGHTS_NAME,
-    AdamW,
     BertConfig,
     BertTokenizer,
     RobertaConfig,
@@ -123,7 +122,7 @@ def train(args, train_dataset, model, tokenizer, train_highway=False):
             "weight_decay": 0.0,
         },
     ]
-    optimizer = AdamW(optimizer_grouped_parameters, lr=args.learning_rate, eps=args.adam_epsilon)
+    optimizer = torch.optim.AdamW(optimizer_grouped_parameters, lr=args.learning_rate, eps=args.adam_epsilon)
     scheduler = get_linear_schedule_with_warmup(
         optimizer, num_warmup_steps=args.warmup_steps, num_training_steps=t_total
     )

examples/research_projects/distillation/run_squad_w_distillation.py (1 addition, 2 deletions)

@@ -32,7 +32,6 @@
 import transformers
 from transformers import (
     WEIGHTS_NAME,
-    AdamW,
     BertConfig,
     BertForQuestionAnswering,
     BertTokenizer,
@@ -114,7 +113,7 @@ def train(args, train_dataset, model, tokenizer, teacher=None):
         },
         {"params": [p for n, p in model.named_parameters() if any(nd in n for nd in no_decay)], "weight_decay": 0.0},
     ]
-    optimizer = AdamW(optimizer_grouped_parameters, lr=args.learning_rate, eps=args.adam_epsilon)
+    optimizer = torch.optim.AdamW(optimizer_grouped_parameters, lr=args.learning_rate, eps=args.adam_epsilon)
     scheduler = get_linear_schedule_with_warmup(
         optimizer, num_warmup_steps=args.warmup_steps, num_training_steps=t_total
     )

examples/research_projects/information-gain-filtration/igf/igf.py (2 additions, 2 deletions)

@@ -12,7 +12,7 @@
 from torch.utils.data import DataLoader
 from tqdm import tqdm

-from transformers import AdamW, GPT2LMHeadModel, get_linear_schedule_with_warmup
+from transformers import GPT2LMHeadModel, get_linear_schedule_with_warmup


 logger = logging.getLogger(__name__)
@@ -112,7 +112,7 @@ def recopy_gpt2(orig_model, device, max_steps):
         },
         {"params": [p for n, p in model.named_parameters() if any(nd in n for nd in no_decay)], "weight_decay": 0.0},
     ]
-    lm_optimizer = AdamW(optimizer_grouped_parameters, lr=5e-5, eps=1e-8)
+    lm_optimizer = torch.optim.AdamW(optimizer_grouped_parameters, lr=5e-5, eps=1e-8)
     lm_scheduler = get_linear_schedule_with_warmup(lm_optimizer, 0, max_steps)
     torch.cuda.empty_cache()
     return model, lm_optimizer, lm_scheduler

examples/research_projects/longform-qa/eli5_utils.py (3 additions, 3 deletions)

@@ -16,7 +16,7 @@
 from torch.utils.data import DataLoader, Dataset, RandomSampler, SequentialSampler
 from tqdm import tqdm

-from transformers import AdamW, AutoModel, AutoModelForSeq2SeqLM, AutoTokenizer, get_linear_schedule_with_warmup
+from transformers import AutoModel, AutoModelForSeq2SeqLM, AutoTokenizer, get_linear_schedule_with_warmup


 pd.set_option("display.max_colwidth", None)
@@ -316,7 +316,7 @@ def evaluate_qa_retriever(model, dataset, tokenizer, args):


 def train_qa_retriever(qar_model, qar_tokenizer, qar_train_dset, qar_valid_dset, qar_args):
-    qar_optimizer = AdamW(qar_model.parameters(), lr=qar_args.learning_rate, eps=1e-8)
+    qar_optimizer = torch.optim.AdamW(qar_model.parameters(), lr=qar_args.learning_rate, eps=1e-8)
     qar_scheduler = get_linear_schedule_with_warmup(
         qar_optimizer,
         num_warmup_steps=100,
@@ -493,7 +493,7 @@ def eval_qa_s2s_epoch(model, dataset, tokenizer, args):


 def train_qa_s2s(qa_s2s_model, qa_s2s_tokenizer, s2s_train_dset, s2s_valid_dset, s2s_args):
-    s2s_optimizer = AdamW(qa_s2s_model.parameters(), lr=s2s_args.learning_rate, eps=1e-8)
+    s2s_optimizer = torch.optim.AdamW(qa_s2s_model.parameters(), lr=s2s_args.learning_rate, eps=1e-8)
     s2s_scheduler = get_linear_schedule_with_warmup(
         s2s_optimizer,
         num_warmup_steps=400,

examples/research_projects/luke/run_luke_ner_no_trainer.py (1 addition, 2 deletions)

@@ -36,7 +36,6 @@

 import transformers
 from transformers import (
-    AdamW,
     LukeConfig,
     LukeForEntitySpanClassification,
     LukeTokenizer,
@@ -569,7 +568,7 @@ def tokenize_and_align_labels(examples):
             "weight_decay": 0.0,
         },
     ]
-    optimizer = AdamW(optimizer_grouped_parameters, lr=args.learning_rate)
+    optimizer = torch.optim.AdamW(optimizer_grouped_parameters, lr=args.learning_rate)

     # Use the device given by the `accelerator` object.
     device = accelerator.device

examples/research_projects/mm-imdb/run_mmimdb.py (1 addition, 2 deletions)

@@ -34,7 +34,6 @@
 import transformers
 from transformers import (
     WEIGHTS_NAME,
-    AdamW,
     AutoConfig,
     AutoModel,
     AutoTokenizer,
@@ -93,7 +92,7 @@ def train(args, train_dataset, model, tokenizer, criterion):
         {"params": [p for n, p in model.named_parameters() if any(nd in n for nd in no_decay)], "weight_decay": 0.0},
     ]

-    optimizer = AdamW(optimizer_grouped_parameters, lr=args.learning_rate, eps=args.adam_epsilon)
+    optimizer = torch.optim.AdamW(optimizer_grouped_parameters, lr=args.learning_rate, eps=args.adam_epsilon)
     scheduler = get_linear_schedule_with_warmup(
         optimizer, num_warmup_steps=args.warmup_steps, num_training_steps=t_total
     )

examples/research_projects/movement-pruning/masked_run_glue.py (1 addition, 2 deletions)

@@ -32,7 +32,6 @@

 from transformers import (
     WEIGHTS_NAME,
-    AdamW,
     BertConfig,
     BertForSequenceClassification,
     BertTokenizer,
@@ -145,7 +144,7 @@ def train(args, train_dataset, model, tokenizer, teacher=None):
         },
     ]

-    optimizer = AdamW(optimizer_grouped_parameters, lr=args.learning_rate, eps=args.adam_epsilon)
+    optimizer = torch.optim.AdamW(optimizer_grouped_parameters, lr=args.learning_rate, eps=args.adam_epsilon)
     scheduler = get_linear_schedule_with_warmup(
         optimizer, num_warmup_steps=args.warmup_steps, num_training_steps=t_total
     )

examples/research_projects/movement-pruning/masked_run_squad.py (1 addition, 2 deletions)

@@ -32,7 +32,6 @@

 from transformers import (
     WEIGHTS_NAME,
-    AdamW,
     BertConfig,
     BertForQuestionAnswering,
     BertTokenizer,
@@ -152,7 +151,7 @@ def train(args, train_dataset, model, tokenizer, teacher=None):
         },
     ]

-    optimizer = AdamW(optimizer_grouped_parameters, lr=args.learning_rate, eps=args.adam_epsilon)
+    optimizer = torch.optim.AdamW(optimizer_grouped_parameters, lr=args.learning_rate, eps=args.adam_epsilon)
     scheduler = get_linear_schedule_with_warmup(
         optimizer, num_warmup_steps=args.warmup_steps, num_training_steps=t_total
     )

examples/research_projects/rag-end2end-retriever/lightning_base.py (2 additions, 2 deletions)

@@ -5,10 +5,10 @@
 from typing import Any, Dict

 import pytorch_lightning as pl
+import torch
 from pytorch_lightning.utilities import rank_zero_info

 from transformers import (
-    AdamW,
     AutoConfig,
     AutoModel,
     AutoModelForPreTraining,
@@ -148,7 +148,7 @@ def configure_optimizers(self):
             )

         else:
-            optimizer = AdamW(
+            optimizer = torch.optim.AdamW(
                 optimizer_grouped_parameters, lr=self.hparams.learning_rate, eps=self.hparams.adam_epsilon
             )
         self.opt = optimizer
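The three `lightning_base.py` files are the only ones that gain a new top-level `import torch`: the other scripts in this commit presumably already import `torch`, whereas these modules previously reached the optimizer only through the `transformers` import, so `torch.optim.AdamW` would otherwise raise a `NameError`. A minimal sketch of the resulting `configure_optimizers` shape, using a hypothetical class and hyperparameters that stand in for the Lightning module defined in these files:

import torch


class OptimizerMixinSketch:  # hypothetical stand-in, not the real Lightning module
    class _HParams:
        learning_rate = 3e-5
        adam_epsilon = 1e-8

    def __init__(self):
        self.hparams = self._HParams()
        self.model = torch.nn.Linear(8, 2)

    def configure_optimizers(self):
        # Grouped parameters elided; a plain parameter list keeps the sketch short.
        optimizer_grouped_parameters = list(self.model.parameters())
        optimizer = torch.optim.AdamW(
            optimizer_grouped_parameters, lr=self.hparams.learning_rate, eps=self.hparams.adam_epsilon
        )
        self.opt = optimizer
        return optimizer


opt = OptimizerMixinSketch().configure_optimizers()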

examples/research_projects/rag/lightning_base.py (2 additions, 2 deletions)

@@ -5,10 +5,10 @@
 from typing import Any, Dict

 import pytorch_lightning as pl
+import torch
 from pytorch_lightning.utilities import rank_zero_info

 from transformers import (
-    AdamW,
     AutoConfig,
     AutoModel,
     AutoModelForPreTraining,
@@ -146,7 +146,7 @@ def configure_optimizers(self):
             )

         else:
-            optimizer = AdamW(
+            optimizer = torch.optim.AdamW(
                 optimizer_grouped_parameters, lr=self.hparams.learning_rate, eps=self.hparams.adam_epsilon
             )
         self.opt = optimizer

examples/research_projects/self-training-text-classification/finetuning.py (1 addition, 2 deletions)

@@ -33,7 +33,6 @@
 from tqdm.auto import tqdm

 from transformers import (
-    AdamW,
     AutoConfig,
     AutoModelForSequenceClassification,
     AutoTokenizer,
@@ -749,7 +748,7 @@ def preprocess_function(examples):
             "weight_decay": 0.0,
         },
     ]
-    optimizer = AdamW(optimizer_grouped_parameters, lr=args.learning_rate)
+    optimizer = torch.optim.AdamW(optimizer_grouped_parameters, lr=args.learning_rate)

     # Prepare everything with our `accelerator`.
     model, optimizer, train_dataloader, eval_dataloader, test_dataloader, infer_dataloader = accelerator.prepare(

examples/research_projects/seq2seq-distillation/lightning_base.py (2 additions, 2 deletions)

@@ -5,10 +5,10 @@
 from typing import Any, Dict

 import pytorch_lightning as pl
+import torch
 from pytorch_lightning.utilities import rank_zero_info

 from transformers import (
-    AdamW,
     AutoConfig,
     AutoModel,
     AutoModelForPreTraining,
@@ -146,7 +146,7 @@ def configure_optimizers(self):
             )

         else:
-            optimizer = AdamW(
+            optimizer = torch.optim.AdamW(
                 optimizer_grouped_parameters, lr=self.hparams.learning_rate, eps=self.hparams.adam_epsilon
             )
         self.opt = optimizer

templates/adding_a_new_example_script/{{cookiecutter.directory_name}}/run_{{cookiecutter.example_shortcut}}.py (1 addition, 2 deletions)

@@ -535,7 +535,6 @@ def _mp_fn(index):
 from transformers import (
     CONFIG_MAPPING,
     MODEL_MAPPING,
-    AdamW,
     AutoConfig,
     {{cookiecutter.model_class}},
     AutoTokenizer,
@@ -863,7 +862,7 @@ def tokenize_function(examples):
             "weight_decay": 0.0,
         },
     ]
-    optimizer = AdamW(optimizer_grouped_parameters, lr=args.learning_rate)
+    optimizer = torch.optim.AdamW(optimizer_grouped_parameters, lr=args.learning_rate)

     # Prepare everything with our `accelerator`.
     model, optimizer, train_dataloader, eval_dataloader = accelerator.prepare(
