Commit 9be4728

Just import torch AdamW instead (#36177)
* Just import torch AdamW instead
* Update docs too
* Make AdamW undocumented
* make fixup
* Add a basic wrapper class
* Add it back to the docs
* Just remove AdamW entirely
* Remove some AdamW references
* Drop AdamW from the public init
* make fix-copies
* Cleanup some references
* make fixup
* Delete lots of transformers.AdamW references
* Remove extra references to adamw_hf
1 parent 51bd0ce commit 9be4728
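
The change in this commit is mechanical: every use of the removed `transformers.AdamW` class switches to PyTorch's built-in `torch.optim.AdamW`. A minimal before/after sketch of the migration (the stand-in `nn.Linear` model and the hyperparameter values are illustrative, not taken from the diff):

import torch
from torch import nn

model = nn.Linear(10, 2)  # stand-in model, for illustration only

# Before this commit:
#   from transformers import AdamW
#   optimizer = AdamW(model.parameters(), lr=5e-5, eps=1e-8)

# After this commit: use the PyTorch implementation directly.
optimizer = torch.optim.AdamW(model.parameters(), lr=5e-5, eps=1e-8)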

File tree

18 files changed: +18 -174 lines changed

docs/source/en/main_classes/optimizer_schedules.md
-3

@@ -22,9 +22,6 @@ The `.optimization` module provides:
 - several schedules in the form of schedule objects that inherit from `_LRSchedule`:
 - a gradient accumulation class to accumulate the gradients of multiple batches
 
-## AdamW (PyTorch)
-
-[[autodoc]] AdamW
 
 ## AdaFactor (PyTorch)

docs/source/ja/main_classes/optimizer_schedules.md
-4

@@ -22,10 +22,6 @@ rendered properly in your Markdown viewer.
 - `_LRSchedule` から継承するスケジュール オブジェクトの形式のいくつかのスケジュール:
 - 複数のバッチの勾配を累積するための勾配累積クラス
 
-## AdamW (PyTorch)
-
-[[autodoc]] AdamW
-
 ## AdaFactor (PyTorch)
 
 [[autodoc]] Adafactor

docs/source/zh/main_classes/optimizer_schedules.md
-4

@@ -22,10 +22,6 @@ rendered properly in your Markdown viewer.
 - 继承自 `_LRSchedule` 多个调度器:
 - 一个梯度累积类,用于累积多个批次的梯度
 
-## AdamW (PyTorch)
-
-[[autodoc]] AdamW
-
 ## AdaFactor (PyTorch)
 
 [[autodoc]] Adafactor

examples/legacy/pytorch-lightning/lightning_base.py
+6 -2

@@ -8,7 +8,6 @@
 from pytorch_lightning.utilities import rank_zero_info
 
 from transformers import (
-    AdamW,
     AutoConfig,
     AutoModel,
     AutoModelForPreTraining,
@@ -20,6 +19,7 @@
     AutoTokenizer,
     PretrainedConfig,
     PreTrainedTokenizer,
+    is_torch_available,
 )
 from transformers.optimization import (
     Adafactor,
@@ -31,6 +31,10 @@
 from transformers.utils.versions import require_version
 
 
+if is_torch_available():
+    import torch
+
+
 logger = logging.getLogger(__name__)
 
 require_version("pytorch_lightning>=1.0.4")
@@ -146,7 +150,7 @@ def configure_optimizers(self):
             )
 
         else:
-            optimizer = AdamW(
+            optimizer = torch.optim.AdamW(
                 optimizer_grouped_parameters, lr=self.hparams.learning_rate, eps=self.hparams.adam_epsilon
             )
         self.opt = optimizer

examples/legacy/question-answering/run_squad.py
+1 -2

@@ -32,7 +32,6 @@
 from transformers import (
     MODEL_FOR_QUESTION_ANSWERING_MAPPING,
     WEIGHTS_NAME,
-    AdamW,
     AutoConfig,
     AutoModelForQuestionAnswering,
     AutoTokenizer,
@@ -96,7 +95,7 @@ def train(args, train_dataset, model, tokenizer):
         },
         {"params": [p for n, p in model.named_parameters() if any(nd in n for nd in no_decay)], "weight_decay": 0.0},
     ]
-    optimizer = AdamW(optimizer_grouped_parameters, lr=args.learning_rate, eps=args.adam_epsilon)
+    optimizer = torch.optim.AdamW(optimizer_grouped_parameters, lr=args.learning_rate, eps=args.adam_epsilon)
     scheduler = get_linear_schedule_with_warmup(
         optimizer, num_warmup_steps=args.warmup_steps, num_training_steps=t_total
     )
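
This and the other legacy examples below all touch the same pattern: build the usual no-decay parameter groups, hand them to the optimizer, and wrap it in a warmup scheduler. A self-contained sketch of what the updated code does, with a placeholder `nn.Linear` model and illustrative hyperparameters:

import torch
from torch import nn
from transformers import get_linear_schedule_with_warmup

model = nn.Linear(10, 2)  # placeholder model
no_decay = ["bias", "LayerNorm.weight"]
optimizer_grouped_parameters = [
    {
        "params": [p for n, p in model.named_parameters() if not any(nd in n for nd in no_decay)],
        "weight_decay": 0.01,
    },
    {
        "params": [p for n, p in model.named_parameters() if any(nd in n for nd in no_decay)],
        "weight_decay": 0.0,
    },
]
optimizer = torch.optim.AdamW(optimizer_grouped_parameters, lr=3e-5, eps=1e-8)
scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps=0, num_training_steps=1000)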

examples/legacy/run_openai_gpt.py
+1 -2

@@ -43,7 +43,6 @@
 from transformers import (
     CONFIG_NAME,
     WEIGHTS_NAME,
-    AdamW,
     OpenAIGPTDoubleHeadsModel,
     OpenAIGPTTokenizer,
     get_linear_schedule_with_warmup,
@@ -236,7 +235,7 @@ def tokenize_and_encode(obj):
         },
         {"params": [p for n, p in param_optimizer if any(nd in n for nd in no_decay)], "weight_decay": 0.0},
     ]
-    optimizer = AdamW(optimizer_grouped_parameters, lr=args.learning_rate, eps=args.adam_epsilon)
+    optimizer = torch.optim.AdamW(optimizer_grouped_parameters, lr=args.learning_rate, eps=args.adam_epsilon)
     scheduler = get_linear_schedule_with_warmup(
         optimizer, num_warmup_steps=args.warmup_steps, num_training_steps=t_total
     )

examples/legacy/run_swag.py
+1 -2

@@ -34,7 +34,6 @@
 import transformers
 from transformers import (
     WEIGHTS_NAME,
-    AdamW,
     AutoConfig,
     AutoModelForMultipleChoice,
     AutoTokenizer,
@@ -298,7 +297,7 @@ def train(args, train_dataset, model, tokenizer):
         },
         {"params": [p for n, p in model.named_parameters() if any(nd in n for nd in no_decay)], "weight_decay": 0.0},
     ]
-    optimizer = AdamW(optimizer_grouped_parameters, lr=args.learning_rate, eps=args.adam_epsilon)
+    optimizer = torch.optim.AdamW(optimizer_grouped_parameters, lr=args.learning_rate, eps=args.adam_epsilon)
     scheduler = get_linear_schedule_with_warmup(
         optimizer, num_warmup_steps=args.warmup_steps, num_training_steps=t_total
     )

examples/legacy/seq2seq/seq2seq_trainer.py
+1 -3

@@ -22,7 +22,6 @@
 from transformers.models.fsmt.configuration_fsmt import FSMTConfig
 from transformers.optimization import (
     Adafactor,
-    AdamW,
     get_constant_schedule,
     get_constant_schedule_with_warmup,
     get_cosine_schedule_with_warmup,
@@ -102,12 +101,11 @@ def create_optimizer_and_scheduler(self, num_training_steps: int):
                     "weight_decay": 0.0,
                 },
             ]
-            optimizer_cls = Adafactor if self.args.adafactor else AdamW
             if self.args.adafactor:
                 optimizer_cls = Adafactor
                 optimizer_kwargs = {"scale_parameter": False, "relative_step": False}
             else:
-                optimizer_cls = AdamW
+                optimizer_cls = torch.optim.AdamW
                 optimizer_kwargs = {
                     "betas": (self.args.adam_beta1, self.args.adam_beta2),
                     "eps": self.args.adam_epsilon,

examples/pytorch/question-answering/run_qa_beam_search_no_trainer.py
+1 -2

@@ -41,7 +41,6 @@
 
 import transformers
 from transformers import (
-    AdamW,
     DataCollatorWithPadding,
     EvalPrediction,
     SchedulerType,
@@ -767,7 +766,7 @@ def create_and_fill_np_array(start_or_end_logits, dataset, max_len):
             "weight_decay": 0.0,
         },
     ]
-    optimizer = AdamW(optimizer_grouped_parameters, lr=args.learning_rate)
+    optimizer = torch.optim.AdamW(optimizer_grouped_parameters, lr=args.learning_rate)
 
     # Scheduler and math around the number of training steps.
     overrode_max_train_steps = False

examples/pytorch/speech-pretraining/run_wav2vec2_pretraining_no_trainer.py
+1 -2

@@ -33,7 +33,6 @@
 
 import transformers
 from transformers import (
-    AdamW,
     SchedulerType,
     Wav2Vec2Config,
     Wav2Vec2FeatureExtractor,
@@ -583,7 +582,7 @@ def prepare_dataset(batch):
     )
 
     # Optimizer
-    optimizer = AdamW(
+    optimizer = torch.optim.AdamW(
         list(model.parameters()),
         lr=args.learning_rate,
         betas=[args.adam_beta1, args.adam_beta2],

src/transformers/__init__.py
-2

@@ -4111,7 +4111,6 @@
     )
     _import_structure["optimization"] = [
         "Adafactor",
-        "AdamW",
         "get_constant_schedule",
         "get_constant_schedule_with_warmup",
         "get_cosine_schedule_with_warmup",
@@ -8758,7 +8757,6 @@
     # Optimization
     from .optimization import (
         Adafactor,
-        AdamW,
         get_constant_schedule,
         get_constant_schedule_with_warmup,
         get_cosine_schedule_with_warmup,

src/transformers/optimization.py
+1 -116

@@ -17,10 +17,9 @@
 import math
 import warnings
 from functools import partial
-from typing import Callable, Iterable, Optional, Tuple, Union
+from typing import Optional, Union
 
 import torch
-from torch import nn
 from torch.optim import Optimizer
 from torch.optim.lr_scheduler import LambdaLR, ReduceLROnPlateau
 
@@ -604,120 +603,6 @@ def scheduler_hook(param):
     )
 
 
-class AdamW(Optimizer):
-    """
-    Implements Adam algorithm with weight decay fix as introduced in [Decoupled Weight Decay
-    Regularization](https://arxiv.org/abs/1711.05101).
-
-    Parameters:
-        params (`Iterable[nn.parameter.Parameter]`):
-            Iterable of parameters to optimize or dictionaries defining parameter groups.
-        lr (`float`, *optional*, defaults to 0.001):
-            The learning rate to use.
-        betas (`Tuple[float,float]`, *optional*, defaults to `(0.9, 0.999)`):
-            Adam's betas parameters (b1, b2).
-        eps (`float`, *optional*, defaults to 1e-06):
-            Adam's epsilon for numerical stability.
-        weight_decay (`float`, *optional*, defaults to 0.0):
-            Decoupled weight decay to apply.
-        correct_bias (`bool`, *optional*, defaults to `True`):
-            Whether or not to correct bias in Adam (for instance, in Bert TF repository they use `False`).
-        no_deprecation_warning (`bool`, *optional*, defaults to `False`):
-            A flag used to disable the deprecation warning (set to `True` to disable the warning).
-    """
-
-    def __init__(
-        self,
-        params: Iterable[nn.parameter.Parameter],
-        lr: float = 1e-3,
-        betas: Tuple[float, float] = (0.9, 0.999),
-        eps: float = 1e-6,
-        weight_decay: float = 0.0,
-        correct_bias: bool = True,
-        no_deprecation_warning: bool = False,
-    ):
-        if not no_deprecation_warning:
-            warnings.warn(
-                "This implementation of AdamW is deprecated and will be removed in a future version. Use the PyTorch"
-                " implementation torch.optim.AdamW instead, or set `no_deprecation_warning=True` to disable this"
-                " warning",
-                FutureWarning,
-            )
-        require_version("torch>=1.5.0")  # add_ with alpha
-        if lr < 0.0:
-            raise ValueError(f"Invalid learning rate: {lr} - should be >= 0.0")
-        if not 0.0 <= betas[0] < 1.0:
-            raise ValueError(f"Invalid beta parameter: {betas[0]} - should be in [0.0, 1.0)")
-        if not 0.0 <= betas[1] < 1.0:
-            raise ValueError(f"Invalid beta parameter: {betas[1]} - should be in [0.0, 1.0)")
-        if not 0.0 <= eps:
-            raise ValueError(f"Invalid epsilon value: {eps} - should be >= 0.0")
-        defaults = {"lr": lr, "betas": betas, "eps": eps, "weight_decay": weight_decay, "correct_bias": correct_bias}
-        super().__init__(params, defaults)
-
-    @torch.no_grad()
-    def step(self, closure: Callable = None):
-        """
-        Performs a single optimization step.
-
-        Arguments:
-            closure (`Callable`, *optional*): A closure that reevaluates the model and returns the loss.
-        """
-        loss = None
-        if closure is not None:
-            loss = closure()
-
-        for group in self.param_groups:
-            for p in group["params"]:
-                if p.grad is None:
-                    continue
-                grad = p.grad
-                if grad.is_sparse:
-                    raise RuntimeError("Adam does not support sparse gradients, please consider SparseAdam instead")
-
-                state = self.state[p]
-
-                # State initialization
-                if len(state) == 0:
-                    state["step"] = 0
-                    # Exponential moving average of gradient values
-                    state["exp_avg"] = torch.zeros_like(p)
-                    # Exponential moving average of squared gradient values
-                    state["exp_avg_sq"] = torch.zeros_like(p)
-
-                exp_avg, exp_avg_sq = state["exp_avg"], state["exp_avg_sq"]
-                beta1, beta2 = group["betas"]
-
-                state["step"] += 1
-
-                # Decay the first and second moment running average coefficient
-                # In-place operations to update the averages at the same time
-                exp_avg.mul_(beta1).add_(grad, alpha=(1.0 - beta1))
-                exp_avg_sq.mul_(beta2).addcmul_(grad, grad, value=1.0 - beta2)
-                denom = exp_avg_sq.sqrt().add_(group["eps"])
-
-                step_size = group["lr"]
-                if group["correct_bias"]:  # No bias correction for Bert
-                    bias_correction1 = 1.0 - beta1 ** state["step"]
-                    bias_correction2 = 1.0 - beta2 ** state["step"]
-                    step_size = step_size * math.sqrt(bias_correction2) / bias_correction1
-
-                p.addcdiv_(exp_avg, denom, value=-step_size)
-
-                # Just adding the square of the weights to the loss function is *not*
-                # the correct way of using L2 regularization/weight decay with Adam,
-                # since that will interact with the m and v parameters in strange ways.
-                #
-                # Instead we want to decay the weights in a manner that doesn't interact
-                # with the m/v parameters. This is equivalent to adding the square
-                # of the weights to the loss with plain (non-momentum) SGD.
-                # Add weight decay at the end (fixed version)
-                if group["weight_decay"] > 0.0:
-                    p.add_(p, alpha=(-group["lr"] * group["weight_decay"]))
-
-        return loss
-
-
 class Adafactor(Optimizer):
     """
     AdaFactor pytorch implementation can be used as a drop in replacement for Adam original fairseq code:
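
Note that the removed class and `torch.optim.AdamW` do not share defaults: the deleted implementation used `eps=1e-6` and `weight_decay=0.0`, while PyTorch defaults to `eps=1e-8` and `weight_decay=0.01`, and the removed `correct_bias=False` option has no PyTorch counterpart (bias correction is always applied). A sketch of an exact drop-in that reproduces the old defaults, using a placeholder model:

import torch
from torch import nn

model = nn.Linear(10, 2)  # placeholder model

# Spell out the removed transformers.AdamW defaults explicitly;
# torch.optim.AdamW would otherwise use eps=1e-8 and weight_decay=0.01.
optimizer = torch.optim.AdamW(
    model.parameters(),
    lr=1e-3,
    betas=(0.9, 0.999),
    eps=1e-6,
    weight_decay=0.0,
)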

src/transformers/trainer.py
-5

@@ -1421,11 +1421,6 @@ def optimizer_hook(param):
         if args.optim == OptimizerNames.ADAFACTOR:
             optimizer_cls = Adafactor
             optimizer_kwargs.update({"scale_parameter": False, "relative_step": False})
-        elif args.optim == OptimizerNames.ADAMW_HF:
-            from .optimization import AdamW
-
-            optimizer_cls = AdamW
-            optimizer_kwargs.update(adam_kwargs)
         elif args.optim in [OptimizerNames.ADAMW_TORCH, OptimizerNames.ADAMW_TORCH_FUSED]:
             from torch.optim import AdamW

src/transformers/training_args.py
+2 -3

@@ -146,7 +146,6 @@ class OptimizerNames(ExplicitEnum):
     Stores the acceptable string identifiers for optimizers.
     """
 
-    ADAMW_HF = "adamw_hf"
     ADAMW_TORCH = "adamw_torch"
     ADAMW_TORCH_FUSED = "adamw_torch_fused"
     ADAMW_TORCH_XLA = "adamw_torch_xla"
@@ -628,7 +627,7 @@ class TrainingArguments:
 
             The options should be separated by whitespaces.
         optim (`str` or [`training_args.OptimizerNames`], *optional*, defaults to `"adamw_torch"`):
-            The optimizer to use, such as "adamw_hf", "adamw_torch", "adamw_torch_fused", "adamw_apex_fused", "adamw_anyprecision",
+            The optimizer to use, such as "adamw_torch", "adamw_torch_fused", "adamw_apex_fused", "adamw_anyprecision",
             "adafactor". See `OptimizerNames` in [training_args.py](https://github.com/huggingface/transformers/blob/main/src/transformers/training_args.py)
            for a full list of optimizers.
         optim_args (`str`, *optional*):
@@ -2986,7 +2985,7 @@ def set_optimizer(
 
         Args:
             name (`str` or [`training_args.OptimizerNames`], *optional*, defaults to `"adamw_torch"`):
-                The optimizer to use: `"adamw_hf"`, `"adamw_torch"`, `"adamw_torch_fused"`, `"adamw_apex_fused"`,
+                The optimizer to use: `"adamw_torch"`, `"adamw_torch_fused"`, `"adamw_apex_fused"`,
                 `"adamw_anyprecision"` or `"adafactor"`.
             learning_rate (`float`, *optional*, defaults to 5e-5):
                 The initial learning rate.
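
For `Trainer` users nothing needs to change as long as `optim` is one of the remaining identifiers; `"adamw_torch"` is already the default and `"adamw_hf"` is simply no longer accepted. A minimal sketch (the output directory is a placeholder):

from transformers import TrainingArguments

args = TrainingArguments(
    output_dir="out",        # placeholder path
    optim="adamw_torch",     # the default; "adamw_hf" is removed by this commit
    learning_rate=5e-5,
)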

src/transformers/utils/dummy_pt_objects.py
-7

@@ -10856,13 +10856,6 @@ def __init__(self, *args, **kwargs):
         requires_backends(self, ["torch"])
 
 
-class AdamW(metaclass=DummyObject):
-    _backends = ["torch"]
-
-    def __init__(self, *args, **kwargs):
-        requires_backends(self, ["torch"])
-
-
 def get_constant_schedule(*args, **kwargs):
     requires_backends(get_constant_schedule, ["torch"])

templates/adding_a_new_example_script/{{cookiecutter.directory_name}}/run_{{cookiecutter.example_shortcut}}.py
+1 -2

@@ -535,7 +535,6 @@ def _mp_fn(index):
 from transformers import (
     CONFIG_MAPPING,
     MODEL_MAPPING,
-    AdamW,
     AutoConfig,
     {{cookiecutter.model_class}},
     AutoTokenizer,
@@ -863,7 +862,7 @@ def tokenize_function(examples):
             "weight_decay": 0.0,
         },
     ]
-    optimizer = AdamW(optimizer_grouped_parameters, lr=args.learning_rate)
+    optimizer = torch.optim.AdamW(optimizer_grouped_parameters, lr=args.learning_rate)
 
     # Prepare everything with our `accelerator`.
     model, optimizer, train_dataloader, eval_dataloader = accelerator.prepare(
