24 | 24 | from .embeddings import CombinedTimestepLabelEmbeddings
25 | 25 |
26 | 26 |
| 27 | +def drop_path(input, drop_prob: float = 0.0, training: bool = False): |
| 28 | +    """ |
| 29 | +    Drop paths (Stochastic Depth) per sample (when applied in main path of residual blocks). |
| 30 | + |
| 31 | +    Comment by Ross Wightman: This is the same as the DropConnect impl I created for EfficientNet, etc networks, |
| 32 | +    however, the original name is misleading as 'Drop Connect' is a different form of dropout in a separate paper... |
| 33 | +    See discussion: https://github.com/tensorflow/tpu/issues/494#issuecomment-532968956 ... I've opted for changing the |
| 34 | +    layer and argument names to 'drop path' rather than mix DropConnect as a layer name and use 'survival rate' as the |
| 35 | +    argument. |
| 36 | +    """ |
| 37 | +    if drop_prob == 0.0 or not training: |
| 38 | +        return input |
| 39 | +    keep_prob = 1 - drop_prob |
| 40 | +    shape = (input.shape[0],) + (1,) * (input.ndim - 1)  # work with diff dim tensors, not just 2D ConvNets |
| 41 | +    random_tensor = keep_prob + paddle.rand(shape, dtype=input.dtype) |
| 42 | +    random_tensor = paddle.floor(random_tensor)  # binarize |
| 43 | +    output = (input / keep_prob) * random_tensor |
| 44 | +    return output |
| 45 | + |
| 46 | + |
| 47 | +class DropPath(nn.Layer): |
| 48 | +    """Drop paths (Stochastic Depth) per sample (when applied in main path of residual blocks).""" |
| 49 | + |
| 50 | +    def __init__(self, drop_prob: Optional[float] = None) -> None: |
| 51 | +        super().__init__() |
| 52 | +        self.drop_prob = drop_prob |
| 53 | + |
| 54 | +    def forward(self, hidden_states: paddle.Tensor) -> paddle.Tensor: |
| 55 | +        return drop_path(hidden_states, self.drop_prob, self.training) |
| 56 | + |
| 57 | +    def extra_repr(self) -> str: |
| 58 | +        return "p={}".format(self.drop_prob) |
| 59 | + |
| 60 | + |
| 61 | +class Mlp(nn.Layer): |
| 62 | +    def __init__(self, in_features, hidden_features=None, out_features=None, act_layer=nn.GELU, drop=0.0): |
| 63 | +        super().__init__() |
| 64 | +        out_features = out_features or in_features |
| 65 | +        hidden_features = hidden_features or in_features |
| 66 | +        self.fc1 = nn.Linear(in_features, hidden_features) |
| 67 | +        self.act = act_layer() |
| 68 | +        self.fc2 = nn.Linear(hidden_features, out_features) |
| 69 | +        self.drop = nn.Dropout(drop) |
| 70 | + |
| 71 | +    def forward(self, x): |
| 72 | +        x = self.fc1(x) |
| 73 | +        x = self.act(x) |
| 74 | +        x = self.drop(x) |
| 75 | +        x = self.fc2(x) |
| 76 | +        x = self.drop(x) |
| 77 | +        return x |
| 78 | + |
| 79 | + |
27 | 80 | class AttentionBlock(nn.Layer):
28 | 81 |     """
29 | 82 |     An attention block that allows spatial positions to attend to each other. Originally ported from here, but adapted
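
A minimal usage sketch (not part of the diff) for the stochastic-depth pieces added above, in case a reviewer wants to sanity-check the behaviour. It assumes `drop_path` and `DropPath` are importable from this module; the import path, tensor shapes, seed, and `drop_prob` values are illustrative assumptions.

```python
import paddle

# Assumes the module in this PR is importable; adjust the path to your install.
from ppdiffusers.models.attention import DropPath, drop_path  # path is an assumption

paddle.seed(0)
x = paddle.ones([4, 16, 32])  # (batch, tokens, channels) -- illustrative shapes

# Functional form: with training=True roughly half the samples are zeroed and the
# survivors are rescaled by 1 / keep_prob, so the branch's expected value is unchanged.
y = drop_path(x, drop_prob=0.5, training=True)
print(y[:, 0, 0])  # each per-sample value is either 0.0 or 2.0

# Module form: identity in eval mode, stochastic in train mode.
layer = DropPath(drop_prob=0.1)
layer.eval()
assert paddle.allclose(layer(x), x)  # no-op when not training
layer.train()
out = layer(x)  # typical use on a residual branch: hidden = hidden + layer(sub_block(hidden))
```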
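Similarly, a short sketch of the `Mlp` block added here, the standard transformer feed-forward (Linear -> GELU -> Dropout -> Linear -> Dropout). The feature sizes, dropout rate, and import path are illustrative assumptions.

```python
import paddle

from ppdiffusers.models.attention import Mlp  # path is an assumption

mlp = Mlp(in_features=32, hidden_features=128, drop=0.1)

x = paddle.randn([4, 16, 32])  # (batch, tokens, channels) -- illustrative shapes
y = mlp(x)                     # out_features defaults to in_features
print(y.shape)                 # [4, 16, 32]
```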