I am done

This commit is contained in:
2024-10-30 22:14:35 +01:00
parent 720dc28c09
commit 40e2a747cf
36901 changed files with 5011519 additions and 0 deletions

View File

@ -0,0 +1,334 @@
from .module import Module # usort: skip
from .linear import Bilinear, Identity, LazyLinear, Linear # usort: skip
from .activation import (
CELU,
ELU,
GELU,
GLU,
Hardshrink,
Hardsigmoid,
Hardswish,
Hardtanh,
LeakyReLU,
LogSigmoid,
LogSoftmax,
Mish,
MultiheadAttention,
PReLU,
ReLU,
ReLU6,
RReLU,
SELU,
Sigmoid,
SiLU,
Softmax,
Softmax2d,
Softmin,
Softplus,
Softshrink,
Softsign,
Tanh,
Tanhshrink,
Threshold,
)
from .adaptive import AdaptiveLogSoftmaxWithLoss
from .batchnorm import (
BatchNorm1d,
BatchNorm2d,
BatchNorm3d,
LazyBatchNorm1d,
LazyBatchNorm2d,
LazyBatchNorm3d,
SyncBatchNorm,
)
from .channelshuffle import ChannelShuffle
from .container import (
Container,
ModuleDict,
ModuleList,
ParameterDict,
ParameterList,
Sequential,
)
from .conv import (
Conv1d,
Conv2d,
Conv3d,
ConvTranspose1d,
ConvTranspose2d,
ConvTranspose3d,
LazyConv1d,
LazyConv2d,
LazyConv3d,
LazyConvTranspose1d,
LazyConvTranspose2d,
LazyConvTranspose3d,
)
from .distance import CosineSimilarity, PairwiseDistance
from .dropout import (
AlphaDropout,
Dropout,
Dropout1d,
Dropout2d,
Dropout3d,
FeatureAlphaDropout,
)
from .flatten import Flatten, Unflatten
from .fold import Fold, Unfold
from .instancenorm import (
InstanceNorm1d,
InstanceNorm2d,
InstanceNorm3d,
LazyInstanceNorm1d,
LazyInstanceNorm2d,
LazyInstanceNorm3d,
)
from .loss import (
BCELoss,
BCEWithLogitsLoss,
CosineEmbeddingLoss,
CrossEntropyLoss,
CTCLoss,
GaussianNLLLoss,
HingeEmbeddingLoss,
HuberLoss,
KLDivLoss,
L1Loss,
MarginRankingLoss,
MSELoss,
MultiLabelMarginLoss,
MultiLabelSoftMarginLoss,
MultiMarginLoss,
NLLLoss,
NLLLoss2d,
PoissonNLLLoss,
SmoothL1Loss,
SoftMarginLoss,
TripletMarginLoss,
TripletMarginWithDistanceLoss,
)
from .normalization import (
CrossMapLRN2d,
GroupNorm,
LayerNorm,
LocalResponseNorm,
RMSNorm,
)
from .padding import (
CircularPad1d,
CircularPad2d,
CircularPad3d,
ConstantPad1d,
ConstantPad2d,
ConstantPad3d,
ReflectionPad1d,
ReflectionPad2d,
ReflectionPad3d,
ReplicationPad1d,
ReplicationPad2d,
ReplicationPad3d,
ZeroPad1d,
ZeroPad2d,
ZeroPad3d,
)
from .pixelshuffle import PixelShuffle, PixelUnshuffle
from .pooling import (
AdaptiveAvgPool1d,
AdaptiveAvgPool2d,
AdaptiveAvgPool3d,
AdaptiveMaxPool1d,
AdaptiveMaxPool2d,
AdaptiveMaxPool3d,
AvgPool1d,
AvgPool2d,
AvgPool3d,
FractionalMaxPool2d,
FractionalMaxPool3d,
LPPool1d,
LPPool2d,
LPPool3d,
MaxPool1d,
MaxPool2d,
MaxPool3d,
MaxUnpool1d,
MaxUnpool2d,
MaxUnpool3d,
)
from .rnn import GRU, GRUCell, LSTM, LSTMCell, RNN, RNNBase, RNNCell, RNNCellBase
from .sparse import Embedding, EmbeddingBag
from .transformer import (
Transformer,
TransformerDecoder,
TransformerDecoderLayer,
TransformerEncoder,
TransformerEncoderLayer,
)
from .upsampling import Upsample, UpsamplingBilinear2d, UpsamplingNearest2d
__all__ = [
"AdaptiveAvgPool1d",
"AdaptiveAvgPool2d",
"AdaptiveAvgPool3d",
"AdaptiveLogSoftmaxWithLoss",
"AdaptiveMaxPool1d",
"AdaptiveMaxPool2d",
"AdaptiveMaxPool3d",
"AlphaDropout",
"AvgPool1d",
"AvgPool2d",
"AvgPool3d",
"BCELoss",
"BCEWithLogitsLoss",
"BatchNorm1d",
"BatchNorm2d",
"BatchNorm3d",
"Bilinear",
"CELU",
"CTCLoss",
"ChannelShuffle",
"CircularPad1d",
"CircularPad2d",
"CircularPad3d",
"ConstantPad1d",
"ConstantPad2d",
"ConstantPad3d",
"Container",
"Conv1d",
"Conv2d",
"Conv3d",
"ConvTranspose1d",
"ConvTranspose2d",
"ConvTranspose3d",
"CosineEmbeddingLoss",
"CosineSimilarity",
"CrossEntropyLoss",
"CrossMapLRN2d",
"Dropout",
"Dropout1d",
"Dropout2d",
"Dropout3d",
"ELU",
"Embedding",
"EmbeddingBag",
"FeatureAlphaDropout",
"Flatten",
"Fold",
"FractionalMaxPool2d",
"FractionalMaxPool3d",
"GELU",
"GLU",
"GRU",
"GRUCell",
"GaussianNLLLoss",
"GroupNorm",
"Hardshrink",
"Hardsigmoid",
"Hardswish",
"Hardtanh",
"HingeEmbeddingLoss",
"HuberLoss",
"Identity",
"InstanceNorm1d",
"InstanceNorm2d",
"InstanceNorm3d",
"KLDivLoss",
"L1Loss",
"LPPool1d",
"LPPool2d",
"LPPool3d",
"LSTM",
"LSTMCell",
"LayerNorm",
"LazyBatchNorm1d",
"LazyBatchNorm2d",
"LazyBatchNorm3d",
"LazyConv1d",
"LazyConv2d",
"LazyConv3d",
"LazyConvTranspose1d",
"LazyConvTranspose2d",
"LazyConvTranspose3d",
"LazyInstanceNorm1d",
"LazyInstanceNorm2d",
"LazyInstanceNorm3d",
"LazyLinear",
"LeakyReLU",
"Linear",
"LocalResponseNorm",
"LogSigmoid",
"LogSoftmax",
"MSELoss",
"MarginRankingLoss",
"MaxPool1d",
"MaxPool2d",
"MaxPool3d",
"MaxUnpool1d",
"MaxUnpool2d",
"MaxUnpool3d",
"Mish",
"Module",
"ModuleDict",
"ModuleList",
"MultiLabelMarginLoss",
"MultiLabelSoftMarginLoss",
"MultiMarginLoss",
"MultiheadAttention",
"NLLLoss",
"NLLLoss2d",
"PReLU",
"PairwiseDistance",
"ParameterDict",
"ParameterList",
"PixelShuffle",
"PixelUnshuffle",
"PoissonNLLLoss",
"RMSNorm",
"RNN",
"RNNBase",
"RNNCell",
"RNNCellBase",
"RReLU",
"ReLU",
"ReLU6",
"ReflectionPad1d",
"ReflectionPad2d",
"ReflectionPad3d",
"ReplicationPad1d",
"ReplicationPad2d",
"ReplicationPad3d",
"SELU",
"Sequential",
"SiLU",
"Sigmoid",
"SmoothL1Loss",
"SoftMarginLoss",
"Softmax",
"Softmax2d",
"Softmin",
"Softplus",
"Softshrink",
"Softsign",
"SyncBatchNorm",
"Tanh",
"Tanhshrink",
"Threshold",
"Transformer",
"TransformerDecoder",
"TransformerDecoderLayer",
"TransformerEncoder",
"TransformerEncoderLayer",
"TripletMarginLoss",
"TripletMarginWithDistanceLoss",
"Unflatten",
"Unfold",
"Upsample",
"UpsamplingBilinear2d",
"UpsamplingNearest2d",
"ZeroPad1d",
"ZeroPad2d",
"ZeroPad3d",
]
# Please keep ``__all__`` sorted: the assertion below compares it with its
# sorted copy when this module is imported (unless assertions are disabled).
assert __all__ == sorted(__all__)

View File

@ -0,0 +1,319 @@
# mypy: allow-untyped-defs
import torch
import torch.distributed as dist
from torch.autograd.function import Function
class SyncBatchNorm(Function):
    """Autograd function for batch normalization with cross-process statistics.

    ``forward`` computes the local per-channel mean / inverse std, gathers them
    (together with the per-rank element count) from every rank of
    ``process_group`` and normalizes with the combined statistics.
    ``backward`` all-reduces the per-channel gradient sums before computing the
    gradient with respect to the input.
    """

    @staticmethod
    def forward(
        self,
        input,
        weight,
        bias,
        running_mean,
        running_var,
        eps,
        momentum,
        process_group,
        world_size,
    ):
        """Normalize ``input`` using statistics gathered from all ranks.

        ``self`` is the autograd context object (this is a static method).

        Args:
            input: tensor to normalize; dimension 1 is read as the channel
                dimension.
            weight: optional per-channel scale (may be ``None``).
            bias: per-channel shift, forwarded to ``torch.batch_norm_elemt``.
            running_mean: running mean forwarded to
                ``torch.batch_norm_gather_stats_with_counts`` (may be ``None``).
            running_var: running variance forwarded to the same call.
            eps: epsilon forwarded to the batch-norm kernels.
            momentum: momentum forwarded to the gather-stats kernel.
            process_group: process group used for the all-gather.
            world_size: number of ranks whose statistics are gathered.

        Returns:
            The normalized tensor, or an empty tensor shaped like ``input`` when
            ``input`` has no elements.

        Raises:
            ValueError: if there is a single value per channel and
                ``world_size < 2``.
        """
        # Keep channels-last layouts as they are; anything else is made contiguous.
        if not (
            input.is_contiguous(memory_format=torch.channels_last)
            or input.is_contiguous(memory_format=torch.channels_last_3d)
        ):
            input = input.contiguous()
        if weight is not None:
            weight = weight.contiguous()

        # Number of values per channel on this rank.
        size = int(input.numel() // input.size(1))
        if size == 1 and world_size < 2:
            raise ValueError(
                f"Expected more than 1 value per channel when training, got input size {size}"
            )

        num_channels = input.shape[1]
        if input.numel() > 0:
            # calculate mean/invstd for input.
            mean, invstd = torch.batch_norm_stats(input, eps)

            count = torch.full(
                (1,),
                input.numel() // input.size(1),
                dtype=mean.dtype,
                device=mean.device,
            )

            # C, C, 1 -> (2C + 1)
            combined = torch.cat([mean, invstd, count], dim=0)
        else:
            # For empty input, set the stats and the count to zero. Stats with a
            # zero count are filtered out later when computing the global mean
            # & invstd, but this rank still needs to participate in the
            # all_gather collective to unblock its peer processes.
            combined = torch.zeros(
                2 * num_channels + 1, dtype=input.dtype, device=input.device
            )

        # Use allgather instead of allreduce because count could be different across
        # ranks, simple all reduce op can not give correct results.
        # batch_norm_gather_stats_with_counts calculates global mean & invstd based on
        # all gathered mean, invstd and count.
        # for nccl backend, use the optimized version of all gather.
        # The Gloo backend does not support `all_gather_into_tensor`.
        if process_group._get_backend_name() != "gloo":
            # world_size * (2C + 1)
            combined_size = combined.numel()
            combined_flat = torch.empty(
                1,
                combined_size * world_size,
                dtype=combined.dtype,
                device=combined.device,
            )
            dist.all_gather_into_tensor(
                combined_flat, combined, process_group, async_op=False
            )
            combined = torch.reshape(combined_flat, (world_size, combined_size))
            # world_size * (2C + 1) -> world_size * C, world_size * C, world_size * 1
            mean_all, invstd_all, count_all = torch.split(combined, num_channels, dim=1)
        else:
            # world_size * (2C + 1)
            combined_list = [torch.empty_like(combined) for _ in range(world_size)]
            dist.all_gather(combined_list, combined, process_group, async_op=False)
            combined = torch.stack(combined_list, dim=0)
            # world_size * (2C + 1) -> world_size * C, world_size * C, world_size * 1
            mean_all, invstd_all, count_all = torch.split(combined, num_channels, dim=1)

        if not (torch.cuda.is_available() and torch.cuda.is_current_stream_capturing()):
            # The lines below force a synchronization between CUDA and CPU, because
            # the shape of the result count_all depends on the values in mask tensor.
            # Such synchronizations break CUDA Graph capturing.
            # See https://github.com/pytorch/pytorch/issues/78549
            # FIXME: https://github.com/pytorch/pytorch/issues/78656 describes
            # a better longer-term solution.

            # remove stats from empty inputs
            mask = count_all.squeeze(-1) >= 1
            count_all = count_all[mask]
            mean_all = mean_all[mask]
            invstd_all = invstd_all[mask]

        # calculate global mean & invstd
        counts = count_all.view(-1)
        # The counts are cast to the running-stat dtype when the two differ.
        if running_mean is not None and counts.dtype != running_mean.dtype:
            counts = counts.to(running_mean.dtype)
        mean, invstd = torch.batch_norm_gather_stats_with_counts(
            input,
            mean_all,
            invstd_all,
            running_mean,
            running_var,
            momentum,
            eps,
            counts,
        )

        # Stash what backward() reads back: tensors via save_for_backward, the
        # process group as a plain attribute on the context.
        self.save_for_backward(input, weight, mean, invstd, count_all.to(torch.int32))
        self.process_group = process_group

        # apply element-wise normalization
        if input.numel() > 0:
            return torch.batch_norm_elemt(input, weight, bias, mean, invstd, eps)
        else:
            return torch.empty_like(input)

    @staticmethod
    def backward(self, grad_output):
        """Compute gradients for ``input``, ``weight`` and ``bias``.

        Args:
            grad_output: gradient with respect to the output of ``forward``.

        Returns:
            A 9-tuple matching the inputs of ``forward``:
            ``(grad_input, grad_weight, grad_bias, None, None, None, None, None, None)``.
            Entries that were not computed are ``None``.
        """
        # Same layout handling as in forward().
        if not (
            grad_output.is_contiguous(memory_format=torch.channels_last)
            or grad_output.is_contiguous(memory_format=torch.channels_last_3d)
        ):
            grad_output = grad_output.contiguous()
        saved_input, weight, mean, invstd, count_tensor = self.saved_tensors
        grad_input = grad_weight = grad_bias = None
        process_group = self.process_group

        if saved_input.numel() > 0:
            # calculate local stats as well as grad_weight / grad_bias
            (
                sum_dy,
                sum_dy_xmu,
                grad_weight,
                grad_bias,
            ) = torch.batch_norm_backward_reduce(
                grad_output,
                saved_input,
                mean,
                invstd,
                weight,
                self.needs_input_grad[0],
                self.needs_input_grad[1],
                self.needs_input_grad[2],
            )

            if self.needs_input_grad[0]:
                # synchronizing stats used to calculate input gradient.
                num_channels = sum_dy.shape[0]
                combined = torch.cat([sum_dy, sum_dy_xmu], dim=0)
                torch.distributed.all_reduce(
                    combined,
                    torch.distributed.ReduceOp.SUM,
                    process_group,
                    async_op=False,
                )
                sum_dy, sum_dy_xmu = torch.split(combined, num_channels)

                # backward pass for gradient calculation
                if weight is not None and weight.dtype != mean.dtype:
                    weight = weight.to(mean.dtype)
                grad_input = torch.batch_norm_backward_elemt(
                    grad_output,
                    saved_input,
                    mean,
                    invstd,
                    weight,
                    sum_dy,
                    sum_dy_xmu,
                    count_tensor,
                )
            # synchronizing of grad_weight / grad_bias is not needed as distributed
            # training would handle all reduce.
            if weight is None or not self.needs_input_grad[1]:
                grad_weight = None

            if weight is None or not self.needs_input_grad[2]:
                grad_bias = None
        else:
            # This process got an empty input tensor in the forward pass.
            # Although this process can directly set grad_input as an empty
            # tensor of zeros, it still needs to participate in the collective
            # communication to unblock its peers, as other peer processes might
            # have received non-empty inputs.
            num_channels = saved_input.shape[1]
            if self.needs_input_grad[0]:
                # launch all_reduce to unblock other peer processes
                combined = torch.zeros(
                    2 * num_channels, dtype=saved_input.dtype, device=saved_input.device
                )
                torch.distributed.all_reduce(
                    combined,
                    torch.distributed.ReduceOp.SUM,
                    process_group,
                    async_op=False,
                )

            # Leave grad_input, grad_weight and grad_bias as None, which will be
            # interpreted by the autograd engine as Tensors full of zeros.

        return grad_input, grad_weight, grad_bias, None, None, None, None, None, None
class CrossMapLRN2d(Function):
    """Autograd function for local response normalization across channels.

    For a 4D input, channel ``c`` of the output is::

        input[:, c] * (k + alpha / size * sum_j input[:, j] ** 2) ** -beta

    where ``j`` ranges over the channels within ``int((size - 1) / 2)`` of ``c``
    (clipped to the valid channel range). For odd ``size`` that window spans
    exactly ``size`` channels; for even ``size`` it spans ``size - 1`` channels.
    """

    @staticmethod
    def forward(ctx, input, size, alpha=1e-4, beta=0.75, k=1):
        """Compute the normalized output and cache the per-element scale.

        Args:
            ctx: autograd context; receives ``size``, ``alpha``, ``beta``, ``k``
                and the ``scale`` tensor as attributes.
            input: 4D tensor; dimension 1 is the channel dimension.
            size: number of neighbouring channels used for normalization.
            alpha: multiplicative factor applied to the windowed sum of squares.
            beta: exponent applied to the scale.
            k: additive constant of the scale.

        Returns:
            Tensor of the same shape as ``input``.

        Raises:
            ValueError: if ``input`` is not 4D.
        """
        ctx.size = size
        ctx.alpha = alpha
        ctx.beta = beta
        ctx.k = k

        if input.dim() != 4:
            raise ValueError(
                f"CrossMapLRN2d: Expected input to be 4D, got {input.dim()}D instead."
            )

        channels = input.size(1)
        output = input.new().resize_as_(input)
        ctx.scale = input.new().resize_as_(input)

        # use output storage as temporary buffer
        input_square = output
        torch.pow(input, 2, out=input_square)

        # Number of channels in the window of channel 0 (itself plus the
        # following half window).
        pre_pad = int((ctx.size - 1) / 2 + 1)
        pre_pad_crop = min(pre_pad, channels)

        scale_first = ctx.scale.select(1, 0)
        scale_first.zero_()
        # compute first feature map normalization
        for c in range(pre_pad_crop):
            scale_first.add_(input_square.select(1, c))

        # reuse computations for next feature maps normalization
        # by adding the next feature map and removing the previous
        for c in range(1, channels):
            scale_previous = ctx.scale.select(1, c - 1)
            scale_current = ctx.scale.select(1, c)
            scale_current.copy_(scale_previous)
            if c < channels - pre_pad + 1:
                square_next = input_square.select(1, c + pre_pad - 1)
                scale_current.add_(square_next, alpha=1)

            # ``>=`` (not ``>``): channel ``c - pre_pad`` leaves the window as
            # soon as ``c`` reaches ``pre_pad``; with ``>`` channel 0 was never
            # removed from the running sum.
            if c >= pre_pad:
                square_previous = input_square.select(1, c - pre_pad)
                scale_current.add_(square_previous, alpha=-1)

        ctx.scale.mul_(ctx.alpha / ctx.size).add_(ctx.k)

        torch.pow(ctx.scale, -ctx.beta, out=output)
        output.mul_(input)

        ctx.save_for_backward(input, output)
        return output

    @staticmethod
    def backward(ctx, grad_output):
        """Compute the gradient with respect to ``input``.

        Args:
            ctx: autograd context populated by ``forward``.
            grad_output: gradient with respect to the output of ``forward``.

        Returns:
            ``(grad_input, None, None, None, None)`` -- only ``input`` receives
            a gradient.
        """
        input, output = ctx.saved_tensors

        batch_size = input.size(0)
        channels = input.size(1)
        input_height = input.size(2)
        input_width = input.size(3)

        # Same symmetric channel window as in forward().
        half_window = int((ctx.size - 1) / 2)
        window = 2 * half_window + 1
        cache_ratio_value = 2 * ctx.alpha * ctx.beta / ctx.size

        # Direct term: grad_output * scale ** -beta.
        grad_input = torch.pow(ctx.scale, -ctx.beta).mul_(grad_output)

        # Per-channel ratios, zero-padded by ``half_window`` channels on both
        # sides so that every channel sees a full-width window. The centre
        # starts at the 0-based offset ``half_window``.
        padded_ratio = input.new_zeros(
            channels + window - 1, input_height, input_width
        )
        padded_ratio_center = padded_ratio.narrow(0, half_window, channels)

        for n in range(batch_size):
            torch.mul(grad_output[n], output[n], out=padded_ratio_center)
            padded_ratio_center.div_(ctx.scale[n])
            # Running sum over the first ``window - 1`` padded channels; the
            # loop below slides it across the channel dimension.
            accum_ratio = padded_ratio.narrow(0, 0, window - 1).sum(dim=0)
            for c in range(channels):
                accum_ratio.add_(padded_ratio[c + window - 1])
                grad_input[n][c].addcmul_(
                    input[n][c], accum_ratio, value=-cache_ratio_value
                )
                accum_ratio.add_(padded_ratio[c], alpha=-1)

        return grad_input, None, None, None, None
class BackwardHookFunction(torch.autograd.Function):
    """Identity autograd function: inputs and gradients pass through unchanged."""

    @staticmethod
    def forward(ctx, *args):
        """Return ``args`` as-is, flagging inputs without ``requires_grad`` as non-differentiable."""
        frozen = [tensor for tensor in args if not tensor.requires_grad]
        ctx.mark_non_differentiable(*frozen)
        return args

    @staticmethod
    def backward(ctx, *args):
        """Return the incoming gradients unchanged."""
        return args

File diff suppressed because it is too large Load Diff

View File

@ -0,0 +1,330 @@
# mypy: allow-untyped-defs
from collections import namedtuple
from typing import List, Sequence
import torch
import torch.nn.functional as F
from torch import Tensor
from .container import ModuleList, Sequential
from .linear import Linear
from .module import Module
__all__ = ["AdaptiveLogSoftmaxWithLoss"]

# Return type of ``AdaptiveLogSoftmaxWithLoss.forward``.
_ASMoutput = namedtuple("_ASMoutput", ["output", "loss"])


class AdaptiveLogSoftmaxWithLoss(Module):
    """Efficient softmax approximation.

    As described in
    `Efficient softmax approximation for GPUs by Edouard Grave, Armand Joulin,
    Moustapha Ciss\u00e9, David Grangier, and Herv\u00e9 J\u00e9gou
    <https://arxiv.org/abs/1609.04309>`__.
""" r"""
    Adaptive softmax is an approximate strategy for training models with large
    output spaces. It is most effective when the label distribution is highly
    imbalanced, for example in natural language modelling, where the word
    frequency distribution approximately follows the `Zipf's law`_.

    Adaptive softmax partitions the labels into several clusters, according to
    their frequency. These clusters may contain different number of targets
    each.
    Additionally, clusters containing less frequent labels assign lower
    dimensional embeddings to those labels, which speeds up the computation.
    For each minibatch, only clusters for which at least one target is
    present are evaluated.

    The idea is that the clusters which are accessed frequently
    (like the first one, containing most frequent labels), should also be cheap
    to compute -- that is, contain a small number of assigned labels.

    We highly recommend taking a look at the original paper for more details.

    * :attr:`cutoffs` should be an ordered Sequence of integers sorted
      in the increasing order.
      It controls number of clusters and the partitioning of targets into
      clusters. For example setting ``cutoffs = [10, 100, 1000]``
      means that first `10` targets will be assigned
      to the 'head' of the adaptive softmax, targets `11, 12, ..., 100` will be
      assigned to the first cluster, and targets `101, 102, ..., 1000` will be
      assigned to the second cluster, while targets
      `1001, 1002, ..., n_classes - 1` will be assigned
      to the last, third cluster.

    * :attr:`div_value` is used to compute the size of each additional cluster,
      which is given as
      :math:`\left\lfloor\frac{\texttt{in\_features}}{\texttt{div\_value}^{idx}}\right\rfloor`,
      where :math:`idx` is the cluster index (with clusters
      for less frequent words having larger indices,
      and indices starting from :math:`1`).

    * :attr:`head_bias` if set to True, adds a bias term to the 'head' of the
      adaptive softmax. See paper for details. Set to False in the official
      implementation.

    .. warning::
        Labels passed as inputs to this module should be sorted according to
        their frequency. This means that the most frequent label should be
        represented by the index `0`, and the least frequent
        label should be represented by the index `n_classes - 1`.

    .. note::
        This module returns a ``NamedTuple`` with ``output``
        and ``loss`` fields. See further documentation for details.

    .. note::
        To compute log-probabilities for all classes, the ``log_prob``
        method can be used.

    Args:
        in_features (int): Number of features in the input tensor
        n_classes (int): Number of classes in the dataset
        cutoffs (Sequence): Cutoffs used to assign targets to their buckets
        div_value (float, optional): value used as an exponent to compute sizes
            of the clusters. Default: 4.0
        head_bias (bool, optional): If ``True``, adds a bias term to the 'head' of the
            adaptive softmax. Default: ``False``

    Returns:
        ``NamedTuple`` with ``output`` and ``loss`` fields:
            * **output** is a Tensor of size ``N`` containing computed target
              log probabilities for each example
            * **loss** is a Scalar representing the computed negative
              log likelihood loss

    Shape:
        - input: :math:`(N, \texttt{in\_features})` or :math:`(\texttt{in\_features})`
        - target: :math:`(N)` or :math:`()` where each value satisfies :math:`0 <= \texttt{target[i]} < \texttt{n\_classes}`
        - output1: :math:`(N)` or :math:`()`
        - output2: ``Scalar``

    .. _Zipf's law: https://en.wikipedia.org/wiki/Zipf%27s_law
    """

    in_features: int
    n_classes: int
    cutoffs: List[int]
    div_value: float
    head_bias: bool
    head: Linear
    tail: ModuleList

    def __init__(
        self,
        in_features: int,
        n_classes: int,
        cutoffs: Sequence[int],
        div_value: float = 4.0,
        head_bias: bool = False,
        device=None,
        dtype=None,
    ) -> None:
        """Validate ``cutoffs`` and build the head and the per-cluster tail projections.

        Raises:
            ValueError: if ``cutoffs`` is empty, or is not a strictly increasing
                sequence of unique integers in ``[1, n_classes - 1]``.
        """
        factory_kwargs = {"device": device, "dtype": dtype}
        super().__init__()

        cutoffs = list(cutoffs)

        if len(cutoffs) == 0:
            raise ValueError("cutoffs should be a sequence of length larger than 0")

        if (
            (cutoffs != sorted(cutoffs))
            or (min(cutoffs) <= 0)
            or (max(cutoffs) > (n_classes - 1))
            or (len(set(cutoffs)) != len(cutoffs))
            or any(int(c) != c for c in cutoffs)
        ):
            raise ValueError(
                "cutoffs should be a sequence of unique, positive "
                "integers sorted in an increasing order, where "
                "each value is between 1 and n_classes-1"
            )

        self.in_features = in_features
        self.n_classes = n_classes
        # ``n_classes`` is appended so that consecutive entries delimit every cluster.
        self.cutoffs = cutoffs + [n_classes]
        self.div_value = div_value
        self.head_bias = head_bias

        self.shortlist_size = self.cutoffs[0]
        self.n_clusters = len(self.cutoffs) - 1
        # The head scores the shortlist classes plus one entry per tail cluster.
        self.head_size = self.shortlist_size + self.n_clusters

        self.head = Linear(
            self.in_features, self.head_size, bias=self.head_bias, **factory_kwargs
        )
        self.tail = ModuleList()

        for i in range(self.n_clusters):
            hsz = int(self.in_features // (self.div_value ** (i + 1)))
            osz = self.cutoffs[i + 1] - self.cutoffs[i]

            projection = Sequential(
                Linear(self.in_features, hsz, bias=False, **factory_kwargs),
                Linear(hsz, osz, bias=False, **factory_kwargs),
            )

            self.tail.append(projection)

    def reset_parameters(self) -> None:
        """Re-initialize the head and both linear layers of every tail cluster."""
        self.head.reset_parameters()
        for i2h, h2o in self.tail:
            i2h.reset_parameters()
            h2o.reset_parameters()

    def forward(self, input_: Tensor, target_: Tensor) -> _ASMoutput:
        """Compute the target log-probabilities and their mean negative log likelihood.

        Args:
            input_ (Tensor): ``(N, in_features)`` for a 1D target, or
                ``(in_features)`` for a 0D target.
            target_ (Tensor): ``(N)`` or ``()`` class indices in
                ``[0, n_classes - 1]``.

        Returns:
            ``_ASMoutput(output, loss)`` where ``output`` holds the log-probability
            of each target and ``loss`` is the mean of ``-output``.

        Raises:
            RuntimeError: if the input/target shapes are inconsistent, the target
                has more than one dimension, or a target value is outside
                ``[0, n_classes - 1]``.
        """
        targ_dim = target_.dim()

        if targ_dim == 1:
            if input_.size(0) != target_.size(0):
                raise RuntimeError(
                    "Input and target should have the same size "
                    "in the batch dimension."
                )
            if input_.dim() != 2:
                raise RuntimeError(
                    "1D target tensor expects 2D input tensors, "
                    f"but found inputs with size {input_.size()}"
                )
        elif targ_dim == 0:
            if input_.dim() != 1:
                raise RuntimeError(
                    "0D target tensor expects 1D input tensors, "
                    f"but found inputs with size {input_.size()}"
                )
        else:
            raise RuntimeError(
                "0D or 1D target tensor expected, multi-target not supported"
            )

        # Unbatched inputs are processed as a batch of one.
        is_batched = targ_dim > 0
        input = input_ if is_batched else input_.unsqueeze(0)
        target = target_ if is_batched else target_.unsqueeze(0)

        used_rows = 0
        batch_size = target.size(0)

        output = input.new_zeros(batch_size)
        # Index into the head output for every row: the target itself for
        # shortlist targets, the cluster entry for tail targets.
        gather_inds = target.new_empty(batch_size)

        cutoff_values = [0] + self.cutoffs
        for i in range(len(cutoff_values) - 1):
            low_idx = cutoff_values[i]
            high_idx = cutoff_values[i + 1]

            target_mask = (target >= low_idx) & (target < high_idx)
            # ``squeeze(1)`` keeps the index 1D even when a single row matches.
            row_indices = target_mask.nonzero().squeeze(1)

            if row_indices.numel() == 0:
                continue

            if i == 0:
                gather_inds.index_copy_(0, row_indices, target[target_mask])

            else:
                relative_target = target[target_mask] - low_idx
                input_subset = input.index_select(0, row_indices)

                cluster_output = self.tail[i - 1](input_subset)
                cluster_index = self.shortlist_size + i - 1

                gather_inds.index_fill_(0, row_indices, cluster_index)
                cluster_logprob = F.log_softmax(cluster_output, dim=1)
                local_logprob = cluster_logprob.gather(1, relative_target.unsqueeze(1))
                output.index_copy_(0, row_indices, local_logprob.squeeze(1))

            used_rows += row_indices.numel()

        # Every row must have fallen into exactly one bucket.
        if used_rows != batch_size:
            raise RuntimeError(
                f"Target values should be in [0, {self.n_classes - 1}], "
                f"but values in range [{target.min().item()}, {target.max().item()}] "
                "were found. "
            )

        head_output = self.head(input)
        head_logprob = F.log_softmax(head_output, dim=1)
        output += head_logprob.gather(1, gather_inds.unsqueeze(1)).squeeze(1)
        loss = (-output).mean()

        if not is_batched:
            output = output.squeeze(0)

        return _ASMoutput(output, loss)

    def _get_full_log_prob(self, input, head_output):
        """Given input tensor, and output of ``self.head``, compute the log of the full distribution."""
        out = input.new_empty((head_output.size(0), self.n_classes))
        head_logprob = F.log_softmax(head_output, dim=1)

        out[:, : self.shortlist_size] = head_logprob[:, : self.shortlist_size]

        for i, (start_idx, stop_idx) in enumerate(zip(self.cutoffs, self.cutoffs[1:])):
            cluster_output = self.tail[i](input)
            cluster_logprob = F.log_softmax(cluster_output, dim=1)
            # log P(class) = log P(class | cluster) + log P(cluster)
            output_logprob = cluster_logprob + head_logprob[
                :, self.shortlist_size + i
            ].unsqueeze(1)

            out[:, start_idx:stop_idx] = output_logprob

        return out

    def log_prob(self, input: Tensor) -> Tensor:
        r"""Compute log probabilities for all :math:`\texttt{n\_classes}`.

        Args:
            input (Tensor): a minibatch of examples

        Returns:
            log-probabilities for each class :math:`c`
            in range :math:`0 <= c < \texttt{n\_classes}`, where :math:`\texttt{n\_classes}` is a
            parameter passed to ``AdaptiveLogSoftmaxWithLoss`` constructor.

        Shape:
            - Input: :math:`(N, \texttt{in\_features})`
            - Output: :math:`(N, \texttt{n\_classes})`

        """
        head_output = self.head(input)
        return self._get_full_log_prob(input, head_output)

    def predict(self, input: Tensor) -> Tensor:
        r"""Return the class with the highest probability for each example in the input minibatch.

        This is equivalent to ``self.log_prob(input).argmax(dim=1)``, but is more efficient in some cases.

        Args:
            input (Tensor): a minibatch of examples

        Returns:
            output (Tensor): a class with the highest probability for each example

        Shape:
            - Input: :math:`(N, \texttt{in\_features})`
            - Output: :math:`(N)`
        """
        head_output = self.head(input)
        output = torch.argmax(head_output, dim=1)
        not_in_shortlist = output >= self.shortlist_size
        all_in_shortlist = not (not_in_shortlist.any())

        if all_in_shortlist:
            return output

        elif not_in_shortlist.all():
            log_prob = self._get_full_log_prob(input, head_output)
            return torch.argmax(log_prob, dim=1)

        else:
            # Only rows whose head argmax is a cluster need the full distribution.
            log_prob = self._get_full_log_prob(
                input[not_in_shortlist], head_output[not_in_shortlist]
            )
            output[not_in_shortlist] = torch.argmax(log_prob, dim=1)
            return output

View File

@ -0,0 +1,883 @@
# mypy: allow-untyped-defs
from typing import Any, Optional
import torch
from torch import Tensor
from torch.nn import functional as F, init
from torch.nn.parameter import Parameter, UninitializedBuffer, UninitializedParameter
from ._functions import SyncBatchNorm as sync_batch_norm
from .lazy import LazyModuleMixin
from .module import Module
__all__ = [
"BatchNorm1d",
"LazyBatchNorm1d",
"BatchNorm2d",
"LazyBatchNorm2d",
"BatchNorm3d",
"LazyBatchNorm3d",
"SyncBatchNorm",
]
class _NormBase(Module):
"""Common base of _InstanceNorm and _BatchNorm."""
_version = 2
__constants__ = ["track_running_stats", "momentum", "eps", "num_features", "affine"]
num_features: int
eps: float
momentum: Optional[float]
affine: bool
track_running_stats: bool
# WARNING: weight and bias purposely not defined here.
# See https://github.com/pytorch/pytorch/issues/39670
def __init__(
self,
num_features: int,
eps: float = 1e-5,
momentum: Optional[float] = 0.1,
affine: bool = True,
track_running_stats: bool = True,
device=None,
dtype=None,
) -> None:
factory_kwargs = {"device": device, "dtype": dtype}
super().__init__()
self.num_features = num_features
self.eps = eps
self.momentum = momentum
self.affine = affine
self.track_running_stats = track_running_stats
if self.affine:
self.weight = Parameter(torch.empty(num_features, **factory_kwargs))
self.bias = Parameter(torch.empty(num_features, **factory_kwargs))
else:
self.register_parameter("weight", None)
self.register_parameter("bias", None)
if self.track_running_stats:
self.register_buffer(
"running_mean", torch.zeros(num_features, **factory_kwargs)
)
self.register_buffer(
"running_var", torch.ones(num_features, **factory_kwargs)
)
self.running_mean: Optional[Tensor]
self.running_var: Optional[Tensor]
self.register_buffer(
"num_batches_tracked",
torch.tensor(
0,
dtype=torch.long,
**{k: v for k, v in factory_kwargs.items() if k != "dtype"},
),
)
self.num_batches_tracked: Optional[Tensor]
else:
self.register_buffer("running_mean", None)
self.register_buffer("running_var", None)
self.register_buffer("num_batches_tracked", None)
self.reset_parameters()
def reset_running_stats(self) -> None:
if self.track_running_stats:
# running_mean/running_var/num_batches... are registered at runtime depending
# if self.track_running_stats is on
self.running_mean.zero_() # type: ignore[union-attr]
self.running_var.fill_(1) # type: ignore[union-attr]
self.num_batches_tracked.zero_() # type: ignore[union-attr,operator]
def reset_parameters(self) -> None:
self.reset_running_stats()
if self.affine:
init.ones_(self.weight)
init.zeros_(self.bias)
def _check_input_dim(self, input):
raise NotImplementedError
def extra_repr(self):
return (
"{num_features}, eps={eps}, momentum={momentum}, affine={affine}, "
"track_running_stats={track_running_stats}".format(**self.__dict__)
)
def _load_from_state_dict(
self,
state_dict,
prefix,
local_metadata,
strict,
missing_keys,
unexpected_keys,
error_msgs,
):
version = local_metadata.get("version", None)
if (version is None or version < 2) and self.track_running_stats:
# at version 2: added num_batches_tracked buffer
# this should have a default value of 0
num_batches_tracked_key = prefix + "num_batches_tracked"
if num_batches_tracked_key not in state_dict:
state_dict[num_batches_tracked_key] = (
self.num_batches_tracked
if self.num_batches_tracked is not None
and self.num_batches_tracked.device != torch.device("meta")
else torch.tensor(0, dtype=torch.long)
)
super()._load_from_state_dict(
state_dict,
prefix,
local_metadata,
strict,
missing_keys,
unexpected_keys,
error_msgs,
)
class _BatchNorm(_NormBase):
def __init__(
self,
num_features: int,
eps: float = 1e-5,
momentum: Optional[float] = 0.1,
affine: bool = True,
track_running_stats: bool = True,
device=None,
dtype=None,
) -> None:
factory_kwargs = {"device": device, "dtype": dtype}
super().__init__(
num_features, eps, momentum, affine, track_running_stats, **factory_kwargs
)
def forward(self, input: Tensor) -> Tensor:
self._check_input_dim(input)
# exponential_average_factor is set to self.momentum
# (when it is available) only so that it gets updated
# in ONNX graph when this node is exported to ONNX.
if self.momentum is None:
exponential_average_factor = 0.0
else:
exponential_average_factor = self.momentum
if self.training and self.track_running_stats:
# TODO: if statement only here to tell the jit to skip emitting this when it is None
if self.num_batches_tracked is not None: # type: ignore[has-type]
self.num_batches_tracked.add_(1) # type: ignore[has-type]
if self.momentum is None: # use cumulative moving average
exponential_average_factor = 1.0 / float(self.num_batches_tracked)
else: # use exponential moving average
exponential_average_factor = self.momentum
r"""
Decide whether the mini-batch stats should be used for normalization rather than the buffers.
Mini-batch stats are used in training mode, and in eval mode when buffers are None.
"""
if self.training:
bn_training = True
else:
bn_training = (self.running_mean is None) and (self.running_var is None)
r"""
Buffers are only updated if they are to be tracked and we are in training mode. Thus they only need to be
passed when the update should occur (i.e. in training mode when they are tracked), or when buffer stats are
used for normalization (i.e. in eval mode when buffers are not None).
"""
return F.batch_norm(
input,
# If buffers are not to be tracked, ensure that they won't be updated
self.running_mean
if not self.training or self.track_running_stats
else None,
self.running_var if not self.training or self.track_running_stats else None,
self.weight,
self.bias,
bn_training,
exponential_average_factor,
self.eps,
)
class _LazyNormBase(LazyModuleMixin, _NormBase):
weight: UninitializedParameter # type: ignore[assignment]
bias: UninitializedParameter # type: ignore[assignment]
def __init__(
self,
eps=1e-5,
momentum=0.1,
affine=True,
track_running_stats=True,
device=None,
dtype=None,
) -> None:
factory_kwargs = {"device": device, "dtype": dtype}
super().__init__(
# affine and track_running_stats are hardcoded to False to
# avoid creating tensors that will soon be overwritten.
0,
eps,
momentum,
False,
False,
**factory_kwargs,
)
self.affine = affine
self.track_running_stats = track_running_stats
if self.affine:
self.weight = UninitializedParameter(**factory_kwargs)
self.bias = UninitializedParameter(**factory_kwargs)
if self.track_running_stats:
self.running_mean = UninitializedBuffer(**factory_kwargs)
self.running_var = UninitializedBuffer(**factory_kwargs)
self.num_batches_tracked = torch.tensor(
0,
dtype=torch.long,
**{k: v for k, v in factory_kwargs.items() if k != "dtype"},
)
def reset_parameters(self) -> None:
if not self.has_uninitialized_params() and self.num_features != 0:
super().reset_parameters()
def initialize_parameters(self, input) -> None: # type: ignore[override]
if self.has_uninitialized_params():
self.num_features = input.shape[1]
if self.affine:
assert isinstance(self.weight, UninitializedParameter)
assert isinstance(self.bias, UninitializedParameter)
self.weight.materialize((self.num_features,))
self.bias.materialize((self.num_features,))
if self.track_running_stats:
self.running_mean.materialize( # type:ignore[union-attr]
(self.num_features,)
)
self.running_var.materialize( # type:ignore[union-attr]
(self.num_features,)
)
self.reset_parameters()
class BatchNorm1d(_BatchNorm):
    r"""Applies Batch Normalization over a 2D or 3D input.

    The method is described in the paper
    `Batch Normalization: Accelerating Deep Network Training by Reducing
    Internal Covariate Shift <https://arxiv.org/abs/1502.03167>`__ .

    .. math::

        y = \frac{x - \mathrm{E}[x]}{\sqrt{\mathrm{Var}[x] + \epsilon}} * \gamma + \beta

    The mean and standard-deviation are calculated per-dimension over
    the mini-batches, and :math:`\gamma` and :math:`\beta` are learnable
    parameter vectors of size `C` (where `C` is the number of features or
    channels of the input). By default, the elements of :math:`\gamma` are set
    to 1 and the elements of :math:`\beta` are set to 0.
    At train time in the forward pass, the standard-deviation is calculated
    via the biased estimator, equivalent to ``torch.var(input, unbiased=False)``.
    However, the value stored in the moving average of the standard-deviation
    is calculated via the unbiased estimator, equivalent to
    ``torch.var(input, unbiased=True)``.

    Also by default, during training this layer keeps running estimates of its
    computed mean and variance, which are then used for normalization during
    evaluation. The running estimates are kept with a default :attr:`momentum`
    of 0.1.

    If :attr:`track_running_stats` is set to ``False``, this layer does not
    keep running estimates, and batch statistics are used during evaluation
    time as well.

    .. note::
        This :attr:`momentum` argument is different from the one used in
        optimizer classes and from the conventional notion of momentum.
        Mathematically, the update rule for running statistics here is
        :math:`\hat{x}_\text{new} = (1 - \text{momentum}) \times \hat{x} + \text{momentum} \times x_t`,
        where :math:`\hat{x}` is the estimated statistic and :math:`x_t` is the
        new observed value.

    Because the Batch Normalization is done over the `C` dimension, computing
    statistics on `(N, L)` slices, it's common terminology to call this
    Temporal Batch Normalization.

    Args:
        num_features: number of features or channels :math:`C` of the input
        eps: a value added to the denominator for numerical stability.
            Default: 1e-5
        momentum: the value used for the running_mean and running_var
            computation. Can be set to ``None`` for cumulative moving average
            (i.e. simple average). Default: 0.1
        affine: a boolean value that when set to ``True``, this module has
            learnable affine parameters. Default: ``True``
        track_running_stats: a boolean value that when set to ``True``, this
            module tracks the running mean and variance, and when set to ``False``,
            this module does not track such statistics, and initializes statistics
            buffers :attr:`running_mean` and :attr:`running_var` as ``None``.
            When these buffers are ``None``, this module always uses batch statistics
            in both training and eval modes. Default: ``True``

    Shape:
        - Input: :math:`(N, C)` or :math:`(N, C, L)`, where :math:`N` is the batch size,
          :math:`C` is the number of features or channels, and :math:`L` is the sequence length
        - Output: :math:`(N, C)` or :math:`(N, C, L)` (same shape as input)

    Examples::

        >>> # With Learnable Parameters
        >>> m = nn.BatchNorm1d(100)
        >>> # Without Learnable Parameters
        >>> m = nn.BatchNorm1d(100, affine=False)
        >>> input = torch.randn(20, 100)
        >>> output = m(input)
    """

    def _check_input_dim(self, input):
        """Raise ``ValueError`` unless ``input`` has 2 or 3 dimensions."""
        ndim = input.dim()
        if ndim not in (2, 3):
            raise ValueError(f"expected 2D or 3D input (got {ndim}D input)")
class LazyBatchNorm1d(_LazyNormBase, _BatchNorm):
    r"""A :class:`torch.nn.BatchNorm1d` module with lazy initialization.

    The ``num_features`` argument of :class:`BatchNorm1d` is not given up
    front; it is inferred from ``input.size(1)``.
    The attributes that will be lazily initialized are `weight`, `bias`,
    `running_mean` and `running_var`.

    Check the :class:`torch.nn.modules.lazy.LazyModuleMixin` for further documentation
    on lazy modules and their limitations.

    Args:
        eps: a value added to the denominator for numerical stability.
            Default: 1e-5
        momentum: the value used for the running_mean and running_var
            computation. Can be set to ``None`` for cumulative moving average
            (i.e. simple average). Default: 0.1
        affine: a boolean value that when set to ``True``, this module has
            learnable affine parameters. Default: ``True``
        track_running_stats: a boolean value that when set to ``True``, this
            module tracks the running mean and variance, and when set to ``False``,
            this module does not track such statistics, and initializes statistics
            buffers :attr:`running_mean` and :attr:`running_var` as ``None``.
            When these buffers are ``None``, this module always uses batch statistics
            in both training and eval modes. Default: ``True``
    """

    # Concrete class associated with this lazy module.
    cls_to_become = BatchNorm1d  # type: ignore[assignment]

    def _check_input_dim(self, input):
        """Raise ``ValueError`` unless ``input`` has 2 or 3 dimensions."""
        ndim = input.dim()
        if ndim not in (2, 3):
            raise ValueError(f"expected 2D or 3D input (got {ndim}D input)")
class BatchNorm2d(_BatchNorm):
    r"""Applies Batch Normalization over a 4D input.

    A 4D input is a mini-batch of 2D inputs with an additional channel
    dimension. The method is described in the paper
    `Batch Normalization: Accelerating Deep Network Training by Reducing
    Internal Covariate Shift <https://arxiv.org/abs/1502.03167>`__ .

    .. math::

        y = \frac{x - \mathrm{E}[x]}{ \sqrt{\mathrm{Var}[x] + \epsilon}} * \gamma + \beta

    The mean and standard-deviation are calculated per-dimension over
    the mini-batches, and :math:`\gamma` and :math:`\beta` are learnable
    parameter vectors of size `C` (where `C` is the input size). By default,
    the elements of :math:`\gamma` are set to 1 and the elements of
    :math:`\beta` are set to 0. At train time in the forward pass, the
    standard-deviation is calculated via the biased estimator, equivalent to
    ``torch.var(input, unbiased=False)``. However, the value stored in the
    moving average of the standard-deviation is calculated via the unbiased
    estimator, equivalent to ``torch.var(input, unbiased=True)``.

    Also by default, during training this layer keeps running estimates of its
    computed mean and variance, which are then used for normalization during
    evaluation. The running estimates are kept with a default :attr:`momentum`
    of 0.1.

    If :attr:`track_running_stats` is set to ``False``, this layer does not
    keep running estimates, and batch statistics are used during evaluation
    time as well.

    .. note::
        This :attr:`momentum` argument is different from the one used in
        optimizer classes and from the conventional notion of momentum.
        Mathematically, the update rule for running statistics here is
        :math:`\hat{x}_\text{new} = (1 - \text{momentum}) \times \hat{x} + \text{momentum} \times x_t`,
        where :math:`\hat{x}` is the estimated statistic and :math:`x_t` is the
        new observed value.

    Because the Batch Normalization is done over the `C` dimension, computing
    statistics on `(N, H, W)` slices, it's common terminology to call this
    Spatial Batch Normalization.

    Args:
        num_features: :math:`C` from an expected input of size
            :math:`(N, C, H, W)`
        eps: a value added to the denominator for numerical stability.
            Default: 1e-5
        momentum: the value used for the running_mean and running_var
            computation. Can be set to ``None`` for cumulative moving average
            (i.e. simple average). Default: 0.1
        affine: a boolean value that when set to ``True``, this module has
            learnable affine parameters. Default: ``True``
        track_running_stats: a boolean value that when set to ``True``, this
            module tracks the running mean and variance, and when set to ``False``,
            this module does not track such statistics, and initializes statistics
            buffers :attr:`running_mean` and :attr:`running_var` as ``None``.
            When these buffers are ``None``, this module always uses batch statistics
            in both training and eval modes. Default: ``True``

    Shape:
        - Input: :math:`(N, C, H, W)`
        - Output: :math:`(N, C, H, W)` (same shape as input)

    Examples::

        >>> # With Learnable Parameters
        >>> m = nn.BatchNorm2d(100)
        >>> # Without Learnable Parameters
        >>> m = nn.BatchNorm2d(100, affine=False)
        >>> input = torch.randn(20, 100, 35, 45)
        >>> output = m(input)
    """

    def _check_input_dim(self, input):
        """Raise ``ValueError`` unless ``input`` has exactly 4 dimensions."""
        ndim = input.dim()
        if ndim != 4:
            raise ValueError(f"expected 4D input (got {ndim}D input)")
class LazyBatchNorm2d(_LazyNormBase, _BatchNorm):
    r"""A :class:`torch.nn.BatchNorm2d` module with lazy initialization.

    Lazy initialization is done for the ``num_features`` argument of
    :class:`BatchNorm2d`, which is inferred from ``input.size(1)``.
    The attributes that will be lazily initialized are `weight`, `bias`,
    `running_mean` and `running_var`.

    Check the :class:`torch.nn.modules.lazy.LazyModuleMixin` for further documentation
    on lazy modules and their limitations.

    Args:
        eps: a value added to the denominator for numerical stability.
            Default: 1e-5
        momentum: the value used for the running_mean and running_var
            computation. Can be set to ``None`` for cumulative moving average
            (i.e. simple average). Default: 0.1
        affine: a boolean value that when set to ``True``, this module has
            learnable affine parameters. Default: ``True``
        track_running_stats: a boolean value that when set to ``True``, this
            module tracks the running mean and variance, and when set to ``False``,
            this module does not track such statistics, and initializes statistics
            buffers :attr:`running_mean` and :attr:`running_var` as ``None``.
            When these buffers are ``None``, this module always uses batch statistics
            in both training and eval modes. Default: ``True``
    """

    # Concrete class associated with this lazy module.
    cls_to_become = BatchNorm2d  # type: ignore[assignment]

    def _check_input_dim(self, input):
        """Raise ``ValueError`` unless ``input`` has exactly 4 dimensions."""
        ndim = input.dim()
        if ndim != 4:
            raise ValueError(f"expected 4D input (got {ndim}D input)")
class BatchNorm3d(_BatchNorm):
    r"""Applies Batch Normalization over a 5D input.

    A 5D input is a mini-batch of 3D inputs with an additional channel
    dimension, as described in the paper
    `Batch Normalization: Accelerating Deep Network Training by Reducing
    Internal Covariate Shift <https://arxiv.org/abs/1502.03167>`__ .

    .. math::

        y = \frac{x - \mathrm{E}[x]}{ \sqrt{\mathrm{Var}[x] + \epsilon}} * \gamma + \beta

    The mean and standard-deviation are calculated per-dimension over
    the mini-batches, and :math:`\gamma` and :math:`\beta` are learnable
    parameter vectors of size `C` (where `C` is the input size). By default,
    the elements of :math:`\gamma` are set to 1 and the elements of
    :math:`\beta` are set to 0. At train time in the forward pass, the
    standard-deviation is calculated via the biased estimator, equivalent to
    ``torch.var(input, unbiased=False)``. However, the value stored in the
    moving average of the standard-deviation is calculated via the unbiased
    estimator, equivalent to ``torch.var(input, unbiased=True)``.

    Also by default, during training this layer keeps running estimates of its
    computed mean and variance, which are then used for normalization during
    evaluation. The running estimates are kept with a default :attr:`momentum`
    of 0.1.

    If :attr:`track_running_stats` is set to ``False``, this layer does not
    keep running estimates, and batch statistics are used during evaluation
    time as well.

    .. note::
        This :attr:`momentum` argument is different from the one used in
        optimizer classes and from the conventional notion of momentum.
        Mathematically, the update rule for running statistics here is
        :math:`\hat{x}_\text{new} = (1 - \text{momentum}) \times \hat{x} + \text{momentum} \times x_t`,
        where :math:`\hat{x}` is the estimated statistic and :math:`x_t` is the
        new observed value.

    Because the Batch Normalization is done over the `C` dimension, computing
    statistics on `(N, D, H, W)` slices, it's common terminology to call this
    Volumetric Batch Normalization or Spatio-temporal Batch Normalization.

    Args:
        num_features: :math:`C` from an expected input of size
            :math:`(N, C, D, H, W)`
        eps: a value added to the denominator for numerical stability.
            Default: 1e-5
        momentum: the value used for the running_mean and running_var
            computation. Can be set to ``None`` for cumulative moving average
            (i.e. simple average). Default: 0.1
        affine: a boolean value that when set to ``True``, this module has
            learnable affine parameters. Default: ``True``
        track_running_stats: a boolean value that when set to ``True``, this
            module tracks the running mean and variance, and when set to ``False``,
            this module does not track such statistics, and initializes statistics
            buffers :attr:`running_mean` and :attr:`running_var` as ``None``.
            When these buffers are ``None``, this module always uses batch statistics
            in both training and eval modes. Default: ``True``

    Shape:
        - Input: :math:`(N, C, D, H, W)`
        - Output: :math:`(N, C, D, H, W)` (same shape as input)

    Examples::

        >>> # With Learnable Parameters
        >>> m = nn.BatchNorm3d(100)
        >>> # Without Learnable Parameters
        >>> m = nn.BatchNorm3d(100, affine=False)
        >>> input = torch.randn(20, 100, 35, 45, 10)
        >>> output = m(input)
    """

    def _check_input_dim(self, input):
        """Raise ``ValueError`` unless ``input`` has exactly 5 dimensions."""
        ndim = input.dim()
        if ndim != 5:
            raise ValueError(f"expected 5D input (got {ndim}D input)")
class LazyBatchNorm3d(_LazyNormBase, _BatchNorm):
    r"""A :class:`torch.nn.BatchNorm3d` module with lazy initialization.

    Lazy initialization is done for the ``num_features`` argument of
    :class:`BatchNorm3d`, which is inferred from ``input.size(1)``.
    The attributes that will be lazily initialized are `weight`, `bias`,
    `running_mean` and `running_var`.

    Check the :class:`torch.nn.modules.lazy.LazyModuleMixin` for further documentation
    on lazy modules and their limitations.

    Args:
        eps: a value added to the denominator for numerical stability.
            Default: 1e-5
        momentum: the value used for the running_mean and running_var
            computation. Can be set to ``None`` for cumulative moving average
            (i.e. simple average). Default: 0.1
        affine: a boolean value that when set to ``True``, this module has
            learnable affine parameters. Default: ``True``
        track_running_stats: a boolean value that when set to ``True``, this
            module tracks the running mean and variance, and when set to ``False``,
            this module does not track such statistics, and initializes statistics
            buffers :attr:`running_mean` and :attr:`running_var` as ``None``.
            When these buffers are ``None``, this module always uses batch statistics
            in both training and eval modes. Default: ``True``
    """

    # Concrete class associated with this lazy module.
    cls_to_become = BatchNorm3d  # type: ignore[assignment]

    def _check_input_dim(self, input):
        """Raise ``ValueError`` unless ``input`` has exactly 5 dimensions."""
        ndim = input.dim()
        if ndim != 5:
            raise ValueError(f"expected 5D input (got {ndim}D input)")
class SyncBatchNorm(_BatchNorm):
    r"""Applies Batch Normalization over a N-Dimensional input.

    The N-D input is a mini-batch of [N-2]D inputs with an additional channel
    dimension, as described in the paper
    `Batch Normalization: Accelerating Deep Network Training by Reducing
    Internal Covariate Shift <https://arxiv.org/abs/1502.03167>`__ .

    .. math::

        y = \frac{x - \mathrm{E}[x]}{ \sqrt{\mathrm{Var}[x] + \epsilon}} * \gamma + \beta

    The mean and standard-deviation are calculated per-dimension over all
    mini-batches of the same process groups. :math:`\gamma` and :math:`\beta`
    are learnable parameter vectors of size `C` (where `C` is the input size).
    By default, the elements of :math:`\gamma` are set to 1 and the elements
    of :math:`\beta` are set to 0.
    The standard-deviation is calculated via the biased estimator, equivalent to
    `torch.var(input, unbiased=False)`.

    Also by default, during training this layer keeps running estimates of its
    computed mean and variance, which are then used for normalization during
    evaluation. The running estimates are kept with a default :attr:`momentum`
    of 0.1.

    If :attr:`track_running_stats` is set to ``False``, this layer does not
    keep running estimates, and batch statistics are used during evaluation
    time as well.

    .. note::
        This :attr:`momentum` argument is different from the one used in
        optimizer classes and from the conventional notion of momentum.
        Mathematically, the update rule for running statistics here is
        :math:`\hat{x}_\text{new} = (1 - \text{momentum}) \times \hat{x} + \text{momentum} \times x_t`,
        where :math:`\hat{x}` is the estimated statistic and :math:`x_t` is the
        new observed value.

    Because the Batch Normalization is done for each channel in the ``C``
    dimension, computing statistics on ``(N, +)`` slices, it's common
    terminology to call this Volumetric Batch Normalization or
    Spatio-temporal Batch Normalization.

    Currently :class:`SyncBatchNorm` only supports
    :class:`~torch.nn.DistributedDataParallel` (DDP) with single GPU per process. Use
    :meth:`torch.nn.SyncBatchNorm.convert_sync_batchnorm()` to convert
    :attr:`BatchNorm*D` layer to :class:`SyncBatchNorm` before wrapping
    Network with DDP.

    Args:
        num_features: :math:`C` from an expected input of size
            :math:`(N, C, +)`
        eps: a value added to the denominator for numerical stability.
            Default: ``1e-5``
        momentum: the value used for the running_mean and running_var
            computation. Can be set to ``None`` for cumulative moving average
            (i.e. simple average). Default: 0.1
        affine: a boolean value that when set to ``True``, this module has
            learnable affine parameters. Default: ``True``
        track_running_stats: a boolean value that when set to ``True``, this
            module tracks the running mean and variance, and when set to ``False``,
            this module does not track such statistics, and initializes statistics
            buffers :attr:`running_mean` and :attr:`running_var` as ``None``.
            When these buffers are ``None``, this module always uses batch statistics
            in both training and eval modes. Default: ``True``
        process_group: synchronization of stats happen within each process group
            individually. Default behavior is synchronization across the whole
            world

    Shape:
        - Input: :math:`(N, C, +)`
        - Output: :math:`(N, C, +)` (same shape as input)

    .. note::
        Synchronization of batchnorm statistics occurs only while training, i.e.
        synchronization is disabled when ``model.eval()`` is set or if
        ``self.training`` is otherwise ``False``.

    Examples::

        >>> # xdoctest: +SKIP
        >>> # With Learnable Parameters
        >>> m = nn.SyncBatchNorm(100)
        >>> # creating process group (optional)
        >>> # ranks is a list of int identifying rank ids.
        >>> ranks = list(range(8))
        >>> r1, r2 = ranks[:4], ranks[4:]
        >>> # Note: every rank calls into new_group for every
        >>> # process group created, even if that rank is not
        >>> # part of the group.
        >>> process_groups = [torch.distributed.new_group(pids) for pids in [r1, r2]]
        >>> process_group = process_groups[0 if dist.get_rank() <= 3 else 1]
        >>> # Without Learnable Parameters
        >>> m = nn.SyncBatchNorm(100, affine=False, process_group=process_group)
        >>> input = torch.randn(20, 100, 35, 45, 10)
        >>> output = m(input)

        >>> # network is nn.BatchNorm layer
        >>> sync_bn_network = nn.SyncBatchNorm.convert_sync_batchnorm(network, process_group)
        >>> # only single gpu per process is currently supported
        >>> ddp_sync_bn_network = torch.nn.parallel.DistributedDataParallel(
        >>>                         sync_bn_network,
        >>>                         device_ids=[args.local_rank],
        >>>                         output_device=args.local_rank)
    """

    def __init__(
        self,
        num_features: int,
        eps: float = 1e-5,
        momentum: Optional[float] = 0.1,
        affine: bool = True,
        track_running_stats: bool = True,
        process_group: Optional[Any] = None,
        device=None,
        dtype=None,
    ) -> None:
        super().__init__(
            num_features,
            eps,
            momentum,
            affine,
            track_running_stats,
            device=device,
            dtype=dtype,
        )
        # Group used for synchronization; None means the default WORLD group
        # is selected in forward().
        self.process_group = process_group

    def _check_input_dim(self, input):
        """Raise ``ValueError`` when ``input`` has fewer than 2 dimensions."""
        ndim = input.dim()
        if ndim < 2:
            raise ValueError(f"expected at least 2D input (got {ndim}D input)")

    def _check_non_zero_input_channels(self, input):
        """Raise ``ValueError`` when dimension 1 of ``input`` has size zero."""
        if input.size(1) == 0:
            raise ValueError(
                "SyncBatchNorm number of input channels should be non-zero"
            )

    def forward(self, input: Tensor) -> Tensor:
        """Normalize ``input``, synchronizing batch stats when applicable.

        Falls back to ``F.batch_norm`` whenever synchronization is not needed:
        outside training, when ``torch.distributed`` is unavailable or not
        initialized, or when the process group has a single member.

        Raises:
            ValueError: if the input has fewer than 2 dimensions or zero
                channels, or if synchronization is required but the input is
                not on a ``cuda`` / PrivateUse1 device.
        """
        self._check_input_dim(input)
        self._check_non_zero_input_channels(input)

        # exponential_average_factor is set to self.momentum
        # (when it is available) only so that it gets updated
        # in ONNX graph when this node is exported to ONNX.
        exponential_average_factor = 0.0 if self.momentum is None else self.momentum

        if self.training and self.track_running_stats:
            assert self.num_batches_tracked is not None
            self.num_batches_tracked.add_(1)
            if self.momentum is None:
                # Cumulative moving average; otherwise the factor already
                # holds the momentum (exponential moving average).
                exponential_average_factor = 1.0 / self.num_batches_tracked.item()

        # Decide whether the mini-batch stats should be used for normalization
        # rather than the buffers. Mini-batch stats are used in training mode,
        # and in eval mode when buffers are None.
        bn_training = (
            True
            if self.training
            else (self.running_mean is None) and (self.running_var is None)
        )

        # Buffers are only updated if they are to be tracked and we are in
        # training mode. Thus they only need to be passed when the update
        # should occur (i.e. in training mode when they are tracked), or when
        # buffer stats are used for normalization (i.e. in eval mode when
        # buffers are not None). If buffers are not to be tracked, ensure that
        # they won't be updated.
        pass_buffers = not self.training or self.track_running_stats
        running_mean = self.running_mean if pass_buffers else None
        running_var = self.running_var if pass_buffers else None

        # Don't sync batchnorm stats in inference mode (model.eval()).
        need_sync = (
            bn_training
            and self.training
            and torch.distributed.is_available()
            and torch.distributed.is_initialized()
        )
        if need_sync:
            # currently only GPU/PrivateUse1 input is supported
            supported_devices = [
                "cuda",
                torch._C._get_privateuse1_backend_name(),
            ]
            if input.device.type not in supported_devices:
                raise ValueError(
                    "SyncBatchNorm expected input tensor to be on GPU or "
                    f"{torch._C._get_privateuse1_backend_name()}"
                )

            process_group = torch.distributed.group.WORLD
            if self.process_group:
                process_group = self.process_group
            world_size = torch.distributed.get_world_size(process_group)
            need_sync = world_size > 1

        if need_sync:
            assert bn_training
            return sync_batch_norm.apply(
                input,
                self.weight,
                self.bias,
                running_mean,
                running_var,
                self.eps,
                exponential_average_factor,
                process_group,  # type: ignore[possibly-undefined]
                world_size,  # type: ignore[possibly-undefined]
            )

        # fallback to framework BN when synchronization is not necessary
        return F.batch_norm(
            input,
            running_mean,
            running_var,
            self.weight,
            self.bias,
            bn_training,
            exponential_average_factor,
            self.eps,
        )

    @classmethod
    def convert_sync_batchnorm(cls, module, process_group=None):
        r"""Converts all :attr:`BatchNorm*D` layers in the model to :class:`torch.nn.SyncBatchNorm` layers.

        Args:
            module (nn.Module): module containing one or more :attr:`BatchNorm*D` layers
            process_group (optional): process group to scope synchronization,
                default is the whole world

        Returns:
            The original :attr:`module` with the converted :class:`torch.nn.SyncBatchNorm`
            layers. If the original :attr:`module` is a :attr:`BatchNorm*D` layer,
            a new :class:`torch.nn.SyncBatchNorm` layer object will be returned
            instead.

        Example::

            >>> # Network with nn.BatchNorm layer
            >>> # xdoctest: +REQUIRES(env:TORCH_DOCTEST_CUDA)
            >>> module = torch.nn.Sequential(
            >>>            torch.nn.Linear(20, 100),
            >>>            torch.nn.BatchNorm1d(100),
            >>>          ).cuda()
            >>> # creating process group (optional)
            >>> # ranks is a list of int identifying rank ids.
            >>> ranks = list(range(8))
            >>> r1, r2 = ranks[:4], ranks[4:]
            >>> # Note: every rank calls into new_group for every
            >>> # process group created, even if that rank is not
            >>> # part of the group.
            >>> # xdoctest: +SKIP("distributed")
            >>> process_groups = [torch.distributed.new_group(pids) for pids in [r1, r2]]
            >>> process_group = process_groups[0 if dist.get_rank() <= 3 else 1]
            >>> sync_bn_module = torch.nn.SyncBatchNorm.convert_sync_batchnorm(module, process_group)

        """
        converted = module
        if isinstance(module, torch.nn.modules.batchnorm._BatchNorm):
            converted = torch.nn.SyncBatchNorm(
                module.num_features,
                module.eps,
                module.momentum,
                module.affine,
                module.track_running_stats,
                process_group,
            )
            # Reuse the source layer's parameters and buffers (the same
            # objects, not copies).
            if module.affine:
                with torch.no_grad():
                    converted.weight = module.weight
                    converted.bias = module.bias
            converted.running_mean = module.running_mean
            converted.running_var = module.running_var
            converted.num_batches_tracked = module.num_batches_tracked
            converted.training = module.training
            if hasattr(module, "qconfig"):
                converted.qconfig = module.qconfig
        # Recurse so that nested BatchNorm layers are replaced as well.
        for child_name, child in module.named_children():
            converted.add_module(
                child_name, cls.convert_sync_batchnorm(child, process_group)
            )
        return converted

View File

@ -0,0 +1,56 @@
import torch.nn.functional as F
from torch import Tensor
from .module import Module
# Public names exported by this module.
__all__ = ["ChannelShuffle"]
class ChannelShuffle(Module):
    r"""Divides and rearranges the channels in a tensor.

    The channels of a tensor of shape :math:`(N, C, *)` are divided
    into g groups as :math:`(N, \frac{C}{g}, g, *)` and shuffled,
    while the original tensor shape is retained in the final output.

    Args:
        groups (int): number of groups to divide channels in.

    Examples::

        >>> channel_shuffle = nn.ChannelShuffle(2)
        >>> input = torch.arange(1, 17, dtype=torch.float32).view(1, 4, 2, 2)
        >>> input
        tensor([[[[ 1.,  2.],
                  [ 3.,  4.]],
                 [[ 5.,  6.],
                  [ 7.,  8.]],
                 [[ 9., 10.],
                  [11., 12.]],
                 [[13., 14.],
                  [15., 16.]]]])
        >>> output = channel_shuffle(input)
        >>> output
        tensor([[[[ 1.,  2.],
                  [ 3.,  4.]],
                 [[ 9., 10.],
                  [11., 12.]],
                 [[ 5.,  6.],
                  [ 7.,  8.]],
                 [[13., 14.],
                  [15., 16.]]]])
    """

    __constants__ = ["groups"]
    groups: int

    def __init__(self, groups: int) -> None:
        super().__init__()
        self.groups = groups

    def forward(self, input: Tensor) -> Tensor:
        """Return ``F.channel_shuffle(input, self.groups)``."""
        shuffled = F.channel_shuffle(input, self.groups)
        return shuffled

    def extra_repr(self) -> str:
        """Return the ``groups=<value>`` fragment shown in the module repr."""
        return "groups={}".format(self.groups)

View File

@ -0,0 +1,976 @@
# mypy: allow-untyped-decorators
# mypy: allow-untyped-defs
import operator
from collections import abc as container_abcs, OrderedDict
from itertools import chain, islice
from typing import (
Any,
Dict,
Iterable,
Iterator,
Mapping,
Optional,
overload,
Tuple,
TypeVar,
Union,
)
from typing_extensions import deprecated, Self
import torch
from torch._jit_internal import _copy_to_script_wrapper
from torch.nn.parameter import Parameter
from .module import Module
# Public names exported by this module.
__all__ = [
    "Container",
    "Sequential",
    "ModuleList",
    "ModuleDict",
    "ParameterList",
    "ParameterDict",
]
# Type variable bound to Module; used in Sequential's item-access annotations.
T = TypeVar("T", bound=Module)
# Copied from torch.nn.modules.module, required for a custom __repr__ for ModuleList
def _addindent(s_, numSpaces):
s = s_.split("\n")
# don't do anything for single-line stuff
if len(s) == 1:
return s_
first = s.pop(0)
s = [(numSpaces * " ") + line for line in s]
s = "\n".join(s)
s = first + "\n" + s
return s
@deprecated(
    "`nn.Container` is deprecated. "
    "All of it's functionality is now implemented in `nn.Module`. Subclass that instead.",
    category=FutureWarning,
)
class Container(Module):
    """Deprecated container that registers each keyword argument as a submodule."""

    def __init__(self, **kwargs: Any) -> None:
        super().__init__()
        # Each keyword name becomes the registered submodule's name.
        for name in kwargs:
            self.add_module(name, kwargs[name])
class Sequential(Module):
r"""A sequential container.
Modules will be added to it in the order they are passed in the
constructor. Alternatively, an ``OrderedDict`` of modules can be
passed in. The ``forward()`` method of ``Sequential`` accepts any
input and forwards it to the first module it contains. It then
"chains" outputs to inputs sequentially for each subsequent module,
finally returning the output of the last module.
The value a ``Sequential`` provides over manually calling a sequence
of modules is that it allows treating the whole container as a
single module, such that performing a transformation on the
``Sequential`` applies to each of the modules it stores (which are
each a registered submodule of the ``Sequential``).
What's the difference between a ``Sequential`` and a
:class:`torch.nn.ModuleList`? A ``ModuleList`` is exactly what it
sounds like--a list for storing ``Module`` s! On the other hand,
the layers in a ``Sequential`` are connected in a cascading way.
Example::
# Using Sequential to create a small model. When `model` is run,
# input will first be passed to `Conv2d(1,20,5)`. The output of
# `Conv2d(1,20,5)` will be used as the input to the first
# `ReLU`; the output of the first `ReLU` will become the input
# for `Conv2d(20,64,5)`. Finally, the output of
# `Conv2d(20,64,5)` will be used as input to the second `ReLU`
model = nn.Sequential(
nn.Conv2d(1,20,5),
nn.ReLU(),
nn.Conv2d(20,64,5),
nn.ReLU()
)
# Using Sequential with OrderedDict. This is functionally the
# same as the above code
model = nn.Sequential(OrderedDict([
('conv1', nn.Conv2d(1,20,5)),
('relu1', nn.ReLU()),
('conv2', nn.Conv2d(20,64,5)),
('relu2', nn.ReLU())
]))
"""
_modules: Dict[str, Module] # type: ignore[assignment]
@overload
def __init__(self, *args: Module) -> None:
...
@overload
def __init__(self, arg: "OrderedDict[str, Module]") -> None:
...
def __init__(self, *args):
super().__init__()
if len(args) == 1 and isinstance(args[0], OrderedDict):
for key, module in args[0].items():
self.add_module(key, module)
else:
for idx, module in enumerate(args):
self.add_module(str(idx), module)
def _get_item_by_idx(self, iterator, idx) -> T: # type: ignore[misc, type-var]
"""Get the idx-th item of the iterator."""
size = len(self)
idx = operator.index(idx)
if not -size <= idx < size:
raise IndexError(f"index {idx} is out of range")
idx %= size
return next(islice(iterator, idx, None))
@_copy_to_script_wrapper
def __getitem__(self, idx: Union[slice, int]) -> Union["Sequential", T]:
if isinstance(idx, slice):
return self.__class__(OrderedDict(list(self._modules.items())[idx]))
else:
return self._get_item_by_idx(self._modules.values(), idx)
def __setitem__(self, idx: int, module: Module) -> None:
key: str = self._get_item_by_idx(self._modules.keys(), idx)
return setattr(self, key, module)
def __delitem__(self, idx: Union[slice, int]) -> None:
if isinstance(idx, slice):
for key in list(self._modules.keys())[idx]:
delattr(self, key)
else:
key = self._get_item_by_idx(self._modules.keys(), idx)
delattr(self, key)
# To preserve numbering
str_indices = [str(i) for i in range(len(self._modules))]
self._modules = OrderedDict(list(zip(str_indices, self._modules.values())))
@_copy_to_script_wrapper
def __len__(self) -> int:
return len(self._modules)
def __add__(self, other) -> "Sequential":
if isinstance(other, Sequential):
ret = Sequential()
for layer in self:
ret.append(layer)
for layer in other:
ret.append(layer)
return ret
else:
raise ValueError(
"add operator supports only objects "
f"of Sequential class, but {str(type(other))} is given."
)
def pop(self, key: Union[int, slice]) -> Module:
v = self[key]
del self[key]
return v
def __iadd__(self, other) -> Self:
if isinstance(other, Sequential):
offset = len(self)
for i, module in enumerate(other):
self.add_module(str(i + offset), module)
return self
else:
raise ValueError(
"add operator supports only objects "
f"of Sequential class, but {str(type(other))} is given."
)
def __mul__(self, other: int) -> "Sequential":
if not isinstance(other, int):
raise TypeError(
f"unsupported operand type(s) for *: {type(self)} and {type(other)}"
)
elif other <= 0:
raise ValueError(
f"Non-positive multiplication factor {other} for {type(self)}"
)
else:
combined = Sequential()
offset = 0
for _ in range(other):
for module in self:
combined.add_module(str(offset), module)
offset += 1
return combined
def __rmul__(self, other: int) -> "Sequential":
return self.__mul__(other)
def __imul__(self, other: int) -> Self:
if not isinstance(other, int):
raise TypeError(
f"unsupported operand type(s) for *: {type(self)} and {type(other)}"
)
elif other <= 0:
raise ValueError(
f"Non-positive multiplication factor {other} for {type(self)}"
)
else:
len_original = len(self)
offset = len(self)
for _ in range(other - 1):
for i in range(len_original):
self.add_module(str(i + offset), self._modules[str(i)])
offset += len_original
return self
@_copy_to_script_wrapper
def __dir__(self):
keys = super().__dir__()
keys = [key for key in keys if not key.isdigit()]
return keys
@_copy_to_script_wrapper
def __iter__(self) -> Iterator[Module]:
return iter(self._modules.values())
# NB: We can't really type check this function as the type of input
# may change dynamically (as is tested in
# TestScript.test_sequential_intermediary_types). Cannot annotate
# with Any as TorchScript expects a more precise type
def forward(self, input):
    """Feed ``input`` through each contained module in turn.

    The result of one module becomes the argument of the next; the value
    produced by the last module is returned. With no modules, ``input`` is
    returned unchanged.
    """
    for stage in self:
        input = stage(input)
    return input
def append(self, module: Module) -> "Sequential":
    r"""Append a given module to the end.

    The module is registered under the string form of the current length.

    Args:
        module (nn.Module): module to append

    Returns:
        ``self``, which allows chained calls.
    """
    next_key = str(len(self))
    self.add_module(next_key, module)
    return self
def insert(self, index: int, module: Module) -> "Sequential":
    """Insert ``module`` before position ``index``, shifting later modules up by one.

    Args:
        index (int): insertion position in ``[-len, len]``; negative values
            count from the end.
        module (nn.Module): module to insert.

    Returns:
        ``self``.

    Raises:
        AssertionError: if ``module`` is not a ``Module``.
        IndexError: if ``index`` lies outside ``[-len, len]``.
    """
    if not isinstance(module, Module):
        raise AssertionError(f"module should be of type: {Module}")
    count = len(self._modules)
    if index < -count or index > count:
        raise IndexError(f"Index out of range: {index}")
    target = index + count if index < 0 else index
    # Walk from the tail down so each entry moves one slot up before the
    # slot below it is overwritten.
    for slot in reversed(range(target + 1, count + 1)):
        self._modules[str(slot)] = self._modules[str(slot - 1)]
    self._modules[str(target)] = module
    return self
def extend(self, sequential) -> "Sequential":
    """Append every item yielded by ``sequential`` to the end of this container.

    Args:
        sequential (iterable): modules to append, in iteration order.

    Returns:
        ``self``.
    """
    # Materialise first: if ``sequential`` is this very container, appending
    # while iterating its live ``_modules`` view would fail mid-way.
    for layer in list(sequential):
        self.append(layer)
    return self
class ModuleList(Module):
    r"""Holds submodules in a list.

    :class:`~torch.nn.ModuleList` can be indexed like a regular Python list, but
    modules it contains are properly registered, and will be visible by all
    :class:`~torch.nn.Module` methods.

    Args:
        modules (iterable, optional): an iterable of modules to add

    Example::

        class MyModule(nn.Module):
            def __init__(self) -> None:
                super().__init__()
                self.linears = nn.ModuleList([nn.Linear(10, 10) for i in range(10)])

            def forward(self, x):
                # ModuleList can act as an iterable, or be indexed using ints
                for i, l in enumerate(self.linears):
                    x = self.linears[i // 2](x) + l(x)
                return x
    """

    _modules: Dict[str, Module]  # type: ignore[assignment]

    def __init__(self, modules: Optional[Iterable[Module]] = None) -> None:
        super().__init__()
        if modules is not None:
            self += modules

    def _get_abs_string_index(self, idx):
        """Get the absolute index for the list of modules.

        Returns the non-negative position as a string (the key format used in
        ``self._modules``); raises ``IndexError`` when ``idx`` is out of range.
        """
        idx = operator.index(idx)
        if not (-len(self) <= idx < len(self)):
            raise IndexError(f"index {idx} is out of range")
        if idx < 0:
            idx += len(self)
        return str(idx)

    @overload
    def __getitem__(self, idx: slice) -> "ModuleList":
        ...

    @overload
    def __getitem__(self, idx: int) -> Module:
        ...

    @_copy_to_script_wrapper
    def __getitem__(self, idx: Union[int, slice]) -> Union[Module, "ModuleList"]:
        """Return one module for an int index, or a new list of the same class for a slice."""
        if isinstance(idx, slice):
            return self.__class__(list(self._modules.values())[idx])
        else:
            return self._modules[self._get_abs_string_index(idx)]

    def __setitem__(self, idx: int, module: Module) -> None:
        """Replace the module at an existing position ``idx``."""
        idx = self._get_abs_string_index(idx)
        return setattr(self, str(idx), module)

    def __delitem__(self, idx: Union[int, slice]) -> None:
        """Delete one position or a slice of positions, then renumber the rest."""
        if isinstance(idx, slice):
            for k in range(len(self._modules))[idx]:
                delattr(self, str(k))
        else:
            delattr(self, self._get_abs_string_index(idx))
        # To preserve numbering, self._modules is being reconstructed with modules after deletion
        str_indices = [str(i) for i in range(len(self._modules))]
        self._modules = OrderedDict(list(zip(str_indices, self._modules.values())))

    @_copy_to_script_wrapper
    def __len__(self) -> int:
        return len(self._modules)

    @_copy_to_script_wrapper
    def __iter__(self) -> Iterator[Module]:
        return iter(self._modules.values())

    def __iadd__(self, modules: Iterable[Module]) -> Self:
        """In-place concatenation (``ml += modules``); delegates to :meth:`extend`."""
        return self.extend(modules)

    def __add__(self, other: Iterable[Module]) -> "ModuleList":
        """Return a new ``ModuleList`` holding this list's modules followed by ``other``'s."""
        combined = ModuleList()
        for i, module in enumerate(chain(self, other)):
            combined.add_module(str(i), module)
        return combined

    def __repr__(self):
        """Return a custom repr for ModuleList that compresses repeated module representations."""
        list_of_reprs = [repr(item) for item in self]
        if len(list_of_reprs) == 0:
            return self._get_name() + "()"

        # Group consecutive identical reprs: each block records the
        # [first, last] positions it covers.
        start_end_indices = [[0, 0]]
        repeated_blocks = [list_of_reprs[0]]
        for i, r in enumerate(list_of_reprs[1:], 1):
            if r == repeated_blocks[-1]:
                start_end_indices[-1][1] += 1
                continue

            start_end_indices.append([i, i])
            repeated_blocks.append(r)

        lines = []
        main_str = self._get_name() + "("
        for (start_id, end_id), b in zip(start_end_indices, repeated_blocks):
            local_repr = f"({start_id}): {b}"  # default repr

            if start_id != end_id:
                n = end_id - start_id + 1
                local_repr = f"({start_id}-{end_id}): {n} x {b}"

            local_repr = _addindent(local_repr, 2)
            lines.append(local_repr)

        main_str += "\n " + "\n ".join(lines) + "\n"
        main_str += ")"
        return main_str

    @_copy_to_script_wrapper
    def __dir__(self):
        """Return the parent's ``dir()`` listing without the purely numeric names."""
        keys = super().__dir__()
        keys = [key for key in keys if not key.isdigit()]
        return keys

    def insert(self, index: int, module: Module) -> None:
        r"""Insert a given module before a given index in the list.

        Follows ``list.insert`` conventions: a negative ``index`` counts from
        the end, and an ``index`` beyond either end is clamped so the module
        is placed first or last. Without this normalisation a negative index
        would be stored under a key such as ``"-1"`` and an index past the end
        would leave a gap in the numbering.

        Args:
            index (int): index to insert.
            module (nn.Module): module to insert
        """
        n = len(self._modules)
        if index < 0:
            index = max(index + n, 0)
        elif index > n:
            index = n
        # Shift from the tail down so nothing is overwritten before it moves.
        for i in range(n, index, -1):
            self._modules[str(i)] = self._modules[str(i - 1)]
        self._modules[str(index)] = module

    def append(self, module: Module) -> "ModuleList":
        r"""Append a given module to the end of the list.

        Args:
            module (nn.Module): module to append
        """
        self.add_module(str(len(self)), module)
        return self

    def pop(self, key: Union[int, slice]) -> Module:
        """Remove the entry (or slice of entries) at ``key`` and return it."""
        v = self[key]
        del self[key]
        return v

    def extend(self, modules: Iterable[Module]) -> Self:
        r"""Append modules from a Python iterable to the end of the list.

        Args:
            modules (iterable): iterable of modules to append

        Raises:
            TypeError: if ``modules`` is not iterable.
        """
        if not isinstance(modules, container_abcs.Iterable):
            raise TypeError(
                "ModuleList.extend should be called with an "
                "iterable, but got " + type(modules).__name__
            )
        # Snapshot first so that ``ml.extend(ml)`` / ``ml += ml`` does not add
        # to ``self._modules`` while that same mapping is being iterated.
        pending = list(modules)
        offset = len(self)
        for i, module in enumerate(pending):
            self.add_module(str(offset + i), module)
        return self
# remove forward altogether to fall back on Module's _forward_unimplemented
class ModuleDict(Module):
    r"""Holds submodules in a dictionary.

    :class:`~torch.nn.ModuleDict` can be indexed like a regular Python dictionary,
    but modules it contains are properly registered, and will be visible by all
    :class:`~torch.nn.Module` methods.

    :class:`~torch.nn.ModuleDict` is an **ordered** dictionary that respects

    * the order of insertion, and

    * in :meth:`~torch.nn.ModuleDict.update`, the order of the merged
      ``OrderedDict``, ``dict`` (started from Python 3.6) or another
      :class:`~torch.nn.ModuleDict` (the argument to
      :meth:`~torch.nn.ModuleDict.update`).

    Note that :meth:`~torch.nn.ModuleDict.update` with other unordered mapping
    types (e.g., Python's plain ``dict`` before Python version 3.6) does not
    preserve the order of the merged mapping.

    Args:
        modules (iterable, optional): a mapping (dictionary) of (string: module)
            or an iterable of key-value pairs of type (string, module)

    Example::

        class MyModule(nn.Module):
            def __init__(self) -> None:
                super().__init__()
                self.choices = nn.ModuleDict({
                        'conv': nn.Conv2d(10, 10, 3),
                        'pool': nn.MaxPool2d(3)
                })
                self.activations = nn.ModuleDict([
                        ['lrelu', nn.LeakyReLU()],
                        ['prelu', nn.PReLU()]
                ])

            def forward(self, x, choice, act):
                x = self.choices[choice](x)
                x = self.activations[act](x)
                return x
    """

    _modules: Dict[str, Module]  # type: ignore[assignment]

    def __init__(self, modules: Optional[Mapping[str, Module]] = None) -> None:
        super().__init__()
        if modules is not None:
            self.update(modules)

    @_copy_to_script_wrapper
    def __getitem__(self, key: str) -> Module:
        return self._modules[key]

    def __setitem__(self, key: str, module: Module) -> None:
        # Goes through add_module so the entry is registered as a submodule.
        self.add_module(key, module)

    def __delitem__(self, key: str) -> None:
        del self._modules[key]

    @_copy_to_script_wrapper
    def __len__(self) -> int:
        return len(self._modules)

    @_copy_to_script_wrapper
    def __iter__(self) -> Iterator[str]:
        return iter(self._modules)

    @_copy_to_script_wrapper
    def __contains__(self, key: str) -> bool:
        return key in self._modules

    def clear(self) -> None:
        """Remove all items from the ModuleDict."""
        self._modules.clear()

    def pop(self, key: str) -> Module:
        r"""Remove key from the ModuleDict and return its module.

        Args:
            key (str): key to pop from the ModuleDict
        """
        v = self[key]
        del self[key]
        return v

    @_copy_to_script_wrapper
    def keys(self) -> Iterable[str]:
        r"""Return an iterable of the ModuleDict keys."""
        return self._modules.keys()

    @_copy_to_script_wrapper
    def items(self) -> Iterable[Tuple[str, Module]]:
        r"""Return an iterable of the ModuleDict key/value pairs."""
        return self._modules.items()

    @_copy_to_script_wrapper
    def values(self) -> Iterable[Module]:
        r"""Return an iterable of the ModuleDict values."""
        return self._modules.values()

    def update(self, modules: Mapping[str, Module]) -> None:
        r"""Update the :class:`~torch.nn.ModuleDict` with key-value pairs from a mapping, overwriting existing keys.

        .. note::
            If :attr:`modules` is an ``OrderedDict``, a :class:`~torch.nn.ModuleDict`, or
            an iterable of key-value pairs, the order of new elements in it is preserved.

        Args:
            modules (iterable): a mapping (dictionary) from string to :class:`~torch.nn.Module`,
                or an iterable of key-value pairs of type (string, :class:`~torch.nn.Module`)

        Raises:
            TypeError: if ``modules`` is not iterable, or one of its elements
                is not iterable.
            ValueError: if an element does not have exactly two items.
        """
        if not isinstance(modules, container_abcs.Iterable):
            raise TypeError(
                "ModuleDict.update should be called with an "
                "iterable of key/value pairs, but got " + type(modules).__name__
            )

        if isinstance(modules, (OrderedDict, ModuleDict, container_abcs.Mapping)):
            for key, module in modules.items():
                self[key] = module
        else:
            # modules here can be a list with two items
            for j, m in enumerate(modules):
                if not isinstance(m, container_abcs.Iterable):
                    # The separating space after "is" was missing, which ran
                    # the type name into the preceding word.
                    raise TypeError(
                        "ModuleDict update sequence element "
                        "#" + str(j) + " should be Iterable; is " + type(m).__name__
                    )
                if not len(m) == 2:
                    raise ValueError(
                        "ModuleDict update sequence element "
                        "#" + str(j) + " has length " + str(len(m)) + "; 2 is required"
                    )
                # modules can be Mapping (what it's typed at), or a list: [(name1, module1), (name2, module2)]
                # that's too cumbersome to type correctly with overloads, so we add an ignore here
                self[m[0]] = m[1]  # type: ignore[assignment]
# remove forward altogether to fall back on Module's _forward_unimplemented
class ParameterList(Module):
r"""Holds parameters in a list.
:class:`~torch.nn.ParameterList` can be used like a regular Python
list, but Tensors that are :class:`~torch.nn.Parameter` are properly registered,
and will be visible by all :class:`~torch.nn.Module` methods.
Note that the constructor, assigning an element of the list, the
:meth:`~torch.nn.ParameterList.append` method and the :meth:`~torch.nn.ParameterList.extend`
method will convert any :class:`~torch.Tensor` into :class:`~torch.nn.Parameter`.
Args:
parameters (iterable, optional): an iterable of elements to add to the list.
Example::
class MyModule(nn.Module):
def __init__(self) -> None:
super().__init__()
self.params = nn.ParameterList([nn.Parameter(torch.randn(10, 10)) for i in range(10)])
def forward(self, x):
# ParameterList can act as an iterable, or be indexed using ints
for i, p in enumerate(self.params):
x = self.params[i // 2].mm(x) + p.mm(x)
return x
"""
def __init__(self, values: Optional[Iterable[Any]] = None) -> None:
super().__init__()
self._size = 0
if values is not None:
self += values
def _get_abs_string_index(self, idx):
"""Get the absolute index for the list of modules."""
idx = operator.index(idx)
if not (-len(self) <= idx < len(self)):
raise IndexError(f"index {idx} is out of range")
if idx < 0:
idx += len(self)
return str(idx)
@overload
def __getitem__(self, idx: int) -> Any:
...
@overload
def __getitem__(self: T, idx: slice) -> T:
...
def __getitem__(self, idx):
if isinstance(idx, slice):
start, stop, step = idx.indices(len(self))
out = self.__class__()
for i in range(start, stop, step):
out.append(self[i])
return out
else:
idx = self._get_abs_string_index(idx)
return getattr(self, str(idx))
def __setitem__(self, idx: int, param: Any) -> None:
# Note that all other function that add an entry to the list part of
# the ParameterList end up here. So this is the only place where we need
# to wrap things into Parameter if needed.
# Objects added via setattr() are not in the list part and thus won't
# call into this function.
idx = self._get_abs_string_index(idx)
if isinstance(param, torch.Tensor) and not isinstance(param, Parameter):
param = Parameter(param)
return setattr(self, str(idx), param)
def __len__(self) -> int:
return self._size
def __iter__(self) -> Iterator[Any]:
return iter(self[i] for i in range(len(self)))
def __iadd__(self, parameters: Iterable[Any]) -> Self:
return self.extend(parameters)
def __dir__(self):
keys = super().__dir__()
keys = [key for key in keys if not key.isdigit()]
return keys
def append(self, value: Any) -> "ParameterList":
"""Append a given value at the end of the list.
Args:
value (Any): value to append
"""
new_idx = len(self)
self._size += 1
self[new_idx] = value
return self
def extend(self, values: Iterable[Any]) -> Self:
"""Append values from a Python iterable to the end of the list.
Args:
values (iterable): iterable of values to append
"""
# Tensor is an iterable but we never want to unpack it here
if not isinstance(values, container_abcs.Iterable) or isinstance(
values, torch.Tensor
):
raise TypeError(
"ParameterList.extend should be called with an "
"iterable, but got " + type(values).__name__
)
for value in values:
self.append(value)
return self
def extra_repr(self) -> str:
child_lines = []
for k, p in enumerate(self):
if isinstance(p, torch.Tensor):
size_str = "x".join(str(size) for size in p.size())
if p.device.type in ["cuda", torch._C._get_privateuse1_backend_name()]:
device_str = f" ({p.device})"
else:
device_str = ""
parastr = "{} containing: [{} of size {}{}]".format(
"Parameter" if isinstance(p, Parameter) else "Tensor",
p.dtype,
size_str,
device_str,
)
child_lines.append(" (" + str(k) + "): " + parastr)
else:
child_lines.append(
" (" + str(k) + "): Object of type: " + type(p).__name__
)
tmpstr = "\n".join(child_lines)
return tmpstr
def __call__(self, *args, **kwargs):
raise RuntimeError("ParameterList should not be called.")
class ParameterDict(Module):
    r"""Holds parameters in a dictionary.

    ParameterDict can be indexed like a regular Python dictionary, but Parameters it
    contains are properly registered, and will be visible by all Module methods.
    Other objects are treated as would be done by a regular Python dictionary

    :class:`~torch.nn.ParameterDict` is an **ordered** dictionary.
    :meth:`~torch.nn.ParameterDict.update` with other unordered mapping
    types (e.g., Python's plain ``dict``) does not preserve the order of the
    merged mapping. On the other hand, ``OrderedDict`` or another :class:`~torch.nn.ParameterDict`
    will preserve their ordering.

    Note that the constructor, assigning an element of the dictionary and the
    :meth:`~torch.nn.ParameterDict.update` method will convert any :class:`~torch.Tensor` into
    :class:`~torch.nn.Parameter`.

    Args:
        values (iterable, optional): a mapping (dictionary) of
            (string : Any) or an iterable of key-value pairs
            of type (string, Any)

    Example::

        class MyModule(nn.Module):
            def __init__(self) -> None:
                super().__init__()
                self.params = nn.ParameterDict({
                        'left': nn.Parameter(torch.randn(5, 10)),
                        'right': nn.Parameter(torch.randn(5, 10))
                })

            def forward(self, x, choice):
                x = self.params[choice].mm(x)
                return x
    """

    def __init__(self, parameters: Any = None) -> None:
        super().__init__()
        # Insertion-ordered set of keys in the dictionary part; the values
        # themselves are stored as attributes of the same name.
        self._keys: Dict[str, None] = {}
        if parameters is not None:
            self.update(parameters)

    def _key_to_attr(self, key: str) -> str:
        """Validate ``key`` and return the attribute name it is stored under.

        Raises:
            TypeError: if ``key`` is not a string.
        """
        if not isinstance(key, str):
            raise TypeError(
                "Index given to ParameterDict cannot be used as a key as it is "
                f"not a string (type is '{type(key).__name__}'). Open an issue on "
                "github if you need non-string keys."
            )
        else:
            # Use the key as-is so that `.named_parameters()` returns the right thing
            return key

    def __getitem__(self, key: str) -> Any:
        attr = self._key_to_attr(key)
        return getattr(self, attr)

    def __setitem__(self, key: str, value: Any) -> None:
        # Note that all other function that add an entry to the dictionary part of
        # the ParameterDict end up here. So this is the only place where we need
        # to wrap things into Parameter if needed.
        # Objects added via setattr() are not in the dictionary part and thus won't
        # call into this function.

        # Validate before recording the key: registering it first left a
        # rejected (non-string) key behind in ``_keys``, so ``len`` and
        # iteration reported an entry that could never be read.
        attr = self._key_to_attr(key)
        self._keys[key] = None
        if isinstance(value, torch.Tensor) and not isinstance(value, Parameter):
            value = Parameter(value)
        setattr(self, attr, value)

    def __delitem__(self, key: str) -> None:
        del self._keys[key]
        attr = self._key_to_attr(key)
        delattr(self, attr)

    def __len__(self) -> int:
        return len(self._keys)

    def __iter__(self) -> Iterator[str]:
        return iter(self._keys)

    def __reversed__(self) -> Iterator[str]:
        return reversed(list(self._keys))

    def copy(self) -> "ParameterDict":
        """Return a copy of this :class:`~torch.nn.ParameterDict` instance."""
        # We have to use an OrderedDict because the ParameterDict constructor
        # behaves differently on plain dict vs OrderedDict
        return ParameterDict(OrderedDict((k, self[k]) for k in self._keys))

    def __contains__(self, key: str) -> bool:
        return key in self._keys

    def setdefault(self, key: str, default: Optional[Any] = None) -> Any:
        """Set the default for a key in the Parameterdict.

        If key is in the ParameterDict, return its value.
        If not, insert `key` with a parameter `default` and return `default`.
        `default` defaults to `None`.

        Args:
            key (str): key to set default for
            default (Any): the parameter set to the key
        """
        if key not in self:
            self[key] = default
        return self[key]

    def clear(self) -> None:
        """Remove all items from the ParameterDict."""
        # Iterate over a copy because deleting entries mutates ``_keys``.
        for k in self._keys.copy():
            del self[k]

    def pop(self, key: str) -> Any:
        r"""Remove key from the ParameterDict and return its parameter.

        Args:
            key (str): key to pop from the ParameterDict
        """
        v = self[key]
        del self[key]
        return v

    def popitem(self) -> Tuple[str, Any]:
        """Remove and return the last inserted `(key, parameter)` pair from the ParameterDict."""
        k, _ = self._keys.popitem()
        # We need the key in the _keys to be able to access/del
        self._keys[k] = None
        val = self[k]
        del self[k]
        return k, val

    def get(self, key: str, default: Optional[Any] = None) -> Any:
        r"""Return the parameter associated with key if present. Otherwise return default if provided, None if not.

        Args:
            key (str): key to get from the ParameterDict
            default (Parameter, optional): value to return if key not present
        """
        return self[key] if key in self else default

    def fromkeys(
        self, keys: Iterable[str], default: Optional[Any] = None
    ) -> "ParameterDict":
        r"""Return a new ParameterDict with the keys provided.

        Args:
            keys (iterable, string): keys to make the new ParameterDict from
            default (Parameter, optional): value to set for all keys
        """
        return ParameterDict((k, default) for k in keys)

    def keys(self) -> Iterable[str]:
        r"""Return an iterable of the ParameterDict keys."""
        return self._keys.keys()

    def items(self) -> Iterable[Tuple[str, Any]]:
        r"""Return an iterable of the ParameterDict key/value pairs."""
        return ((k, self[k]) for k in self._keys)

    def values(self) -> Iterable[Any]:
        r"""Return an iterable of the ParameterDict values."""
        return (self[k] for k in self._keys)

    def update(self, parameters: Union[Mapping[str, Any], "ParameterDict"]) -> None:
        r"""Update the :class:`~torch.nn.ParameterDict` with key-value pairs from ``parameters``, overwriting existing keys.

        .. note::
            If :attr:`parameters` is an ``OrderedDict``, a :class:`~torch.nn.ParameterDict`, or
            an iterable of key-value pairs, the order of new elements in it is preserved.

        Args:
            parameters (iterable): a mapping (dictionary) from string to
                :class:`~torch.nn.Parameter`, or an iterable of
                key-value pairs of type (string, :class:`~torch.nn.Parameter`)

        Raises:
            TypeError: if ``parameters`` is not iterable, or one of its
                elements is not iterable.
            ValueError: if an element does not have exactly two items.
        """
        if not isinstance(parameters, container_abcs.Iterable):
            raise TypeError(
                "ParameterDict.update should be called with an "
                "iterable of key/value pairs, but got " + type(parameters).__name__
            )

        if isinstance(parameters, (OrderedDict, ParameterDict)):
            for key, parameter in parameters.items():
                self[key] = parameter
        elif isinstance(parameters, container_abcs.Mapping):
            # Other mappings are merged in sorted key order.
            for key, parameter in sorted(parameters.items()):
                self[key] = parameter
        else:
            for j, p in enumerate(parameters):
                if not isinstance(p, container_abcs.Iterable):
                    raise TypeError(
                        "ParameterDict update sequence element "
                        "#" + str(j) + " should be Iterable; is " + type(p).__name__
                    )
                if not len(p) == 2:
                    raise ValueError(
                        "ParameterDict update sequence element "
                        "#" + str(j) + " has length " + str(len(p)) + "; 2 is required"
                    )
                # parameters as length-2 list too cumbersome to type, see ModuleDict.update comment
                self[p[0]] = p[1]  # type: ignore[assignment]

    def extra_repr(self) -> str:
        """Describe each entry on its own line: tensor type, size and device for tensors, type name otherwise."""
        child_lines = []
        for k, p in self.items():
            if isinstance(p, torch.Tensor):
                size_str = "x".join(str(size) for size in p.size())
                if p.device.type in ["cuda", torch._C._get_privateuse1_backend_name()]:
                    device_str = f" ({p.device})"
                else:
                    device_str = ""
                parastr = "{} containing: [{} of size {}{}]".format(
                    "Parameter" if isinstance(p, Parameter) else "Tensor",
                    torch.typename(p),
                    size_str,
                    device_str,
                )
                child_lines.append(" (" + str(k) + "): " + parastr)
            else:
                child_lines.append(
                    " (" + str(k) + "): Object of type: " + type(p).__name__
                )
        tmpstr = "\n".join(child_lines)
        return tmpstr

    def __call__(self, input):
        raise RuntimeError("ParameterDict should not be called.")

    def __or__(self, other: "ParameterDict") -> "ParameterDict":
        """Return a copy of ``self`` updated with ``other`` (``self | other``)."""
        copy = self.copy()
        copy.update(other)
        return copy

    def __ror__(self, other: "ParameterDict") -> "ParameterDict":
        """Return a copy of ``other`` updated with ``self`` (``other | self``)."""
        copy = other.copy()
        copy.update(self)
        return copy

    def __ior__(self, other: "ParameterDict") -> Self:
        """Update ``self`` in place with ``other`` (``self |= other``)."""
        self.update(other)
        return self

File diff suppressed because it is too large Load Diff

View File

@ -0,0 +1,93 @@
import torch.nn.functional as F
from torch import Tensor
from .module import Module
# Names exported by ``from <this module> import *``.
__all__ = ["PairwiseDistance", "CosineSimilarity"]
class PairwiseDistance(Module):
    r"""
    Computes the pairwise distance between input vectors, or between columns of input matrices.

    Distances are computed using ``p``-norm, with constant ``eps`` added to avoid division by zero
    if ``p`` is negative, i.e.:

    .. math ::
        \mathrm{dist}\left(x, y\right) = \left\Vert x-y + \epsilon e \right\Vert_p,

    where :math:`e` is the vector of ones and the ``p``-norm is given by.

    .. math ::
        \Vert x \Vert _p = \left( \sum_{i=1}^n  \vert x_i \vert ^ p \right) ^ {1/p}.

    Args:
        p (real, optional): the norm degree. Can be negative. Default: 2
        eps (float, optional): Small value to avoid division by zero.
            Default: 1e-6
        keepdim (bool, optional): Determines whether or not to keep the vector dimension.
            Default: False

    Shape:
        - Input1: :math:`(N, D)` or :math:`(D)` where `N = batch dimension` and `D = vector dimension`
        - Input2: :math:`(N, D)` or :math:`(D)`, same shape as the Input1
        - Output: :math:`(N)` or :math:`()` based on input dimension.
          If :attr:`keepdim` is ``True``, then :math:`(N, 1)` or :math:`(1)` based on input dimension.

    Examples::

        >>> pdist = nn.PairwiseDistance(p=2)
        >>> input1 = torch.randn(100, 128)
        >>> input2 = torch.randn(100, 128)
        >>> output = pdist(input1, input2)
    """

    __constants__ = ["norm", "eps", "keepdim"]
    norm: float
    eps: float
    keepdim: bool

    def __init__(
        self, p: float = 2.0, eps: float = 1e-6, keepdim: bool = False
    ) -> None:
        super().__init__()
        # The norm degree ``p`` is stored under the attribute name ``norm``.
        self.norm, self.eps, self.keepdim = p, eps, keepdim

    def forward(self, x1: Tensor, x2: Tensor) -> Tensor:
        """Return ``F.pairwise_distance`` of ``x1`` and ``x2`` using the stored settings."""
        return F.pairwise_distance(
            x1, x2, p=self.norm, eps=self.eps, keepdim=self.keepdim
        )
class CosineSimilarity(Module):
    r"""Returns cosine similarity between :math:`x_1` and :math:`x_2`, computed along `dim`.

    .. math ::
        \text{similarity} = \dfrac{x_1 \cdot x_2}{\max(\Vert x_1 \Vert _2 \cdot \Vert x_2 \Vert _2, \epsilon)}.

    Args:
        dim (int, optional): Dimension where cosine similarity is computed. Default: 1
        eps (float, optional): Small value to avoid division by zero.
            Default: 1e-8

    Shape:
        - Input1: :math:`(\ast_1, D, \ast_2)` where D is at position `dim`
        - Input2: :math:`(\ast_1, D, \ast_2)`, same number of dimensions as x1, matching x1 size at dimension `dim`,
              and broadcastable with x1 at other dimensions.
        - Output: :math:`(\ast_1, \ast_2)`

    Examples::

        >>> input1 = torch.randn(100, 128)
        >>> input2 = torch.randn(100, 128)
        >>> cos = nn.CosineSimilarity(dim=1, eps=1e-6)
        >>> output = cos(input1, input2)
    """

    __constants__ = ["dim", "eps"]
    dim: int
    eps: float

    def __init__(self, dim: int = 1, eps: float = 1e-8) -> None:
        super().__init__()
        self.dim, self.eps = dim, eps

    def forward(self, x1: Tensor, x2: Tensor) -> Tensor:
        """Return ``F.cosine_similarity`` of ``x1`` and ``x2`` using the stored ``dim`` and ``eps``."""
        return F.cosine_similarity(x1, x2, dim=self.dim, eps=self.eps)

View File

@ -0,0 +1,305 @@
import torch.nn.functional as F
from torch import Tensor
from .module import Module
# Names exported by ``from <this module> import *``.
__all__ = [
"Dropout",
"Dropout1d",
"Dropout2d",
"Dropout3d",
"AlphaDropout",
"FeatureAlphaDropout",
]
class _DropoutNd(Module):
    """Shared base for the dropout modules: validates and stores ``p`` and ``inplace``.

    Args:
        p: drop probability; must satisfy ``0 <= p <= 1``. Default: 0.5
        inplace: flag stored for subclasses to forward to the functional call.
            Default: ``False``

    Raises:
        ValueError: if ``p`` is below 0 or above 1.
    """

    __constants__ = ["p", "inplace"]
    p: float
    inplace: bool

    def __init__(self, p: float = 0.5, inplace: bool = False) -> None:
        super().__init__()
        out_of_range = p < 0 or p > 1
        if out_of_range:
            raise ValueError(
                f"dropout probability has to be between 0 and 1, but got {p}"
            )
        self.p, self.inplace = p, inplace

    def extra_repr(self) -> str:
        """Return the ``p=..., inplace=...`` fragment used in the module repr."""
        return f"p={self.p}, inplace={self.inplace}"
class Dropout(_DropoutNd):
    r"""During training, randomly zeroes some of the elements of the input tensor with probability :attr:`p`.

    The zeroed elements are chosen independently for each forward call and are sampled from a Bernoulli distribution.

    Each channel will be zeroed out independently on every forward call.

    This has proven to be an effective technique for regularization and
    preventing the co-adaptation of neurons as described in the paper
    `Improving neural networks by preventing co-adaptation of feature
    detectors`_ .

    Furthermore, the outputs are scaled by a factor of :math:`\frac{1}{1-p}` during
    training. This means that during evaluation the module simply computes an
    identity function.

    Args:
        p: probability of an element to be zeroed. Default: 0.5
        inplace: If set to ``True``, will do this operation in-place. Default: ``False``

    Shape:
        - Input: :math:`(*)`. Input can be of any shape
        - Output: :math:`(*)`. Output is of the same shape as input

    Examples::

        >>> m = nn.Dropout(p=0.2)
        >>> input = torch.randn(20, 16)
        >>> output = m(input)

    .. _Improving neural networks by preventing co-adaptation of feature
        detectors: https://arxiv.org/abs/1207.0580
    """

    def forward(self, input: Tensor) -> Tensor:
        """Apply ``F.dropout`` with this module's ``p``, training flag and ``inplace`` setting."""
        return F.dropout(
            input, p=self.p, training=self.training, inplace=self.inplace
        )
class Dropout1d(_DropoutNd):
    r"""Randomly zero out entire channels.

    A channel is a 1D feature map,
    e.g., the :math:`j`-th channel of the :math:`i`-th sample in the
    batched input is a 1D tensor :math:`\text{input}[i, j]`.

    Each channel will be zeroed out independently on every forward call with
    probability :attr:`p` using samples from a Bernoulli distribution.

    Usually the input comes from :class:`nn.Conv1d` modules.

    As described in the paper
    `Efficient Object Localization Using Convolutional Networks`_ ,
    if adjacent pixels within feature maps are strongly correlated
    (as is normally the case in early convolution layers) then i.i.d. dropout
    will not regularize the activations and will otherwise just result
    in an effective learning rate decrease.

    In this case, :func:`nn.Dropout1d` will help promote independence between
    feature maps and should be used instead.

    Args:
        p (float, optional): probability of an element to be zero-ed.
        inplace (bool, optional): If set to ``True``, will do this operation
            in-place

    Shape:
        - Input: :math:`(N, C, L)` or :math:`(C, L)`.
        - Output: :math:`(N, C, L)` or :math:`(C, L)` (same shape as input).

    Examples::

        >>> m = nn.Dropout1d(p=0.2)
        >>> input = torch.randn(20, 16, 32)
        >>> output = m(input)

    .. _Efficient Object Localization Using Convolutional Networks:
       https://arxiv.org/abs/1411.4280
    """

    def forward(self, input: Tensor) -> Tensor:
        """Apply ``F.dropout1d`` with this module's ``p``, training flag and ``inplace`` setting."""
        return F.dropout1d(
            input, p=self.p, training=self.training, inplace=self.inplace
        )
class Dropout2d(_DropoutNd):
    r"""Randomly zero out entire channels.

    A channel is a 2D feature map,
    e.g., the :math:`j`-th channel of the :math:`i`-th sample in the
    batched input is a 2D tensor :math:`\text{input}[i, j]`.

    Each channel will be zeroed out independently on every forward call with
    probability :attr:`p` using samples from a Bernoulli distribution.

    Usually the input comes from :class:`nn.Conv2d` modules.

    As described in the paper
    `Efficient Object Localization Using Convolutional Networks`_ ,
    if adjacent pixels within feature maps are strongly correlated
    (as is normally the case in early convolution layers) then i.i.d. dropout
    will not regularize the activations and will otherwise just result
    in an effective learning rate decrease.

    In this case, :func:`nn.Dropout2d` will help promote independence between
    feature maps and should be used instead.

    Args:
        p (float, optional): probability of an element to be zero-ed.
        inplace (bool, optional): If set to ``True``, will do this operation
            in-place

    .. warning ::
        Due to historical reasons, this class will perform 1D channel-wise dropout
        for 3D inputs (as done by :class:`nn.Dropout1d`). Thus, it currently does NOT
        support inputs without a batch dimension of shape :math:`(C, H, W)`. This
        behavior will change in a future release to interpret 3D inputs as no-batch-dim
        inputs. To maintain the old behavior, switch to :class:`nn.Dropout1d`.

    Shape:
        - Input: :math:`(N, C, H, W)` or :math:`(N, C, L)`.
        - Output: :math:`(N, C, H, W)` or :math:`(N, C, L)` (same shape as input).

    Examples::

        >>> m = nn.Dropout2d(p=0.2)
        >>> input = torch.randn(20, 16, 32, 32)
        >>> output = m(input)

    .. _Efficient Object Localization Using Convolutional Networks:
       https://arxiv.org/abs/1411.4280
    """

    def forward(self, input: Tensor) -> Tensor:
        """Apply ``F.dropout2d`` with this module's ``p``, training flag and ``inplace`` setting."""
        return F.dropout2d(
            input, p=self.p, training=self.training, inplace=self.inplace
        )
class Dropout3d(_DropoutNd):
    r"""Randomly zero out entire channels.

    A channel is a 3D feature map,
    e.g., the :math:`j`-th channel of the :math:`i`-th sample in the
    batched input is a 3D tensor :math:`\text{input}[i, j]`.

    Each channel will be zeroed out independently on every forward call with
    probability :attr:`p` using samples from a Bernoulli distribution.

    Usually the input comes from :class:`nn.Conv3d` modules.

    As described in the paper
    `Efficient Object Localization Using Convolutional Networks`_ ,
    if adjacent pixels within feature maps are strongly correlated
    (as is normally the case in early convolution layers) then i.i.d. dropout
    will not regularize the activations and will otherwise just result
    in an effective learning rate decrease.

    In this case, :func:`nn.Dropout3d` will help promote independence between
    feature maps and should be used instead.

    Args:
        p (float, optional): probability of an element to be zeroed.
        inplace (bool, optional): If set to ``True``, will do this operation
            in-place

    Shape:
        - Input: :math:`(N, C, D, H, W)` or :math:`(C, D, H, W)`.
        - Output: :math:`(N, C, D, H, W)` or :math:`(C, D, H, W)` (same shape as input).

    Examples::

        >>> m = nn.Dropout3d(p=0.2)
        >>> input = torch.randn(20, 16, 4, 32, 32)
        >>> output = m(input)

    .. _Efficient Object Localization Using Convolutional Networks:
       https://arxiv.org/abs/1411.4280
    """

    def forward(self, input: Tensor) -> Tensor:
        """Apply ``F.dropout3d`` with this module's ``p``, training flag and ``inplace`` setting."""
        return F.dropout3d(
            input, p=self.p, training=self.training, inplace=self.inplace
        )
class AlphaDropout(_DropoutNd):
    r"""Applies Alpha Dropout over the input.

    Alpha Dropout is a type of Dropout that maintains the self-normalizing
    property.
    For an input with zero mean and unit standard deviation, the output of
    Alpha Dropout maintains the original mean and standard deviation of the
    input.
    Alpha Dropout goes hand-in-hand with SELU activation function, which ensures
    that the outputs have zero mean and unit standard deviation.

    During training, it randomly masks some of the elements of the input
    tensor with probability *p* using samples from a bernoulli distribution.
    The elements to be masked are randomized on every forward call, and scaled
    and shifted to maintain zero mean and unit standard deviation.

    During evaluation the module simply computes an identity function.

    More details can be found in the paper `Self-Normalizing Neural Networks`_ .

    Args:
        p (float): probability of an element to be dropped. Default: 0.5
        inplace (bool, optional): If set to ``True``, will do this operation
            in-place

    Shape:
        - Input: :math:`(*)`. Input can be of any shape
        - Output: :math:`(*)`. Output is of the same shape as input

    Examples::

        >>> m = nn.AlphaDropout(p=0.2)
        >>> input = torch.randn(20, 16)
        >>> output = m(input)

    .. _Self-Normalizing Neural Networks: https://arxiv.org/abs/1706.02515
    """

    def forward(self, input: Tensor) -> Tensor:
        """Apply ``F.alpha_dropout`` with this module's ``p``, training flag and ``inplace`` setting."""
        # ``inplace`` is accepted by the constructor and shown in the repr, so
        # it must reach the functional call; it was previously dropped here.
        return F.alpha_dropout(input, self.p, self.training, self.inplace)
class FeatureAlphaDropout(_DropoutNd):
    r"""Randomly masks out entire channels.

    A channel is a feature map,
    e.g. the :math:`j`-th channel of the :math:`i`-th sample in the batch input
    is a tensor :math:`\text{input}[i, j]` of the input tensor. Instead of
    setting activations to zero, as in regular Dropout, the activations are set
    to the negative saturation value of the SELU activation function. More details
    can be found in the paper `Self-Normalizing Neural Networks`_ .

    Each channel will be masked independently for each sample on every forward
    call with probability :attr:`p` using samples from a Bernoulli distribution.
    The channels to be masked are randomized on every forward call, and scaled
    and shifted to maintain zero mean and unit variance.

    Usually the input comes from :class:`nn.AlphaDropout` modules.

    As described in the paper
    `Efficient Object Localization Using Convolutional Networks`_ ,
    if adjacent pixels within feature maps are strongly correlated
    (as is normally the case in early convolution layers) then i.i.d. dropout
    will not regularize the activations and will otherwise just result
    in an effective learning rate decrease.

    In this case, :class:`nn.FeatureAlphaDropout` will help promote independence
    between feature maps and should be used instead.

    Args:
        p (float, optional): probability of a channel to be masked. Default: 0.5
        inplace (bool, optional): If set to ``True``, will do this operation
            in-place

    Shape:
        - Input: :math:`(N, C, D, H, W)` or :math:`(C, D, H, W)`.
        - Output: :math:`(N, C, D, H, W)` or :math:`(C, D, H, W)` (same shape as input).

    Examples::

        >>> m = nn.FeatureAlphaDropout(p=0.2)
        >>> input = torch.randn(20, 16, 4, 32, 32)
        >>> output = m(input)

    .. _Self-Normalizing Neural Networks: https://arxiv.org/abs/1706.02515
    .. _Efficient Object Localization Using Convolutional Networks:
       https://arxiv.org/abs/1411.4280
    """

    def forward(self, input: Tensor) -> Tensor:
        """Run feature alpha dropout on ``input`` using ``p``, ``training`` and ``inplace``."""
        # Forward ``inplace`` so the documented in-place option is honoured
        # instead of being silently dropped.
        return F.feature_alpha_dropout(input, self.p, self.training, self.inplace)

View File

@ -0,0 +1,158 @@
# mypy: allow-untyped-defs
from typing import Tuple, Union
from torch import Tensor
from torch.types import _size
from .module import Module
# Names exported by ``from <this module> import *``.
__all__ = ["Flatten", "Unflatten"]
class Flatten(Module):
    r"""
    Collapses a contiguous range of dimensions of the input into a single one.

    Intended for use inside :class:`~nn.Sequential`; see :meth:`torch.flatten`
    for the details of the underlying operation.

    Shape:
        - Input: :math:`(*, S_{\text{start}},..., S_{i}, ..., S_{\text{end}}, *)`,
          where :math:`S_{i}` is the size at dimension :math:`i` and :math:`*` means any
          number of dimensions including none.
        - Output: :math:`(*, \prod_{i=\text{start}}^{\text{end}} S_{i}, *)`.

    Args:
        start_dim: first dim to flatten (default = 1).
        end_dim: last dim to flatten (default = -1).

    Examples::

        >>> input = torch.randn(32, 1, 5, 5)
        >>> # With default parameters
        >>> m = nn.Flatten()
        >>> output = m(input)
        >>> output.size()
        torch.Size([32, 25])
        >>> # With non-default parameters
        >>> m = nn.Flatten(0, 2)
        >>> output = m(input)
        >>> output.size()
        torch.Size([160, 5])
    """

    __constants__ = ["start_dim", "end_dim"]
    start_dim: int
    end_dim: int

    def __init__(self, start_dim: int = 1, end_dim: int = -1) -> None:
        """Remember the inclusive dimension range that ``forward`` will collapse."""
        super().__init__()
        self.start_dim, self.end_dim = start_dim, end_dim

    def forward(self, input: Tensor) -> Tensor:
        """Return ``input`` with dims ``start_dim..end_dim`` merged into one."""
        first = self.start_dim
        last = self.end_dim
        return input.flatten(first, last)

    def extra_repr(self) -> str:
        """Summarise the configured range for the module's ``repr``."""
        return "start_dim={}, end_dim={}".format(self.start_dim, self.end_dim)
class Unflatten(Module):
    r"""
    Unflattens a tensor dim expanding it to a desired shape. For use with :class:`~nn.Sequential`.

    * :attr:`dim` specifies the dimension of the input tensor to be unflattened, and it can
      be either `int` or `str` when `Tensor` or `NamedTensor` is used, respectively.

    * :attr:`unflattened_size` is the new shape of the unflattened dimension of the tensor and it can be
      a `tuple` of ints or a `list` of ints or `torch.Size` for `Tensor` input; a `NamedShape`
      (tuple of `(name, size)` tuples) for `NamedTensor` input.

    Shape:
        - Input: :math:`(*, S_{\text{dim}}, *)`, where :math:`S_{\text{dim}}` is the size at
          dimension :attr:`dim` and :math:`*` means any number of dimensions including none.
        - Output: :math:`(*, U_1, ..., U_n, *)`, where :math:`U` = :attr:`unflattened_size` and
          :math:`\prod_{i=1}^n U_i = S_{\text{dim}}`.

    Args:
        dim (Union[int, str]): Dimension to be unflattened
        unflattened_size (Union[torch.Size, Tuple, List, NamedShape]): New shape of the unflattened dimension

    Raises:
        TypeError: if :attr:`dim` is neither ``int`` nor ``str``, or if
            :attr:`unflattened_size` does not have the container/element types
            required for the given kind of :attr:`dim`.

    Examples::

        >>> input = torch.randn(2, 50)
        >>> # With tuple of ints
        >>> m = nn.Sequential(
        ...     nn.Linear(50, 50),
        ...     nn.Unflatten(1, (2, 5, 5))
        ... )
        >>> output = m(input)
        >>> output.size()
        torch.Size([2, 2, 5, 5])
        >>> # With torch.Size
        >>> m = nn.Sequential(
        ...     nn.Linear(50, 50),
        ...     nn.Unflatten(1, torch.Size([2, 5, 5]))
        ... )
        >>> output = m(input)
        >>> output.size()
        torch.Size([2, 2, 5, 5])
        >>> # With namedshape (tuple of tuples)
        >>> input = torch.randn(2, 50, names=('N', 'features'))
        >>> unflatten = nn.Unflatten('features', (('C', 2), ('H', 5), ('W', 5)))
        >>> output = unflatten(input)
        >>> output.size()
        torch.Size([2, 2, 5, 5])
    """

    # A named shape holds any number of ``(name, size)`` pairs, so the alias
    # must be variadic; ``Tuple[Tuple[str, int]]`` would mean exactly one pair.
    NamedShape = Tuple[Tuple[str, int], ...]

    __constants__ = ["dim", "unflattened_size"]
    dim: Union[int, str]
    unflattened_size: Union[_size, NamedShape]

    def __init__(
        self, dim: Union[int, str], unflattened_size: Union[_size, NamedShape]
    ) -> None:
        super().__init__()

        # The kind of ``dim`` decides which shape format is acceptable:
        # positional dims take ints, named dims take (name, size) pairs.
        if isinstance(dim, int):
            self._require_tuple_int(unflattened_size)
        elif isinstance(dim, str):
            self._require_tuple_tuple(unflattened_size)
        else:
            raise TypeError("invalid argument type for dim parameter")

        self.dim = dim
        self.unflattened_size = unflattened_size

    def _require_tuple_tuple(self, input):
        """Raise ``TypeError`` unless ``input`` is a tuple whose elements are all tuples."""
        if isinstance(input, tuple):
            for idx, elem in enumerate(input):
                if not isinstance(elem, tuple):
                    raise TypeError(
                        "unflattened_size must be tuple of tuples, "
                        + f"but found element of type {type(elem).__name__} at pos {idx}"
                    )
            return
        raise TypeError(
            "unflattened_size must be a tuple of tuples, "
            + f"but found type {type(input).__name__}"
        )

    def _require_tuple_int(self, input):
        """Raise ``TypeError`` unless ``input`` is a tuple or list whose elements are all ints."""
        if isinstance(input, (tuple, list)):
            for idx, elem in enumerate(input):
                if not isinstance(elem, int):
                    raise TypeError(
                        "unflattened_size must be tuple of ints, "
                        + f"but found element of type {type(elem).__name__} at pos {idx}"
                    )
            return
        raise TypeError(
            f"unflattened_size must be a tuple of ints, but found type {type(input).__name__}"
        )

    def forward(self, input: Tensor) -> Tensor:
        """Return ``input`` with dimension ``dim`` expanded to ``unflattened_size``."""
        return input.unflatten(self.dim, self.unflattened_size)

    def extra_repr(self) -> str:
        """Summarise ``dim`` and ``unflattened_size`` for the module's ``repr``."""
        return f"dim={self.dim}, unflattened_size={self.unflattened_size}"

View File

@ -0,0 +1,315 @@
import torch.nn.functional as F
from torch import Tensor
from torch.nn.common_types import _size_any_t
from .module import Module
# Names exported by ``from <this module> import *``.
__all__ = ["Fold", "Unfold"]
class Fold(Module):
    r"""Combines an array of sliding local blocks into a large containing tensor.

    Consider a batched :attr:`input` tensor containing sliding local blocks,
    e.g., patches of images, of shape :math:`(N, C \times  \prod(\text{kernel\_size}), L)`,
    where :math:`N` is batch dimension, :math:`C \times \prod(\text{kernel\_size})`
    is the number of values within a block (a block has :math:`\prod(\text{kernel\_size})`
    spatial locations each containing a :math:`C`-channeled vector), and
    :math:`L` is the total number of blocks. (This is exactly the
    same specification as the output shape of :class:`~torch.nn.Unfold`.) This
    operation combines these local blocks into the large :attr:`output` tensor
    of shape :math:`(N, C, \text{output\_size}[0], \text{output\_size}[1], \dots)`
    by summing the overlapping values. Similar to :class:`~torch.nn.Unfold`, the
    arguments must satisfy

    .. math::
        L = \prod_d \left\lfloor\frac{\text{output\_size}[d] + 2 \times \text{padding}[d] %
            - \text{dilation}[d] \times (\text{kernel\_size}[d] - 1) - 1}{\text{stride}[d]} + 1\right\rfloor,

    where :math:`d` is over all spatial dimensions.

    * :attr:`output_size` describes the spatial shape of the large containing
      tensor of the sliding local blocks. It is useful to resolve the ambiguity
      when multiple input shapes map to same number of sliding blocks, e.g.,
      with ``stride > 0``.

    The :attr:`padding`, :attr:`stride` and :attr:`dilation` arguments specify
    how the sliding blocks are retrieved.

    * :attr:`stride` controls the stride for the sliding blocks.

    * :attr:`padding` controls the amount of implicit zero-paddings on both
      sides for :attr:`padding` number of points for each dimension before
      reshaping.
""" """
    * :attr:`dilation` controls the spacing between the kernel points; also known as the \u00e0 trous algorithm.
      It is harder to describe, but this `link`_ has a nice visualization of what :attr:`dilation` does.
""" r"""
    Args:
        output_size (int or tuple): the shape of the spatial dimensions of the
                                    output (i.e., ``output.sizes()[2:]``)
        kernel_size (int or tuple): the size of the sliding blocks
        dilation (int or tuple, optional): a parameter that controls the
                                           stride of elements within the
                                           neighborhood. Default: 1
        padding (int or tuple, optional): implicit zero padding to be added on
                                          both sides of input. Default: 0
        stride (int or tuple): the stride of the sliding blocks in the input
                               spatial dimensions. Default: 1

    * If :attr:`output_size`, :attr:`kernel_size`, :attr:`dilation`,
      :attr:`padding` or :attr:`stride` is an int or a tuple of length 1 then
      their values will be replicated across all spatial dimensions.

    * For the case of two output spatial dimensions this operation is sometimes
      called ``col2im``.

    .. note::
        :class:`~torch.nn.Fold` calculates each combined value in the resulting
        large tensor by summing all values from all containing blocks.
        :class:`~torch.nn.Unfold` extracts the values in the local blocks by
        copying from the large tensor. So, if the blocks overlap, they are not
        inverses of each other.

        In general, folding and unfolding operations are related as
        follows. Consider :class:`~torch.nn.Fold` and
        :class:`~torch.nn.Unfold` instances created with the same
        parameters:

        >>> fold_params = dict(kernel_size=..., dilation=..., padding=..., stride=...)
        >>> fold = nn.Fold(output_size=..., **fold_params)
        >>> unfold = nn.Unfold(**fold_params)

        Then for any (supported) ``input`` tensor the following
        equality holds:

        ::

            fold(unfold(input)) == divisor * input

        where ``divisor`` is a tensor that depends only on the shape
        and dtype of the ``input``:

        >>> # xdoctest: +SKIP
        >>> input_ones = torch.ones(input.shape, dtype=input.dtype)
        >>> divisor = fold(unfold(input_ones))

        When the ``divisor`` tensor contains no zero elements, then
        ``fold`` and ``unfold`` operations are inverses of each
        other (up to constant divisor).

    .. warning::
        Currently, only unbatched (3D) or batched (4D) image-like output tensors are supported.

    Shape:
        - Input: :math:`(N, C \times \prod(\text{kernel\_size}), L)` or :math:`(C \times \prod(\text{kernel\_size}), L)`
        - Output: :math:`(N, C, \text{output\_size}[0], \text{output\_size}[1], \dots)`
          or :math:`(C, \text{output\_size}[0], \text{output\_size}[1], \dots)` as described above

    Examples::

        >>> fold = nn.Fold(output_size=(4, 5), kernel_size=(2, 2))
        >>> input = torch.randn(1, 3 * 2 * 2, 12)
        >>> output = fold(input)
        >>> output.size()
        torch.Size([1, 3, 4, 5])

    .. _link:
        https://github.com/vdumoulin/conv_arithmetic/blob/master/README.md

    """

    __constants__ = ["output_size", "kernel_size", "dilation", "padding", "stride"]
    output_size: _size_any_t
    kernel_size: _size_any_t
    dilation: _size_any_t
    padding: _size_any_t
    stride: _size_any_t

    def __init__(
        self,
        output_size: _size_any_t,
        kernel_size: _size_any_t,
        dilation: _size_any_t = 1,
        padding: _size_any_t = 0,
        stride: _size_any_t = 1,
    ) -> None:
        """Store the folding geometry; the values are only consumed in ``forward``."""
        super().__init__()
        self.output_size, self.kernel_size = output_size, kernel_size
        self.dilation, self.padding, self.stride = dilation, padding, stride

    def forward(self, input: Tensor) -> Tensor:
        """Fold ``input`` with the stored geometry via :func:`torch.nn.functional.fold`."""
        return F.fold(
            input,
            output_size=self.output_size,
            kernel_size=self.kernel_size,
            dilation=self.dilation,
            padding=self.padding,
            stride=self.stride,
        )

    def extra_repr(self) -> str:
        """List the five geometry settings, in constructor order, for ``repr``."""
        shown = ("output_size", "kernel_size", "dilation", "padding", "stride")
        return ", ".join(f"{field}={getattr(self, field)}" for field in shown)
class Unfold(Module):
    r"""Extracts sliding local blocks from a batched input tensor.

    Consider a batched :attr:`input` tensor of shape :math:`(N, C, *)`,
    where :math:`N` is the batch dimension, :math:`C` is the channel dimension,
    and :math:`*` represent arbitrary spatial dimensions. This operation flattens
    each sliding :attr:`kernel_size`-sized block within the spatial dimensions
    of :attr:`input` into a column (i.e., last dimension) of a 3-D :attr:`output`
    tensor of shape :math:`(N, C \times \prod(\text{kernel\_size}), L)`, where
    :math:`C \times \prod(\text{kernel\_size})` is the total number of values
    within each block (a block has :math:`\prod(\text{kernel\_size})` spatial
    locations each containing a :math:`C`-channeled vector), and :math:`L` is
    the total number of such blocks:

    .. math::
        L = \prod_d \left\lfloor\frac{\text{spatial\_size}[d] + 2 \times \text{padding}[d] %
            - \text{dilation}[d] \times (\text{kernel\_size}[d] - 1) - 1}{\text{stride}[d]} + 1\right\rfloor,

    where :math:`\text{spatial\_size}` is formed by the spatial dimensions
    of :attr:`input` (:math:`*` above), and :math:`d` is over all spatial
    dimensions.

    Therefore, indexing :attr:`output` at the last dimension (column dimension)
    gives all values within a certain block.

    The :attr:`padding`, :attr:`stride` and :attr:`dilation` arguments specify
    how the sliding blocks are retrieved.

    * :attr:`stride` controls the stride for the sliding blocks.

    * :attr:`padding` controls the amount of implicit zero-paddings on both
      sides for :attr:`padding` number of points for each dimension before
      reshaping.
""" """
    * :attr:`dilation` controls the spacing between the kernel points; also known as the \u00e0 trous algorithm.
      It is harder to describe, but this `link`_ has a nice visualization of what :attr:`dilation` does.
""" r"""
    Args:
        kernel_size (int or tuple): the size of the sliding blocks
        dilation (int or tuple, optional): a parameter that controls the
                                           stride of elements within the
                                           neighborhood. Default: 1
        padding (int or tuple, optional): implicit zero padding to be added on
                                          both sides of input. Default: 0
        stride (int or tuple, optional): the stride of the sliding blocks in the input
                                         spatial dimensions. Default: 1

    * If :attr:`kernel_size`, :attr:`dilation`, :attr:`padding` or
      :attr:`stride` is an int or a tuple of length 1, their values will be
      replicated across all spatial dimensions.

    * For the case of two input spatial dimensions this operation is sometimes
      called ``im2col``.

    .. note::
        :class:`~torch.nn.Fold` calculates each combined value in the resulting
        large tensor by summing all values from all containing blocks.
        :class:`~torch.nn.Unfold` extracts the values in the local blocks by
        copying from the large tensor. So, if the blocks overlap, they are not
        inverses of each other.

        In general, folding and unfolding operations are related as
        follows. Consider :class:`~torch.nn.Fold` and
        :class:`~torch.nn.Unfold` instances created with the same
        parameters:

        >>> fold_params = dict(kernel_size=..., dilation=..., padding=..., stride=...)
        >>> fold = nn.Fold(output_size=..., **fold_params)
        >>> unfold = nn.Unfold(**fold_params)

        Then for any (supported) ``input`` tensor the following
        equality holds:

        ::

            fold(unfold(input)) == divisor * input

        where ``divisor`` is a tensor that depends only on the shape
        and dtype of the ``input``:

        >>> # xdoctest: +SKIP
        >>> input_ones = torch.ones(input.shape, dtype=input.dtype)
        >>> divisor = fold(unfold(input_ones))

        When the ``divisor`` tensor contains no zero elements, then
        ``fold`` and ``unfold`` operations are inverses of each
        other (up to constant divisor).

    .. warning::
        Currently, only 4-D input tensors (batched image-like tensors) are
        supported.

    Shape:
        - Input: :math:`(N, C, *)`
        - Output: :math:`(N, C \times \prod(\text{kernel\_size}), L)` as described above

    Examples::

        >>> unfold = nn.Unfold(kernel_size=(2, 3))
        >>> input = torch.randn(2, 5, 3, 4)
        >>> output = unfold(input)
        >>> # each patch contains 30 values (2x3=6 vectors, each of 5 channels)
        >>> # 4 blocks (2x3 kernels) in total in the 3x4 input
        >>> output.size()
        torch.Size([2, 30, 4])

        >>> # xdoctest: +IGNORE_WANT
        >>> # Convolution is equivalent with Unfold + Matrix Multiplication + Fold (or view to output shape)
        >>> inp = torch.randn(1, 3, 10, 12)
        >>> w = torch.randn(2, 3, 4, 5)
        >>> inp_unf = torch.nn.functional.unfold(inp, (4, 5))
        >>> out_unf = inp_unf.transpose(1, 2).matmul(w.view(w.size(0), -1).t()).transpose(1, 2)
        >>> out = torch.nn.functional.fold(out_unf, (7, 8), (1, 1))
        >>> # or equivalently (and avoiding a copy),
        >>> # out = out_unf.view(1, 2, 7, 8)
        >>> (torch.nn.functional.conv2d(inp, w) - out).abs().max()
        tensor(1.9073e-06)

    .. _link:
        https://github.com/vdumoulin/conv_arithmetic/blob/master/README.md

    """

    __constants__ = ["kernel_size", "dilation", "padding", "stride"]
    kernel_size: _size_any_t
    dilation: _size_any_t
    padding: _size_any_t
    stride: _size_any_t

    def __init__(
        self,
        kernel_size: _size_any_t,
        dilation: _size_any_t = 1,
        padding: _size_any_t = 0,
        stride: _size_any_t = 1,
    ) -> None:
        """Store the sliding-window geometry; the values are only consumed in ``forward``."""
        super().__init__()
        self.kernel_size, self.dilation = kernel_size, dilation
        self.padding, self.stride = padding, stride

    def forward(self, input: Tensor) -> Tensor:
        """Unfold ``input`` with the stored geometry via :func:`torch.nn.functional.unfold`."""
        return F.unfold(
            input,
            kernel_size=self.kernel_size,
            dilation=self.dilation,
            padding=self.padding,
            stride=self.stride,
        )

    def extra_repr(self) -> str:
        """List the four geometry settings, in constructor order, for ``repr``."""
        shown = ("kernel_size", "dilation", "padding", "stride")
        return ", ".join(f"{field}={getattr(self, field)}" for field in shown)

View File

@ -0,0 +1,471 @@
# mypy: allow-untyped-defs
import warnings
import torch.nn.functional as F
from torch import Tensor
from .batchnorm import _LazyNormBase, _NormBase
# Names exported by ``from <this module> import *``.
__all__ = [
    "InstanceNorm1d",
    "InstanceNorm2d",
    "InstanceNorm3d",
    "LazyInstanceNorm1d",
    "LazyInstanceNorm2d",
    "LazyInstanceNorm3d",
]
class _InstanceNorm(_NormBase):
    """Common machinery for the instance-normalization modules in this file.

    Subclasses supply two hooks: ``_check_input_dim`` (validate the input's
    rank) and ``_get_no_batch_dim`` (rank of an input that has no batch
    dimension). Everything else -- the call into ``F.instance_norm``,
    unbatched-input handling and legacy state-dict cleanup -- lives here.
    """

    def __init__(
        self,
        num_features: int,
        eps: float = 1e-5,
        momentum: float = 0.1,
        affine: bool = False,
        track_running_stats: bool = False,
        device=None,
        dtype=None,
    ) -> None:
        super().__init__(
            num_features,
            eps,
            momentum,
            affine,
            track_running_stats,
            device=device,
            dtype=dtype,
        )

    def _check_input_dim(self, input):
        """Hook: subclasses validate the rank of ``input`` here."""
        raise NotImplementedError

    def _get_no_batch_dim(self):
        """Hook: subclasses return the rank of an input without a batch dim."""
        raise NotImplementedError

    def _handle_no_batch_input(self, input):
        """Normalize an unbatched input by temporarily adding a leading dim."""
        batched = input.unsqueeze(0)
        return self._apply_instance_norm(batched).squeeze(0)

    def _apply_instance_norm(self, input):
        """Call ``F.instance_norm`` with this module's buffers and settings."""
        # Input statistics are used while training, and always when no
        # running statistics are being tracked.
        use_input_stats = self.training or not self.track_running_stats
        momentum = 0.0 if self.momentum is None else self.momentum
        return F.instance_norm(
            input,
            self.running_mean,
            self.running_var,
            self.weight,
            self.bias,
            use_input_stats,
            momentum,
            self.eps,
        )

    def _load_from_state_dict(
        self,
        state_dict,
        prefix,
        local_metadata,
        strict,
        missing_keys,
        unexpected_keys,
        error_msgs,
    ):
        """Report and drop stale running-stat entries, then defer to the parent loader."""
        unversioned = local_metadata.get("version", None) is None
        # at version 1: removed running_mean and running_var when
        # track_running_stats=False (default)
        if unversioned and not self.track_running_stats:
            candidates = [prefix + stat for stat in ("running_mean", "running_var")]
            stale = [key for key in candidates if key in state_dict]
            if stale:
                klass = self.__class__.__name__
                names = " and ".join(f'"{k}"' for k in stale)
                error_msgs.append(
                    f"Unexpected running stats buffer(s) {names} for {klass} "
                    "with track_running_stats=False. If state_dict is a "
                    "checkpoint saved before 0.4.0, this may be expected "
                    f"because {klass} does not track running stats by default "
                    "since 0.4.0. Please remove these keys from state_dict. If "
                    "the running stats are actually needed, instead set "
                    f"track_running_stats=True in {klass} to enable them. See "
                    f"the documentation of {klass} for details."
                )
                # Remove the entries from the caller's dict before the
                # parent loader inspects it.
                for key in stale:
                    state_dict.pop(key)

        super()._load_from_state_dict(
            state_dict,
            prefix,
            local_metadata,
            strict,
            missing_keys,
            unexpected_keys,
            error_msgs,
        )

    def forward(self, input: Tensor) -> Tensor:
        """Validate ``input`` and apply instance normalization to it.

        Raises:
            ValueError: if ``affine`` is set and the size of the feature
                dimension differs from ``num_features``. Without ``affine``
                the mismatch only produces a warning.
        """
        self._check_input_dim(input)

        unbatched_rank = self._get_no_batch_dim()
        # Index of the feature dimension: 0 for unbatched input, 1 when a
        # leading batch dimension is present.
        feature_dim = input.dim() - unbatched_rank
        if input.size(feature_dim) != self.num_features:
            if not self.affine:
                warnings.warn(
                    f"input's size at dim={feature_dim} does not match num_features. "
                    "You can silence this warning by not passing in num_features, "
                    "which is not used because affine=False"
                )
            else:
                raise ValueError(
                    f"expected input's size at dim={feature_dim} to match num_features"
                    f" ({self.num_features}), but got: {input.size(feature_dim)}."
                )

        if input.dim() == unbatched_rank:
            return self._handle_no_batch_input(input)
        return self._apply_instance_norm(input)
class InstanceNorm1d(_InstanceNorm):
    r"""Applies Instance Normalization.

    This operation applies Instance Normalization
    over a 2D (unbatched) or 3D (batched) input as described in the paper
    `Instance Normalization: The Missing Ingredient for Fast Stylization
    <https://arxiv.org/abs/1607.08022>`__.

    .. math::

        y = \frac{x - \mathrm{E}[x]}{ \sqrt{\mathrm{Var}[x] + \epsilon}} * \gamma + \beta

    The mean and standard-deviation are calculated per-dimension separately
    for each object in a mini-batch. :math:`\gamma` and :math:`\beta` are learnable parameter vectors
    of size `C` (where `C` is the number of features or channels of the input) if :attr:`affine` is ``True``.
    The standard-deviation is calculated via the biased estimator, equivalent to
    `torch.var(input, unbiased=False)`.

    By default, this layer uses instance statistics computed from input data in
    both training and evaluation modes.

    If :attr:`track_running_stats` is set to ``True``, during training this
    layer keeps running estimates of its computed mean and variance, which are
    then used for normalization during evaluation. The running estimates are
    kept with a default :attr:`momentum` of 0.1.

    .. note::
        This :attr:`momentum` argument is different from one used in optimizer
        classes and the conventional notion of momentum. Mathematically, the
        update rule for running statistics here is
        :math:`\hat{x}_\text{new} = (1 - \text{momentum}) \times \hat{x} + \text{momentum} \times x_t`,
        where :math:`\hat{x}` is the estimated statistic and :math:`x_t` is the
        new observed value.

    .. note::
        :class:`InstanceNorm1d` and :class:`LayerNorm` are very similar, but
        have some subtle differences. :class:`InstanceNorm1d` is applied
        on each channel of channeled data like multidimensional time series, but
        :class:`LayerNorm` is usually applied on entire sample and often in NLP
        tasks. Additionally, :class:`LayerNorm` applies elementwise affine
        transform, while :class:`InstanceNorm1d` usually doesn't apply affine
        transform.

    Args:
        num_features: number of features or channels :math:`C` of the input
        eps: a value added to the denominator for numerical stability. Default: 1e-5
        momentum: the value used for the running_mean and running_var computation. Default: 0.1
        affine: a boolean value that when set to ``True``, this module has
            learnable affine parameters, initialized the same way as done for batch normalization.
            Default: ``False``.
        track_running_stats: a boolean value that when set to ``True``, this
            module tracks the running mean and variance, and when set to ``False``,
            this module does not track such statistics and always uses statistics
            computed from the input in both training and eval modes. Default: ``False``

    Shape:
        - Input: :math:`(N, C, L)` or :math:`(C, L)`
        - Output: :math:`(N, C, L)` or :math:`(C, L)` (same shape as input)

    Examples::

        >>> # Without Learnable Parameters
        >>> m = nn.InstanceNorm1d(100)
        >>> # With Learnable Parameters
        >>> m = nn.InstanceNorm1d(100, affine=True)
        >>> input = torch.randn(20, 100, 40)
        >>> output = m(input)
    """

    def _get_no_batch_dim(self):
        # An unbatched 1d sample is (C, L).
        return 2

    def _check_input_dim(self, input):
        """Accept only 2D (unbatched) or 3D (batched) inputs."""
        ndim = input.dim()
        if ndim != 2 and ndim != 3:
            raise ValueError(f"expected 2D or 3D input (got {ndim}D input)")
class LazyInstanceNorm1d(_LazyNormBase, _InstanceNorm):
    r"""A :class:`torch.nn.InstanceNorm1d` module with lazy initialization of the ``num_features`` argument.

    The ``num_features`` argument of the :class:`InstanceNorm1d` is inferred from the ``input.size(1)``.
    The attributes that will be lazily initialized are `weight`, `bias`, `running_mean` and `running_var`.

    Check the :class:`torch.nn.modules.lazy.LazyModuleMixin` for further documentation
    on lazy modules and their limitations.

    Args:
        eps: a value added to the denominator for numerical stability.
        momentum: the value used for the running_mean and running_var computation.
        affine: a boolean value that when set to ``True``, this module has
            learnable affine parameters, initialized the same way as done for batch normalization.
        track_running_stats: a boolean value that when set to ``True``, this
            module tracks the running mean and variance, and when set to ``False``,
            this module does not track such statistics and always uses statistics
            computed from the input in both training and eval modes.

    .. note::
        ``num_features`` (:math:`C` from an expected input of size
        :math:`(N, C, L)` or :math:`(C, L)`) is inferred rather than passed in.
        The constructor is inherited from the lazy norm base class, so the
        effective argument defaults are the ones defined there.

    Shape:
        - Input: :math:`(N, C, L)` or :math:`(C, L)`
        - Output: :math:`(N, C, L)` or :math:`(C, L)` (same shape as input)
    """

    # Concrete class associated with this lazy module (see LazyModuleMixin).
    cls_to_become = InstanceNorm1d  # type: ignore[assignment]

    def _get_no_batch_dim(self):
        # An unbatched 1d sample is (C, L).
        return 2

    def _check_input_dim(self, input):
        """Accept only 2D (unbatched) or 3D (batched) inputs."""
        ndim = input.dim()
        if ndim != 2 and ndim != 3:
            raise ValueError(f"expected 2D or 3D input (got {ndim}D input)")
class InstanceNorm2d(_InstanceNorm):
    r"""Applies Instance Normalization.

    This operation applies Instance Normalization
    over a 4D input (a mini-batch of 2D inputs
    with additional channel dimension) as described in the paper
    `Instance Normalization: The Missing Ingredient for Fast Stylization
    <https://arxiv.org/abs/1607.08022>`__.

    .. math::

        y = \frac{x - \mathrm{E}[x]}{ \sqrt{\mathrm{Var}[x] + \epsilon}} * \gamma + \beta

    The mean and standard-deviation are calculated per-dimension separately
    for each object in a mini-batch. :math:`\gamma` and :math:`\beta` are learnable parameter vectors
    of size `C` (where `C` is the input size) if :attr:`affine` is ``True``.
    The standard-deviation is calculated via the biased estimator, equivalent to
    `torch.var(input, unbiased=False)`.

    By default, this layer uses instance statistics computed from input data in
    both training and evaluation modes.

    If :attr:`track_running_stats` is set to ``True``, during training this
    layer keeps running estimates of its computed mean and variance, which are
    then used for normalization during evaluation. The running estimates are
    kept with a default :attr:`momentum` of 0.1.

    .. note::
        This :attr:`momentum` argument is different from one used in optimizer
        classes and the conventional notion of momentum. Mathematically, the
        update rule for running statistics here is
        :math:`\hat{x}_\text{new} = (1 - \text{momentum}) \times \hat{x} + \text{momentum} \times x_t`,
        where :math:`\hat{x}` is the estimated statistic and :math:`x_t` is the
        new observed value.

    .. note::
        :class:`InstanceNorm2d` and :class:`LayerNorm` are very similar, but
        have some subtle differences. :class:`InstanceNorm2d` is applied
        on each channel of channeled data like RGB images, but
        :class:`LayerNorm` is usually applied on entire sample and often in NLP
        tasks. Additionally, :class:`LayerNorm` applies elementwise affine
        transform, while :class:`InstanceNorm2d` usually doesn't apply affine
        transform.

    Args:
        num_features: :math:`C` from an expected input of size
            :math:`(N, C, H, W)` or :math:`(C, H, W)`
        eps: a value added to the denominator for numerical stability. Default: 1e-5
        momentum: the value used for the running_mean and running_var computation. Default: 0.1
        affine: a boolean value that when set to ``True``, this module has
            learnable affine parameters, initialized the same way as done for batch normalization.
            Default: ``False``.
        track_running_stats: a boolean value that when set to ``True``, this
            module tracks the running mean and variance, and when set to ``False``,
            this module does not track such statistics and always uses statistics
            computed from the input in both training and eval modes. Default: ``False``

    Shape:
        - Input: :math:`(N, C, H, W)` or :math:`(C, H, W)`
        - Output: :math:`(N, C, H, W)` or :math:`(C, H, W)` (same shape as input)

    Examples::

        >>> # Without Learnable Parameters
        >>> m = nn.InstanceNorm2d(100)
        >>> # With Learnable Parameters
        >>> m = nn.InstanceNorm2d(100, affine=True)
        >>> input = torch.randn(20, 100, 35, 45)
        >>> output = m(input)
    """

    def _get_no_batch_dim(self):
        # An unbatched 2d sample is (C, H, W).
        return 3

    def _check_input_dim(self, input):
        """Accept only 3D (unbatched) or 4D (batched) inputs."""
        ndim = input.dim()
        if ndim != 3 and ndim != 4:
            raise ValueError(f"expected 3D or 4D input (got {ndim}D input)")
class LazyInstanceNorm2d(_LazyNormBase, _InstanceNorm):
    r"""A :class:`torch.nn.InstanceNorm2d` module with lazy initialization of the ``num_features`` argument.

    The ``num_features`` argument of the :class:`InstanceNorm2d` is inferred from the ``input.size(1)``.
    The attributes that will be lazily initialized are `weight`, `bias`,
    `running_mean` and `running_var`.

    Check the :class:`torch.nn.modules.lazy.LazyModuleMixin` for further documentation
    on lazy modules and their limitations.

    Args:
        eps: a value added to the denominator for numerical stability.
        momentum: the value used for the running_mean and running_var computation.
        affine: a boolean value that when set to ``True``, this module has
            learnable affine parameters, initialized the same way as done for batch normalization.
        track_running_stats: a boolean value that when set to ``True``, this
            module tracks the running mean and variance, and when set to ``False``,
            this module does not track such statistics and always uses statistics
            computed from the input in both training and eval modes.

    .. note::
        ``num_features`` (:math:`C` from an expected input of size
        :math:`(N, C, H, W)` or :math:`(C, H, W)`) is inferred rather than
        passed in. The constructor is inherited from the lazy norm base class,
        so the effective argument defaults are the ones defined there.

    Shape:
        - Input: :math:`(N, C, H, W)` or :math:`(C, H, W)`
        - Output: :math:`(N, C, H, W)` or :math:`(C, H, W)` (same shape as input)
    """

    # Concrete class associated with this lazy module (see LazyModuleMixin).
    cls_to_become = InstanceNorm2d  # type: ignore[assignment]

    def _get_no_batch_dim(self):
        # An unbatched 2d sample is (C, H, W).
        return 3

    def _check_input_dim(self, input):
        """Accept only 3D (unbatched) or 4D (batched) inputs."""
        ndim = input.dim()
        if ndim != 3 and ndim != 4:
            raise ValueError(f"expected 3D or 4D input (got {ndim}D input)")
class InstanceNorm3d(_InstanceNorm):
r"""Applies Instance Normalization.
This operation applies Instance Normalization
over a 5D input (a mini-batch of 3D inputs with additional channel dimension) as described in the paper
`Instance Normalization: The Missing Ingredient for Fast Stylization
<https://arxiv.org/abs/1607.08022>`__.
.. math::
y = \frac{x - \mathrm{E}[x]}{ \sqrt{\mathrm{Var}[x] + \epsilon}} * \gamma + \beta
The mean and standard-deviation are calculated per-dimension separately
for each object in a mini-batch. :math:`\gamma` and :math:`\beta` are learnable parameter vectors
of size C (where C is the input size) if :attr:`affine` is ``True``.
The standard-deviation is calculated via the biased estimator, equivalent to
`torch.var(input, unbiased=False)`.
By default, this layer uses instance statistics computed from input data in
both training and evaluation modes.
If :attr:`track_running_stats` is set to ``True``, during training this
layer keeps running estimates of its computed mean and variance, which are
then used for normalization during evaluation. The running estimates are
kept with a default :attr:`momentum` of 0.1.
.. note::
This :attr:`momentum` argument is different from one used in optimizer
classes and the conventional notion of momentum. Mathematically, the
update rule for running statistics here is
:math:`\hat{x}_\text{new} = (1 - \text{momentum}) \times \hat{x} + \text{momentum} \times x_t`,
where :math:`\hat{x}` is the estimated statistic and :math:`x_t` is the
new observed value.
.. note::
:class:`InstanceNorm3d` and :class:`LayerNorm` are very similar, but
have some subtle differences. :class:`InstanceNorm3d` is applied
on each channel of channeled data like 3D models with RGB color, but
:class:`LayerNorm` is usually applied on entire sample and often in NLP
tasks. Additionally, :class:`LayerNorm` applies elementwise affine
transform, while :class:`InstanceNorm3d` usually don't apply affine
transform.
Args:
num_features: :math:`C` from an expected input of size
:math:`(N, C, D, H, W)` or :math:`(C, D, H, W)`
eps: a value added to the denominator for numerical stability. Default: 1e-5
momentum: the value used for the running_mean and running_var computation. Default: 0.1
affine: a boolean value that when set to ``True``, this module has
learnable affine parameters, initialized the same way as done for batch normalization.
Default: ``False``.
track_running_stats: a boolean value that when set to ``True``, this
module tracks the running mean and variance, and when set to ``False``,
this module does not track such statistics and always uses batch
statistics in both training and eval modes. Default: ``False``
Shape:
- Input: :math:`(N, C, D, H, W)` or :math:`(C, D, H, W)`
- Output: :math:`(N, C, D, H, W)` or :math:`(C, D, H, W)` (same shape as input)
Examples::
>>> # Without Learnable Parameters
>>> m = nn.InstanceNorm3d(100)
>>> # With Learnable Parameters
>>> m = nn.InstanceNorm3d(100, affine=True)
>>> input = torch.randn(20, 100, 35, 45, 10)
>>> output = m(input)
"""
def _get_no_batch_dim(self):
    """Return the input rank (4) that this 3D variant treats as having no batch dimension."""
    return 4
def _check_input_dim(self, input):
    """Validate the rank of ``input``.

    Args:
        input: object exposing ``dim()``; 4D and 5D inputs are accepted.

    Raises:
        ValueError: if ``input.dim()`` is neither 4 nor 5.
    """
    if input.dim() in (4, 5):
        return
    raise ValueError(f"expected 4D or 5D input (got {input.dim()}D input)")
class LazyInstanceNorm3d(_LazyNormBase, _InstanceNorm):
    r"""A :class:`torch.nn.InstanceNorm3d` module with lazy initialization of the ``num_features`` argument.

    The ``num_features`` argument of the :class:`InstanceNorm3d` is inferred from the ``input.size(1)``.
    The attributes that will be lazily initialized are `weight`, `bias`,
    `running_mean` and `running_var`.

    Check the :class:`torch.nn.modules.lazy.LazyModuleMixin` for further documentation
    on lazy modules and their limitations.

    Args:
        num_features: :math:`C` from an expected input of size
            :math:`(N, C, D, H, W)` or :math:`(C, D, H, W)`
        eps: a value added to the denominator for numerical stability. Default: 1e-5
        momentum: the value used for the running_mean and running_var computation. Default: 0.1
        affine: a boolean value that when set to ``True``, this module has
            learnable affine parameters, initialized the same way as done for batch normalization.
            Default: ``False``.
        track_running_stats: a boolean value that when set to ``True``, this
            module tracks the running mean and variance, and when set to ``False``,
            this module does not track such statistics and always uses batch
            statistics in both training and eval modes. Default: ``False``

    Shape:
        - Input: :math:`(N, C, D, H, W)` or :math:`(C, D, H, W)`
        - Output: :math:`(N, C, D, H, W)` or :math:`(C, D, H, W)` (same shape as input)
    """

    # NOTE(review): ``num_features`` is listed under Args although the summary
    # says it is inferred from the input; the constructor lives in
    # ``_LazyNormBase`` (not visible here) -- confirm the argument list and the
    # documented defaults against it.

    # Class this instance is converted to once its parameters are materialized.
    cls_to_become = InstanceNorm3d  # type: ignore[assignment]

    def _get_no_batch_dim(self):
        """Return the input rank (4) treated as having no batch dimension."""
        return 4

    def _check_input_dim(self, input):
        """Raise ``ValueError`` unless ``input`` is 4D or 5D."""
        if input.dim() in (4, 5):
            return
        raise ValueError(f"expected 4D or 5D input (got {input.dim()}D input)")

View File

@ -0,0 +1,289 @@
# mypy: allow-untyped-defs
import itertools
from typing import Any, Optional, Protocol, Type
import torch
from torch.nn.parameter import is_lazy
# Public API of this module: only the mixin is exported; `_LazyProtocol` below is private.
__all__ = ["LazyModuleMixin"]
class _LazyProtocol(Protocol):
    """Structural type used to annotate ``self`` inside ``LazyModuleMixin``.

    This class is used to avoid errors with mypy checks for the attributes in a mixin.
    https://mypy.readthedocs.io/en/latest/more_types.html#mixin-classes

    Every member is a bodiless stub; it only declares the names the mixin
    accesses on ``self``.
    """

    # --- methods the mixin calls on ``self`` ---

    def _register_load_state_dict_pre_hook(self, hook):
        """Stub: register a ``load_state_dict`` pre-hook."""
        ...

    def register_forward_pre_hook(self, hook, *, prepend=False, with_kwargs=False):
        """Stub: register a forward pre-hook."""
        ...

    def _lazy_load_hook(
        self,
        state_dict,
        prefix,
        local_metadata,
        strict,
        missing_keys,
        unexpected_keys,
        error_msgs,
    ):
        """Stub: the ``load_state_dict`` pre-hook installed by the mixin."""
        ...

    def _get_name(self):
        """Stub: name used in the mixin's error message."""
        ...

    def _infer_parameters(self, module, input):
        """Stub: the forward pre-hook installed by the mixin."""
        ...

    # --- attributes the mixin reads or assigns on ``self`` ---

    @property
    def _parameters(self):
        """Stub: mapping of parameter name to parameter."""
        ...

    @property
    def _buffers(self):
        """Stub: mapping of buffer name to buffer."""
        ...

    @property
    def _non_persistent_buffers_set(self):
        """Stub: names of buffers excluded from the state dict."""
        ...

    @property
    def _load_hook(self):
        """Stub: handle returned when the load pre-hook is registered."""
        ...

    @property
    def _initialize_hook(self):
        """Stub: handle returned when the forward pre-hook is registered."""
        ...
class LazyModuleMixin:
    r"""A mixin for modules that lazily initialize parameters, also known as "lazy modules".

    .. warning::
        Lazy modules are an experimental new feature under active development,
        and their API is likely to change.

    Modules that lazily initialize parameters, or "lazy modules",
    derive the shapes of their parameters from the first input(s)
    to their forward method. Until that first forward they contain
    :class:`torch.nn.UninitializedParameter` s that should not be accessed
    or used, and afterward they contain regular :class:`torch.nn.Parameter` s.
    Lazy modules are convenient since they don't require computing some
    module arguments, like the :attr:`in_features` argument of a
    typical :class:`torch.nn.Linear`.

    After construction, networks with lazy modules should first
    be converted to the desired dtype and placed on the expected device.
    This is because lazy modules only perform shape inference so the usual dtype
    and device placement behavior applies.
    The lazy modules should then perform "dry runs" to initialize all the components in the module.
    These "dry runs" send inputs of the correct size, dtype, and device through
    the network and to each one of its lazy modules. After this the network can be used as usual.

    >>> # xdoctest: +SKIP
    >>> class LazyMLP(torch.nn.Module):
    ...     def __init__(self) -> None:
    ...         super().__init__()
    ...         self.fc1 = torch.nn.LazyLinear(10)
    ...         self.relu1 = torch.nn.ReLU()
    ...         self.fc2 = torch.nn.LazyLinear(1)
    ...         self.relu2 = torch.nn.ReLU()
    ...
    ...     def forward(self, input):
    ...         x = self.relu1(self.fc1(input))
    ...         y = self.relu2(self.fc2(x))
    ...         return y
    >>> # constructs a network with lazy modules
    >>> lazy_mlp = LazyMLP()
    >>> # transforms the network's device and dtype
    >>> # NOTE: these transforms can and should be applied after construction and before any 'dry runs'
    >>> lazy_mlp = lazy_mlp.cuda().double()
    >>> lazy_mlp
    LazyMLP(
      (fc1): LazyLinear(in_features=0, out_features=10, bias=True)
      (relu1): ReLU()
      (fc2): LazyLinear(in_features=0, out_features=1, bias=True)
      (relu2): ReLU()
    )
    >>> # performs a dry run to initialize the network's lazy modules
    >>> lazy_mlp(torch.ones(10,10).cuda())
    >>> # after initialization, LazyLinear modules become regular Linear modules
    >>> lazy_mlp
    LazyMLP(
      (fc1): Linear(in_features=10, out_features=10, bias=True)
      (relu1): ReLU()
      (fc2): Linear(in_features=10, out_features=1, bias=True)
      (relu2): ReLU()
    )
    >>> # attaches an optimizer, since parameters can now be used as usual
    >>> optim = torch.optim.SGD(lazy_mlp.parameters(), lr=0.01)

    A final caveat when using lazy modules is that the order of initialization of a network's
    parameters may change, since the lazy modules are always initialized after other modules.
    For example, if the LazyMLP class defined above had a :class:`torch.nn.LazyLinear` module
    first and then a regular :class:`torch.nn.Linear` second, the second module would be
    initialized on construction and the first module would be initialized during the first dry run.
    This can cause the parameters of a network using lazy modules to be initialized differently
    than the parameters of a network without lazy modules as the order of parameter initializations,
    which often depends on a stateful random number generator, is different.
    Check :doc:`/notes/randomness` for more details.

    Lazy modules can be serialized with a state dict like other modules. For example:

    >>> lazy_mlp = LazyMLP()
    >>> # The state dict shows the uninitialized parameters
    >>> lazy_mlp.state_dict()
    OrderedDict([('fc1.weight', Uninitialized parameter),
                 ('fc1.bias',
                  tensor([-1.8832e+25,  4.5636e-41, -1.8832e+25,  4.5636e-41, -6.1598e-30,
                           4.5637e-41, -1.8788e+22,  4.5636e-41, -2.0042e-31,  4.5637e-41])),
                 ('fc2.weight', Uninitialized parameter),
                 ('fc2.bias', tensor([0.0019]))])


    Lazy modules can load regular :class:`torch.nn.Parameter` s (i.e. you can serialize/deserialize
    initialized LazyModules and they will remain initialized)


    >>> full_mlp = LazyMLP()
    >>> # Dry run to initialize another module
    >>> # (call the module itself: the initialization runs in a forward pre-hook)
    >>> full_mlp(torch.ones(10, 1))
    >>> # Load an initialized state into a lazy module
    >>> lazy_mlp.load_state_dict(full_mlp.state_dict())
    >>> # The state dict now holds valid values
    >>> lazy_mlp.state_dict()
    OrderedDict([('fc1.weight',
                  tensor([[-0.3837],
                          [ 0.0907],
                          [ 0.6708],
                          [-0.5223],
                          [-0.9028],
                          [ 0.2851],
                          [-0.4537],
                          [ 0.6813],
                          [ 0.5766],
                          [-0.8678]])),
                 ('fc1.bias',
                  tensor([-1.8832e+25,  4.5636e-41, -1.8832e+25,  4.5636e-41, -6.1598e-30,
                           4.5637e-41, -1.8788e+22,  4.5636e-41, -2.0042e-31,  4.5637e-41])),
                 ('fc2.weight',
                  tensor([[ 0.1320,  0.2938,  0.0679,  0.2793,  0.1088, -0.1795, -0.2301,  0.2807,
                            0.2479,  0.1091]])),
                 ('fc2.bias', tensor([0.0019]))])

    Note, however, that the loaded parameters will not be replaced when doing a "dry run" if they are initialized
    when the state is loaded. This prevents using initialized modules in different contexts.
    """

    # modules inheriting from this will change their __class__ to the specified
    # one after they are fully initialized
    cls_to_become: Optional[Type[Any]] = None

    def __init__(self: _LazyProtocol, *args, **kwargs):
        """Forward all arguments to the next class in the MRO and install the lazy hooks.

        Two hook handles are stored on the instance so they can be removed once
        the module is initialized: ``_load_hook`` (a ``load_state_dict``
        pre-hook) and ``_initialize_hook`` (a forward pre-hook).
        """
        # Mypy doesnt like this super call in a mixin
        super().__init__(*args, **kwargs)  # type: ignore[misc]
        self._load_hook = self._register_load_state_dict_pre_hook(self._lazy_load_hook)
        self._initialize_hook = self.register_forward_pre_hook(
            self._infer_parameters, with_kwargs=True
        )

    def _save_to_state_dict(self: _LazyProtocol, destination, prefix, keep_vars):
        """Write this module's parameters and persistent buffers into ``destination``.

        Entries are stored under ``prefix + name``. Tensors are detached unless
        ``keep_vars`` is true or the tensor is still lazy (uninitialized).
        """
        # This should be ideally implemented as a hook,
        # but we should override `detach` in the UninitializedParameter to return itself
        # which is not clean
        for name, param in self._parameters.items():
            if param is None:
                continue
            if not (is_lazy(param) or keep_vars):
                param = param.detach()
            destination[prefix + name] = param
        for name, buf in self._buffers.items():
            if buf is None or name in self._non_persistent_buffers_set:
                continue
            if not (is_lazy(buf) or keep_vars):
                buf = buf.detach()
            destination[prefix + name] = buf

    def _lazy_load_hook(
        self: _LazyProtocol,
        state_dict,
        prefix,
        local_metadata,
        strict,
        missing_keys,
        unexpected_keys,
        error_msgs,
    ):
        """load_state_dict pre-hook function for lazy buffers and parameters.

        The purpose of this hook is to adjust the current state and/or
        ``state_dict`` being loaded so that a module instance serialized in
        both un/initialized state can be deserialized onto both un/initialized
        module instance.
        See comment in ``torch.nn.Module._register_load_state_dict_pre_hook``
        for the details of the hook specification.
        """
        for name, param in itertools.chain(
            self._parameters.items(), self._buffers.items()
        ):
            key = prefix + name
            if key not in state_dict or param is None:
                continue
            input_param = state_dict[key]
            # The current parameter is not initialized but the one being loaded
            # is: materialize the current one with the incoming shape so the
            # regular loading logic can copy into it.
            if is_lazy(param) and not is_lazy(input_param):
                with torch.no_grad():
                    param.materialize(input_param.shape)

    def initialize_parameters(self: _LazyProtocol, *args, **kwargs):
        r"""Initialize parameters according to the input batch properties.

        This adds an interface to isolate parameter initialization from the
        forward pass when doing parameter shape inference.

        Raises:
            NotImplementedError: always; concrete lazy modules must override this.
        """
        raise NotImplementedError(
            f"initialize_parameters is not implemented for {self.__class__.__name__}"
        )

    def has_uninitialized_params(self: _LazyProtocol):
        r"""Check if a module has parameters that are not initialized."""
        # This is to avoid the JIT to track this parameter and force
        # custom modules __setstate__ to add it
        return any(
            is_lazy(tensor)
            for tensor in itertools.chain(
                self._parameters.values(), self._buffers.values()
            )
        )

    # NOTE: the body of `_infer_parameters` is intentionally left exactly as it
    # was, per the comment below about torchrec checking it for consistency.
    # torchrec tests the code consistency with the following code
    # fmt: off
    def _infer_parameters(self: _LazyProtocol, module, args, kwargs=None):
        r"""Infers the size and initializes the parameters according to the provided input batch.

        Given a module that contains parameters that were declared inferrable
        using :class:`torch.nn.parameter.ParameterMode.Infer`, runs a forward pass
        in the complete module using the provided input to initialize all the parameters
        as needed.
        The module is set into evaluation mode before running the forward pass in order
        to avoid saving statistics or calculating gradients
        """
        kwargs = kwargs if kwargs else {}
        module.initialize_parameters(*args, **kwargs)
        if module.has_uninitialized_params():
            raise RuntimeError(f'module {self._get_name()} has not been fully initialized')
        module._initialize_hook.remove()
        module._load_hook.remove()
        delattr(module, '_initialize_hook')
        delattr(module, '_load_hook')
        if module.cls_to_become is not None:
            module.__class__ = module.cls_to_become
    # fmt: on

    def _replicate_for_data_parallel(self: _LazyProtocol):
        """Refuse replication: a lazy module must be initialized before ``DataParallel``.

        Raises:
            RuntimeError: always.
        """
        raise RuntimeError(
            "Modules with uninitialized parameters can't be used with `DataParallel`. "
            "Run a dummy forward pass to correctly initialize the modules"
        )

View File

@ -0,0 +1,293 @@
# mypy: allow-untyped-defs
import math
from typing import Any
import torch
from torch import Tensor
from torch.nn import functional as F, init
from torch.nn.parameter import Parameter, UninitializedParameter
from .lazy import LazyModuleMixin
from .module import Module
# Names exported by a star-import of this module.
# NOTE(review): ``NonDynamicallyQuantizableLinear`` is defined in this file but
# not listed here -- presumably intentional (internal helper); confirm.
__all__ = [
"Bilinear",
"Identity",
"LazyLinear",
"Linear",
]
class Identity(Module):
    r"""A placeholder identity operator that is argument-insensitive.

    The constructor accepts and discards any arguments; ``forward`` hands back
    the very object it was given.

    Args:
        args: any argument (unused)
        kwargs: any keyword argument (unused)

    Shape:
        - Input: :math:`(*)`, where :math:`*` means any number of dimensions.
        - Output: :math:`(*)`, same shape as the input.

    Examples::

        >>> m = nn.Identity(54, unused_argument1=0.1, unused_argument2=False)
        >>> input = torch.randn(128, 20)
        >>> output = m(input)
        >>> print(output.size())
        torch.Size([128, 20])

    """

    def __init__(self, *args: Any, **kwargs: Any) -> None:
        # Arguments are deliberately ignored; only the base Module is set up.
        super().__init__()

    def forward(self, input: Tensor) -> Tensor:
        """Return ``input`` unchanged (same object, no copy)."""
        return input
class Linear(Module):
    r"""Applies an affine linear transformation to the incoming data: :math:`y = xA^T + b`.

    This module supports :ref:`TensorFloat32<tf32_on_ampere>`.

    On certain ROCm devices, when using float16 inputs this module will use :ref:`different precision<fp16_on_mi200>` for backward.

    Args:
        in_features: size of each input sample
        out_features: size of each output sample
        bias: If set to ``False``, the layer will not learn an additive bias.
            Default: ``True``

    Shape:
        - Input: :math:`(*, H_{in})` where :math:`*` means any number of
          dimensions including none and :math:`H_{in} = \text{in\_features}`.
        - Output: :math:`(*, H_{out})` where all but the last dimension
          are the same shape as the input and :math:`H_{out} = \text{out\_features}`.

    Attributes:
        weight: the learnable weights of the module of shape
            :math:`(\text{out\_features}, \text{in\_features})`. The values are
            initialized from :math:`\mathcal{U}(-\sqrt{k}, \sqrt{k})`, where
            :math:`k = \frac{1}{\text{in\_features}}`
        bias:   the learnable bias of the module of shape :math:`(\text{out\_features})`.
                If :attr:`bias` is ``True``, the values are initialized from
                :math:`\mathcal{U}(-\sqrt{k}, \sqrt{k})` where
                :math:`k = \frac{1}{\text{in\_features}}`

    Examples::

        >>> m = nn.Linear(20, 30)
        >>> input = torch.randn(128, 20)
        >>> output = m(input)
        >>> print(output.size())
        torch.Size([128, 30])
    """

    __constants__ = ["in_features", "out_features"]
    in_features: int
    out_features: int
    weight: Tensor

    def __init__(
        self,
        in_features: int,
        out_features: int,
        bias: bool = True,
        device=None,
        dtype=None,
    ) -> None:
        """Allocate ``weight`` (and ``bias`` unless disabled), then initialize them."""
        super().__init__()
        self.in_features = in_features
        self.out_features = out_features
        tensor_opts = {"device": device, "dtype": dtype}
        weight_shape = (out_features, in_features)
        self.weight = Parameter(torch.empty(weight_shape, **tensor_opts))
        if not bias:
            # Register the name so ``self.bias`` exists and reads as None.
            self.register_parameter("bias", None)
        else:
            self.bias = Parameter(torch.empty(out_features, **tensor_opts))
        self.reset_parameters()

    def reset_parameters(self) -> None:
        """Re-draw ``weight`` and, when present, ``bias`` from their uniform distributions."""
        # Setting a=sqrt(5) in kaiming_uniform is the same as initializing with
        # uniform(-1/sqrt(in_features), 1/sqrt(in_features)). For details, see
        # https://github.com/pytorch/pytorch/issues/57109
        init.kaiming_uniform_(self.weight, a=math.sqrt(5))
        if self.bias is None:
            return
        fan_in, _ = init._calculate_fan_in_and_fan_out(self.weight)
        if fan_in > 0:
            bound = 1 / math.sqrt(fan_in)
        else:
            # Zero input features: avoid dividing by zero, bias becomes all zeros.
            bound = 0
        init.uniform_(self.bias, -bound, bound)

    def forward(self, input: Tensor) -> Tensor:
        """Apply ``F.linear`` with this module's ``weight`` and ``bias``."""
        return F.linear(input, self.weight, self.bias)

    def extra_repr(self) -> str:
        """Return the feature sizes and whether a bias is present, for ``repr``."""
        return f"in_features={self.in_features}, out_features={self.out_features}, bias={self.bias is not None}"
# This class exists solely to avoid triggering an obscure error when scripting
# an improperly quantized attention layer. See this issue for details:
# https://github.com/pytorch/pytorch/issues/58969
# TODO: fail fast on quantization API usage error, then remove this class
# and replace uses of it with plain Linear
class NonDynamicallyQuantizableLinear(Linear):
    """A :class:`Linear` under a distinct class name; it adds no behavior of its own."""

    def __init__(
        self,
        in_features: int,
        out_features: int,
        bias: bool = True,
        device=None,
        dtype=None,
    ) -> None:
        """Forward every argument unchanged to :class:`Linear`."""
        linear_kwargs = {"bias": bias, "device": device, "dtype": dtype}
        super().__init__(in_features, out_features, **linear_kwargs)
class Bilinear(Module):
    r"""Applies a bilinear transformation to the incoming data: :math:`y = x_1^T A x_2 + b`.

    Args:
        in1_features: size of each first input sample
        in2_features: size of each second input sample
        out_features: size of each output sample
        bias: If set to ``False``, the layer will not learn an additive bias.
            Default: ``True``

    Shape:
        - Input1: :math:`(*, H_{in1})` where :math:`H_{in1}=\text{in1\_features}` and
          :math:`*` means any number of additional dimensions including none. All but the last dimension
          of the inputs should be the same.
        - Input2: :math:`(*, H_{in2})` where :math:`H_{in2}=\text{in2\_features}`.
        - Output: :math:`(*, H_{out})` where :math:`H_{out}=\text{out\_features}`
          and all but the last dimension are the same shape as the input.

    Attributes:
        weight: the learnable weights of the module of shape
            :math:`(\text{out\_features}, \text{in1\_features}, \text{in2\_features})`.
            The values are initialized from :math:`\mathcal{U}(-\sqrt{k}, \sqrt{k})`, where
            :math:`k = \frac{1}{\text{in1\_features}}`
        bias:   the learnable bias of the module of shape :math:`(\text{out\_features})`.
                If :attr:`bias` is ``True``, the values are initialized from
                :math:`\mathcal{U}(-\sqrt{k}, \sqrt{k})`, where
                :math:`k = \frac{1}{\text{in1\_features}}`
                (all zeros when ``in1_features`` is 0).

    Examples::

        >>> m = nn.Bilinear(20, 30, 40)
        >>> input1 = torch.randn(128, 20)
        >>> input2 = torch.randn(128, 30)
        >>> output = m(input1, input2)
        >>> print(output.size())
        torch.Size([128, 40])
    """

    __constants__ = ["in1_features", "in2_features", "out_features"]
    in1_features: int
    in2_features: int
    out_features: int
    weight: Tensor

    def __init__(
        self,
        in1_features: int,
        in2_features: int,
        out_features: int,
        bias: bool = True,
        device=None,
        dtype=None,
    ) -> None:
        """Allocate ``weight`` (and ``bias`` unless disabled), then initialize them."""
        factory_kwargs = {"device": device, "dtype": dtype}
        super().__init__()
        self.in1_features = in1_features
        self.in2_features = in2_features
        self.out_features = out_features
        self.weight = Parameter(
            torch.empty((out_features, in1_features, in2_features), **factory_kwargs)
        )
        if bias:
            self.bias = Parameter(torch.empty(out_features, **factory_kwargs))
        else:
            # Register the name so ``self.bias`` exists and reads as None.
            self.register_parameter("bias", None)
        self.reset_parameters()

    def reset_parameters(self) -> None:
        """Re-draw ``weight`` and, when present, ``bias`` from ``U(-bound, bound)``.

        ``bound`` is ``1 / sqrt(in1_features)``. When ``in1_features`` is 0 the
        bound is 0, mirroring the ``fan_in > 0`` guard in ``Linear``, so that
        constructing a module with an empty first input no longer raises
        ``ZeroDivisionError``.
        """
        in1_features = self.weight.size(1)
        bound = 1 / math.sqrt(in1_features) if in1_features > 0 else 0
        init.uniform_(self.weight, -bound, bound)
        if self.bias is not None:
            init.uniform_(self.bias, -bound, bound)

    def forward(self, input1: Tensor, input2: Tensor) -> Tensor:
        """Apply ``F.bilinear`` with this module's ``weight`` and ``bias``."""
        return F.bilinear(input1, input2, self.weight, self.bias)

    def extra_repr(self) -> str:
        """Return the feature sizes and whether a bias is present, for ``repr``."""
        return (
            f"in1_features={self.in1_features}, in2_features={self.in2_features}, "
            f"out_features={self.out_features}, bias={self.bias is not None}"
        )
class LazyLinear(LazyModuleMixin, Linear):
    r"""A :class:`torch.nn.Linear` module where `in_features` is inferred.

    In this module, the `weight` and `bias` are of :class:`torch.nn.UninitializedParameter`
    class. They will be initialized after the first call to ``forward`` is done and the
    module will become a regular :class:`torch.nn.Linear` module. The ``in_features`` argument
    of the :class:`Linear` is inferred from the ``input.shape[-1]``.

    Check the :class:`torch.nn.modules.lazy.LazyModuleMixin` for further documentation
    on lazy modules and their limitations.

    Args:
        out_features: size of each output sample
        bias: If set to ``False``, the layer will not learn an additive bias.
            Default: ``True``

    Attributes:
        weight: the learnable weights of the module of shape
            :math:`(\text{out\_features}, \text{in\_features})`. The values are
            initialized from :math:`\mathcal{U}(-\sqrt{k}, \sqrt{k})`, where
            :math:`k = \frac{1}{\text{in\_features}}`
        bias:   the learnable bias of the module of shape :math:`(\text{out\_features})`.
                If :attr:`bias` is ``True``, the values are initialized from
                :math:`\mathcal{U}(-\sqrt{k}, \sqrt{k})` where
                :math:`k = \frac{1}{\text{in\_features}}`


    """

    # Class this instance is converted to once its parameters are materialized.
    cls_to_become = Linear  # type: ignore[assignment]
    weight: UninitializedParameter
    bias: UninitializedParameter  # type: ignore[assignment]

    def __init__(
        self, out_features: int, bias: bool = True, device=None, dtype=None
    ) -> None:
        """Create placeholder parameters; their shapes are filled in on first use."""
        # bias is hardcoded to False to avoid creating tensor
        # that will soon be overwritten.
        super().__init__(0, 0, False)
        tensor_opts = {"device": device, "dtype": dtype}
        self.weight = UninitializedParameter(**tensor_opts)
        self.out_features = out_features
        if bias:
            self.bias = UninitializedParameter(**tensor_opts)

    def reset_parameters(self) -> None:
        """Initialize values only once shapes are known and ``in_features`` is non-zero."""
        if self.has_uninitialized_params() or self.in_features == 0:
            return
        super().reset_parameters()

    def initialize_parameters(self, input) -> None:  # type: ignore[override]
        """Materialize ``weight``/``bias`` using ``input.shape[-1]`` as ``in_features``.

        Does nothing when every parameter is already initialized.
        """
        if not self.has_uninitialized_params():
            return
        with torch.no_grad():
            self.in_features = input.shape[-1]
            self.weight.materialize((self.out_features, self.in_features))
            if self.bias is not None:
                self.bias.materialize((self.out_features,))
            self.reset_parameters()
# TODO: PartialLinear - maybe in sparse?

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff

View File

@ -0,0 +1,415 @@
# mypy: allow-untyped-defs
import numbers
from typing import List, Optional, Tuple, Union
import torch
from torch import Size, Tensor
from torch.nn import functional as F, init
from torch.nn.parameter import Parameter
from ._functions import CrossMapLRN2d as _cross_map_lrn2d
from .module import Module
# Names exported by a star-import of this module (the Module classes defined below).
__all__ = ["LocalResponseNorm", "CrossMapLRN2d", "LayerNorm", "GroupNorm", "RMSNorm"]
class LocalResponseNorm(Module):
    r"""Applies local response normalization over an input signal.

    The input signal is composed of several input planes, where channels occupy the second dimension.
    Applies normalization across channels.

    .. math::
        b_{c} = a_{c}\left(k + \frac{\alpha}{n}
        \sum_{c'=\max(0, c-n/2)}^{\min(N-1,c+n/2)}a_{c'}^2\right)^{-\beta}

    Args:
        size: amount of neighbouring channels used for normalization
        alpha: multiplicative factor. Default: 0.0001
        beta: exponent. Default: 0.75
        k: additive factor. Default: 1

    Shape:
        - Input: :math:`(N, C, *)`
        - Output: :math:`(N, C, *)` (same shape as input)

    Examples::

        >>> lrn = nn.LocalResponseNorm(2)
        >>> signal_2d = torch.randn(32, 5, 24, 24)
        >>> signal_4d = torch.randn(16, 5, 7, 7, 7, 7)
        >>> output_2d = lrn(signal_2d)
        >>> output_4d = lrn(signal_4d)

    """

    __constants__ = ["size", "alpha", "beta", "k"]
    size: int
    alpha: float
    beta: float
    k: float

    def __init__(
        self, size: int, alpha: float = 1e-4, beta: float = 0.75, k: float = 1.0
    ) -> None:
        """Store the normalization hyper-parameters; the module has no learnable state."""
        super().__init__()
        self.size, self.alpha, self.beta, self.k = size, alpha, beta, k

    def forward(self, input: Tensor) -> Tensor:
        """Apply ``F.local_response_norm`` with the stored hyper-parameters."""
        hyper_params = (self.size, self.alpha, self.beta, self.k)
        return F.local_response_norm(input, *hyper_params)

    def extra_repr(self) -> str:
        """Return the hyper-parameters formatted for ``repr``."""
        template = "{size}, alpha={alpha}, beta={beta}, k={k}"
        return template.format(**self.__dict__)
class CrossMapLRN2d(Module):
size: int
alpha: float
beta: float
k: float
def __init__(
self, size: int, alpha: float = 1e-4, beta: float = 0.75, k: float = 1
) -> None:
super().__init__()
self.size = size
self.alpha = alpha
self.beta = beta
self.k = k
def forward(self, input: Tensor) -> Tensor:
return _cross_map_lrn2d.apply(input, self.size, self.alpha, self.beta, self.k)
def extra_repr(self) -> str:
return "{size}, alpha={alpha}, beta={beta}, k={k}".format(**self.__dict__)
# Accepted spellings of a ``normalized_shape`` argument: a single int,
# a list of ints, or a ``torch.Size``.
_shape_t = Union[int, List[int], Size]
class LayerNorm(Module):
    r"""Applies Layer Normalization over a mini-batch of inputs.

    This layer implements the operation as described in
    the paper `Layer Normalization <https://arxiv.org/abs/1607.06450>`__

    .. math::
        y = \frac{x - \mathrm{E}[x]}{ \sqrt{\mathrm{Var}[x] + \epsilon}} * \gamma + \beta

    The mean and standard-deviation are calculated over the last `D` dimensions, where `D`
    is the dimension of :attr:`normalized_shape`. For example, if :attr:`normalized_shape`
    is ``(3, 5)`` (a 2-dimensional shape), the mean and standard-deviation are computed over
    the last 2 dimensions of the input (i.e. ``input.mean((-2, -1))``).
    :math:`\gamma` and :math:`\beta` are learnable affine transform parameters of
    :attr:`normalized_shape` if :attr:`elementwise_affine` is ``True``.
    The standard-deviation is calculated via the biased estimator, equivalent to
    `torch.var(input, unbiased=False)`.

    .. note::
        Unlike Batch Normalization and Instance Normalization, which apply
        scalar scale and bias for each entire channel/plane with the
        :attr:`affine` option, Layer Normalization applies per-element scale and
        bias with :attr:`elementwise_affine`.

    This layer uses statistics computed from input data in both training and
    evaluation modes.

    Args:
        normalized_shape (int or list or torch.Size): input shape from an expected input
            of size

            .. math::
                [* \times \text{normalized\_shape}[0] \times \text{normalized\_shape}[1]
                    \times \ldots \times \text{normalized\_shape}[-1]]

            If a single integer is used, it is treated as a singleton list, and this module will
            normalize over the last dimension which is expected to be of that specific size.
        eps: a value added to the denominator for numerical stability. Default: 1e-5
        elementwise_affine: a boolean value that when set to ``True``, this module
            has learnable per-element affine parameters initialized to ones (for weights)
            and zeros (for biases). Default: ``True``.
        bias: If set to ``False``, the layer will not learn an additive bias (only relevant if
            :attr:`elementwise_affine` is ``True``). Default: ``True``.

    Attributes:
        weight: the learnable weights of the module of shape
            :math:`\text{normalized\_shape}` when :attr:`elementwise_affine` is set to ``True``.
            The values are initialized to 1.
        bias:   the learnable bias of the module of shape
                :math:`\text{normalized\_shape}` when :attr:`elementwise_affine` is set to ``True``.
                The values are initialized to 0.

    Shape:
        - Input: :math:`(N, *)`
        - Output: :math:`(N, *)` (same shape as input)

    Examples::

        >>> # NLP Example
        >>> batch, sentence_length, embedding_dim = 20, 5, 10
        >>> embedding = torch.randn(batch, sentence_length, embedding_dim)
        >>> layer_norm = nn.LayerNorm(embedding_dim)
        >>> # Activate module
        >>> layer_norm(embedding)
        >>>
        >>> # Image Example
        >>> N, C, H, W = 20, 5, 10, 10
        >>> input = torch.randn(N, C, H, W)
        >>> # Normalize over the last three dimensions (i.e. the channel and spatial dimensions)
        >>> # as shown in the image below
        >>> layer_norm = nn.LayerNorm([C, H, W])
        >>> output = layer_norm(input)

    .. image:: ../_static/img/nn/layer_norm.jpg
        :scale: 50 %

    """

    __constants__ = ["normalized_shape", "eps", "elementwise_affine"]
    normalized_shape: Tuple[int, ...]
    eps: float
    elementwise_affine: bool

    def __init__(
        self,
        normalized_shape: _shape_t,
        eps: float = 1e-5,
        elementwise_affine: bool = True,
        bias: bool = True,
        device=None,
        dtype=None,
    ) -> None:
        """Normalize ``normalized_shape`` to a tuple and create the affine parameters."""
        super().__init__()
        if isinstance(normalized_shape, numbers.Integral):
            # A bare integer means "normalize over the last dimension only".
            # mypy error: incompatible types in assignment
            normalized_shape = (normalized_shape,)  # type: ignore[assignment]
        self.normalized_shape = tuple(normalized_shape)  # type: ignore[arg-type]
        self.eps = eps
        self.elementwise_affine = elementwise_affine

        tensor_opts = {"device": device, "dtype": dtype}
        if not self.elementwise_affine:
            self.register_parameter("weight", None)
            self.register_parameter("bias", None)
        else:
            self.weight = Parameter(
                torch.empty(self.normalized_shape, **tensor_opts)
            )
            if not bias:
                self.register_parameter("bias", None)
            else:
                self.bias = Parameter(
                    torch.empty(self.normalized_shape, **tensor_opts)
                )

        self.reset_parameters()

    def reset_parameters(self) -> None:
        """Set ``weight`` to ones and ``bias`` (if any) to zeros; no-op without affine."""
        if not self.elementwise_affine:
            return
        init.ones_(self.weight)
        if self.bias is not None:
            init.zeros_(self.bias)

    def forward(self, input: Tensor) -> Tensor:
        """Apply ``F.layer_norm`` over the trailing ``normalized_shape`` dimensions."""
        return F.layer_norm(
            input, self.normalized_shape, self.weight, self.bias, self.eps
        )

    def extra_repr(self) -> str:
        """Return shape, ``eps`` and the affine flag formatted for ``repr``."""
        template = (
            "{normalized_shape}, eps={eps}, "
            "elementwise_affine={elementwise_affine}"
        )
        return template.format(**self.__dict__)
class GroupNorm(Module):
    r"""Applies Group Normalization over a mini-batch of inputs.

    This layer implements the operation as described in
    the paper `Group Normalization <https://arxiv.org/abs/1803.08494>`__

    .. math::
        y = \frac{x - \mathrm{E}[x]}{ \sqrt{\mathrm{Var}[x] + \epsilon}} * \gamma + \beta

    The input channels are separated into :attr:`num_groups` groups, each containing
    ``num_channels / num_groups`` channels. :attr:`num_channels` must be divisible by
    :attr:`num_groups`. The mean and standard-deviation are calculated
    separately over each group. :math:`\gamma` and :math:`\beta` are learnable
    per-channel affine transform parameter vectors of size :attr:`num_channels` if
    :attr:`affine` is ``True``.
    The standard-deviation is calculated via the biased estimator, equivalent to
    `torch.var(input, unbiased=False)`.

    This layer uses statistics computed from input data in both training and
    evaluation modes.

    Args:
        num_groups (int): number of groups to separate the channels into
        num_channels (int): number of channels expected in input
        eps: a value added to the denominator for numerical stability. Default: 1e-5
        affine: a boolean value that when set to ``True``, this module
            has learnable per-channel affine parameters initialized to ones (for weights)
            and zeros (for biases). Default: ``True``.

    Shape:
        - Input: :math:`(N, C, *)` where :math:`C=\text{num\_channels}`
        - Output: :math:`(N, C, *)` (same shape as input)

    Examples::

        >>> input = torch.randn(20, 6, 10, 10)
        >>> # Separate 6 channels into 3 groups
        >>> m = nn.GroupNorm(3, 6)
        >>> # Separate 6 channels into 6 groups (equivalent with InstanceNorm)
        >>> m = nn.GroupNorm(6, 6)
        >>> # Put all 6 channels into a single group (equivalent with LayerNorm)
        >>> m = nn.GroupNorm(1, 6)
        >>> # Activating the module
        >>> output = m(input)
    """

    __constants__ = ["num_groups", "num_channels", "eps", "affine"]
    num_groups: int
    num_channels: int
    eps: float
    affine: bool

    def __init__(
        self,
        num_groups: int,
        num_channels: int,
        eps: float = 1e-5,
        affine: bool = True,
        device=None,
        dtype=None,
    ) -> None:
        """Validate the grouping and create the per-channel affine parameters.

        Raises:
            ValueError: if ``num_channels`` is not a multiple of ``num_groups``.
        """
        super().__init__()
        remainder = num_channels % num_groups
        if remainder != 0:
            raise ValueError("num_channels must be divisible by num_groups")

        self.num_groups = num_groups
        self.num_channels = num_channels
        self.eps = eps
        self.affine = affine

        tensor_opts = {"device": device, "dtype": dtype}
        if not self.affine:
            self.register_parameter("weight", None)
            self.register_parameter("bias", None)
        else:
            self.weight = Parameter(torch.empty(num_channels, **tensor_opts))
            self.bias = Parameter(torch.empty(num_channels, **tensor_opts))

        self.reset_parameters()

    def reset_parameters(self) -> None:
        """Set ``weight`` to ones and ``bias`` to zeros; no-op without affine."""
        if not self.affine:
            return
        init.ones_(self.weight)
        init.zeros_(self.bias)

    def forward(self, input: Tensor) -> Tensor:
        """Apply ``F.group_norm`` with the stored group count, parameters and ``eps``."""
        return F.group_norm(input, self.num_groups, self.weight, self.bias, self.eps)

    def extra_repr(self) -> str:
        """Return group/channel counts, ``eps`` and the affine flag formatted for ``repr``."""
        template = "{num_groups}, {num_channels}, eps={eps}, " "affine={affine}"
        return template.format(**self.__dict__)
class RMSNorm(Module):
r"""Applies Root Mean Square Layer Normalization over a mini-batch of inputs.
This layer implements the operation as described in
the paper `Root Mean Square Layer Normalization <https://arxiv.org/pdf/1910.07467.pdf>`__
.. math::
y = \frac{x}{\sqrt{\mathrm{RMS}[x] + \epsilon}} * \gamma
The root mean squared norm is taken over the last ``D`` dimensions, where ``D``
is the dimension of :attr:`normalized_shape`. For example, if :attr:`normalized_shape`
is ``(3, 5)`` (a 2-dimensional shape), the rms norm is computed over
the last 2 dimensions of the input.
Args:
normalized_shape (int or list or torch.Size): input shape from an expected input
of size
.. math::
[* \times \text{normalized\_shape}[0] \times \text{normalized\_shape}[1]
\times \ldots \times \text{normalized\_shape}[-1]]
If a single integer is used, it is treated as a singleton list, and this module will
normalize over the last dimension which is expected to be of that specific size.
eps: a value added to the denominator for numerical stability. Default: :func:`torch.finfo(x.dtype).eps`
elementwise_affine: a boolean value that when set to ``True``, this module
has learnable per-element affine parameters initialized to ones (for weights)
and zeros (for biases). Default: ``True``.
Shape:
- Input: :math:`(N, *)`
- Output: :math:`(N, *)` (same shape as input)
Examples::
>>> rms_norm = nn.RMSNorm([2, 3])
>>> input = torch.randn(2, 2, 3)
>>> rms_norm(input)
"""
__constants__ = ["normalized_shape", "eps", "elementwise_affine"]
normalized_shape: Tuple[int, ...]
eps: Optional[float]
elementwise_affine: bool
def __init__(
self,
normalized_shape: _shape_t,
eps: Optional[float] = None,
elementwise_affine: bool = True,
device=None,
dtype=None,
) -> None:
factory_kwargs = {"device": device, "dtype": dtype}
super().__init__()
if isinstance(normalized_shape, numbers.Integral):
# mypy error: incompatible types in assignment
normalized_shape = (normalized_shape,) # type: ignore[assignment]
self.normalized_shape = tuple(normalized_shape) # type: ignore[arg-type]
self.eps = eps
self.elementwise_affine = elementwise_affine
if self.elementwise_affine:
self.weight = Parameter(
torch.empty(self.normalized_shape, **factory_kwargs)
)
else:
self.register_parameter("weight", None)
self.reset_parameters()
def reset_parameters(self) -> None:
"""
Resets parameters based on their initialization used in __init__.
"""
if self.elementwise_affine:
init.ones_(self.weight)
def forward(self, x: torch.Tensor) -> torch.Tensor:
"""
Runs forward pass.
"""
return F.rms_norm(x, self.normalized_shape, self.weight, self.eps)
def extra_repr(self) -> str:
"""
Extra information about the module.
"""
return (
"{normalized_shape}, eps={eps}, "
"elementwise_affine={elementwise_affine}".format(**self.__dict__)
)
# TODO: ContrastiveNorm2d
# TODO: DivisiveNorm2d
# TODO: SubtractiveNorm2d

View File

@ -0,0 +1,813 @@
# mypy: allow-untyped-defs
from typing import Sequence, Tuple
import torch.nn.functional as F
from torch import Tensor
from torch.nn.common_types import _size_2_t, _size_4_t, _size_6_t
from .module import Module
from .utils import _ntuple, _pair, _quadruple
# TODO: grad_output size asserts in THNN
# Public names exported by this module: one 1d/2d/3d padding module per
# padding flavour (circular, constant, reflection, replication, zero).
__all__ = [
"CircularPad1d",
"CircularPad2d",
"CircularPad3d",
"ConstantPad1d",
"ConstantPad2d",
"ConstantPad3d",
"ReflectionPad1d",
"ReflectionPad2d",
"ReflectionPad3d",
"ReplicationPad1d",
"ReplicationPad2d",
"ReplicationPad3d",
"ZeroPad1d",
"ZeroPad2d",
"ZeroPad3d",
]
class _CircularPadNd(Module):
    """Common base for the circular padding modules.

    ``forward`` first calls ``_check_input_dim`` and then pads ``input`` with
    ``F.pad`` in ``"circular"`` mode using ``self.padding``. This base class
    raises ``NotImplementedError`` from ``_check_input_dim``; subclasses are
    expected to override it and to assign ``self.padding``.
    """

    __constants__ = ["padding"]
    padding: Sequence[int]

    def _check_input_dim(self, input):
        # No generic rank check exists; concrete subclasses provide their own.
        raise NotImplementedError

    def forward(self, input: Tensor) -> Tensor:
        self._check_input_dim(input)
        return F.pad(input, self.padding, mode="circular")

    def extra_repr(self) -> str:
        return str(self.padding)
class CircularPad1d(_CircularPadNd):
r"""Pads the input tensor using circular padding of the input boundary.
Tensor values at the beginning of the dimension are used to pad the end,
and values at the end are used to pad the beginning. If negative padding is
applied then the ends of the tensor get removed.
For `N`-dimensional padding, use :func:`torch.nn.functional.pad()`.
Args:
padding (int, tuple): the size of the padding. If is `int`, uses the same
padding in all boundaries. If a 2-`tuple`, uses
(:math:`\text{padding\_left}`, :math:`\text{padding\_right}`)
Shape:
- Input: :math:`(C, W_{in})` or :math:`(N, C, W_{in})`.
- Output: :math:`(C, W_{out})` or :math:`(N, C, W_{out})`, where
:math:`W_{out} = W_{in} + \text{padding\_left} + \text{padding\_right}`
Examples::
>>> # xdoctest: +IGNORE_WANT("not sure why xdoctest is choking on this")
>>> m = nn.CircularPad1d(2)
>>> input = torch.arange(8, dtype=torch.float).reshape(1, 2, 4)
>>> input
tensor([[[0., 1., 2., 3.],
[4., 5., 6., 7.]]])
>>> m(input)
tensor([[[2., 3., 0., 1., 2., 3., 0., 1.],
[6., 7., 4., 5., 6., 7., 4., 5.]]])
>>> # using different paddings for different sides
>>> m = nn.CircularPad1d((3, 1))
>>> m(input)
tensor([[[1., 2., 3., 0., 1., 2., 3., 0.],
[5., 6., 7., 4., 5., 6., 7., 4.]]])
"""
padding: Tuple[int, int]
def __init__(self, padding: _size_2_t) -> None:
super().__init__()
self.padding = _pair(padding)
def _check_input_dim(self, input):
if input.dim() != 2 and input.dim() != 3:
raise ValueError(f"expected 2D or 3D input (got {input.dim()}D input)")
class CircularPad2d(_CircularPadNd):
r"""Pads the input tensor using circular padding of the input boundary.
Tensor values at the beginning of the dimension are used to pad the end,
and values at the end are used to pad the beginning. If negative padding is
applied then the ends of the tensor get removed.
For `N`-dimensional padding, use :func:`torch.nn.functional.pad()`.
Args:
padding (int, tuple): the size of the padding. If is `int`, uses the same
padding in all boundaries. If a 4-`tuple`, uses (:math:`\text{padding\_left}`,
:math:`\text{padding\_right}`, :math:`\text{padding\_top}`, :math:`\text{padding\_bottom}`)
Shape:
- Input: :math:`(N, C, H_{in}, W_{in})` or :math:`(C, H_{in}, W_{in})`.
- Output: :math:`(N, C, H_{out}, W_{out})` or :math:`(C, H_{out}, W_{out})`, where
:math:`H_{out} = H_{in} + \text{padding\_top} + \text{padding\_bottom}`
:math:`W_{out} = W_{in} + \text{padding\_left} + \text{padding\_right}`
Examples::
>>> m = nn.CircularPad2d(2)
>>> input = torch.arange(9, dtype=torch.float).reshape(1, 1, 3, 3)
>>> input
tensor([[[[0., 1., 2.],
[3., 4., 5.],
[6., 7., 8.]]]])
>>> m(input)
tensor([[[[4., 5., 3., 4., 5., 3., 4.],
[7., 8., 6., 7., 8., 6., 7.],
[1., 2., 0., 1., 2., 0., 1.],
[4., 5., 3., 4., 5., 3., 4.],
[7., 8., 6., 7., 8., 6., 7.],
[1., 2., 0., 1., 2., 0., 1.],
[4., 5., 3., 4., 5., 3., 4.]]]])
>>> # using different paddings for different sides
>>> m = nn.CircularPad2d((1, 1, 2, 0))
>>> m(input)
tensor([[[[5., 3., 4., 5., 3.],
[8., 6., 7., 8., 6.],
[2., 0., 1., 2., 0.],
[5., 3., 4., 5., 3.],
[8., 6., 7., 8., 6.]]]])
"""
padding: Tuple[int, int, int, int]
def __init__(self, padding: _size_4_t) -> None:
super().__init__()
self.padding = _quadruple(padding)
def _check_input_dim(self, input):
if input.dim() != 3 and input.dim() != 4:
raise ValueError(f"expected 3D or 4D input (got {input.dim()}D input)")
class CircularPad3d(_CircularPadNd):
r"""Pads the input tensor using circular padding of the input boundary.
Tensor values at the beginning of the dimension are used to pad the end,
and values at the end are used to pad the beginning. If negative padding is
applied then the ends of the tensor get removed.
For `N`-dimensional padding, use :func:`torch.nn.functional.pad()`.
Args:
padding (int, tuple): the size of the padding. If is `int`, uses the same
padding in all boundaries. If a 6-`tuple`, uses
(:math:`\text{padding\_left}`, :math:`\text{padding\_right}`,
:math:`\text{padding\_top}`, :math:`\text{padding\_bottom}`,
:math:`\text{padding\_front}`, :math:`\text{padding\_back}`)
Shape:
- Input: :math:`(N, C, D_{in}, H_{in}, W_{in})` or :math:`(C, D_{in}, H_{in}, W_{in})`.
- Output: :math:`(N, C, D_{out}, H_{out}, W_{out})` or :math:`(C, D_{out}, H_{out}, W_{out})`,
where
:math:`D_{out} = D_{in} + \text{padding\_front} + \text{padding\_back}`
:math:`H_{out} = H_{in} + \text{padding\_top} + \text{padding\_bottom}`
:math:`W_{out} = W_{in} + \text{padding\_left} + \text{padding\_right}`
Examples::
>>> # xdoctest: +IGNORE_WANT("non-deterministic")
>>> m = nn.CircularPad3d(3)
>>> input = torch.randn(16, 3, 8, 320, 480)
>>> output = m(input)
>>> # using different paddings for different sides
>>> m = nn.CircularPad3d((3, 3, 6, 6, 1, 1))
>>> output = m(input)
"""
padding: Tuple[int, int, int, int, int, int]
def __init__(self, padding: _size_6_t) -> None:
super().__init__()
self.padding = _ntuple(6)(padding)
def _check_input_dim(self, input):
if input.dim() != 4 and input.dim() != 5:
raise ValueError(f"expected 4D or 5D input (got {input.dim()}D input)")
class _ConstantPadNd(Module):
    """Common base for the constant padding modules.

    Stores the fill ``value`` and pads with ``F.pad`` in ``"constant"`` mode
    using ``self.padding``, which subclasses are expected to assign.
    """

    __constants__ = ["padding", "value"]
    value: float
    padding: Sequence[int]

    def __init__(self, value: float) -> None:
        super().__init__()
        self.value = value

    def forward(self, input: Tensor) -> Tensor:
        return F.pad(input, self.padding, mode="constant", value=self.value)

    def extra_repr(self) -> str:
        return "padding={}, value={}".format(self.padding, self.value)
class ConstantPad1d(_ConstantPadNd):
    r"""Pads the input tensor boundaries with a constant value.

    For `N`-dimensional padding, use :func:`torch.nn.functional.pad()`.

    Args:
        padding (int, tuple): the size of the padding. If is `int`, uses the same
            padding in both boundaries. If a 2-`tuple`, uses
            (:math:`\text{padding\_left}`, :math:`\text{padding\_right}`)
        value (float): the constant written into the padded positions

    Shape:
        - Input: :math:`(C, W_{in})` or :math:`(N, C, W_{in})`.
        - Output: :math:`(C, W_{out})` or :math:`(N, C, W_{out})`, where

          :math:`W_{out} = W_{in} + \text{padding\_left} + \text{padding\_right}`

    Examples::

        >>> # xdoctest: +IGNORE_WANT("non-deterministic")
        >>> m = nn.ConstantPad1d(2, 3.5)
        >>> input = torch.randn(1, 2, 4)
        >>> input
        tensor([[[-1.0491, -0.7152, -0.0749,  0.8530],
                 [-1.3287,  1.8966,  0.1466, -0.2771]]])
        >>> m(input)
        tensor([[[ 3.5000,  3.5000, -1.0491, -0.7152, -0.0749,  0.8530,  3.5000,
                   3.5000],
                 [ 3.5000,  3.5000, -1.3287,  1.8966,  0.1466, -0.2771,  3.5000,
                   3.5000]]])
        >>> m = nn.ConstantPad1d(2, 3.5)
        >>> input = torch.randn(1, 2, 3)
        >>> input
        tensor([[[ 1.6616,  1.4523, -1.1255],
                 [-3.6372,  0.1182, -1.8652]]])
        >>> m(input)
        tensor([[[ 3.5000,  3.5000,  1.6616,  1.4523, -1.1255,  3.5000,  3.5000],
                 [ 3.5000,  3.5000, -3.6372,  0.1182, -1.8652,  3.5000,  3.5000]]])
        >>> # using different paddings for different sides
        >>> m = nn.ConstantPad1d((3, 1), 3.5)
        >>> m(input)
        tensor([[[ 3.5000,  3.5000,  3.5000,  1.6616,  1.4523, -1.1255,  3.5000],
                 [ 3.5000,  3.5000,  3.5000, -3.6372,  0.1182, -1.8652,  3.5000]]])
    """

    padding: Tuple[int, int]

    # ``-> None`` added so the signature matches ConstantPad2d / ConstantPad3d.
    def __init__(self, padding: _size_2_t, value: float) -> None:
        super().__init__(value)
        # An int is expanded to (left, right); a 2-tuple is kept as given.
        self.padding = _pair(padding)
class ConstantPad2d(_ConstantPadNd):
    r"""Pads the input tensor boundaries with a constant value.

    For `N`-dimensional padding, use :func:`torch.nn.functional.pad()`.

    Args:
        padding (int, tuple): the size of the padding. If is `int`, uses the same
            padding in all boundaries. If a 4-`tuple`, uses (:math:`\text{padding\_left}`,
            :math:`\text{padding\_right}`, :math:`\text{padding\_top}`, :math:`\text{padding\_bottom}`)
        value (float): the constant written into the padded positions

    Shape:
        - Input: :math:`(N, C, H_{in}, W_{in})` or :math:`(C, H_{in}, W_{in})`.
        - Output: :math:`(N, C, H_{out}, W_{out})` or :math:`(C, H_{out}, W_{out})`, where

          :math:`H_{out} = H_{in} + \text{padding\_top} + \text{padding\_bottom}`

          :math:`W_{out} = W_{in} + \text{padding\_left} + \text{padding\_right}`

    Examples::

        >>> # xdoctest: +IGNORE_WANT("non-deterministic")
        >>> m = nn.ConstantPad2d(2, 3.5)
        >>> input = torch.randn(1, 2, 2)
        >>> input
        tensor([[[ 1.6585,  0.4320],
                 [-0.8701, -0.4649]]])
        >>> m(input)
        tensor([[[ 3.5000,  3.5000,  3.5000,  3.5000,  3.5000,  3.5000],
                 [ 3.5000,  3.5000,  3.5000,  3.5000,  3.5000,  3.5000],
                 [ 3.5000,  3.5000,  1.6585,  0.4320,  3.5000,  3.5000],
                 [ 3.5000,  3.5000, -0.8701, -0.4649,  3.5000,  3.5000],
                 [ 3.5000,  3.5000,  3.5000,  3.5000,  3.5000,  3.5000],
                 [ 3.5000,  3.5000,  3.5000,  3.5000,  3.5000,  3.5000]]])
        >>> # using different paddings for different sides
        >>> m = nn.ConstantPad2d((3, 0, 2, 1), 3.5)
        >>> m(input)
        tensor([[[ 3.5000,  3.5000,  3.5000,  3.5000,  3.5000],
                 [ 3.5000,  3.5000,  3.5000,  3.5000,  3.5000],
                 [ 3.5000,  3.5000,  3.5000,  1.6585,  0.4320],
                 [ 3.5000,  3.5000,  3.5000, -0.8701, -0.4649],
                 [ 3.5000,  3.5000,  3.5000,  3.5000,  3.5000]]])
    """

    __constants__ = ["padding", "value"]
    padding: Tuple[int, int, int, int]

    def __init__(self, padding: _size_4_t, value: float) -> None:
        super().__init__(value=value)
        # An int is expanded to (left, right, top, bottom).
        self.padding = _quadruple(padding)
class ConstantPad3d(_ConstantPadNd):
    r"""Pads the input tensor boundaries with a constant value.

    For `N`-dimensional padding, use :func:`torch.nn.functional.pad()`.

    Args:
        padding (int, tuple): the size of the padding. If is `int`, uses the same
            padding in all boundaries. If a 6-`tuple`, uses
            (:math:`\text{padding\_left}`, :math:`\text{padding\_right}`,
            :math:`\text{padding\_top}`, :math:`\text{padding\_bottom}`,
            :math:`\text{padding\_front}`, :math:`\text{padding\_back}`)
        value (float): the constant written into the padded positions

    Shape:
        - Input: :math:`(N, C, D_{in}, H_{in}, W_{in})` or :math:`(C, D_{in}, H_{in}, W_{in})`.
        - Output: :math:`(N, C, D_{out}, H_{out}, W_{out})` or
          :math:`(C, D_{out}, H_{out}, W_{out})`, where

          :math:`D_{out} = D_{in} + \text{padding\_front} + \text{padding\_back}`

          :math:`H_{out} = H_{in} + \text{padding\_top} + \text{padding\_bottom}`

          :math:`W_{out} = W_{in} + \text{padding\_left} + \text{padding\_right}`

    Examples::

        >>> m = nn.ConstantPad3d(3, 3.5)
        >>> input = torch.randn(16, 3, 10, 20, 30)
        >>> output = m(input)
        >>> # using different paddings for different sides
        >>> m = nn.ConstantPad3d((3, 3, 6, 6, 0, 1), 3.5)
        >>> output = m(input)
    """

    padding: Tuple[int, int, int, int, int, int]

    def __init__(self, padding: _size_6_t, value: float) -> None:
        super().__init__(value=value)
        # An int is expanded to (left, right, top, bottom, front, back).
        self.padding = _ntuple(6)(padding)
class _ReflectionPadNd(Module):
    """Common base for the reflection padding modules.

    Pads with ``F.pad`` in ``"reflect"`` mode using ``self.padding``, which
    subclasses are expected to assign.
    """

    __constants__ = ["padding"]
    padding: Sequence[int]

    def forward(self, input: Tensor) -> Tensor:
        return F.pad(input, self.padding, mode="reflect")

    def extra_repr(self) -> str:
        return str(self.padding)
class ReflectionPad1d(_ReflectionPadNd):
    r"""Pads the input tensor using the reflection of the input boundary.

    For `N`-dimensional padding, use :func:`torch.nn.functional.pad()`.

    Args:
        padding (int, tuple): the size of the padding. If is `int`, uses the same
            padding in all boundaries. If a 2-`tuple`, uses
            (:math:`\text{padding\_left}`, :math:`\text{padding\_right}`)

    Shape:
        - Input: :math:`(C, W_{in})` or :math:`(N, C, W_{in})`.
        - Output: :math:`(C, W_{out})` or :math:`(N, C, W_{out})`, where

          :math:`W_{out} = W_{in} + \text{padding\_left} + \text{padding\_right}`

    Examples::

        >>> m = nn.ReflectionPad1d(2)
        >>> # xdoctest: +IGNORE_WANT("other tests seem to modify printing styles")
        >>> input = torch.arange(8, dtype=torch.float).reshape(1, 2, 4)
        >>> input
        tensor([[[0., 1., 2., 3.],
                 [4., 5., 6., 7.]]])
        >>> m(input)
        tensor([[[2., 1., 0., 1., 2., 3., 2., 1.],
                 [6., 5., 4., 5., 6., 7., 6., 5.]]])
        >>> # using different paddings for different sides
        >>> m = nn.ReflectionPad1d((3, 1))
        >>> m(input)
        tensor([[[3., 2., 1., 0., 1., 2., 3., 2.],
                 [7., 6., 5., 4., 5., 6., 7., 6.]]])
    """

    padding: Tuple[int, int]

    def __init__(self, padding: _size_2_t) -> None:
        super().__init__()
        # An int is expanded to (left, right); a 2-tuple is kept as given.
        self.padding = _pair(padding)
class ReflectionPad2d(_ReflectionPadNd):
    r"""Pads the input tensor using the reflection of the input boundary.

    For `N`-dimensional padding, use :func:`torch.nn.functional.pad()`.

    Args:
        padding (int, tuple): the size of the padding. If is `int`, uses the same
            padding in all boundaries. If a 4-`tuple`, uses (:math:`\text{padding\_left}`,
            :math:`\text{padding\_right}`, :math:`\text{padding\_top}`, :math:`\text{padding\_bottom}`)
            Note that padding size should be less than the corresponding input dimension.

    Shape:
        - Input: :math:`(N, C, H_{in}, W_{in})` or :math:`(C, H_{in}, W_{in})`.
        - Output: :math:`(N, C, H_{out}, W_{out})` or :math:`(C, H_{out}, W_{out})` where

          :math:`H_{out} = H_{in} + \text{padding\_top} + \text{padding\_bottom}`

          :math:`W_{out} = W_{in} + \text{padding\_left} + \text{padding\_right}`

    Examples::

        >>> # xdoctest: +IGNORE_WANT("not sure why xdoctest is choking on this")
        >>> m = nn.ReflectionPad2d(2)
        >>> input = torch.arange(9, dtype=torch.float).reshape(1, 1, 3, 3)
        >>> input
        tensor([[[[0., 1., 2.],
                  [3., 4., 5.],
                  [6., 7., 8.]]]])
        >>> m(input)
        tensor([[[[8., 7., 6., 7., 8., 7., 6.],
                  [5., 4., 3., 4., 5., 4., 3.],
                  [2., 1., 0., 1., 2., 1., 0.],
                  [5., 4., 3., 4., 5., 4., 3.],
                  [8., 7., 6., 7., 8., 7., 6.],
                  [5., 4., 3., 4., 5., 4., 3.],
                  [2., 1., 0., 1., 2., 1., 0.]]]])
        >>> # using different paddings for different sides
        >>> m = nn.ReflectionPad2d((1, 1, 2, 0))
        >>> m(input)
        tensor([[[[7., 6., 7., 8., 7.],
                  [4., 3., 4., 5., 4.],
                  [1., 0., 1., 2., 1.],
                  [4., 3., 4., 5., 4.],
                  [7., 6., 7., 8., 7.]]]])
    """

    padding: Tuple[int, int, int, int]

    def __init__(self, padding: _size_4_t) -> None:
        super().__init__()
        # An int is expanded to (left, right, top, bottom).
        self.padding = _quadruple(padding)
class ReflectionPad3d(_ReflectionPadNd):
    r"""Pads the input tensor using the reflection of the input boundary.

    For `N`-dimensional padding, use :func:`torch.nn.functional.pad()`.

    Args:
        padding (int, tuple): the size of the padding. If is `int`, uses the same
            padding in all boundaries. If a 6-`tuple`, uses
            (:math:`\text{padding\_left}`, :math:`\text{padding\_right}`,
            :math:`\text{padding\_top}`, :math:`\text{padding\_bottom}`,
            :math:`\text{padding\_front}`, :math:`\text{padding\_back}`)

    Shape:
        - Input: :math:`(N, C, D_{in}, H_{in}, W_{in})` or :math:`(C, D_{in}, H_{in}, W_{in})`.
        - Output: :math:`(N, C, D_{out}, H_{out}, W_{out})` or :math:`(C, D_{out}, H_{out}, W_{out})`,
          where

          :math:`D_{out} = D_{in} + \text{padding\_front} + \text{padding\_back}`

          :math:`H_{out} = H_{in} + \text{padding\_top} + \text{padding\_bottom}`

          :math:`W_{out} = W_{in} + \text{padding\_left} + \text{padding\_right}`

    Examples::

        >>> # xdoctest: +IGNORE_WANT("not sure why xdoctest is choking on this")
        >>> m = nn.ReflectionPad3d(1)
        >>> input = torch.arange(8, dtype=torch.float).reshape(1, 1, 2, 2, 2)
        >>> m(input)
        tensor([[[[[7., 6., 7., 6.],
                   [5., 4., 5., 4.],
                   [7., 6., 7., 6.],
                   [5., 4., 5., 4.]],

                  [[3., 2., 3., 2.],
                   [1., 0., 1., 0.],
                   [3., 2., 3., 2.],
                   [1., 0., 1., 0.]],

                  [[7., 6., 7., 6.],
                   [5., 4., 5., 4.],
                   [7., 6., 7., 6.],
                   [5., 4., 5., 4.]],

                  [[3., 2., 3., 2.],
                   [1., 0., 1., 0.],
                   [3., 2., 3., 2.],
                   [1., 0., 1., 0.]]]]])
    """

    padding: Tuple[int, int, int, int, int, int]

    def __init__(self, padding: _size_6_t) -> None:
        super().__init__()
        # An int is expanded to (left, right, top, bottom, front, back).
        self.padding = _ntuple(6)(padding)
class _ReplicationPadNd(Module):
    """Common base for the replication padding modules.

    Pads with ``F.pad`` in ``"replicate"`` mode using ``self.padding``, which
    subclasses are expected to assign.
    """

    __constants__ = ["padding"]
    padding: Sequence[int]

    def forward(self, input: Tensor) -> Tensor:
        return F.pad(input, self.padding, mode="replicate")

    def extra_repr(self) -> str:
        return str(self.padding)
class ReplicationPad1d(_ReplicationPadNd):
    r"""Pads the input tensor using replication of the input boundary.

    For `N`-dimensional padding, use :func:`torch.nn.functional.pad()`.

    Args:
        padding (int, tuple): the size of the padding. If is `int`, uses the same
            padding in all boundaries. If a 2-`tuple`, uses
            (:math:`\text{padding\_left}`, :math:`\text{padding\_right}`)

    Shape:
        - Input: :math:`(C, W_{in})` or :math:`(N, C, W_{in})`.
        - Output: :math:`(C, W_{out})` or :math:`(N, C, W_{out})`, where

          :math:`W_{out} = W_{in} + \text{padding\_left} + \text{padding\_right}`

    Examples::

        >>> # xdoctest: +IGNORE_WANT("not sure why xdoctest is choking on this")
        >>> m = nn.ReplicationPad1d(2)
        >>> input = torch.arange(8, dtype=torch.float).reshape(1, 2, 4)
        >>> input
        tensor([[[0., 1., 2., 3.],
                 [4., 5., 6., 7.]]])
        >>> m(input)
        tensor([[[0., 0., 0., 1., 2., 3., 3., 3.],
                 [4., 4., 4., 5., 6., 7., 7., 7.]]])
        >>> # using different paddings for different sides
        >>> m = nn.ReplicationPad1d((3, 1))
        >>> m(input)
        tensor([[[0., 0., 0., 0., 1., 2., 3., 3.],
                 [4., 4., 4., 4., 5., 6., 7., 7.]]])
    """

    padding: Tuple[int, int]

    def __init__(self, padding: _size_2_t) -> None:
        super().__init__()
        # An int is expanded to (left, right); a 2-tuple is kept as given.
        self.padding = _pair(padding)
class ReplicationPad2d(_ReplicationPadNd):
    r"""Pads the input tensor using replication of the input boundary.

    For `N`-dimensional padding, use :func:`torch.nn.functional.pad()`.

    Args:
        padding (int, tuple): the size of the padding. If is `int`, uses the same
            padding in all boundaries. If a 4-`tuple`, uses (:math:`\text{padding\_left}`,
            :math:`\text{padding\_right}`, :math:`\text{padding\_top}`, :math:`\text{padding\_bottom}`)

    Shape:
        - Input: :math:`(N, C, H_{in}, W_{in})` or :math:`(C, H_{in}, W_{in})`.
        - Output: :math:`(N, C, H_{out}, W_{out})` or :math:`(C, H_{out}, W_{out})`, where

          :math:`H_{out} = H_{in} + \text{padding\_top} + \text{padding\_bottom}`

          :math:`W_{out} = W_{in} + \text{padding\_left} + \text{padding\_right}`

    Examples::

        >>> m = nn.ReplicationPad2d(2)
        >>> # xdoctest: +IGNORE_WANT("non-deterministic")
        >>> input = torch.arange(9, dtype=torch.float).reshape(1, 1, 3, 3)
        >>> input
        tensor([[[[0., 1., 2.],
                  [3., 4., 5.],
                  [6., 7., 8.]]]])
        >>> m(input)
        tensor([[[[0., 0., 0., 1., 2., 2., 2.],
                  [0., 0., 0., 1., 2., 2., 2.],
                  [0., 0., 0., 1., 2., 2., 2.],
                  [3., 3., 3., 4., 5., 5., 5.],
                  [6., 6., 6., 7., 8., 8., 8.],
                  [6., 6., 6., 7., 8., 8., 8.],
                  [6., 6., 6., 7., 8., 8., 8.]]]])
        >>> # using different paddings for different sides
        >>> m = nn.ReplicationPad2d((1, 1, 2, 0))
        >>> m(input)
        tensor([[[[0., 0., 1., 2., 2.],
                  [0., 0., 1., 2., 2.],
                  [0., 0., 1., 2., 2.],
                  [3., 3., 4., 5., 5.],
                  [6., 6., 7., 8., 8.]]]])
    """

    padding: Tuple[int, int, int, int]

    def __init__(self, padding: _size_4_t) -> None:
        super().__init__()
        # An int is expanded to (left, right, top, bottom).
        self.padding = _quadruple(padding)
class ReplicationPad3d(_ReplicationPadNd):
    r"""Pads the input tensor using replication of the input boundary.

    For `N`-dimensional padding, use :func:`torch.nn.functional.pad()`.

    Args:
        padding (int, tuple): the size of the padding. If is `int`, uses the same
            padding in all boundaries. If a 6-`tuple`, uses
            (:math:`\text{padding\_left}`, :math:`\text{padding\_right}`,
            :math:`\text{padding\_top}`, :math:`\text{padding\_bottom}`,
            :math:`\text{padding\_front}`, :math:`\text{padding\_back}`)

    Shape:
        - Input: :math:`(N, C, D_{in}, H_{in}, W_{in})` or :math:`(C, D_{in}, H_{in}, W_{in})`.
        - Output: :math:`(N, C, D_{out}, H_{out}, W_{out})` or :math:`(C, D_{out}, H_{out}, W_{out})`,
          where

          :math:`D_{out} = D_{in} + \text{padding\_front} + \text{padding\_back}`

          :math:`H_{out} = H_{in} + \text{padding\_top} + \text{padding\_bottom}`

          :math:`W_{out} = W_{in} + \text{padding\_left} + \text{padding\_right}`

    Examples::

        >>> # xdoctest: +IGNORE_WANT("non-deterministic")
        >>> m = nn.ReplicationPad3d(3)
        >>> input = torch.randn(16, 3, 8, 320, 480)
        >>> output = m(input)
        >>> # using different paddings for different sides
        >>> m = nn.ReplicationPad3d((3, 3, 6, 6, 1, 1))
        >>> output = m(input)
    """

    padding: Tuple[int, int, int, int, int, int]

    def __init__(self, padding: _size_6_t) -> None:
        super().__init__()
        # An int is expanded to (left, right, top, bottom, front, back).
        self.padding = _ntuple(6)(padding)
class ZeroPad1d(ConstantPad1d):
    r"""Pads the input tensor boundaries with zero.

    For `N`-dimensional padding, use :func:`torch.nn.functional.pad()`.

    Args:
        padding (int, tuple): the size of the padding. If is `int`, uses the same
            padding in both boundaries. If a 2-`tuple`, uses
            (:math:`\text{padding\_left}`, :math:`\text{padding\_right}`)

    Shape:
        - Input: :math:`(C, W_{in})` or :math:`(N, C, W_{in})`.
        - Output: :math:`(C, W_{out})` or :math:`(N, C, W_{out})`, where

          :math:`W_{out} = W_{in} + \text{padding\_left} + \text{padding\_right}`

    Examples::

        >>> # xdoctest: +IGNORE_WANT("non-deterministic")
        >>> m = nn.ZeroPad1d(2)
        >>> input = torch.randn(1, 2, 4)
        >>> input
        tensor([[[-1.0491, -0.7152, -0.0749,  0.8530],
                 [-1.3287,  1.8966,  0.1466, -0.2771]]])
        >>> m(input)
        tensor([[[ 0.0000,  0.0000, -1.0491, -0.7152, -0.0749,  0.8530,  0.0000,
                   0.0000],
                 [ 0.0000,  0.0000, -1.3287,  1.8966,  0.1466, -0.2771,  0.0000,
                   0.0000]]])
        >>> m = nn.ZeroPad1d(2)
        >>> input = torch.randn(1, 2, 3)
        >>> input
        tensor([[[ 1.6616,  1.4523, -1.1255],
                 [-3.6372,  0.1182, -1.8652]]])
        >>> m(input)
        tensor([[[ 0.0000,  0.0000,  1.6616,  1.4523, -1.1255,  0.0000,  0.0000],
                 [ 0.0000,  0.0000, -3.6372,  0.1182, -1.8652,  0.0000,  0.0000]]])
        >>> # using different paddings for different sides
        >>> m = nn.ZeroPad1d((3, 1))
        >>> m(input)
        tensor([[[ 0.0000,  0.0000,  0.0000,  1.6616,  1.4523, -1.1255,  0.0000],
                 [ 0.0000,  0.0000,  0.0000, -3.6372,  0.1182, -1.8652,  0.0000]]])
    """

    padding: Tuple[int, int]

    def __init__(self, padding: _size_2_t) -> None:
        # Zero padding is constant padding with the fill value fixed at 0.0.
        super().__init__(padding, value=0.0)

    def extra_repr(self) -> str:
        # Only the padding is shown; the fill value is always zero.
        return str(self.padding)
class ZeroPad2d(ConstantPad2d):
    r"""Pads the input tensor boundaries with zero.

    For `N`-dimensional padding, use :func:`torch.nn.functional.pad()`.

    Args:
        padding (int, tuple): the size of the padding. If is `int`, uses the same
            padding in all boundaries. If a 4-`tuple`, uses (:math:`\text{padding\_left}`,
            :math:`\text{padding\_right}`, :math:`\text{padding\_top}`, :math:`\text{padding\_bottom}`)

    Shape:
        - Input: :math:`(N, C, H_{in}, W_{in})` or :math:`(C, H_{in}, W_{in})`.
        - Output: :math:`(N, C, H_{out}, W_{out})` or :math:`(C, H_{out}, W_{out})`, where

          :math:`H_{out} = H_{in} + \text{padding\_top} + \text{padding\_bottom}`

          :math:`W_{out} = W_{in} + \text{padding\_left} + \text{padding\_right}`

    Examples::

        >>> # xdoctest: +IGNORE_WANT("non-deterministic")
        >>> m = nn.ZeroPad2d(2)
        >>> input = torch.randn(1, 1, 3, 3)
        >>> input
        tensor([[[[-0.1678, -0.4418,  1.9466],
                  [ 0.9604, -0.4219, -0.5241],
                  [-0.9162, -0.5436, -0.6446]]]])
        >>> m(input)
        tensor([[[[ 0.0000,  0.0000,  0.0000,  0.0000,  0.0000,  0.0000,  0.0000],
                  [ 0.0000,  0.0000,  0.0000,  0.0000,  0.0000,  0.0000,  0.0000],
                  [ 0.0000,  0.0000, -0.1678, -0.4418,  1.9466,  0.0000,  0.0000],
                  [ 0.0000,  0.0000,  0.9604, -0.4219, -0.5241,  0.0000,  0.0000],
                  [ 0.0000,  0.0000, -0.9162, -0.5436, -0.6446,  0.0000,  0.0000],
                  [ 0.0000,  0.0000,  0.0000,  0.0000,  0.0000,  0.0000,  0.0000],
                  [ 0.0000,  0.0000,  0.0000,  0.0000,  0.0000,  0.0000,  0.0000]]]])
        >>> # using different paddings for different sides
        >>> m = nn.ZeroPad2d((1, 1, 2, 0))
        >>> m(input)
        tensor([[[[ 0.0000,  0.0000,  0.0000,  0.0000,  0.0000],
                  [ 0.0000,  0.0000,  0.0000,  0.0000,  0.0000],
                  [ 0.0000, -0.1678, -0.4418,  1.9466,  0.0000],
                  [ 0.0000,  0.9604, -0.4219, -0.5241,  0.0000],
                  [ 0.0000, -0.9162, -0.5436, -0.6446,  0.0000]]]])
    """

    padding: Tuple[int, int, int, int]

    def __init__(self, padding: _size_4_t) -> None:
        # Zero padding is constant padding with the fill value fixed at 0.0.
        super().__init__(padding, value=0.0)

    def extra_repr(self) -> str:
        # Only the padding is shown; the fill value is always zero.
        return str(self.padding)
class ZeroPad3d(ConstantPad3d):
    r"""Pads the input tensor boundaries with zero.

    For `N`-dimensional padding, use :func:`torch.nn.functional.pad()`.

    Args:
        padding (int, tuple): the size of the padding. If is `int`, uses the same
            padding in all boundaries. If a 6-`tuple`, uses
            (:math:`\text{padding\_left}`, :math:`\text{padding\_right}`,
            :math:`\text{padding\_top}`, :math:`\text{padding\_bottom}`,
            :math:`\text{padding\_front}`, :math:`\text{padding\_back}`)

    Shape:
        - Input: :math:`(N, C, D_{in}, H_{in}, W_{in})` or :math:`(C, D_{in}, H_{in}, W_{in})`.
        - Output: :math:`(N, C, D_{out}, H_{out}, W_{out})` or
          :math:`(C, D_{out}, H_{out}, W_{out})`, where

          :math:`D_{out} = D_{in} + \text{padding\_front} + \text{padding\_back}`

          :math:`H_{out} = H_{in} + \text{padding\_top} + \text{padding\_bottom}`

          :math:`W_{out} = W_{in} + \text{padding\_left} + \text{padding\_right}`

    Examples::

        >>> m = nn.ZeroPad3d(3)
        >>> input = torch.randn(16, 3, 10, 20, 30)
        >>> output = m(input)
        >>> # using different paddings for different sides
        >>> m = nn.ZeroPad3d((3, 3, 6, 6, 0, 1))
        >>> output = m(input)
    """

    padding: Tuple[int, int, int, int, int, int]

    def __init__(self, padding: _size_6_t) -> None:
        # Zero padding is constant padding with the fill value fixed at 0.0.
        super().__init__(padding, value=0.0)

    def extra_repr(self) -> str:
        # Only the padding is shown; the fill value is always zero.
        return str(self.padding)

View File

@ -0,0 +1,115 @@
import torch.nn.functional as F
from torch import Tensor
from .module import Module
# Public names exported by this module.
__all__ = ["PixelShuffle", "PixelUnshuffle"]
class PixelShuffle(Module):
    r"""Rearrange elements in a tensor according to an upscaling factor.

    Takes a tensor of shape :math:`(*, C \times r^2, H, W)` and rearranges its
    elements into a tensor of shape :math:`(*, C, H \times r, W \times r)`,
    where r is an upscale factor.

    This is useful for implementing efficient sub-pixel convolution
    with a stride of :math:`1/r`.

    See the paper:
    `Real-Time Single Image and Video Super-Resolution Using an Efficient Sub-Pixel Convolutional Neural Network`_
    by Shi et al. (2016) for more details.

    Args:
        upscale_factor (int): factor to increase spatial resolution by

    Shape:
        - Input: :math:`(*, C_{in}, H_{in}, W_{in})`, where * is zero or more batch dimensions
        - Output: :math:`(*, C_{out}, H_{out}, W_{out})`, where

    .. math::
        C_{out} = C_{in} \div \text{upscale\_factor}^2

    .. math::
        H_{out} = H_{in} \times \text{upscale\_factor}

    .. math::
        W_{out} = W_{in} \times \text{upscale\_factor}

    Examples::

        >>> pixel_shuffle = nn.PixelShuffle(3)
        >>> input = torch.randn(1, 9, 4, 4)
        >>> output = pixel_shuffle(input)
        >>> print(output.size())
        torch.Size([1, 1, 12, 12])

    .. _Real-Time Single Image and Video Super-Resolution Using an Efficient Sub-Pixel Convolutional Neural Network:
        https://arxiv.org/abs/1609.05158
    """

    __constants__ = ["upscale_factor"]
    upscale_factor: int

    def __init__(self, upscale_factor: int) -> None:
        super().__init__()
        self.upscale_factor = upscale_factor

    def forward(self, input: Tensor) -> Tensor:
        return F.pixel_shuffle(input, upscale_factor=self.upscale_factor)

    def extra_repr(self) -> str:
        return "upscale_factor={}".format(self.upscale_factor)
class PixelUnshuffle(Module):
    r"""Reverse the PixelShuffle operation.

    Undoes :class:`~torch.nn.PixelShuffle`: a tensor of shape
    :math:`(*, C, H \times r, W \times r)` is rearranged into a tensor of shape
    :math:`(*, C \times r^2, H, W)`, where r is a downscale factor.

    See the paper:
    `Real-Time Single Image and Video Super-Resolution Using an Efficient Sub-Pixel Convolutional Neural Network`_
    by Shi et al. (2016) for more details.

    Args:
        downscale_factor (int): factor to decrease spatial resolution by

    Shape:
        - Input: :math:`(*, C_{in}, H_{in}, W_{in})`, where * is zero or more batch dimensions
        - Output: :math:`(*, C_{out}, H_{out}, W_{out})`, where

    .. math::
        C_{out} = C_{in} \times \text{downscale\_factor}^2

    .. math::
        H_{out} = H_{in} \div \text{downscale\_factor}

    .. math::
        W_{out} = W_{in} \div \text{downscale\_factor}

    Examples::

        >>> pixel_unshuffle = nn.PixelUnshuffle(3)
        >>> input = torch.randn(1, 1, 12, 12)
        >>> output = pixel_unshuffle(input)
        >>> print(output.size())
        torch.Size([1, 9, 4, 4])

    .. _Real-Time Single Image and Video Super-Resolution Using an Efficient Sub-Pixel Convolutional Neural Network:
        https://arxiv.org/abs/1609.05158
    """

    __constants__ = ["downscale_factor"]
    downscale_factor: int

    def __init__(self, downscale_factor: int) -> None:
        super().__init__()
        self.downscale_factor = downscale_factor

    def forward(self, input: Tensor) -> Tensor:
        return F.pixel_unshuffle(input, downscale_factor=self.downscale_factor)

    def extra_repr(self) -> str:
        return "downscale_factor={}".format(self.downscale_factor)

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff

View File

@ -0,0 +1,546 @@
# mypy: allow-untyped-defs
from typing import Optional
import torch
from torch import Tensor
from torch.nn import functional as F, init
from torch.nn.parameter import Parameter
from .module import Module
# Public names exported by this module.
__all__ = ["Embedding", "EmbeddingBag"]
class Embedding(Module):
    r"""A simple lookup table that stores embeddings of a fixed dictionary and size.

    This module is often used to store word embeddings and retrieve them using indices.
    The input to the module is a list of indices, and the output is the corresponding
    word embeddings.

    Args:
        num_embeddings (int): size of the dictionary of embeddings
        embedding_dim (int): the size of each embedding vector
        padding_idx (int, optional): If specified, the entries at :attr:`padding_idx` do not contribute to the gradient;
            therefore, the embedding vector at :attr:`padding_idx` is not updated during training,
            i.e. it remains as a fixed "pad". For a newly constructed Embedding,
            the embedding vector at :attr:`padding_idx` will default to all zeros,
            but can be updated to another value to be used as the padding vector.
        max_norm (float, optional): If given, each embedding vector with norm larger than :attr:`max_norm`
            is renormalized to have norm :attr:`max_norm`.
        norm_type (float, optional): The p of the p-norm to compute for the :attr:`max_norm` option. Default ``2``.
        scale_grad_by_freq (bool, optional): If given, this will scale gradients by the inverse of frequency of
            the words in the mini-batch. Default ``False``.
        sparse (bool, optional): If ``True``, gradient w.r.t. :attr:`weight` matrix will be a sparse tensor.
            See Notes for more details regarding sparse gradients.

    Attributes:
        weight (Tensor): the learnable weights of the module of shape (num_embeddings, embedding_dim)
            initialized from :math:`\mathcal{N}(0, 1)`

    Shape:
        - Input: :math:`(*)`, IntTensor or LongTensor of arbitrary shape containing the indices to extract
        - Output: :math:`(*, H)`, where `*` is the input shape and :math:`H=\text{embedding\_dim}`

    .. note::
        Keep in mind that only a limited number of optimizers support
        sparse gradients: currently it's :class:`optim.SGD` (`CUDA` and `CPU`),
        :class:`optim.SparseAdam` (`CUDA` and `CPU`) and :class:`optim.Adagrad` (`CPU`)

    .. note::
        When :attr:`max_norm` is not ``None``, :class:`Embedding`'s forward method will modify the
        :attr:`weight` tensor in-place. Since tensors needed for gradient computations cannot be
        modified in-place, performing a differentiable operation on ``Embedding.weight`` before
        calling :class:`Embedding`'s forward method requires cloning ``Embedding.weight`` when
        :attr:`max_norm` is not ``None``. For example::

            n, d, m = 3, 5, 7
            embedding = nn.Embedding(n, d, max_norm=1.0)
            W = torch.randn((m, d), requires_grad=True)
            idx = torch.tensor([1, 2])
            a = embedding.weight.clone() @ W.t()  # weight must be cloned for this to be differentiable
            b = embedding(idx) @ W.t()  # modifies weight in-place
            out = (a.unsqueeze(0) + b.unsqueeze(1))
            loss = out.sigmoid().prod()
            loss.backward()

    Examples::

        >>> # an Embedding module containing 10 tensors of size 3
        >>> embedding = nn.Embedding(10, 3)
        >>> # a batch of 2 samples of 4 indices each
        >>> input = torch.LongTensor([[1, 2, 4, 5], [4, 3, 2, 9]])
        >>> # xdoctest: +IGNORE_WANT("non-deterministic")
        >>> embedding(input)
        tensor([[[-0.0251, -1.6902, 0.7172],
                 [-0.6431, 0.0748, 0.6969],
                 [ 1.4970, 1.3448, -0.9685],
                 [-0.3677, -2.7265, -0.1685]],

                [[ 1.4970, 1.3448, -0.9685],
                 [ 0.4362, -0.4004, 0.9400],
                 [-0.6431, 0.0748, 0.6969],
                 [ 0.9124, -2.3616, 1.1151]]])


        >>> # example with padding_idx
        >>> embedding = nn.Embedding(10, 3, padding_idx=0)
        >>> input = torch.LongTensor([[0, 2, 0, 5]])
        >>> embedding(input)
        tensor([[[ 0.0000, 0.0000, 0.0000],
                 [ 0.1535, -2.0309, 0.9315],
                 [ 0.0000, 0.0000, 0.0000],
                 [-0.1655, 0.9897, 0.0635]]])

        >>> # example of changing `pad` vector
        >>> padding_idx = 0
        >>> embedding = nn.Embedding(3, 3, padding_idx=padding_idx)
        >>> embedding.weight
        Parameter containing:
        tensor([[ 0.0000, 0.0000, 0.0000],
                [-0.7895, -0.7089, -0.0364],
                [ 0.6778, 0.5803, 0.2678]], requires_grad=True)
        >>> with torch.no_grad():
        ...     embedding.weight[padding_idx] = torch.ones(3)
        >>> embedding.weight
        Parameter containing:
        tensor([[ 1.0000, 1.0000, 1.0000],
                [-0.7895, -0.7089, -0.0364],
                [ 0.6778, 0.5803, 0.2678]], requires_grad=True)
    """

    __constants__ = [
        "num_embeddings",
        "embedding_dim",
        "padding_idx",
        "max_norm",
        "norm_type",
        "scale_grad_by_freq",
        "sparse",
    ]

    num_embeddings: int
    embedding_dim: int
    padding_idx: Optional[int]
    max_norm: Optional[float]
    norm_type: float
    scale_grad_by_freq: bool
    weight: Tensor
    freeze: bool
    sparse: bool

    def __init__(
        self,
        num_embeddings: int,
        embedding_dim: int,
        padding_idx: Optional[int] = None,
        max_norm: Optional[float] = None,
        norm_type: float = 2.0,
        scale_grad_by_freq: bool = False,
        sparse: bool = False,
        _weight: Optional[Tensor] = None,
        _freeze: bool = False,
        device=None,
        dtype=None,
    ) -> None:
        factory_kwargs = {"device": device, "dtype": dtype}
        super().__init__()
        self.num_embeddings = num_embeddings
        self.embedding_dim = embedding_dim

        # Validate padding_idx and fold a negative index into [0, num_embeddings).
        if padding_idx is not None:
            out_of_range_msg = "Padding_idx must be within num_embeddings"
            if padding_idx > 0:
                assert padding_idx < self.num_embeddings, out_of_range_msg
            elif padding_idx < 0:
                assert padding_idx >= -self.num_embeddings, out_of_range_msg
                padding_idx = self.num_embeddings + padding_idx
        self.padding_idx = padding_idx

        self.max_norm = max_norm
        self.norm_type = norm_type
        self.scale_grad_by_freq = scale_grad_by_freq

        trainable = not _freeze
        if _weight is not None:
            # A caller-supplied weight is wrapped as-is; it is not re-initialized
            # and its padding row is left untouched.
            expected_shape = [num_embeddings, embedding_dim]
            assert (
                list(_weight.shape) == expected_shape
            ), "Shape of weight does not match num_embeddings and embedding_dim"
            self.weight = Parameter(_weight, requires_grad=trainable)
        else:
            fresh = torch.empty((num_embeddings, embedding_dim), **factory_kwargs)
            self.weight = Parameter(fresh, requires_grad=trainable)
            self.reset_parameters()

        self.sparse = sparse

    def reset_parameters(self) -> None:
        """Re-draw the weight with ``init.normal_`` and zero the padding row, if any."""
        init.normal_(self.weight)
        self._fill_padding_idx_with_zero()

    def _fill_padding_idx_with_zero(self) -> None:
        """Zero the row at ``padding_idx`` in place, outside of autograd tracking."""
        if self.padding_idx is None:
            return
        with torch.no_grad():
            self.weight[self.padding_idx].fill_(0)

    def forward(self, input: Tensor) -> Tensor:
        """Look up ``input`` in ``weight`` via ``F.embedding`` using the stored options."""
        return F.embedding(
            input,
            self.weight,
            self.padding_idx,
            self.max_norm,
            self.norm_type,
            self.scale_grad_by_freq,
            self.sparse,
        )

    def extra_repr(self) -> str:
        """Return the constructor-style summary; non-default options only."""
        pieces = ["{num_embeddings}, {embedding_dim}"]
        if self.padding_idx is not None:
            pieces.append("padding_idx={padding_idx}")
        if self.max_norm is not None:
            pieces.append("max_norm={max_norm}")
        if self.norm_type != 2:
            pieces.append("norm_type={norm_type}")
        if self.scale_grad_by_freq is not False:
            pieces.append("scale_grad_by_freq={scale_grad_by_freq}")
        if self.sparse is not False:
            pieces.append("sparse=True")
        template = ", ".join(pieces)
        return template.format(**self.__dict__)

    @classmethod
    def from_pretrained(
        cls,
        embeddings,
        freeze=True,
        padding_idx=None,
        max_norm=None,
        norm_type=2.0,
        scale_grad_by_freq=False,
        sparse=False,
    ):
        r"""Create Embedding instance from given 2-dimensional FloatTensor.

        Args:
            embeddings (Tensor): FloatTensor containing weights for the Embedding.
                First dimension is being passed to Embedding as ``num_embeddings``, second as ``embedding_dim``.
            freeze (bool, optional): If ``True``, the tensor does not get updated in the learning process.
                Equivalent to ``embedding.weight.requires_grad = False``. Default: ``True``
            padding_idx (int, optional): If specified, the entries at :attr:`padding_idx` do not contribute to the gradient;
                therefore, the embedding vector at :attr:`padding_idx` is not updated during training,
                i.e. it remains as a fixed "pad".
            max_norm (float, optional): See module initialization documentation.
            norm_type (float, optional): See module initialization documentation. Default ``2``.
            scale_grad_by_freq (bool, optional): See module initialization documentation. Default ``False``.
            sparse (bool, optional): See module initialization documentation.

        Examples::

            >>> # FloatTensor containing pretrained weights
            >>> weight = torch.FloatTensor([[1, 2.3, 3], [4, 5.1, 6.3]])
            >>> embedding = nn.Embedding.from_pretrained(weight)
            >>> # Get embeddings for index 1
            >>> input = torch.LongTensor([1])
            >>> # xdoctest: +IGNORE_WANT("non-deterministic")
            >>> embedding(input)
            tensor([[ 4.0000, 5.1000, 6.3000]])
        """
        assert (
            embeddings.dim() == 2
        ), "Embeddings parameter is expected to be 2-dimensional"
        n_rows, n_cols = embeddings.shape
        return cls(
            num_embeddings=n_rows,
            embedding_dim=n_cols,
            _weight=embeddings,
            _freeze=freeze,
            padding_idx=padding_idx,
            max_norm=max_norm,
            norm_type=norm_type,
            scale_grad_by_freq=scale_grad_by_freq,
            sparse=sparse,
        )
class EmbeddingBag(Module):
    r"""Compute sums or means of 'bags' of embeddings, without instantiating the intermediate embeddings.

    For bags of constant length, no :attr:`per_sample_weights`, no indices equal to :attr:`padding_idx`,
    and with 2D inputs, this class

        * with ``mode="sum"`` is equivalent to :class:`~torch.nn.Embedding` followed by ``torch.sum(dim=1)``,
        * with ``mode="mean"`` is equivalent to :class:`~torch.nn.Embedding` followed by ``torch.mean(dim=1)``,
        * with ``mode="max"`` is equivalent to :class:`~torch.nn.Embedding` followed by ``torch.max(dim=1)``.

    However, :class:`~torch.nn.EmbeddingBag` is much more time and memory efficient than using a chain of these
    operations.

    EmbeddingBag also supports per-sample weights as an argument to the forward
    pass. This scales the output of the Embedding before performing a weighted
    reduction as specified by ``mode``. If :attr:`per_sample_weights` is passed, the
    only supported ``mode`` is ``"sum"``, which computes a weighted sum according to
    :attr:`per_sample_weights`.

    Args:
        num_embeddings (int): size of the dictionary of embeddings
        embedding_dim (int): the size of each embedding vector
        max_norm (float, optional): If given, each embedding vector with norm larger than :attr:`max_norm`
            is renormalized to have norm :attr:`max_norm`.
        norm_type (float, optional): The p of the p-norm to compute for the :attr:`max_norm` option. Default ``2``.
        scale_grad_by_freq (bool, optional): if given, this will scale gradients by the inverse of frequency of
            the words in the mini-batch. Default ``False``.
            Note: this option is not supported when ``mode="max"``.
        mode (str, optional): ``"sum"``, ``"mean"`` or ``"max"``. Specifies the way to reduce the bag.
            ``"sum"`` computes the weighted sum, taking :attr:`per_sample_weights`
            into consideration. ``"mean"`` computes the average of the values
            in the bag, ``"max"`` computes the max value over each bag.
            Default: ``"mean"``
        sparse (bool, optional): if ``True``, gradient w.r.t. :attr:`weight` matrix will be a sparse tensor. See
            Notes for more details regarding sparse gradients. Note: this option is not
            supported when ``mode="max"``.
        include_last_offset (bool, optional): if ``True``, :attr:`offsets` has one additional element, where the last element
            is equivalent to the size of `indices`. This matches the CSR format.
        padding_idx (int, optional): If specified, the entries at :attr:`padding_idx` do not contribute to the
            gradient; therefore, the embedding vector at :attr:`padding_idx` is not updated
            during training, i.e. it remains as a fixed "pad". For a newly constructed
            EmbeddingBag, the embedding vector at :attr:`padding_idx` will default to all
            zeros, but can be updated to another value to be used as the padding vector.
            Note that the embedding vector at :attr:`padding_idx` is excluded from the
            reduction.

    Attributes:
        weight (Tensor): the learnable weights of the module of shape `(num_embeddings, embedding_dim)`
            initialized from :math:`\mathcal{N}(0, 1)`.

    Examples::

        >>> # an EmbeddingBag module containing 10 tensors of size 3
        >>> embedding_sum = nn.EmbeddingBag(10, 3, mode='sum')
        >>> # a batch of 2 samples of 4 indices each
        >>> input = torch.tensor([1, 2, 4, 5, 4, 3, 2, 9], dtype=torch.long)
        >>> offsets = torch.tensor([0, 4], dtype=torch.long)
        >>> # xdoctest: +IGNORE_WANT("non-deterministic")
        >>> embedding_sum(input, offsets)
        tensor([[-0.8861, -5.4350, -0.0523],
                [ 1.1306, -2.5798, -1.0044]])

        >>> # Example with padding_idx
        >>> embedding_sum = nn.EmbeddingBag(10, 3, mode='sum', padding_idx=2)
        >>> input = torch.tensor([2, 2, 2, 2, 4, 3, 2, 9], dtype=torch.long)
        >>> offsets = torch.tensor([0, 4], dtype=torch.long)
        >>> embedding_sum(input, offsets)
        tensor([[ 0.0000, 0.0000, 0.0000],
                [-0.7082, 3.2145, -2.6251]])

        >>> # An EmbeddingBag can be loaded from an Embedding like so
        >>> embedding = nn.Embedding(10, 3, padding_idx=2)
        >>> embedding_sum = nn.EmbeddingBag.from_pretrained(
        ...     embedding.weight,
        ...     padding_idx=embedding.padding_idx,
        ...     mode='sum')
    """

    __constants__ = [
        "num_embeddings",
        "embedding_dim",
        "max_norm",
        "norm_type",
        "scale_grad_by_freq",
        "mode",
        "sparse",
        "include_last_offset",
        "padding_idx",
    ]

    num_embeddings: int
    embedding_dim: int
    max_norm: Optional[float]
    norm_type: float
    scale_grad_by_freq: bool
    weight: Tensor
    mode: str
    sparse: bool
    include_last_offset: bool
    padding_idx: Optional[int]

    def __init__(
        self,
        num_embeddings: int,
        embedding_dim: int,
        max_norm: Optional[float] = None,
        norm_type: float = 2.0,
        scale_grad_by_freq: bool = False,
        mode: str = "mean",
        sparse: bool = False,
        _weight: Optional[Tensor] = None,
        include_last_offset: bool = False,
        padding_idx: Optional[int] = None,
        device=None,
        dtype=None,
    ) -> None:
        factory_kwargs = {"device": device, "dtype": dtype}
        super().__init__()
        self.num_embeddings = num_embeddings
        self.embedding_dim = embedding_dim
        self.max_norm = max_norm
        self.norm_type = norm_type
        self.scale_grad_by_freq = scale_grad_by_freq
        # Validate padding_idx and fold a negative index into [0, num_embeddings).
        if padding_idx is not None:
            if padding_idx > 0:
                assert (
                    padding_idx < self.num_embeddings
                ), "padding_idx must be within num_embeddings"
            elif padding_idx < 0:
                assert (
                    padding_idx >= -self.num_embeddings
                ), "padding_idx must be within num_embeddings"
                padding_idx = self.num_embeddings + padding_idx
        self.padding_idx = padding_idx
        if _weight is None:
            self.weight = Parameter(
                torch.empty((num_embeddings, embedding_dim), **factory_kwargs)
            )
            self.reset_parameters()
        else:
            # A caller-supplied weight is wrapped as-is (no re-initialization).
            assert list(_weight.shape) == [
                num_embeddings,
                embedding_dim,
            ], "Shape of weight does not match num_embeddings and embedding_dim"
            self.weight = Parameter(_weight)
        self.mode = mode
        self.sparse = sparse
        self.include_last_offset = include_last_offset

    def reset_parameters(self) -> None:
        """Re-draw the weight with ``init.normal_`` and zero the padding row, if any."""
        init.normal_(self.weight)
        self._fill_padding_idx_with_zero()

    def _fill_padding_idx_with_zero(self) -> None:
        """Zero the row at ``padding_idx`` in place, outside of autograd tracking."""
        if self.padding_idx is not None:
            with torch.no_grad():
                self.weight[self.padding_idx].fill_(0)

    def forward(
        self,
        input: Tensor,
        offsets: Optional[Tensor] = None,
        per_sample_weights: Optional[Tensor] = None,
    ) -> Tensor:
        """Forward pass of EmbeddingBag.

        Args:
            input (Tensor): Tensor containing bags of indices into the embedding matrix.
            offsets (Tensor, optional): Only used when :attr:`input` is 1D. :attr:`offsets` determines
                the starting index position of each bag (sequence) in :attr:`input`.
            per_sample_weights (Tensor, optional): a tensor of float / double weights, or None
                to indicate all weights should be taken to be ``1``. If specified, :attr:`per_sample_weights`
                must have exactly the same shape as input and is treated as having the same
                :attr:`offsets`, if those are not ``None``. Only supported for ``mode='sum'``.

        Returns:
            Tensor output shape of `(B, embedding_dim)`.

        .. note::

            A few notes about ``input`` and ``offsets``:

            - :attr:`input` and :attr:`offsets` have to be of the same type, either int or long

            - If :attr:`input` is 2D of shape `(B, N)`, it will be treated as ``B`` bags (sequences)
              each of fixed length ``N``, and this will return ``B`` values aggregated in a way
              depending on the :attr:`mode`. :attr:`offsets` is ignored and required to be ``None`` in this case.

            - If :attr:`input` is 1D of shape `(N)`, it will be treated as a concatenation of
              multiple bags (sequences). :attr:`offsets` is required to be a 1D tensor containing the
              starting index positions of each bag in :attr:`input`. Therefore, for :attr:`offsets` of shape `(B)`,
              :attr:`input` will be viewed as having ``B`` bags. Empty bags (i.e., having 0-length) will have
              returned vectors filled by zeros.
        """
        return F.embedding_bag(
            input,
            self.weight,
            offsets,
            self.max_norm,
            self.norm_type,
            self.scale_grad_by_freq,
            self.mode,
            self.sparse,
            per_sample_weights,
            self.include_last_offset,
            self.padding_idx,
        )

    def extra_repr(self) -> str:
        """Return the constructor-style summary shown inside the module repr.

        Only the fields that are actually displayed are passed through ``repr``.
        Formatting from ``repr`` of every ``__dict__`` entry would also render the
        parameter dictionary (and therefore the weight tensor) on each call.
        """
        parts = [f"{self.num_embeddings!r}, {self.embedding_dim!r}"]
        if self.max_norm is not None:
            parts.append(f"max_norm={self.max_norm!r}")
        if self.norm_type != 2:
            parts.append(f"norm_type={self.norm_type!r}")
        if self.scale_grad_by_freq is not False:
            parts.append(f"scale_grad_by_freq={self.scale_grad_by_freq!r}")
        # mode is always shown, quoted via repr (e.g. mode='mean').
        parts.append(f"mode={self.mode!r}")
        if self.padding_idx is not None:
            parts.append(f"padding_idx={self.padding_idx!r}")
        return ", ".join(parts)

    @classmethod
    def from_pretrained(
        cls,
        embeddings: Tensor,
        freeze: bool = True,
        max_norm: Optional[float] = None,
        norm_type: float = 2.0,
        scale_grad_by_freq: bool = False,
        mode: str = "mean",
        sparse: bool = False,
        include_last_offset: bool = False,
        padding_idx: Optional[int] = None,
    ) -> "EmbeddingBag":
        r"""Create EmbeddingBag instance from given 2-dimensional FloatTensor.

        Args:
            embeddings (Tensor): FloatTensor containing weights for the EmbeddingBag.
                First dimension is being passed to EmbeddingBag as 'num_embeddings', second as 'embedding_dim'.
            freeze (bool, optional): If ``True``, the tensor does not get updated in the learning process.
                Equivalent to ``embeddingbag.weight.requires_grad = False``. Default: ``True``
            max_norm (float, optional): See module initialization documentation. Default: ``None``
            norm_type (float, optional): See module initialization documentation. Default ``2``.
            scale_grad_by_freq (bool, optional): See module initialization documentation. Default ``False``.
            mode (str, optional): See module initialization documentation. Default: ``"mean"``
            sparse (bool, optional): See module initialization documentation. Default: ``False``.
            include_last_offset (bool, optional): See module initialization documentation. Default: ``False``.
            padding_idx (int, optional): See module initialization documentation. Default: ``None``.

        Examples::

            >>> # FloatTensor containing pretrained weights
            >>> weight = torch.FloatTensor([[1, 2.3, 3], [4, 5.1, 6.3]])
            >>> embeddingbag = nn.EmbeddingBag.from_pretrained(weight)
            >>> # Get embeddings for index 1
            >>> input = torch.LongTensor([[1, 0]])
            >>> # xdoctest: +IGNORE_WANT("non-deterministic")
            >>> embeddingbag(input)
            tensor([[ 2.5000, 3.7000, 4.6500]])
        """
        assert (
            embeddings.dim() == 2
        ), "Embeddings parameter is expected to be 2-dimensional"
        rows, cols = embeddings.shape
        embeddingbag = cls(
            num_embeddings=rows,
            embedding_dim=cols,
            _weight=embeddings,
            max_norm=max_norm,
            norm_type=norm_type,
            scale_grad_by_freq=scale_grad_by_freq,
            mode=mode,
            sparse=sparse,
            include_last_offset=include_last_offset,
            padding_idx=padding_idx,
        )
        # The constructor has no freeze switch, so the flag is applied afterwards.
        embeddingbag.weight.requires_grad = not freeze
        return embeddingbag

File diff suppressed because it is too large Load Diff

View File

@ -0,0 +1,293 @@
# mypy: allow-untyped-defs
from typing import Optional
import torch.nn.functional as F
from torch import Tensor
from torch.nn.common_types import _ratio_2_t, _ratio_any_t, _size_2_t, _size_any_t
from .module import Module
__all__ = ["Upsample", "UpsamplingNearest2d", "UpsamplingBilinear2d"]
class Upsample(Module):
    r"""Upsamples a given multi-channel 1D (temporal), 2D (spatial) or 3D (volumetric) data.

    The input data is assumed to be of the form
    `minibatch x channels x [optional depth] x [optional height] x width`.
    Hence, for spatial inputs, we expect a 4D Tensor and for volumetric inputs, we expect a 5D Tensor.

    The algorithms available for upsampling are nearest neighbor and linear,
    bilinear, bicubic and trilinear for 3D, 4D and 5D input Tensor,
    respectively.

    One can either give a :attr:`scale_factor` or the target output :attr:`size` to
    calculate the output size. (You cannot give both, as it is ambiguous)

    Args:
        size (int or Tuple[int] or Tuple[int, int] or Tuple[int, int, int], optional):
            output spatial sizes
        scale_factor (float or Tuple[float] or Tuple[float, float] or Tuple[float, float, float], optional):
            multiplier for spatial size. Has to match input size if it is a tuple.
        mode (str, optional): the upsampling algorithm: one of ``'nearest'``,
            ``'linear'``, ``'bilinear'``, ``'bicubic'`` and ``'trilinear'``.
            Default: ``'nearest'``
        align_corners (bool, optional): if ``True``, the corner pixels of the input
            and output tensors are aligned, and thus preserving the values at
            those pixels. This only has effect when :attr:`mode` is
            ``'linear'``, ``'bilinear'``, ``'bicubic'``, or ``'trilinear'``.
            Default: ``False``
        recompute_scale_factor (bool, optional): recompute the scale_factor for use in the
            interpolation calculation. If `recompute_scale_factor` is ``True``, then
            `scale_factor` must be passed in and `scale_factor` is used to compute the
            output `size`. The computed output `size` will be used to infer new scales for
            the interpolation. Note that when `scale_factor` is floating-point, it may differ
            from the recomputed `scale_factor` due to rounding and precision issues.
            If `recompute_scale_factor` is ``False``, then `size` or `scale_factor` will
            be used directly for interpolation.

    Shape:
        - Input: :math:`(N, C, W_{in})`, :math:`(N, C, H_{in}, W_{in})` or :math:`(N, C, D_{in}, H_{in}, W_{in})`
        - Output: :math:`(N, C, W_{out})`, :math:`(N, C, H_{out}, W_{out})`
          or :math:`(N, C, D_{out}, H_{out}, W_{out})`, where

    .. math::
        D_{out} = \left\lfloor D_{in} \times \text{scale\_factor} \right\rfloor

    .. math::
        H_{out} = \left\lfloor H_{in} \times \text{scale\_factor} \right\rfloor

    .. math::
        W_{out} = \left\lfloor W_{in} \times \text{scale\_factor} \right\rfloor

    .. warning::
        With ``align_corners = True``, the linearly interpolating modes
        (`linear`, `bilinear`, `bicubic`, and `trilinear`) don't proportionally
        align the output and input pixels, and thus the output values can depend
        on the input size. This was the default behavior for these modes up to
        version 0.3.1. Since then, the default behavior is
        ``align_corners = False``. See below for concrete examples on how this
        affects the outputs.

    .. note::
        If you want downsampling/general resizing, you should use :func:`~nn.functional.interpolate`.

    Examples::

        >>> input = torch.arange(1, 5, dtype=torch.float32).view(1, 1, 2, 2)
        >>> input
        tensor([[[[1., 2.],
                  [3., 4.]]]])

        >>> m = nn.Upsample(scale_factor=2, mode='nearest')
        >>> m(input)
        tensor([[[[1., 1., 2., 2.],
                  [1., 1., 2., 2.],
                  [3., 3., 4., 4.],
                  [3., 3., 4., 4.]]]])

        >>> # xdoctest: +IGNORE_WANT("other tests seem to modify printing styles")
        >>> m = nn.Upsample(scale_factor=2, mode='bilinear')  # align_corners=False
        >>> m(input)
        tensor([[[[1.0000, 1.2500, 1.7500, 2.0000],
                  [1.5000, 1.7500, 2.2500, 2.5000],
                  [2.5000, 2.7500, 3.2500, 3.5000],
                  [3.0000, 3.2500, 3.7500, 4.0000]]]])

        >>> m = nn.Upsample(scale_factor=2, mode='bilinear', align_corners=True)
        >>> m(input)
        tensor([[[[1.0000, 1.3333, 1.6667, 2.0000],
                  [1.6667, 2.0000, 2.3333, 2.6667],
                  [2.3333, 2.6667, 3.0000, 3.3333],
                  [3.0000, 3.3333, 3.6667, 4.0000]]]])

        >>> # Try scaling the same data in a larger tensor
        >>> input_3x3 = torch.zeros(3, 3).view(1, 1, 3, 3)
        >>> input_3x3[:, :, :2, :2].copy_(input)
        tensor([[[[1., 2.],
                  [3., 4.]]]])
        >>> input_3x3
        tensor([[[[1., 2., 0.],
                  [3., 4., 0.],
                  [0., 0., 0.]]]])

        >>> # xdoctest: +IGNORE_WANT("seems to fail when other tests are run in the same session")
        >>> m = nn.Upsample(scale_factor=2, mode='bilinear')  # align_corners=False
        >>> # Notice that values in top left corner are the same with the small input (except at boundary)
        >>> m(input_3x3)
        tensor([[[[1.0000, 1.2500, 1.7500, 1.5000, 0.5000, 0.0000],
                  [1.5000, 1.7500, 2.2500, 1.8750, 0.6250, 0.0000],
                  [2.5000, 2.7500, 3.2500, 2.6250, 0.8750, 0.0000],
                  [2.2500, 2.4375, 2.8125, 2.2500, 0.7500, 0.0000],
                  [0.7500, 0.8125, 0.9375, 0.7500, 0.2500, 0.0000],
                  [0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000]]]])

        >>> m = nn.Upsample(scale_factor=2, mode='bilinear', align_corners=True)
        >>> # Notice that values in top left corner are now changed
        >>> m(input_3x3)
        tensor([[[[1.0000, 1.4000, 1.8000, 1.6000, 0.8000, 0.0000],
                  [1.8000, 2.2000, 2.6000, 2.2400, 1.1200, 0.0000],
                  [2.6000, 3.0000, 3.4000, 2.8800, 1.4400, 0.0000],
                  [2.4000, 2.7200, 3.0400, 2.5600, 1.2800, 0.0000],
                  [1.2000, 1.3600, 1.5200, 1.2800, 0.6400, 0.0000],
                  [0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000]]]])
    """

    __constants__ = [
        "size",
        "scale_factor",
        "mode",
        "align_corners",
        "name",
        "recompute_scale_factor",
    ]
    name: str
    size: Optional[_size_any_t]
    scale_factor: Optional[_ratio_any_t]
    mode: str
    align_corners: Optional[bool]
    recompute_scale_factor: Optional[bool]

    def __init__(
        self,
        size: Optional[_size_any_t] = None,
        scale_factor: Optional[_ratio_any_t] = None,
        mode: str = "nearest",
        align_corners: Optional[bool] = None,
        recompute_scale_factor: Optional[bool] = None,
    ) -> None:
        super().__init__()
        self.name = type(self).__name__
        self.size = size

        # Store the scale factor as float(s); any falsy scalar (None, 0) becomes None.
        if isinstance(scale_factor, tuple):
            normalized = tuple(map(float, scale_factor))
        elif scale_factor:
            normalized = float(scale_factor)
        else:
            normalized = None
        self.scale_factor = normalized

        self.mode = mode
        self.align_corners = align_corners
        self.recompute_scale_factor = recompute_scale_factor

    def forward(self, input: Tensor) -> Tensor:
        """Resize ``input`` with ``F.interpolate`` using the stored configuration."""
        return F.interpolate(
            input,
            size=self.size,
            scale_factor=self.scale_factor,
            mode=self.mode,
            align_corners=self.align_corners,
            recompute_scale_factor=self.recompute_scale_factor,
        )

    def __setstate__(self, state):
        """Restore state, defaulting a missing ``recompute_scale_factor`` entry to ``True``."""
        state.setdefault("recompute_scale_factor", True)
        super().__setstate__(state)

    def extra_repr(self) -> str:
        """Return ``scale_factor=...`` (or ``size=...`` when no scale factor is set) plus the mode."""
        if self.scale_factor is None:
            target = f"size={self.size!r}"
        else:
            target = f"scale_factor={self.scale_factor!r}"
        return f"{target}, mode={self.mode!r}"
class UpsamplingNearest2d(Upsample):
    r"""Applies a 2D nearest neighbor upsampling to an input signal composed of several input channels.

    To specify the scale, it takes either the :attr:`size` or the :attr:`scale_factor`
    as its constructor argument.

    When :attr:`size` is given, it is the output size of the image `(h, w)`.

    Args:
        size (int or Tuple[int, int], optional): output spatial sizes
        scale_factor (float or Tuple[float, float], optional): multiplier for
            spatial size.

    .. warning::
        This class is deprecated in favor of :func:`~nn.functional.interpolate`.

    Shape:
        - Input: :math:`(N, C, H_{in}, W_{in})`
        - Output: :math:`(N, C, H_{out}, W_{out})` where

    .. math::
        H_{out} = \left\lfloor H_{in} \times \text{scale\_factor} \right\rfloor

    .. math::
        W_{out} = \left\lfloor W_{in} \times \text{scale\_factor} \right\rfloor

    Examples::

        >>> input = torch.arange(1, 5, dtype=torch.float32).view(1, 1, 2, 2)
        >>> input
        tensor([[[[1., 2.],
                  [3., 4.]]]])

        >>> m = nn.UpsamplingNearest2d(scale_factor=2)
        >>> m(input)
        tensor([[[[1., 1., 2., 2.],
                  [1., 1., 2., 2.],
                  [3., 3., 4., 4.],
                  [3., 3., 4., 4.]]]])
    """

    def __init__(
        self,
        size: Optional[_size_2_t] = None,
        scale_factor: Optional[_ratio_2_t] = None,
    ) -> None:
        # Pin the parent to mode="nearest"; every other option keeps the parent's default.
        super().__init__(size=size, scale_factor=scale_factor, mode="nearest")
class UpsamplingBilinear2d(Upsample):
    r"""Applies a 2D bilinear upsampling to an input signal composed of several input channels.

    To specify the scale, it takes either the :attr:`size` or the :attr:`scale_factor`
    as its constructor argument.

    When :attr:`size` is given, it is the output size of the image `(h, w)`.

    Args:
        size (int or Tuple[int, int], optional): output spatial sizes
        scale_factor (float or Tuple[float, float], optional): multiplier for
            spatial size.

    .. warning::
        This class is deprecated in favor of :func:`~nn.functional.interpolate`. It is
        equivalent to ``nn.functional.interpolate(..., mode='bilinear', align_corners=True)``.

    Shape:
        - Input: :math:`(N, C, H_{in}, W_{in})`
        - Output: :math:`(N, C, H_{out}, W_{out})` where

    .. math::
        H_{out} = \left\lfloor H_{in} \times \text{scale\_factor} \right\rfloor

    .. math::
        W_{out} = \left\lfloor W_{in} \times \text{scale\_factor} \right\rfloor

    Examples::

        >>> input = torch.arange(1, 5, dtype=torch.float32).view(1, 1, 2, 2)
        >>> input
        tensor([[[[1., 2.],
                  [3., 4.]]]])

        >>> # xdoctest: +IGNORE_WANT("do other tests modify the global state?")
        >>> m = nn.UpsamplingBilinear2d(scale_factor=2)
        >>> m(input)
        tensor([[[[1.0000, 1.3333, 1.6667, 2.0000],
                  [1.6667, 2.0000, 2.3333, 2.6667],
                  [2.3333, 2.6667, 3.0000, 3.3333],
                  [3.0000, 3.3333, 3.6667, 4.0000]]]])
    """

    def __init__(
        self,
        size: Optional[_size_2_t] = None,
        scale_factor: Optional[_ratio_2_t] = None,
    ) -> None:
        # Pin the parent to bilinear interpolation with aligned corners.
        super().__init__(
            size=size,
            scale_factor=scale_factor,
            mode="bilinear",
            align_corners=True,
        )

View File

@ -0,0 +1,81 @@
# mypy: allow-untyped-defs
import collections
from itertools import repeat
from typing import Any, Dict, List
__all__ = ["consume_prefix_in_state_dict_if_present"]
def _ntuple(n, name="parse"):
    """Build a converter that turns its argument into a tuple.

    The returned function passes any iterable through ``tuple()`` unchanged
    (no length check is performed) and repeats a non-iterable value ``n``
    times. Its ``__name__`` is set to ``name``.
    """

    def parse(x):
        already_iterable = isinstance(x, collections.abc.Iterable)
        return tuple(x) if already_iterable else tuple(repeat(x, n))

    parse.__name__ = name
    return parse
# Converters built by `_ntuple`: a non-iterable argument is repeated 1, 2, 3
# or 4 times respectively, while an iterable argument is returned as a tuple
# of its own elements.
_single = _ntuple(1, "_single")
_pair = _ntuple(2, "_pair")
_triple = _ntuple(3, "_triple")
_quadruple = _ntuple(4, "_quadruple")
def _reverse_repeat_tuple(t, n):
    r"""Reverse the order of `t` and repeat each element for `n` times.

    This can be used to translate padding arg used by Conv and Pooling modules
    to the ones used by `F.pad`.
    """
    expanded = []
    for element in reversed(t):
        for _ in range(n):
            expanded.append(element)
    return tuple(expanded)
def _list_with_default(out_size: List[int], defaults: List[int]) -> List[int]:
    """Fill the ``None`` entries of ``out_size`` from the trailing entries of ``defaults``.

    An ``int`` or ``torch.SymInt`` ``out_size`` is returned unchanged.
    Otherwise ``defaults`` must be strictly longer than ``out_size``, and each
    ``None`` in ``out_size`` is replaced by the value at the same position in
    the last ``len(out_size)`` entries of ``defaults``.

    Raises:
        ValueError: if ``defaults`` is not longer than ``out_size``.
    """
    import torch

    if isinstance(out_size, (int, torch.SymInt)):
        return out_size

    requested = len(out_size)
    if requested >= len(defaults):
        raise ValueError(f"Input dimension should be at least {requested + 1}")

    trailing_defaults = defaults[-requested:]
    resolved = []
    for wanted, fallback in zip(out_size, trailing_defaults):
        resolved.append(fallback if wanted is None else wanted)
    return resolved
def consume_prefix_in_state_dict_if_present(
    state_dict: Dict[str, Any],
    prefix: str,
) -> None:
    r"""Strip the prefix in state_dict in place, if any.

    .. note::
        Given a `state_dict` from a DP/DDP model, a local model can load it by applying
        `consume_prefix_in_state_dict_if_present(state_dict, "module.")` before calling
        :meth:`torch.nn.Module.load_state_dict`.

    Args:
        state_dict (OrderedDict): a state-dict to be loaded to the model.
        prefix (str): prefix.
    """
    # Snapshot the keys first: the mapping is mutated while renaming.
    for key in list(state_dict.keys()):
        if key.startswith(prefix):
            state_dict[key[len(prefix) :]] = state_dict.pop(key)

    # also strip the prefix in metadata if any.
    if hasattr(state_dict, "_metadata"):
        # The wrapped module's own metadata entry is keyed by the prefix without
        # its trailing dot (e.g. "module" for "module."). Only that final dot is
        # dropped, so nested prefixes such as "wrap.module." resolve to
        # "wrap.module" rather than having every dot removed.
        bare_prefix = prefix[:-1] if prefix.endswith(".") else prefix
        for key in list(state_dict._metadata.keys()):
            # for the metadata dict, the key can be:
            # '': for the DDP module, which we want to remove.
            # 'module': for the actual model.
            # 'module.xx.xx': for the rest.
            if len(key) == 0:
                continue
            # handling both, 'module' case and 'module.' cases
            if key == bare_prefix or key.startswith(prefix):
                # For the bare-prefix key this slice is empty, so the entry
                # replaces the wrapper's '' entry.
                newkey = key[len(prefix) :]
                state_dict._metadata[newkey] = state_dict._metadata.pop(key)