4085 lines
167 KiB
C++
4085 lines
167 KiB
C++
/*
 * SPDX-License-Identifier: Apache-2.0
 */
|
|
|
|
#include <cmath>
|
|
|
|
#include "onnx/defs/function.h"
|
|
#include "onnx/defs/schema.h"
|
|
|
|
namespace ONNX_NAMESPACE {
|
|
|
|
// For GlobalPool operations.
|
|
void globalPoolTypeShapeInference_opset2(InferenceContext& ctx) {
|
|
propagateElemTypeFromInputToOutput(ctx, 0, 0);
|
|
|
|
// needs at least one input with shape.
|
|
if (!hasNInputShapes(ctx, 1)) {
|
|
return;
|
|
}
|
|
|
|
auto input_shape = ctx.getInputType(0)->tensor_type().shape();
|
|
if (input_shape.dim_size() < 2) {
|
|
return;
|
|
}
|
|
|
|
// first dim is the batch axis and the next is the number of channels.
|
|
size_t n_input_dims = static_cast<size_t>(input_shape.dim_size() - 2);
|
|
|
|
// (N, C, 1, 1, ..., 1)
|
|
auto output_shape = ctx.getOutputType(0)->mutable_tensor_type()->mutable_shape();
|
|
*output_shape->add_dim() = input_shape.dim(0);
|
|
*output_shape->add_dim() = input_shape.dim(1);
|
|
|
|
for (size_t i = 0; i < n_input_dims; ++i) {
|
|
output_shape->add_dim()->set_dim_value(1);
|
|
}
|
|
}
|
|
|
|
// Builds the OpSchema for GlobalLpPool (opset 2). `op_type` and `op` are
// substituted into the {op_type}/{op} placeholders of the doc template
// (e.g. "LpPool" / "lp pool").
std::function<void(OpSchema&)> GlobalLpPoolingOpSchemaGenerator_opset2(const char* op_type, const char* op) {
  return [=](OpSchema& schema) {
    std::string doc;
    // POPULATE_OP_DOC_STR evaluates the whole statement list only in builds
    // where op doc strings are enabled.
    POPULATE_OP_DOC_STR(doc = R"DOC(
Global{op_type} consumes an input tensor X and applies {op} pooling across
the values in the same channel. This is equivalent to {op_type} with kernel size
equal to the spatial dimension of input tensor.)DOC";
                        ReplaceAll(doc, "{op_type}", op_type);
                        ReplaceAll(doc, "{op}", op););
    schema.SetDoc(doc);
    // Order of the Lp norm; defaults to 2 (Euclidean / L2 pooling).
    schema.Attr(
        "p", "p value of the Lp norm used to pool over the input data.", AttributeProto::INT, static_cast<int64_t>(2));
    schema.Input(
        0,
        "X",
        "Input data tensor from the previous operator; "
        "dimensions for image case are (N x C x H x W), "
        "where N is the batch size, C is the number of "
        "channels, and H and W are the height and the width "
        "of the data. For non image case, the dimensions are "
        "in the form of (N x C x D1 x D2 ... Dn), "
        "where N is the batch size.",
        "T",
        OpSchema::Single,
        true,
        1,
        OpSchema::Differentiable);
    schema.Output(
        0,
        "Y",
        "Output data tensor from pooling across the input "
        "tensor. The output tensor has the same rank as the input. "
        "The first two dimensions of output shape are the same as "
        "the input (N x C), while the other dimensions are all 1.",
        "T",
        OpSchema::Single,
        true,
        1,
        OpSchema::Differentiable);
    schema.TypeConstraint("T", OpSchema::all_float_types_ir4(), "Constrain input and output types to float tensors.");
    schema.TypeAndShapeInferenceFunction([](InferenceContext& ctx) { globalPoolTypeShapeInference_opset2(ctx); });
  };
}
|
|
|
|
// Registers GlobalLpPool at opset version 2.
ONNX_OPERATOR_SET_SCHEMA(
    GlobalLpPool,
    2,
    OpSchema().FillUsing(GlobalLpPoolingOpSchemaGenerator_opset2("LpPool", "lp pool")));
|
|
|
|
// Shared attribute documentation for the explicit `pads` attribute
// (opset 11 pooling/conv ops); mutually exclusive with `auto_pad`.
const char* pads_doc_opset11 =
    "Padding for the beginning and ending along each spatial axis, it can take any value greater "
    "than or equal to 0. The value represent the number of pixels added to the beginning "
    "and end part of the corresponding axis. `pads` format should be as follow "
    "[x1_begin, x2_begin...x1_end, x2_end,...], where xi_begin the number of pixels "
    "added at the beginning of axis `i` and xi_end, the number of pixels added at "
    "the end of axis `i`. This attribute cannot be used simultaneously with "
    "auto_pad attribute. If not present, the padding defaults to 0 along start and end of each spatial axis.";
|
|
// Shared `auto_pad` attribute documentation for Conv-style ops (opset 19):
// note the ceil-based output-size formula (forward convolution).
const char* conv_auto_pad_doc_opset19 =
    "auto_pad must be either NOTSET, SAME_UPPER, SAME_LOWER or VALID. Where "
    "default value is NOTSET, which means explicit padding is used. "
    "SAME_UPPER or SAME_LOWER mean pad the input so that "
    "`output_shape[i] = ceil(input_shape[i] / strides[i])` for each axis `i`. "
    "The padding is split between the two sides equally or almost equally (depending "
    "on whether it is even or odd). In case the padding is an odd number, the extra "
    "padding is added at the end for SAME_UPPER and at the beginning for SAME_LOWER.";
|
|
// Shared `auto_pad` attribute documentation for ConvTranspose (opset 19):
// differs from the conv variant in the output-size formula (multiplicative).
const char* conv_transpose_auto_pad_doc_opset19 =
    "auto_pad must be either NOTSET, SAME_UPPER, SAME_LOWER or VALID. Where "
    "default value is NOTSET, which means explicit padding is used. "
    "SAME_UPPER or SAME_LOWER mean pad the input so that "
    "`output_shape[i] = input_shape[i] * strides[i]` for each axis `i`. "
    "The padding is split between the two sides equally or almost equally (depending "
    "on whether it is even or odd). In case the padding is an odd number, the extra "
    "padding is added at the end for SAME_UPPER and at the beginning for SAME_LOWER.";
|
|
|
|
// Shared shape inference for Conv- and Pool-style operators (opset 19).
// Params:
//   use_dilation         - whether this op supports a "dilations" attribute
//                          (only MaxPool and Conv do).
//   require_kernel_shape - pooling ops must carry kernel_shape as an
//                          attribute; conv ops may instead derive it from the
//                          spatial dims of the weight input (input2Idx).
//   input1Idx            - index of the data input X.
//   input2Idx            - index of the weight input W (conv ops only).
// Infers output 0 (and mirrors it onto output 1 for MaxPool's indices
// output). Fails inference on malformed attribute sizes.
void convPoolShapeInference_opset19(
    InferenceContext& ctx,
    bool use_dilation,
    bool require_kernel_shape,
    int input1Idx,
    int input2Idx) {
  // we need the first input shape for this inference.
  if (!hasInputShape(ctx, input1Idx)) {
    return;
  }

  // if kernel shape is an input (and not attribute)
  // we need the shape of the second input.
  if (!require_kernel_shape && !hasInputShape(ctx, input2Idx)) {
    return;
  }

  auto input_shape = ctx.getInputType(input1Idx)->tensor_type().shape();
  if (input_shape.dim_size() < 2) {
    fail_shape_inference("Input tensor must have at least 2 dimensions");
  }

  // first dim is the batch axis and the next is the number of channels.
  size_t n_input_dims = static_cast<size_t>(input_shape.dim_size() - 2);

  // Only MaxPool and Conv support dilation. For
  // simplicity of the code, we just treat the rest of them as having all-1s
  // dilation.
  std::vector<int64_t> dilations;
  if (use_dilation && getRepeatedAttribute(ctx, "dilations", dilations)) {
    if (dilations.size() != n_input_dims) {
      fail_shape_inference("Attribute dilations has incorrect size");
    }
  } else {
    dilations.assign(n_input_dims, 1);
  }

  std::vector<int64_t> strides;
  if (getRepeatedAttribute(ctx, "strides", strides)) {
    if (strides.size() != n_input_dims) {
      fail_shape_inference("Attribute strides has incorrect size");
    }
  } else {
    strides.assign(n_input_dims, 1);
  }

  std::vector<int64_t> kernel_shape;
  if (getRepeatedAttribute(ctx, "kernel_shape", kernel_shape)) {
    if (kernel_shape.size() != n_input_dims) {
      fail_shape_inference("Attribute kernel_shape has incorrect size");
    }
  } else if (require_kernel_shape) {
    fail_shape_inference("Attribute kernel_shape must be specified");
  } else {
    // Conv case: kernel shape comes from the weight tensor's spatial dims.
    auto second_input_shape = ctx.getInputType(input2Idx)->tensor_type().shape();
    for (int i = 2; i < second_input_shape.dim_size(); ++i) {
      if (!second_input_shape.dim(i).has_dim_value()) {
        return;
      }
      kernel_shape.push_back(second_input_shape.dim(i).dim_value());
    }
  }

  std::vector<int64_t> effective_kernel_shape = kernel_shape;
  for (int i = 0; i < static_cast<int>(kernel_shape.size()); i++) {
    // accounting for dilation, how big is the kernel in this dimension
    effective_kernel_shape[i] = (effective_kernel_shape[i] - 1) * dilations[i] + 1;
  }

  std::vector<int64_t> pads;
  if (getRepeatedAttribute(ctx, "pads", pads)) {
    if (pads.size() != n_input_dims * 2) {
      fail_shape_inference("Attribute pads has incorrect size");
    }
  } else {
    // No explicit pads: derive them from auto_pad (VALID/absent means zero).
    pads.assign(n_input_dims * 2, 0);
    const auto* auto_pad_attr = ctx.getAttribute("auto_pad");
    if ((nullptr != auto_pad_attr) && (auto_pad_attr->s() != "VALID")) {
      int input_dims_size = static_cast<int>(n_input_dims);
      for (int i = 0; i < input_dims_size; ++i) {
        // residual = input_size mod stride, computed without division so it
        // stays exact for int64; only needed when stride > 1.
        int64_t residual = 0;
        int64_t stride = strides[i];
        if (stride > 1) {
          if (!input_shape.dim(2 + i).has_dim_value()) {
            continue;
          }
          residual = input_shape.dim(2 + i).dim_value();
          while (residual >= stride) {
            residual -= stride;
          }
        }
        if (i >= static_cast<int>(effective_kernel_shape.size())) {
          fail_shape_inference("kernel shape should have ", input_dims_size, " values in ", ctx.getDisplayName(), ".");
        }
        // SAME padding: total pad makes output = ceil(input / stride).
        int64_t total_pad = residual == 0 ? effective_kernel_shape[i] - stride : effective_kernel_shape[i] - residual;
        if (total_pad < 0)
          total_pad = 0;
        int64_t half_pad_small = total_pad >> 1;
        int64_t half_pad_big = total_pad - half_pad_small;
        // SAME_UPPER puts the extra pixel at the end; SAME_LOWER at the start.
        if (auto_pad_attr->s() == "SAME_UPPER") {
          pads[i] = half_pad_small;
          pads[i + input_dims_size] = half_pad_big;
        } else if (auto_pad_attr->s() == "SAME_LOWER") {
          pads[i] = half_pad_big;
          pads[i + input_dims_size] = half_pad_small;
        }
      }
    }
  }

  auto output_shape = ctx.getOutputType(0)->mutable_tensor_type()->mutable_shape();

  if (require_kernel_shape) {
    // add the first two dimensions from the input.
    *output_shape->add_dim() = input_shape.dim(0);
    *output_shape->add_dim() = input_shape.dim(1);
  } else {
    // Conv: output channels come from dim 0 of the weight tensor.
    *output_shape->add_dim() = input_shape.dim(0);
    auto& second_input_shape = getInputShape(ctx, input2Idx);
    if (second_input_shape.dim_size() < 1) {
      fail_shape_inference("Second input tensor has wrong dimension");
    }
    *output_shape->add_dim() = second_input_shape.dim(0);
  }

  int kernel_shape_size = static_cast<int>(kernel_shape.size());
  for (int i = 0; i < kernel_shape_size; ++i) {
    auto newdim = output_shape->add_dim();
    if (!input_shape.dim(2 + i).has_dim_value()) {
      continue;
    }
    // how big is the input, including padding
    int64_t effective_input_size = input_shape.dim(2 + i).dim_value();
    effective_input_size += pads[i];
    effective_input_size += pads[i + kernel_shape_size];

    // default is floor mode .i.e. ceil_mode is set to 0
    auto ceil_mode = getAttribute(ctx, "ceil_mode", 0);

    // how many times we can move the kernel from it's initial position, based
    // on the stride
    int64_t strided_kernel_positions;

    if (ceil_mode == 1)
      strided_kernel_positions =
          (int64_t)(std::ceil((effective_input_size - effective_kernel_shape[i]) / float(strides[i])));
    else
      strided_kernel_positions = (effective_input_size - effective_kernel_shape[i]) / strides[i];

    // add in the initial position
    newdim->set_dim_value(1 + strided_kernel_positions);
  }

  if (ctx.getNumOutputs() > 1) {
    // MaxPool with two outputs case.
    auto second_output_shape = ctx.getOutputType(1)->mutable_tensor_type()->mutable_shape();
    second_output_shape->CopyFrom(*output_shape);
  }
}
|
|
|
|
static const char* Dropout_ver13_doc = R"DOC(
|
|
Dropout takes an input floating-point tensor, an optional input ratio (floating-point scalar) and an optional input training_mode (boolean scalar). It produces two tensor outputs,
|
|
output (floating-point tensor) and mask (optional `Tensor<bool>`). If `training_mode` is true then the output Y will be a random dropout;
|
|
Note that this Dropout scales the masked input data by the following equation, so to convert the trained model into inference mode,
|
|
the user can simply not pass `training_mode` input or set it to false.
|
|
```
|
|
output = scale * data * mask,
|
|
```
|
|
where
|
|
```
|
|
scale = 1. / (1. - ratio).
|
|
```
|
|
)DOC";
|
|
|
|
// Registers Dropout at opset version 13. In inference mode (training_mode
// absent/false) it is an identity op; the optional second output is the
// boolean dropout mask.
ONNX_OPERATOR_SET_SCHEMA(
    Dropout,
    13,
    OpSchema()
        .SetDoc(GET_OP_DOC_STR(std::string(Dropout_ver13_doc) + GenerateOptionalArgumentsDoc()))
        .Attr(
            "seed",
            "(Optional) Seed to the random generator, if not specified we will auto generate one.",
            AttributeProto::INT,
            OPTIONAL_VALUE)
        .Input(0, "data", "The input data as Tensor.", "T", OpSchema::Single, true, 1, OpSchema::Differentiable)
        .Input(
            1,
            "ratio",
            "The ratio of random dropout, with value in [0, 1). If this input was not set, "
            "or if it was set to 0, the output would be a simple copy of the input. "
            "If it's non-zero, output will be a random dropout of the scaled input, which is typically "
            "the case during training. It is an optional value, if not specified it will default to 0.5.",
            "T1",
            OpSchema::Optional,
            true,
            1,
            OpSchema::NonDifferentiable)
        .Input(
            2,
            "training_mode",
            "If set to true then it indicates dropout is being used for training. It is an optional value hence unless "
            "specified explicitly, it is false. If it is false, ratio is ignored and the operation mimics inference mode where "
            "nothing will be dropped from the input data and if mask is requested as output it will contain all ones.",
            "T2",
            OpSchema::Optional,
            true,
            1,
            OpSchema::NonDifferentiable)
        .Output(0, "output", "The output.", "T", OpSchema::Single, true, 1, OpSchema::Differentiable)
        .Output(1, "mask", "The output mask.", "T2", OpSchema::Optional, true, 1, OpSchema::NonDifferentiable)
        .TypeConstraint(
            "T",
            {"tensor(float16)", "tensor(float)", "tensor(double)", "tensor(bfloat16)"},
            "Constrain input and output types to float tensors.")
        .TypeConstraint(
            "T1",
            {"tensor(float16)", "tensor(float)", "tensor(double)"},
            "Constrain input 'ratio' types to float tensors.")
        .TypeConstraint("T2", {"tensor(bool)"}, "Constrain output 'mask' types to boolean tensors.")
        .TypeAndShapeInferenceFunction([](InferenceContext& ctx) {
          // output has the same type and shape as data.
          propagateElemTypeFromInputToOutput(ctx, 0, 0);
          if (hasInputShape(ctx, 0)) {
            propagateShapeFromInputToOutput(ctx, 0, 0);
          }

          // ratio (input 1), when present, must be a scalar.
          if (ctx.getNumInputs() > 1 && hasInputShape(ctx, 1)) {
            auto& ratio_input_shape = getInputShape(ctx, 1);
            if (static_cast<int>(ratio_input_shape.dim_size()) != 0) {
              fail_shape_inference("Ratio of Dropout must be a scalar.");
            }
          }

          // training_mode (input 2), when present, must be a scalar.
          if (ctx.getNumInputs() > 2 && hasInputShape(ctx, 2)) {
            auto& training_mode_input_shape = getInputShape(ctx, 2);
            if (static_cast<int>(training_mode_input_shape.dim_size()) != 0) {
              fail_shape_inference("training_mode of Dropout must be a scalar.");
            }
          }

          // mask output is boolean with the same shape as data.
          if (ctx.getNumOutputs() == 2) {
            updateOutputElemType(ctx, 1, TensorProto::BOOL);
            if (hasNInputShapes(ctx, 1)) {
              propagateShapeFromInputToOutput(ctx, 0, 1);
            }
          }
        }));
|
|
|
|
static const char* LpNormalization_ver1_doc = R"DOC(
|
|
Given a matrix, apply Lp-normalization along the provided axis.
|
|
)DOC";
|
|
|
|
// Registers LpNormalization at opset version 1. Output type/shape always
// matches the input.
ONNX_OPERATOR_SET_SCHEMA(
    LpNormalization,
    1,
    OpSchema()
        .Input(0, "input", "Input matrix", "T", OpSchema::Single, true, 1, OpSchema::Differentiable)
        .Output(0, "output", "Matrix after normalization", "T", OpSchema::Single, true, 1, OpSchema::Differentiable)
        .TypeConstraint(
            "T",
            {"tensor(float16)", "tensor(float)", "tensor(double)"},
            "Constrain input and output types to float tensors.")
        .SetDoc(LpNormalization_ver1_doc)
        .Attr(
            "axis",
            "The axis on which to apply normalization, -1 mean last axis.",
            AttributeProto::INT,
            static_cast<int64_t>(-1))
        .Attr(
            "p",
            "The order of the normalization, only 1 or 2 are supported.",
            AttributeProto::INT,
            static_cast<int64_t>(2))
        .TypeAndShapeInferenceFunction([](InferenceContext& ctx) { propagateShapeAndTypeFromFirstInput(ctx); }));
|
|
|
|
static const char* InstanceNormalization_ver6_doc = R"DOC(
|
|
Carries out instance normalization as described in the paper
|
|
https://arxiv.org/abs/1607.08022.
|
|
|
|
y = scale * (x - mean) / sqrt(variance + epsilon) + B,
|
|
where mean and variance are computed per instance per channel.
|
|
|
|
)DOC";
|
|
|
|
// Registers InstanceNormalization at opset version 6. Output shape/type is
// identical to the data input; scale and B are per-channel vectors.
ONNX_OPERATOR_SET_SCHEMA(
    InstanceNormalization,
    6,
    OpSchema()
        .SetDoc(InstanceNormalization_ver6_doc)
        .Attr("epsilon", "The epsilon value to use to avoid division by zero.", AttributeProto::FLOAT, 1e-5f)
        .Input(
            0,
            "input",
            "Input data tensor from the previous operator; "
            "dimensions for image case are (N x C x H x W), "
            "where N is the batch size, C is the number of "
            "channels, and H and W are the height and the "
            "width of the data. For non image case, the "
            "dimensions are in the form of "
            "(N x C x D1 x D2 ... Dn), where N is the batch "
            "size.",
            "T",
            OpSchema::Single,
            true,
            1,
            OpSchema::Differentiable)
        .Input(
            1,
            "scale",
            "The input 1-dimensional scale tensor of size C.",
            "T",
            OpSchema::Single,
            true,
            1,
            OpSchema::Differentiable)
        .Input(
            2,
            "B",
            "The input 1-dimensional bias tensor of size C.",
            "T",
            OpSchema::Single,
            true,
            1,
            OpSchema::Differentiable)
        .Output(
            0,
            "output",
            "The output tensor of the same shape as input.",
            "T",
            OpSchema::Single,
            true,
            1,
            OpSchema::Differentiable)
        .TypeConstraint(
            "T",
            {"tensor(float16)", "tensor(float)", "tensor(double)"},
            "Constrain input and output types to float tensors.")
        .TypeAndShapeInferenceFunction([](InferenceContext& ctx) { propagateShapeAndTypeFromFirstInput(ctx); }));
|
|
|
|
// Shape inference for MaxUnpool (opset 11). Inputs: X (pooled values),
// I (indices), and optionally output_shape. When output_shape is supplied
// the result shape is only known at runtime; otherwise it is reconstructed
// by inverting the pooling output-size formula.
void maxUnpoolShapeInference_opset11(InferenceContext& ctx) {
  // we need at least two inputs to have a shape for this inference.
  if (ctx.getNumInputs() != 2 && ctx.getNumInputs() != 3) {
    fail_type_inference("MaxUnpool op must have either two or three inputs.");
  }
  propagateElemTypeFromInputToOutput(ctx, 0, 0);
  if (!hasInputShape(ctx, 0)) {
    return; // If first input does not have shape, we cannot infer much.
  }
  auto input_shape = ctx.getInputType(0)->tensor_type().shape();
  if (input_shape.dim_size() < 2) {
    fail_shape_inference("Input tensor X must have at least 2 dimensions.");
  }

  // first dim is the batch axis and the next is the number of channels.
  size_t n_input_dims = static_cast<size_t>(input_shape.dim_size() - 2);

  std::vector<int64_t> pads;
  if (getRepeatedAttribute(ctx, "pads", pads)) {
    if (pads.size() != n_input_dims * 2) {
      fail_shape_inference("Attribute pads has incorrect size.");
    }
  } else {
    pads.assign(n_input_dims * 2, 0);
  }

  std::vector<int64_t> strides;
  if (getRepeatedAttribute(ctx, "strides", strides)) {
    if (strides.size() != n_input_dims) {
      fail_shape_inference("Attribute strides has incorrect size.");
    }
  } else {
    strides.assign(n_input_dims, 1);
  }

  // kernel_shape is mandatory for MaxUnpool (cannot be derived from inputs).
  std::vector<int64_t> kernel_shape;
  if (getRepeatedAttribute(ctx, "kernel_shape", kernel_shape)) {
    if (kernel_shape.size() != n_input_dims) {
      fail_shape_inference("Attribute kernel_shape has incorrect size.");
    }
  } else {
    fail_shape_inference("Attribute kernel_shape must be specified.");
  }

  if (ctx.getNumInputs() == 3) {
    // If the third input, output_size, is specified, then use that instead
    // of inferring shape from inputs.
    if (hasInputShape(ctx, 2)) {
      auto& output_shape = getInputShape(ctx, 2);
      if (output_shape.dim_size() != 1) {
        fail_type_inference("'output_shape' must be rank 1 tensor.");
      }
      if (output_shape.dim(static_cast<int>(0)).has_dim_value() &&
          static_cast<int>(output_shape.dim(static_cast<int>(0)).dim_value()) != input_shape.dim_size()) {
        fail_shape_inference("'output_shape' must have same number of elements as the shape of input tensor X.");
      }
    }
    return; // 'output_shape' is specified as input. Actual shape will be
            // determined at runtime.
  }

  auto final_output_shape = ctx.getOutputType(0)->mutable_tensor_type()->mutable_shape();

  *final_output_shape->add_dim() = input_shape.dim(0);
  *final_output_shape->add_dim() =
      ctx.getInputType(1)->tensor_type().shape().dim(1); // channels should be the second dim of second input.

  int kernel_shape_size = static_cast<int>(kernel_shape.size());
  for (int i = 0; i < kernel_shape_size; ++i) {
    auto newdim = final_output_shape->add_dim();
    if (!input_shape.dim(2 + i).has_dim_value()) {
      continue;
    }

    // Inverse of the pooling size formula:
    // out = stride * (in - 1) + kernel - pad_begin - pad_end
    int64_t newdim_value = strides[i] * (input_shape.dim(2 + i).dim_value() - 1);
    newdim_value += kernel_shape[i];
    newdim_value -= pads[i];
    newdim_value -= pads[i + kernel_shape_size];

    // add in the initial position
    newdim->set_dim_value(newdim_value);
  }
}
|
|
|
|
// For GlobalPool operations (opset 1 helper): infers the (N, C, 1, ..., 1)
// shape produced by pooling over all spatial axes. NOTE(review): this is a
// byte-for-byte twin of globalPoolTypeShapeInference_opset2 above; kept
// separate so each opset's schema is self-contained.
void globalPoolTypeShapeInference_opset1(InferenceContext& ctx) {
  // Output element type mirrors the data input.
  propagateElemTypeFromInputToOutput(ctx, 0, 0);

  // needs at least one input with shape.
  if (!hasNInputShapes(ctx, 1)) {
    return;
  }

  auto input_shape = ctx.getInputType(0)->tensor_type().shape();
  if (input_shape.dim_size() < 2) {
    return;
  }

  // first dim is the batch axis and the next is the number of channels.
  size_t n_input_dims = static_cast<size_t>(input_shape.dim_size() - 2);

  // (N, C, 1, 1, ..., 1)
  auto output_shape = ctx.getOutputType(0)->mutable_tensor_type()->mutable_shape();
  *output_shape->add_dim() = input_shape.dim(0);
  *output_shape->add_dim() = input_shape.dim(1);

  // Every spatial axis collapses to size 1.
  for (size_t i = 0; i < n_input_dims; ++i) {
    output_shape->add_dim()->set_dim_value(1);
  }
}
|
|
|
|
// Builds the OpSchema shared by GlobalAveragePool / GlobalMaxPool (opset 1).
// `op_type` and `op` fill the {op_type}/{op} placeholders in the doc
// template (e.g. "AveragePool" / "average").
std::function<void(OpSchema&)> GlobalPoolingOpSchemaGenerator_opset1(const char* op_type, const char* op) {
  return [=](OpSchema& schema) {
    std::string doc;
    // POPULATE_OP_DOC_STR evaluates the whole statement list only in builds
    // where op doc strings are enabled.
    POPULATE_OP_DOC_STR(doc = R"DOC(
Global{op_type} consumes an input tensor X and applies {op} pooling across
the values in the same channel. This is equivalent to {op_type} with kernel size
equal to the spatial dimension of input tensor.)DOC";
                        ReplaceAll(doc, "{op_type}", op_type);
                        ReplaceAll(doc, "{op}", op););
    schema.SetDoc(doc);
    schema.Input(
        0,
        "X",
        "Input data tensor from the previous operator; "
        "dimensions for image case are (N x C x H x W), "
        "where N is the batch size, C is the number of "
        "channels, and H and W are the height and the width "
        "of the data. For non image case, the dimensions are "
        "in the form of (N x C x D1 x D2 ... Dn), "
        "where N is the batch size.",
        "T",
        OpSchema::Single,
        true,
        1,
        OpSchema::Differentiable);
    schema.Output(
        0,
        "Y",
        "Output data tensor from pooling across the input "
        "tensor. The output tensor has the same rank as the input. "
        "The first two dimensions of output shape are the same as "
        "the input (N x C), while the other dimensions are all 1.",
        "T",
        OpSchema::Single,
        true,
        1,
        OpSchema::Differentiable);
    schema.TypeConstraint(
        "T",
        {"tensor(float16)", "tensor(float)", "tensor(double)"},
        "Constrain input and output types to float tensors.");
    schema.TypeAndShapeInferenceFunction([](InferenceContext& ctx) { globalPoolTypeShapeInference_opset1(ctx); });
  };
}
|
|
// Registers GlobalAveragePool at opset version 1.
ONNX_OPERATOR_SET_SCHEMA(
    GlobalAveragePool,
    1,
    OpSchema().FillUsing(GlobalPoolingOpSchemaGenerator_opset1("AveragePool", "average")));
|
|
// Registers GlobalMaxPool at opset version 1.
ONNX_OPERATOR_SET_SCHEMA(
    GlobalMaxPool,
    1,
    OpSchema().FillUsing(GlobalPoolingOpSchemaGenerator_opset1("MaxPool", "max")));
|
|
|
|
// Shape inference for ConvTranspose (opset 11). Output spatial size is
// stride * (in - 1) + output_padding + effective_kernel - pads, unless the
// output_shape attribute pins it explicitly. Silently returns (leaving the
// shape unknown) on most malformed-but-recoverable attribute sizes; fails
// only on inconsistent pads/auto_pad usage.
void convTransposeShapeInference_opset11(InferenceContext& ctx) {
  propagateElemTypeFromInputToOutput(ctx, 0, 0);

  // we need at least two inputs to have a shape for this inference.
  if (!hasNInputShapes(ctx, 2)) {
    return;
  }

  int64_t group = getAttribute(ctx, "group", 1);

  auto input_shape = ctx.getInputType(0)->tensor_type().shape();
  if (input_shape.dim_size() < 2) {
    return; // Input tensor should have at least two dimensions.
  }

  // first dim is the batch axis and the next is the number of channels.
  size_t n_input_dims = static_cast<size_t>(input_shape.dim_size() - 2);

  std::vector<int64_t> dilations;
  if (getRepeatedAttribute(ctx, "dilations", dilations)) {
    if (dilations.size() != n_input_dims) {
      return;
    }
  } else {
    dilations.assign(n_input_dims, 1);
  }

  std::vector<int64_t> strides;
  if (getRepeatedAttribute(ctx, "strides", strides)) {
    if (strides.size() != n_input_dims) {
      return;
    }
  } else {
    strides.assign(n_input_dims, 1);
  }

  std::vector<int64_t> kernel_shape;
  if (getRepeatedAttribute(ctx, "kernel_shape", kernel_shape)) {
    if (kernel_shape.size() != n_input_dims) {
      return;
    }
  } else {
    // Derive kernel shape from the weight tensor's spatial dims.
    auto second_input_shape = ctx.getInputType(1)->tensor_type().shape();
    for (int i = 2; i < second_input_shape.dim_size(); ++i) {
      if (!second_input_shape.dim(i).has_dim_value()) {
        return;
      }
      kernel_shape.push_back(second_input_shape.dim(i).dim_value());
    }
  }

  std::vector<int64_t> effective_kernel_shape = kernel_shape;
  for (int i = 0; i < static_cast<int>(kernel_shape.size()); i++) {
    // accounting for dilation, how big is the kernel in this dimension
    effective_kernel_shape[i] = (effective_kernel_shape[i] - 1) * dilations[i] + 1;
  }

  std::vector<int64_t> pads;
  if (getRepeatedAttribute(ctx, "pads", pads)) {
    if (pads.size() != n_input_dims * 2) {
      fail_shape_inference("Attribute pads has incorrect size");
    }
    // Explicit pads and a non-NOTSET auto_pad are mutually exclusive.
    const auto* auto_pad_attr = ctx.getAttribute("auto_pad");
    if (nullptr != auto_pad_attr && auto_pad_attr->s() != "NOTSET") {
      fail_shape_inference("The pads attribute cannot be used simultaneously with auto_pad attribute");
    }
  } else {
    pads.assign(n_input_dims * 2, 0);
    const auto* auto_pad_attr = ctx.getAttribute("auto_pad");
    if ((nullptr != auto_pad_attr) && (auto_pad_attr->s() != "VALID")) {
      int input_dims_size = static_cast<int>(n_input_dims);
      for (int i = 0; i < input_dims_size; ++i) {
        // SAME padding for transpose: total = kernel - stride, clamped to 0.
        int64_t total_pad = effective_kernel_shape[i] - strides[i];
        if (total_pad < 0)
          total_pad = 0;
        int64_t half_pad_small = total_pad >> 1;
        int64_t half_pad_big = total_pad - half_pad_small;
        // SAME_UPPER puts the extra pixel at the end; SAME_LOWER at the start.
        if (auto_pad_attr->s() == "SAME_UPPER") {
          pads[i] = half_pad_small;
          pads[i + input_dims_size] = half_pad_big;
        } else if (auto_pad_attr->s() == "SAME_LOWER") {
          pads[i] = half_pad_big;
          pads[i + input_dims_size] = half_pad_small;
        }
      }
    }
  }

  std::vector<int64_t> output_shape;
  bool output_shape_presented = true;
  if (getRepeatedAttribute(ctx, "output_shape", output_shape)) {
    if (output_shape.size() != n_input_dims) {
      return;
    }
  } else {
    output_shape_presented = false;
  }

  std::vector<int64_t> output_padding;
  if (getRepeatedAttribute(ctx, "output_padding", output_padding)) {
    if (output_padding.size() != n_input_dims) { // Added only to one side.
      return;
    }
  } else {
    output_padding.assign(n_input_dims, 0);
  }

  auto final_output_shape = ctx.getOutputType(0)->mutable_tensor_type()->mutable_shape();

  *final_output_shape->add_dim() = input_shape.dim(0);
  *final_output_shape->add_dim() =
      ctx.getInputType(1)->tensor_type().shape().dim(1) * group; // channels should be the second dim of second input
                                                                 // multiply group.

  int size_of_output;
  if (output_shape_presented) {
    // output_shape attribute takes precedence over the computed size.
    size_of_output = static_cast<int>(output_shape.size());
    for (int i = 0; i < size_of_output; ++i) {
      if (input_shape.dim(i + 2).has_dim_value()) {
        if (output_shape[i] < input_shape.dim(i + 2).dim_value()) {
          // TODO: throw exception?
          return; // output shape value cannot be smaller than the input shape
                  // value
        }
      }
      final_output_shape->add_dim()->set_dim_value(output_shape[i]);
    }
    return;
  } else {
    size_of_output = input_shape.dim_size() - 2;
    for (int i = 0; i < size_of_output; ++i) {
      if (input_shape.dim(i + 2).has_dim_value()) {
        // out = stride * (in - 1) + output_padding + kernel - pad_begin - pad_end
        int64_t output_shape_dim = strides[i] * (input_shape.dim(i + 2).dim_value() - 1) + output_padding[i] +
            effective_kernel_shape[i] - pads[i] - pads[i + n_input_dims];
        final_output_shape->add_dim()->set_dim_value(output_shape_dim);
      } else {
        final_output_shape->add_dim();
      }
    }
    return;
  }
}
|
|
|
|
static const char* DeformConv_ver19_doc = R"DOC(
|
|
Performs deformable convolution as described in https://arxiv.org/abs/1703.06211 and https://arxiv.org/abs/1811.11168.
|
|
This operator specification supports the general N-D case. Note that most common use cases have 2D or 3D data.
|
|
)DOC";
|
|
|
|
ONNX_OPERATOR_SET_SCHEMA(
|
|
DeformConv,
|
|
19,
|
|
OpSchema()
|
|
.SetDoc(DeformConv_ver19_doc)
|
|
.Input(
|
|
0,
|
|
"X",
|
|
"Input data tensor. For 2D image data, it has shape (N, C, H, W) where N is the batch size, "
|
|
"C is the number of input channels, and H and W are the height and width. "
|
|
"In general, the shape is (N, C, D1, D2, ... , Dn) for n-dimensional data, where "
|
|
"D1 to Dn are the spatial dimension sizes. Most common use cases have n = 2 or 3.",
|
|
"T")
|
|
.Input(
|
|
1,
|
|
"W",
|
|
"Weight tensor that will be used in the convolutions. It has shape (oC, C/group, kH, kW), "
|
|
"where oC is the number of output channels and kH and kW are the kernel height and width. "
|
|
"For more than 2 dimensions, it has shape (oC, C/group, k1, k2, ... , kn).",
|
|
"T")
|
|
.Input(
|
|
2,
|
|
"offset",
|
|
"Offset tensor denoting the offset for the sampling locations in the convolution kernel. "
|
|
"It has shape (N, offset_group * kH * kW * 2, oH, oW) for 2D data or "
|
|
"(N, offset_group * k1 * k2 * ... * kn * n, o1, o2, ... , on) for nD data. Use linear interpolation"
|
|
"for fractional offset values. Sampling locations outside of the padded input tensor gives zero.",
|
|
"T")
|
|
.Input(
|
|
3,
|
|
"B",
|
|
"Optional 1D bias of length oC to be added to the convolution. Default is a tensor of zeros.",
|
|
"T",
|
|
OpSchema::Optional)
|
|
.Input(
|
|
4,
|
|
"mask",
|
|
"The mask tensor to be applied to each position in the convolution kernel. "
|
|
"It has shape (N, offset_group * kH * kW, oH, oW) for 2D data or "
|
|
"(N, offset_group * k1 * k2 * ... * kn * n, o1, o2, ... , on) for nD data. Default is a "
|
|
"tensor of ones.",
|
|
"T",
|
|
OpSchema::Optional)
|
|
.Output(
|
|
0,
|
|
"Y",
|
|
"Output data tensor that contains the result of convolution. It has shape (N, oC, oH, oW) "
|
|
"for 2D data or (N, oC, o1, o2, ..., on) for nD data",
|
|
"T")
|
|
.TypeConstraint(
|
|
"T",
|
|
{"tensor(float16)", "tensor(float)", "tensor(double)"},
|
|
"Constrain input and output types to float tensors.")
|
|
.Attr(
|
|
"dilations",
|
|
"Dilation value along each spatial axis of the kernel. Default is 1 along each axis.",
|
|
AttributeProto::INTS,
|
|
OPTIONAL_VALUE)
|
|
.Attr(
|
|
"group",
|
|
"Number of groups the input and output channels, C and oC, are divided into. C and oC must both "
|
|
"be divisible by group. Default is 1.",
|
|
AttributeProto::INT,
|
|
static_cast<int64_t>(1))
|
|
.Attr(
|
|
"kernel_shape",
|
|
"Shape of the convolution kernel. If not present, it is inferred from the shape of input W.",
|
|
AttributeProto::INTS,
|
|
OPTIONAL_VALUE)
|
|
.Attr(
|
|
"offset_group",
|
|
"Number of groups of offset. C must be divisible by offset_group. Default is 1.",
|
|
AttributeProto::INT,
|
|
static_cast<int64_t>(1))
|
|
.Attr(
|
|
"pads",
|
|
"Padding for the beginning and end along each spatial axis. The values represent the number of pixels "
|
|
"added to the beginning and end of the corresponding axis and can take any nonnegative value. "
|
|
"The format should be as follows: [x1_begin, x2_begin, ..., x1_end, x2_end, ...], where xi_begin "
|
|
"is the number of pixels added at the beginning of axis `i` and xi_end is the number of pixels "
|
|
"added at the end of axis `i`. Default is 0 along each axis.",
|
|
AttributeProto::INTS,
|
|
OPTIONAL_VALUE)
|
|
.Attr(
|
|
"strides",
|
|
"Stride along each spatial axis. Default is 1 along each axis.",
|
|
AttributeProto::INTS,
|
|
OPTIONAL_VALUE)
|
|
.TypeAndShapeInferenceFunction([](InferenceContext& ctx) {
|
|
propagateElemTypeFromInputToOutput(ctx, 0, 0);
|
|
convPoolShapeInference_opset19(ctx, true, false, 0, 1);
|
|
}));
|
|
|
|
// Builds the shared ConvTranspose (opset 11) schema: doc text, inputs X/W and
// optional bias B, float type constraint, attributes, and ConvTranspose shape
// inference. The {filter_desc} placeholder in the doc is replaced with the
// caller-supplied description.
std::function<void(OpSchema&)> ConvTransposeOpSchemaGenerator_opset11(const char* filter_desc) {
  return [=](OpSchema& schema) {
    std::string doc;
    POPULATE_OP_DOC_STR(doc = R"DOC(
The convolution transpose operator consumes an input tensor and {filter_desc},
and computes the output.

If the pads parameter is provided the shape of the output is calculated via the following equation:

output_shape[i] = stride[i] * (input_size[i] - 1) + output_padding[i] + ((kernel_shape[i] - 1) * dilations[i] + 1) - pads[start_i] - pads[end_i]

output_shape can also be explicitly specified in which case pads values are auto generated using these equations:

total_padding[i] = stride[i] * (input_size[i] - 1) + output_padding[i] + ((kernel_shape[i] - 1) * dilations[i] + 1) - output_shape[i]
If (auto_pads == SAME_UPPER): pads[start_i] = total_padding[i]/2; pads[end_i] = total_padding[i] - (total_padding[i]/2)
Else: pads[start_i] = total_padding[i] - (total_padding[i]/2); pads[end_i] = (total_padding[i]/2).

)DOC";
                        ReplaceAll(doc, "{filter_desc}", filter_desc););
    schema.SetDoc(doc);
    // X: (N x C x H x W) for 2D, (N x C x D1 ... Dn) in general.
    schema.Input(
        0,
        "X",
        "Input data tensor from previous layer; has size (N x C x H x W)"
        ", where N is the batch size, C is the number of channels, and"
        " H and W are the height and width. Note that this is for the 2D image. "
        "Otherwise the size is (N x C x D1 x D2 ... x Dn)",
        "T",
        OpSchema::Single,
        true,
        1,
        OpSchema::Differentiable);
    // W: (C x M/group x kH x kW) — note the transposed layout relative to Conv.
    schema.Input(
        1,
        "W",
        "The weight tensor that will be used in the "
        "convolutions; has size (C x M/group x kH x kW), where C "
        "is the number of channels, and kH and kW are the "
        "height and width of the kernel, and M is the number "
        "of feature maps. For more than 2 dimensions, the "
        "weight shape will be (C x M/group x k1 x k2 x ... x kn), "
        "where (k1 x k2 x ... x kn) is the dimension of the kernel. "
        "The number of channels in the output should be equal to W.shape[1] * group "
        "(assuming zero based indices of the shape array)",
        "T",
        OpSchema::Single,
        true,
        1,
        OpSchema::Differentiable);
    schema.Input(
        2,
        "B",
        "Optional 1D bias to be added to the convolution, has size of M.",
        "T",
        OpSchema::Optional,
        true,
        1,
        OpSchema::Differentiable);
    schema.Output(
        0,
        "Y",
        "Output data tensor that contains the result of the convolution. The "
        "output dimensions are functions of the kernel size, stride size, "
        "pad lengths and group count. "
        "The number of channels in the output should be equal to W.shape[1] * group "
        "(assuming zero based indices of the shape array)",
        "T",
        OpSchema::Single,
        true,
        1,
        OpSchema::Differentiable);
    schema.TypeConstraint(
        "T",
        {"tensor(float16)", "tensor(float)", "tensor(double)"},
        "Constrain input and output types to float tensors.");
    schema.Attr(
        "kernel_shape",
        "The shape of the convolution kernel. If not present, should be inferred from input W.",
        AttributeProto::INTS,
        OPTIONAL_VALUE);
    // output_shape, when given, overrides pads (pads are auto generated).
    schema.Attr(
        "output_shape",
        "The shape of the output can be explicitly set which will cause pads values to be auto generated. If output_shape is specified "
        "pads values are ignored. See doc for details for equations to generate pads. Note that the output_shape attribute value "
        "should not include dimensions for batch size and channels, which are automatically inferred.",
        AttributeProto::INTS,
        OPTIONAL_VALUE);
    schema.Attr(
        "output_padding",
        "Additional elements added to the side with higher coordinate indices in the output. "
        "Each padding value in \"output_padding\" must be less than the corresponding stride/dilation dimension. "
        "By default, this attribute is a zero vector. "
        "Note that this attribute doesn't directly affect the computed output values. "
        "It only controls the selection of the computed values, "
        "so changing this attribute only adds or removes output elements. "
        "If \"output_shape\" is explicitly provided, "
        "\"output_padding\" does not contribute additional size to \"output_shape\" but "
        "participates in the computation of the needed padding amount. "
        "This is also called adjs or adjustment in some frameworks.",
        AttributeProto::INTS,
        OPTIONAL_VALUE);
    schema.Attr(
        "dilations",
        "dilation value along each spatial axis of the filter. If not present, the dilation defaults to 1 along each spatial axis.",
        AttributeProto::INTS,
        OPTIONAL_VALUE);
    schema.Attr(
        "strides",
        "Stride along each spatial axis. If not present, the stride defaults to 1 along each spatial axis.",
        AttributeProto::INTS,
        OPTIONAL_VALUE);
    schema.Attr("auto_pad", conv_transpose_auto_pad_doc_opset19, AttributeProto::STRING, std::string("NOTSET"));
    schema.Attr("pads", pads_doc_opset11, AttributeProto::INTS, OPTIONAL_VALUE);
    schema.Attr(
        "group",
        "number of groups input channels and output channels are divided into.",
        AttributeProto::INT,
        static_cast<int64_t>(1));
    // Shape inference is ConvTranspose-specific (handles output_shape/pads).
    schema.TypeAndShapeInferenceFunction([](InferenceContext& ctx) { convTransposeShapeInference_opset11(ctx); });
  };
}
|
|
|
|
// Register ConvTranspose (opset 11) using the shared generator above.
ONNX_OPERATOR_SET_SCHEMA(ConvTranspose, 11, OpSchema().FillUsing(ConvTransposeOpSchemaGenerator_opset11("a filter")));
|
|
|
|
// Builds the shared Conv (opset 11) schema: doc text, inputs X/W and optional
// bias B, float type constraint, attributes, and conv shape inference. The
// {filter_desc} placeholder in the doc is replaced with the caller-supplied
// description.
std::function<void(OpSchema&)> ConvOpSchemaGenerator_opset11(const char* filter_desc) {
  return [=](OpSchema& schema) {
    std::string doc;
    POPULATE_OP_DOC_STR(doc = R"DOC(
The convolution operator consumes an input tensor and {filter_desc}, and
computes the output.)DOC";
                        ReplaceAll(doc, "{filter_desc}", filter_desc););
    schema.SetDoc(doc);
    schema.Input(
        0,
        "X",
        "Input data tensor from previous layer; "
        "has size (N x C x H x W), where N is the batch size, "
        "C is the number of channels, and H and W are the "
        "height and width. Note that this is for the 2D image. "
        "Otherwise the size is (N x C x D1 x D2 ... x Dn). "
        "Optionally, if dimension denotation is "
        "in effect, the operation expects input data tensor "
        "to arrive with the dimension denotation of [DATA_BATCH, "
        "DATA_CHANNEL, DATA_FEATURE, DATA_FEATURE ...].",
        "T",
        OpSchema::Single,
        true,
        1,
        OpSchema::Differentiable);
    schema.Input(
        1,
        "W",
        "The weight tensor that will be used in the "
        "convolutions; has size (M x C/group x kH x kW), where C "
        "is the number of channels, and kH and kW are the "
        "height and width of the kernel, and M is the number "
        "of feature maps. For more than 2 dimensions, the "
        "kernel shape will be (M x C/group x k1 x k2 x ... x kn), "
        "where (k1 x k2 x ... kn) is the dimension of the kernel. "
        "Optionally, if dimension denotation is in effect, "
        "the operation expects the weight tensor to arrive "
        "with the dimension denotation of [FILTER_OUT_CHANNEL, "
        "FILTER_IN_CHANNEL, FILTER_SPATIAL, FILTER_SPATIAL ...]. "
        "Assuming zero based indices for the shape array, "
        "X.shape[1] == (W.shape[1] * group) == C and "
        "W.shape[0] mod G == 0. Or in other words "
        "FILTER_IN_CHANNEL multiplied by the number of groups "
        "should be equal to DATA_CHANNEL and the number of "
        "feature maps M should be a multiple of the number of "
        "groups G.",
        "T",
        OpSchema::Single,
        true,
        1,
        OpSchema::Differentiable);
    schema.Input(
        2,
        "B",
        "Optional 1D bias to be added to the convolution, has size of M.",
        "T",
        OpSchema::Optional,
        true,
        1,
        OpSchema::Differentiable);
    schema.Output(
        0,
        "Y",
        "Output data tensor that contains the result of the "
        "convolution. The output dimensions are functions "
        "of the kernel size, stride size, and pad lengths.",
        "T",
        OpSchema::Single,
        true,
        1,
        OpSchema::Differentiable);
    // Fix: "tensor(float16)" was listed twice; the duplicate is removed so the
    // allowed-type list matches every other float constraint in this file.
    schema.TypeConstraint(
        "T",
        {"tensor(float16)", "tensor(float)", "tensor(double)"},
        "Constrain input and output types to float tensors.");
    schema.Attr(
        "kernel_shape",
        "The shape of the convolution kernel. If not present, should be inferred from input W.",
        AttributeProto::INTS,
        OPTIONAL_VALUE);
    schema.Attr(
        "dilations",
        "dilation value along each spatial axis of the filter. If not present, the dilation defaults is 1 along each spatial axis.",
        AttributeProto::INTS,
        OPTIONAL_VALUE);
    schema.Attr(
        "strides",
        "Stride along each spatial axis. If not present, the stride defaults is 1 along each spatial axis.",
        AttributeProto::INTS,
        OPTIONAL_VALUE);
    schema.Attr("auto_pad", conv_auto_pad_doc_opset19, AttributeProto::STRING, std::string("NOTSET"));
    schema.Attr("pads", pads_doc_opset11, AttributeProto::INTS, OPTIONAL_VALUE);
    schema.Attr(
        "group",
        "number of groups input channels and output channels are divided into.",
        AttributeProto::INT,
        static_cast<int64_t>(1));
    schema.TypeAndShapeInferenceFunction([](InferenceContext& ctx) {
      // Elem type comes from X; spatial dims follow the standard conv rules.
      propagateElemTypeFromInputToOutput(ctx, 0, 0);
      convPoolShapeInference_opset19(ctx, true, false, 0, 1);
    });
  };
}
|
|
|
|
// Register Conv (opset 11) using the shared generator above.
ONNX_OPERATOR_SET_SCHEMA(Conv, 11, OpSchema().FillUsing(ConvOpSchemaGenerator_opset11("a filter")));
|
|
|
|
// Shape inference for ROI pooling ops (e.g. MaxRoiPool). Output shape is
// (num_rois, channels, pooled_shape[0], pooled_shape[1]); element type is
// propagated from the data input.
void roiPoolTypeShapeInference_opset1(InferenceContext& ctx) {
  propagateElemTypeFromInputToOutput(ctx, 0, 0);

  // Both the data tensor and the RoIs tensor need known shapes.
  if (!hasNInputShapes(ctx, 2)) {
    return;
  }

  const auto input_shape = ctx.getInputType(0)->tensor_type().shape();
  const auto rois_shape = ctx.getInputType(1)->tensor_type().shape();

  if (input_shape.dim_size() < 2) {
    fail_shape_inference("Input tensor must have at least 2 dimensions");
  }
  if (rois_shape.dim_size() != 2) {
    fail_shape_inference("RoIs tensor must have 2 dimensions");
  }

  // Spatial rank excludes the leading batch and channel axes.
  const size_t n_input_dims = static_cast<size_t>(input_shape.dim_size() - 2);

  // pooled_shape is mandatory and must cover every spatial axis.
  std::vector<int64_t> pooled_shape;
  if (!getRepeatedAttribute(ctx, "pooled_shape", pooled_shape)) {
    fail_shape_inference("Attribute pooled_shape must be specified");
  }
  if (pooled_shape.size() != n_input_dims) {
    fail_shape_inference("Attribute pooled_shape has incorrect length");
  }

  // Assemble (num_rois, channels, pooled_shape[0], pooled_shape[1]).
  auto output_shape = ctx.getOutputType(0)->mutable_tensor_type()->mutable_shape();
  *output_shape->add_dim() = rois_shape.dim(0);
  *output_shape->add_dim() = input_shape.dim(1);
  output_shape->add_dim()->set_dim_value(pooled_shape[0]);
  output_shape->add_dim()->set_dim_value(pooled_shape[1]);
}
|
|
|
|
// Builds the shared MaxRoiPool (opset 1) schema: pooled_shape/spatial_scale
// attributes, data and RoI inputs, float type constraint, and ROI-pool shape
// inference. The {name} placeholder in the doc is replaced with the pooling
// kind (e.g. "max").
std::function<void(OpSchema&)> RoiPoolOpSchemaGenerator_opset1(const char* name) {
  return [=](OpSchema& schema) {
    std::string doc;
    POPULATE_OP_DOC_STR(doc = R"DOC(
 ROI {name} pool consumes an input tensor X and region of interests (RoIs) to
 apply {name} pooling across each RoI, to produce output 4-D tensor of shape
 (num_rois, channels, pooled_shape[0], pooled_shape[1]).)DOC";
                        ReplaceAll(doc, "{name}", name););
    schema.SetDoc(doc);
    schema.Attr("pooled_shape", "ROI pool output shape (height, width).", AttributeProto::INTS);
    schema.Attr(
        "spatial_scale",
        "Multiplicative spatial scale factor to translate ROI coordinates from their input scale to the scale used when pooling.",
        AttributeProto::FLOAT,
        1.f);
    schema.Input(
        0,
        "X",
        "Input data tensor from the previous operator; "
        "dimensions for image case are (N x C x H x W), "
        "where N is the batch size, C is the number of "
        "channels, and H and W are the height and the "
        "width of the data.",
        "T",
        OpSchema::Single,
        true,
        1,
        OpSchema::Differentiable);
    // Each RoI row is [batch_id, x1, y1, x2, y2].
    schema.Input(
        1,
        "rois",
        "RoIs (Regions of Interest) to pool over. Should "
        "be a 2-D tensor of shape (num_rois, 5) given as "
        "[[batch_id, x1, y1, x2, y2], ...].",
        "T",
        OpSchema::Single,
        true,
        1,
        OpSchema::NonDifferentiable);
    schema.Output(
        0,
        "Y",
        "RoI pooled output 4-D tensor of shape (num_rois, channels, pooled_shape[0], pooled_shape[1]).",
        "T",
        OpSchema::Single,
        true,
        1,
        OpSchema::Differentiable);
    schema.TypeConstraint(
        "T",
        {"tensor(float16)", "tensor(float)", "tensor(double)"},
        "Constrain input and output types to float tensors.");
    schema.TypeAndShapeInferenceFunction([](InferenceContext& ctx) { roiPoolTypeShapeInference_opset1(ctx); });
  };
}
|
|
|
|
// Register MaxRoiPool (opset 1) using the shared ROI-pool generator.
ONNX_OPERATOR_SET_SCHEMA(MaxRoiPool, 1, OpSchema().FillUsing(RoiPoolOpSchemaGenerator_opset1("max")));
|
|
|
|
// Builds the LpPool (opset 18) schema: pooling attributes (kernel_shape,
// strides, dilations, pads/auto_pad, p, ceil_mode), data input/output, float
// type constraint, and conv/pool shape inference. The {name} placeholder in
// the doc is replaced with the operator name.
std::function<void(OpSchema&)> LpPoolOpSchemaGenerator_opset18(const char* name) {
  return [=](OpSchema& schema) {
    std::string doc;
    POPULATE_OP_DOC_STR(doc = R"DOC(
 {name} consumes an input tensor X and applies Lp pooling across
 the tensor according to kernel sizes, stride sizes, and pad lengths.
 Lp pooling consisting of computing the Lp norm on all values of a subset
 of the input tensor according to the kernel size and downsampling the
 data into the output tensor Y for further processing. The output spatial shape will be following:
 ```
 output_spatial_shape[i] = floor((input_spatial_shape[i] + pad_shape[i] - {kernelSpatialShape}) / strides_spatial_shape[i] + 1)
 ```
 or
 ```
 output_spatial_shape[i] = ceil((input_spatial_shape[i] + pad_shape[i] - {kernelSpatialShape}) / strides_spatial_shape[i] + 1)
 ```
 if ceil_mode is enabled `pad_shape[i]` is the sum of pads along axis `i`.

 `auto_pad` is a DEPRECATED attribute. If you are using them currently, the output spatial shape will be following:
 ```
 VALID: output_spatial_shape[i] = ceil((input_spatial_shape[i] - {kernelSpatialShape} + 1) / strides_spatial_shape[i])
 SAME_UPPER or SAME_LOWER: output_spatial_shape[i] = ceil(input_spatial_shape[i] / strides_spatial_shape[i])
 ```
 And pad shape will be following if `SAME_UPPER` or `SAME_LOWER`:
 ```
 pad_shape[i] = (output_spatial_shape[i] - 1) * strides_spatial_shape[i] + {kernelSpatialShape} - input_spatial_shape[i]
 ```)DOC";
                        ReplaceAll(doc, "{name}", name););
    schema.SetDoc(doc);
    schema.Attr("kernel_shape", "The size of the kernel along each axis.", AttributeProto::INTS);
    schema.Attr(
        "strides",
        "Stride along each spatial axis. If not present, the stride defaults to 1 along each spatial axis.",
        AttributeProto::INTS,
        OPTIONAL_VALUE);
    schema.Attr(
        "dilations",
        "dilation value along each spatial axis of the filter. If not present, the dilation defaults is 1 along each spatial axis.",
        AttributeProto::INTS,
        OPTIONAL_VALUE);
    schema.Attr("auto_pad", conv_auto_pad_doc_opset19, AttributeProto::STRING, std::string("NOTSET"));
    schema.Attr("pads", pads_doc_opset11, AttributeProto::INTS, OPTIONAL_VALUE);
    // p selects the Lp norm used within each pooling window (default L2).
    schema.Attr(
        "p", "p value of the Lp norm used to pool over the input data.", AttributeProto::INT, static_cast<int64_t>(2));
    schema.Attr(
        "ceil_mode",
        "Whether to use ceil or floor (default) to compute the output shape.",
        AttributeProto::INT,
        static_cast<int64_t>(0));
    schema.Input(
        0,
        "X",
        "Input data tensor from the previous operator; "
        "dimensions for image case are (N x C x H x W), "
        "where N is the batch size, C is the number of "
        "channels, and H and W are the height and the "
        "width of the data. For non image case, the "
        "dimensions are in the form of "
        "(N x C x D1 x D2 ... Dn), where N is the "
        "batch size.",
        "T",
        OpSchema::Single,
        true,
        1,
        OpSchema::Differentiable);
    schema.Output(
        0,
        "Y",
        "Output data tensor from Lp pooling across the input "
        "tensor. Dimensions will vary based on various kernel, stride, and pad "
        "sizes.",
        "T",
        OpSchema::Single,
        true,
        1,
        OpSchema::Differentiable);
    schema.TypeConstraint(
        "T",
        {"tensor(float16)", "tensor(float)", "tensor(double)"},
        "Constrain input and output types to float tensors.");
    schema.TypeAndShapeInferenceFunction([](InferenceContext& ctx) {
      // Pool shape inference: dilations supported, require explicit kernel.
      propagateElemTypeFromInputToOutput(ctx, 0, 0);
      convPoolShapeInference_opset19(ctx, true, true, 0, 1);
    });
  };
}
|
|
|
|
// Register LpPool (opset 18) using the LpPool-specific generator.
ONNX_OPERATOR_SET_SCHEMA(LpPool, 18, OpSchema().FillUsing(LpPoolOpSchemaGenerator_opset18("LpPool")));
|
|
|
|
static const char* MaxUnpool_ver11_doc = R"DOC(
|
|
MaxUnpool essentially computes the partial inverse of the MaxPool op.
|
|
The input information to this op is typically the output information from a MaxPool op. The first
|
|
input tensor X is the tensor that needs to be unpooled, which is typically the pooled tensor (first output)
|
|
from MaxPool. The second input tensor, I, contains the indices to the (locally maximal) elements corresponding
|
|
to the elements in the first input tensor X. Input tensor I is typically the second output of the MaxPool op.
|
|
The third (optional) input is a tensor that specifies the output size of the unpooling operation.
|
|
|
|
MaxUnpool is intended to do 'partial' inverse of the MaxPool op. 'Partial' because all the non-maximal
|
|
values from the original input to MaxPool are set to zero in the output of the MaxUnpool op. Pooling
|
|
the result of an unpooling operation should give back the original input to the unpooling op.
|
|
|
|
MaxUnpool can produce the same output size for several input sizes, which makes unpooling op ambiguous.
|
|
The third input argument, output_size, is meant to disambiguate the op and produce output tensor of
|
|
known/predictable size.
|
|
|
|
In addition to the inputs, MaxUnpool takes three attributes, namely kernel_shape, strides, and pads,
|
|
which define the exact unpooling op. The attributes typically have the same values as the corresponding
|
|
pooling op that the unpooling op is trying to invert.
|
|
)DOC";
|
|
|
|
ONNX_OPERATOR_SET_SCHEMA(
|
|
MaxUnpool,
|
|
11,
|
|
OpSchema()
|
|
.SetDoc(MaxUnpool_ver11_doc)
|
|
.Attr("kernel_shape", "The size of the kernel along each axis.", AttributeProto::INTS)
|
|
.Attr(
|
|
"strides",
|
|
"Stride along each spatial axis. If not present, the stride defaults to 1 along each spatial axis.",
|
|
AttributeProto::INTS,
|
|
OPTIONAL_VALUE)
|
|
.Attr("pads", pads_doc_opset11, AttributeProto::INTS, OPTIONAL_VALUE)
|
|
.Input(
|
|
0,
|
|
"X",
|
|
"Input data tensor that has to be unpooled. "
|
|
"This tensor is typically the first output of the MaxPool op."
|
|
"Dimensions for image case are (N x C x H x W), "
|
|
"where N is the batch size, C is the number of "
|
|
"channels, and H and W are the height and the "
|
|
"width of the data. For non-image case, the "
|
|
"dimensions are in the form of "
|
|
"(N x C x D1 x D2 ... Dn), where N is the batch "
|
|
"size. Optionally, if dimension denotation is "
|
|
"in effect, the operation expects the input "
|
|
"data tensor to arrive with the dimension denotation "
|
|
"of [DATA_BATCH, DATA_CHANNEL, DATA_FEATURE, DATA_FEATURE ...].",
|
|
"T1",
|
|
OpSchema::Single,
|
|
true,
|
|
1,
|
|
OpSchema::Differentiable)
|
|
.Input(
|
|
1,
|
|
"I",
|
|
"Input data tensor containing the indices corresponding to "
|
|
"elements in the first input tensor X."
|
|
"This tensor is typically the second output of the MaxPool op."
|
|
"Dimensions must be the same as input tensor X. "
|
|
"The indices are linear, i.e. computed considering the tensor as flattened 1-D tensor, "
|
|
"assuming row-major storage. Also, the linear indices should not consider padding. "
|
|
"So the values in indices are in the range [0, N x C x D1 x ... x Dn).",
|
|
"T2",
|
|
OpSchema::Single,
|
|
true,
|
|
1,
|
|
OpSchema::NonDifferentiable)
|
|
.Input(
|
|
2,
|
|
"output_shape",
|
|
"The shape of the output can be explicitly set which will cause pads values to be auto generated. If 'output_shape' is specified, "
|
|
"'pads' values are ignored.",
|
|
"T2",
|
|
OpSchema::Optional,
|
|
true,
|
|
1,
|
|
OpSchema::NonDifferentiable)
|
|
.Output(
|
|
0,
|
|
"output",
|
|
"Output data tensor that contains the result of the unpooling.",
|
|
"T1",
|
|
OpSchema::Single,
|
|
true,
|
|
1,
|
|
OpSchema::Differentiable)
|
|
.TypeConstraint(
|
|
"T1",
|
|
{"tensor(float16)", "tensor(float)", "tensor(double)"},
|
|
"Constrain input and output types to float tensors.")
|
|
.TypeConstraint("T2", {"tensor(int64)"}, "Constrain index tensor to int64")
|
|
.TypeAndShapeInferenceFunction([](InferenceContext& ctx) { maxUnpoolShapeInference_opset11(ctx); }));
|
|
|
|
// Returns the tensor element types a pooling op accepts: the three float
// types always, plus int8/uint8 when the op supports 8-bit data.
std::vector<std::string> GetSupportedDataTypesForPoolingOps_opset19(bool supports8bit) {
  std::vector<std::string> types{"tensor(float16)", "tensor(float)", "tensor(double)"};
  if (supports8bit) {
    types.emplace_back("tensor(int8)");
    types.emplace_back("tensor(uint8)");
  }
  return types;
}
|
|
|
|
// Builds the common pooling schema (opset 19) shared by AveragePool and
// MaxPool: doc text with placeholders, common attributes, data input/output,
// the type constraint (optionally including 8-bit types), and shape
// inference. Op-specific attributes/outputs are appended by the caller.
std::function<void(OpSchema&)> PoolOpSchemaGenerator_opset19(
    const char* name,
    const char* opName,
    const char* additionalDescription,
    bool use_dilation,
    bool supports8bit = false) {
  return [=](OpSchema& schema) {
    std::string doc;
    POPULATE_OP_DOC_STR(
        doc = R"DOC(
 {name} consumes an input tensor X and applies {opName} pooling across
 the tensor according to kernel sizes, stride sizes, and pad lengths.
 {opName} pooling consisting of computing the {opName} on all values of a
 subset of the input tensor according to the kernel size and downsampling the
 data into the output tensor Y for further processing. The output spatial shape is calculated differently
 depending on whether explicit padding is used, where pads is employed, or auto padding is used, where auto_pad is utilized.
 With explicit padding (https://pytorch.org/docs/stable/generated/torch.nn.MaxPool2d.html?highlight=maxpool#torch.nn.MaxPool2d):
 ```
 output_spatial_shape[i] = floor((input_spatial_shape[i] + pad_shape[i] - dilation[i] * (kernel_shape[i] - 1) - 1) / strides_spatial_shape[i] + 1)
 ```
 or
 ```
 output_spatial_shape[i] = ceil((input_spatial_shape[i] + pad_shape[i] - dilation[i] * (kernel_shape[i] - 1) - 1) / strides_spatial_shape[i] + 1)
 ```
 if ceil_mode is enabled. `pad_shape[i]` is the sum of pads along axis `i`.

 `auto_pad` is a DEPRECATED attribute. If you are using them currently, the output spatial shape will be following when ceil_mode is enabled:
 ```
 VALID: output_spatial_shape[i] = ceil((input_spatial_shape[i] - {kernelSpatialShape} + 1) / strides_spatial_shape[i])
 SAME_UPPER or SAME_LOWER: output_spatial_shape[i] = ceil(input_spatial_shape[i] / strides_spatial_shape[i])
 ```
 or when ceil_mode is disabled (https://www.tensorflow.org/api_docs/python/tf/keras/layers/AveragePooling2D):
 ```
 VALID: output_spatial_shape[i] = floor((input_spatial_shape[i] - {kernelSpatialShape}) / strides_spatial_shape[i]) + 1
 SAME_UPPER or SAME_LOWER: output_spatial_shape[i] = floor((input_spatial_shape[i] - 1) / strides_spatial_shape[i]) + 1
 ```
 And pad shape will be following if `SAME_UPPER` or `SAME_LOWER`:
 ```
 pad_shape[i] = (output_spatial_shape[i] - 1) * strides_spatial_shape[i] + {kernelSpatialShape} - input_spatial_shape[i]
 ```
 {additionalDescription}
 )DOC";
        ReplaceAll(doc, "{name}", name);
        ReplaceAll(doc, "{opName}", opName);
        ReplaceAll(doc, "{additionalDescription}", additionalDescription);
        // The effective kernel extent in the doc depends on whether the op
        // supports dilations.
        ReplaceAll(
            doc,
            "{kernelSpatialShape}",
            use_dilation ? "((kernel_spatial_shape[i] - 1) * dilations[i] + 1)" : "kernel_spatial_shape[i]"););
    schema.SetDoc(doc);
    schema.Attr("kernel_shape", "The size of the kernel along each axis.", AttributeProto::INTS);
    schema.Attr(
        "strides",
        "Stride along each spatial axis. If not present, the stride defaults to 1 along each spatial axis.",
        AttributeProto::INTS,
        OPTIONAL_VALUE);
    schema.Attr("auto_pad", conv_auto_pad_doc_opset19, AttributeProto::STRING, std::string("NOTSET"));
    schema.Attr("pads", pads_doc_opset11, AttributeProto::INTS, OPTIONAL_VALUE);
    schema.Attr(
        "ceil_mode",
        "Whether to use ceil or floor (default) to compute the output shape.",
        AttributeProto::INT,
        static_cast<int64_t>(0));
    schema.Input(
        0,
        "X",
        "Input data tensor from the previous operator; "
        "dimensions for image case are (N x C x H x W), "
        "where N is the batch size, C is the number of "
        "channels, and H and W are the height and the "
        "width of the data. For non image case, the "
        "dimensions are in the form of "
        "(N x C x D1 x D2 ... Dn), where N is the batch "
        "size. Optionally, if dimension denotation is "
        "in effect, the operation expects the input "
        "data tensor to arrive with the dimension denotation "
        "of [DATA_BATCH, DATA_CHANNEL, DATA_FEATURE, DATA_FEATURE ...].",
        "T",
        OpSchema::Single,
        true,
        1,
        OpSchema::Differentiable);
    schema.Output(
        0,
        "Y",
        "Output data tensor from average or max pooling across "
        "the input tensor. Dimensions will vary based "
        "on various kernel, stride, and pad sizes. Floor value of "
        "the dimension is used",
        "T",
        OpSchema::Single,
        true,
        1,
        OpSchema::Differentiable);
    schema.TypeConstraint(
        "T",
        GetSupportedDataTypesForPoolingOps_opset19(supports8bit),
        supports8bit ? "Constrain input and output types to float and 8 bit tensors."
                     : "Constrain input and output types to float tensors.");
    schema.TypeAndShapeInferenceFunction([use_dilation](InferenceContext& ctx) {
      propagateElemTypeFromInputToOutput(ctx, 0, 0);
      if (ctx.getNumOutputs() > 1) {
        // MaxPool with two outputs case: second output holds int64 indices.
        auto output_type = ctx.getOutputType(1);
        if (output_type->value_case() == TypeProto::kTensorType ||
            output_type->value_case() == TypeProto::VALUE_NOT_SET) {
          output_type->mutable_tensor_type()->set_elem_type(TensorProto::INT64);
        }
      }
      convPoolShapeInference_opset19(ctx, use_dilation, true, 0, 1);
    });
  };
}
|
|
|
|
// Register AveragePool (opset 19). The shared pool generator supplies the
// common pieces; dilations and count_include_pad are AveragePool-specific.
ONNX_OPERATOR_SET_SCHEMA(
    AveragePool,
    19,
    OpSchema()
        .FillUsing(PoolOpSchemaGenerator_opset19(
            "AveragePool",
            "average",
            "The output of each pooling window is divided by the number of elements (exclude pad when attribute count_include_pad is zero).",
            true, /* use_dilation: dilations attribute has been added in opset 19. */
            false /* supports8bit: does not support 8bit. */))
        .Attr(
            "dilations",
            "Dilation value along each spatial axis of filter. If not present, the dilation defaults to 1 along each spatial axis.",
            AttributeProto::INTS,
            OPTIONAL_VALUE)
        .Attr(
            "count_include_pad",
            "Whether include pad pixels when calculating values for the edges. Default is 0, doesn't count include pad.",
            AttributeProto::INT,
            static_cast<int64_t>(0)));
|
|
|
|
// Register MaxPool (opset 12) on top of the shared (opset 19) pool generator.
// MaxPool adds storage_order/dilations attributes and an optional second
// output of int64 indices.
ONNX_OPERATOR_SET_SCHEMA(
    MaxPool,
    12,
    OpSchema()
        .FillUsing(PoolOpSchemaGenerator_opset19(
            "MaxPool",
            "max",
            "The output of each pooling window is maximum number of elements exclude pad. ",
            true,
            true))
        .Attr(
            "storage_order",
            "The storage order of the tensor. 0 is row major, and 1 is column major. "
            "This attribute is used only to convert an n-tuple index value into "
            "a single integer value for producing the second output. ",
            AttributeProto::INT,
            static_cast<int64_t>(0))
        .Attr(
            "dilations",
            "Dilation value along each spatial axis of filter. If not present, the dilation defaults to 1 along each spatial axis.",
            AttributeProto::INTS,
            OPTIONAL_VALUE)
        .Output(
            1,
            "Indices",
            "Indices tensor from max pooling across the input tensor. "
            "The dimensions of indices are the same as output tensor. "
            "The values in indices of are the indices of the selected values during pooling. "
            "The indices are computed as flatten 1-D tensor, "
            "and the indices do not consider padding. "
            "So the values in indices are in [0, N x C x D1 x ... x Dn).",
            "I",
            OpSchema::Optional,
            true,
            1,
            OpSchema::NonDifferentiable)
        .TypeConstraint("I", {"tensor(int64)"}, "Constrain index tensor to int64"));
|
|
|
|
static const char* Dropout_ver12_doc = R"DOC(
|
|
Dropout takes an input floating-point tensor, an optional input ratio (floating-point scalar) and an optional input training_mode (boolean scalar). It produces two tensor outputs,
|
|
output (floating-point tensor) and mask (optional `Tensor<bool>`). If `training_mode` is true then the output Y will be a random dropout;
|
|
Note that this Dropout scales the masked input data by the following equation, so to convert the trained model into inference mode,
|
|
the user can simply not pass `training_mode` input or set it to false.
|
|
```
|
|
output = scale * data * mask,
|
|
```
|
|
where
|
|
```
|
|
scale = 1. / (1. - ratio).
|
|
```
|
|
)DOC";
|
|
|
|
// Registers Dropout for opset 12: optional scalar `ratio` and `training_mode`
// inputs, plus an optional boolean `mask` output.
ONNX_OPERATOR_SET_SCHEMA(
    Dropout,
    12,
    OpSchema()
        .SetDoc(GET_OP_DOC_STR(std::string(Dropout_ver12_doc) + GenerateOptionalArgumentsDoc()))
        .Attr(
            "seed",
            "(Optional) Seed to the random generator, if not specified we will auto generate one.",
            AttributeProto::INT,
            OPTIONAL_VALUE)
        .Input(0, "data", "The input data as Tensor.", "T")
        .Input(
            1,
            "ratio",
            "The ratio of random dropout, with value in [0, 1). If this input was not set, "
            "or if it was set to 0, the output would be a simple copy of the input. "
            "If it's non-zero, output will be a random dropout of the scaled input, which is typically "
            "the case during training. It is an optional value, if not specified it will default to 0.5.",
            "T1",
            OpSchema::Optional)
        .Input(
            2,
            "training_mode",
            "If set to true then it indicates dropout is being used for training. It is an optional value hence unless "
            "specified explicitly, it is false. If it is false, ratio is ignored and the operation mimics inference mode where "
            "nothing will be dropped from the input data and if mask is requested as output it will contain all ones.",
            "T2",
            OpSchema::Optional)
        .Output(0, "output", "The output.", "T")
        .Output(1, "mask", "The output mask.", "T2", OpSchema::Optional)
        .TypeConstraint(
            "T",
            {"tensor(float16)", "tensor(float)", "tensor(double)"},
            "Constrain input and output types to float tensors.")
        .TypeConstraint(
            "T1",
            {"tensor(float16)", "tensor(float)", "tensor(double)"},
            "Constrain input 'ratio' types to float tensors.")
        .TypeConstraint("T2", {"tensor(bool)"}, "Constrain output 'mask' types to boolean tensors.")
        .TypeAndShapeInferenceFunction([](InferenceContext& ctx) {
          // Output 0 keeps the element type and (when known) the shape of input 0.
          propagateElemTypeFromInputToOutput(ctx, 0, 0);
          if (hasInputShape(ctx, 0)) {
            propagateShapeFromInputToOutput(ctx, 0, 0);
          }

          // `ratio` (input 1), if present with a known shape, must be a scalar.
          if (ctx.getNumInputs() > 1 && hasInputShape(ctx, 1)) {
            auto& ratio_input_shape = getInputShape(ctx, 1);
            if (static_cast<int>(ratio_input_shape.dim_size()) != 0) {
              fail_shape_inference("Ratio of Dropout must be a scalar.");
            }
          }

          // `training_mode` (input 2) must likewise be a scalar.
          if (ctx.getNumInputs() > 2 && hasInputShape(ctx, 2)) {
            auto& training_mode_input_shape = getInputShape(ctx, 2);
            if (static_cast<int>(training_mode_input_shape.dim_size()) != 0) {
              fail_shape_inference("training_mode of Dropout must be a scalar.");
            }
          }

          // Optional `mask` output is boolean and mirrors the input shape.
          if (ctx.getNumOutputs() == 2) {
            updateOutputElemType(ctx, 1, TensorProto::BOOL);
            if (hasNInputShapes(ctx, 1)) {
              propagateShapeFromInputToOutput(ctx, 0, 1);
            }
          }
        }));
|
|
|
|
static const char* Flatten_ver11_doc = R"DOC(
|
|
Flattens the input tensor into a 2D matrix. If input tensor has shape
|
|
(d_0, d_1, ... d_n) then the output will have shape
|
|
(d_0 X d_1 ... d_(axis-1), d_axis X d_(axis+1) ... X dn).
|
|
)DOC";
|
|
|
|
// Registers Flatten for opset 13 (IR4 type list; differentiable input/output).
ONNX_OPERATOR_SET_SCHEMA(
    Flatten,
    13,
    OpSchema()
        .SetDoc(Flatten_ver11_doc)
        .Input(0, "input", "A tensor of rank >= axis.", "T", OpSchema::Single, true, 1, OpSchema::Differentiable)
        .Output(
            0,
            "output",
            "A 2D tensor with the contents of the input tensor, "
            "with input dimensions up to axis flattened to the outer dimension "
            "of the output and remaining input dimensions flattened into the inner "
            "dimension of the output.",
            "T",
            OpSchema::Single,
            true,
            1,
            OpSchema::Differentiable)
        .TypeConstraint("T", OpSchema::all_tensor_types_ir4(), "Constrain input and output to all tensor types.")
        .Attr(
            "axis",
            "Indicate up to which input dimensions "
            "(exclusive) should be flattened to the outer dimension of the output. "
            "The value for axis must be in the range [-r, r], where r is the rank of the input tensor. "
            "Negative value means counting dimensions from the back. "
            "When axis = 0, the shape of the output tensor is (1, (d_0 X d_1 ... d_n), "
            "where the shape of the input tensor is (d_0, d_1, ... d_n). ",
            AttributeProto::INT,
            static_cast<int64_t>(1))
        .TypeAndShapeInferenceFunction([](InferenceContext& ctx) {
          propagateElemTypeFromInputToOutput(ctx, 0, 0);
          if (!hasInputShape(ctx, 0))
            return;
          auto& input_shape = getInputShape(ctx, 0);
          int rank = static_cast<int>(input_shape.dim_size());
          // Normalize a negative axis into [0, rank]; axis == rank is allowed
          // (everything is flattened into the outer dimension).
          int axis = static_cast<int>(getAttribute(ctx, "axis", 1));
          if (axis < 0) {
            axis += rank;
          }
          if (axis > rank || axis < 0) {
            fail_shape_inference("Invalid value(", axis, ") for attribute 'axis'");
          }
          // TODO: is the operation defined for input-rank < 2?
          // Output is 2-D: (d_0*...*d_{axis-1}, d_axis*...*d_{rank-1}).
          updateOutputShape(ctx, 0, {multiplyDims(input_shape, 0, axis), multiplyDims(input_shape, axis, rank)});
        }));
|
|
|
|
// Registers Flatten for opset 11 (older type list; no differentiability tags).
ONNX_OPERATOR_SET_SCHEMA(
    Flatten,
    11,
    OpSchema()
        .SetDoc(Flatten_ver11_doc)
        .Input(0, "input", "A tensor of rank >= axis.", "T")
        .Output(
            0,
            "output",
            "A 2D tensor with the contents of the input tensor, "
            "with input dimensions up to axis flattened to the outer dimension "
            "of the output and remaining input dimensions flattened into the inner "
            "dimension of the output.",
            "T")
        .TypeConstraint("T", OpSchema::all_tensor_types(), "Constrain input and output to all tensor types.")
        .Attr(
            "axis",
            "Indicate up to which input dimensions "
            "(exclusive) should be flattened to the outer dimension of the output. "
            "The value for axis must be in the range [-r, r], where r is the rank of the input tensor. "
            "Negative value means counting dimensions from the back. "
            "When axis = 0, the shape of the output tensor is (1, (d_0 X d_1 ... d_n), "
            "where the shape of the input tensor is (d_0, d_1, ... d_n). ",
            AttributeProto::INT,
            static_cast<int64_t>(1))
        .TypeAndShapeInferenceFunction([](InferenceContext& ctx) {
          propagateElemTypeFromInputToOutput(ctx, 0, 0);
          if (!hasInputShape(ctx, 0))
            return;
          auto& input_shape = getInputShape(ctx, 0);
          int rank = static_cast<int>(input_shape.dim_size());
          // Normalize a negative axis into [0, rank]; axis == rank is allowed.
          int axis = static_cast<int>(getAttribute(ctx, "axis", 1));
          if (axis < 0) {
            axis += rank;
          }
          if (axis > rank || axis < 0) {
            fail_shape_inference("Invalid value(", axis, ") for attribute 'axis'");
          }
          // TODO: is the operation defined for input-rank < 2?
          // Output is 2-D: (d_0*...*d_{axis-1}, d_axis*...*d_{rank-1}).
          updateOutputShape(ctx, 0, {multiplyDims(input_shape, 0, axis), multiplyDims(input_shape, axis, rank)});
        }));
|
|
|
|
static const char* LRN_ver1_doc = R"DOC(
|
|
Local Response Normalization proposed in the [AlexNet paper](https://papers.nips.cc/paper/4824-imagenet-classification-with-deep-convolutional-neural-networks.pdf).
|
|
It normalizes over local input regions.
|
|
The local region is defined across the channels. For an element X[n, c, d1, ..., dk] in a tensor
|
|
of shape (N x C x D1 x D2, ..., Dk), its region is
|
|
{X[n, i, d1, ..., dk] | max(0, c - floor((size - 1) / 2)) <= i <= min(C - 1, c + ceil((size - 1) / 2))}.
|
|
|
|
square_sum[n, c, d1, ..., dk] = sum(X[n, i, d1, ..., dk] ^ 2),
|
|
where max(0, c - floor((size - 1) / 2)) <= i <= min(C - 1, c + ceil((size - 1) / 2)).
|
|
|
|
Y[n, c, d1, ..., dk] = X[n, c, d1, ..., dk] / (bias + alpha / size * square_sum[n, c, d1, ..., dk] ) ^ beta
|
|
)DOC";
|
|
|
|
// Registers LRN for opset 1. The output has exactly the input's shape and
// type, so inference just propagates both from input 0.
ONNX_OPERATOR_SET_SCHEMA(
    LRN,
    1,
    OpSchema()
        .Attr("size", "The number of channels to sum over", AttributeProto::INT)
        .Attr("alpha", "Scaling parameter.", AttributeProto::FLOAT, 0.0001f)
        .Attr("beta", "The exponent.", AttributeProto::FLOAT, 0.75f)
        .Attr("bias", "", AttributeProto::FLOAT, 1.0f)
        .Input(
            0,
            "X",
            "Input data tensor from the previous operator; "
            "dimensions for image case are (N x C x H x W), "
            "where N is the batch size, C is the number of "
            "channels, and H and W are the height and the "
            "width of the data. For non image case, the "
            "dimensions are in the form of "
            "(N x C x D1 x D2 ... Dn), where N is the batch "
            "size. Optionally, if dimension denotation is "
            "in effect, the operation expects the input "
            "data tensor to arrive with the dimension denotation "
            "of [DATA_BATCH, DATA_CHANNEL, DATA_FEATURE, DATA_FEATURE ...].",
            "T")
        .Output(0, "Y", "Output tensor, which has the shape and type as input tensor", "T")
        .TypeConstraint(
            "T",
            {"tensor(float16)", "tensor(float)", "tensor(double)"},
            "Constrain input and output "
            " types to float tensors.")
        .SetDoc(LRN_ver1_doc)
        .TypeAndShapeInferenceFunction(propagateShapeAndTypeFromFirstInput));
|
|
|
|
static const char* mvn_ver9_doc = R"DOC(
|
|
A MeanVarianceNormalization Function: Perform mean variance normalization
|
|
on the input tensor X using formula: <br/> ``` (X-EX)/sqrt(E(X-EX)^2) ```
|
|
)DOC";
|
|
|
|
// Default reduction axes for the opset-9 MeanVarianceNormalization below:
// axes {0, 2, 3} reduce over batch and spatial dims, i.e. per-channel stats.
static const std::vector<int64_t> old_mvn_default_axes = {0, 2, 3};
|
|
|
|
// Registers MeanVarianceNormalization for opset 9 as a function op expanded
// into primitive nodes: Y = (X - E[X]) / (sqrt(Var[X]) + epsilon), with the
// statistics reduced over the `axes` attribute.
ONNX_OPERATOR_SET_SCHEMA(
    MeanVarianceNormalization,
    9,
    OpSchema()
        .SetDoc(mvn_ver9_doc)
        .Input(0, "X", "Input tensor", "T")
        .Output(0, "Y", "Output tensor", "T")
        .Attr(
            "axes",
            "A list of integers, along which to reduce. The default is to "
            "calculate along axes [0,2,3] for calculating mean and variance "
            "along each channel. Two variables with the same C-coordinate "
            "are associated with the same mean and variance.",
            AttributeProto::INTS,
            old_mvn_default_axes)
        .TypeConstraint(
            "T",
            {"tensor(float16)", "tensor(float)", "tensor(double)"},
            "Constrain input and output types to all numeric tensors.")
        .FunctionBody(FunctionBodyHelper::BuildNodes(
            {// nodes: {outputs, op, inputs, attributes}
             FunctionBodyHelper::Const<float>("Exponent", 2.0f),
             FunctionBodyHelper::Const<float>("Epsilon", float(1e-9)),
             // E[X] and (E[X])^2
             {{"X_RM"}, "ReduceMean", {"X"}, {MakeRefAttribute("axes", AttributeProto::INTS)}},
             {{"EX_squared"}, "Pow", {"X_RM", "Exponent"}},
             // E[X^2]
             {{"X_squared"}, "Pow", {"X", "Exponent"}},
             {{"E_Xsquared"}, "ReduceMean", {"X_squared"}, {MakeRefAttribute("axes", AttributeProto::INTS)}},
             // Var[X] = E[X^2] - (E[X])^2; STD = sqrt(Var[X])
             {{"Variance"}, "Sub", {"E_Xsquared", "EX_squared"}},
             {{"STD"}, "Sqrt", {"Variance"}},
             // Y = (X - E[X]) / (STD + epsilon); epsilon avoids divide-by-zero.
             {{"X_variance"}, "Sub", {"X", "X_RM"}},
             {{"Processed_STD"}, "Add", {"STD", "Epsilon"}},
             {{"Y"}, "Div", {"X_variance", "Processed_STD"}}})));
|
|
|
|
// Shared attribute-documentation strings reused by the pooling/conv schema
// generators below.
// `pads_doc2`: explicit per-axis [begin..., end...] padding.
const char* pads_doc2 =
    "Padding for the beginning and ending along each spatial axis, it can take any value greater "
    "than or equal to 0. The value represent the number of pixels added to the beginning "
    "and end part of the corresponding axis. `pads` format should be as follow "
    "[x1_begin, x2_begin...x1_end, x2_end,...], where xi_begin the number of pixels "
    "added at the beginning of axis `i` and xi_end, the number of pixels added at "
    "the end of axis `i`. This attribute cannot be used simultaneously with "
    "auto_pad attribute. If not present, the padding defaults to 0 along start and end of each spatial axis.";
// `auto_pad_doc2`: legacy auto_pad wording (used by the <= opset 10 generators).
const char* auto_pad_doc2 =
    "auto_pad must be either NOTSET, SAME_UPPER, SAME_LOWER or VALID. Where "
    "default value is NOTSET, which means explicit padding is used. "
    "SAME_UPPER or SAME_LOWER mean pad the input so that the output spatial size match the input."
    "In case of odd number add the extra padding at the end for SAME_UPPER and at the "
    "beginning for SAME_LOWER. VALID mean no padding.";
// `auto_pad_doc3`: updated auto_pad wording (used by the opset 11 generator).
const char* auto_pad_doc3 =
    "auto_pad must be either NOTSET, SAME_UPPER, SAME_LOWER or VALID. Where "
    "default value is NOTSET, which means explicit padding is used. "
    "SAME_UPPER or SAME_LOWER mean pad the input so that "
    "`output_shape[i] = ceil(input_shape[i] / strides[i])` for each axis `i`. "
    "The padding is split between the two sides equally or almost equally (depending "
    "on whether it is even or odd). In case the padding is an odd number, the extra "
    "padding is added at the end for SAME_UPPER and at the beginning for SAME_LOWER.";
|
|
|
|
// Shape inference shared by convolution- and pooling-style operators (legacy
// version used by the older opsets registered in this file).
//
// Infers output 0's shape from input `input1Idx`'s shape combined with the
// kernel_shape / strides / dilations / pads / auto_pad / ceil_mode attributes.
// Returns early (leaving the output shape untouched) whenever a required shape
// or dimension value is not yet known.
//
// Parameters:
//   ctx                  - inference context holding input/output types and attributes.
//   use_dilation         - honor the "dilations" attribute (MaxPool/Conv only);
//                          otherwise dilation is treated as all-1s.
//   require_kernel_shape - true for pooling ops: kernel_shape must come from the
//                          attribute. When false (Conv-style), the kernel shape is
//                          read from input `input2Idx`'s spatial dimensions.
//   input1Idx            - index of the data input (N x C x D1 ... Dn).
//   input2Idx            - index of the kernel-bearing input; ignored when
//                          require_kernel_shape is true.
void convPoolShapeInference1(
    InferenceContext& ctx,
    bool use_dilation,
    bool require_kernel_shape,
    int input1Idx,
    int input2Idx) {
  // we need the first input shape for this inference.
  if (!hasInputShape(ctx, input1Idx)) {
    return;
  }

  // if kernel shape is an input (and not attribute)
  // we need the shape of the second input.
  if (!require_kernel_shape && !hasInputShape(ctx, input2Idx)) {
    return;
  }

  auto input_shape = ctx.getInputType(input1Idx)->tensor_type().shape();
  if (input_shape.dim_size() < 2) {
    fail_shape_inference("Input tensor must have at least 2 dimensions");
  }

  // first dim is the batch axis and the next is the number of channels.
  size_t n_input_dims = static_cast<size_t>(input_shape.dim_size() - 2);

  // Only MaxPool and Conv support dilation. For simplicity of the code, we
  // just treat the rest of them as having all-1s dilation.
  std::vector<int64_t> dilations;
  if (use_dilation && getRepeatedAttribute(ctx, "dilations", dilations)) {
    if (dilations.size() != n_input_dims) {
      fail_shape_inference("Attribute dilations has incorrect size");
    }
  } else {
    dilations.assign(n_input_dims, 1);
  }

  std::vector<int64_t> strides;
  if (getRepeatedAttribute(ctx, "strides", strides)) {
    if (strides.size() != n_input_dims) {
      fail_shape_inference("Attribute strides has incorrect size");
    }
  } else {
    strides.assign(n_input_dims, 1);
  }

  std::vector<int64_t> kernel_shape;
  if (getRepeatedAttribute(ctx, "kernel_shape", kernel_shape)) {
    if (kernel_shape.size() != n_input_dims) {
      fail_shape_inference("Attribute kernel_shape has incorrect size");
    }
  } else if (require_kernel_shape) {
    fail_shape_inference("Attribute kernel_shape must be specified");
  } else {
    // Conv-style: the kernel shape comes from the weight input's spatial dims;
    // bail out if any of them is still symbolic.
    auto second_input_shape = ctx.getInputType(input2Idx)->tensor_type().shape();
    for (int i = 2; i < second_input_shape.dim_size(); ++i) {
      if (!second_input_shape.dim(i).has_dim_value()) {
        return;
      }
      kernel_shape.push_back(second_input_shape.dim(i).dim_value());
    }
  }

  // Effective kernel extent once dilation is accounted for.
  std::vector<int64_t> effective_kernel_shape = kernel_shape;
  for (int i = 0; i < static_cast<int>(kernel_shape.size()); i++) {
    effective_kernel_shape[i] = (effective_kernel_shape[i] - 1) * dilations[i] + 1;
  }

  std::vector<int64_t> pads;
  if (getRepeatedAttribute(ctx, "pads", pads)) {
    if (pads.size() != n_input_dims * 2) {
      fail_shape_inference("Attribute pads has incorrect size");
    }
  } else {
    // No explicit pads: derive them from auto_pad. VALID (and NOTSET, which
    // computes total_pad but splits zeros below) leaves them at 0.
    pads.assign(n_input_dims * 2, 0);
    const auto* auto_pad_attr = ctx.getAttribute("auto_pad");
    if ((nullptr != auto_pad_attr) && (auto_pad_attr->s() != "VALID")) {
      int input_dims_size = static_cast<int>(n_input_dims);
      for (int i = 0; i < input_dims_size; ++i) {
        int64_t residual = 0;
        int64_t stride = strides[i];
        if (stride > 1) {
          if (!input_shape.dim(2 + i).has_dim_value()) {
            continue;
          }
          // Input extent modulo stride (dim values are non-negative, stride > 1,
          // so % matches the former subtract-until-smaller loop exactly).
          residual = input_shape.dim(2 + i).dim_value() % stride;
        }
        int64_t total_pad = residual == 0 ? effective_kernel_shape[i] - stride : effective_kernel_shape[i] - residual;
        if (total_pad < 0)
          total_pad = 0;
        // SAME_UPPER puts the larger half at the end; SAME_LOWER at the start.
        int64_t half_pad_small = total_pad >> 1;
        int64_t half_pad_big = total_pad - half_pad_small;
        if (auto_pad_attr->s() == "SAME_UPPER") {
          pads[i] = half_pad_small;
          pads[i + input_dims_size] = half_pad_big;
        } else if (auto_pad_attr->s() == "SAME_LOWER") {
          pads[i] = half_pad_big;
          pads[i + input_dims_size] = half_pad_small;
        }
      }
    }
  }

  auto output_shape = ctx.getOutputType(0)->mutable_tensor_type()->mutable_shape();

  if (require_kernel_shape) {
    // Pooling: batch and channel dims pass straight through from the input.
    *output_shape->add_dim() = input_shape.dim(0);
    *output_shape->add_dim() = input_shape.dim(1);
  } else {
    // Conv-style: output channel count comes from the weight input's first dim.
    *output_shape->add_dim() = input_shape.dim(0);
    auto& second_input_shape = getInputShape(ctx, input2Idx);
    if (second_input_shape.dim_size() < 1) {
      fail_shape_inference("Second input tensor has wrong dimension");
    }
    *output_shape->add_dim() = second_input_shape.dim(0);
  }

  int kernel_shape_size = static_cast<int>(kernel_shape.size());
  for (int i = 0; i < kernel_shape_size; ++i) {
    // Always add the dim so the output rank is right; leave it symbolic when
    // the corresponding input dim value is unknown.
    auto newdim = output_shape->add_dim();
    if (!input_shape.dim(2 + i).has_dim_value()) {
      continue;
    }
    // how big is the input, including padding
    int64_t effective_input_size = input_shape.dim(2 + i).dim_value();
    effective_input_size += pads[i];
    effective_input_size += pads[i + kernel_shape_size];

    // default is floor mode .i.e. ceil_mode is set to 0
    auto ceil_mode = getAttribute(ctx, "ceil_mode", 0);

    // how many times we can move the kernel from it's initial position, based
    // on the stride
    int64_t strided_kernel_positions;

    if (ceil_mode == 1)
      // Divide in double (not float) so large extents are not rounded by the
      // 24-bit float mantissa before std::ceil is applied.
      strided_kernel_positions = static_cast<int64_t>(
          std::ceil((effective_input_size - effective_kernel_shape[i]) / static_cast<double>(strides[i])));
    else
      strided_kernel_positions = (effective_input_size - effective_kernel_shape[i]) / strides[i];

    // add in the initial position
    newdim->set_dim_value(1 + strided_kernel_positions);
  }

  if (ctx.getNumOutputs() > 1) {
    // MaxPool with two outputs case: Indices has the same shape as Y.
    auto second_output_shape = ctx.getOutputType(1)->mutable_tensor_type()->mutable_shape();
    second_output_shape->CopyFrom(*output_shape);
  }
}
|
|
|
|
// Builds the shared schema (doc string, attributes, inputs/outputs, type
// constraint, and shape inference) for the opset 1/7-era pooling operators.
// `additionalDescription` lets each op append its own padding semantics.
// No dilation and no ceil_mode at this opset.
std::function<void(OpSchema&)>
PoolOpSchemaGenerator_9(const char* name, const char* opName, const char* additionalDescription) {
  return [=](OpSchema& schema) {
    std::string doc;
    POPULATE_OP_DOC_STR(doc = R"DOC(
{name} consumes an input tensor X and applies {opName} pooling across
the tensor according to kernel sizes, stride sizes, and pad lengths.
{opName} pooling consisting of computing the {opName} on all values of a
subset of the input tensor according to the kernel size and downsampling the
data into the output tensor Y for further processing. The output spatial shape will be following:
```
output_spatial_shape[i] = floor((input_spatial_shape[i] + pad_shape[i] - kernel_spatial_shape[i]) / strides_spatial_shape[i] + 1)

* pad_shape[i] is sum of pads along axis i
```

`auto_pad` is a DEPRECATED attribute. If you are using them currently, the output spatial shape will be following:
```
VALID: output_spatial_shape[i] = ceil((input_spatial_shape[i] - kernel_spatial_shape[i] + 1) / strides_spatial_shape[i])
SAME_UPPER or SAME_LOWER: output_spatial_shape[i] = ceil(input_spatial_shape[i] / strides_spatial_shape[i])
```
And pad shape will be following if `SAME_UPPER` or `SAME_LOWER`:
```
pad_shape[i] = (output_spatial_shape[i] - 1) * strides_spatial_shape[i] + kernel_spatial_shape[i] - input_spatial_shape[i]
```
{additionalDescription}
)DOC";
                        ReplaceAll(doc, "{name}", name);
                        ReplaceAll(doc, "{opName}", opName);
                        ReplaceAll(doc, "{additionalDescription}", additionalDescription););
    schema.SetDoc(doc);
    schema.Attr("kernel_shape", "The size of the kernel along each axis.", AttributeProto::INTS);
    schema.Attr("strides", "Stride along each spatial axis.", AttributeProto::INTS, OPTIONAL_VALUE);
    schema.Attr("auto_pad", auto_pad_doc2, AttributeProto::STRING, std::string("NOTSET"));
    schema.Attr("pads", pads_doc2, AttributeProto::INTS, OPTIONAL_VALUE);
    schema.Input(
        0,
        "X",
        "Input data tensor from the previous operator; "
        "dimensions for image case are (N x C x H x W), "
        "where N is the batch size, C is the number of "
        "channels, and H and W are the height and the "
        "width of the data. For non image case, the "
        "dimensions are in the form of "
        "(N x C x D1 x D2 ... Dn), where N is the batch "
        "size. Optionally, if dimension denotation is "
        "in effect, the operation expects the input "
        "data tensor to arrive with the dimension denotation "
        "of [DATA_BATCH, DATA_CHANNEL, DATA_FEATURE, DATA_FEATURE ...].",
        "T");
    schema.Output(
        0,
        "Y",
        "Output data tensor from average or max pooling across "
        "the input tensor. Dimensions will vary based "
        "on various kernel, stride, and pad sizes. Floor value of "
        "the dimension is used",
        "T");
    schema.TypeConstraint(
        "T",
        {"tensor(float16)", "tensor(float)", "tensor(double)"},
        "Constrain input and output types to float tensors.");
    schema.TypeAndShapeInferenceFunction([](InferenceContext& ctx) {
      propagateElemTypeFromInputToOutput(ctx, 0, 0);
      if (ctx.getNumOutputs() > 1) {
        // MaxPool with two outputs case.
        auto output_type = ctx.getOutputType(1);
        if (output_type->value_case() == TypeProto::kTensorType ||
            output_type->value_case() == TypeProto::VALUE_NOT_SET) {
          output_type->mutable_tensor_type()->set_elem_type(TensorProto::INT64);
        }
      }
      // No dilation support; kernel_shape attribute is required.
      convPoolShapeInference1(ctx, false, true, 0, 1);
    });
  };
}
|
|
|
|
// Builds the shared schema for the opset 10/11-era pooling operators.
// Adds ceil_mode (and optionally dilation in the doc text) on top of the
// opset-9 generator. `opsetNum` only tweaks the wording of the strides doc.
std::function<void(OpSchema&)> PoolOpSchemaGenerator_10(
    const char* name,
    const char* opName,
    const char* additionalDescription,
    bool use_dilation,
    int opsetNum) {
  return [=](OpSchema& schema) {
    std::string doc;
    POPULATE_OP_DOC_STR(
        doc = R"DOC(
{name} consumes an input tensor X and applies {opName} pooling across
the tensor according to kernel sizes, stride sizes, and pad lengths.
{opName} pooling consisting of computing the {opName} on all values of a
subset of the input tensor according to the kernel size and downsampling the
data into the output tensor Y for further processing. The output spatial shape will be following:
```
output_spatial_shape[i] = floor((input_spatial_shape[i] + pad_shape[i] - {kernelSpatialShape}) / strides_spatial_shape[i] + 1)
```
or
```
output_spatial_shape[i] = ceil((input_spatial_shape[i] + pad_shape[i] - {kernelSpatialShape}) / strides_spatial_shape[i] + 1)
```
if ceil_mode is enabled

```
* pad_shape[i] is sum of pads along axis i
```

`auto_pad` is a DEPRECATED attribute. If you are using them currently, the output spatial shape will be following:
```
VALID: output_spatial_shape[i] = ceil((input_spatial_shape[i] - {kernelSpatialShape} + 1) / strides_spatial_shape[i])
SAME_UPPER or SAME_LOWER: output_spatial_shape[i] = ceil(input_spatial_shape[i] / strides_spatial_shape[i])
```
And pad shape will be following if `SAME_UPPER` or `SAME_LOWER`:
```
pad_shape[i] = (output_spatial_shape[i] - 1) * strides_spatial_shape[i] + {kernelSpatialShape} - input_spatial_shape[i]
```
{additionalDescription}
)DOC";
        ReplaceAll(doc, "{name}", name);
        ReplaceAll(doc, "{opName}", opName);
        ReplaceAll(doc, "{additionalDescription}", additionalDescription);
        ReplaceAll(
            doc,
            "{kernelSpatialShape}",
            use_dilation ? "((kernel_spatial_shape[i] - 1) * dilations[i] + 1)" : "kernel_spatial_shape[i]"););
    schema.SetDoc(doc);
    schema.Attr("kernel_shape", "The size of the kernel along each axis.", AttributeProto::INTS);
    schema.Attr(
        "strides",
        opsetNum == 11
            ? "Stride along each spatial axis. If not present, the stride defaults to 1 along each spatial axis."
            : "Stride along each spatial axis.",
        AttributeProto::INTS,
        OPTIONAL_VALUE);
    schema.Attr("auto_pad", auto_pad_doc2, AttributeProto::STRING, std::string("NOTSET"));
    schema.Attr("pads", pads_doc2, AttributeProto::INTS, OPTIONAL_VALUE);
    schema.Attr(
        "ceil_mode",
        "Whether to use ceil or floor (default) to compute the output shape.",
        AttributeProto::INT,
        static_cast<int64_t>(0));
    schema.Input(
        0,
        "X",
        "Input data tensor from the previous operator; "
        "dimensions for image case are (N x C x H x W), "
        "where N is the batch size, C is the number of "
        "channels, and H and W are the height and the "
        "width of the data. For non image case, the "
        "dimensions are in the form of "
        "(N x C x D1 x D2 ... Dn), where N is the batch "
        "size. Optionally, if dimension denotation is "
        "in effect, the operation expects the input "
        "data tensor to arrive with the dimension denotation "
        "of [DATA_BATCH, DATA_CHANNEL, DATA_FEATURE, DATA_FEATURE ...].",
        "T");
    schema.Output(
        0,
        "Y",
        "Output data tensor from average or max pooling across "
        "the input tensor. Dimensions will vary based "
        "on various kernel, stride, and pad sizes. Floor value of "
        "the dimension is used",
        "T");
    schema.TypeConstraint(
        "T",
        {"tensor(float16)", "tensor(float)", "tensor(double)"},
        "Constrain input and output types to float tensors.");
    schema.TypeAndShapeInferenceFunction([use_dilation](InferenceContext& ctx) {
      propagateElemTypeFromInputToOutput(ctx, 0, 0);
      if (ctx.getNumOutputs() > 1) {
        // MaxPool with two outputs case.
        auto output_type = ctx.getOutputType(1);
        if (output_type->value_case() == TypeProto::kTensorType ||
            output_type->value_case() == TypeProto::VALUE_NOT_SET) {
          output_type->mutable_tensor_type()->set_elem_type(TensorProto::INT64);
        }
      }
      convPoolShapeInference1(ctx, use_dilation, true, 0, 1);
    });
  };
}
|
|
|
|
// Returns the tensor type strings allowed by the pooling schemas: the float
// types always, plus int8/uint8 when the op supports 8-bit data.
std::vector<std::string> GetSupportedDataTypesForPoolingOps_1(bool supports8bit) {
  std::vector<std::string> allowed_types = {"tensor(float16)", "tensor(float)", "tensor(double)"};
  if (supports8bit) {
    allowed_types.push_back("tensor(int8)");
    allowed_types.push_back("tensor(uint8)");
  }
  return allowed_types;
}
|
|
|
|
// Builds the shared schema for the opset 11+ pooling operators: updated
// auto_pad wording (auto_pad_doc3), differentiability tags on X/Y, and an
// optionally widened type list when the op supports 8-bit tensors.
std::function<void(OpSchema&)> PoolOpSchemaGenerator_11(
    const char* name,
    const char* opName,
    const char* additionalDescription,
    bool use_dilation,
    bool supports8bit = false) {
  return [=](OpSchema& schema) {
    std::string doc;
    POPULATE_OP_DOC_STR(
        doc = R"DOC(
{name} consumes an input tensor X and applies {opName} pooling across
the tensor according to kernel sizes, stride sizes, and pad lengths.
{opName} pooling consisting of computing the {opName} on all values of a
subset of the input tensor according to the kernel size and downsampling the
data into the output tensor Y for further processing. The output spatial shape will be following:
```
output_spatial_shape[i] = floor((input_spatial_shape[i] + pad_shape[i] - {kernelSpatialShape}) / strides_spatial_shape[i] + 1)
```
or
```
output_spatial_shape[i] = ceil((input_spatial_shape[i] + pad_shape[i] - {kernelSpatialShape}) / strides_spatial_shape[i] + 1)
```
if ceil_mode is enabled

```
* pad_shape[i] is sum of pads along axis i
```

`auto_pad` is a DEPRECATED attribute. If you are using them currently, the output spatial shape will be following when ceil_mode is enabled:
```
VALID: output_spatial_shape[i] = ceil((input_spatial_shape[i] - {kernelSpatialShape} + 1) / strides_spatial_shape[i])
SAME_UPPER or SAME_LOWER: output_spatial_shape[i] = ceil(input_spatial_shape[i] / strides_spatial_shape[i])
```
or when ceil_mode is disabled:
```
VALID: output_spatial_shape[i] = floor((input_spatial_shape[i] - {kernelSpatialShape} + 1) / strides_spatial_shape[i])
SAME_UPPER or SAME_LOWER: output_spatial_shape[i] = floor(input_spatial_shape[i] / strides_spatial_shape[i])
```

And pad shape will be following if `SAME_UPPER` or `SAME_LOWER`:
```
pad_shape[i] = (output_spatial_shape[i] - 1) * strides_spatial_shape[i] + {kernelSpatialShape} - input_spatial_shape[i]
```
{additionalDescription}
)DOC";
        ReplaceAll(doc, "{name}", name);
        ReplaceAll(doc, "{opName}", opName);
        ReplaceAll(doc, "{additionalDescription}", additionalDescription);
        ReplaceAll(
            doc,
            "{kernelSpatialShape}",
            use_dilation ? "((kernel_spatial_shape[i] - 1) * dilations[i] + 1)" : "kernel_spatial_shape[i]"););
    schema.SetDoc(doc);
    schema.Attr("kernel_shape", "The size of the kernel along each axis.", AttributeProto::INTS);
    schema.Attr(
        "strides",
        "Stride along each spatial axis. If not present, the stride defaults to 1 along each spatial axis.",
        AttributeProto::INTS,
        OPTIONAL_VALUE);
    schema.Attr("auto_pad", auto_pad_doc3, AttributeProto::STRING, std::string("NOTSET"));
    schema.Attr("pads", pads_doc2, AttributeProto::INTS, OPTIONAL_VALUE);
    schema.Attr(
        "ceil_mode",
        "Whether to use ceil or floor (default) to compute the output shape.",
        AttributeProto::INT,
        static_cast<int64_t>(0));
    schema.Input(
        0,
        "X",
        "Input data tensor from the previous operator; "
        "dimensions for image case are (N x C x H x W), "
        "where N is the batch size, C is the number of "
        "channels, and H and W are the height and the "
        "width of the data. For non image case, the "
        "dimensions are in the form of "
        "(N x C x D1 x D2 ... Dn), where N is the batch "
        "size. Optionally, if dimension denotation is "
        "in effect, the operation expects the input "
        "data tensor to arrive with the dimension denotation "
        "of [DATA_BATCH, DATA_CHANNEL, DATA_FEATURE, DATA_FEATURE ...].",
        "T",
        OpSchema::Single,
        true,
        1,
        OpSchema::Differentiable);
    schema.Output(
        0,
        "Y",
        "Output data tensor from average or max pooling across "
        "the input tensor. Dimensions will vary based "
        "on various kernel, stride, and pad sizes. Floor value of "
        "the dimension is used",
        "T",
        OpSchema::Single,
        true,
        1,
        OpSchema::Differentiable);
    schema.TypeConstraint(
        "T",
        GetSupportedDataTypesForPoolingOps_1(supports8bit),
        supports8bit ? "Constrain input and output types to float and 8 bit tensors."
                     : "Constrain input and output types to float tensors.");
    schema.TypeAndShapeInferenceFunction([use_dilation](InferenceContext& ctx) {
      propagateElemTypeFromInputToOutput(ctx, 0, 0);
      if (ctx.getNumOutputs() > 1) {
        // MaxPool with two outputs case.
        auto output_type = ctx.getOutputType(1);
        if (output_type->value_case() == TypeProto::kTensorType ||
            output_type->value_case() == TypeProto::VALUE_NOT_SET) {
          output_type->mutable_tensor_type()->set_elem_type(TensorProto::INT64);
        }
      }
      convPoolShapeInference1(ctx, use_dilation, true, 0, 1);
    });
  };
}
|
|
|
|
// ---------------------------------------------------------------------------
// Legacy opset registrations for AveragePool and MaxPool. Each macro call
// registers one (operator, since-version) schema; the shared pieces (inputs,
// outputs, kernel/stride/pad attributes, shape inference) come from the
// PoolOpSchemaGenerator_* helpers defined earlier in this file. These are
// frozen historical definitions and must not change behavior.
// ---------------------------------------------------------------------------

// AveragePool-1: original definition; padded elements are always excluded
// from the averaging divisor.
ONNX_OPERATOR_SET_SCHEMA(
    AveragePool,
    1,
    OpSchema().FillUsing(PoolOpSchemaGenerator_9(
        "AveragePool",
        "average",
        "The output of each pooling window is divided by the number of elements exclude pad.")));

// AveragePool-7: adds 'count_include_pad' so padded pixels can optionally be
// counted in the divisor; default 0 keeps the opset-1 behavior.
ONNX_OPERATOR_SET_SCHEMA(
    AveragePool,
    7,
    OpSchema()
        .FillUsing(PoolOpSchemaGenerator_9(
            "AveragePool",
            "average",
            "The output of each pooling window is divided by the number of elements (exclude pad when attribute count_include_pad is zero)."))
        .Attr(
            "count_include_pad",
            "Whether include pad pixels when calculating values for the edges. Default is 0, doesn't count include pad.",
            AttributeProto::INT,
            static_cast<int64_t>(0)));

// AveragePool-10: switches to the opset-10 generator (introduces 'ceil_mode');
// keeps 'count_include_pad' from opset 7.
ONNX_OPERATOR_SET_SCHEMA(
    AveragePool,
    10,
    OpSchema()
        .FillUsing(PoolOpSchemaGenerator_10(
            "AveragePool",
            "average",
            "The output of each pooling window is divided by the number of elements (exclude pad when attribute count_include_pad is zero).",
            false,
            10))
        .Attr(
            "count_include_pad",
            "Whether include pad pixels when calculating values for the edges. Default is 0, doesn't count include pad.",
            AttributeProto::INT,
            static_cast<int64_t>(0)));

// AveragePool-11: built from the opset-11 generator; keeps 'count_include_pad'.
ONNX_OPERATOR_SET_SCHEMA(
    AveragePool,
    11,
    OpSchema()
        .FillUsing(PoolOpSchemaGenerator_11(
            "AveragePool",
            "average",
            "The output of each pooling window is divided by the number of elements (exclude pad when attribute count_include_pad is zero).",
            true,
            false))
        .Attr(
            "count_include_pad",
            "Whether include pad pixels when calculating values for the edges. Default is 0, doesn't count include pad.",
            AttributeProto::INT,
            static_cast<int64_t>(0)));

// MaxPool-1: original single-output definition.
ONNX_OPERATOR_SET_SCHEMA(
    MaxPool,
    1,
    OpSchema().FillUsing(PoolOpSchemaGenerator_9(
        "MaxPool",
        "max",
        "The output of each pooling window is maximum number of elements exclude pad.")));

// MaxPool-8: adds the optional second output 'Indices' (int64 positions of the
// selected maxima, flattened and padding-agnostic) and the 'storage_order'
// attribute controlling row- vs column-major index computation.
ONNX_OPERATOR_SET_SCHEMA(
    MaxPool,
    8,
    OpSchema()
        .FillUsing(PoolOpSchemaGenerator_9(
            "MaxPool",
            "max",
            "The output of each pooling window is maximum number of elements exclude pad."))
        .Attr(
            "storage_order",
            "The storage order of the tensor. 0 is row major, and 1 is column major.",
            AttributeProto::INT,
            static_cast<int64_t>(0))
        .Output(
            1,
            "Indices",
            "Indices tensor from max pooling across the input tensor. "
            "The dimensions of indices are the same as output tensor. "
            "The values in indices of are the indices of the selected values during pooling. "
            "The indices are computed as flatten 1-D tensor, "
            "and the indices do not consider padding. "
            "So the values in indices are in [0, N x C x D1 x ... x Dn).",
            "I",
            OpSchema::Optional)
        .TypeConstraint("I", {"tensor(int64)"}, "Constrain index tensor to int64"));

// MaxPool-10: switches to the opset-10 generator and adds 'dilations'.
ONNX_OPERATOR_SET_SCHEMA(
    MaxPool,
    10,
    OpSchema()
        .FillUsing(PoolOpSchemaGenerator_10(
            "MaxPool",
            "max",
            "The output of each pooling window is maximum number of elements exclude pad.",
            true,
            10))
        .Attr(
            "storage_order",
            "The storage order of the tensor. 0 is row major, and 1 is column major.",
            AttributeProto::INT,
            static_cast<int64_t>(0))
        .Attr("dilations", "Dilation value along each spatial axis of filter.", AttributeProto::INTS, OPTIONAL_VALUE)
        .Output(
            1,
            "Indices",
            "Indices tensor from max pooling across the input tensor. "
            "The dimensions of indices are the same as output tensor. "
            "The values in indices of are the indices of the selected values during pooling. "
            "The indices are computed as flatten 1-D tensor, "
            "and the indices do not consider padding. "
            "So the values in indices are in [0, N x C x D1 x ... x Dn).",
            "I",
            OpSchema::Optional)
        .TypeConstraint("I", {"tensor(int64)"}, "Constrain index tensor to int64"));

// MaxPool-11: same structure as opset 10 (generator invoked with version 11);
// only the 'dilations' documentation gains the default-value wording.
ONNX_OPERATOR_SET_SCHEMA(
    MaxPool,
    11,
    OpSchema()
        .FillUsing(PoolOpSchemaGenerator_10(
            "MaxPool",
            "max",
            "The output of each pooling window is maximum number of elements exclude pad.",
            true,
            11))
        .Attr(
            "storage_order",
            "The storage order of the tensor. 0 is row major, and 1 is column major.",
            AttributeProto::INT,
            static_cast<int64_t>(0))
        .Attr(
            "dilations",
            "Dilation value along each spatial axis of filter. If not present, the dilation defaults to 1 along each spatial axis.",
            AttributeProto::INTS,
            OPTIONAL_VALUE)
        .Output(
            1,
            "Indices",
            "Indices tensor from max pooling across the input tensor. "
            "The dimensions of indices are the same as output tensor. "
            "The values in indices of are the indices of the selected values during pooling. "
            "The indices are computed as flatten 1-D tensor, "
            "and the indices do not consider padding. "
            "So the values in indices are in [0, N x C x D1 x ... x Dn).",
            "I",
            OpSchema::Optional)
        .TypeConstraint("I", {"tensor(int64)"}, "Constrain index tensor to int64"));
|
|
|
|
// Shape inference for MaxUnpool (opset 9).
//
// The output shape is derived from input X's shape plus the kernel_shape,
// strides and pads attributes — inverting the pooling arithmetic — unless the
// optional third input ('output_shape') is present, in which case only the
// element type is propagated and the concrete shape is left to runtime.
void maxUnpoolShapeInference1(InferenceContext& ctx) {
  // we need at least two inputs to have a shape for this inference.
  if (ctx.getNumInputs() != 2 && ctx.getNumInputs() != 3) {
    fail_type_inference("MaxUnpool op must have either two or three inputs.");
  }
  propagateElemTypeFromInputToOutput(ctx, 0, 0);
  if (!hasInputShape(ctx, 0)) {
    return; // If first input does not have shape, we cannot infer much.
  }
  auto input_shape = ctx.getInputType(0)->tensor_type().shape();
  if (input_shape.dim_size() < 2) {
    fail_shape_inference("Input tensor X must have at least 2 dimensions.");
  }

  // first dim is the batch axis and the next is the number of channels.
  size_t n_input_dims = static_cast<size_t>(input_shape.dim_size() - 2);

  // 'pads' must carry a begin and an end value per spatial axis; defaults to 0.
  std::vector<int64_t> pads;
  if (getRepeatedAttribute(ctx, "pads", pads)) {
    if (pads.size() != n_input_dims * 2) {
      fail_shape_inference("Attribute pads has incorrect size.");
    }
  } else {
    pads.assign(n_input_dims * 2, 0);
  }

  // 'strides' defaults to 1 along each spatial axis.
  std::vector<int64_t> strides;
  if (getRepeatedAttribute(ctx, "strides", strides)) {
    if (strides.size() != n_input_dims) {
      fail_shape_inference("Attribute strides has incorrect size.");
    }
  } else {
    strides.assign(n_input_dims, 1);
  }

  // 'kernel_shape' is mandatory and must cover every spatial axis.
  std::vector<int64_t> kernel_shape;
  if (getRepeatedAttribute(ctx, "kernel_shape", kernel_shape)) {
    if (kernel_shape.size() != n_input_dims) {
      fail_shape_inference("Attribute kernel_shape has incorrect size.");
    }
  } else {
    fail_shape_inference("Attribute kernel_shape must be specified.");
  }

  if (ctx.getNumInputs() == 3) {
    // If the third input, output_size, is specified, then use that instead
    // of inferring shape from inputs.
    if (hasInputShape(ctx, 2)) {
      auto& output_shape = getInputShape(ctx, 2);
      if (output_shape.dim_size() != 1) {
        fail_type_inference("'output_shape' must be rank 1 tensor.");
      }
      // The 1-D 'output_shape' tensor must list one value per dimension of X.
      if (output_shape.dim(static_cast<int>(0)).has_dim_value() &&
          static_cast<int>(output_shape.dim(static_cast<int>(0)).dim_value()) != input_shape.dim_size()) {
        fail_shape_inference("'output_shape' must have same number of elements as the shape of input tensor X.");
      }
    }
    return; // 'output_shape' is specified as input. Actual shape will be
            // determined at runtime.
  }

  auto final_output_shape = ctx.getOutputType(0)->mutable_tensor_type()->mutable_shape();

  *final_output_shape->add_dim() = input_shape.dim(0);
  *final_output_shape->add_dim() =
      ctx.getInputType(1)->tensor_type().shape().dim(1); // channels should be the second dim of second input.

  // Invert the pooling arithmetic per spatial axis:
  //   unpooled = stride * (dim - 1) + kernel - pad_begin - pad_end.
  int kernel_shape_size = static_cast<int>(kernel_shape.size());
  for (int i = 0; i < kernel_shape_size; ++i) {
    auto newdim = final_output_shape->add_dim();
    if (!input_shape.dim(2 + i).has_dim_value()) {
      continue; // unknown input dim stays symbolic in the output.
    }

    int64_t newdim_value = strides[i] * (input_shape.dim(2 + i).dim_value() - 1);
    newdim_value += kernel_shape[i];
    newdim_value -= pads[i];
    newdim_value -= pads[i + kernel_shape_size];

    // add in the initial position
    newdim->set_dim_value(newdim_value);
  }
}
|
|
|
|
// Published operator documentation for MaxUnpool-9; the text is frozen spec
// surface and is emitted verbatim into the generated docs.
static const char* MaxUnpool_ver9_doc = R"DOC(
MaxUnpool essentially computes the partial inverse of the MaxPool op.
The input information to this op is typically the output information from a MaxPool op. The first
input tensor X is the tensor that needs to be unpooled, which is typically the pooled tensor (first output)
from MaxPool. The second input tensor, I, contains the indices to the (locally maximal) elements corresponding
to the elements in the first input tensor X. Input tensor I is typically the second output of the MaxPool op.
The third (optional) input is a tensor that specifies the output size of the unpooling operation.

MaxUnpool is intended to do 'partial' inverse of the MaxPool op. 'Partial' because all the non-maximal
values from the original input to MaxPool are set to zero in the output of the MaxUnpool op. Pooling
the result of an unpooling operation should give back the original input to the unpooling op.

MaxUnpool can produce the same output size for several input sizes, which makes unpooling op ambiguous.
The third input argument, output_size, is meant to disambiguate the op and produce output tensor of
known/predictable size.

In addition to the inputs, MaxUnpool takes three attributes, namely kernel_shape, strides, and pads,
which define the exact unpooling op. The attributes typically have the same values as the corresponding
pooling op that the unpooling op is trying to invert.
)DOC";

// MaxUnpool-9 registration: data tensors are float ("T1"), indices and the
// optional runtime 'output_shape' are int64 ("T2"); shape inference is
// delegated to maxUnpoolShapeInference1 above.
ONNX_OPERATOR_SET_SCHEMA(
    MaxUnpool,
    9,
    OpSchema()
        .SetDoc(MaxUnpool_ver9_doc)
        .Attr("kernel_shape", "The size of the kernel along each axis.", AttributeProto::INTS)
        .Attr("strides", "Stride along each spatial axis.", AttributeProto::INTS, OPTIONAL_VALUE)
        .Attr("pads", pads_doc2, AttributeProto::INTS, OPTIONAL_VALUE)
        .Input(
            0,
            "X",
            "Input data tensor that has to be unpooled. "
            "This tensor is typically the first output of the MaxPool op."
            "Dimensions for image case are (N x C x H x W), "
            "where N is the batch size, C is the number of "
            "channels, and H and W are the height and the "
            "width of the data. For non-image case, the "
            "dimensions are in the form of "
            "(N x C x D1 x D2 ... Dn), where N is the batch "
            "size. Optionally, if dimension denotation is "
            "in effect, the operation expects the input "
            "data tensor to arrive with the dimension denotation "
            "of [DATA_BATCH, DATA_CHANNEL, DATA_FEATURE, DATA_FEATURE ...].",
            "T1")
        .Input(
            1,
            "I",
            "Input data tensor containing the indices corresponding to "
            "elements in the first input tensor X."
            "This tensor is typically the second output of the MaxPool op."
            "Dimensions must be the same as input tensor X. "
            "The indices are linear, i.e. computed considering the tensor as flattened 1-D tensor, "
            "assuming row-major storage. Also, the linear indices should not consider padding. "
            "So the values in indices are in the range [0, N x C x D1 x ... x Dn).",
            "T2")
        .Input(
            2,
            "output_shape",
            "The shape of the output can be explicitly set which will cause pads values to be auto generated. If 'output_shape' is specified, "
            "'pads' values are ignored.",
            "T2",
            OpSchema::Optional)
        .Output(0, "output", "Output data tensor that contains the result of the unpooling.", "T1")
        .TypeConstraint(
            "T1",
            {"tensor(float16)", "tensor(float)", "tensor(double)"},
            "Constrain input and output types to float tensors.")
        .TypeConstraint("T2", {"tensor(int64)"}, "Constrain index tensor to int64")
        .TypeAndShapeInferenceFunction([](InferenceContext& ctx) { maxUnpoolShapeInference1(ctx); }));
|
|
|
|
// Shared attribute documentation strings reused by the legacy (opset-1)
// pooling/conv schemas below. The text is published spec surface.
const char* pads_doc1 =
    "Padding for the beginning and ending along each axis, it can take any value greater "
    "than or equal to 0. The value represent the number of pixels added to the beginning "
    "and end part of the corresponding axis. `pads` format should be as follow "
    "[x1_begin, x2_begin...x1_end, x2_end,...], where xi_begin the number of pixels "
    "added at the beginning of axis `i` and xi_end, the number of pixels added at "
    "the end of axis `i`. This attribute cannot be used simultaneously with "
    "auto_pad attribute.";
// Legacy description of the (deprecated) 'auto_pad' attribute.
const char* auto_pad_doc1 =
    "auto_pad must be either NOTSET, SAME_UPPER, SAME_LOWER or VALID. Where "
    "default value is NOTSET, which means explicit padding is used. "
    "SAME_UPPER or SAME_LOWER mean pad the input so that the output size match the input."
    "In case of odd number add the extra padding at the end for SAME_UPPER and at the "
    "beginning for SAME_LOWER. VALID mean no padding. DEPRECATION NOTE: auto_pad is "
    "only intended to support legacy uses, and for framework authors, one is explicitly "
    "encouraged to use explicit padding specified in the pads attribute.";
|
|
|
|
// Published operator documentation for LpPool-1 (frozen spec text).
static const char* LpPool_ver1_doc = R"DOC(
LpPool consumes an input tensor X and applies Lp pooling across the
the tensor according to kernel sizes, stride sizes, and pad lengths.
Lp pooling consisting of computing the Lp norm on all values of a subset
of the input tensor according to the kernel size and downsampling the
data into the output tensor Y for further processing.)DOC";

// LpPool-1 registration. Note: 'p' is a FLOAT here (later opsets switch it to
// INT), and this version registers no shape inference function.
ONNX_OPERATOR_SET_SCHEMA(
    LpPool,
    1,
    OpSchema()
        .SetDoc(LpPool_ver1_doc)
        .Attr("kernel_shape", "The size of the kernel along each axis.", AttributeProto::INTS, OPTIONAL_VALUE)
        .Attr("strides", "Stride along each axis.", AttributeProto::INTS, OPTIONAL_VALUE)
        .Attr("auto_pad", auto_pad_doc1, AttributeProto::STRING, std::string("NOTSET"))
        .Attr("pads", pads_doc1, AttributeProto::INTS, OPTIONAL_VALUE)
        .Attr(
            "p",
            "p value of the Lp norm used to pool over the input data, default is 2.0.",
            AttributeProto::FLOAT,
            2.0f)
        .Input(
            0,
            "X",
            "Input data tensor from the previous operator; "
            "dimensions for image case are (N x C x H x W), "
            "where N is the batch size, C is the number of "
            "channels, and H and W are the height and the "
            "width of the data. For non image case, the "
            "dimension are in the form of "
            "(N x C x D1 x D2 ... Dn), where N is the "
            "batch size.",
            "T")
        .Output(
            0,
            "Y",
            "Output data tensor from Lp pooling across the input "
            "tensor. Dimensions will vary based on various kernel, stride, and pad "
            "sizes.",
            "T")
        .TypeConstraint(
            "T",
            {"tensor(float16)", "tensor(float)", "tensor(double)"},
            "Constrain input and output types to float tensors."));
|
|
|
|
// Builds the LpPool schema as of opset 2.
//
// @param name  Operator name substituted into the doc template ({name}).
// @return      A functor applied to an OpSchema via FillUsing().
//
// Differences from opset 1: 'p' becomes an INT attribute (default 2) and a
// type-and-shape inference function is registered.
std::function<void(OpSchema&)> LpPoolOpSchemaGenerator_10(const char* name) {
  return [=](OpSchema& schema) {
    std::string doc;
    POPULATE_OP_DOC_STR(doc = R"DOC(
{name} consumes an input tensor X and applies Lp pooling across
the tensor according to kernel sizes, stride sizes, and pad lengths.
Lp pooling consisting of computing the Lp norm on all values of a subset
of the input tensor according to the kernel size and downsampling the
data into the output tensor Y for further processing.)DOC";
                        ReplaceAll(doc, "{name}", name););
    schema.SetDoc(doc);
    schema.Attr("kernel_shape", "The size of the kernel along each axis.", AttributeProto::INTS);
    schema.Attr("strides", "Stride along each spatial axis.", AttributeProto::INTS, OPTIONAL_VALUE);
    schema.Attr("auto_pad", auto_pad_doc2, AttributeProto::STRING, std::string("NOTSET"));
    schema.Attr("pads", pads_doc2, AttributeProto::INTS, OPTIONAL_VALUE);
    // From opset 2 on, 'p' is an integer (opset 1 used a float attribute).
    schema.Attr(
        "p", "p value of the Lp norm used to pool over the input data.", AttributeProto::INT, static_cast<int64_t>(2));
    schema.Input(
        0,
        "X",
        "Input data tensor from the previous operator; "
        "dimensions for image case are (N x C x H x W), "
        "where N is the batch size, C is the number of "
        "channels, and H and W are the height and the "
        "width of the data. For non image case, the "
        "dimensions are in the form of "
        "(N x C x D1 x D2 ... Dn), where N is the "
        "batch size.",
        "T");
    schema.Output(
        0,
        "Y",
        "Output data tensor from Lp pooling across the input "
        "tensor. Dimensions will vary based on various kernel, stride, and pad "
        "sizes.",
        "T");
    schema.TypeConstraint(
        "T",
        {"tensor(float16)", "tensor(float)", "tensor(double)"},
        "Constrain input and output types to float tensors.");
    // Pooling-style shape inference (no dilations, require kernel_shape).
    schema.TypeAndShapeInferenceFunction([](InferenceContext& ctx) {
      propagateElemTypeFromInputToOutput(ctx, 0, 0);
      convPoolShapeInference1(ctx, false, true, 0, 1);
    });
  };
}
|
|
|
|
// LpPool-2: schema comes entirely from the shared generator above.
ONNX_OPERATOR_SET_SCHEMA(LpPool, 2, OpSchema().FillUsing(LpPoolOpSchemaGenerator_10("LpPool")));
|
|
|
|
static const char* GlobalLpPool_ver1_doc = R"DOC(
|
|
GlobalLpPool consumes an input tensor X and applies lp pool pooling across the
|
|
the values in the same channel. This is equivalent to LpPool with kernel size
|
|
equal to the spatial dimension of input tensor.)DOC";
|
|
|
|
// Builds the LpPool schema as of opset 11.
//
// @param name  Operator name substituted into the doc template ({name}).
// @return      A functor applied to an OpSchema via FillUsing().
//
// Relative to the opset-2 generator: updated strides/auto_pad documentation
// (auto_pad_doc3) and inputs/outputs marked as Differentiable.
std::function<void(OpSchema&)> LpPoolOpSchemaGenerator_11(const char* name) {
  return [=](OpSchema& schema) {
    std::string doc;
    POPULATE_OP_DOC_STR(doc = R"DOC(
{name} consumes an input tensor X and applies Lp pooling across
the tensor according to kernel sizes, stride sizes, and pad lengths.
Lp pooling consisting of computing the Lp norm on all values of a subset
of the input tensor according to the kernel size and downsampling the
data into the output tensor Y for further processing.)DOC";
                        ReplaceAll(doc, "{name}", name););
    schema.SetDoc(doc);
    schema.Attr("kernel_shape", "The size of the kernel along each axis.", AttributeProto::INTS);
    schema.Attr(
        "strides",
        "Stride along each spatial axis. If not present, the stride defaults to 1 along each spatial axis.",
        AttributeProto::INTS,
        OPTIONAL_VALUE);
    schema.Attr("auto_pad", auto_pad_doc3, AttributeProto::STRING, std::string("NOTSET"));
    schema.Attr("pads", pads_doc2, AttributeProto::INTS, OPTIONAL_VALUE);
    schema.Attr(
        "p", "p value of the Lp norm used to pool over the input data.", AttributeProto::INT, static_cast<int64_t>(2));
    schema.Input(
        0,
        "X",
        "Input data tensor from the previous operator; "
        "dimensions for image case are (N x C x H x W), "
        "where N is the batch size, C is the number of "
        "channels, and H and W are the height and the "
        "width of the data. For non image case, the "
        "dimensions are in the form of "
        "(N x C x D1 x D2 ... Dn), where N is the "
        "batch size.",
        "T",
        OpSchema::Single,
        true,
        1,
        OpSchema::Differentiable);
    schema.Output(
        0,
        "Y",
        "Output data tensor from Lp pooling across the input "
        "tensor. Dimensions will vary based on various kernel, stride, and pad "
        "sizes.",
        "T",
        OpSchema::Single,
        true,
        1,
        OpSchema::Differentiable);
    schema.TypeConstraint(
        "T",
        {"tensor(float16)", "tensor(float)", "tensor(double)"},
        "Constrain input and output types to float tensors.");
    // Pooling-style shape inference (no dilations, require kernel_shape).
    schema.TypeAndShapeInferenceFunction([](InferenceContext& ctx) {
      propagateElemTypeFromInputToOutput(ctx, 0, 0);
      convPoolShapeInference1(ctx, false, true, 0, 1);
    });
  };
}
|
|
|
|
// LpPool-11: schema comes entirely from the opset-11 generator above.
ONNX_OPERATOR_SET_SCHEMA(LpPool, 11, OpSchema().FillUsing(LpPoolOpSchemaGenerator_11("LpPool")));
|
|
|
|
// Builds the legacy Conv-1 schema.
//
// @param filter_desc  Human-readable description of the filter input,
//                     substituted into the doc template ({filter_desc}).
// @return             A functor applied to an OpSchema via FillUsing().
std::function<void(OpSchema&)> ConvOpSchemaGenerator_10(const char* filter_desc) {
  return [=](OpSchema& schema) {
    std::string doc;
    POPULATE_OP_DOC_STR(doc = R"DOC(
The convolution operator consumes an input tensor and {filter_desc}, and
computes the output.)DOC";
                        ReplaceAll(doc, "{filter_desc}", filter_desc););
    schema.SetDoc(doc);
    schema.Input(
        0,
        "X",
        "Input data tensor from previous layer; "
        "has size (N x C x H x W), where N is the batch size, "
        "C is the number of channels, and H and W are the "
        "height and width. Note that this is for the 2D image. "
        "Otherwise the size is (N x C x D1 x D2 ... x Dn). "
        "Optionally, if dimension denotation is "
        "in effect, the operation expects input data tensor "
        "to arrive with the dimension denotation of [DATA_BATCH, "
        "DATA_CHANNEL, DATA_FEATURE, DATA_FEATURE ...].",
        "T");
    schema.Input(
        1,
        "W",
        "The weight tensor that will be used in the "
        "convolutions; has size (M x C/group x kH x kW), where C "
        "is the number of channels, and kH and kW are the "
        "height and width of the kernel, and M is the number "
        "of feature maps. For more than 2 dimensions, the "
        "kernel shape will be (M x C/group x k1 x k2 x ... x kn), "
        "where (k1 x k2 x ... kn) is the dimension of the kernel. "
        "Optionally, if dimension denotation is in effect, "
        "the operation expects the weight tensor to arrive "
        "with the dimension denotation of [FILTER_OUT_CHANNEL, "
        "FILTER_IN_CHANNEL, FILTER_SPATIAL, FILTER_SPATIAL ...]. "
        "X.shape[1] == (W.shape[1] * group) == C "
        "(assuming zero based indices for the shape array). "
        "Or in other words FILTER_IN_CHANNEL should be equal to DATA_CHANNEL. ",
        "T");
    schema.Input(2, "B", "Optional 1D bias to be added to the convolution, has size of M.", "T", OpSchema::Optional);
    schema.Output(
        0,
        "Y",
        "Output data tensor that contains the result of the "
        "convolution. The output dimensions are functions "
        "of the kernel size, stride size, and pad lengths.",
        "T");
    schema.TypeConstraint(
        "T",
        {"tensor(float16)", "tensor(float)", "tensor(double)"},
        "Constrain input and output types to float tensors.");
    schema.Attr(
        "kernel_shape",
        "The shape of the convolution kernel. If not present, should be inferred from input W.",
        AttributeProto::INTS,
        OPTIONAL_VALUE);
    schema.Attr(
        "dilations", "dilation value along each spatial axis of the filter.", AttributeProto::INTS, OPTIONAL_VALUE);
    schema.Attr("strides", "Stride along each spatial axis.", AttributeProto::INTS, OPTIONAL_VALUE);
    schema.Attr("auto_pad", auto_pad_doc2, AttributeProto::STRING, std::string("NOTSET"));
    schema.Attr("pads", pads_doc2, AttributeProto::INTS, OPTIONAL_VALUE);
    schema.Attr(
        "group",
        "number of groups input channels and output channels are divided into.",
        AttributeProto::INT,
        static_cast<int64_t>(1));
    // Conv-style shape inference (dilations honored, kernel_shape inferable from W).
    schema.TypeAndShapeInferenceFunction([](InferenceContext& ctx) {
      propagateElemTypeFromInputToOutput(ctx, 0, 0);
      convPoolShapeInference1(ctx, true, false, 0, 1);
    });
  };
}
|
|
|
|
// Conv-1: schema comes entirely from the generator above.
ONNX_OPERATOR_SET_SCHEMA(Conv, 1, OpSchema().FillUsing(ConvOpSchemaGenerator_10("a filter")));
|
|
|
|
// Shape inference for the legacy ConvTranspose-1 schema.
//
// Output shape is either taken from the 'output_shape' attribute (validated
// to be no smaller than the input's spatial dims) or computed per axis as
//   stride * (in - 1) + output_padding + effective_kernel - pad_begin - pad_end,
// where effective_kernel accounts for dilation. Attribute-size mismatches
// silently abort inference (return) rather than fail, except for 'pads'.
void convTransposeShapeInference1(InferenceContext& ctx) {
  propagateElemTypeFromInputToOutput(ctx, 0, 0);

  // we need at least two inputs to have a shape for this inference.
  if (!hasNInputShapes(ctx, 2)) {
    return;
  }

  int64_t group = getAttribute(ctx, "group", 1);

  auto input_shape = ctx.getInputType(0)->tensor_type().shape();
  if (input_shape.dim_size() < 2) {
    return; // Input tensor should have at least two dimensions.
  }

  // first dim is the batch axis and the next is the number of channels.
  size_t n_input_dims = static_cast<size_t>(input_shape.dim_size() - 2);

  // 'dilations' defaults to 1 along each spatial axis.
  std::vector<int64_t> dilations;
  if (getRepeatedAttribute(ctx, "dilations", dilations)) {
    if (dilations.size() != n_input_dims) {
      return;
    }
  } else {
    dilations.assign(n_input_dims, 1);
  }

  // 'strides' defaults to 1 along each spatial axis.
  std::vector<int64_t> strides;
  if (getRepeatedAttribute(ctx, "strides", strides)) {
    if (strides.size() != n_input_dims) {
      return;
    }
  } else {
    strides.assign(n_input_dims, 1);
  }

  // 'kernel_shape' falls back to the spatial dims of the weight input W;
  // bail out if any of those dims is symbolic.
  std::vector<int64_t> kernel_shape;
  if (getRepeatedAttribute(ctx, "kernel_shape", kernel_shape)) {
    if (kernel_shape.size() != n_input_dims) {
      return;
    }
  } else {
    auto second_input_shape = ctx.getInputType(1)->tensor_type().shape();
    for (int i = 2; i < second_input_shape.dim_size(); ++i) {
      if (!second_input_shape.dim(i).has_dim_value()) {
        return;
      }
      kernel_shape.push_back(second_input_shape.dim(i).dim_value());
    }
  }

  std::vector<int64_t> effective_kernel_shape = kernel_shape;
  for (int i = 0; i < static_cast<int>(kernel_shape.size()); i++) {
    // accounting for dilation, how big is the kernel in this dimension
    effective_kernel_shape[i] = (effective_kernel_shape[i] - 1) * dilations[i] + 1;
  }

  // Explicit 'pads' wins; otherwise derive SAME_UPPER/SAME_LOWER padding from
  // the effective kernel and stride (VALID / absent auto_pad keeps zeros).
  std::vector<int64_t> pads;
  if (getRepeatedAttribute(ctx, "pads", pads)) {
    if (pads.size() != n_input_dims * 2) {
      fail_shape_inference("Attribute pads has incorrect size");
    }
  } else {
    pads.assign(n_input_dims * 2, 0);
    const auto* auto_pad_attr = ctx.getAttribute("auto_pad");
    if ((nullptr != auto_pad_attr) && (auto_pad_attr->s() != "VALID")) {
      int input_dims_size = static_cast<int>(n_input_dims);
      for (int i = 0; i < input_dims_size; ++i) {
        int64_t total_pad = effective_kernel_shape[i] - strides[i];
        if (total_pad < 0)
          total_pad = 0;
        int64_t half_pad_small = total_pad >> 1;
        int64_t half_pad_big = total_pad - half_pad_small;
        if (auto_pad_attr->s() == "SAME_UPPER") {
          pads[i] = half_pad_small;
          pads[i + input_dims_size] = half_pad_big;
        } else if (auto_pad_attr->s() == "SAME_LOWER") {
          pads[i] = half_pad_big;
          pads[i + input_dims_size] = half_pad_small;
        }
      }
    }
  }

  std::vector<int64_t> output_shape;
  bool output_shape_presented = true;
  if (getRepeatedAttribute(ctx, "output_shape", output_shape)) {
    if (output_shape.size() != n_input_dims) {
      return;
    }
  } else {
    output_shape_presented = false;
  }

  std::vector<int64_t> output_padding;
  if (getRepeatedAttribute(ctx, "output_padding", output_padding)) {
    if (output_padding.size() != n_input_dims) { // Added only to one side.
      return;
    }
  } else {
    output_padding.assign(n_input_dims, 0);
  }

  auto final_output_shape = ctx.getOutputType(0)->mutable_tensor_type()->mutable_shape();

  *final_output_shape->add_dim() = input_shape.dim(0);
  *final_output_shape->add_dim() =
      ctx.getInputType(1)->tensor_type().shape().dim(1) * group; // channels should be the second dim of second input
                                                                 // multiply group.

  int size_of_output;
  if (output_shape_presented) {
    // Use the explicitly requested output shape, validating it against any
    // known input spatial dims.
    size_of_output = static_cast<int>(output_shape.size());
    for (int i = 0; i < size_of_output; ++i) {
      if (input_shape.dim(i + 2).has_dim_value()) {
        if (output_shape[i] < input_shape.dim(i + 2).dim_value()) {
          // TODO: throw exception?
          return; // output shape value cannot be smaller than the input shape
                  // value
        }
      }
      final_output_shape->add_dim()->set_dim_value(output_shape[i]);
    }
    return;
  } else {
    // Compute each spatial output dim from the transpose-convolution formula;
    // symbolic input dims produce symbolic output dims.
    size_of_output = input_shape.dim_size() - 2;
    for (int i = 0; i < size_of_output; ++i) {
      if (input_shape.dim(i + 2).has_dim_value()) {
        int64_t output_shape_dim = strides[i] * (input_shape.dim(i + 2).dim_value() - 1) + output_padding[i] +
            effective_kernel_shape[i] - pads[i] - pads[i + n_input_dims];
        final_output_shape->add_dim()->set_dim_value(output_shape_dim);
      } else {
        final_output_shape->add_dim();
      }
    }
    return;
  }
}
|
|
|
|
// Builds the legacy ConvTranspose-1 schema.
//
// @param filter_desc  Human-readable description of the filter input,
//                     substituted into the doc template ({filter_desc}).
// @return             A functor applied to an OpSchema via FillUsing().
std::function<void(OpSchema&)> ConvTransposeOpSchemaGenerator_10(const char* filter_desc) {
  return [=](OpSchema& schema) {
    std::string doc;
    POPULATE_OP_DOC_STR(doc = R"DOC(
The convolution transpose operator consumes an input tensor and {filter_desc},
and computes the output.

If the pads parameter is provided the shape of the output is calculated via the following equation:

output_shape[i] = stride[i] * (input_size[i] - 1) + output_padding[i] + ((kernel_shape[i] - 1) * dilations[i] + 1) - pads[start_i] - pads[end_i]

output_shape can also be explicitly specified in which case pads values are auto generated using these equations:

total_padding[i] = stride[i] * (input_size[i] - 1) + output_padding[i] + ((kernel_shape[i] - 1) * dilations[i] + 1) - output_shape[i]
If (auto_pads != SAME_UPPER): pads[start_i] = total_padding[i]/2; pads[end_i] = total_padding[i] - (total_padding[i]/2)
Else: pads[start_i] = total_padding[i] - (total_padding[i]/2); pads[end_i] = (total_padding[i]/2).

)DOC";
                        ReplaceAll(doc, "{filter_desc}", filter_desc););
    schema.SetDoc(doc);
    schema.Input(
        0,
        "X",
        "Input data tensor from previous layer; has size (N x C x H x W)"
        ", where N is the batch size, C is the number of channels, and"
        " H and W are the height and width. Note that this is for the 2D image. "
        "Otherwise the size is (N x C x D1 x D2 ... x Dn)",
        "T");
    schema.Input(
        1,
        "W",
        "The weight tensor that will be used in the "
        "convolutions; has size (C x M/group x kH x kW), where C "
        "is the number of channels, and kH and kW are the "
        "height and width of the kernel, and M is the number "
        "of feature maps. For more than 2 dimensions, the "
        "weight shape will be (C x M/group x k1 x k2 x ... x kn), "
        "where (k1 x k2 x ... x kn) is the dimension of the kernel. "
        "The number of channels in the output should be equal to W.shape[1] * group "
        "(assuming zero based indices of the shape array)",
        "T");
    schema.Input(2, "B", "Optional 1D bias to be added to the convolution, has size of M.", "T", OpSchema::Optional);
    schema.Output(
        0,
        "Y",
        "Output data tensor that contains the result of the convolution. The "
        "output dimensions are functions of the kernel size, stride size, "
        "pad lengths and group count. "
        "The number of channels in the output should be equal to W.shape[1] * group "
        "(assuming zero based indices of the shape array)",
        "T");
    schema.TypeConstraint(
        "T",
        {"tensor(float16)", "tensor(float)", "tensor(double)"},
        "Constrain input and output types to float tensors.");
    schema.Attr(
        "kernel_shape",
        "The shape of the convolution kernel. If not present, should be inferred from input W.",
        AttributeProto::INTS,
        OPTIONAL_VALUE);
    schema.Attr(
        "output_shape",
        "The shape of the output can be explicitly set which will cause pads values to be auto generated. If output_shape is specified "
        "pads values are ignored. See doc for details for equations to generate pads",
        AttributeProto::INTS,
        OPTIONAL_VALUE);
    schema.Attr(
        "output_padding",
        "The zero-padding added to one side of the output."
        " This is also called adjs/adjustment in some frameworks.",
        AttributeProto::INTS,
        OPTIONAL_VALUE);
    schema.Attr(
        "dilations", "dilation value along each spatial axis of the filter.", AttributeProto::INTS, OPTIONAL_VALUE);
    schema.Attr("strides", "Stride along each spatial axis.", AttributeProto::INTS, OPTIONAL_VALUE);
    schema.Attr("auto_pad", auto_pad_doc2, AttributeProto::STRING, std::string("NOTSET"));
    schema.Attr("pads", pads_doc2, AttributeProto::INTS, OPTIONAL_VALUE);
    schema.Attr(
        "group",
        "number of groups input channels and output channels are divided into.",
        AttributeProto::INT,
        static_cast<int64_t>(1));
    // Shape inference is delegated to convTransposeShapeInference1 above.
    schema.TypeAndShapeInferenceFunction([](InferenceContext& ctx) { convTransposeShapeInference1(ctx); });
  };
}
|
|
|
|
// ConvTranspose-1: schema comes entirely from the generator above.
ONNX_OPERATOR_SET_SCHEMA(ConvTranspose, 1, OpSchema().FillUsing(ConvTransposeOpSchemaGenerator_10("a filter")));
|
|
|
|
// GlobalLpPool-1: pools over all spatial elements per channel; output shape is
// N x C x 1 x ... x 1. Note: 'p' is a FLOAT in this opset version, and no
// shape inference function is registered.
ONNX_OPERATOR_SET_SCHEMA(
    GlobalLpPool,
    1,
    OpSchema()
        .SetDoc(GlobalLpPool_ver1_doc)
        .Attr(
            "p",
            "p value of the Lp norm used to pool over the input data, default is 2.0.",
            AttributeProto::FLOAT,
            2.0f)
        .Input(
            0,
            "X",
            "Input data tensor from the previous operator; "
            "dimensions for image case are (N x C x H x W), "
            "where N is the batch size, C is the number of "
            "channels, and H and W are the height and the width "
            "of the data. For non image case, the dimension are "
            "in the form of (N x C x D1 x D2 ... Dn), "
            "where N is the batch size.",
            "T")
        .Output(
            0,
            "Y",
            "Output data tensor from pooling across the input "
            "tensor. Dimensions will be N x C x 1 x 1",
            "T")
        .TypeConstraint(
            "T",
            {"tensor(float16)", "tensor(float)", "tensor(double)"},
            "Constrain input and output types to float tensors."));
|
|
|
|
// Operator documentation for BatchNormalization opset 1; emitted verbatim into
// the generated spec docs, so the text must not be edited.
static const char* BatchNormalization_ver1_doc = R"DOC(
Carries out batch normalization as described in the paper
https://arxiv.org/abs/1502.03167. Depending on the mode it is being run,
there are multiple cases for the number of outputs, which we list below:

Output case #1: Y, mean, var, saved_mean, saved_var (training mode)
Output case #2: Y (test mode)
)DOC";

// Opset 1 definition of BatchNormalization. Distinguishing features of this
// legacy version: the "spatial" and "is_test" attributes exist, the
// "consumed_inputs" attribute is *required* (no OPTIONAL_VALUE — it simulates
// the removed AllowConsumed API), and no type/shape inference function is
// registered.
ONNX_OPERATOR_SET_SCHEMA(
    BatchNormalization,
    1,
    OpSchema()
        .NumOutputs({1, 5})
        .SetDoc(BatchNormalization_ver1_doc)
        .Attr(
            "spatial",
            "If true, compute the mean and variance across all spatial elements "
            "If false, compute the mean and variance across per feature."
            "Default is 1.",
            AttributeProto::INT,
            static_cast<int64_t>(1))
        .Attr(
            "is_test",
            "If set to nonzero, run spatial batch normalization in test mode, default is 0.",
            AttributeProto::INT,
            static_cast<int64_t>(0))
        .Attr(
            "epsilon",
            "The epsilon value to use to avoid division by zero, default is 1e-5f.",
            AttributeProto::FLOAT,
            1e-5f)
        .Attr(
            "momentum",
            "Factor used in computing the running mean and variance."
            "e.g., running_mean = running_mean * momentum + mean * (1 - momentum), default is 0.9f.",
            AttributeProto::FLOAT,
            0.9f)
        // This attribute was added via AllowConsumed API in OpSchema.
        // After removing the API, we're now using the Attr API to simulate the
        // old definition.
        .Attr("consumed_inputs", "legacy optimization attribute.", AttributeProto::INTS)
        .Input(0, "X", "The input 4-dimensional tensor of shape NCHW.", "T")
        .Input(
            1,
            "scale",
            "The scale as a 1-dimensional tensor of size C to be applied to the "
            "output.",
            "T")
        .Input(
            2,
            "B",
            "The bias as a 1-dimensional tensor of size C to be applied to the "
            "output.",
            "T")
        .Input(
            3,
            "mean",
            "The running mean (training) or the estimated mean (testing) "
            "as a 1-dimensional tensor of size C.",
            "T")
        .Input(
            4,
            "var",
            "The running variance (training) or the estimated "
            "variance (testing) as a 1-dimensional tensor of size C.",
            "T")
        .Output(0, "Y", "The output 4-dimensional tensor of the same shape as X.", "T")
        .Output(
            1,
            "mean",
            "The running mean after the BatchNormalization operator. Must be in-place "
            "with the input mean. Should not be used for testing.",
            "T",
            OpSchema::Optional)
        .Output(
            2,
            "var",
            "The running variance after the BatchNormalization operator. Must be "
            "in-place with the input var. Should not be used for testing.",
            "T",
            OpSchema::Optional)
        .Output(
            3,
            "saved_mean",
            "Saved mean used during training to speed up gradient "
            "computation. Should not be used for testing.",
            "T",
            OpSchema::Optional)
        .Output(
            4,
            "saved_var",
            "Saved variance used during training to speed up "
            "gradient computation. Should not be used for testing.",
            "T",
            OpSchema::Optional)
        .TypeConstraint(
            "T",
            {"tensor(float16)", "tensor(float)", "tensor(double)"},
            "Constrain input and output types to float tensors."));
|
|
|
|
// Operator documentation for BatchNormalization opset 9.
// NOTE(review): "depreciated" in the text below should read "deprecated";
// the string is part of the published operator documentation, so it is left
// byte-for-byte as released.
static const char* BatchNormalization_ver9_doc = R"DOC(
Carries out batch normalization as described in the paper
https://arxiv.org/abs/1502.03167. Depending on the mode it is being run,
there are multiple cases for the number of outputs, which we list below:

Output case #1: Y, mean, var, saved_mean, saved_var (training mode)
Output case #2: Y (test mode)

For previous (depreciated) non-spatial cases, implementors are suggested
to flatten the input shape to (N x C*D1*D2 ..*Dn) before a BatchNormalization Op.
)DOC";

// Opset 9 definition of BatchNormalization. Relative to opset 7: the
// "spatial" attribute is gone, and each input/output now carries explicit
// occurrence / differentiability metadata. Shape inference only propagates
// input 0's type and shape to output Y.
ONNX_OPERATOR_SET_SCHEMA(
    BatchNormalization,
    9,
    OpSchema()
        .NumOutputs({1, 5})
        .SetDoc(BatchNormalization_ver9_doc + GenerateOptionalArgumentsDoc())
        .Attr("epsilon", "The epsilon value to use to avoid division by zero.", AttributeProto::FLOAT, 1e-5f)
        .Attr(
            "momentum",
            "Factor used in computing the running mean and variance."
            "e.g., running_mean = running_mean * momentum + mean * (1 - momentum).",
            AttributeProto::FLOAT,
            0.9f)
        .Input(
            0,
            "X",
            "Input data tensor from the previous operator; "
            "dimensions are in the form of (N x C x D1 x D2 ... Dn), "
            "where N is the batch size, C is the number of channels. "
            "Statistics are computed for every channel of C over N and D1 to Dn dimensions. "
            "For image data, input dimensions become (N x C x H x W). "
            "The op also accepts single dimension input of size N in which case C is assumed to be 1",
            "T",
            OpSchema::Single,
            true,
            1,
            OpSchema::Differentiable)
        .Input(1, "scale", "Scale tensor of shape (C).", "T", OpSchema::Single, true, 1, OpSchema::Differentiable)
        .Input(2, "B", "Bias tensor of shape (C).", "T", OpSchema::Single, true, 1, OpSchema::Differentiable)
        .Input(
            3,
            "mean",
            "running (training) or estimated (testing) mean tensor of shape (C).",
            "T",
            OpSchema::Single,
            true,
            1,
            OpSchema::Differentiable)
        .Input(
            4,
            "var",
            "running (training) or estimated (testing) variance tensor of shape (C).",
            "T",
            OpSchema::Single,
            true,
            1,
            OpSchema::Differentiable)
        .Output(
            0,
            "Y",
            "The output tensor of the same shape as X",
            "T",
            OpSchema::Single,
            true,
            1,
            OpSchema::Differentiable)
        .Output(
            1,
            "mean",
            "The running mean after the BatchNormalization operator.",
            "T",
            OpSchema::Optional,
            true,
            1,
            OpSchema::NonDifferentiable)
        .Output(
            2,
            "var",
            "The running variance after the BatchNormalization operator.",
            "T",
            OpSchema::Optional,
            true,
            1,
            OpSchema::NonDifferentiable)
        .Output(
            3,
            "saved_mean",
            "Saved mean used during training to speed up gradient "
            "computation.",
            "T",
            OpSchema::Optional,
            true,
            1,
            OpSchema::NonDifferentiable)
        .Output(
            4,
            "saved_var",
            "Saved variance used during training to speed up "
            "gradient computation.",
            "T",
            OpSchema::Optional,
            true,
            1,
            OpSchema::NonDifferentiable)
        .TypeConstraint(
            "T",
            {"tensor(float16)", "tensor(float)", "tensor(double)"},
            "Constrain input and output types to float tensors.")
        .TypeAndShapeInferenceFunction([](InferenceContext& ctx) {
          propagateShapeAndTypeFromFirstInput(ctx);
          // TODO in training mode, it may be possible to infer some of
          // the other outputs as well.
        }));
|
|
|
|
// Operator documentation for BatchNormalization opset 14.
// NOTE(review): "depreciated" in the text below should read "deprecated";
// left as-is because the string is part of the published operator docs.
static const char* BatchNormalization_ver14_doc = R"DOC(
Carries out batch normalization as described in the paper
https://arxiv.org/abs/1502.03167. Depending on the mode it is being run,
There are five required inputs 'X', 'scale', 'B', 'input_mean' and
'input_var'.
Note that 'input_mean' and 'input_var' are expected to be the estimated
statistics in inference mode (training_mode=False, default),
and the running statistics in training mode (training_mode=True).
There are multiple cases for the number of outputs, which we list below:

Output case #1: Y, running_mean, running_var (training_mode=True)
Output case #2: Y (training_mode=False)

When training_mode=False, extra outputs are invalid.
The outputs are updated as follows when training_mode=True:
```
running_mean = input_mean * momentum + current_mean * (1 - momentum)
running_var = input_var * momentum + current_var * (1 - momentum)

Y = (X - current_mean) / sqrt(current_var + epsilon) * scale + B

where:

current_mean = ReduceMean(X, axis=all_except_channel_index)
current_var = ReduceVar(X, axis=all_except_channel_index)

Notice that ReduceVar refers to the population variance, and it equals to
sum(sqrd(x_i - x_avg)) / N
where N is the population size (this formula does not use sample size N - 1).

```

When training_mode=False:
```
Y = (X - input_mean) / sqrt(input_var + epsilon) * scale + B
```

For previous (depreciated) non-spatial cases, implementors are suggested
to flatten the input shape to (N x C * D1 * D2 * ... * Dn) before a BatchNormalization Op.
)DOC";

// Opset 14 definition of BatchNormalization. Relative to opset 9: the
// saved_mean/saved_var outputs are removed (NumOutputs is {1, 3}), the
// "training_mode" attribute is introduced, mean/var inputs and running
// outputs use a separate type variable "U", bfloat16 is admitted, and the
// inference function validates channel-dimension consistency and the
// training_mode/output-count contract.
ONNX_OPERATOR_SET_SCHEMA(
    BatchNormalization,
    14,
    OpSchema()
        .NumOutputs({1, 3})
        .SetDoc(BatchNormalization_ver14_doc + GenerateOptionalArgumentsDoc())
        .Attr("epsilon", "The epsilon value to use to avoid division by zero.", AttributeProto::FLOAT, 1e-5f)
        .Attr(
            "momentum",
            "Factor used in computing the running mean and variance."
            "e.g., running_mean = running_mean * momentum + mean * (1 - momentum).",
            AttributeProto::FLOAT,
            0.9f)
        // NOTE(review): this attr doc mentions outputs "1, 2, 3, and 4", but
        // this opset declares at most 3 outputs (NumOutputs({1, 3}) above).
        // The text is part of the published spec and is kept as released.
        .Attr(
            "training_mode",
            "If set to true, it indicates BatchNormalization is being used for training, and outputs 1, "
            "2, 3, and 4 would be populated.",
            AttributeProto::INT,
            static_cast<int64_t>(0))
        .Input(
            0,
            "X",
            "Input data tensor from the previous operator; "
            "dimensions are in the form of (N x C x D1 x D2 ... Dn), "
            "where N is the batch size, C is the number of channels. "
            "Statistics are computed for every channel of C over N and D1 to Dn dimensions. "
            "For image data, input dimensions become (N x C x H x W). "
            "The op also accepts single dimension input of size N in which case C is assumed to be 1",
            "T",
            OpSchema::Single,
            true,
            1,
            OpSchema::Differentiable)
        .Input(1, "scale", "Scale tensor of shape (C).", "T", OpSchema::Single, true, 1, OpSchema::Differentiable)
        .Input(2, "B", "Bias tensor of shape (C).", "T", OpSchema::Single, true, 1, OpSchema::Differentiable)
        .Input(
            3,
            "input_mean",
            "running (training) or estimated (testing) mean tensor of shape (C).",
            "U",
            OpSchema::Single,
            true,
            1,
            OpSchema::Differentiable)
        .Input(
            4,
            "input_var",
            "running (training) or estimated (testing) variance tensor of shape (C).",
            "U",
            OpSchema::Single,
            true,
            1,
            OpSchema::Differentiable)
        .Output(
            0,
            "Y",
            "The output tensor of the same shape as X",
            "T",
            OpSchema::Single,
            true,
            1,
            OpSchema::Differentiable)
        .Output(
            1,
            "running_mean",
            "The running mean after the BatchNormalization operator.",
            "U",
            OpSchema::Optional,
            true,
            1,
            OpSchema::NonDifferentiable)
        .Output(
            2,
            "running_var",
            "The running variance after the BatchNormalization operator. This op uses the population size (N) for "
            "calculating variance, and not the sample size N-1.",
            "U",
            OpSchema::Optional,
            true,
            1,
            OpSchema::NonDifferentiable)
        .TypeConstraint(
            "T",
            {"tensor(float16)", "tensor(float)", "tensor(double)", "tensor(bfloat16)"},
            "Constrain input and output types to float tensors.")
        .TypeConstraint(
            "U",
            {"tensor(float16)", "tensor(float)", "tensor(double)", "tensor(bfloat16)"},
            "Constrain mean and variance types to float tensors. It allows all float type for U.")
        .TypeAndShapeInferenceFunction([](InferenceContext& ctx) {
          propagateShapeAndTypeFromFirstInput(ctx);
          // NOTE(review): the following call repeats the shape propagation
          // already performed by propagateShapeAndTypeFromFirstInput above —
          // presumably harmless; verify before removing.
          propagateShapeFromInputToOutput(ctx, 0, 0);

          // Inputs 1 to 4 must be of rank 1.
          checkInputRank(ctx, 1, 1);
          checkInputRank(ctx, 2, 1);
          checkInputRank(ctx, 3, 1);
          checkInputRank(ctx, 4, 1);

          // Symbolic channel dimension, unified across X's dim 1 and the
          // rank-1 scale/B/input_mean/input_var inputs.
          Dim num_channels;

          if (hasInputShape(ctx, 0)) {
            if (getInputShape(ctx, 0).dim_size() > 1)
              unifyInputDim(ctx, 0, 1, num_channels);
            else
              // 1-D input of size N: per the op doc, C is assumed to be 1.
              unifyDim(num_channels, 1);
          }

          unifyInputDim(ctx, 1, 0, num_channels);
          unifyInputDim(ctx, 2, 0, num_channels);
          unifyInputDim(ctx, 3, 0, num_channels);
          unifyInputDim(ctx, 4, 0, num_channels);

          // Enforce the output-count contract: 3 outputs in training mode,
          // exactly 1 otherwise.
          if (ctx.getAttribute("training_mode") && static_cast<int>(ctx.getAttribute("training_mode")->i()) != 0) {
            if (ctx.getNumOutputs() != 3)
              fail_shape_inference("This number of op outputs should be 3 when Training_mode = True, but it is not.");
          } else {
            if (ctx.getNumOutputs() != 1)
              fail_shape_inference("This number of op outputs should be 1 when Training_mode = False, but it is not.");
          }

          // running_mean / running_var are rank-1 tensors of length C, typed
          // after input_mean / input_var respectively.
          if (ctx.getNumOutputs() > 1) {
            TensorShapeProto outputs_shape;
            *outputs_shape.add_dim() = num_channels; // channel

            propagateElemTypeFromInputToOutput(ctx, 3, 1);
            updateOutputShape(ctx, 1, outputs_shape);

            if (ctx.getNumOutputs() > 2) {
              propagateElemTypeFromInputToOutput(ctx, 4, 2);
              updateOutputShape(ctx, 2, outputs_shape);
            }
          }
        }));
|
|
|
|
// Operator documentation for InstanceNormalization opset 1; published spec
// text, kept byte-for-byte.
static const char* InstanceNormalization_ver1_doc = R"DOC(
Carries out instance normalization as described in the paper
https://arxiv.org/abs/1607.08022.

y = scale * (x - mean) / sqrt(variance + epsilon) + B,
where mean and variance are computed per instance per channel.

)DOC";

// Opset 1 definition of InstanceNormalization. Legacy features: the optional
// "consumed_inputs" attribute (simulating the removed AllowConsumed API) and
// no type/shape inference function.
ONNX_OPERATOR_SET_SCHEMA(
    InstanceNormalization,
    1,
    OpSchema()
        .SetDoc(InstanceNormalization_ver1_doc)
        // This attribute was added via AllowConsumed API in OpSchema.
        // After removing the API, we're now using the Attr API to simulate the
        // old definition.
        .Attr("consumed_inputs", "legacy optimization attribute.", AttributeProto::INTS, OPTIONAL_VALUE)
        .Attr(
            "epsilon",
            "The epsilon value to use to avoid division by zero, default is 1e-5f.",
            AttributeProto::FLOAT,
            1e-5f)
        .Input(0, "input", "The input 4-dimensional tensor of shape NCHW.", "T")
        .Input(1, "scale", "The input 1-dimensional scale tensor of size C.", "T")
        .Input(2, "B", "The input 1-dimensional bias tensor of size C.", "T")
        .Output(0, "output", "The output 4-dimensional tensor of the same shape as input.", "T")
        .TypeConstraint(
            "T",
            {"tensor(float16)", "tensor(float)", "tensor(double)"},
            "Constrain input and output types to float tensors."));
|
|
|
|
// Shared operator documentation for Dropout opsets 1 and 6 (the opset 6
// registration below also uses this string).
static const char* Dropout_old_doc = R"DOC(
Dropout takes one input data (Tensor<float>) and produces two Tensor outputs,
output (Tensor<float>) and mask (Tensor<bool>). Depending on whether it is in
test mode or not, the output Y will either be a random dropout, or a simple
copy of the input. Note that our implementation of Dropout does scaling in
the training phase, so during testing nothing needs to be done.
)DOC";

// Opset 1 definition of Dropout. Legacy features: "consumed_inputs" and
// "is_test" attributes, mask output typed "T" (not bool), and no type/shape
// inference function.
ONNX_OPERATOR_SET_SCHEMA(
    Dropout,
    1,
    OpSchema()
        .SetDoc(Dropout_old_doc)
        .Attr("ratio", "(float, default 0.5) the ratio of random dropout", AttributeProto::FLOAT, 0.5f)
        // This attribute was added via AllowConsumed API in OpSchema.
        // After removing the API, we're now using the Attr API to simulate the
        // old definition.
        .Attr("consumed_inputs", "legacy optimization attribute.", AttributeProto::INTS, OPTIONAL_VALUE)
        .Attr(
            "is_test",
            "(int, default 0) if nonzero, run dropout in test mode where "
            "the output is simply Y = X.",
            AttributeProto::INT,
            static_cast<int64_t>(0))
        .Input(0, "data", "The input data as Tensor.", "T")
        .Output(0, "output", "The output.", "T")
        .Output(
            1,
            "mask",
            "The output mask. If is_test is nonzero, this output is not filled.",
            "T",
            OpSchema::Optional)
        .TypeConstraint(
            "T",
            {"tensor(float16)", "tensor(float)", "tensor(double)"},
            "Constrain input and output types to float tensors."));
|
|
|
|
// Opset 6 definition of Dropout. Relative to opset 1: drops the
// "consumed_inputs" attribute and adds shape/type inference (output 0 mirrors
// input 0); shares Dropout_old_doc with opset 1.
ONNX_OPERATOR_SET_SCHEMA(
    Dropout,
    6,
    OpSchema()
        .SetDoc(Dropout_old_doc)
        .Attr("ratio", "(float, default 0.5) the ratio of random dropout", AttributeProto::FLOAT, 0.5f)
        .Attr(
            "is_test",
            "(int, default 0) if nonzero, run dropout in test mode where "
            "the output is simply Y = X.",
            AttributeProto::INT,
            static_cast<int64_t>(0))
        .Input(0, "data", "The input data as Tensor.", "T")
        .Output(0, "output", "The output.", "T")
        .Output(
            1,
            "mask",
            "The output mask. If is_test is nonzero, this output is not filled.",
            "T",
            OpSchema::Optional)
        .TypeConstraint(
            "T",
            {"tensor(float16)", "tensor(float)", "tensor(double)"},
            "Constrain input and output types to float tensors.")
        .TypeAndShapeInferenceFunction(propagateShapeAndTypeFromFirstInput));
|
|
|
|
// Operator documentation for Dropout opset 7; published spec text.
static const char* Dropout_ver7_doc = R"DOC(
Dropout takes one input data (Tensor<float>) and produces two Tensor outputs,
output (Tensor<float>) and mask (Tensor<bool>). Depending on whether it is in
test mode or not, the output Y will either be a random dropout, or a simple
copy of the input. Note that our implementation of Dropout does scaling in
the training phase, so during testing nothing needs to be done.
)DOC";

// Opset 7 definition of Dropout. Relative to opset 6: the "is_test"
// attribute is removed; the mask output is still typed "T" (opset 10 changes
// it to bool).
ONNX_OPERATOR_SET_SCHEMA(
    Dropout,
    7,
    OpSchema()
        .SetDoc(GET_OP_DOC_STR(std::string(Dropout_ver7_doc) + GenerateOptionalArgumentsDoc()))
        .Attr("ratio", "The ratio of random dropout", AttributeProto::FLOAT, 0.5f)
        .Input(0, "data", "The input data as Tensor.", "T")
        .Output(0, "output", "The output.", "T")
        .Output(1, "mask", "The output mask.", "T", OpSchema::Optional)
        .TypeConstraint(
            "T",
            {"tensor(float16)", "tensor(float)", "tensor(double)"},
            "Constrain input and output types to float tensors.")
        .TypeAndShapeInferenceFunction(propagateShapeAndTypeFromFirstInput));
|
|
|
|
// Operator documentation for Dropout opset 10; published spec text.
static const char* Dropout_ver10_doc = R"DOC(
Dropout takes one input floating tensor and produces two tensor outputs,
output (floating tensor) and mask (`Tensor<bool>`). Depending on whether it is
in test mode or not, the output Y will either be a random dropout, or a simple
copy of the input. Note that our implementation of Dropout does scaling in
the training phase, so during testing nothing needs to be done.
)DOC";

// Opset 10 definition of Dropout. Relative to opset 7: the mask output is
// now constrained to bool via the new "T1" type variable, and inference sets
// the mask's element type to BOOL and mirrors input 0's shape onto it.
ONNX_OPERATOR_SET_SCHEMA(
    Dropout,
    10,
    OpSchema()
        .SetDoc(GET_OP_DOC_STR(std::string(Dropout_ver10_doc) + GenerateOptionalArgumentsDoc()))
        .Attr("ratio", "The ratio of random dropout", AttributeProto::FLOAT, 0.5f)
        .Input(0, "data", "The input data as Tensor.", "T")
        .Output(0, "output", "The output.", "T")
        .Output(1, "mask", "The output mask.", "T1", OpSchema::Optional)
        .TypeConstraint(
            "T",
            {"tensor(float16)", "tensor(float)", "tensor(double)"},
            "Constrain input and output types to float tensors.")
        .TypeConstraint("T1", {"tensor(bool)"}, "Constrain output mask types to boolean tensors.")
        .TypeAndShapeInferenceFunction([](InferenceContext& ctx) {
          propagateShapeAndTypeFromFirstInput(ctx);
          if (ctx.getNumOutputs() == 2) {
            // The optional mask output is always boolean and has X's shape.
            updateOutputElemType(ctx, 1, TensorProto::BOOL);
            if (hasNInputShapes(ctx, 1)) {
              propagateShapeFromInputToOutput(ctx, 0, 1);
            }
          }
        }));
|
|
|
|
// Operator documentation for BatchNormalization opset 6; published spec text.
static const char* BatchNorm_ver6_doc = R"DOC(
Carries out batch normalization as described in the paper
https://arxiv.org/abs/1502.03167. Depending on the mode it is being run,
there are multiple cases for the number of outputs, which we list below:

Output case #1: Y, mean, var, saved_mean, saved_var (training mode)
Output case #2: Y (test mode)
)DOC";

// Opset 6 definition of BatchNormalization. Relative to opset 1: the
// required "consumed_inputs" attribute is removed and type/shape inference
// (output 0 mirrors input 0) is added; "spatial" and "is_test" remain.
ONNX_OPERATOR_SET_SCHEMA(
    BatchNormalization,
    6,
    OpSchema()
        .NumOutputs({1, 5})
        .SetDoc(BatchNorm_ver6_doc)
        .Attr(
            "spatial",
            "If true, compute the mean and variance across all spatial elements "
            "If false, compute the mean and variance across per feature."
            "Default is 1.",
            AttributeProto::INT,
            static_cast<int64_t>(1))
        .Attr(
            "is_test",
            "If set to nonzero, run spatial batch normalization in test mode, default is 0.",
            AttributeProto::INT,
            static_cast<int64_t>(0))
        .Attr(
            "epsilon",
            "The epsilon value to use to avoid division by zero, default is 1e-5f.",
            AttributeProto::FLOAT,
            1e-5f)
        .Attr(
            "momentum",
            "Factor used in computing the running mean and variance."
            "e.g., running_mean = running_mean * momentum + mean * (1 - momentum), default is 0.9f.",
            AttributeProto::FLOAT,
            0.9f)
        .Input(
            0,
            "X",
            "Input data tensor from the previous operator; "
            "dimensions for image case are (N x C x H x W), "
            "where N is the batch size, C is the number of "
            "channels, and H and W are the height and the "
            "width of the data. For non image case, the "
            "dimensions are in the form of "
            "(N x C x D1 x D2 ... Dn), where N is the batch "
            "size.",
            "T")
        .Input(
            1,
            "scale",
            "The scale as a 1-dimensional tensor of size C to be applied to the "
            "output.",
            "T")
        .Input(
            2,
            "B",
            "The bias as a 1-dimensional tensor of size C to be applied to the "
            "output.",
            "T")
        .Input(
            3,
            "mean",
            "The running mean (training) or the estimated mean (testing) "
            "as a 1-dimensional tensor of size C.",
            "T")
        .Input(
            4,
            "var",
            "The running variance (training) or the estimated "
            "variance (testing) as a 1-dimensional tensor of size C.",
            "T")
        .Output(0, "Y", "The output tensor of the same shape as X.", "T")
        .Output(
            1,
            "mean",
            "The running mean after the BatchNormalization operator. Must be in-place "
            "with the input mean. Should not be used for testing.",
            "T",
            OpSchema::Optional)
        .Output(
            2,
            "var",
            "The running variance after the BatchNormalization operator. Must be "
            "in-place with the input var. Should not be used for testing.",
            "T",
            OpSchema::Optional)
        .Output(
            3,
            "saved_mean",
            "Saved mean used during training to speed up gradient "
            "computation. Should not be used for testing.",
            "T",
            OpSchema::Optional)
        .Output(
            4,
            "saved_var",
            "Saved variance used during training to speed up "
            "gradient computation. Should not be used for testing.",
            "T",
            OpSchema::Optional)
        .TypeConstraint(
            "T",
            {"tensor(float16)", "tensor(float)", "tensor(double)"},
            "Constrain input and output types to float tensors.")
        .TypeAndShapeInferenceFunction([](InferenceContext& ctx) {
          propagateShapeAndTypeFromFirstInput(ctx);
          // TODO in training mode, it may be possible to infer some of
          // the other outputs as well.
        }));
|
|
|
|
// Operator documentation for Flatten opset 1; published spec text.
static const char* Flatten_ver1_doc = R"DOC(
Flattens the input tensor into a 2D matrix. If input tensor has shape
(d_0, d_1, ... d_n) then the output will have shape
(d_0 X d_1 ... d_(axis-1), d_axis X d_(axis+1) ... X dn).
)DOC";

// Opset 1 definition of Flatten. In this version "T" is restricted to float
// tensors and "axis" must be in [0, R] (negative axes are not allowed yet —
// the inference below rejects axis < 0). Output shape is the 2-D
// (prod(dims[0:axis]), prod(dims[axis:R])).
ONNX_OPERATOR_SET_SCHEMA(
    Flatten,
    1,
    OpSchema()
        .SetDoc(Flatten_ver1_doc)
        .Input(0, "input", "A tensor of rank >= axis.", "T")
        .Output(
            0,
            "output",
            "A 2D tensor with the contents of the input tensor, "
            "with input dimensions up to axis flattened to the outer dimension "
            "of the output and remaining input dimensions flattened into the inner "
            "dimension of the output.",
            "T")
        .TypeConstraint(
            "T",
            {"tensor(float16)", "tensor(float)", "tensor(double)"},
            "Constrain input and output types to float tensors.")
        .Attr(
            "axis",
            "Indicate up to which input dimensions "
            "(exclusive) should be flattened to the outer dimension of the output. "
            "The value for axis must be in the range [0, R], where R is the rank of the input tensor. "
            "When axis = 0, the shape of the output tensor is (1, (d_0 X d_1 ... d_n), "
            "where the shape of the input tensor is (d_0, d_1, ... d_n). ",
            AttributeProto::INT,
            static_cast<int64_t>(1))
        .TypeAndShapeInferenceFunction([](InferenceContext& ctx) {
          propagateElemTypeFromInputToOutput(ctx, 0, 0);
          if (!hasInputShape(ctx, 0))
            return;
          auto& input_shape = getInputShape(ctx, 0);
          int rank = static_cast<int>(input_shape.dim_size());
          int axis = static_cast<int>(getAttribute(ctx, "axis", 1));
          if (axis > rank || axis < 0) {
            fail_shape_inference("Invalid value(", axis, ") for attribute 'axis'");
          }
          // TODO: is the operation defined for input-rank < 2?
          updateOutputShape(ctx, 0, {multiplyDims(input_shape, 0, axis), multiplyDims(input_shape, axis, rank)});
        }));
|
|
|
|
// Operator documentation for Flatten opset 9; published spec text.
static const char* Flatten_ver9_doc = R"DOC(
Flattens the input tensor into a 2D matrix. If input tensor has shape
(d_0, d_1, ... d_n) then the output will have shape
(d_0 X d_1 ... d_(axis-1), d_axis X d_(axis+1) ... X dn).
)DOC";

// Opset 9 definition of Flatten. The only change from opset 1 is the "T"
// constraint: all tensor types are now accepted. The inference logic is
// identical (axis still restricted to [0, R]).
ONNX_OPERATOR_SET_SCHEMA(
    Flatten,
    9,
    OpSchema()
        .SetDoc(Flatten_ver9_doc)
        .Input(0, "input", "A tensor of rank >= axis.", "T")
        .Output(
            0,
            "output",
            "A 2D tensor with the contents of the input tensor, "
            "with input dimensions up to axis flattened to the outer dimension "
            "of the output and remaining input dimensions flattened into the inner "
            "dimension of the output.",
            "T")
        .TypeConstraint("T", OpSchema::all_tensor_types(), "Constrain input and output to all tensor types.")
        .Attr(
            "axis",
            "Indicate up to which input dimensions "
            "(exclusive) should be flattened to the outer dimension of the output. "
            "The value for axis must be in the range [0, R], where R is the rank of the input tensor. "
            "When axis = 0, the shape of the output tensor is (1, (d_0 X d_1 ... d_n), "
            "where the shape of the input tensor is (d_0, d_1, ... d_n). ",
            AttributeProto::INT,
            static_cast<int64_t>(1))
        .TypeAndShapeInferenceFunction([](InferenceContext& ctx) {
          propagateElemTypeFromInputToOutput(ctx, 0, 0);
          if (!hasInputShape(ctx, 0))
            return;
          auto& input_shape = getInputShape(ctx, 0);
          int rank = static_cast<int>(input_shape.dim_size());
          int axis = static_cast<int>(getAttribute(ctx, "axis", 1));
          if (axis > rank || axis < 0) {
            fail_shape_inference("Invalid value(", axis, ") for attribute 'axis'");
          }
          // TODO: is the operation defined for input-rank < 2?
          updateOutputShape(ctx, 0, {multiplyDims(input_shape, 0, axis), multiplyDims(input_shape, axis, rank)});
        }));
|
|
|
|
// Operator documentation for BatchNormalization opset 7; published spec text.
static const char* BatchNormalization_ver7_doc = R"DOC(
Carries out batch normalization as described in the paper
https://arxiv.org/abs/1502.03167. Depending on the mode it is being run,
there are multiple cases for the number of outputs, which we list below:

Output case #1: Y, mean, var, saved_mean, saved_var (training mode)
Output case #2: Y (test mode)
)DOC";

// Opset 7 definition of BatchNormalization. Relative to opset 6: the
// "is_test" attribute is removed, and scale/B/mean/var shapes now depend on
// the "spatial" attribute — (C) when spatial is true, (C x D1 x ... x Dn)
// when false.
ONNX_OPERATOR_SET_SCHEMA(
    BatchNormalization,
    7,
    OpSchema()
        .SetDoc(GET_OP_DOC_STR(std::string(BatchNormalization_ver7_doc) + GenerateOptionalArgumentsDoc()))
        .NumOutputs({1, 5})
        .Attr(
            "spatial",
            "If true, compute the mean and variance across per activation. "
            "If false, compute the mean and variance across per feature over "
            "each mini-batch.",
            AttributeProto::INT,
            static_cast<int64_t>(1))
        .Attr("epsilon", "The epsilon value to use to avoid division by zero.", AttributeProto::FLOAT, 1e-5f)
        .Attr(
            "momentum",
            "Factor used in computing the running mean and variance."
            "e.g., running_mean = running_mean * momentum + mean * (1 - momentum).",
            AttributeProto::FLOAT,
            0.9f)
        .Input(
            0,
            "X",
            "Input data tensor from the previous operator; "
            "dimensions for image case are (N x C x H x W), "
            "where N is the batch size, C is the number of "
            "channels, and H and W are the height and the "
            "width of the data. For non image case, the "
            "dimensions are in the form of "
            "(N x C x D1 x D2 ... Dn), where N is the batch "
            "size.",
            "T")
        .Input(
            1,
            "scale",
            "If spatial is true, the dimension of scale is (C). "
            "If spatial is false, the dimensions of scale are "
            "(C x D1 x ... x Dn)",
            "T")
        .Input(
            2,
            "B",
            "If spatial is true, the dimension of bias is (C). "
            "If spatial is false, the dimensions of bias are "
            "(C x D1 x ... x Dn)",
            "T")
        .Input(
            3,
            "mean",
            "If spatial is true, the dimension of the running mean "
            "(training) or the estimated mean (testing) is (C). "
            "If spatial is false, the dimensions of the running mean "
            "(training) or the estimated mean (testing) are (C x D1 x ... x Dn).",
            "T")
        .Input(
            4,
            "var",
            "If spatial is true, the dimension of the running variance"
            "(training) or the estimated variance (testing) is (C). "
            "If spatial is false, the dimensions of the running variance"
            "(training) or the estimated variance (testing) are (C x D1 x ... x Dn).",
            "T")
        .Output(0, "Y", "The output tensor of the same shape as X", "T")
        .Output(1, "mean", "The running mean after the BatchNormalization operator.", "T", OpSchema::Optional)
        .Output(2, "var", "The running variance after the BatchNormalization operator.", "T", OpSchema::Optional)
        .Output(
            3,
            "saved_mean",
            "Saved mean used during training to speed up gradient "
            "computation.",
            "T",
            OpSchema::Optional)
        .Output(
            4,
            "saved_var",
            "Saved variance used during training to speed up "
            "gradient computation.",
            "T",
            OpSchema::Optional)
        .TypeConstraint(
            "T",
            {"tensor(float16)", "tensor(float)", "tensor(double)"},
            "Constrain input and output types to float tensors.")
        .TypeAndShapeInferenceFunction([](InferenceContext& ctx) {
          propagateShapeAndTypeFromFirstInput(ctx);
          // TODO in training mode, it may be possible to infer some of
          // the other outputs as well.
        }));
|
|
|
|
static const char* GroupNormalization_ver18_doc = R"DOC(
|
|
A GroupNormalization function. Carries out group normalization as described in
|
|
the paper https://arxiv.org/abs/1803.08494
|
|
|
|
This operator transforms input according to
|
|
```
|
|
y = scale * (x - mean) / sqrt(variance + epsilon) + bias,
|
|
```
|
|
where the mean and variance are computed per instance per group of channels, and
|
|
`scale` and `bias` should be specified for each group of channels. The number of
|
|
groups `num_groups` should be divisible by the number of channels so that there are
|
|
an equal number of channels per group.
|
|
|
|
When the number of groups is the same as the number of channels, this operator is
|
|
equivalent to InstanceNormalization. When there is only one group, this operator
|
|
is equivalent to LayerNormalization.
|
|
)DOC";
|
|
|
|
// Registers the GroupNormalization operator for opset 18. The op normalizes X
// per group of channels (statistics over each channel group and all spatial
// dims) and then applies a per-group scale and bias; at this opset version
// `scale` and `bias` have shape `(num_groups)`, one value per group. The
// context-dependent builder below expands the op into primitive ONNX ops.
ONNX_OPERATOR_SET_SCHEMA(
    GroupNormalization,
    18,
    OpSchema()
        .SetDoc(GroupNormalization_ver18_doc)
        .Attr("epsilon", "The epsilon value to use to avoid division by zero.", AttributeProto::FLOAT, 1e-5f)
        .Attr(
            "num_groups",
            "The number of groups of channels. It should be a divisor of the number of channels `C`.",
            AttributeProto::INT,
            true) // required attribute: no default value
        .Input(
            0,
            "X",
            "Input data tensor. Dimensions for image cases are `(N x C x H x W)`, where `N` is the batch size, "
            "`C` is the number of channels, and `H` and `W` are the height and width of the data. Statistics are "
            "computed for every group of channels over `C`, `H`, and `W`. For non-image cases, the dimensions are "
            "in the form of `(N x C x D1 x D2 ... Dn)`.",
            "T",
            OpSchema::Single,
            true,
            1,
            OpSchema::Differentiable)
        .Input(
            1,
            "scale",
            "Scale tensor of shape `(num_groups)`.",
            "T",
            OpSchema::Single,
            true,
            1,
            OpSchema::Differentiable)
        .Input(
            2,
            "bias",
            "Bias tensor of shape `(num_groups)`.",
            "T",
            OpSchema::Single,
            true,
            1,
            OpSchema::Differentiable)
        .Output(
            0,
            "Y",
            "The output tensor of the same shape as `X`.",
            "T",
            OpSchema::Single,
            true,
            1,
            OpSchema::Differentiable)
        .TypeConstraint(
            "T",
            {"tensor(float16)", "tensor(float)", "tensor(double)", "tensor(bfloat16)"},
            "Constrain input and output types to float tensors.")
        .SetContextDependentFunctionBodyBuilder(
            [](const FunctionBodyBuildContext& ctx, const OpSchema& schema, FunctionProto& functionProto) {
              // GroupNormalization <epsilon, num_groups> (X, scale, bias) => (Y)
              // Returns false (i.e. no function body is produced) when the
              // input element type or the required num_groups attribute is
              // unavailable at build time.
              auto* tp = ctx.getInputType(0);
              if ((tp == nullptr) || (!tp->has_tensor_type()))
                return false;
              int64_t T = tp->tensor_type().elem_type();

              auto* epsilon_attr = ctx.getAttribute("epsilon");
              float epsilon = (epsilon_attr != nullptr) ? epsilon_attr->f() : 1e-5f; // fall back to the schema default
              auto* num_groups_attr = ctx.getAttribute("num_groups");
              if (num_groups_attr == nullptr) // required attribute; bail out if absent
                return false;
              int64_t num_groups = num_groups_attr->i();

              FunctionBuilder builder(functionProto);
              builder.Const1D("FloatEpsilon", epsilon)
                  .Add("Epsilon = Cast (FloatEpsilon)", "to", T) // epsilon constant in X's element type
                  .Add("XShape = Shape (X)") // shape of input tensor: 1D tensor
                  .Add("C = Shape <start = 1, end = 2> (X)") // number of channels, as a 1-element tensor
                  .Const1D("NumGroups", num_groups)
                  .Add("GroupSize = Div (C, NumGroups)") // channels per group; assumes num_groups divides C
                  .Add("N = Shape <start = 0, end = 1> (X)") // batch size
                  .Add("InstanceShape = Shape <start = 2> (X)") // data instance shape

                  // NewShape = [N, num_groups, group_size, H, W, (...)]
                  .Add("NewShape = Concat <axis = 0> (N, NumGroups, GroupSize, InstanceShape)")
                  .Add("XReshaped = Reshape (X, NewShape)")

                  // Flatten into 3D tensor: [N, num_groups, group_size x H x W (x ...)]
                  // In Reshape's shape input, 0 copies the existing dim and -1 infers the remainder.
                  .Add("Shape3D = Constant <value_ints = [0, 0, -1]> ()")
                  .Add("X3D = Reshape(XReshaped, Shape3D)")

                  // Calculate statistics over axis 2 (the flattened per-group elements).
                  .Const1D("Axes2", (int64_t)2)
                  .Add("Mean = ReduceMean (X3D, Axes2)")
                  .Add("Square = Mul (X3D, X3D)")
                  .Add("MeanOfSquare = ReduceMean (Square, Axes2)")
                  .Add("SquareOfMean = Mul (Mean, Mean)")
                  .Add("Var = Sub (MeanOfSquare, SquareOfMean)") // Var(x) = E[x^2] - E[x]^2
                  .Add("VarPlusEpsilon = Add (Var, Epsilon)")
                  .Add("StdDev = Sqrt (VarPlusEpsilon)")
                  .Add("Deviation = Sub (X3D, Mean)")
                  .Add("Normalized = Div (Deviation, StdDev)") // (x - mean) / sqrt(var + epsilon)

                  // Reshape scale and bias for broadcasting: (num_groups) -> [1, num_groups, 1]
                  // so they broadcast per group against X3D's [N, num_groups, group elements].
                  .Add("ScaleShape = Constant <value_ints = [1, -1, 1]> ()")
                  .Add("ScaleT = Cast (scale)", "to", T) // ensure scale matches X's element type
                  .Add("BiasT = Cast (bias)", "to", T) // ensure bias matches X's element type
                  .Add("ScaleReshaped = Reshape (ScaleT, ScaleShape)")
                  .Add("BiasReshaped = Reshape (BiasT, ScaleShape)")

                  // Calculate scaled and biased output, then restore X's original shape.
                  .Add("Scaled = Mul (ScaleReshaped, Normalized)")
                  .Add("Biased = Add (Scaled, BiasReshaped)")
                  .Add("Y = Reshape (Biased, XShape)");

              schema.BuildFunction(functionProto);
              return true;
            }));
} // namespace ONNX_NAMESPACE