diff --git a/docs/Changelog.md b/docs/Changelog.md
index 469b80cbbe8..03f5ea49077 100644
--- a/docs/Changelog.md
+++ b/docs/Changelog.md
@@ -21477,7 +21477,7 @@ This version of the operator has been available since version 18 of the default
Constrain input and output types to all numeric tensor types.
-### **GroupNormalization-18**
+### **GroupNormalization-18** (deprecated)
A GroupNormalization function. Carries out group normalization as described in
the paper https://arxiv.org/abs/1803.08494
@@ -21497,41 +21497,7 @@ This version of the operator has been available since version 18 of the default
#### Version
-This version of the operator has been available since version 18 of the default ONNX operator set.
-
-#### Attributes
-
-
-- epsilon : float (default is 1e-05)
-- The epsilon value to use to avoid division by zero.
-- num_groups : int (required)
-- The number of groups of channels. It should be a divisor of the number of channels `C`.
-
-
-#### Inputs
-
-
-- X (differentiable) : T
-- Input data tensor. Dimensions for image cases are `(N x C x H x W)`, where `N` is the batch size, `C` is the number of channels, and `H` and `W` are the height and width of the data. Statistics are computed for every group of channels over `C`, `H`, and `W`. For non-image cases, the dimensions are in the form of `(N x C x D1 x D2 ... Dn)`.
-- scale (differentiable) : T
-- Scale tensor of shape `(num_groups)`.
-- bias (differentiable) : T
-- Bias tensor of shape `(num_groups)`.
-
-
-#### Outputs
-
-
-- Y (differentiable) : T
-- The output tensor of the same shape as `X`.
-
-
-#### Type Constraints
-
-
-- T : tensor(float16), tensor(float), tensor(double), tensor(bfloat16)
-- Constrain input and output types to float tensors.
-
+This version of the operator has been deprecated since version 18 of the default ONNX operator set.
### **LpPool-18**
@@ -24864,7 +24830,7 @@ This version of the operator has been available since version 21 of the default
y = scale * (x - mean) / sqrt(variance + epsilon) + bias,
```
where the mean and variance are computed per instance per group of channels, and
- `scale` and `bias` should be specified for each group of channels. The number of
+ `scale` and `bias` should be specified for each channel. The number of
groups `num_groups` should be a divisor of the number of channels so that there are
an equal number of channels per group.
diff --git a/docs/Operators.md b/docs/Operators.md
index a72d8877067..31b6d337358 100644
--- a/docs/Operators.md
+++ b/docs/Operators.md
@@ -11736,7 +11736,7 @@ expect(
y = scale * (x - mean) / sqrt(variance + epsilon) + bias,
```
where the mean and variance are computed per instance per group of channels, and
- `scale` and `bias` should be specified for each group of channels. The number of
+ `scale` and `bias` should be specified for each channel. The number of
groups `num_groups` should be a divisor of the number of channels so that there are
an equal number of channels per group.
diff --git a/onnx/defs/nn/defs.cc b/onnx/defs/nn/defs.cc
index 0b4624f148b..a12b040a969 100644
--- a/onnx/defs/nn/defs.cc
+++ b/onnx/defs/nn/defs.cc
@@ -2699,7 +2699,7 @@ This operator transforms input according to
y = scale * (x - mean) / sqrt(variance + epsilon) + bias,
```
where the mean and variance are computed per instance per group of channels, and
-`scale` and `bias` should be specified for each group of channels. The number of
+`scale` and `bias` should be specified for each channel. The number of
groups `num_groups` should be a divisor of the number of channels so that there are
an equal number of channels per group.
diff --git a/onnx/defs/nn/old.cc b/onnx/defs/nn/old.cc
index 763b1e90780..ff19bc3a54c 100644
--- a/onnx/defs/nn/old.cc
+++ b/onnx/defs/nn/old.cc
@@ -4020,6 +4020,7 @@ ONNX_OPERATOR_SET_SCHEMA(
GroupNormalization,
18,
OpSchema()
+ .Deprecate()
.SetDoc(GroupNormalization_ver18_doc)
.Attr("epsilon", "The epsilon value to use to avoid division by zero.", AttributeProto::FLOAT, 1e-5f)
.Attr(
diff --git a/onnx/reference/ops/op_quantize_linear.py b/onnx/reference/ops/op_quantize_linear.py
index f260065f9d7..be22d388773 100644
--- a/onnx/reference/ops/op_quantize_linear.py
+++ b/onnx/reference/ops/op_quantize_linear.py
@@ -209,7 +209,7 @@ def _run( # noqa: PLR0911
if tensor_type == TensorProto.FLOAT4E2M1:
x += zero_point
f4 = subbyte.float32_to_float4e2m1_unpacked(x)
- return (f4,) # type: ignore[attr-defined]
+ return (f4.astype(float4e2m1),) # type: ignore[attr-defined]
raise ValueError(
f"Unexpected type: output_dtype={tensor_type} is not a supported quantized type."
diff --git a/onnx/test/version_converter/automatic_upgrade_test.py b/onnx/test/version_converter/automatic_upgrade_test.py
index 10627b8c323..d47f797935a 100644
--- a/onnx/test/version_converter/automatic_upgrade_test.py
+++ b/onnx/test/version_converter/automatic_upgrade_test.py
@@ -1718,8 +1718,8 @@ def test_BitwiseXor(self) -> None:
def test_GroupNormalization(self) -> None:
self._test_op_upgrade(
"GroupNormalization",
- 18,
- [[3, 4, 2, 2], [1], [1]],
+ 21,
+ [[3, 4, 2, 2], [4], [4]],
[[3, 4, 2, 2]],
attrs={"epsilon": 1e-5, "num_groups": 2},
)