diff --git a/docs/Changelog.md b/docs/Changelog.md
index 469b80cbbe8..03f5ea49077 100644
--- a/docs/Changelog.md
+++ b/docs/Changelog.md
@@ -21477,7 +21477,7 @@ This version of the operator has been available since version 18 of the default
Constrain input and output types to all numeric tensor types.
-### **GroupNormalization-18**
+### **GroupNormalization-18** (deprecated)
A GroupNormalization function. Carries out group normalization as described in
the paper https://arxiv.org/abs/1803.08494
@@ -21497,41 +21497,7 @@ This version of the operator has been available since version 18 of the default
#### Version
-This version of the operator has been available since version 18 of the default ONNX operator set.
-
-#### Attributes
-
-
-- epsilon : float (default is 1e-05)
-- The epsilon value to use to avoid division by zero.
-- num_groups : int (required)
-- The number of groups of channels. It should be a divisor of the number of channels `C`.
-
-
-#### Inputs
-
-
-- X (differentiable) : T
-- Input data tensor. Dimensions for image cases are `(N x C x H x W)`, where `N` is the batch size, `C` is the number of channels, and `H` and `W` are the height and width of the data. Statistics are computed for every group of channels over `C`, `H`, and `W`. For non-image cases, the dimensions are in the form of `(N x C x D1 x D2 ... Dn)`.
-- scale (differentiable) : T
-- Scale tensor of shape `(num_groups)`.
-- bias (differentiable) : T
-- Bias tensor of shape `(num_groups)`.
-
-
-#### Outputs
-
-
-- Y (differentiable) : T
-- The output tensor of the same shape as `X`.
-
-
-#### Type Constraints
-
-
-- T : tensor(float16), tensor(float), tensor(double), tensor(bfloat16)
-- Constrain input and output types to float tensors.
-
+This version of the operator has been deprecated since version 18 of the default ONNX operator set.
### **LpPool-18**
@@ -24864,7 +24830,7 @@ This version of the operator has been available since version 21 of the default
y = scale * (x - mean) / sqrt(variance + epsilon) + bias,
```
where the mean and variance are computed per instance per group of channels, and
- `scale` and `bias` should be specified for each group of channels. The number of
+ `scale` and `bias` should be specified for each channel. The number of
groups `num_groups` should be a divisor of the number of channels so that there are
an equal number of channels per group.
diff --git a/docs/Operators.md b/docs/Operators.md
index a72d8877067..31b6d337358 100644
--- a/docs/Operators.md
+++ b/docs/Operators.md
@@ -11736,7 +11736,7 @@ expect(
y = scale * (x - mean) / sqrt(variance + epsilon) + bias,
```
where the mean and variance are computed per instance per group of channels, and
- `scale` and `bias` should be specified for each group of channels. The number of
+ `scale` and `bias` should be specified for each channel. The number of
groups `num_groups` should be a divisor of the number of channels so that there are
an equal number of channels per group.
diff --git a/onnx/defs/nn/defs.cc b/onnx/defs/nn/defs.cc
index 0b4624f148b..a12b040a969 100644
--- a/onnx/defs/nn/defs.cc
+++ b/onnx/defs/nn/defs.cc
@@ -2699,7 +2699,7 @@ This operator transforms input according to
y = scale * (x - mean) / sqrt(variance + epsilon) + bias,
```
where the mean and variance are computed per instance per group of channels, and
-`scale` and `bias` should be specified for each group of channels. The number of
+`scale` and `bias` should be specified for each channel. The number of
groups `num_groups` should be a divisor of the number of channels so that there are
an equal number of channels per group.
diff --git a/onnx/defs/nn/old.cc b/onnx/defs/nn/old.cc
index 763b1e90780..ff19bc3a54c 100644
--- a/onnx/defs/nn/old.cc
+++ b/onnx/defs/nn/old.cc
@@ -4020,6 +4020,7 @@ ONNX_OPERATOR_SET_SCHEMA(
GroupNormalization,
18,
OpSchema()
+ .Deprecate()
.SetDoc(GroupNormalization_ver18_doc)
.Attr("epsilon", "The epsilon value to use to avoid division by zero.", AttributeProto::FLOAT, 1e-5f)
.Attr(
diff --git a/onnx/reference/ops/op_quantize_linear.py b/onnx/reference/ops/op_quantize_linear.py
index f260065f9d7..be22d388773 100644
--- a/onnx/reference/ops/op_quantize_linear.py
+++ b/onnx/reference/ops/op_quantize_linear.py
@@ -209,7 +209,7 @@ def _run( # noqa: PLR0911
if tensor_type == TensorProto.FLOAT4E2M1:
x += zero_point
f4 = subbyte.float32_to_float4e2m1_unpacked(x)
- return (f4,) # type: ignore[attr-defined]
+ return (f4.astype(float4e2m1),) # type: ignore[attr-defined]
raise ValueError(
f"Unexpected type: output_dtype={tensor_type} is not a supported quantized type."
diff --git a/onnx/test/version_converter/automatic_upgrade_test.py b/onnx/test/version_converter/automatic_upgrade_test.py
index 10627b8c323..d47f797935a 100644
--- a/onnx/test/version_converter/automatic_upgrade_test.py
+++ b/onnx/test/version_converter/automatic_upgrade_test.py
@@ -1718,8 +1718,8 @@ def test_BitwiseXor(self) -> None:
def test_GroupNormalization(self) -> None:
self._test_op_upgrade(
"GroupNormalization",
- 18,
- [[3, 4, 2, 2], [1], [1]],
+ 21,
+ [[3, 4, 2, 2], [4], [4]],
[[3, 4, 2, 2]],
attrs={"epsilon": 1e-5, "num_groups": 2},
)