Fix seed for random test data. (#9844)

* Fix seed for random test data. Also switch to using default_rng instead of RandomState. * Fixes * one more fix. * more fixes * last one? * one more
pydata · Dec 3, 2024 · 7445012 · 7445012
1 parent 05f24f7
commit 7445012
Show file tree

Hide file tree

Showing 16 changed files with 63 additions and 59 deletions.
diff --git a/asv_bench/benchmarks/__init__.py b/asv_bench/benchmarks/__init__.py
@@ -30,13 +30,13 @@ def requires_sparse():
 
 
 def randn(shape, frac_nan=None, chunks=None, seed=0):
-    rng = np.random.RandomState(seed)
+    rng = np.random.default_rng(seed)
     if chunks is None:
         x = rng.standard_normal(shape)
     else:
         import dask.array as da
 
-        rng = da.random.RandomState(seed)
+        rng = da.random.default_rng(seed)
         x = rng.standard_normal(shape, chunks=chunks)
 
     if frac_nan is not None:
@@ -47,7 +47,7 @@ def randn(shape, frac_nan=None, chunks=None, seed=0):
 
 
 def randint(low, high=None, size=None, frac_minus=None, seed=0):
-    rng = np.random.RandomState(seed)
+    rng = np.random.default_rng(seed)
     x = rng.randint(low, high, size)
     if frac_minus is not None:
         inds = rng.choice(range(x.size), int(x.size * frac_minus))

diff --git a/asv_bench/benchmarks/reindexing.py b/asv_bench/benchmarks/reindexing.py
@@ -11,7 +11,7 @@
 
 class Reindex:
     def setup(self):
-        data = np.random.RandomState(0).randn(ntime, nx, ny)
+        data = np.random.default_rng(0).random((ntime, nx, ny))
         self.ds = xr.Dataset(
             {"temperature": (("time", "x", "y"), data)},
             coords={"time": np.arange(ntime), "x": np.arange(nx), "y": np.arange(ny)},

diff --git a/asv_bench/benchmarks/unstacking.py b/asv_bench/benchmarks/unstacking.py
@@ -8,7 +8,7 @@
 
 class Unstacking:
     def setup(self):
-        data = np.random.RandomState(0).randn(250, 500)
+        data = np.random.default_rng(0).random((250, 500))
         self.da_full = xr.DataArray(data, dims=list("ab")).stack(flat_dim=[...])
         self.da_missing = self.da_full[:-1]
         self.df_missing = self.da_missing.to_pandas()

diff --git a/doc/user-guide/computation.rst b/doc/user-guide/computation.rst
@@ -30,7 +30,8 @@ numpy) over all array values:
 .. ipython:: python
 
     arr = xr.DataArray(
-        np.random.RandomState(0).randn(2, 3), [("x", ["a", "b"]), ("y", [10, 20, 30])]
+        np.random.default_rng(0).random((2, 3)),
+        [("x", ["a", "b"]), ("y", [10, 20, 30])],
     )
     arr - 3
     abs(arr)

diff --git a/doc/user-guide/dask.rst b/doc/user-guide/dask.rst
@@ -292,7 +292,7 @@ work as a streaming operation, when run on arrays loaded from disk:
 .. ipython::
     :verbatim:
 
-    In [56]: rs = np.random.RandomState(0)
+    In [56]: rs = np.random.default_rng(0)
 
     In [57]: array1 = xr.DataArray(rs.randn(1000, 100000), dims=["place", "time"])  # 800MB
 

diff --git a/doc/user-guide/pandas.rst b/doc/user-guide/pandas.rst
@@ -202,7 +202,7 @@ Let's take a look:
 
 .. ipython:: python
 
-    data = np.random.RandomState(0).rand(2, 3, 4)
+    data = np.random.default_rng(0).rand(2, 3, 4)
     items = list("ab")
     major_axis = list("mno")
     minor_axis = pd.date_range(start="2000", periods=4, name="date")

diff --git a/xarray/tests/__init__.py b/xarray/tests/__init__.py
@@ -298,12 +298,12 @@ def assert_allclose(a, b, check_default_indexes=True, **kwargs):
 
 
 def create_test_data(
-    seed: int | None = None,
+    seed: int = 12345,
     add_attrs: bool = True,
     dim_sizes: tuple[int, int, int] = _DEFAULT_TEST_DIM_SIZES,
     use_extension_array: bool = False,
 ) -> Dataset:
-    rs = np.random.RandomState(seed)
+    rs = np.random.default_rng(seed)
     _vars = {
         "var1": ["dim1", "dim2"],
         "var2": ["dim1", "dim2"],
@@ -329,15 +329,15 @@ def create_test_data(
             "dim1",
             pd.Categorical(
                 rs.choice(
-                    list(string.ascii_lowercase[: rs.randint(1, 5)]),
+                    list(string.ascii_lowercase[: rs.integers(1, 5)]),
                     size=dim_sizes[0],
                 )
             ),
         )
     if dim_sizes == _DEFAULT_TEST_DIM_SIZES:
         numbers_values = np.array([0, 1, 2, 0, 0, 1, 1, 2, 2, 3], dtype="int64")
     else:
-        numbers_values = rs.randint(0, 3, _dims["dim3"], dtype="int64")
+        numbers_values = rs.integers(0, 3, _dims["dim3"], dtype="int64")
     obj.coords["numbers"] = ("dim3", numbers_values)
     obj.encoding = {"foo": "bar"}
     assert_writeable(obj)

diff --git a/xarray/tests/test_backends.py b/xarray/tests/test_backends.py
@@ -2868,8 +2868,11 @@ def test_append_with_new_variable(self) -> None:
 
         # check append mode for new variable
         with self.create_zarr_target() as store_target:
-            xr.concat([ds, ds_to_append], dim="time").to_zarr(
-                store_target, mode="w", **self.version_kwargs
+            combined = xr.concat([ds, ds_to_append], dim="time")
+            combined.to_zarr(store_target, mode="w", **self.version_kwargs)
+            assert_identical(
+                combined,
+                xr.open_dataset(store_target, engine="zarr", **self.version_kwargs),
             )
             ds_with_new_var.to_zarr(store_target, mode="a", **self.version_kwargs)
             combined = xr.concat([ds, ds_to_append], dim="time")
@@ -6494,7 +6497,7 @@ def test_zarr_safe_chunk_region(tmp_path):
         arr.isel(a=slice(5, -1)).chunk(a=5).to_zarr(store, region="auto", mode="r+")
 
     # Test if the code is detecting the last chunk correctly
-    data = np.random.RandomState(0).randn(2920, 25, 53)
+    data = np.random.default_rng(0).random((2920, 25, 53))
     ds = xr.Dataset({"temperature": (("time", "lat", "lon"), data)})
     chunks = {"time": 1000, "lat": 25, "lon": 53}
     ds.chunk(chunks).to_zarr(store, compute=False, mode="w")

diff --git a/xarray/tests/test_computation.py b/xarray/tests/test_computation.py
@@ -1293,9 +1293,9 @@ def covariance(x, y):
             (x - x.mean(axis=-1, keepdims=True)) * (y - y.mean(axis=-1, keepdims=True))
         ).mean(axis=-1)
 
-    rs = np.random.RandomState(42)
-    array1 = da.from_array(rs.randn(4, 4), chunks=(2, 4))
-    array2 = da.from_array(rs.randn(4, 4), chunks=(2, 4))
+    rs = np.random.default_rng(42)
+    array1 = da.from_array(rs.random((4, 4)), chunks=(2, 4))
+    array2 = da.from_array(rs.random((4, 4)), chunks=(2, 4))
     data_array_1 = xr.DataArray(array1, dims=("x", "z"))
     data_array_2 = xr.DataArray(array2, dims=("y", "z"))
 

diff --git a/xarray/tests/test_dask.py b/xarray/tests/test_dask.py
@@ -38,7 +38,7 @@
 
 
 def test_raise_if_dask_computes():
-    data = da.from_array(np.random.RandomState(0).randn(4, 6), chunks=(2, 2))
+    data = da.from_array(np.random.default_rng(0).random((4, 6)), chunks=(2, 2))
     with pytest.raises(RuntimeError, match=r"Too many computes"):
         with raise_if_dask_computes():
             data.compute()
@@ -77,7 +77,7 @@ def assertLazyAndAllClose(self, expected, actual):
 
     @pytest.fixture(autouse=True)
     def setUp(self):
-        self.values = np.random.RandomState(0).randn(4, 6)
+        self.values = np.random.default_rng(0).random((4, 6))
         self.data = da.from_array(self.values, chunks=(2, 2))
 
         self.eager_var = Variable(("x", "y"), self.values)

diff --git a/xarray/tests/test_dataarray.py b/xarray/tests/test_dataarray.py
@@ -2284,7 +2284,7 @@ class NdArraySubclass(np.ndarray):
         assert isinstance(converted_subok.data, NdArraySubclass)
 
     def test_is_null(self) -> None:
-        x = np.random.RandomState(42).randn(5, 6)
+        x = np.random.default_rng(42).random((5, 6))
         x[x < 0] = np.nan
         original = DataArray(x, [-np.arange(5), np.arange(6)], ["x", "y"])
         expected = DataArray(pd.isnull(x), [-np.arange(5), np.arange(6)], ["x", "y"])
@@ -3528,7 +3528,7 @@ def test_from_multiindex_series_sparse(self) -> None:
 
         idx = pd.MultiIndex.from_product([np.arange(3), np.arange(5)], names=["a", "b"])
         series: pd.Series = pd.Series(
-            np.random.RandomState(0).random(len(idx)), index=idx
+            np.random.default_rng(0).random(len(idx)), index=idx
         ).sample(n=5, random_state=3)
 
         dense = DataArray.from_series(series, sparse=False)
@@ -3703,8 +3703,8 @@ def test_to_dict_with_numpy_attrs(self) -> None:
         assert expected_attrs == actual["attrs"]
 
     def test_to_masked_array(self) -> None:
-        rs = np.random.RandomState(44)
-        x = rs.random_sample(size=(10, 20))
+        rs = np.random.default_rng(44)
+        x = rs.random(size=(10, 20))
         x_masked = np.ma.masked_where(x < 0.5, x)
         da = DataArray(x_masked)
 

diff --git a/xarray/tests/test_dataset.py b/xarray/tests/test_dataset.py
@@ -99,7 +99,7 @@
 
 
 def create_append_test_data(seed=None) -> tuple[Dataset, Dataset, Dataset]:
-    rs = np.random.RandomState(seed)
+    rs = np.random.default_rng(seed)
 
     lat = [2, 1, 0]
     lon = [0, 1, 2]
@@ -126,7 +126,7 @@ def create_append_test_data(seed=None) -> tuple[Dataset, Dataset, Dataset]:
         ds = xr.Dataset(
             data_vars={
                 "da": xr.DataArray(
-                    rs.rand(3, 3, nt1),
+                    rs.random((3, 3, nt1)),
                     coords=[lat, lon, time1],
                     dims=["lat", "lon", "time"],
                 ),
@@ -141,7 +141,7 @@ def create_append_test_data(seed=None) -> tuple[Dataset, Dataset, Dataset]:
         ds_to_append = xr.Dataset(
             data_vars={
                 "da": xr.DataArray(
-                    rs.rand(3, 3, nt2),
+                    rs.random((3, 3, nt2)),
                     coords=[lat, lon, time2],
                     dims=["lat", "lon", "time"],
                 ),
@@ -156,7 +156,7 @@ def create_append_test_data(seed=None) -> tuple[Dataset, Dataset, Dataset]:
         ds_with_new_var = xr.Dataset(
             data_vars={
                 "new_var": xr.DataArray(
-                    rs.rand(3, 3, nt1 + nt2),
+                    rs.random((3, 3, nt1 + nt2)),
                     coords=[lat, lon, time1.append(time2)],
                     dims=["lat", "lon", "time"],
                 )
@@ -293,14 +293,14 @@ def test_repr(self) -> None:
                 numbers  (dim3) int64 80B 0 1 2 0 0 1 1 2 2 3
             Dimensions without coordinates: dim1
             Data variables:
-                var1     (dim1, dim2) float64 576B -1.086 0.9973 0.283 ... 0.4684 -0.8312
-                var2     (dim1, dim2) float64 576B 1.162 -1.097 -2.123 ... 1.267 0.3328
-                var3     (dim3, dim1) float64 640B 0.5565 -0.2121 0.4563 ... -0.2452 -0.3616
+                var1     (dim1, dim2) float64 576B -0.9891 -0.3678 1.288 ... -0.2116 0.364
+                var2     (dim1, dim2) float64 576B 0.953 1.52 1.704 ... 0.1347 -0.6423
+                var3     (dim3, dim1) float64 640B 0.4107 0.9941 0.1665 ... 0.716 1.555
             Attributes:
                 foo:      bar""".format(data["dim3"].dtype)
         )
         actual = "\n".join(x.rstrip() for x in repr(data).split("\n"))
-        print(actual)
+
         assert expected == actual
 
         with set_options(display_width=100):
@@ -7161,13 +7161,13 @@ def test_raise_no_warning_assert_close(ds) -> None:
 @pytest.mark.parametrize("dask", [True, False])
 @pytest.mark.parametrize("edge_order", [1, 2])
 def test_differentiate(dask, edge_order) -> None:
-    rs = np.random.RandomState(42)
+    rs = np.random.default_rng(42)
     coord = [0.2, 0.35, 0.4, 0.6, 0.7, 0.75, 0.76, 0.8]
 
     da = xr.DataArray(
-        rs.randn(8, 6),
+        rs.random((8, 6)),
         dims=["x", "y"],
-        coords={"x": coord, "z": 3, "x2d": (("x", "y"), rs.randn(8, 6))},
+        coords={"x": coord, "z": 3, "x2d": (("x", "y"), rs.random((8, 6)))},
     )
     if dask and has_dask:
         da = da.chunk({"x": 4})
@@ -7210,7 +7210,7 @@ def test_differentiate(dask, edge_order) -> None:
 @pytest.mark.filterwarnings("ignore:Converting non-nanosecond")
 @pytest.mark.parametrize("dask", [True, False])
 def test_differentiate_datetime(dask) -> None:
-    rs = np.random.RandomState(42)
+    rs = np.random.default_rng(42)
     coord = np.array(
         [
             "2004-07-13",
@@ -7226,9 +7226,9 @@ def test_differentiate_datetime(dask) -> None:
     )
 
     da = xr.DataArray(
-        rs.randn(8, 6),
+        rs.random((8, 6)),
         dims=["x", "y"],
-        coords={"x": coord, "z": 3, "x2d": (("x", "y"), rs.randn(8, 6))},
+        coords={"x": coord, "z": 3, "x2d": (("x", "y"), rs.random((8, 6)))},
     )
     if dask and has_dask:
         da = da.chunk({"x": 4})
@@ -7260,12 +7260,12 @@ def test_differentiate_datetime(dask) -> None:
 @requires_cftime
 @pytest.mark.parametrize("dask", [True, False])
 def test_differentiate_cftime(dask) -> None:
-    rs = np.random.RandomState(42)
+    rs = np.random.default_rng(42)
     coord = xr.cftime_range("2000", periods=8, freq="2ME")
 
     da = xr.DataArray(
-        rs.randn(8, 6),
-        coords={"time": coord, "z": 3, "t2d": (("time", "y"), rs.randn(8, 6))},
+        rs.random((8, 6)),
+        coords={"time": coord, "z": 3, "t2d": (("time", "y"), rs.random((8, 6)))},
         dims=["time", "y"],
     )
 
@@ -7289,17 +7289,17 @@ def test_differentiate_cftime(dask) -> None:
 
 @pytest.mark.parametrize("dask", [True, False])
 def test_integrate(dask) -> None:
-    rs = np.random.RandomState(42)
+    rs = np.random.default_rng(42)
     coord = [0.2, 0.35, 0.4, 0.6, 0.7, 0.75, 0.76, 0.8]
 
     da = xr.DataArray(
-        rs.randn(8, 6),
+        rs.random((8, 6)),
         dims=["x", "y"],
         coords={
             "x": coord,
-            "x2": (("x",), rs.randn(8)),
+            "x2": (("x",), rs.random(8)),
             "z": 3,
-            "x2d": (("x", "y"), rs.randn(8, 6)),
+            "x2d": (("x", "y"), rs.random((8, 6))),
         },
     )
     if dask and has_dask:
@@ -7343,17 +7343,17 @@ def test_integrate(dask) -> None:
 @requires_scipy
 @pytest.mark.parametrize("dask", [True, False])
 def test_cumulative_integrate(dask) -> None:
-    rs = np.random.RandomState(43)
+    rs = np.random.default_rng(43)
     coord = [0.2, 0.35, 0.4, 0.6, 0.7, 0.75, 0.76, 0.8]
 
     da = xr.DataArray(
-        rs.randn(8, 6),
+        rs.random((8, 6)),
         dims=["x", "y"],
         coords={
             "x": coord,
-            "x2": (("x",), rs.randn(8)),
+            "x2": (("x",), rs.random(8)),
             "z": 3,
-            "x2d": (("x", "y"), rs.randn(8, 6)),
+            "x2d": (("x", "y"), rs.random((8, 6))),
         },
     )
     if dask and has_dask:
@@ -7406,7 +7406,7 @@ def test_cumulative_integrate(dask) -> None:
 @pytest.mark.parametrize("dask", [True, False])
 @pytest.mark.parametrize("which_datetime", ["np", "cftime"])
 def test_trapezoid_datetime(dask, which_datetime) -> None:
-    rs = np.random.RandomState(42)
+    rs = np.random.default_rng(42)
     coord: ArrayLike
     if which_datetime == "np":
         coord = np.array(
@@ -7428,8 +7428,8 @@ def test_trapezoid_datetime(dask, which_datetime) -> None:
         coord = xr.cftime_range("2000", periods=8, freq="2D")
 
     da = xr.DataArray(
-        rs.randn(8, 6),
-        coords={"time": coord, "z": 3, "t2d": (("time", "y"), rs.randn(8, 6))},
+        rs.random((8, 6)),
+        coords={"time": coord, "z": 3, "t2d": (("time", "y"), rs.random((8, 6)))},
         dims=["time", "y"],
     )
 

diff --git a/xarray/tests/test_duck_array_ops.py b/xarray/tests/test_duck_array_ops.py
@@ -341,16 +341,16 @@ def test_types(self, val1, val2, val3, null):
 
 def construct_dataarray(dim_num, dtype, contains_nan, dask):
     # dimnum <= 3
-    rng = np.random.RandomState(0)
+    rng = np.random.default_rng(0)
     shapes = [16, 8, 4][:dim_num]
     dims = ("x", "y", "z")[:dim_num]
 
     if np.issubdtype(dtype, np.floating):
-        array = rng.randn(*shapes).astype(dtype)
+        array = rng.random(shapes).astype(dtype)
     elif np.issubdtype(dtype, np.integer):
-        array = rng.randint(0, 10, size=shapes).astype(dtype)
+        array = rng.integers(0, 10, size=shapes).astype(dtype)
     elif np.issubdtype(dtype, np.bool_):
-        array = rng.randint(0, 1, size=shapes).astype(dtype)
+        array = rng.integers(0, 1, size=shapes).astype(dtype)
     elif dtype is str:
         array = rng.choice(["a", "b", "c", "d"], size=shapes)
     else:

diff --git a/xarray/tests/test_formatting_html.py b/xarray/tests/test_formatting_html.py
@@ -11,7 +11,7 @@
 
 @pytest.fixture
 def dataarray() -> xr.DataArray:
-    return xr.DataArray(np.random.RandomState(0).randn(4, 6))
+    return xr.DataArray(np.random.default_rng(0).random((4, 6)))
 
 
 @pytest.fixture

diff --git a/xarray/tests/test_groupby.py b/xarray/tests/test_groupby.py
@@ -2879,7 +2879,7 @@ def test_multiple_groupers(use_flox: bool, shuffle: bool) -> None:
         )
 
     b = xr.DataArray(
-        np.random.RandomState(0).randn(2, 3, 4),
+        np.random.default_rng(0).random((2, 3, 4)),
         coords={"xy": (("x", "y"), [["a", "b", "c"], ["b", "c", "c"]], {"foo": "bar"})},
         dims=["x", "y", "z"],
     )