From 7e25f87c184dc1f22d7ef7a2219bd73851229d82 Mon Sep 17 00:00:00 2001 From: ujjawal-khare Date: Wed, 18 Dec 2024 01:32:19 +0530 Subject: [PATCH 01/12] fix for cleaning working dir in case of same uri Signed-off-by: ujjawal-khare --- python/ray/_private/runtime_env/packaging.py | 203 +++++++++--------- python/ray/_private/runtime_env/plugin.py | 7 +- .../ray/tests/test_runtime_env_working_dir.py | 24 +++ 3 files changed, 129 insertions(+), 105 deletions(-) diff --git a/python/ray/_private/runtime_env/packaging.py b/python/ray/_private/runtime_env/packaging.py index 73ada86c4fb0..675c7125d32e 100644 --- a/python/ray/_private/runtime_env/packaging.py +++ b/python/ray/_private/runtime_env/packaging.py @@ -723,116 +723,113 @@ async def download_and_unpack_package( local_dir = get_local_dir_from_uri(pkg_uri, base_directory) assert local_dir != pkg_file, "Invalid pkg_file!" if local_dir.exists(): - assert local_dir.is_dir(), f"{local_dir} is not a directory" - else: - protocol, pkg_name = parse_uri(pkg_uri) - if protocol == Protocol.GCS: - if gcs_aio_client is None: - raise ValueError( - "GCS client must be provided to download from GCS." - ) + shutil.rmtree(local_dir) - # Download package from the GCS. - code = await gcs_aio_client.internal_kv_get( - pkg_uri.encode(), namespace=None, timeout=None + protocol, pkg_name = parse_uri(pkg_uri) + if protocol == Protocol.GCS: + if gcs_aio_client is None: + raise ValueError("GCS client must be provided to download from GCS.") + + # Download package from the GCS. + code = await gcs_aio_client.internal_kv_get( + pkg_uri.encode(), namespace=None, timeout=None + ) + if os.environ.get(RAY_RUNTIME_ENV_FAIL_DOWNLOAD_FOR_TESTING_ENV_VAR): + code = None + if code is None: + raise IOError( + f"Failed to download runtime_env file package {pkg_uri} " + "from the GCS to the Ray worker node. The package may " + "have prematurely been deleted from the GCS due to a " + "long upload time or a problem with Ray. Try setting the " + "environment variable " + f"{RAY_RUNTIME_ENV_URI_PIN_EXPIRATION_S_ENV_VAR} " + " to a value larger than the upload time in seconds " + "(the default is " + f"{RAY_RUNTIME_ENV_URI_PIN_EXPIRATION_S_DEFAULT}). " + "If this fails, try re-running " + "after making any change to a file in the file package." ) - if os.environ.get(RAY_RUNTIME_ENV_FAIL_DOWNLOAD_FOR_TESTING_ENV_VAR): - code = None - if code is None: - raise IOError( - f"Failed to download runtime_env file package {pkg_uri} " - "from the GCS to the Ray worker node. The package may " - "have prematurely been deleted from the GCS due to a " - "long upload time or a problem with Ray. Try setting the " - "environment variable " - f"{RAY_RUNTIME_ENV_URI_PIN_EXPIRATION_S_ENV_VAR} " - " to a value larger than the upload time in seconds " - "(the default is " - f"{RAY_RUNTIME_ENV_URI_PIN_EXPIRATION_S_DEFAULT}). " - "If this fails, try re-running " - "after making any change to a file in the file package." - ) - code = code or b"" - pkg_file.write_bytes(code) - - if is_zip_uri(pkg_uri): - unzip_package( - package_path=pkg_file, - target_dir=local_dir, - remove_top_level_directory=False, - unlink_zip=True, - logger=logger, - ) - else: - return str(pkg_file) - elif protocol in Protocol.remote_protocols(): - # Download package from remote URI - tp = None - install_warning = ( - "Note that these must be preinstalled " - "on all nodes in the Ray cluster; it is not " - "sufficient to install them in the runtime_env." + code = code or b"" + pkg_file.write_bytes(code) + + if is_zip_uri(pkg_uri): + unzip_package( + package_path=pkg_file, + target_dir=local_dir, + remove_top_level_directory=False, + unlink_zip=True, + logger=logger, ) + else: + return str(pkg_file) + elif protocol in Protocol.remote_protocols(): + # Download package from remote URI + tp = None + install_warning = ( + "Note that these must be preinstalled " + "on all nodes in the Ray cluster; it is not " + "sufficient to install them in the runtime_env." + ) - if protocol == Protocol.S3: - try: - import boto3 - from smart_open import open as open_file - except ImportError: - raise ImportError( - "You must `pip install smart_open` and " - "`pip install boto3` to fetch URIs in s3 " - "bucket. " + install_warning - ) - tp = {"client": boto3.client("s3")} - elif protocol == Protocol.GS: - try: - from google.cloud import storage # noqa: F401 - from smart_open import open as open_file - except ImportError: - raise ImportError( - "You must `pip install smart_open` and " - "`pip install google-cloud-storage` " - "to fetch URIs in Google Cloud Storage bucket." - + install_warning - ) - elif protocol == Protocol.FILE: - pkg_uri = pkg_uri[len("file://") :] - - def open_file(uri, mode, *, transport_params=None): - return open(uri, mode) - - else: - try: - from smart_open import open as open_file - except ImportError: - raise ImportError( - "You must `pip install smart_open` " - f"to fetch {protocol.value.upper()} URIs. " - + install_warning - ) - - with open_file(pkg_uri, "rb", transport_params=tp) as package_zip: - with open_file(pkg_file, "wb") as fin: - fin.write(package_zip.read()) - - if pkg_file.suffix in [".zip", ".jar"]: - unzip_package( - package_path=pkg_file, - target_dir=local_dir, - remove_top_level_directory=True, - unlink_zip=True, - logger=logger, + if protocol == Protocol.S3: + try: + import boto3 + from smart_open import open as open_file + except ImportError: + raise ImportError( + "You must `pip install smart_open` and " + "`pip install boto3` to fetch URIs in s3 " + "bucket. " + install_warning ) - elif pkg_file.suffix == ".whl": - return str(pkg_file) - else: - raise NotImplementedError( - f"Package format {pkg_file.suffix} is ", - "not supported for remote protocols", + tp = {"client": boto3.client("s3")} + elif protocol == Protocol.GS: + try: + from google.cloud import storage # noqa: F401 + from smart_open import open as open_file + except ImportError: + raise ImportError( + "You must `pip install smart_open` and " + "`pip install google-cloud-storage` " + "to fetch URIs in Google Cloud Storage bucket." + + install_warning ) + elif protocol == Protocol.FILE: + pkg_uri = pkg_uri[len("file://") :] + + def open_file(uri, mode, *, transport_params=None): + return open(uri, mode) + + else: + try: + from smart_open import open as open_file + except ImportError: + raise ImportError( + "You must `pip install smart_open` " + f"to fetch {protocol.value.upper()} URIs. " + install_warning + ) + + with open_file(pkg_uri, "rb", transport_params=tp) as package_zip: + with open_file(pkg_file, "wb") as fin: + fin.write(package_zip.read()) + + if pkg_file.suffix in [".zip", ".jar"]: + unzip_package( + package_path=pkg_file, + target_dir=local_dir, + remove_top_level_directory=True, + unlink_zip=True, + logger=logger, + ) + elif pkg_file.suffix == ".whl": + return str(pkg_file) else: - raise NotImplementedError(f"Protocol {protocol} is not supported") + raise NotImplementedError( + f"Package format {pkg_file.suffix} is ", + "not supported for remote protocols", + ) + else: + raise NotImplementedError(f"Protocol {protocol} is not supported") return str(local_dir) diff --git a/python/ray/_private/runtime_env/plugin.py b/python/ray/_private/runtime_env/plugin.py index a1e03a507b59..b7e1a15db24c 100644 --- a/python/ray/_private/runtime_env/plugin.py +++ b/python/ray/_private/runtime_env/plugin.py @@ -241,6 +241,7 @@ async def create_for_plugin_if_needed( uris = plugin.get_uris(runtime_env) + logger.info(f"Setting up runtime env {plugin.name} with URIs {uris}.") if not uris: logger.debug( f"No URIs for runtime env plugin {plugin.name}; " @@ -252,13 +253,15 @@ async def create_for_plugin_if_needed( if uri not in uri_cache: logger.debug(f"Cache miss for URI {uri}.") size_bytes = await plugin.create(uri, runtime_env, context, logger=logger) - uri_cache.add(uri, size_bytes, logger=logger) + if plugin.name is None or plugin.name != "working_dir": + uri_cache.add(uri, size_bytes, logger=logger) else: logger.info( f"Runtime env {plugin.name} {uri} is already installed " "and will be reused. Search " "all runtime_env_setup-*.log to find the corresponding setup log." ) - uri_cache.mark_used(uri, logger=logger) + if plugin.name is None or plugin.name != "working_dir": + uri_cache.mark_used(uri, logger=logger) plugin.modify_context(uris, runtime_env, context, logger) diff --git a/python/ray/tests/test_runtime_env_working_dir.py b/python/ray/tests/test_runtime_env_working_dir.py index e667b0c712b1..6dd35970397c 100644 --- a/python/ray/tests/test_runtime_env_working_dir.py +++ b/python/ray/tests/test_runtime_env_working_dir.py @@ -45,6 +45,30 @@ def insert_test_dir_in_pythonpath(): yield +@pytest.mark.asyncio +async def test_working_dir_cleanup(tmpdir, ray_start_regular): + gcs_aio_client = gcs_utils.GcsAioClient( + address=ray.worker.global_worker.gcs_client.address + ) + + plugin = WorkingDirPlugin(tmpdir, gcs_aio_client) + size = await plugin.create(HTTPS_PACKAGE_URI, {}, RuntimeEnvContext()) + + files = os.listdir(f"{tmpdir}/working_dir_files") + file_metadata = os.stat(f"{tmpdir}/working_dir_files/{files[0]}") + creation_time = file_metadata.st_ctime + + time.sleep(1) + + size = await plugin.create(HTTPS_PACKAGE_URI, {}, RuntimeEnvContext()) + files = os.listdir(f"{tmpdir}/working_dir_files") + + file_metadata = os.stat(f"{tmpdir}/working_dir_files/{files[0]}") + creation_time_after = file_metadata.st_ctime + + assert creation_time != creation_time_after + + @pytest.mark.asyncio async def test_create_delete_size_equal(tmpdir, ray_start_regular): """Tests that `create` and `delete_uri` return the same size for a URI.""" From 64d936de034f67b6f25655ec81be7da0c6592542 Mon Sep 17 00:00:00 2001 From: ujjawal-khare Date: Mon, 23 Dec 2024 12:30:34 +0530 Subject: [PATCH 02/12] removed unused imports Signed-off-by: ujjawal-khare --- python/ray/tests/test_runtime_env_working_dir.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/python/ray/tests/test_runtime_env_working_dir.py b/python/ray/tests/test_runtime_env_working_dir.py index 6dd35970397c..b197eba41644 100644 --- a/python/ray/tests/test_runtime_env_working_dir.py +++ b/python/ray/tests/test_runtime_env_working_dir.py @@ -52,7 +52,7 @@ async def test_working_dir_cleanup(tmpdir, ray_start_regular): ) plugin = WorkingDirPlugin(tmpdir, gcs_aio_client) - size = await plugin.create(HTTPS_PACKAGE_URI, {}, RuntimeEnvContext()) + await plugin.create(HTTPS_PACKAGE_URI, {}, RuntimeEnvContext()) files = os.listdir(f"{tmpdir}/working_dir_files") file_metadata = os.stat(f"{tmpdir}/working_dir_files/{files[0]}") @@ -60,7 +60,7 @@ async def test_working_dir_cleanup(tmpdir, ray_start_regular): time.sleep(1) - size = await plugin.create(HTTPS_PACKAGE_URI, {}, RuntimeEnvContext()) + await plugin.create(HTTPS_PACKAGE_URI, {}, RuntimeEnvContext()) files = os.listdir(f"{tmpdir}/working_dir_files") file_metadata = os.stat(f"{tmpdir}/working_dir_files/{files[0]}") From 5bc3556510f87bc5cd959ff9e4c7ca3aaf7c4f9c Mon Sep 17 00:00:00 2001 From: ujjawal-khare Date: Tue, 24 Dec 2024 09:15:27 +0530 Subject: [PATCH 03/12] timeout increased Signed-off-by: ujjawal-khare --- python/ray/dashboard/modules/job/tests/test_http_job_server.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/python/ray/dashboard/modules/job/tests/test_http_job_server.py b/python/ray/dashboard/modules/job/tests/test_http_job_server.py index 455ed856fdc4..3a0fbcb88912 100644 --- a/python/ray/dashboard/modules/job/tests/test_http_job_server.py +++ b/python/ray/dashboard/modules/job/tests/test_http_job_server.py @@ -398,7 +398,7 @@ def test_ray_tune_basic(job_sdk_client: JobSubmissionClient): runtime_env={"working_dir": DRIVER_SCRIPT_DIR}, ) wait_for_condition( - _check_job_succeeded, timeout=30, client=job_sdk_client, job_id=job_id + _check_job_succeeded, timeout=60, client=job_sdk_client, job_id=job_id ) From bc4a2d50226552cbe960b177f6e1a74f7e22b3fa Mon Sep 17 00:00:00 2001 From: ujjawal-khare Date: Tue, 24 Dec 2024 10:52:31 +0530 Subject: [PATCH 04/12] timeout reverted Signed-off-by: ujjawal-khare --- python/ray/dashboard/modules/job/tests/test_http_job_server.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/python/ray/dashboard/modules/job/tests/test_http_job_server.py b/python/ray/dashboard/modules/job/tests/test_http_job_server.py index 3a0fbcb88912..455ed856fdc4 100644 --- a/python/ray/dashboard/modules/job/tests/test_http_job_server.py +++ b/python/ray/dashboard/modules/job/tests/test_http_job_server.py @@ -398,7 +398,7 @@ def test_ray_tune_basic(job_sdk_client: JobSubmissionClient): runtime_env={"working_dir": DRIVER_SCRIPT_DIR}, ) wait_for_condition( - _check_job_succeeded, timeout=60, client=job_sdk_client, job_id=job_id + _check_job_succeeded, timeout=30, client=job_sdk_client, job_id=job_id ) From ddd9f5fe1e610def83f98925e04c54d2152a4ff5 Mon Sep 17 00:00:00 2001 From: ujjawal-khare Date: Wed, 25 Dec 2024 20:08:10 +0530 Subject: [PATCH 05/12] overwrite as optional Signed-off-by: ujjawal-khare --- python/ray/_private/runtime_env/packaging.py | 209 +++++++++--------- .../ray/_private/runtime_env/working_dir.py | 6 +- 2 files changed, 115 insertions(+), 100 deletions(-) diff --git a/python/ray/_private/runtime_env/packaging.py b/python/ray/_private/runtime_env/packaging.py index e632dde00e6a..092036716e8b 100644 --- a/python/ray/_private/runtime_env/packaging.py +++ b/python/ray/_private/runtime_env/packaging.py @@ -656,6 +656,7 @@ async def download_and_unpack_package( base_directory: str, gcs_aio_client: Optional["GcsAioClient"] = None, # noqa: F821 logger: Optional[logging.Logger] = default_logger, + overwrite: bool = False, ) -> str: """Download the package corresponding to this URI and unpack it if zipped. @@ -668,6 +669,7 @@ async def download_and_unpack_package( directory for the unpacked files. gcs_aio_client: Client to use for downloading from the GCS. logger: The logger to use. + overwrite: If True, overwrite the existing package. Returns: Path to the local directory containing the unpacked package files. @@ -695,114 +697,123 @@ async def download_and_unpack_package( local_dir = get_local_dir_from_uri(pkg_uri, base_directory) assert local_dir != pkg_file, "Invalid pkg_file!" - if local_dir.exists(): + + download_package: bool = True + if local_dir.exists() and not overwrite: + download_package = False + assert local_dir.is_dir(), f"{local_dir} is not a directory" + elif local_dir.exists(): shutil.rmtree(local_dir) - protocol, pkg_name = parse_uri(pkg_uri) - if protocol == Protocol.GCS: - if gcs_aio_client is None: - raise ValueError("GCS client must be provided to download from GCS.") + if download_package: + protocol, pkg_name = parse_uri(pkg_uri) + if protocol == Protocol.GCS: + if gcs_aio_client is None: + raise ValueError( + "GCS client must be provided to download from GCS." + ) - # Download package from the GCS. - code = await gcs_aio_client.internal_kv_get( - pkg_uri.encode(), namespace=None, timeout=None - ) - if os.environ.get(RAY_RUNTIME_ENV_FAIL_DOWNLOAD_FOR_TESTING_ENV_VAR): - code = None - if code is None: - raise IOError( - f"Failed to download runtime_env file package {pkg_uri} " - "from the GCS to the Ray worker node. The package may " - "have prematurely been deleted from the GCS due to a " - "long upload time or a problem with Ray. Try setting the " - "environment variable " - f"{RAY_RUNTIME_ENV_URI_PIN_EXPIRATION_S_ENV_VAR} " - " to a value larger than the upload time in seconds " - "(the default is " - f"{RAY_RUNTIME_ENV_URI_PIN_EXPIRATION_S_DEFAULT}). " - "If this fails, try re-running " - "after making any change to a file in the file package." - ) - code = code or b"" - pkg_file.write_bytes(code) - - if is_zip_uri(pkg_uri): - unzip_package( - package_path=pkg_file, - target_dir=local_dir, - remove_top_level_directory=False, - unlink_zip=True, - logger=logger, + # Download package from the GCS. + code = await gcs_aio_client.internal_kv_get( + pkg_uri.encode(), namespace=None, timeout=None ) - else: - return str(pkg_file) - elif protocol in Protocol.remote_protocols(): - # Download package from remote URI - tp = None - install_warning = ( - "Note that these must be preinstalled " - "on all nodes in the Ray cluster; it is not " - "sufficient to install them in the runtime_env." - ) - - if protocol == Protocol.S3: - try: - import boto3 - from smart_open import open as open_file - except ImportError: - raise ImportError( - "You must `pip install smart_open` and " - "`pip install boto3` to fetch URIs in s3 " - "bucket. " + install_warning + if os.environ.get(RAY_RUNTIME_ENV_FAIL_DOWNLOAD_FOR_TESTING_ENV_VAR): + code = None + if code is None: + raise IOError( + f"Failed to download runtime_env file package {pkg_uri} " + "from the GCS to the Ray worker node. The package may " + "have prematurely been deleted from the GCS due to a " + "long upload time or a problem with Ray. Try setting the " + "environment variable " + f"{RAY_RUNTIME_ENV_URI_PIN_EXPIRATION_S_ENV_VAR} " + " to a value larger than the upload time in seconds " + "(the default is " + f"{RAY_RUNTIME_ENV_URI_PIN_EXPIRATION_S_DEFAULT}). " + "If this fails, try re-running " + "after making any change to a file in the file package." ) - tp = {"client": boto3.client("s3")} - elif protocol == Protocol.GS: - try: - from google.cloud import storage # noqa: F401 - from smart_open import open as open_file - except ImportError: - raise ImportError( - "You must `pip install smart_open` and " - "`pip install google-cloud-storage` " - "to fetch URIs in Google Cloud Storage bucket." - + install_warning + code = code or b"" + pkg_file.write_bytes(code) + + if is_zip_uri(pkg_uri): + unzip_package( + package_path=pkg_file, + target_dir=local_dir, + remove_top_level_directory=False, + unlink_zip=True, + logger=logger, ) - elif protocol == Protocol.FILE: - pkg_uri = pkg_uri[len("file://") :] - - def open_file(uri, mode, *, transport_params=None): - return open(uri, mode) + else: + return str(pkg_file) + elif protocol in Protocol.remote_protocols(): + # Download package from remote URI + tp = None + install_warning = ( + "Note that these must be preinstalled " + "on all nodes in the Ray cluster; it is not " + "sufficient to install them in the runtime_env." + ) - else: - try: - from smart_open import open as open_file - except ImportError: - raise ImportError( - "You must `pip install smart_open` " - f"to fetch {protocol.value.upper()} URIs. " + install_warning + if protocol == Protocol.S3: + try: + import boto3 + from smart_open import open as open_file + except ImportError: + raise ImportError( + "You must `pip install smart_open` and " + "`pip install boto3` to fetch URIs in s3 " + "bucket. " + install_warning + ) + tp = {"client": boto3.client("s3")} + elif protocol == Protocol.GS: + try: + from google.cloud import storage # noqa: F401 + from smart_open import open as open_file + except ImportError: + raise ImportError( + "You must `pip install smart_open` and " + "`pip install google-cloud-storage` " + "to fetch URIs in Google Cloud Storage bucket." + + install_warning + ) + elif protocol == Protocol.FILE: + pkg_uri = pkg_uri[len("file://") :] + + def open_file(uri, mode, *, transport_params=None): + return open(uri, mode) + + else: + try: + from smart_open import open as open_file + except ImportError: + raise ImportError( + "You must `pip install smart_open` " + f"to fetch {protocol.value.upper()} URIs. " + + install_warning + ) + + with open_file(pkg_uri, "rb", transport_params=tp) as package_zip: + with open_file(pkg_file, "wb") as fin: + fin.write(package_zip.read()) + + if pkg_file.suffix in [".zip", ".jar"]: + unzip_package( + package_path=pkg_file, + target_dir=local_dir, + remove_top_level_directory=True, + unlink_zip=True, + logger=logger, + ) + elif pkg_file.suffix == ".whl": + return str(pkg_file) + else: + raise NotImplementedError( + f"Package format {pkg_file.suffix} is ", + "not supported for remote protocols", ) - - with open_file(pkg_uri, "rb", transport_params=tp) as package_zip: - with open_file(pkg_file, "wb") as fin: - fin.write(package_zip.read()) - - if pkg_file.suffix in [".zip", ".jar"]: - unzip_package( - package_path=pkg_file, - target_dir=local_dir, - remove_top_level_directory=True, - unlink_zip=True, - logger=logger, - ) - elif pkg_file.suffix == ".whl": - return str(pkg_file) else: - raise NotImplementedError( - f"Package format {pkg_file.suffix} is ", - "not supported for remote protocols", - ) - else: - raise NotImplementedError(f"Protocol {protocol} is not supported") + raise NotImplementedError(f"Protocol {protocol} is not supported") return str(local_dir) diff --git a/python/ray/_private/runtime_env/working_dir.py b/python/ray/_private/runtime_env/working_dir.py index 51a0d2b91a57..690215323501 100644 --- a/python/ray/_private/runtime_env/working_dir.py +++ b/python/ray/_private/runtime_env/working_dir.py @@ -161,7 +161,11 @@ async def create( logger: logging.Logger = default_logger, ) -> int: local_dir = await download_and_unpack_package( - uri, self._resources_dir, self._gcs_aio_client, logger=logger + uri, + self._resources_dir, + self._gcs_aio_client, + logger=logger, + overwrite=True, ) return get_directory_size_bytes(local_dir) From 086748432e57b3c5c1320aabd94d77db58b264d6 Mon Sep 17 00:00:00 2001 From: ujjawal-khare Date: Wed, 25 Dec 2024 20:16:28 +0530 Subject: [PATCH 06/12] master merge Signed-off-by: ujjawal-khare --- python/ray/_private/runtime_env/packaging.py | 52 +------------------- 1 file changed, 2 insertions(+), 50 deletions(-) diff --git a/python/ray/_private/runtime_env/packaging.py b/python/ray/_private/runtime_env/packaging.py index 092036716e8b..725d1391fb0d 100644 --- a/python/ray/_private/runtime_env/packaging.py +++ b/python/ray/_private/runtime_env/packaging.py @@ -706,7 +706,7 @@ async def download_and_unpack_package( shutil.rmtree(local_dir) if download_package: - protocol, pkg_name = parse_uri(pkg_uri) + protocol, _ = parse_uri(pkg_uri) if protocol == Protocol.GCS: if gcs_aio_client is None: raise ValueError( @@ -747,55 +747,7 @@ async def download_and_unpack_package( else: return str(pkg_file) elif protocol in Protocol.remote_protocols(): - # Download package from remote URI - tp = None - install_warning = ( - "Note that these must be preinstalled " - "on all nodes in the Ray cluster; it is not " - "sufficient to install them in the runtime_env." - ) - - if protocol == Protocol.S3: - try: - import boto3 - from smart_open import open as open_file - except ImportError: - raise ImportError( - "You must `pip install smart_open` and " - "`pip install boto3` to fetch URIs in s3 " - "bucket. " + install_warning - ) - tp = {"client": boto3.client("s3")} - elif protocol == Protocol.GS: - try: - from google.cloud import storage # noqa: F401 - from smart_open import open as open_file - except ImportError: - raise ImportError( - "You must `pip install smart_open` and " - "`pip install google-cloud-storage` " - "to fetch URIs in Google Cloud Storage bucket." - + install_warning - ) - elif protocol == Protocol.FILE: - pkg_uri = pkg_uri[len("file://") :] - - def open_file(uri, mode, *, transport_params=None): - return open(uri, mode) - - else: - try: - from smart_open import open as open_file - except ImportError: - raise ImportError( - "You must `pip install smart_open` " - f"to fetch {protocol.value.upper()} URIs. " - + install_warning - ) - - with open_file(pkg_uri, "rb", transport_params=tp) as package_zip: - with open_file(pkg_file, "wb") as fin: - fin.write(package_zip.read()) + protocol.download_remote_uri(source_uri=pkg_uri, dest_file=pkg_file) if pkg_file.suffix in [".zip", ".jar"]: unzip_package( From 0212f7155020053496f04c2b37e05b432f6745a6 Mon Sep 17 00:00:00 2001 From: ujjawal-khare Date: Fri, 27 Dec 2024 10:21:23 +0530 Subject: [PATCH 07/12] gcs cleanup added Signed-off-by: ujjawal-khare --- python/ray/_private/runtime_env/packaging.py | 14 +++++++++++++- python/ray/_private/runtime_env/plugin.py | 6 +----- python/ray/_private/runtime_env/working_dir.py | 2 +- .../modules/job/tests/test_http_job_server.py | 2 +- python/ray/experimental/internal_kv.py | 3 +++ 5 files changed, 19 insertions(+), 8 deletions(-) diff --git a/python/ray/_private/runtime_env/packaging.py b/python/ray/_private/runtime_env/packaging.py index 725d1391fb0d..2f49e35e4a08 100644 --- a/python/ray/_private/runtime_env/packaging.py +++ b/python/ray/_private/runtime_env/packaging.py @@ -628,6 +628,7 @@ def upload_package_if_needed( package_file = package_file.with_name( f"{time.time_ns()}_{os.getpid()}_{package_file.name}" ) + create_package( module_path, package_file, @@ -696,6 +697,7 @@ async def download_and_unpack_package( logger.debug(f"Fetching package for URI: {pkg_uri}") local_dir = get_local_dir_from_uri(pkg_uri, base_directory) + logger.info("local_dir: %s", local_dir) assert local_dir != pkg_file, "Invalid pkg_file!" download_package: bool = True @@ -703,16 +705,25 @@ async def download_and_unpack_package( download_package = False assert local_dir.is_dir(), f"{local_dir} is not a directory" elif local_dir.exists(): - shutil.rmtree(local_dir) + logger.info(f"local_dir.exists() removing dir {local_dir}") + # Remove existing file and directory + # os.remove(pkg_file) + logger.info(f"Removing {local_dir} with pkg_file {pkg_file}") + # shutil.rmtree(local_dir) + # time.sleep(0.5) if download_package: protocol, _ = parse_uri(pkg_uri) + logger.info( + f"Downloading package from {pkg_uri} to {pkg_file} with protocol {protocol}" + ) if protocol == Protocol.GCS: if gcs_aio_client is None: raise ValueError( "GCS client must be provided to download from GCS." ) + # logger.info(f"Downloading package from {pkg_uri} to {pkg_file}") # Download package from the GCS. code = await gcs_aio_client.internal_kv_get( pkg_uri.encode(), namespace=None, timeout=None @@ -737,6 +748,7 @@ async def download_and_unpack_package( pkg_file.write_bytes(code) if is_zip_uri(pkg_uri): + logger.info(f"Unpacking {pkg_file} to {local_dir}") unzip_package( package_path=pkg_file, target_dir=local_dir, diff --git a/python/ray/_private/runtime_env/plugin.py b/python/ray/_private/runtime_env/plugin.py index b7e1a15db24c..c8da993dc1d5 100644 --- a/python/ray/_private/runtime_env/plugin.py +++ b/python/ray/_private/runtime_env/plugin.py @@ -252,16 +252,12 @@ async def create_for_plugin_if_needed( for uri in uris: if uri not in uri_cache: logger.debug(f"Cache miss for URI {uri}.") - size_bytes = await plugin.create(uri, runtime_env, context, logger=logger) - if plugin.name is None or plugin.name != "working_dir": - uri_cache.add(uri, size_bytes, logger=logger) + await plugin.create(uri, runtime_env, context, logger=logger) else: logger.info( f"Runtime env {plugin.name} {uri} is already installed " "and will be reused. Search " "all runtime_env_setup-*.log to find the corresponding setup log." ) - if plugin.name is None or plugin.name != "working_dir": - uri_cache.mark_used(uri, logger=logger) plugin.modify_context(uris, runtime_env, context, logger) diff --git a/python/ray/_private/runtime_env/working_dir.py b/python/ray/_private/runtime_env/working_dir.py index 690215323501..3cb4023f3ee7 100644 --- a/python/ray/_private/runtime_env/working_dir.py +++ b/python/ray/_private/runtime_env/working_dir.py @@ -165,7 +165,7 @@ async def create( self._resources_dir, self._gcs_aio_client, logger=logger, - overwrite=True, + overwrite=False, ) return get_directory_size_bytes(local_dir) diff --git a/python/ray/dashboard/modules/job/tests/test_http_job_server.py b/python/ray/dashboard/modules/job/tests/test_http_job_server.py index 455ed856fdc4..fdec5d45d859 100644 --- a/python/ray/dashboard/modules/job/tests/test_http_job_server.py +++ b/python/ray/dashboard/modules/job/tests/test_http_job_server.py @@ -398,7 +398,7 @@ def test_ray_tune_basic(job_sdk_client: JobSubmissionClient): runtime_env={"working_dir": DRIVER_SCRIPT_DIR}, ) wait_for_condition( - _check_job_succeeded, timeout=30, client=job_sdk_client, job_id=job_id + _check_job_succeeded, timeout=600, client=job_sdk_client, job_id=job_id ) diff --git a/python/ray/experimental/internal_kv.py b/python/ray/experimental/internal_kv.py index 862ff3bacc89..720e74a3bf3b 100644 --- a/python/ray/experimental/internal_kv.py +++ b/python/ray/experimental/internal_kv.py @@ -91,6 +91,9 @@ def _internal_kv_put( and isinstance(value, bytes) and isinstance(overwrite, bool) ) + # Remove the key if it already exists. + if overwrite: + global_gcs_client.internal_kv_del(key, False, namespace) return global_gcs_client.internal_kv_put(key, value, overwrite, namespace) == 0 From a2cae03721ccb6f750f803340f884b5472eae8ef Mon Sep 17 00:00:00 2001 From: ujjawal-khare Date: Fri, 27 Dec 2024 12:15:04 +0530 Subject: [PATCH 08/12] test added Signed-off-by: ujjawal-khare --- python/ray/_private/runtime_env/packaging.py | 9 +++------ python/ray/_private/runtime_env/working_dir.py | 2 +- .../modules/job/tests/test_http_job_server.py | 2 +- python/ray/experimental/internal_kv.py | 3 --- .../ray/tests/test_runtime_env_working_dir.py | 18 ++++++++++++------ python/ray/train/_internal/storage.py | 2 +- 6 files changed, 18 insertions(+), 18 deletions(-) diff --git a/python/ray/_private/runtime_env/packaging.py b/python/ray/_private/runtime_env/packaging.py index 2f49e35e4a08..96b0255b384f 100644 --- a/python/ray/_private/runtime_env/packaging.py +++ b/python/ray/_private/runtime_env/packaging.py @@ -705,17 +705,14 @@ async def download_and_unpack_package( download_package = False assert local_dir.is_dir(), f"{local_dir} is not a directory" elif local_dir.exists(): - logger.info(f"local_dir.exists() removing dir {local_dir}") - # Remove existing file and directory - # os.remove(pkg_file) logger.info(f"Removing {local_dir} with pkg_file {pkg_file}") - # shutil.rmtree(local_dir) - # time.sleep(0.5) + shutil.rmtree(local_dir) if download_package: protocol, _ = parse_uri(pkg_uri) logger.info( - f"Downloading package from {pkg_uri} to {pkg_file} with protocol {protocol}" + f"Downloading package from {pkg_uri} to {pkg_file} " + f"with protocol {protocol}" ) if protocol == Protocol.GCS: if gcs_aio_client is None: diff --git a/python/ray/_private/runtime_env/working_dir.py b/python/ray/_private/runtime_env/working_dir.py index 3cb4023f3ee7..690215323501 100644 --- a/python/ray/_private/runtime_env/working_dir.py +++ b/python/ray/_private/runtime_env/working_dir.py @@ -165,7 +165,7 @@ async def create( self._resources_dir, self._gcs_aio_client, logger=logger, - overwrite=False, + overwrite=True, ) return get_directory_size_bytes(local_dir) diff --git a/python/ray/dashboard/modules/job/tests/test_http_job_server.py b/python/ray/dashboard/modules/job/tests/test_http_job_server.py index fdec5d45d859..455ed856fdc4 100644 --- a/python/ray/dashboard/modules/job/tests/test_http_job_server.py +++ b/python/ray/dashboard/modules/job/tests/test_http_job_server.py @@ -398,7 +398,7 @@ def test_ray_tune_basic(job_sdk_client: JobSubmissionClient): runtime_env={"working_dir": DRIVER_SCRIPT_DIR}, ) wait_for_condition( - _check_job_succeeded, timeout=600, client=job_sdk_client, job_id=job_id + _check_job_succeeded, timeout=30, client=job_sdk_client, job_id=job_id ) diff --git a/python/ray/experimental/internal_kv.py b/python/ray/experimental/internal_kv.py index 720e74a3bf3b..862ff3bacc89 100644 --- a/python/ray/experimental/internal_kv.py +++ b/python/ray/experimental/internal_kv.py @@ -91,9 +91,6 @@ def _internal_kv_put( and isinstance(value, bytes) and isinstance(overwrite, bool) ) - # Remove the key if it already exists. - if overwrite: - global_gcs_client.internal_kv_del(key, False, namespace) return global_gcs_client.internal_kv_put(key, value, overwrite, namespace) == 0 diff --git a/python/ray/tests/test_runtime_env_working_dir.py b/python/ray/tests/test_runtime_env_working_dir.py index b197eba41644..904bea0ad909 100644 --- a/python/ray/tests/test_runtime_env_working_dir.py +++ b/python/ray/tests/test_runtime_env_working_dir.py @@ -54,19 +54,25 @@ async def test_working_dir_cleanup(tmpdir, ray_start_regular): plugin = WorkingDirPlugin(tmpdir, gcs_aio_client) await plugin.create(HTTPS_PACKAGE_URI, {}, RuntimeEnvContext()) + print(f"tmpdir {tmpdir}") files = os.listdir(f"{tmpdir}/working_dir_files") - file_metadata = os.stat(f"{tmpdir}/working_dir_files/{files[0]}") - creation_time = file_metadata.st_ctime + # Iterate over the files and store the metadata. + + creation_metadata = {} + for file in files: + file_metadata = os.stat(f"{tmpdir}/working_dir_files/{file}") + creation_time = file_metadata.st_ctime + creation_metadata[file] = creation_time time.sleep(1) await plugin.create(HTTPS_PACKAGE_URI, {}, RuntimeEnvContext()) files = os.listdir(f"{tmpdir}/working_dir_files") - file_metadata = os.stat(f"{tmpdir}/working_dir_files/{files[0]}") - creation_time_after = file_metadata.st_ctime - - assert creation_time != creation_time_after + for file in files: + file_metadata = os.stat(f"{tmpdir}/working_dir_files/{file}") + creation_time_after = file_metadata.st_ctime + assert creation_metadata[file] != creation_time_after @pytest.mark.asyncio diff --git a/python/ray/train/_internal/storage.py b/python/ray/train/_internal/storage.py index 05970988862e..46119176c713 100644 --- a/python/ray/train/_internal/storage.py +++ b/python/ray/train/_internal/storage.py @@ -250,7 +250,7 @@ def _list_at_fs_path( selector = pyarrow.fs.FileSelector(fs_path, allow_not_found=True, recursive=False) return [ - os.path.relpath(file_info.path.lstrip("/"), start=fs_path.lstrip("/")) + os.path.relpath(os.path.abspath(file_info.path), start=os.path.abspath(fs_path)) for file_info in fs.get_file_info(selector) if file_filter(file_info) ] From e2951b4bc26d1ccef4f09e917db99ac8073185ba Mon Sep 17 00:00:00 2001 From: ujjawal-khare Date: Fri, 27 Dec 2024 12:18:41 +0530 Subject: [PATCH 09/12] cleanup Signed-off-by: ujjawal-khare --- python/ray/_private/runtime_env/packaging.py | 2 -- python/ray/_private/runtime_env/plugin.py | 4 +++- python/ray/tests/test_runtime_env_working_dir.py | 2 +- 3 files changed, 4 insertions(+), 4 deletions(-) diff --git a/python/ray/_private/runtime_env/packaging.py b/python/ray/_private/runtime_env/packaging.py index 96b0255b384f..40537cd40859 100644 --- a/python/ray/_private/runtime_env/packaging.py +++ b/python/ray/_private/runtime_env/packaging.py @@ -697,7 +697,6 @@ async def download_and_unpack_package( logger.debug(f"Fetching package for URI: {pkg_uri}") local_dir = get_local_dir_from_uri(pkg_uri, base_directory) - logger.info("local_dir: %s", local_dir) assert local_dir != pkg_file, "Invalid pkg_file!" download_package: bool = True @@ -720,7 +719,6 @@ async def download_and_unpack_package( "GCS client must be provided to download from GCS." ) - # logger.info(f"Downloading package from {pkg_uri} to {pkg_file}") # Download package from the GCS. code = await gcs_aio_client.internal_kv_get( pkg_uri.encode(), namespace=None, timeout=None diff --git a/python/ray/_private/runtime_env/plugin.py b/python/ray/_private/runtime_env/plugin.py index c8da993dc1d5..1b412f82913f 100644 --- a/python/ray/_private/runtime_env/plugin.py +++ b/python/ray/_private/runtime_env/plugin.py @@ -252,12 +252,14 @@ async def create_for_plugin_if_needed( for uri in uris: if uri not in uri_cache: logger.debug(f"Cache miss for URI {uri}.") - await plugin.create(uri, runtime_env, context, logger=logger) + size_bytes = await plugin.create(uri, runtime_env, context, logger=logger) + uri_cache.add(uri, size_bytes, logger=logger) else: logger.info( f"Runtime env {plugin.name} {uri} is already installed " "and will be reused. Search " "all runtime_env_setup-*.log to find the corresponding setup log." ) + uri_cache.mark_used(uri, logger=logger) plugin.modify_context(uris, runtime_env, context, logger) diff --git a/python/ray/tests/test_runtime_env_working_dir.py b/python/ray/tests/test_runtime_env_working_dir.py index 904bea0ad909..8b5c04df767d 100644 --- a/python/ray/tests/test_runtime_env_working_dir.py +++ b/python/ray/tests/test_runtime_env_working_dir.py @@ -56,8 +56,8 @@ async def test_working_dir_cleanup(tmpdir, ray_start_regular): print(f"tmpdir {tmpdir}") files = os.listdir(f"{tmpdir}/working_dir_files") - # Iterate over the files and store the metadata. + # Iterate over the files and storing creation metadata. creation_metadata = {} for file in files: file_metadata = os.stat(f"{tmpdir}/working_dir_files/{file}") From 528fae345bafc65edd04130824fcf5869e99492f Mon Sep 17 00:00:00 2001 From: ujjawal-khare Date: Fri, 27 Dec 2024 12:20:42 +0530 Subject: [PATCH 10/12] cleanup Signed-off-by: ujjawal-khare --- python/ray/_private/runtime_env/packaging.py | 1 - python/ray/_private/runtime_env/plugin.py | 1 - 2 files changed, 2 deletions(-) diff --git a/python/ray/_private/runtime_env/packaging.py b/python/ray/_private/runtime_env/packaging.py index 40537cd40859..87412e06abf1 100644 --- a/python/ray/_private/runtime_env/packaging.py +++ b/python/ray/_private/runtime_env/packaging.py @@ -743,7 +743,6 @@ async def download_and_unpack_package( pkg_file.write_bytes(code) if is_zip_uri(pkg_uri): - logger.info(f"Unpacking {pkg_file} to {local_dir}") unzip_package( package_path=pkg_file, target_dir=local_dir, diff --git a/python/ray/_private/runtime_env/plugin.py b/python/ray/_private/runtime_env/plugin.py index 1b412f82913f..a1e03a507b59 100644 --- a/python/ray/_private/runtime_env/plugin.py +++ b/python/ray/_private/runtime_env/plugin.py @@ -241,7 +241,6 @@ async def create_for_plugin_if_needed( uris = plugin.get_uris(runtime_env) - logger.info(f"Setting up runtime env {plugin.name} with URIs {uris}.") if not uris: logger.debug( f"No URIs for runtime env plugin {plugin.name}; " From 368eeafe1a9a18f6181442531a6a769fc85c99e7 Mon Sep 17 00:00:00 2001 From: ujjawal-khare Date: Fri, 3 Jan 2025 08:58:38 +0530 Subject: [PATCH 11/12] lint fix Signed-off-by: ujjawal-khare --- python/ray/tests/test_runtime_env_working_dir.py | 1 - 1 file changed, 1 deletion(-) diff --git a/python/ray/tests/test_runtime_env_working_dir.py b/python/ray/tests/test_runtime_env_working_dir.py index 8b5c04df767d..35f9d1390c40 100644 --- a/python/ray/tests/test_runtime_env_working_dir.py +++ b/python/ray/tests/test_runtime_env_working_dir.py @@ -54,7 +54,6 @@ async def test_working_dir_cleanup(tmpdir, ray_start_regular): plugin = WorkingDirPlugin(tmpdir, gcs_aio_client) await plugin.create(HTTPS_PACKAGE_URI, {}, RuntimeEnvContext()) - print(f"tmpdir {tmpdir}") files = os.listdir(f"{tmpdir}/working_dir_files") # Iterate over the files and storing creation metadata. From fecfe090262c6cfcbcb1868f989ff17ae454cb6f Mon Sep 17 00:00:00 2001 From: ujjawal-khare Date: Mon, 13 Jan 2025 23:57:41 +0530 Subject: [PATCH 12/12] storage file Signed-off-by: ujjawal-khare --- python/ray/train/_internal/storage.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/python/ray/train/_internal/storage.py b/python/ray/train/_internal/storage.py index 46119176c713..05970988862e 100644 --- a/python/ray/train/_internal/storage.py +++ b/python/ray/train/_internal/storage.py @@ -250,7 +250,7 @@ def _list_at_fs_path( selector = pyarrow.fs.FileSelector(fs_path, allow_not_found=True, recursive=False) return [ - os.path.relpath(os.path.abspath(file_info.path), start=os.path.abspath(fs_path)) + os.path.relpath(file_info.path.lstrip("/"), start=fs_path.lstrip("/")) for file_info in fs.get_file_info(selector) if file_filter(file_info) ]