From 2c8b6bbab56d7cf92d50033d7958f8dda28e9b21 Mon Sep 17 00:00:00 2001 From: "Dr. Sascha Gerloff" Date: Fri, 26 Jan 2024 10:19:39 +0100 Subject: [PATCH 01/10] refactor: fix typo --- rocket/rocket.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/rocket/rocket.py b/rocket/rocket.py index fe98217..0f4469d 100644 --- a/rocket/rocket.py +++ b/rocket/rocket.py @@ -26,7 +26,7 @@ def setup(self): Initialize the application. """ if os.path.exists("setup.py") or os.path.exists(f"pyproject.toml"): - logger.info("Packaing file already exists so no need to create a new one") + logger.info("Packaging file already exists so no need to create a new one") return content = """ From 988482d0559fa2cdec2661c040e628647f7506a5 Mon Sep 17 00:00:00 2001 From: "Dr. Sascha Gerloff" Date: Fri, 26 Jan 2024 10:22:15 +0100 Subject: [PATCH 02/10] refactor: complete doc string from rocket launch --- rocket/rocket.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/rocket/rocket.py b/rocket/rocket.py index 0f4469d..b1d0e46 100644 --- a/rocket/rocket.py +++ b/rocket/rocket.py @@ -55,7 +55,7 @@ def launch( ): """ Entrypoint of the application, triggers a build and deploy - :param project_location: + :param project_location: path to project code, default: `"."` :param dbfs_path: path where the wheel will be stored, ex: dbfs:/tmp/myteam/myproject :param watch: Set to false if you don't want to automatically sync your files :return: From 84ce7388c2f87bb1038373316a313b674ed84d35 Mon Sep 17 00:00:00 2001 From: "Dr. 
Sascha Gerloff" Date: Fri, 26 Jan 2024 10:31:22 +0100 Subject: [PATCH 03/10] refactor: improve type hints --- rocket/rocket.py | 25 +++++++++++++++++-------- 1 file changed, 17 insertions(+), 8 deletions(-) diff --git a/rocket/rocket.py b/rocket/rocket.py index b1d0e46..9841ff2 100644 --- a/rocket/rocket.py +++ b/rocket/rocket.py @@ -1,5 +1,5 @@ import os -from typing import Optional +from typing import Optional, List import fire @@ -51,8 +51,8 @@ def launch( self, project_location: str = ".", dbfs_path: Optional[str] = None, - watch=True, - ): + watch: bool = True + ) -> None: """ Entrypoint of the application, triggers a build and deploy :param project_location: path to project code, default: `"."` @@ -93,8 +93,12 @@ def launch( watcher.start() def _build_and_deploy( - self, watch, project_location, dbfs_path, modified_files=None - ): + self, + watch: bool, + project_location: str, + dbfs_path: str, + modified_files: Optional[List[str]] = None + ) -> None: if modified_files: logger.info(f"Found changes in {modified_files}. Overwriting them.") self._deploy( @@ -193,15 +197,20 @@ def _build_and_deploy( %autoreload 2""" ) - def _deploy(self, file_paths, dbfs_path, project_location): - def helper(file): + def _deploy( + self, + file_paths: List[str], + dbfs_path: str, + project_location: str + ) -> None: + def helper(file: str) -> None: target_path = f"{dbfs_path}/{os.path.relpath(file, project_location)}" execute_shell_command(f"databricks fs cp --overwrite {file} {target_path}") logger.info(f"Uploaded {file} to {target_path}") execute_for_each_multithreaded(file_paths, lambda x: helper(x)) - def _create_python_project_wheel(self, project_location): + def _create_python_project_wheel(self, project_location: str) -> (str, str): dist_location = f"{project_location}/dist" execute_shell_command(f"rm {dist_location}/* 2>/dev/null || true") From 9497aa2fa6409a24c2add3cb4f835191adeae0a0 Mon Sep 17 00:00:00 2001 From: "Dr. 
Sascha Gerloff" Date: Fri, 26 Jan 2024 10:59:04 +0100 Subject: [PATCH 04/10] feat: add glob_path as parameter to specify files to upload and watch --- rocket/rocket.py | 19 +++++++++++++++---- 1 file changed, 15 insertions(+), 4 deletions(-) diff --git a/rocket/rocket.py b/rocket/rocket.py index 9841ff2..b43e6a3 100644 --- a/rocket/rocket.py +++ b/rocket/rocket.py @@ -1,5 +1,6 @@ import os -from typing import Optional, List +import glob +from typing import Optional, List, Union import fire @@ -51,13 +52,15 @@ def launch( self, project_location: str = ".", dbfs_path: Optional[str] = None, - watch: bool = True + watch: bool = True, + glob_path: Optional[Union[str, List[str]]] = None ) -> None: """ Entrypoint of the application, triggers a build and deploy :param project_location: path to project code, default: `"."` :param dbfs_path: path where the wheel will be stored, ex: dbfs:/tmp/myteam/myproject :param watch: Set to false if you don't want to automatically sync your files + :param glob_path: glob string or list of strings for additional files to deploy, e.g. "*.json" :return: """ if os.getenv("DATABRICKS_TOKEN") is None: @@ -79,7 +82,7 @@ def launch( project_name = os.path.abspath(project_location).split("/")[-1] dbfs_path = f"{dbfs_path}/{project_name}" - self._build_and_deploy(watch, project_location, dbfs_path) + self._build_and_deploy(watch=watch, project_location=project_location, dbfs_path=dbfs_path, glob_path=glob_path) if watch: watcher = FileWatcher( project_location, @@ -88,6 +91,7 @@ def launch( modified_files=watcher.modified_files, dbfs_path=dbfs_path, project_location=project_location, + glob_path=glob_path ), ) watcher.start() @@ -97,7 +101,8 @@ def _build_and_deploy( watch: bool, project_location: str, dbfs_path: str, - modified_files: Optional[List[str]] = None + modified_files: Optional[List[str]] = None, + glob_path: Optional[Union[str, List[str]]] = None ) -> None: if modified_files: logger.info(f"Found changes in {modified_files}. 
Overwriting them.") @@ -143,6 +148,12 @@ def _build_and_deploy( for file in extract_python_files_from_folder(package_dir): files.append(file) + if isinstance(glob_path, str): + files.extend(glob.glob(os.path.join(project_location, glob_path))) + elif isinstance(glob_path, list): + for path in glob_path: + files.extend(glob.glob(os.path.join(project_location, path))) + project_files = ["setup.py", "pyproject.toml"] for project_file in project_files: if os.path.exists(f"{project_location}/{project_file}"): From 3a5fdf53c50e6619e2c4431d4768be38c7afc96b Mon Sep 17 00:00:00 2001 From: "Dr. Sascha Gerloff" Date: Fri, 26 Jan 2024 12:09:24 +0100 Subject: [PATCH 05/10] bump minor version --- setup.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/setup.py b/setup.py index 27d1429..9edb298 100644 --- a/setup.py +++ b/setup.py @@ -9,7 +9,7 @@ setuptools.setup( name="databricks-rocket", - version="2.0.4", + version="2.1.0", author="GetYourGuide", author_email="engineering.data-products@getyourguide.com", description="Keep your local python scripts installed and in sync with a databricks notebook. Shortens the feedback loop to develop projects using a hybrid enviroment", From ade5fdfa355e615821bd24036f15a90deebcdf5a Mon Sep 17 00:00:00 2001 From: "Dr. 
Sascha Gerloff" Date: Fri, 26 Jan 2024 12:15:28 +0100 Subject: [PATCH 06/10] add to readme documentation to advertise the feature --- README.md | 11 +++++++++++ 1 file changed, 11 insertions(+) diff --git a/README.md b/README.md index 59aef79..680c3d9 100644 --- a/README.md +++ b/README.md @@ -70,6 +70,17 @@ and following in a new Python cell: Finally, add the content in you databricks notebook: ![imgs/img_2.png](imgs/img_2.png) +#### Include non-python files +Upload all root level json files: +```shell +rocket launch --glob_path="*.json" +``` +On top also upload all env files: +```shell +rocket launch --glob_path="[\"*.json\", \".env*\"]" +``` +When specifying lists, be mindful about the formatting of the parameter string. + ### To Upload Your Python Package If you've disabled the watch feature, `databricks-rocket` will only upload your project as a wheel to DBFS: From 41395be08179f0390188311a1533850451f8ed89 Mon Sep 17 00:00:00 2001 From: "Dr. Sascha Gerloff" Date: Fri, 26 Jan 2024 12:16:26 +0100 Subject: [PATCH 07/10] add entry to changelog --- CHANGELOG.md | 3 +++ 1 file changed, 3 insertions(+) diff --git a/CHANGELOG.md b/CHANGELOG.md index 1372ad5..cbc8bae 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,5 +1,8 @@ # Changelog db-rocket +## Version 2.1.0 +- New parameter for ``rocket launch --glob_path=<...>``, which allows specifying a list of globs for files to deploy during launch. + ## Version 2.0.4 - Update version number. From 7b958664b822d8610909686ec5f47c625fba8494 Mon Sep 17 00:00:00 2001 From: "Dr. 
Sascha Gerloff" Date: Fri, 26 Jan 2024 13:29:10 +0100 Subject: [PATCH 08/10] enable recursive upload from databricks cli --- rocket/rocket.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/rocket/rocket.py b/rocket/rocket.py index b43e6a3..965416f 100644 --- a/rocket/rocket.py +++ b/rocket/rocket.py @@ -216,7 +216,7 @@ def _deploy( ) -> None: def helper(file: str) -> None: target_path = f"{dbfs_path}/{os.path.relpath(file, project_location)}" - execute_shell_command(f"databricks fs cp --overwrite {file} {target_path}") + execute_shell_command(f"databricks fs cp --recursive --overwrite {file} {target_path}") logger.info(f"Uploaded {file} to {target_path}") execute_for_each_multithreaded(file_paths, lambda x: helper(x)) From 64aa7db927e53d794158a0b55cfd31d6f7786185 Mon Sep 17 00:00:00 2001 From: "Dr. Sascha Gerloff" Date: Fri, 26 Jan 2024 14:03:35 +0100 Subject: [PATCH 09/10] fix: avoid duplicates in files to upload --- rocket/rocket.py | 15 +++++++-------- 1 file changed, 7 insertions(+), 8 deletions(-) diff --git a/rocket/rocket.py b/rocket/rocket.py index 965416f..3caa6ae 100644 --- a/rocket/rocket.py +++ b/rocket/rocket.py @@ -143,21 +143,20 @@ def _build_and_deploy( return package_dirs = extract_python_package_dirs(project_location) - files = [] + files = set() for package_dir in package_dirs: - for file in extract_python_files_from_folder(package_dir): - files.append(file) + files.update(extract_python_files_from_folder(package_dir)) if isinstance(glob_path, str): - files.extend(glob.glob(os.path.join(project_location, glob_path))) + files.update(glob.glob(os.path.join(project_location, glob_path))) elif isinstance(glob_path, list): for path in glob_path: - files.extend(glob.glob(os.path.join(project_location, path))) + files.update(glob.glob(os.path.join(project_location, path))) project_files = ["setup.py", "pyproject.toml"] for project_file in project_files: if os.path.exists(f"{project_location}/{project_file}"): - 
files.append(f"{project_location}/{project_file}") + files.add(f"{project_location}/{project_file}") if os.path.exists(f"{project_location}/pyproject.toml"): execute_shell_command( @@ -171,7 +170,7 @@ def _build_and_deploy( for dependency_file in dependency_files: dependency_file_path = f"{project_location}/{dependency_file}" if os.path.exists(dependency_file_path): - files.append(dependency_file_path) + files.add(dependency_file_path) uploaded_dependency_file = dependency_file dependency_file_exist = True with open(dependency_file_path) as f: @@ -179,7 +178,7 @@ def _build_and_deploy( line.strip() for line in f.readlines() if "index-url" in line ] self._deploy( - file_paths=files, dbfs_path=dbfs_path, project_location=project_location + file_paths=list(files), dbfs_path=dbfs_path, project_location=project_location ) install_path = f'{dbfs_path.replace("dbfs:/", "/dbfs/")}' From a32ed236e0eff3b147b3a95b51724755f607f29b Mon Sep 17 00:00:00 2001 From: "Dr. Sascha Gerloff" Date: Fri, 26 Jan 2024 14:57:33 +0100 Subject: [PATCH 10/10] fix: handle watching files tracked by user defined glob; current solution is bruteforce --- rocket/file_watcher.py | 22 ++++++++++++++++------ rocket/rocket.py | 21 +++++++++++++-------- rocket/utils.py | 9 +++++++++ 3 files changed, 38 insertions(+), 14 deletions(-) diff --git a/rocket/file_watcher.py b/rocket/file_watcher.py index b6f70ed..a767a52 100644 --- a/rocket/file_watcher.py +++ b/rocket/file_watcher.py @@ -1,9 +1,13 @@ +import glob import os import time +from typing import List from watchdog.events import FileSystemEventHandler from watchdog.observers import Observer +from rocket.utils import gather_glob_paths + class FileWatcher: class _Handler(FileSystemEventHandler): @@ -11,17 +15,23 @@ def __init__(self, watcher_instance): self.watcher_instance = watcher_instance def on_modified(self, event): - if event.is_directory: + _current_glob_files = gather_glob_paths(self.watcher_instance.glob_paths) + if event.src_path in 
_current_glob_files: + self.watcher_instance.modified_files.add(event.src_path) + elif event.is_directory: return - if os.path.splitext(event.src_path)[1] == ".py": - self.watcher_instance.modified_files.append(event.src_path) + elif os.path.splitext(event.src_path)[1] == ".py": + self.watcher_instance.modified_files.add(event.src_path) - def __init__(self, path_to_watch, callback, recursive=True): + def __init__(self, path_to_watch, callback, recursive=True, glob_paths: List[str] = None): self.path_to_watch = path_to_watch self.callback = callback self.recursive = recursive self.observer = Observer() - self.modified_files = [] + self.modified_files = set() + self.glob_paths = glob_paths + if self.glob_paths is None: + self.glob_paths = [] self.handler = self._Handler(self) def start(self): @@ -33,7 +43,7 @@ def start(self): while True: time.sleep(1) if self.modified_files: - self.callback(self.modified_files) + self.callback(list(self.modified_files)) self.modified_files.clear() except KeyboardInterrupt: self.observer.stop() diff --git a/rocket/rocket.py b/rocket/rocket.py index 3caa6ae..dd1e789 100644 --- a/rocket/rocket.py +++ b/rocket/rocket.py @@ -11,6 +11,7 @@ extract_python_package_dirs, extract_python_files_from_folder, execute_for_each_multithreaded, + gather_glob_paths, ) @@ -82,7 +83,13 @@ def launch( project_name = os.path.abspath(project_location).split("/")[-1] dbfs_path = f"{dbfs_path}/{project_name}" - self._build_and_deploy(watch=watch, project_location=project_location, dbfs_path=dbfs_path, glob_path=glob_path) + glob_paths = [] + if isinstance(glob_path, str): + glob_paths = [os.path.join(project_location, glob_path)] + elif isinstance(glob_path, list): + glob_paths = [os.path.join(project_location, path) for path in glob_path] + + self._build_and_deploy(watch=watch, project_location=project_location, dbfs_path=dbfs_path, glob_paths=glob_paths) if watch: watcher = FileWatcher( project_location, @@ -91,8 +98,9 @@ def launch( 
modified_files=watcher.modified_files, dbfs_path=dbfs_path, project_location=project_location, - glob_path=glob_path + glob_paths=glob_path ), + glob_paths=glob_paths, ) watcher.start() @@ -102,7 +110,7 @@ def _build_and_deploy( project_location: str, dbfs_path: str, modified_files: Optional[List[str]] = None, - glob_path: Optional[Union[str, List[str]]] = None + glob_paths: Optional[List[str]] = None ) -> None: if modified_files: logger.info(f"Found changes in {modified_files}. Overwriting them.") @@ -147,11 +155,8 @@ def _build_and_deploy( for package_dir in package_dirs: files.update(extract_python_files_from_folder(package_dir)) - if isinstance(glob_path, str): - files.update(glob.glob(os.path.join(project_location, glob_path))) - elif isinstance(glob_path, list): - for path in glob_path: - files.update(glob.glob(os.path.join(project_location, path))) + if glob_paths is not None: + files.update(gather_glob_paths(glob_paths)) project_files = ["setup.py", "pyproject.toml"] for project_file in project_files: diff --git a/rocket/utils.py b/rocket/utils.py index 4bfe6e4..1687d22 100644 --- a/rocket/utils.py +++ b/rocket/utils.py @@ -1,7 +1,9 @@ import concurrent.futures +import glob import os import subprocess +from typing import List, Set from rocket.logger import logger @@ -53,3 +55,10 @@ def extract_python_files_from_folder(path): py_files.append(os.path.join(root, file)) return py_files + + +def gather_glob_paths(glob_paths: List[str]) -> Set[str]: + _unique_paths = set() + for glob_path in glob_paths: + _unique_paths.update(glob.glob(glob_path)) + return _unique_paths