diff --git a/llm_engineering/interfaces/orchestrator/pipelines/__init__.py b/pipelines/__init__.py similarity index 100% rename from llm_engineering/interfaces/orchestrator/pipelines/__init__.py rename to pipelines/__init__.py diff --git a/llm_engineering/interfaces/orchestrator/pipelines/digital_data_etl.py b/pipelines/digital_data_etl.py similarity index 67% rename from llm_engineering/interfaces/orchestrator/pipelines/digital_data_etl.py rename to pipelines/digital_data_etl.py index c765b1b..0b5b041 100644 --- a/llm_engineering/interfaces/orchestrator/pipelines/digital_data_etl.py +++ b/pipelines/digital_data_etl.py @@ -1,6 +1,6 @@ from zenml import pipeline -from llm_engineering.interfaces.orchestrator.steps.etl import crawl_links, get_or_create_user +from steps.etl import crawl_links, get_or_create_user @pipeline diff --git a/llm_engineering/interfaces/orchestrator/pipelines/export_artifact_to_json.py b/pipelines/export_artifact_to_json.py similarity index 86% rename from llm_engineering/interfaces/orchestrator/pipelines/export_artifact_to_json.py rename to pipelines/export_artifact_to_json.py index 4924e77..91a0490 100644 --- a/llm_engineering/interfaces/orchestrator/pipelines/export_artifact_to_json.py +++ b/pipelines/export_artifact_to_json.py @@ -3,7 +3,7 @@ from zenml import pipeline from zenml.client import Client -from llm_engineering.interfaces.orchestrator.steps import export as export_steps +from steps import export as export_steps @pipeline diff --git a/llm_engineering/interfaces/orchestrator/pipelines/feature_engineering.py b/pipelines/feature_engineering.py similarity index 81% rename from llm_engineering/interfaces/orchestrator/pipelines/feature_engineering.py rename to pipelines/feature_engineering.py index 962d32d..8d58b35 100644 --- a/llm_engineering/interfaces/orchestrator/pipelines/feature_engineering.py +++ b/pipelines/feature_engineering.py @@ -1,6 +1,6 @@ from zenml import pipeline -from llm_engineering.interfaces.orchestrator.steps import feature_engineering as fe_steps +from steps import feature_engineering as fe_steps @pipeline diff --git a/llm_engineering/interfaces/orchestrator/pipelines/generate_instruct_datasets.py b/pipelines/generate_instruct_datasets.py similarity index 84% rename from llm_engineering/interfaces/orchestrator/pipelines/generate_instruct_datasets.py rename to pipelines/generate_instruct_datasets.py index 3a14072..10492ea 100644 --- a/llm_engineering/interfaces/orchestrator/pipelines/generate_instruct_datasets.py +++ b/pipelines/generate_instruct_datasets.py @@ -1,6 +1,6 @@ from zenml import pipeline -from llm_engineering.interfaces.orchestrator.steps import generate_instruct_datasets as cd_steps +from steps import generate_instruct_datasets as cd_steps @pipeline diff --git a/llm_engineering/interfaces/orchestrator/pipelines/training.py b/pipelines/training.py similarity index 89% rename from llm_engineering/interfaces/orchestrator/pipelines/training.py rename to pipelines/training.py index c0ff534..decba50 100644 --- a/llm_engineering/interfaces/orchestrator/pipelines/training.py +++ b/pipelines/training.py @@ -1,7 +1,7 @@ from zenml import pipeline from zenml.client import Client -from llm_engineering.interfaces.orchestrator.steps import training as training_steps +from steps import training as training_steps @pipeline diff --git a/poetry.lock b/poetry.lock index 6e1e501..fdd6509 100644 --- a/poetry.lock +++ b/poetry.lock @@ -1289,6 +1289,16 @@ files = [ {file = "hpack-4.0.0.tar.gz", hash = "sha256:fc41de0c63e687ebffde81187a948221294896f6bdc0ae2312708df339430095"}, ] +[[package]] +name = "html2text" +version = "2024.2.26" +description = "Turn HTML into equivalent Markdown-structured text." +optional = false +python-versions = ">=3.8" +files = [ + {file = "html2text-2024.2.26.tar.gz", hash = "sha256:05f8e367d15aaabc96415376776cdd11afd5127a77fce6e36afc60c563ca2c32"}, +] + [[package]] name = "httpcore" version = "1.0.5" @@ -5438,4 +5448,4 @@ test = ["big-O", "importlib-resources", "jaraco.functools", "jaraco.itertools", [metadata] lock-version = "2.0" python-versions = "~3.11" -content-hash = "e3b22d97c2ba0bb0af25619a6bf8bd441547b9ac07a9752136d65d0007517ad3" +content-hash = "bcecf4160348020e4e0d8dedbfa642eec3185e6d0389f4d5c54ddce6343883d9" diff --git a/pyproject.toml b/pyproject.toml index 218f336..81a106a 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -21,6 +21,7 @@ numpy = "^1.26.4" selenium = "^4.21.0" webdriver-manager = "^4.0.1" beautifulsoup4 = "^4.12.3" +html2text = "^2024.2.26" jmespath = "^1.0.1" chromedriver-autoinstaller = "^0.6.4" @@ -59,7 +60,7 @@ build-backend = "poetry.core.masonry.api" [tool.poe.tasks] # Pipelines -run-digital-data-etl-alex = "python -m llm_engineering.interfaces.orchestrator.run --run-etl --no-cache --etl-config-filename digital_data_etl_alex_vesa.yaml" +run-digital-data-etl-alex = "python run.py --run-etl --no-cache --etl-config-filename digital_data_etl_alex_vesa.yaml" run-digital-data-etl-maxime = "python -m llm_engineering.interfaces.orchestrator.run --run-etl --no-cache --etl-config-filename digital_data_etl_maxime_labonne.yaml" run-digital-data-etl-paul = "python -m llm_engineering.interfaces.orchestrator.run --run-etl --no-cache --etl-config-filename digital_data_etl_paul_iusztin.yaml" run-digital-data-etl = [ @@ -93,6 +94,10 @@ local-infrastructure-down = [ "local-zenml-server-down", ] +# ZenML +set-local-stack = "zenml stack set default" +set-aws-stack = "zenml stack set aws-stack" + # Docker build-docker-image = "docker buildx build --platform linux/amd64 -t llmtwin -f Dockerfile ." run-docker-digital-data-etl = "docker run --rm --network host --shm-size=2g --env-file .env llmtwin poetry poe run-digital-data-etl" diff --git a/llm_engineering/interfaces/orchestrator/run.py b/run.py similarity index 96% rename from llm_engineering/interfaces/orchestrator/run.py rename to run.py index dcc804b..cbbd8c6 100644 --- a/llm_engineering/interfaces/orchestrator/run.py +++ b/run.py @@ -3,7 +3,7 @@ import click -from llm_engineering.interfaces.orchestrator.pipelines import ( +from pipelines import ( digital_data_etl, export_artifact_to_json, feature_engineering, @@ -102,7 +102,8 @@ def main( pipeline_args = { "enable_cache": not no_cache, } - root_dir = Path(__file__).resolve().parent.parent.parent.parent + # root_dir = Path(__file__).resolve().parent.parent.parent.parent + root_dir = Path(__file__).resolve().parent if run_etl: run_args_etl = {} diff --git a/llm_engineering/interfaces/orchestrator/steps/__init__.py b/steps/__init__.py similarity index 100% rename from llm_engineering/interfaces/orchestrator/steps/__init__.py rename to steps/__init__.py diff --git a/llm_engineering/interfaces/orchestrator/steps/etl/__init__.py b/steps/etl/__init__.py similarity index 100% rename from llm_engineering/interfaces/orchestrator/steps/etl/__init__.py rename to steps/etl/__init__.py diff --git a/llm_engineering/interfaces/orchestrator/steps/etl/crawl_links.py b/steps/etl/crawl_links.py similarity index 100% rename from llm_engineering/interfaces/orchestrator/steps/etl/crawl_links.py rename to steps/etl/crawl_links.py diff --git a/llm_engineering/interfaces/orchestrator/steps/etl/get_or_create_user.py b/steps/etl/get_or_create_user.py similarity index 100% rename from llm_engineering/interfaces/orchestrator/steps/etl/get_or_create_user.py rename to steps/etl/get_or_create_user.py diff --git a/llm_engineering/interfaces/orchestrator/steps/export/__init__.py b/steps/export/__init__.py similarity index 100% rename from llm_engineering/interfaces/orchestrator/steps/export/__init__.py rename to steps/export/__init__.py diff --git a/llm_engineering/interfaces/orchestrator/steps/export/serialize_artifact.py b/steps/export/serialize_artifact.py similarity index 100% rename from llm_engineering/interfaces/orchestrator/steps/export/serialize_artifact.py rename to steps/export/serialize_artifact.py diff --git a/llm_engineering/interfaces/orchestrator/steps/export/to_json.py b/steps/export/to_json.py similarity index 100% rename from llm_engineering/interfaces/orchestrator/steps/export/to_json.py rename to steps/export/to_json.py diff --git a/llm_engineering/interfaces/orchestrator/steps/feature_engineering/__init__.py b/steps/feature_engineering/__init__.py similarity index 100% rename from llm_engineering/interfaces/orchestrator/steps/feature_engineering/__init__.py rename to steps/feature_engineering/__init__.py diff --git a/llm_engineering/interfaces/orchestrator/steps/feature_engineering/clean.py b/steps/feature_engineering/clean.py similarity index 100% rename from llm_engineering/interfaces/orchestrator/steps/feature_engineering/clean.py rename to steps/feature_engineering/clean.py diff --git a/llm_engineering/interfaces/orchestrator/steps/feature_engineering/load_to_vector_db.py b/steps/feature_engineering/load_to_vector_db.py similarity index 100% rename from llm_engineering/interfaces/orchestrator/steps/feature_engineering/load_to_vector_db.py rename to steps/feature_engineering/load_to_vector_db.py diff --git a/llm_engineering/interfaces/orchestrator/steps/feature_engineering/query_data_warehouse.py b/steps/feature_engineering/query_data_warehouse.py similarity index 100% rename from llm_engineering/interfaces/orchestrator/steps/feature_engineering/query_data_warehouse.py rename to steps/feature_engineering/query_data_warehouse.py diff --git a/llm_engineering/interfaces/orchestrator/steps/feature_engineering/rag.py b/steps/feature_engineering/rag.py similarity index 100% rename from llm_engineering/interfaces/orchestrator/steps/feature_engineering/rag.py rename to steps/feature_engineering/rag.py diff --git a/llm_engineering/interfaces/orchestrator/steps/generate_instruct_datasets/__init__.py b/steps/generate_instruct_datasets/__init__.py similarity index 100% rename from llm_engineering/interfaces/orchestrator/steps/generate_instruct_datasets/__init__.py rename to steps/generate_instruct_datasets/__init__.py diff --git a/llm_engineering/interfaces/orchestrator/steps/generate_instruct_datasets/create_prompts.py b/steps/generate_instruct_datasets/create_prompts.py similarity index 100% rename from llm_engineering/interfaces/orchestrator/steps/generate_instruct_datasets/create_prompts.py rename to steps/generate_instruct_datasets/create_prompts.py diff --git a/llm_engineering/interfaces/orchestrator/steps/generate_instruct_datasets/generate.py b/steps/generate_instruct_datasets/generate.py similarity index 100% rename from llm_engineering/interfaces/orchestrator/steps/generate_instruct_datasets/generate.py rename to steps/generate_instruct_datasets/generate.py diff --git a/llm_engineering/interfaces/orchestrator/steps/generate_instruct_datasets/push_to_huggingface.py b/steps/generate_instruct_datasets/push_to_huggingface.py similarity index 100% rename from llm_engineering/interfaces/orchestrator/steps/generate_instruct_datasets/push_to_huggingface.py rename to steps/generate_instruct_datasets/push_to_huggingface.py diff --git a/llm_engineering/interfaces/orchestrator/steps/generate_instruct_datasets/query_feature_store.py b/steps/generate_instruct_datasets/query_feature_store.py similarity index 100% rename from llm_engineering/interfaces/orchestrator/steps/generate_instruct_datasets/query_feature_store.py rename to steps/generate_instruct_datasets/query_feature_store.py diff --git a/llm_engineering/interfaces/orchestrator/steps/training/__init__.py b/steps/training/__init__.py similarity index 100% rename from llm_engineering/interfaces/orchestrator/steps/training/__init__.py rename to steps/training/__init__.py diff --git a/llm_engineering/interfaces/orchestrator/steps/training/tokenize.py b/steps/training/tokenize.py similarity index 100% rename from llm_engineering/interfaces/orchestrator/steps/training/tokenize.py rename to steps/training/tokenize.py