From beb64389ad815817c5d0af77a42606038c54d093 Mon Sep 17 00:00:00 2001 From: Jover Lee Date: Thu, 6 Jun 2024 14:34:24 -0700 Subject: [PATCH 1/2] Add ingest-ncbi GH Action workflow Automated runs of the ingest workflow for the public NCBI data. This is scheduled to run daily similar to other pathogen ingests. We are not running the phylogenetic builds automatically for now since there are frequent outgroups that need to be manually excluded. --- .github/workflows/ingest-ncbi.yaml | 65 ++++++++++++++++++++++++++++++ 1 file changed, 65 insertions(+) create mode 100644 .github/workflows/ingest-ncbi.yaml diff --git a/.github/workflows/ingest-ncbi.yaml b/.github/workflows/ingest-ncbi.yaml new file mode 100644 index 0000000..e4a6256 --- /dev/null +++ b/.github/workflows/ingest-ncbi.yaml @@ -0,0 +1,65 @@ +name: Ingest NCBI + +defaults: + run: + # This is the same as GitHub Actions' `bash` keyword as of 20 June 2023: + # https://docs.github.com/en/actions/using-workflows/workflow-syntax-for-github-actions#jobsjob_idstepsshell + # + # Completely spelling it out here so that GitHub can't change it out from under us + # and we don't have to refer to the docs to know the expected behavior. + shell: bash --noprofile --norc -eo pipefail {0} + +on: + schedule: + # Note times are in UTC, which is 1 or 2 hours behind CET depending on daylight savings. + # + # Note the actual runs might be late. + # Numerous people were confused about that, including me: + # - https://github.community/t/scheduled-action-running-consistently-late/138025/11 + # - https://github.com/github/docs/issues/3059 + # + # Note, '*' is a special character in YAML, so you have to quote this string. + # + # Docs: + # - https://docs.github.com/en/actions/learn-github-actions/events-that-trigger-workflows#schedule + # + # Tool that deciphers this particular format of crontab string: + # - https://crontab.guru/ + # + # Runs at 5pm UTC (1pm EDT/10am PDT) since curation by NCBI happens on the East Coast. 
+ # We were running into invalid zip archive errors at 9am PDT, so we hope an hour's + # delay will lower the error frequency + - cron: '0 17 * * *' + + workflow_dispatch: + +jobs: + ingest: + permissions: + id-token: write + uses: nextstrain/.github/.github/workflows/pathogen-repo-build.yaml@master + secrets: inherit + with: + # Starting with the default docker runtime + # We can migrate to AWS Batch when/if we need to for more resources or if + # the job runs longer than the GH Action limit of 6 hours. + runtime: docker + run: | + nextstrain build \ + ingest \ + upload_all_ncbi \ + --configfile build-configs/ncbi/defaults/config.yaml + # Specifying artifact name to differentiate ingest build outputs from + # the phylogenetic build outputs + artifact-name: ingest-ncbi-build-output + artifact-paths: | + ingest/.snakemake/log/ + ingest/andersen-lab/results/ + ingest/andersen-lab/benchmarks/ + ingest/andersen-lab/logs/ + ingest/joined-ncbi/results/ + ingest/joined-ncbi/benchmarks/ + ingest/joined-ncbi/logs/ + ingest/ncbi/results/ + ingest/ncbi/benchmarks/ + ingest/ncbi/logs/ From 8b01f5387af4b8f7480ceb59ee05d1eb42ddf402 Mon Sep 17 00:00:00 2001 From: Jover Lee Date: Thu, 6 Jun 2024 15:22:15 -0700 Subject: [PATCH 2/2] Add ingest-fauna GH Action workflow The ingest-fauna workflow is not scheduled to run automatically since updates to fauna are done manually. This at least makes it easy to run the workflow via GH Actions instead of having to do it locally. 
--- .github/workflows/ingest-fauna.yaml | 42 +++++++++++++++++++++++++++++ 1 file changed, 42 insertions(+) create mode 100644 .github/workflows/ingest-fauna.yaml diff --git a/.github/workflows/ingest-fauna.yaml b/.github/workflows/ingest-fauna.yaml new file mode 100644 index 0000000..9621c44 --- /dev/null +++ b/.github/workflows/ingest-fauna.yaml @@ -0,0 +1,42 @@ +name: Ingest fauna + +defaults: + run: + # This is the same as GitHub Actions' `bash` keyword as of 20 June 2023: + # https://docs.github.com/en/actions/using-workflows/workflow-syntax-for-github-actions#jobsjob_idstepsshell + # + # Completely spelling it out here so that GitHub can't change it out from under us + # and we don't have to refer to the docs to know the expected behavior. + shell: bash --noprofile --norc -eo pipefail {0} + +on: + workflow_dispatch: + +jobs: + ingest: + permissions: + id-token: write + uses: nextstrain/.github/.github/workflows/pathogen-repo-build.yaml@master + secrets: inherit + with: + # Starting with the default docker runtime + # We can migrate to AWS Batch when/if we need to for more resources or if + # the job runs longer than the GH Action limit of 6 hours. + runtime: docker + run: | + nextstrain build \ + --env RETHINK_HOST \ + --env RETHINK_AUTH_KEY \ + ingest \ + upload_all + # Specifying artifact name to differentiate ingest build outputs from + # the phylogenetic build outputs + artifact-name: ingest-fauna-build-output + # Explicitly excluding `ingest/fauna/results` and `ingest/fauna/data` + # since this is private data and should not be available through the public artifacts + artifact-paths: | + !ingest/fauna/results/ + !ingest/fauna/data/ + ingest/.snakemake/log/ + ingest/fauna/benchmarks/ + ingest/fauna/logs/