Skip to content

Commit

Permalink
update github actions
Browse files Browse the repository at this point in the history
  • Loading branch information
mr8lu committed Nov 12, 2024
1 parent e6ac1b4 commit df15966
Show file tree
Hide file tree
Showing 7 changed files with 350 additions and 0 deletions.
12 changes: 12 additions & 0 deletions .github/workflows/check-links-self.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,12 @@
# Caller workflow: runs the shared link checker against this repository.
name: Check Links

# Triggered manually, on every push, and on every pull request.
on: [workflow_dispatch, push, pull_request]

jobs:
  link_check:
    name: Link Check
    # Delegates all of the work to the reusable workflow on the main branch.
    uses: STRIDES/NIHCloudLab/.github/workflows/check-links.yaml@main
    with:
      # No repository-specific URLs to ignore.
      repo_link_ignore_list: ''
34 changes: 34 additions & 0 deletions .github/workflows/check-links.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,34 @@
# Reusable workflow: checks out the calling repository plus the repository
# that hosts check_links.py, then runs the link checker.
name: 'Check Links'
on:
  workflow_call:
    inputs:
      # Directory to scan; when empty the script falls back to ".".
      directory:
        required: false
        type: string
      # Comma-separated URLs (or URL fragments) to ignore for this caller,
      # in addition to the shared LINK_IGNORE_LIST below.
      repo_link_ignore_list:
        required: true
        type: string
    secrets:
      PAT:
        required: false
jobs:
  link_check:
    name: 'Link Check'
    runs-on: ubuntu-latest
    steps:
      - name: Checkout
        uses: actions/checkout@v4

      # Second checkout brings in the script itself under a sub-directory.
      - name: Checkout reusable workflow repo
        uses: actions/checkout@v4
        with:
          repository: STRIDES/NIHCloudLab
          path: reusable-workflow-repo
          ref: main

      - name: Link Check
        # The inputs are handed over through env vars (never interpolated
        # into the script text) and then forwarded in forms check_links.py
        # already understands: the ignore list is appended to
        # LINK_IGNORE_LIST and the directory is passed as argv[1]. An empty
        # argv[1] does not exist as a path, so the script keeps its "."
        # default.
        run: |
          if [ -n "${REPO_LINK_IGNORE_LIST}" ]; then
            export LINK_IGNORE_LIST="${LINK_IGNORE_LIST},${REPO_LINK_IGNORE_LIST}"
          fi
          python3 "${GITHUB_WORKSPACE}/reusable-workflow-repo/.github/workflows/check_links.py" "${LINK_CHECK_DIRECTORY}"
        env:
          LINK_IGNORE_LIST: "https://www.sciencedirect.com,https://portlandpress.com,cloud.google.com,aws.amazon.com,https://journals.biologists.com/bio/article-pdf/5/8/1134/1114440/bio020065.pdf,https://onlinelibrary.wiley.com/doi/10.1111/1755-0998.13593,https://www.ensembl.org/,https://github.com/NIGMS/NIGMS-Sandbox/blob/main/README.md,https://github.com/rnaseqprok,https://github.com/rnaseqprok/rnaseqprok/blob/master/CITATIONS.md,https://github.com/NIGMS/NIGMS-Sandbox/blob/main/README.md#available-modules,https://github.com/nf-core/rnaseq/blob/4c27ef5610c87db00c3c5a3eed10b1d161abf575/conf/base.config#L18,https://snakemake.readthedocs.io/en/stable/executor_tutorial/google_lifesciences.html,https://nersc.gitlab.io/development/shifter/how-to-use/,http://bioconductor.org/packages/release/BiocViews.html#___OrgDb,https://www.bioconductor.org/packages/release/bioc/vignettes/methylKit/inst/doc/methylKit.html#4_Annotating_differentially_methylated_bases_or_regions,https://docs.qiime2.org/2024.2/install/native/#qiime-2-metagenome-distribution,https://www.researchgate.net/figure/Diagrams-of-the-BRCA1-and-BRCA2-genes-indicating-the-position-of-pathogenic-variants_fig2_321205153,https://www.fda.gov/drugs/biomarker-qualification-program/about-biomarkers-and-qualification,https://github.com/NIGMS/NIGMS-Sandbox/blob/main/README.md#available-modules,https://www.cambridge.org/highereducation/books/introduction-to-applied-linear-algebra/4D69AF22E38303FE20FFEEFDCE0E7F96#overview,https://doi.org/10.1089/wound.2019.1030"
          REPO_LINK_IGNORE_LIST: ${{ inputs.repo_link_ignore_list }}
          LINK_CHECK_DIRECTORY: ${{ inputs.directory }}
          PAT: ${{ secrets.PAT }}
183 changes: 183 additions & 0 deletions .github/workflows/check_links.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,183 @@
import http.client
import urllib.request, urllib.error
import os
import sys
import re



# ---------------------------------------------------------------------------
# Module-level configuration, read once at import time.
# ---------------------------------------------------------------------------

# Substrings deleted from a candidate link wherever they occur: the markdown
# bold marker and a literal backslash followed by "n" (presumably escaped
# newlines in raw notebook JSON -- TODO confirm).
remove_characters = ['**', '\\n']

# Text that tends to follow a URL; clean_link() truncates the link at the
# first occurrence of each of these.
end_characters = [')',",","'",'`',"\"",'</a>','</div>',"\\",">","]"]

# Single alternation regex matching any entry of remove_characters literally.
big_regex = re.compile('|'.join(map(re.escape, remove_characters)))

# URLs (or URL fragments) whose failures are reported as "Ignored".
# Entries are matched as substrings of the link, so blank entries (e.g. from
# a trailing or doubled comma) are dropped: an empty string is a substring of
# every link and would silently ignore every failure.
link_ignore_list = []
link_ignore_list_env = os.getenv("LINK_IGNORE_LIST")
if link_ignore_list_env:
    link_ignore_list = [
        entry for entry in link_ignore_list_env.split(',') if entry.strip()
    ]

# Add any repo specific ignores.
# NOTE(review): confirm the caller really sets an environment variable with
# this exact dotted name.
link_ignore_list_env_2 = os.getenv("inputs.repo_link_ignore_list")
if link_ignore_list_env_2:
    # Split the repo-specific value itself (not the shared list again).
    link_ignore_list.extend(
        entry for entry in link_ignore_list_env_2.split(',') if entry.strip()
    )

# Valid links are only printed when the print_valid_links variable is
# defined (any value, including empty).
print_valid = os.getenv("print_valid_links") is not None

# If we are given a directory then use it, otherwise assume path is current directory
path = "."
if len(sys.argv) > 1 and os.path.exists(sys.argv[1]):
    path = sys.argv[1]

# directory environment overrides the system arguments and default.
directory_env = os.getenv("inputs.directory")
if directory_env:
    path = directory_env

# Read for completeness; nothing in this part of the file uses it.
pat_env = os.getenv("INPUT_PAT")

# list which stores all links to check
links_to_check = []
# maps each cleaned link to the file it was (last) found in
link_file_map = {}
# Get the response code of the url to see if it exists
def getResponseCode(url, timeout=30):
    """Fetch ``url`` and report whether it could be retrieved.

    Args:
        url: The URL to request. A browser-like User-Agent header is sent.
        timeout: Seconds to wait on the connection before giving up, so one
            unresponsive host cannot hang the whole run.

    Returns:
        A two-item list ``[status, content]``. ``status`` is the response
        status on success, the HTTP error code for an HTTP error response,
        404 for any other failure to retrieve the URL, and 200 when the URL
        is rejected as ``InvalidURL`` (kept from the original behaviour, so
        such links are not reported as failures). ``content`` is the decoded
        body, and is only read when the URL contains ``#`` and does not
        contain ``pdf``; otherwise it is ``None``.
    """
    content = None
    try:
        req = urllib.request.Request(url,
                                     headers={'User-Agent': 'Mozilla/5.0'})
        # The context manager closes the response on every path.
        with urllib.request.urlopen(req, timeout=timeout) as conn:
            status = conn.getcode()
            # Only get HTML if we have a potential anchor link
            if "#" in url and "pdf" not in url:
                # Pages are not guaranteed to be UTF-8; undecodable bytes
                # must not abort the run, the anchor search is substring
                # based anyway.
                content = conn.read().decode("utf-8", errors="replace")
    except urllib.error.HTTPError as e:
        return [e.code, content]
    except http.client.InvalidURL:
        return [200, content]
    except (OSError, ValueError, http.client.HTTPException):
        # URLError is an OSError; this also covers timeouts, connection
        # resets, malformed URLs and truncated responses.
        return [404, content]
    return [status, content]

def clean_link(link):
    """Perform one cleaning pass over a candidate URL.

    At most one trailing ``.``, ``'``, ``"`` and ``,`` is removed (checked in
    that order), surrounding whitespace is stripped, every match of
    ``big_regex`` is deleted, and the result is cut at the first occurrence
    of each entry of ``end_characters``.

    Args:
        link: Raw text beginning at the URL.

    Returns:
        The cleaned text. Callers repeat the pass until the value stops
        changing.
    """
    # One trailing character of each kind, in this fixed order.
    for trailing in (".", "'", "\"", ","):
        if link.endswith(trailing):
            link = link[:-1]

    cleaned = big_regex.sub("", link.strip())

    # Keep only what precedes each terminator, if it is present.
    for terminator in end_characters:
        cleaned = cleaned.partition(terminator)[0]
    return cleaned

def add_link(loc,link):
    """Normalise a candidate URL and queue it for checking.

    Args:
        loc: Path of the file the text was found in.
        link: Raw text containing the URL.

    Returns:
        ``False`` when the text is skipped (it contains no ``http`` or it
        contains a ``$(uname`` shell substitution); otherwise ``None`` after
        the cleaned link has been appended to ``links_to_check`` and recorded
        in ``link_file_map``.
    """
    # Shell command substitutions cannot be validated here, and text without
    # "http" holds no URL at all.
    if "http" not in link or '$(uname' in link:
        return False

    # Discard anything that precedes the URL itself.
    candidate = link[link.find("http"):]

    # Re-run the cleaning pass until it no longer changes the text.
    while True:
        cleaned = clean_link(candidate)
        if cleaned == candidate:
            break
        candidate = cleaned

    # Queue the link and remember where it came from so it can be fixed.
    links_to_check.append(candidate)
    link_file_map[candidate] = loc
def check_link(link):
    """Check one queued link and print a report line when appropriate.

    The link is fetched with ``getResponseCode``. A status of 404, 403 or 500
    is a failure. Otherwise, when the link has a fragment and the page body
    was downloaded, the fragment must appear in the body as an ``href``
    target, a ``name`` attribute or an ``id`` attribute (each also tried with
    GitHub's ``user-content-`` prefix for ``name``/``id``).

    Failures for links that contain any entry of ``link_ignore_list`` are
    reported as "Ignored" and do not count as errors.

    Args:
        link: A cleaned link that is a key of ``link_file_map``.

    Returns:
        ``True`` if the link failed and is not ignored, else ``False``.
    """
    # try and get the url, if its 404, 403 or 500 then its invalid
    code = getResponseCode(link)
    loc = link_file_map[link]

    # Ignore entries are matched as substrings of the link.
    ignore = any(ignored_link in link for ignored_link in link_ignore_list)

    if code[0] in [404, 403, 500]:
        if ignore:
            print(
                loc + " , " + link + " , Ignored")
            return False
        print(
            loc + " , " + link + " , Failed")
        return True

    # check for missing anchors (only when the body was actually fetched)
    if "#" in link and code[1] is not None:
        anchor = link[link.find("#") + 1:]
        anchor_markers = (
            'href="#' + anchor + '"',
            'name="' + anchor + '"',
            'name="user-content-' + anchor + '"',
            # Fragments also resolve to elements carrying a matching id
            # attribute; without these, valid anchors on pages that define
            # them only via id were reported as missing.
            'id="' + anchor + '"',
            'id="user-content-' + anchor + '"',
        )
        if not any(marker in code[1] for marker in anchor_markers):
            if ignore:
                print(
                    loc + ", " + link + ", Ignored")
                return False
            print(
                loc + " , " + link + " , Failed - Anchor")
            return True

    if print_valid:
        print(
            loc + " , " + link + " , Valid")
    return False


if __name__ == "__main__":
    # Entry point: collect every URL found in .md, .txt and .ipynb files
    # under `path`, check each distinct one, and exit with status 1 if any
    # check failed (0 otherwise).
    err = 0
    print("Directory is "+path)

    # Loop through all files in path
    for root, dirs, files in os.walk(path):
        for file in files:
            # only read files that match the template (txt, md or python notebook)
            if not file.endswith((".md", ".txt", ".ipynb")):
                continue

            loc = os.path.join(root, file)
            # Close the handle promptly and never abort the whole run on a
            # file that is not clean UTF-8.
            with open(loc, "r", encoding="utf-8", errors="replace") as handle:
                raw_content = handle.read()

            # separate into lines and then separate each line by spaces
            for line in raw_content.split("\n"):
                for item in line.split(" "):
                    if "https://" not in item and "http://" not in item:
                        continue
                    if "](" in item:
                        # Markdown-style link: queue the text from the first
                        # "]" and from the first "(". The original compared
                        # the two slices before the second call, but they
                        # start at different indexes and so can never be
                        # equal; that dead check is dropped.
                        add_link(loc, item[item.find("]"):])
                        add_link(loc, item[item.find("("):])
                    else:
                        add_link(loc, item)

    # Each distinct link is checked once; sorting keeps the report order
    # stable between runs.
    for link in sorted(set(links_to_check)):
        # if we get any error then add to err variable
        err = check_link(link) + err

    # Collapse the failure count to a 0/1 exit status.
    if err > 1:
        err = 1
    # sys.exit instead of the site-provided exit() helper, which is meant
    # for interactive use and is not guaranteed to exist.
    sys.exit(err)
53 changes: 53 additions & 0 deletions .github/workflows/lint.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,53 @@
import os
import shutil
import nbformat
from nbformat.v4 import new_notebook

def clean_notebook(file_path):
    """Strip outputs, execution counts and all metadata from a notebook.

    The notebook is read as nbformat version 4, blanked, and written back
    over the same file.

    Args:
        file_path: Path of the ``.ipynb`` file to rewrite in place.
    """
    with open(file_path, 'r', encoding='utf-8') as source:
        nb = nbformat.read(source, as_version=4)

    # Cell fields to reset, each paired with a factory for its blank value so
    # that every cell receives its own fresh object.
    blank_factories = (
        ('outputs', list),
        ('execution_count', lambda: None),
        ('metadata', dict),
    )
    for cell in nb.cells:
        for field, make_blank in blank_factories:
            if field in cell:
                cell[field] = make_blank()

    # Notebook-level metadata is emptied as well.
    if 'metadata' in nb:
        nb['metadata'] = {}

    with open(file_path, 'w', encoding='utf-8') as target:
        nbformat.write(nb, target)

def delete_checkpoints_dirs(root_dir):
    """Delete every ``.ipynb_checkpoints`` directory found under ``root_dir``.

    For each directory removed, its path is printed followed by a reminder to
    add ``.ipynb_checkpoints`` to ``.gitignore``.

    Args:
        root_dir: Directory tree to search.
    """
    checkpoint_name = '.ipynb_checkpoints'
    # Walk through the directory tree (top-down, so dirnames can be pruned)
    for dirpath, dirnames, filenames in os.walk(root_dir):
        if checkpoint_name not in dirnames:
            continue
        # Remove the entry from dirnames first so os.walk does not try to
        # descend into a directory that no longer exists.
        dirnames.remove(checkpoint_name)
        dir_to_delete = os.path.join(dirpath, checkpoint_name)
        shutil.rmtree(dir_to_delete)
        print(f'Deleted {dir_to_delete}')
        print('Consider adding .ipynb_checkpoints to your .gitignore file!')


if __name__ == "__main__":
    # Entry point: rewrite every notebook below notebook_dir with its outputs
    # and metadata removed, then delete the .ipynb_checkpoints directories.

    # Change this to the directory containing your notebooks
    notebook_dir = '.'

    for current_dir, _subdirs, filenames in os.walk(notebook_dir):
        notebook_names = (name for name in filenames if name.endswith('.ipynb'))
        for notebook_name in notebook_names:
            file_path = os.path.join(current_dir, notebook_name)
            clean_notebook(file_path)
            print(f'Cleaned {file_path}')

    # Checkpoint directories are removed only after all notebooks are cleaned.
    delete_checkpoints_dirs(notebook_dir)

14 changes: 14 additions & 0 deletions .github/workflows/notebook-lint-self.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,14 @@
# Caller workflow: runs the shared notebook linter against this repository.
name: Lint Notebook

# Triggered on every push and manually.
on: [push, workflow_dispatch]

# Granted to the called workflow, which commits the cleaned notebooks.
permissions:
  contents: write
  id-token: write

jobs:
  lint:
    name: Linting
    # Delegates all of the work to the reusable workflow on the main branch.
    uses: STRIDES/NIHCloudLab/.github/workflows/notebook-lint.yaml@main
    with:
      directory: '.'
53 changes: 53 additions & 0 deletions .github/workflows/notebook-lint.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,53 @@
# Reusable workflow: strips outputs and metadata from the caller's notebooks
# with lint.py and commits the result back to the caller's repository.
name: 'Lint Notebook'
on:
  workflow_call:
    inputs:
      # Directory whose notebooks are linted; defaults to the repository root.
      directory:
        required: false
        type: string
permissions:
  contents: write
  id-token: write

jobs:
  lint:
    name: 'Linting'
    runs-on: ubuntu-latest

    steps:
      - name: Checkout
        uses: actions/checkout@v4

      # Second checkout brings in lint.py under a sub-directory.
      - name: Checkout reusable workflow repo
        uses: actions/checkout@v4
        with:
          repository: STRIDES/NIHCloudLab
          ref: main
          path: reusable-workflow-repo

      - name: Set up Python
        uses: actions/setup-python@v5
        with:
          python-version: '3.12'
          cache: 'pip'

      - name: Install requirements.txt
        working-directory: reusable-workflow-repo/.github/workflows/
        # Prefer the pinned requirements file that sits next to lint.py;
        # fall back to an unpinned install if it is not present.
        run: |
          python3 -m pip install --upgrade pip
          if [ -f requirements.txt ]; then
            pip3 install -r requirements.txt
          else
            pip3 install nbformat
          fi
      - name: Notebook Linting
        # lint.py always processes its current directory, so the `directory`
        # input is honoured by running the step from there.
        working-directory: ${{ inputs.directory || '.' }}
        run: |
          python3 "${GITHUB_WORKSPACE}/reusable-workflow-repo/.github/workflows/lint.py"
      # The helper checkout must be gone before committing, otherwise it
      # would be added to the caller's repository.
      - name: Remove workflow repo
        run: |
          rm -r "${GITHUB_WORKSPACE}/reusable-workflow-repo"
      - name: Commit changes
        uses: EndBug/add-and-commit@v9
        with:
          author_name: github-action
          # NOTE(review): this value looks masked/garbled. As written it
          # parses as a YAML sequence, not a string; restore the real
          # e-mail address before using this workflow.
          author_email: [email protected]
          message: 'Github Action: Lint Notebooks'
1 change: 1 addition & 0 deletions .github/workflows/requirements.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
# nbformat is imported by lint.py to read and rewrite notebooks; pinned to an exact version.
nbformat==5.10.4

0 comments on commit df15966

Please sign in to comment.