-
Notifications
You must be signed in to change notification settings - Fork 2
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
- Loading branch information
Showing
7 changed files
with
350 additions
and
0 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,12 @@ | ||
# Caller workflow: runs the shared link checker on manual dispatch, on every
# push, and on every pull request.
name: 'Check Links' | ||
on: | ||
workflow_dispatch: | ||
push: | ||
pull_request: | ||
|
||
jobs: | ||
link_check: | ||
name: 'Link Check' | ||
# Delegates the whole job to the reusable workflow on the main branch of
# STRIDES/NIHCloudLab.
uses: STRIDES/NIHCloudLab/.github/workflows/check-links.yaml@main | ||
with: | ||
# An empty string is passed, i.e. no repository-specific ignore entries.
repo_link_ignore_list: "" |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,34 @@ | ||
# Reusable workflow (workflow_call) that runs check_links.py against the
# calling repository.
# NOTE(review): no step below references inputs.directory or
# inputs.repo_link_ignore_list, so the declared inputs do not appear to reach
# the script - confirm how they are meant to be forwarded.
name: 'Check Links' | ||
on: | ||
workflow_call: | ||
inputs: | ||
directory: | ||
required: false | ||
type: string | ||
repo_link_ignore_list: | ||
required: true | ||
type: string | ||
secrets: | ||
PAT: | ||
required: false | ||
jobs: | ||
link_check: | ||
name: 'Link Check' | ||
runs-on: ubuntu-latest | ||
steps: | ||
# First checkout: the repository the workflow is running for (default settings).
- name: Checkout | ||
uses: actions/checkout@v4 | ||
|
||
# Second checkout: STRIDES/NIHCloudLab at main, placed in reusable-workflow-repo
# so the script path used in the next step exists in the workspace.
- name: Checkout | ||
uses: actions/checkout@v4 | ||
with: | ||
repository: STRIDES/NIHCloudLab | ||
path: reusable-workflow-repo | ||
ref: main | ||
|
||
# Runs the checker with no arguments. LINK_IGNORE_LIST is a comma-separated
# list of ignore entries.
# NOTE(review): the secret is exported as PAT here - confirm it matches the
# variable name the script reads.
- name: Link Check | ||
run: | | ||
python3 ${GITHUB_WORKSPACE}/reusable-workflow-repo/.github/workflows/check_links.py | ||
env: | ||
LINK_IGNORE_LIST: "https://www.sciencedirect.com,https://portlandpress.com,cloud.google.com,aws.amazon.com,https://journals.biologists.com/bio/article-pdf/5/8/1134/1114440/bio020065.pdf,https://onlinelibrary.wiley.com/doi/10.1111/1755-0998.13593,https://www.ensembl.org/,https://github.com/NIGMS/NIGMS-Sandbox/blob/main/README.md,https://github.com/rnaseqprok,https://github.com/rnaseqprok/rnaseqprok/blob/master/CITATIONS.md,https://github.com/NIGMS/NIGMS-Sandbox/blob/main/README.md#available-modules,https://github.com/nf-core/rnaseq/blob/4c27ef5610c87db00c3c5a3eed10b1d161abf575/conf/base.config#L18,https://snakemake.readthedocs.io/en/stable/executor_tutorial/google_lifesciences.html,https://nersc.gitlab.io/development/shifter/how-to-use/,http://bioconductor.org/packages/release/BiocViews.html#___OrgDb,https://www.bioconductor.org/packages/release/bioc/vignettes/methylKit/inst/doc/methylKit.html#4_Annotating_differentially_methylated_bases_or_regions,https://docs.qiime2.org/2024.2/install/native/#qiime-2-metagenome-distribution,https://www.researchgate.net/figure/Diagrams-of-the-BRCA1-and-BRCA2-genes-indicating-the-position-of-pathogenic-variants_fig2_321205153,https://www.fda.gov/drugs/biomarker-qualification-program/about-biomarkers-and-qualification,https://github.com/NIGMS/NIGMS-Sandbox/blob/main/README.md#available-modules,https://www.cambridge.org/highereducation/books/introduction-to-applied-linear-algebra/4D69AF22E38303FE20FFEEFDCE0E7F96#overview,https://doi.org/10.1089/wound.2019.1030" | ||
PAT: ${{ secrets.PAT }} |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,183 @@ | ||
import http.client | ||
import urllib.request, urllib.error | ||
import os | ||
import sys | ||
import re | ||
|
||
|
||
|
||
# Substrings removed from every candidate link before it is validated:
# markdown bold markers and the two-character sequence backslash + "n"
# (presumably the escaped newline seen when .ipynb files are read as raw text).
remove_characters = ['**', '\\n']

# Text that tends to follow a URL; clean_link truncates the link at the first
# occurrence of any of these.
end_characters = [')', ",", "'", '`', "\"", '</a>', '</div>', "\\", ">", "]"]

big_regex = re.compile('|'.join(map(re.escape, remove_characters)))


def parse_ignore_list(raw):
    """Split a comma-separated ignore list into clean, non-empty entries.

    Args:
        raw: The raw environment value, or None when the variable is unset.

    Returns:
        A list of stripped entries. Empty entries (from "a,,b" or a trailing
        comma) are dropped because an empty string is a substring of every
        link and would cause every failure to be ignored.
    """
    if not raw:
        return []
    return [entry.strip() for entry in raw.split(',') if entry.strip()]


# Global ignore list: any link containing one of these substrings is reported
# as "Ignored" instead of "Failed".
link_ignore_list_env = os.getenv("LINK_IGNORE_LIST")
link_ignore_list = parse_ignore_list(link_ignore_list_env)

# Add any repo specific ignores. The second variable's own value is parsed
# here; previously the first variable was split again by mistake.
# NOTE(review): confirm that the workflow actually exports a variable with
# this exact name.
link_ignore_list_env_2 = os.getenv("inputs.repo_link_ignore_list")
link_ignore_list.extend(parse_ignore_list(link_ignore_list_env_2))

# Valid links are only printed when this variable is present (any value).
print_valid = os.getenv("print_valid_links") is not None

# If we are given a directory then use it, otherwise assume the current one.
path = "."
if len(sys.argv) > 1 and os.path.exists(sys.argv[1]):
    path = sys.argv[1]

# The directory environment variable overrides both the argument and default.
directory_env = os.getenv("inputs.directory")
if directory_env:
    path = directory_env

# NOTE(review): read but not used anywhere in the visible script.
pat_env = os.getenv("INPUT_PAT")

# Every cleaned link found while scanning, in discovery order (may repeat).
links_to_check = []
# Maps each cleaned link to the last file it was found in.
link_file_map = {}
def getResponseCode(url, timeout=30):
    """Fetch *url* and report its HTTP status and, for anchor links, its body.

    Args:
        url: The absolute URL to request.
        timeout: Seconds to wait on the socket before giving up, so a stalled
            host cannot hang the whole run.

    Returns:
        A two-item list ``[status, content]``. ``content`` is the decoded
        page text only when the URL contains ``#`` and does not contain
        ``pdf``; otherwise it is ``None``. Unreachable hosts and connection
        or protocol failures are reported as 404. URLs that http.client
        rejects as invalid are reported as 200 because they cannot be
        validated here.
    """
    content = None
    try:
        req = urllib.request.Request(url,
                                     headers={'User-Agent': 'Mozilla/5.0'})
        # The context manager closes the connection on every path.
        with urllib.request.urlopen(req, timeout=timeout) as conn:
            status = conn.getcode()
            # Only get HTML if we have a potential anchor link
            if "#" in url and "pdf" not in url:
                # Undecodable bytes are replaced rather than crashing the run
                # on pages that are not valid UTF-8.
                content = conn.read().decode("utf-8", errors="replace")
    except urllib.error.HTTPError as e:
        return [e.code, content]
    except urllib.error.URLError:
        return [404, content]
    except http.client.InvalidURL:
        return [200, content]
    except (OSError, http.client.HTTPException):
        # Timeouts or resets while reading the body are not wrapped in
        # URLError; treat them like any other unreachable link.
        return [404, content]
    return [status, content]
|
||
def clean_link(link):
    """Run one cleaning pass over a candidate link and return the result.

    The pass drops at most one trailing ".", "'", '"' and "," (tested in that
    order), trims surrounding whitespace, removes every ``remove_characters``
    substring, and finally cuts the link at the first occurrence of each
    ``end_characters`` entry in turn.
    """
    for trailing in (".", "'", "\"", ","):
        if link.endswith(trailing):
            link = link[:-1]

    cleaned = big_regex.sub("", link.strip())

    # partition() keeps the text before the first match and returns the whole
    # string unchanged when the marker is absent.
    for marker in end_characters:
        cleaned = cleaned.partition(marker)[0]
    return cleaned
|
||
def add_link(loc, link):
    """Queue the URL found in *link* for checking and remember its file.

    Args:
        loc: Path of the file the text was read from.
        link: A whitespace-delimited token that may contain a URL.

    Returns:
        False when the token is skipped (it contains ``$(uname`` or no
        ``http``); otherwise nothing is returned.
    """
    # Shell command substitutions cannot be validated here, and tokens
    # without "http" hold no link.
    if '$(uname' in link or "http" not in link:
        return False

    # Discard anything in front of the URL itself.
    current = link[link.find("http"):]

    # Re-apply the cleaning pass until it stops changing the link.
    cleaned = clean_link(current)
    while cleaned != current:
        current, cleaned = cleaned, clean_link(cleaned)

    links_to_check.append(cleaned)
    # Remember where the link lives so a failure can be located and fixed.
    link_file_map[cleaned] = loc
def check_link(link):
    """Validate one link and print a ``file , link , status`` line.

    A link fails when the request returns 404, 403 or 500, or when the link
    has a ``#fragment``, the page body was downloaded, and none of the known
    anchor forms for that fragment appear in the body. Failures of links
    matching ``link_ignore_list`` are printed as "Ignored" and do not count.

    Args:
        link: A cleaned URL that was registered through ``add_link``.

    Returns:
        True when the link failed and is not ignored, False otherwise.
    """
    status, body = getResponseCode(link)
    loc = link_file_map[link]

    # Empty entries are skipped: "" is a substring of every link and would
    # otherwise mark every failure as ignored.
    ignore = any(entry and entry in link for entry in link_ignore_list)

    if status in (404, 403, 500):
        verdict = "Ignored" if ignore else "Failed"
        print(loc + " , " + link + " , " + verdict)
        return not ignore

    # check for missing anchors
    if "#" in link and body is not None:
        fragment = link[link.find("#"):]
        anchor = fragment[1:]
        # An anchor counts as present when the page links to it, or defines
        # it through a name or id attribute (plain or "user-content-"
        # prefixed).
        targets = (
            'href="' + fragment + '"',
            'name="' + anchor + '"',
            'name="user-content-' + anchor + '"',
            'id="' + anchor + '"',
            'id="user-content-' + anchor + '"',
        )
        if not any(target in body for target in targets):
            if ignore:
                print(loc + " , " + link + " , Ignored")
                return False
            print(loc + " , " + link + " , Failed - Anchor")
            return True

    if print_valid:
        print(loc + " , " + link + " , Valid")
    return False
|
||
|
||
if __name__ == "__main__":
    # Exit status: 0 when every link passed or was ignored, 1 otherwise.
    err = 0
    print("Directory is "+path)

    # Collect links from every text, markdown and notebook file under path.
    for root, dirs, files in os.walk(path):
        for file in files:
            if not file.endswith((".md", ".txt", ".ipynb")):
                continue

            loc = os.path.join(root, file)
            # Read with an explicit encoding and tolerate stray bytes so one
            # odd file cannot abort the scan; the handle is closed promptly.
            with open(loc, "r", encoding="utf-8", errors="replace") as handle:
                raw_content = handle.read()

            # Candidate links are the space-separated tokens of each line.
            for line in raw_content.split("\n"):
                for item in line.split(" "):
                    if "https://" not in item and "http://" not in item:
                        continue
                    if "](" in item:
                        # Markdown link: try the text from the first "]" and
                        # from the first "(", since either may hold the URL.
                        add_link(loc, item[item.find("]"):])
                        add_link(loc, item[item.find("("):])
                    else:
                        add_link(loc, item)

    # dict.fromkeys de-duplicates while keeping discovery order, so the
    # report is stable between runs.
    for link in dict.fromkeys(links_to_check):
        # Every link is checked even after a failure so all problems print.
        if check_link(link):
            err = 1
    sys.exit(err)
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,53 @@ | ||
import os | ||
import shutil | ||
import nbformat | ||
from nbformat.v4 import new_notebook | ||
|
||
def clean_notebook(file_path):
    """Strip outputs, execution counts and metadata from a notebook in place.

    The notebook at *file_path* is read as nbformat version 4, each cell's
    ``outputs``, ``execution_count`` and ``metadata`` fields are reset where
    present, the notebook-level ``metadata`` is emptied, and the result is
    written back to the same path.
    """
    with open(file_path, 'r', encoding='utf-8') as source:
        nb = nbformat.read(source, as_version=4)

    # Field name -> factory for its blank value; a fresh object is built for
    # every cell so cells never share a list or dict.
    blank_values = {
        'outputs': list,
        'execution_count': lambda: None,
        'metadata': dict,
    }
    for cell in nb.cells:
        for field, make_blank in blank_values.items():
            if field in cell:
                cell[field] = make_blank()

    # Clean notebook metadata
    if 'metadata' in nb:
        nb['metadata'] = {}

    with open(file_path, 'w', encoding='utf-8') as target:
        nbformat.write(nb, target)
|
||
def delete_checkpoints_dirs(root_dir):
    """Recursively delete every ``.ipynb_checkpoints`` directory.

    Args:
        root_dir: Directory whose tree is searched.

    Each removed directory is reported on stdout, followed by a reminder to
    add ``.ipynb_checkpoints`` to ``.gitignore``.
    """
    checkpoint_name = '.ipynb_checkpoints'
    for dirpath, dirnames, _filenames in os.walk(root_dir):
        if checkpoint_name not in dirnames:
            continue
        dir_to_delete = os.path.join(dirpath, checkpoint_name)
        shutil.rmtree(dir_to_delete)
        # Prune the entry in place so os.walk does not try to descend into
        # the directory that was just removed.
        dirnames.remove(checkpoint_name)
        print(f'Deleted {dir_to_delete}')
        print('Consider adding .ipynb_checkpoints to your .gitignore file!')
|
||
|
||
if __name__ == "__main__":
    # Change this to the directory containing your notebooks
    notebook_dir = '.'

    # Remove checkpoint directories first so the notebook copies inside them
    # are not rewritten just before being deleted.
    delete_checkpoints_dirs(notebook_dir)

    # Clean every remaining notebook in place.
    for root, dirs, files in os.walk(notebook_dir):
        for file in files:
            if file.endswith('.ipynb'):
                file_path = os.path.join(root, file)
                clean_notebook(file_path)
                print(f'Cleaned {file_path}')
|
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,14 @@ | ||
# Caller workflow: runs the shared notebook linter on every push and on
# manual dispatch.
name: 'Lint Notebook' | ||
on: | ||
push: | ||
workflow_dispatch: | ||
# Write access to repository contents and the ID token is requested for the job.
permissions: | ||
contents: write | ||
id-token: write | ||
|
||
jobs: | ||
lint: | ||
name: 'Linting' | ||
# Delegates the whole job to the reusable workflow on the main branch of
# STRIDES/NIHCloudLab.
uses: STRIDES/NIHCloudLab/.github/workflows/notebook-lint.yaml@main | ||
with: | ||
directory: . |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,53 @@ | ||
# Reusable workflow (workflow_call) that runs lint.py over the calling
# repository and commits the result.
# NOTE(review): no step below references inputs.directory - confirm whether
# the input is meant to reach the script.
name: 'Lint Notebook' | ||
on: | ||
workflow_call: | ||
inputs: | ||
directory: | ||
required: false | ||
type: string | ||
permissions: | ||
contents: write | ||
id-token: write | ||
|
||
jobs: | ||
lint: | ||
name: 'Linting' | ||
runs-on: ubuntu-latest | ||
|
||
steps: | ||
# First checkout: the repository the workflow is running for (default settings).
- name: Checkout | ||
uses: actions/checkout@v4 | ||
|
||
# Second checkout: STRIDES/NIHCloudLab at main, placed in reusable-workflow-repo
# so the script path used by the linting step exists in the workspace.
- name: Checkout | ||
uses: actions/checkout@v4 | ||
with: | ||
repository: STRIDES/NIHCloudLab | ||
ref: main | ||
path: reusable-workflow-repo | ||
|
||
- name: Set up Python | ||
uses: actions/setup-python@v5 | ||
with: | ||
python-version: '3.12' | ||
cache: 'pip' | ||
|
||
# Despite the step name, nbformat is installed directly with pip; no
# requirements file is read by the commands below.
- name: Install requirements.txt | ||
working-directory: reusable-workflow-repo/.github/workflows/ | ||
run: | | ||
python3 -m pip install --upgrade pip | ||
pip3 install nbformat | ||
- name: Notebook Linting | ||
run: | | ||
python3 ${GITHUB_WORKSPACE}/reusable-workflow-repo/.github/workflows/lint.py | ||
# The helper checkout is deleted before the commit step runs.
- name: Remove workflow repo | ||
run: | | ||
rm ${GITHUB_WORKSPACE}/reusable-workflow-repo -r | ||
- name: Commit changes | ||
uses: EndBug/add-and-commit@v9 | ||
with: | ||
author_name: github-action | ||
author_email: [email protected] | ||
message: 'Github Action: Lint Notebooks' |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1 @@ | ||
nbformat==5.10.4 |