Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

License classify modeling #27

Closed
wants to merge 25 commits into from
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
25 commits
Select commit Hold shift + click to select a range
3b3398f
Primary dataset collection completed with rough drafts
Bransthre Nov 15, 2022
00b67b4
Dataset Constrained Recollection
Bransthre Nov 16, 2022
a37a662
dataset remodeling and cleaning
Bransthre Nov 17, 2022
eeb1f9e
current wip at 48, 70, 78
Bransthre Nov 19, 2022
458e717
intermediate progress on failing dataset
Bransthre Nov 19, 2022
dec5a95
resolved merge conflict on vscode?
Bransthre Nov 19, 2022
ee98370
resolved merge conflict on vscode
Bransthre Nov 19, 2022
8ccc536
reverted back to original
Bransthre Nov 19, 2022
da4477b
reverted back to original
Bransthre Nov 19, 2022
a066d06
wasted progress
Bransthre Nov 19, 2022
8ff77f6
help.
Bransthre Nov 19, 2022
bd90221
Acquired 47, 66, 80
Bransthre Nov 19, 2022
bcb29e6
Using SVD has raised performance
Bransthre Nov 20, 2022
7c2b2fc
Reached 52, 75, 84 on RB.
Bransthre Nov 20, 2022
97c3aa8
Modeling Phase completed, code unrevised yet
Bransthre Nov 20, 2022
a813403
Merge branch 'main' into license-classify-modeling
TimidRobot Dec 7, 2022
f955029
Merge branch 'main' into license-classify-modeling
TimidRobot Jan 17, 2023
0326b9a
Merge branch 'main' into license-classify-modeling
TimidRobot Jan 28, 2023
e72f9a6
update per isort, black, and flake8
TimidRobot Jan 28, 2023
5c315fb
Merge branch 'main' into license-classify-modeling
TimidRobot Feb 9, 2023
7f102d7
Merge branch 'main' into license-classify-modeling
TimidRobot Mar 7, 2023
9a17e87
Merge branch 'main' into license-classify-modeling
TimidRobot Mar 7, 2023
cccf58b
Merge branch 'main' into license-classify-modeling
TimidRobot Mar 8, 2023
d6aa8e5
Merge branch 'main' into license-classify-modeling
TimidRobot Apr 13, 2023
8a42aa0
Merge branch 'main' into license-classify-modeling
TimidRobot Oct 26, 2023
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1,609 changes: 1,609 additions & 0 deletions model_sampling/dataset_sampling.ipynb

Large diffs are not rendered by default.

206 changes: 206 additions & 0 deletions model_sampling/dataset_sampling.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,206 @@
#!/usr/bin/env python
"""
This file is a Python script for generating the Data Science Discovery modeling
task's training dataset.
"""

# Standard library
import os
import sys
import traceback

# Third-party
import pandas as pd
import query_secrets
import requests
import sqlalchemy
from bs4 import BeautifulSoup
from bs4.dammit import EncodingDetector
from requests.adapters import HTTPAdapter
from urllib3.util.retry import Retry

# Google Programmable Search Engine credentials, kept out of the repository
# in the (untracked) query_secrets module.
API_KEYS = query_secrets.API_KEYS
# Index of the API key currently in use; incremented in get_api_response()
# when a key's quota is depleted.
API_KEYS_IND = 0
# Absolute directory of this script; all data files live alongside it.
CWD = os.path.dirname(os.path.abspath(__file__))
# NOTE(review): MODEL_DATABASE appears unused — load_general_licenses()
# hardcodes "modeling_dataset.db" instead; confirm which filename is intended.
MODEL_DATABASE = f"{CWD}" f"/model_dataset.db"
PSE_KEY = query_secrets.PSE_KEY

# Short license codes mapped to the Custom Search Engine "rights" labels.
RIGHTS_MAP = {
    "by": "cc_attribute",
    "sa": "cc_sharealike",
    "nc": "cc_noncommercial",
    "nd": "cc_nonderived",
    "publicdomain": "cc_publicdomain",
}


def get_rights(license_type):
    """Return the CSE ``rights`` labels whose short code appears in
    *license_type* (e.g. ``licenses/by-nc/4.0``), in RIGHTS_MAP order.
    """
    labels = []
    for code, label in RIGHTS_MAP.items():
        if code in license_type:
            labels.append(label)
    return labels


def get_license_map():
    """Group the Creative Commons legal-tool paths into one pandas Series
    of distinct license paths per general license family.

    Reads ``legal-tool-paths.txt`` (one tool path per line) from the
    script directory and returns a dict mapping each family key (``by``,
    ``by-sa``, ..., ``publicdomain``) to the Series of unique license
    paths matching that family's pattern.
    """
    tool_paths = pd.read_csv(f"{CWD}/legal-tool-paths.txt", header=None)
    # Capture the first three path segments, e.g. "licenses/by/4.0".
    license_pattern = r"((?:[^/]+/){2}(?:[^/]+)).*"
    license_pattern_map = {
        "by": "licenses/by/",
        "by-sa": "licenses/by-sa/",
        "by-nc": "licenses/by-nc/",
        "by-nc-sa": "licenses/by-nc-sa/",
        "by-nd": "licenses/by-nd/",
        "by-nc-nd": "licenses/by-nc-nd/|licenses/by-nd-nc/",
        "publicdomain": "publicdomain/",
    }
    unique_types = pd.Series(
        tool_paths[0]
        .str.extract(license_pattern, expand=False)
        .dropna()
        .unique()
    )
    license_series_map = {}
    for family, pattern in license_pattern_map.items():
        license_series_map[family] = unique_types[
            unique_types.str.contains(pattern)
        ]
    return license_series_map


def get_api_endpoint(license_type, license_rights, start):
    """Build the Google Programmable Search Engine request URL.

    Parameters
    ----------
    license_type : str
        License path fragment, e.g. ``licenses/by-nc/4.0``.
    license_rights : list[str]
        ``rights`` filter labels for this license (see ``get_rights``).
    start : int
        1-based index of the first search result to request.

    Returns
    -------
    str or None
        The request URL, or ``None`` once every API key is depleted.
    """
    # Targeted except instead of catching Exception and testing
    # isinstance(e, IndexError): only the key lookup can legitimately fail.
    try:
        api_key = API_KEYS[API_KEYS_IND]
    except IndexError:
        print("IndexError: Depleted all API Keys provided", file=sys.stderr)
        return None
    base_url = (
        r"https://customsearch.googleapis.com/customsearch/v1"
        f"?key={api_key}&cx={PSE_KEY}&"
        f"q=-fileType%3Apdf%20-inurl%3Apdf%20-pdf&"
        f"start={start}&"
        f"m12&"  # Third Layer Strictness
    )
    # NOTE(review): there is no "/" (%2F) between ".org" and the license
    # path in linkSite — confirm this is the intended parameter value.
    return (
        f"{base_url}&linkSite=creativecommons.org"
        f'{license_type.replace("/", "%2F")}'
        # "%7C" is the URL-encoded "|" separator for the CSE ``rights``
        # parameter; the original "%7" was a truncated escape.
        f"&rights={'%7C'.join(license_rights)}"
    )


def get_api_response(license_type, start, retry_on_empty=2):
    """Query the Custom Search API for pages under *license_type*.

    Transient HTTP failures are retried at the transport layer via
    urllib3's ``Retry``; an ``HTTPError`` that escapes (e.g. 429 quota
    depletion, which is deliberately absent from ``status_forcelist``)
    rotates to the next API key and re-issues the request.  A response
    without an ``items`` key is re-requested up to *retry_on_empty*
    times.

    Parameters
    ----------
    license_type : str
        License path fragment, e.g. ``licenses/by-nc/4.0``.
    start : int
        1-based index of the first search result to request.
    retry_on_empty : int
        Remaining retries for item-less responses.

    Returns
    -------
    list or dict
        The list of search-result items, or ``{}`` when none could be
        obtained.
    """
    global API_KEYS_IND
    request_url = get_api_endpoint(
        license_type, get_rights(license_type), start
    )
    try:
        max_retries = Retry(
            total=5,
            backoff_factor=10,
            status_forcelist=[400, 403, 408, 500, 502, 503, 504],
            # 429 is Quota Limit Exceeded, which will be handled alternatively
        )
        session = requests.Session()
        session.mount("https://", HTTPAdapter(max_retries=max_retries))
        with session.get(request_url) as response:
            response.raise_for_status()
            search_data = response.json()
        return search_data["items"]
    # Targeted except clauses replace the original broad
    # ``except Exception`` + isinstance() dispatch.
    except KeyError:
        # Payload had no "items": retry a bounded number of times,
        # then give up with an empty result.
        if retry_on_empty:
            return get_api_response(license_type, start, retry_on_empty - 1)
        return {}
    except requests.exceptions.HTTPError:
        # Treat as quota depletion: rotate to the next API key and retry.
        API_KEYS_IND += 1
        print("Changing API KEYS due to depletion of quota", file=sys.stderr)
        return get_api_response(license_type, start)
    except Exception:
        print(f"Request URL was {request_url}", file=sys.stderr)
        raise


def get_address_entries(web_url, content_char_count=5000, timeout=30):
    """Fetch *web_url* and return a ``(url, title, text)`` tuple.

    Downloads the page, strips ``<script>``/``<style>`` nodes, and
    returns the first *content_char_count* characters of the visible
    text.  Any failure (network error, timeout, parse error) yields
    ``None`` so callers can simply skip the entry.

    Parameters
    ----------
    web_url : str
        Address to scrape.
    content_char_count : int
        Maximum number of text characters to keep.
    timeout : float
        Seconds before the HTTP request is abandoned; the original call
        had no timeout and could hang indefinitely on a dead host.

    Returns
    -------
    tuple or None
        ``(web_url, <title tag>, text_prefix)``, or ``None`` on failure.
    """
    try:
        web_contents = requests.get(web_url, timeout=timeout).text
        encoding = EncodingDetector.find_declared_encoding(
            web_contents, is_html=True
        )
        soup = BeautifulSoup(web_contents, "lxml", from_encoding=encoding)
        for script in soup(["script", "style"]):
            script.extract()
        parse_result = soup.get_text(" ", strip=True)
        return (web_url, soup.title, parse_result[:content_char_count])
    except Exception:
        # Deliberate best-effort: a page that cannot be fetched or
        # parsed is skipped by the caller, never fatal.
        return None


def get_license_type_sample_df(license_type):
    """Collect search-result samples for one license type.

    Pages through up to 100 Custom Search results (10 per request),
    scrapes each non-PDF/non-TXT link, and returns a DataFrame with
    ``license``/``url``/``title``/``contents`` columns.
    """
    samples = {
        "license": [],
        "url": [],
        "title": [],
        "contents": [],
    }
    for start_ind in range(1, 101, 10):
        for entry in get_api_response(license_type, start_ind):
            link = entry["link"]
            # Skip links that point at raw documents rather than pages.
            if ".pdf" in link or ".txt" in link:
                continue
            scraped = get_address_entries(link)
            if scraped is None:
                continue
            url, title, contents = scraped
            samples["license"].append(license_type)
            samples["url"].append(url)
            samples["title"].append(str(title))
            samples["contents"].append(contents)
    print(f"DEBUG: {license_type} has been sampled.")
    return pd.DataFrame(samples)


def get_license_series_sample_df(general_license_series):
    """Sample every license type in *general_license_series* and return
    the per-type DataFrames concatenated into one.
    """
    frames = []
    for license_type in general_license_series:
        frames.append(get_license_type_sample_df(license_type))
    return pd.concat(frames)


def load_general_licenses():
    """Sample every general license family and append the results to the
    SQLite modeling dataset, one table per family.
    """
    # NOTE(review): this hardcodes "modeling_dataset.db" while the unused
    # module constant MODEL_DATABASE points at "model_dataset.db" —
    # confirm which filename is intended.
    engine = sqlalchemy.create_engine(f"sqlite:///{CWD}/modeling_dataset.db")
    # Probe connectivity, but close the probe connection instead of
    # leaking it (the original left the connection dangling).
    engine.connect().close()
    try:
        license_map = get_license_map()
        for general_type in license_map:
            sampled_df = get_license_series_sample_df(
                license_map[general_type]
            )
            sampled_df.to_sql(general_type, engine, if_exists="append")
    finally:
        # Release the engine's pooled connections even if sampling fails.
        engine.dispose()


def main():
    """Entry point: build and persist the modeling training dataset."""
    load_general_licenses()


if __name__ == "__main__":
    try:
        main()
    except SystemExit as e:
        # Preserve the requested exit status.
        sys.exit(e.code)
    except KeyboardInterrupt:
        print("INFO (130) Halted via KeyboardInterrupt.", file=sys.stderr)
        sys.exit(130)
    except Exception:
        print("ERROR (1) Unhandled exception:", file=sys.stderr)
        # traceback.print_exc() writes the trace itself and returns None;
        # the original wrapped it in print(), emitting a spurious "None".
        traceback.print_exc(file=sys.stderr)
        sys.exit(1)
Loading