Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Add cache management features #799

Merged
merged 9 commits into from
Aug 22, 2024
19 changes: 19 additions & 0 deletions docs/intro/tutorial07.rst
Original file line number Diff line number Diff line change
Expand Up @@ -64,6 +64,25 @@ This will download the pato.db sqlite file once, and cache it.

PyStow is used to cache the file, and the default location is ``~/.data/oaklib``.

By default, a cached SQLite file will be automatically refreshed (downloaded
again) if it is older than 7 days. That behavior can be controlled with the
global ``--caching`` option. For example, to force OAK to always download the
file regardless of its age:

.. code-block::

runoak --caching=refresh -i sqlite:obo:pato search t~shape

Other possible values for the ``--caching`` option include:

- ``no-refresh`` to prevent OAK from re-downloading the file even if it is older
than 7 days;
- ``Xd`` to refresh a cached file older than *X* days;
- ``Xw`` to refresh a cached file older than *X* weeks.

You may also use the ``cache-clear`` command to force clearing any cached
SQLite file at any time.
gouttegd marked this conversation as resolved.
Show resolved Hide resolved

Building your own SQLite files
------------------------------

Expand Down
40 changes: 21 additions & 19 deletions src/oaklib/cli.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,14 +9,12 @@
# See https://stackoverflow.com/questions/47972638/how-can-i-define-the-order-of-click-sub-commands-in-help
import json
import logging
import os
import statistics as stats
import sys
from collections import defaultdict
from enum import Enum, unique
from itertools import chain
from pathlib import Path
from time import time
from types import ModuleType
from typing import (
Any,
Expand All @@ -28,7 +26,6 @@

import click
import kgcl_schema.grammar.parser as kgcl_parser
import pystow
import sssom.writers as sssom_writers
import sssom_schema
import yaml
Expand All @@ -42,6 +39,7 @@

import oaklib.datamodels.taxon_constraints as tcdm
from oaklib import datamodels
from oaklib.constants import FILE_CACHE
from oaklib.converters.logical_definition_flattener import LogicalDefinitionFlattener
from oaklib.datamodels import synonymizer_datamodel
from oaklib.datamodels.association import RollupGroup
Expand Down Expand Up @@ -149,6 +147,7 @@
generate_disjoint_class_expressions_axioms,
)
from oaklib.utilities.basic_utils import pairs_as_dict
from oaklib.utilities.caching import CachePolicy
from oaklib.utilities.iterator_utils import chunk
from oaklib.utilities.kgcl_utilities import (
generate_change_id,
Expand Down Expand Up @@ -568,6 +567,13 @@ def _apply_changes(impl, changes: List[kgcl.Change]):
show_default=True,
help="If set, will profile the command",
)
@click.option(
"--caching",
type=CachePolicy.ClickType,
default="1w",
show_default=True,
help="Set the cache management policy",
)
def main(
verbose: int,
quiet: bool,
Expand All @@ -587,6 +593,7 @@ def main(
prefix,
profile: bool,
import_depth: Optional[int],
caching: CachePolicy,
**kwargs,
):
"""
Expand Down Expand Up @@ -635,6 +642,7 @@ def exit():
import requests_cache

requests_cache.install_cache(requests_cache_db)
FILE_CACHE.policy = caching
resource = OntologyResource()
resource.slug = input
settings.autosave = autosave
Expand Down Expand Up @@ -5454,12 +5462,14 @@ def cache_ls():
"""
List the contents of the pystow oaklib cache.

TODO: this currently only works on unix-based systems.
"""
directory = pystow.api.join("oaklib")
command = f"ls -al {directory}"
click.secho(f"[pystow] {command}", fg="cyan", bold=True)
os.system(command) # noqa:S605
units = ["B", "KB", "MB", "GB", "TB"]
for path, size, mtime in FILE_CACHE.get_contents(subdirs=True):
i = 0
while size > 1024 and i < len(units) - 1:
size /= 1024
i += 1
click.echo(f"{path} ({size:.2f} {units[i]}, {mtime:%Y-%m-%d})")


@main.command()
Expand All @@ -5475,17 +5485,9 @@ def cache_clear(days_old: int):
Clear the contents of the pystow oaklib cache.

"""
directory = pystow.api.join("oaklib")
now = time()
for item in Path(directory).glob("*"):
if ".db" not in str(item):
continue
mtime = item.stat().st_mtime
curr_days_old = (int(now) - int(mtime)) / 86400
logging.info(f"{item} is {curr_days_old}")
if curr_days_old > days_old:
click.echo(f"Deleting {item} which is {curr_days_old}")
item.unlink()

for name, _, age in FILE_CACHE.clear(subdirs=False, older_than=days_old, pattern="*.db*"):
click.echo(f"Deleted {name} which was {age.days} days old")


@main.command()
Expand Down
4 changes: 4 additions & 0 deletions src/oaklib/constants.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,9 +2,13 @@

import pystow

from oaklib.utilities.caching import FileCache

__all__ = [
"OAKLIB_MODULE",
"FILE_CACHE",
]

OAKLIB_MODULE = pystow.module("oaklib")
FILE_CACHE = FileCache(OAKLIB_MODULE, '1w')
TIMEOUT_SECONDS = 30
4 changes: 2 additions & 2 deletions src/oaklib/implementations/llm_implementation.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,6 @@
from dataclasses import dataclass
from typing import TYPE_CHECKING, Dict, Iterable, Iterator, List, Optional, Tuple

import pystow
from linkml_runtime.dumpers import yaml_dumper
from sssom_schema import Mapping
from tenacity import (
Expand All @@ -19,6 +18,7 @@
)

from oaklib import BasicOntologyInterface
from oaklib.constants import FILE_CACHE
from oaklib.datamodels.class_enrichment import ClassEnrichmentResult
from oaklib.datamodels.item_list import ItemList
from oaklib.datamodels.obograph import DefinitionPropertyValue
Expand Down Expand Up @@ -148,7 +148,7 @@ def config_to_prompt(configuration: Optional[ValidationConfiguration]) -> Option

for obj in configuration.documentation_objects:
if obj.startswith("http:") or obj.startswith("https:"):
path = pystow.ensure("oaklib", "documents", url=obj)
path = FILE_CACHE.ensure("documents", url=obj)
else:
path = obj
with open(path) as f:
Expand Down
4 changes: 2 additions & 2 deletions src/oaklib/implementations/sqldb/sql_implementation.py
Original file line number Diff line number Diff line change
Expand Up @@ -63,7 +63,7 @@

import oaklib.datamodels.ontology_metadata as om
import oaklib.datamodels.validation_datamodel as vdm
from oaklib.constants import OAKLIB_MODULE
from oaklib.constants import FILE_CACHE
from oaklib.datamodels import obograph, ontology_metadata
from oaklib.datamodels.association import Association
from oaklib.datamodels.obograph import (
Expand Down Expand Up @@ -342,7 +342,7 @@ def __post_init__(self):
# Option 1 uses direct URL construction:
url = f"https://s3.amazonaws.com/bbop-sqlite/{prefix}.db.gz"
logging.info(f"Ensuring gunzipped for {url}")
db_path = OAKLIB_MODULE.ensure_gunzip(url=url, autoclean=False)
db_path = FILE_CACHE.ensure_gunzip(url=url, autoclean=False)
# Option 2 uses botocore to interface with the S3 API directly:
# db_path = OAKLIB_MODULE.ensure_from_s3(s3_bucket="bbop-sqlite", s3_key=f"{prefix}.db")
locator = f"sqlite:///{db_path}"
Expand Down
Loading
Loading