From 48dc794ccc53c4bbcca7d861737fd8d7ac61a7d9 Mon Sep 17 00:00:00 2001 From: Trayan Azarov Date: Thu, 18 Jan 2024 05:52:15 +0200 Subject: [PATCH] feat: Initial commit --- .dockerignore | 80 +++++++++++++++++++++++++ .github/workflows/release.yml | 44 ++++++++++++++ .github/workflows/test.yaml | 32 ++++++++++ .gitignore | 108 ++++++++++++++++++++++++++++++++++ .pre-commit-config.yaml | 34 +++++++++++ Dockerfile | 11 ++++ LICENSE.md | 9 +++ Makefile | 18 ++++++ README.md | 73 +++++++++++++++++++++++ SECURITY.md | 19 ++++++ chroma_ops/__init__.py | 0 chroma_ops/main.py | 20 +++++++ chroma_ops/utils.py | 35 +++++++++++ chroma_ops/wal_clean.py | 69 ++++++++++++++++++++++ chroma_ops/wal_commit.py | 58 ++++++++++++++++++ chroma_ops/wal_export.py | 64 ++++++++++++++++++++ experiments/wal_commit.ipynb | 97 ++++++++++++++++++++++++++++++ mypy.ini | 0 pyproject.toml | 41 +++++++++++++ tests/__init__.py | 0 tests/test_wal_clean.py | 36 ++++++++++++ tests/test_wal_commit.py | 49 +++++++++++++++ tests/test_wal_export.py | 36 ++++++++++++ 23 files changed, 933 insertions(+) create mode 100644 .dockerignore create mode 100644 .github/workflows/release.yml create mode 100644 .github/workflows/test.yaml create mode 100644 .gitignore create mode 100644 .pre-commit-config.yaml create mode 100644 Dockerfile create mode 100644 LICENSE.md create mode 100644 Makefile create mode 100644 README.md create mode 100644 SECURITY.md create mode 100644 chroma_ops/__init__.py create mode 100644 chroma_ops/main.py create mode 100644 chroma_ops/utils.py create mode 100755 chroma_ops/wal_clean.py create mode 100644 chroma_ops/wal_commit.py create mode 100644 chroma_ops/wal_export.py create mode 100644 experiments/wal_commit.ipynb create mode 100644 mypy.ini create mode 100644 pyproject.toml create mode 100644 tests/__init__.py create mode 100644 tests/test_wal_clean.py create mode 100644 tests/test_wal_commit.py create mode 100644 tests/test_wal_export.py diff --git a/.dockerignore b/.dockerignore 
new file mode 100644 index 0000000..1f8d5cb --- /dev/null +++ b/.dockerignore @@ -0,0 +1,80 @@ +__pycache__/ +*.py[cod] +*$py.class +*.so +.Python +build/ +develop-eggs/ +dist/ +downloads/ +eggs/ +.eggs/ +lib/ +lib64/ +parts/ +sdist/ +var/ +wheels/ +share/python-wheels/ +*.egg-info/ +.installed.cfg +*.egg +MANIFEST +*.manifest +*.spec +pip-log.txt +pip-delete-this-directory.txt +htmlcov/ +.tox/ +.nox/ +.coverage +.coverage.* +.cache +nosetests.xml +coverage.xml +*.cover +*.py,cover +.hypothesis/ +.pytest_cache/ +cover/ +*.mo +*.pot +*.log +local_settings.py +db.sqlite3 +db.sqlite3-journal +instance/ +.webassets-cache +.scrapy +docs/_build/ +.pybuilder/ +target/ +.ipynb_checkpoints +profile_default/ +ipython_config.py +.pdm.toml +__pypackages__/ +celerybeat-schedule +celerybeat.pid +*.sage.py +.env +.venv +env/ +venv/ +ENV/ +env.bak/ +venv.bak/ +.spyderproject +.spyproject +.ropeproject +/site +.mypy_cache/ +.dmypy.json +dmypy.json +.pyre/ +.pytype/ +cython_debug/ +tests/ +.github/ +.pre-commit-config.yaml +Makefile diff --git a/.github/workflows/release.yml b/.github/workflows/release.yml new file mode 100644 index 0000000..c090325 --- /dev/null +++ b/.github/workflows/release.yml @@ -0,0 +1,44 @@ +name: Publish Python Package + +on: + release: + types: [created] +permissions: + actions: write +jobs: + build-n-publish: + runs-on: ubuntu-latest + steps: + - uses: actions/checkout@v2 + + - name: Set up Python 3.9 + uses: actions/setup-python@v2 + with: + python-version: '3.9' + + - name: Install dependencies + run: | + set -e + python -m pip install --upgrade pip + curl -sSL https://install.python-poetry.org | python3 - + + shell: bash +# - name: Version bump +# run: | +# set -e +# poetry version ${{ github.event.release.tag_name }} +# git add ./pyproject.toml +# git config --global user.name "Release Bot" +# git config --global user.email "opensource@amikos.tech" +# git commit -m "Change version to ${{ github.event.release.tag_name }}" --allow-empty +# git push 
origin HEAD:main +# shell: bash + + - name: Publish package to PyPI + run: | + set -e + poetry config pypi-token.pypi ${{ secrets.PYPI_API_TOKEN }} + poetry publish -n --build + shell: bash +# poetry config repositories.publish ${{ inputs.PUBLISH_REGISTRY }} +# poetry publish -p ${{ secrets.PYPI_API_TOKEN }} -u ${{ inputs.PUBLISH_REGISTRY_USERNAME }} -r publish --build diff --git a/.github/workflows/test.yaml b/.github/workflows/test.yaml new file mode 100644 index 0000000..9f2fb09 --- /dev/null +++ b/.github/workflows/test.yaml @@ -0,0 +1,32 @@ +on: + push: + branches: + - main + pull_request: + branches: + - main + +jobs: + test: + runs-on: [ "ubuntu-latest" ] + steps: + - uses: actions/checkout@v2 + - name: Set up Python 3.9 + uses: actions/setup-python@v2 + with: + python-version: "3.9" + - name: Install dependencies + run: | + set -e + python -m pip install --upgrade pip + curl -sSL https://install.python-poetry.org | python3 - + # - name: Lint with flake8 + # run: | + # set -e + # poetry update + # pre-commit run --from-ref HEAD~1 --to-ref HEAD + - name: Run tests + run: | + set -e + poetry update + poetry run pytest diff --git a/.gitignore b/.gitignore new file mode 100644 index 0000000..fe189c2 --- /dev/null +++ b/.gitignore @@ -0,0 +1,108 @@ +.idea/**/workspace.xml +.idea/**/tasks.xml +.idea/**/usage.statistics.xml +.idea/**/dictionaries +.idea/**/shelf +.idea/**/aws.xml +.idea/**/contentModel.xml +.idea/**/dataSources/ +.idea/**/dataSources.ids +.idea/**/dataSources.local.xml +.idea/**/sqlDataSources.xml +.idea/**/dynamic.xml +.idea/**/uiDesigner.xml +.idea/**/dbnavigator.xml +.idea/**/gradle.xml +.idea/**/libraries +cmake-build-*/ +.idea/**/mongoSettings.xml +*.iws +out/ +.idea_modules/ +atlassian-ide-plugin.xml +.idea/replstate.xml +.idea/sonarlint/ +com_crashlytics_export_strings.xml +crashlytics.properties +crashlytics-build.properties +fabric.properties +.idea/httpRequests +.idea/caches/build_file_checksums.ser +__pycache__/ +*.py[cod] +*$py.class 
"--extend-ignore=E203,E501,W503"
"types-protobuf", "kubernetes"] diff --git a/Dockerfile b/Dockerfile new file mode 100644 index 0000000..25fb9d3 --- /dev/null +++ b/Dockerfile @@ -0,0 +1,11 @@ +FROM chromadb/chroma:latest as base + +COPY ./ /chroma_ops + +WORKDIR /chroma_ops + +# install poetry +RUN pip install poetry && \ + poetry install --no-dev --no-interaction --no-ansi + +ENTRYPOINT ["poetry", "run","chops"] diff --git a/LICENSE.md b/LICENSE.md new file mode 100644 index 0000000..26e4d29 --- /dev/null +++ b/LICENSE.md @@ -0,0 +1,9 @@ +The MIT License + +Copyright (c) 2024 Amikos Tech Ltd. + +Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
diff --git a/Makefile b/Makefile new file mode 100644 index 0000000..4db35dd --- /dev/null +++ b/Makefile @@ -0,0 +1,18 @@ +lint: + @echo "Running pre-commit hooks" + @pre-commit run --all-files +pre-commit: + @echo "Linting last commit" + @pre-commit run --from-ref HEAD~1 --to-ref HEAD +dependencies: + @echo "Installing dependencies" + @poetry update +install: + @echo "Installing the project" + @poetry install +test: + @echo "Running tests" + @poetry run pytest +build-docker: + @echo "Building docker image" + @docker build -t chromadb-dp . diff --git a/README.md b/README.md new file mode 100644 index 0000000..310036f --- /dev/null +++ b/README.md @@ -0,0 +1,73 @@ +# ChromaDB Operations Tools + +Tiny collection of utilities to help you managed ChromaDB indices. + +WARNING: These tools rely on internal ChromaDB APIs and may break in the future. + +## ☠️☠️☠️ BEFORE YOU BEGIN ☠️☠️☠️ + +Before you use these tools make sure your ChromaDB persistent dir, on which you intend to run these tools, is backed up. + +## Installation + +```bash +pip install chromadb-ops +``` + +## Usage + +### WAL Commit + +This command ensures your WAL is committed to binary vector index (HNSW). + +```bash +chown commit-wal /path/to/persist_dir +``` + +### WAL Cleanup + +This command cleans up the committed portion of the WAL and VACUUMs the database. + +```bash +chown cleanup-wal /path/to/persist_dir +``` + +### WAL Export + +This commands exports the WAL to a `jsonl` file. The command can be useful in taking backups of the WAL. + +```bash +chown export-wal /path/to/persist_dir --out /path/to/export.jsonl +``` + +> Note: If --out or -o is not specified the command will print the output to stdout. + +### Using Docker + +> Note: You have to mount your persist directory into the container for the commands to work. + + +Building the image: + +```bash +docker build -t chops . 
+``` + +#### WAL Commit + +```bash +docker run -it --rm -v ./persist_dir:/chroma-data chops commit-wal /chroma-data +``` + +#### WAL Cleanup + +```bash +docker run -it --rm -v ./persist_dir:/chroma-data chops clean-wal /chroma-data +``` + +#### WAL Export + +```bash +docker run -it --rm -v ./persist_dir:/chroma-data -v ./backup:/backup chops export-wal /chroma-data --out /backup/export.jsonl +``` + diff --git a/SECURITY.md b/SECURITY.md new file mode 100644 index 0000000..e3e124d --- /dev/null +++ b/SECURITY.md @@ -0,0 +1,19 @@ +# Security Policy + +## Supported Versions + +Use this section to tell people about which versions of your project are +currently being supported with security updates. + +| Version | Supported | +| ------- | ------------------ | +| 0.0.x | :white_check_mark: | + + +## Reporting a Vulnerability + +Use this section to tell people how to report a vulnerability. + +Tell them where to go, how often they can expect to get an update on a +reported vulnerability, what to expect if the vulnerability is accepted or +declined, etc. 
diff --git a/chroma_ops/__init__.py b/chroma_ops/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/chroma_ops/main.py b/chroma_ops/main.py new file mode 100644 index 0000000..bc975c4 --- /dev/null +++ b/chroma_ops/main.py @@ -0,0 +1,20 @@ +import typer + +from chroma_ops.wal_commit import command as commit_wal_command +from chroma_ops.wal_clean import command as clean_wal_command +from chroma_ops.wal_export import command as export_wal_command + +app = typer.Typer(no_args_is_help=True, help="ChromaDB Ops Commands.") + +app.command(name="commit-wal", help="Commit WAL to HNSW lib binary index", no_args_is_help=True)( + commit_wal_command +) + +app.command(name="clean-wal", help="Cleans up WAL and VACUUM the SQLite DB.", no_args_is_help=True)( + clean_wal_command +) +app.command(name="export-wal", help="Exports the WAL to a jsonl file.", no_args_is_help=True)( + export_wal_command +) +if __name__ == "__main__": + app() diff --git a/chroma_ops/utils.py b/chroma_ops/utils.py new file mode 100644 index 0000000..fbc0cfe --- /dev/null +++ b/chroma_ops/utils.py @@ -0,0 +1,35 @@ +import os +from typing import List + +import hnswlib + + +def validate_chroma_persist_dir(persist_dir: str) -> None: + if not os.path.exists(persist_dir): + raise ValueError(f"Persist dir ({persist_dir}) does not exist") + if not os.path.exists(f"{persist_dir}/chroma.sqlite3"): + raise ValueError( + f"{persist_dir} does not appear to be valid ChromaDB persist directory" + ) + + +def get_hnsw_index_ids(filename: str, space: str = "l2", dim: int = 384) -> List[int]: + index = hnswlib.Index(space=space, dim=dim) + index.load_index( + filename, + is_persistent_index=True, + max_elements=100000, + ) + ids = index.get_ids_list().copy() + index.close_file_handles() + return ids + + +def get_dir_size(path: str) -> int: + total_size = 0 + for dirpath, dirnames, filenames in os.walk(path): + for f in filenames: + fp = os.path.join(dirpath, f) + if os.path.exists(fp): + total_size += 
os.path.getsize(fp) + return total_size diff --git a/chroma_ops/wal_clean.py b/chroma_ops/wal_clean.py new file mode 100755 index 0000000..e754e77 --- /dev/null +++ b/chroma_ops/wal_clean.py @@ -0,0 +1,69 @@ +#!/usr/bin/env python3 +import argparse +import os +import sqlite3 +import hnswlib +import typer +from chromadb.segment.impl.vector.local_persistent_hnsw import PersistentData + +from chroma_ops.utils import validate_chroma_persist_dir, get_hnsw_index_ids, get_dir_size + + +def clean_wal(persist_dir: str): + validate_chroma_persist_dir(persist_dir) + print("Size before: ", get_dir_size(persist_dir)) + # TODO add path join here + sql_file = os.path.join(persist_dir, "chroma.sqlite3") + conn = sqlite3.connect(sql_file) + # conn = sqlite3.connect(f"file:{sql_file}?mode=ro", uri=True) + + cursor = conn.cursor() + + query = "SELECT s.id as 'segment',s.topic as 'topic', c.id as 'collection' , c.dimension as 'dimension' FROM segments s LEFT JOIN collections c ON s.collection = c.id WHERE s.scope = 'VECTOR';" + + cursor.execute(query) + + results = cursor.fetchall() + wal_cleanup_queries = [] + for row in results: + metadata_pickle = os.path.join(persist_dir, row[0], 'index_metadata.pickle') + if os.path.exists(metadata_pickle): + metadata = PersistentData.load_from_file(metadata_pickle) + wal_cleanup_queries.append( + f"DELETE FROM embeddings_queue WHERE seq_id < {metadata.max_seq_id} AND topic='{row[1]}';" + ) + else: + hnsw_space = cursor.execute( + "select str_value from collection_metadata where collection_id=? 
and key='hnsw:space'", + (row[2],), + ).fetchone() + hnsw_space = "l2" if hnsw_space is None else hnsw_space[0] + list_of_ids = get_hnsw_index_ids( + f"{os.path.join(persist_dir, row[0])}", hnsw_space, row[3] + ) + batch_size = 100 + for batch in range(0, len(list_of_ids), batch_size): + wal_cleanup_queries.append( + f"DELETE FROM embeddings_queue WHERE seq_id IN ({','.join([str(i) for i in list_of_ids[batch:batch + batch_size]])});" + ) + if len(wal_cleanup_queries) > 0: + print("Cleaning up WAL") + wal_cleanup_queries.append("VACUUM;") + cursor.executescript("\n".join(wal_cleanup_queries)) + # Close the cursor and connection + cursor.close() + conn.close() + print("Size after: ", get_dir_size(persist_dir)) + + +def command( + persist_dir: str = typer.Argument(..., help="The persist directory"), +): + clean_wal(persist_dir) + + +if __name__ == "__main__": + parser = argparse.ArgumentParser() + parser.add_argument("persist_dir", type=str) + arg = parser.parse_args() + clean_wal(arg.persist_dir) diff --git a/chroma_ops/wal_commit.py b/chroma_ops/wal_commit.py new file mode 100644 index 0000000..52b3fa0 --- /dev/null +++ b/chroma_ops/wal_commit.py @@ -0,0 +1,58 @@ +#!/usr/bin/env python3 +import argparse +import os + +import chromadb +import typer + +from chromadb.segment.impl.vector.local_persistent_hnsw import PersistentData +from chromadb.segment import VectorReader +from chromadb.types import SegmentScope, Operation + +from chroma_ops.utils import validate_chroma_persist_dir + + +def commit_wal(persist_dir: str): + """Note this uses internal ChromaDB APIs which may change at any moment.""" + + validate_chroma_persist_dir(persist_dir) + client = chromadb.PersistentClient(path=persist_dir) + vector_segments = [ + s + for s in client._server._sysdb.get_segments() + if s["scope"] == SegmentScope.VECTOR + ] + for s in vector_segments: + col = client._server._get_collection( + s["collection"] + ) # load the collection and apply WAL + 
client._server._manager.hint_use_collection( + s["collection"], Operation.ADD + ) # Add hint to load the index into memory + segment = client._server._manager.get_segment( + s["collection"], VectorReader + ) # Get the segment instance + segment._apply_batch(segment._curr_batch) # Apply the current WAL batch + + segment._persist() # persist the index + metadata = PersistentData.load_from_file( + os.path.join(persist_dir, str(s["id"]), "index_metadata.pickle") + ) # load the metadata after persisting + print( + f"Processing index for collection {col['name']} ({s['id']}) - " + f"total vectors in index {len(segment._index.get_ids_list())}" + ) + segment.close_persistent_index() + + +def command( + persist_dir: str = typer.Argument(..., help="The persist directory"), +): + commit_wal(persist_dir) + + +if __name__ == "__main__": + parser = argparse.ArgumentParser() + parser.add_argument("persist_dir", type=str) + arg = parser.parse_args() + commit_wal(arg.persist_dir) diff --git a/chroma_ops/wal_export.py b/chroma_ops/wal_export.py new file mode 100644 index 0000000..9617238 --- /dev/null +++ b/chroma_ops/wal_export.py @@ -0,0 +1,64 @@ +import argparse +import base64 +import json +import os +import sqlite3 +import sys +from contextlib import contextmanager + +import typer + +from chroma_ops.utils import validate_chroma_persist_dir + + +@contextmanager +def smart_open(filename=None): + if filename: + fh = open(filename, 'w') + else: + fh = sys.stdout + + try: + yield fh + finally: + if filename: + fh.close() + + +def export_wal(persist_dir: str, output_file: str): + validate_chroma_persist_dir(persist_dir) + sql_file = os.path.join(persist_dir, "chroma.sqlite3") + conn = sqlite3.connect(f"file:{sql_file}?mode=ro", uri=True) + cursor = conn.cursor() + query = "SELECT * FROM embeddings_queue ORDER BY seq_id ASC;" + cursor.execute(query) + exported_rows = 0 + with smart_open(output_file) as json_file: + column_names = [description[0] for description in cursor.description] + 
for row in cursor: + row_data = {} + for i, column_name in enumerate(column_names): + if column_name == 'vector': + row_data[column_name] = base64.b64encode(row[i]).decode() + else: + row_data[column_name] = row[i] + json_file.write(json.dumps(row_data) + "\n") + exported_rows += 1 + + conn.close() + typer.echo(f"Exported {exported_rows} rows", file=sys.stderr) + + +def command( + persist_dir: str = typer.Argument(..., help="The persist directory"), + out: str = typer.Option(None, "--out", "-o", help="The output jsonl file"), +): + export_wal(persist_dir, out) + + +if __name__ == "__main__": + parser = argparse.ArgumentParser() + parser.add_argument("persist_dir", type=str) + parser.add_argument("--out", type=str, default=None) + arg = parser.parse_args() + export_wal(arg.persist_dir, arg.out) diff --git a/experiments/wal_commit.ipynb b/experiments/wal_commit.ipynb new file mode 100644 index 0000000..868bc04 --- /dev/null +++ b/experiments/wal_commit.ipynb @@ -0,0 +1,97 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": null, + "id": "initial_id", + "metadata": { + "collapsed": true + }, + "outputs": [], + "source": [ + "import chromadb\n", + "\n", + "client = chromadb.PersistentClient(path=\"chroma-test-compact\")\n" + ] + }, + { + "cell_type": "code", + "outputs": [], + "source": [ + "from chromadb.segment.impl.vector.local_persistent_hnsw import PersistentData\n", + "from chromadb.segment import VectorReader\n", + "from chromadb.types import SegmentScope, Operation\n", + "\n", + "# client._server._manager\n", + "vector_segments = [s for s in client._server._sysdb.get_segments() if s['scope'] == SegmentScope.VECTOR]\n", + "\n", + "for s in vector_segments:\n", + " col=client._server._get_collection(s['collection'])\n", + " client._server._manager.hint_use_collection(s['collection'], Operation.ADD)\n", + " segment=client._server._manager.get_segment(s['collection'], VectorReader)\n", + " segment._apply_batch(segment._curr_batch)\n", + " 
segment._persist()\n", + " metadata = PersistentData.load_from_file(\n", + " f\"chroma-test-compact/{s['id']}/index_metadata.pickle\")\n", + " print(len(metadata.id_to_label),col['name'] , metadata.max_seq_id)\n", + " segment.close_persistent_index()" + ], + "metadata": { + "collapsed": false + }, + "id": "8017e49aa42809d6", + "execution_count": null + }, + { + "cell_type": "code", + "outputs": [], + "source": [ + "import chromadb\n", + "\n", + "client = chromadb.PersistentClient(path=\"chroma-test-compact\")\n", + "\n", + "client.list_collections()\n", + "\n", + "col = client.get_or_create_collection(\"chroma-qna7\")\n", + "\n", + "res= col.get(include=[\"embeddings\"])\n", + "print(len(res[\"ids\"]),len(res[\"embeddings\"]))" + ], + "metadata": { + "collapsed": false + }, + "id": "f69e9a226404f06f", + "execution_count": null + }, + { + "cell_type": "code", + "outputs": [], + "source": [], + "metadata": { + "collapsed": false + }, + "id": "c6689ca863699e30" + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 2 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython2", + "version": "2.7.6" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +} diff --git a/mypy.ini b/mypy.ini new file mode 100644 index 0000000..e69de29 diff --git a/pyproject.toml b/pyproject.toml new file mode 100644 index 0000000..85bc789 --- /dev/null +++ b/pyproject.toml @@ -0,0 +1,41 @@ +[tool.poetry] +name = "chromadb-ops" +version = "0.0.1" +description = "Tiny unofficial ChromaDB operations CLI" +authors = ["Trayan Azarov "] +license = "MIT" +readme = "README.md" +packages = [{ include = "chroma_ops" }] +[tool.poetry.urls] +"Bug Tracker" = "https://github.com/amikos-tech/chromadb-ops/issues" +"Homepage" = "https://github.com/amikos-tech/chromadb-ops/" +"Source" 
= "https://github.com/amikos-tech/chromadb-ops/" + +[tool.poetry.scripts] +"chops" = "chroma_ops.main:app" + +[tool.poetry.dependencies] +python = ">=3.9" +chromadb = ">=0.4.22" +typer = {extras = ["all"], version = "^0.9.0"} + + +[build-system] +requires = ["poetry-core"] +build-backend = "poetry.core.masonry.api" + + +[tool.poetry.group.dev.dependencies] +pytest = "^7.4.3" +black = "23.3.0" +pre-commit = "^3.6.0" +hypothesis = "^6.92.0" + + +[tool.pytest.ini_options] +minversion = "6.0" +addopts = "-rA" +testpaths = [ + "tests", + "integration", +] diff --git a/tests/__init__.py b/tests/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/tests/test_wal_clean.py b/tests/test_wal_clean.py new file mode 100644 index 0000000..8a5404d --- /dev/null +++ b/tests/test_wal_clean.py @@ -0,0 +1,36 @@ +import os.path +import sqlite3 +import tempfile +import uuid + +from hypothesis import given, settings +import hypothesis.strategies as st + +import chromadb +from chromadb.segment import VectorReader +from chromadb.types import SegmentScope + +from chroma_ops.utils import get_dir_size +from chroma_ops.wal_clean import clean_wal + + +@given(records_to_add=st.integers(min_value=1000, max_value=10001)) +@settings(deadline=60000) +def test_basic_clean(records_to_add: int): + with tempfile.TemporaryDirectory() as temp_dir: + client = chromadb.PersistentClient(path=temp_dir) + col = client.create_collection("test") + ids_documents = [(f"{uuid.uuid4()}", f"document {i}", [0.1] * 1536) for i in range(records_to_add)] + ids, documents, embeddings = zip(*ids_documents) + col.add(ids=list(ids), documents=list(documents), embeddings=list(embeddings)) + size_before = get_dir_size(temp_dir) + sql_file = os.path.join(temp_dir, "chroma.sqlite3") + conn = sqlite3.connect(f"file:{sql_file}?mode=ro", uri=True) + cursor = conn.cursor() + count = cursor.execute("SELECT count(*) FROM embeddings_queue") + assert count.fetchone()[0] == records_to_add + clean_wal(temp_dir) + count = 
cursor.execute("SELECT count(*) FROM embeddings_queue") + assert count.fetchone()[0] < records_to_add + size_after = get_dir_size(temp_dir) + assert size_after < size_before diff --git a/tests/test_wal_commit.py b/tests/test_wal_commit.py new file mode 100644 index 0000000..af02b4b --- /dev/null +++ b/tests/test_wal_commit.py @@ -0,0 +1,49 @@ +import os.path +import sqlite3 +import tempfile +import uuid + +from hypothesis import given, settings +import hypothesis.strategies as st + +import chromadb +from chromadb.segment import VectorReader +from chromadb.types import SegmentScope + +from chroma_ops.wal_commit import commit_wal + + +@given(records_to_add=st.integers(min_value=1, max_value=1001)) +@settings(deadline=60000) +def test_basic_commit(records_to_add: int): + with tempfile.TemporaryDirectory() as temp_dir: + client = chromadb.PersistentClient(path=temp_dir) + col = client.create_collection("test") + ids_documents = [(f"{uuid.uuid4()}", f"document {i}", [0.1] * 1536) for i in range(records_to_add)] + ids, documents, embeddings = zip(*ids_documents) + col.add(ids=list(ids), documents=list(documents), embeddings=list(embeddings)) + sql_file = os.path.join(temp_dir, "chroma.sqlite3") + conn = sqlite3.connect(f"file:{sql_file}?mode=ro", uri=True) + + vector_segments = [ + client._server._manager.get_segment( + s["collection"], VectorReader + ) + for s in client._server._sysdb.get_segments() + if s["scope"] == SegmentScope.VECTOR + ] + for segment in vector_segments: + batch_size = segment._batch_size + if records_to_add % batch_size == 0: + assert len(segment._index.get_ids_list()) == records_to_add, "Precheck failed" + else: + assert len(segment._index.get_ids_list()) != records_to_add, "Precheck failed" + cursor = conn.cursor() + + count = cursor.execute("SELECT count(*) FROM embeddings_queue") + assert count.fetchone()[0] == records_to_add + commit_wal(temp_dir) + count = cursor.execute("SELECT count(*) FROM embeddings_queue") + assert count.fetchone()[0] == 
records_to_add + for segment in vector_segments: + assert len(segment._index.get_ids_list()) == records_to_add, "postcheck failed" diff --git a/tests/test_wal_export.py b/tests/test_wal_export.py new file mode 100644 index 0000000..c483d95 --- /dev/null +++ b/tests/test_wal_export.py @@ -0,0 +1,36 @@ +import os.path +import sqlite3 +import tempfile +import uuid + +from hypothesis import given, settings +import hypothesis.strategies as st + +import chromadb +from chromadb.segment import VectorReader +from chromadb.types import SegmentScope + +from chroma_ops.utils import get_dir_size +from chroma_ops.wal_clean import clean_wal +from chroma_ops.wal_export import export_wal + + +def count_lines(file_path: str) -> int: + with open(file_path, 'r') as file: + lines = file.readlines() + return len(lines) + + +@given(records_to_add=st.integers(min_value=1, max_value=100)) +@settings(deadline=60000) +def test_basic_export(records_to_add: int): + with tempfile.TemporaryDirectory() as temp_dir: + client = chromadb.PersistentClient(path=temp_dir) + col = client.create_collection("test") + ids_documents = [(f"{uuid.uuid4()}", f"document {i}", [0.1] * 1536) for i in range(records_to_add)] + ids, documents, embeddings = zip(*ids_documents) + col.add(ids=list(ids), documents=list(documents), embeddings=list(embeddings)) + with tempfile.NamedTemporaryFile() as temp_file: + export_wal(temp_dir, temp_file.name) + assert os.path.exists(temp_file.name) + assert count_lines(temp_file.name) == records_to_add