Skip to content

Commit

Permalink
feat: Improved fts rebuild prompts and messages
Browse files Browse the repository at this point in the history
  • Loading branch information
tazarov committed Jan 14, 2025
1 parent bc0f62d commit 76e2097
Show file tree
Hide file tree
Showing 3 changed files with 40 additions and 16 deletions.
12 changes: 12 additions & 0 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -37,6 +37,10 @@ The data is stored in sqlite3 file including all binary indices.
chops collection snapshot /path/to/persist_dir --collection <collection_name> -o /path/to/snapshot.sqlite3
```

Additional options:

- `--yes` (`-y`) - skip the confirmation prompt (default: `False`, i.e. the prompt is shown)

**Go:**

> [!NOTE]
Expand Down Expand Up @@ -208,6 +212,10 @@ This command cleans up orphaned HNSW segment subdirectories.
chops db clean /path/to/persist_dir
```

Additional options:

- `--yes` (`-y`) - skip the confirmation prompt (default: `False`, i.e. the prompt is shown)

**Go:**


Expand Down Expand Up @@ -325,6 +333,10 @@ This command rebuilds the full-text search index.
chops fts rebuild /path/to/persist_dir
```

Additional options:

- `--yes` (`-y`) - skip the confirmation prompt (default: `False`, i.e. the prompt is shown)

Change the tokenizer to `unicode61` by passing the `--tokenizer unicode61` (or `-t unicode61`) option.

```bash
Expand Down
40 changes: 26 additions & 14 deletions chroma_ops/fts.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
import sys
from typing import Optional

import chromadb
import typer
Expand All @@ -11,6 +11,8 @@
read_script,
)

from rich.console import Console


fts_commands = typer.Typer(no_args_is_help=True)

Expand All @@ -28,31 +30,38 @@ def validate_tokenizer(tokenizer: str) -> None:
)


def rebuild_fts(persist_dir: str, tokenizer: str = DEFAULT_TOKENIZER) -> None:
def rebuild_fts(
persist_dir: str,
tokenizer: str = DEFAULT_TOKENIZER,
yes: Optional[bool] = False,
) -> None:
validate_chroma_persist_dir(persist_dir)
validate_tokenizer(tokenizer)
console = Console()
if not yes:
if not typer.confirm(
f"\nAre you sure you want to rebuild the FTS index in {persist_dir}? This action will drop the existing FTS index and create a new one.",
default=False,
show_default=True,
):
console.print("[yellow]Rebuild FTS cancelled by user[/yellow]")
return
with get_sqlite_connection(persist_dir, SqliteMode.READ_WRITE) as conn:
cursor = conn.cursor()
script = read_script("scripts/drop_fts.sql")
script = script.replace("__TOKENIZER__", tokenizer)
cursor.executescript(script)
cursor.close()
typer.echo("Dropped FTS. Will try to start your Chroma now.", file=sys.stderr)
typer.echo(
console.print("Rebuilt FTS. Will try to start your Chroma now.")
console.print(
"NOTE: Depending on the size of your documents in Chroma it may take a while for Chroma to start up again.",
file=sys.stderr,
color=typer.colors.YELLOW,
)
try:
chromadb.PersistentClient(path=persist_dir)
typer.echo("Chroma started successfully.", file=sys.stderr)
console.print("[green]Chroma started successfully. FTS rebuilt.[/green]")
except Exception as e:
typer.echo(
f"Chroma failed to start. Error: {repr(e)}",
file=sys.stderr,
color=typer.colors.RED,
err=True,
)
console.print(f"[red]Chroma failed to start. Error: {repr(e)}[/red]")
raise e


def rebuild_command(
Expand All @@ -63,8 +72,11 @@ def rebuild_command(
"-t",
help="The tokenizer to use for the FTS index. Supported values: 'trigram', 'unicode61', 'ascii', 'porter'. See https://www.sqlite.org/fts5.html#tokenizers",
),
yes: Optional[bool] = typer.Option(
False, "--yes", "-y", help="Skip confirmation prompt"
),
) -> None:
rebuild_fts(persist_dir, tokenizer)
rebuild_fts(persist_dir, tokenizer, yes=yes)


fts_commands.command(
Expand Down
4 changes: 2 additions & 2 deletions tests/test_fts.py
Original file line number Diff line number Diff line change
Expand Up @@ -32,7 +32,7 @@ def test_rebuild_fts() -> None:
col.get(where_document={"$contains": "document 0"})

assert "no such table: embedding_fulltext_search" in str(e)
rebuild_fts(temp_dir)
rebuild_fts(temp_dir, yes=True)
fixed_temp_dir = os.path.join(temp_dir, "fixed")
shutil.copytree(temp_dir, fixed_temp_dir)
client = chromadb.PersistentClient(path=fixed_temp_dir)
Expand Down Expand Up @@ -74,7 +74,7 @@ def test_rebuild_fts_with_unicode61() -> None:

fixed_temp_dir = os.path.join(temp_dir, "fixed")
shutil.copytree(temp_dir, fixed_temp_dir)
rebuild_fts(fixed_temp_dir, tokenizer="unicode61")
rebuild_fts(fixed_temp_dir, tokenizer="unicode61", yes=True)
client = chromadb.PersistentClient(path=fixed_temp_dir)
col = client.get_collection("test")
res_after = col.get(where_document={"$contains": "순매"})
Expand Down

0 comments on commit 76e2097

Please sign in to comment.