diff --git a/.corpora.yaml b/.corpora.yaml index 3df22f9..3337573 100644 --- a/.corpora.yaml +++ b/.corpora.yaml @@ -1,6 +1,6 @@ # .corpora.yaml -name: "corpora" +name: "corpora2" url: "https://github.com/skyl/corpora" server: diff --git a/.devcontainer/setup.sh b/.devcontainer/setup.sh index 1ba990a..305103c 100755 --- a/.devcontainer/setup.sh +++ b/.devcontainer/setup.sh @@ -4,4 +4,4 @@ # echo "starting zsh..." echo 'autoload -Uz add-zsh-hook; append_history() { fc -W }; add-zsh-hook precmd append_history; export HISTFILE=/home/vscode/.corpora.zsh_history/.zsh_history' >> ~/.zshrc echo alias tree="tree -I '.venv|node_modules|__pycache__|.git|.pytest_cache' -a" >> ~/.zshrc -zsh +# zsh diff --git a/.gitignore b/.gitignore index 3e91ef8..e71c493 100644 --- a/.gitignore +++ b/.gitignore @@ -1,3 +1,5 @@ +/.env + # macOS system files .DS_Store diff --git a/TODO.md b/TODO.md index d131d69..bc7da67 100644 --- a/TODO.md +++ b/TODO.md @@ -1,5 +1,11 @@ # TODO +- compare vector of summary versus vector of splits, create search endpoint(s) test adhoc queries + +- test full init with chained tasks + +- Python split with AST - langchain one is kinda' lame. + - for small corpora, we could get away with a mega-task but let's break into many single responsibilities - finish ingest for real - Start building the real records for the `corpora` corpus @@ -20,7 +26,7 @@ - agents ... function calling ... maybe we have a menu of function `mkdir`, `create file`, `rewrite file` * Take input. Analyze problem. Choose sequence of agents. Let agents execute in order. Report back to user. -- rm unused imports, for instance (on save, in CI..) +- more CI checks, local tools - rm unused imports, for instance (on save, in CI..) # Next @@ -60,4 +66,7 @@ - collect corpus, upload tarball to API - introduce celery (?) or similar for async tasks - handle the file (sync/async) upload in the API +- vector length in models versus oai implementation +- langchain-text-splitters - split based on type, cohesive approach, python and md + diff --git a/docker-compose.yaml b/docker-compose.yaml index 86c2dd6..ccaadd6 100644 --- a/docker-compose.yaml +++ b/docker-compose.yaml @@ -9,6 +9,9 @@ services: environment: PYTHONPATH: "/workspace/py/packages" REDIS_URL: "redis://redis:6379/0" + OPENAI_API_KEY: "${OPENAI_API_KEY}" + command: python manage.py runserver 0.0.0.0:8000 + working_dir: /workspace/py/packages/corpora_proj depends_on: - db - redis @@ -49,6 +52,7 @@ services: environment: PYTHONPATH: "/workspace/py/packages" REDIS_URL: "redis://redis:6379/0" + OPENAI_API_KEY: "${OPENAI_API_KEY}" depends_on: redis: condition: service_healthy diff --git a/md/notes/celery-tasks.md b/md/notes/celery-tasks.md new file mode 100644 index 0000000..8eba3b3 --- /dev/null +++ b/md/notes/celery-tasks.md @@ -0,0 +1,217 @@ +> Understanding Celery's task methods, such as `s`, `si`, `delay`, and `chain`, is essential for managing asynchronous workflows effectively. Each method has its own purpose, especially when working with task dependencies, immutability, and argument handling. +> +> Here’s a breakdown of the key Celery task methods and techniques, along with practical examples to illustrate their use cases. + +--- + +## 1. `delay`: The Simplest Way to Trigger a Task + +`delay` is a shortcut for calling a Celery task asynchronously with positional arguments. It’s easy to use and is equivalent to `.apply_async()` but with simplified syntax. 
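Keyword arguments are forwarded as well; with the `add` task defined in the example just below, a quick sketch:

```python
# delay(*args, **kwargs) maps directly onto apply_async(args, kwargs)
add.delay(4, y=5)  # same call as add.apply_async(args=(4,), kwargs={"y": 5})
```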
+ +### Example + +```python +@shared_task +def add(x, y): + return x + y + +# Calling the task asynchronously with delay +result = add.delay(4, 5) # Runs add(4, 5) asynchronously +``` + +### Usage Notes + +- **Simplifies Syntax**: You don’t need to use `.apply_async((args, kwargs), ...)`. +- **Fire and Forget**: `delay` is perfect for simple, one-off tasks. + +--- + +## 2. `.apply_async()`: Full Control over Task Execution + +While `delay` is convenient, `.apply_async()` provides more control. You can pass additional options such as `countdown`, `eta`, and `retry` policies. + +### Example + +```python +@shared_task +def send_email(to_address, subject, body): + # Send email logic here + return "Email sent" + +# Schedule task with additional options +result = send_email.apply_async( + args=("user@example.com", "Subject", "Body text"), + countdown=60 # Delay execution by 60 seconds +) +``` + +### Usage Notes + +- **Control Timing**: You can schedule tasks to run at specific times or after delays. +- **Pass Custom Options**: Supports retry policies, timeouts, and more. + +--- + +## 3. `s`: Signature for Building Task Chains and Groups + +The `s` method, short for “signature,” is used to define a task and its arguments without executing it. You can think of it as preparing a “task blueprint” for use in chains, groups, or chords. + +### Example + +```python +@shared_task +def process_data(data): + # Process data logic + return "Processed" + +# Define a task signature +task_signature = process_data.s("sample data") + +# Use the signature to run the task +task_signature.delay() +``` + +### Usage Notes + +- **Reusable**: Allows you to define tasks with arguments that can be reused in other contexts. +- **Works in Chains**: Essential for creating task chains, groups, and chords. + +--- + +## 4. `si`: Immutable Signature + +The `si` method, short for “signature immutable,” is similar to `s` but creates an **immutable** signature. This means that any result from a previous task in a chain will not be passed to this task, even if it’s part of a chain. + +### Example + +```python +@shared_task +def step_one(): + return "Result from step one" + +@shared_task +def step_two(data): + return f"Received: {data}" + +# Immutable signature - it will not receive output from step_one +chain(step_one.s(), step_two.si("Custom data")).apply_async() +``` + +In this example, `step_two` will receive `"Custom data"` as its input, ignoring the output of `step_one`. + +### Usage Notes + +- **Useful in Chains**: If a task should always receive specific arguments, use `si` to prevent it from receiving output from prior tasks. +- **Avoids Argument Mismatch Errors**: Prevents issues when chaining tasks that don’t accept the output of previous tasks. + +--- + +## 5. `chain`: Creating Sequential Task Pipelines + +`chain` is used to create a sequence of tasks where each task runs after the previous one completes. The result of each task is passed as the input to the next task in the chain (unless `si` is used). + +### Example + +```python +@shared_task +def add(x, y): + return x + y + +@shared_task +def multiply(result): + return result * 10 + +# Chain tasks together +result = chain(add.s(4, 5), multiply.s()).apply_async() +``` + +In this example: +- `add` runs first with arguments `(4, 5)` and returns `9`. +- `multiply` then receives `9` as its input and returns `90`. + +### Usage Notes + +- **Sequential Dependency**: Use `chain` when each task depends on the output of the previous task. 
+- **Error Handling**: If any task in the chain fails, the subsequent tasks will not run. + +--- + +## 6. `group`: Running Tasks in Parallel + +`group` lets you run multiple tasks in parallel and collect their results. It’s useful when tasks can run independently, and you want to aggregate their outputs. + +### Example + +```python +@shared_task +def add(x, y): + return x + y + +# Run tasks in parallel +result = group(add.s(2, 2), add.s(4, 4), add.s(6, 6)).apply_async() +``` + +In this example: +- Each `add` task runs independently. +- The results are collected as a list once all tasks complete (e.g., `[4, 8, 12]`). + +### Usage Notes + +- **Parallel Execution**: Ideal for tasks that don’t depend on each other. +- **Collect Results**: Results are returned as a list after all tasks complete. + +--- + +## 7. `chord`: Combining Groups with a Final Callback + +A `chord` is a combination of a `group` and a callback. All tasks in the group run in parallel, and once they complete, their results are passed to the callback task. + +### Example + +```python +@shared_task +def add(x, y): + return x + y + +@shared_task +def summarize(results): + return sum(results) + +# Define a chord +result = chord([add.s(2, 2), add.s(4, 4), add.s(6, 6)], summarize.s()).apply_async() +``` + +In this example: +- Each `add` task runs in parallel. +- The results `[4, 8, 12]` are passed to `summarize`, which returns `24`. + +### Usage Notes + +- **Useful for Aggregating Results**: The callback task can aggregate results from multiple tasks. +- **Sequential Logic**: Ensures that the callback runs only after all tasks in the group complete. + +--- + +## Summary Table + +| Method | Purpose | Usage Example | +|------------------|------------------------------------------------------------------------------------------|-----------------------------------| +| `delay` | Shortcut for asynchronously calling a task | `task.delay(args)` | +| `apply_async` | Full control over task execution with additional options | `task.apply_async(args, options)` | +| `s` | Signature for creating task definitions without execution | `task.s(args)` | +| `si` | Immutable signature to prevent prior task results from being passed | `task.si(args)` | +| `chain` | Sequential pipeline where each task depends on the output of the previous task | `chain(task1.s(), task2.s())` | +| `group` | Parallel execution of multiple tasks, with results collected as a list | `group([task1.s(), task2.s()])` | +| `chord` | Combines a group of parallel tasks with a callback that receives the group’s results | `chord(group, callback.s())` | + +--- + +### Choosing the Right Method + +- **For simple, independent tasks**: Use `delay` or `apply_async`. +- **For sequential tasks with dependencies**: Use `chain`. +- **For parallel tasks with independent execution**: Use `group`. +- **For combining parallel tasks with a final aggregation**: Use `chord`. +- **For tasks in a chain where you need a fixed input**: Use `si` to make the signature immutable. + +This understanding gives you the flexibility to create robust, efficient Celery workflows that handle task dependencies, sequencing, and parallelism as needed. 
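---

### Putting It Together

As a closing sketch, these primitives compose: below, each chunk gets a small `chain` (clean, then embed), the chains run in parallel as the header of a `chord`, and the callback receives every embedding once all chains finish. The helpers `get_embedding` and `save_vectors` are hypothetical stand-ins:

```python
from celery import chain, chord, shared_task

@shared_task
def clean(text):
    # Normalize whitespace before embedding
    return " ".join(text.split())

@shared_task
def embed(text):
    # Hypothetical embedding helper (e.g., an LLM provider call)
    return get_embedding(text)

@shared_task
def store(vectors):
    # Chord callback: receives the list of all embed results
    return save_vectors(vectors)

chunks = ["first chunk", "second chunk", "third chunk"]

# One clean -> embed chain per chunk, run in parallel; store fires once at the end
workflow = chord(
    [chain(clean.s(c), embed.s()) for c in chunks],
    store.s(),
)
workflow.apply_async()
```

This is the shape of a split-then-embed pipeline: sequential preprocessing within each chunk, parallel embedding across chunks, and a single aggregation step at the end.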
diff --git a/md/notes/practical-embeddings-tutorial-2-multivector.md b/md/notes/practical-embeddings-tutorial-2-multivector.md new file mode 100644 index 0000000..969cff2 --- /dev/null +++ b/md/notes/practical-embeddings-tutorial-2-multivector.md @@ -0,0 +1,140 @@ +## Comprehensive Guide to Multi-Vector Embeddings with ColBERT and pgvector in Django + +Multi-vector embeddings allow for fine-grained semantic search by storing multiple vectors per document, representing token-level or segment-level information. This tutorial will cover: + +1. **Overview of Multi-Vector Models and ColBERT** +2. **Setting Up Multi-Vector Embeddings in Django with pgvector** +3. **Practical Storage and Retrieval with Multi-Vectors** + +--- + +### 1. Overview of Multi-Vector Models and ColBERT + +**ColBERT** (Contextualized Late Interaction over BERT) is a multi-vector model that generates a vector for each token in a document, enabling high-resolution semantic matching. This token-level approach allows for detailed similarity matching between query terms and document terms. + +- **Dimensionality**: ColBERT commonly uses **128-dimensional embeddings per token**, balancing semantic accuracy with manageable storage. +- **Use Case**: Ideal for applications where granular matching (e.g., term-to-term or passage-to-passage) is needed, such as search in code repositories, books, or large document corpora. + +With ColBERT, each document is represented by an array of vectors, capturing the context of individual tokens, which can improve search precision. + +### 2. Setting Up Multi-Vector Embeddings in Django with pgvector + +#### Database Schema Design + +To store multi-vector embeddings in PostgreSQL with `pgvector`, you can leverage PostgreSQL’s array functionality. Each document will have an array of 128-dimensional vectors, one per token. + +#### Step-by-Step Implementation + +1. **Install pgvector and Set Up PostgreSQL** + + Make sure PostgreSQL has `pgvector` installed. If not, install it with: + ```sql + CREATE EXTENSION IF NOT EXISTS vector; + ``` + +2. **Define Django Model for Multi-Vector Embeddings** + + Use Django’s `ArrayField` along with `VectorField` from `pgvector.django`. Here’s how to define a model that stores multiple 128-dimensional vectors for each document: + + ```python + from django.db import models + from django.contrib.postgres.fields import ArrayField + from pgvector.django import VectorField + + class Document(models.Model): + content = models.TextField() + embeddings = ArrayField( + base_field=VectorField(dimensions=128), # 128 dimensions per token vector + size=None, # None allows for variable-length arrays + ) + ``` + + - **`content`**: Stores the raw text of the document. + - **`embeddings`**: An array field where each element is a 128-dimensional vector representing a token embedding. + +3. **Generate and Store Multi-Vector Embeddings** + + Use ColBERT to generate an embedding for each token in a document. 
Here’s a sample function that could generate and store these embeddings. Note that Hugging Face `transformers` ships no ColBERT-specific classes (the official checkpoints are typically driven through the `colbert-ai` library), so this sketch stands in plain BERT token embeddings and truncates them to 128 dimensions where real ColBERT applies a learned linear projection:

   ```python
   import torch
   from transformers import AutoModel, AutoTokenizer

   # Stand-in for a real ColBERT encoder (e.g., the colbert-ir/colbertv2.0
   # checkpoint used via the colbert-ai library)
   tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")
   model = AutoModel.from_pretrained("bert-base-uncased")

   def generate_and_store_embeddings(document_text):
       # Tokenize and obtain a contextual embedding for each token
       inputs = tokenizer(document_text, return_tensors="pt", truncation=True)
       with torch.no_grad():
           token_embeddings = model(**inputs).last_hidden_state.squeeze(0)  # (tokens, 768)

       # Keep the first 128 dimensions of each token vector -- a crude
       # placeholder for ColBERT's learned 768 -> 128 projection
       embeddings_list = token_embeddings[:, :128].tolist()

       # Store in Django
       document = Document(content=document_text, embeddings=embeddings_list)
       document.save()
   ```

   After generation, one 128-dimensional vector per token is stored in the `embeddings` array field.

#### Indexing for Efficient Retrieval

pgvector’s `ivfflat` (and `hnsw`) indexes apply to single `vector` columns, not to arrays of vectors, so the `embeddings` array above cannot be indexed directly (see pgvector issue #640). One workaround is a side table with one row per token vector, which can be indexed normally:

```sql
-- Hypothetical side table holding one 128-dim token vector per row
CREATE INDEX ON document_token_embedding USING ivfflat (vector vector_cosine_ops);
```

An indexed single-vector column (for example, a mean-pooled document vector) also works well as a fast pre-filter before exact multi-vector scoring.

### 3. Practical Storage and Retrieval with Multi-Vectors

With multi-vector embeddings stored, we can now perform searches that take advantage of the fine-grained information embedded in each token’s vector.

#### Example Query for Similarity Search with Late Interaction

When performing a search query, you generate embeddings for the query terms and compare them against the token-level embeddings of each document. Because pgvector cannot compute distances over an array-of-vectors column, the late-interaction (MaxSim) step below runs in Python over a pre-filtered candidate set; the `get_embeddings_for_text` helper and the candidate pre-filter are assumptions:

```python
import numpy as np

def find_similar_documents(query_text, candidates):
    # Multi-vector embeddings for the query, shape (q_tokens, 128); vectors are
    # assumed L2-normalized so the dot product equals cosine similarity
    query_embeddings = np.array(get_embeddings_for_text(query_text))

    def maxsim(document):
        doc_embeddings = np.array(document.embeddings)  # (d_tokens, 128)
        sims = query_embeddings @ doc_embeddings.T      # (q_tokens, d_tokens)
        # Each query token takes its best-matching document token
        return sims.max(axis=1).sum()

    return sorted(candidates, key=maxsim, reverse=True)[:10]
```

In this example:
- **MaxSim Scoring**: Each query token is matched to its most similar document token, and the per-token maxima are summed into the document’s score, allowing for granular matching.
- **Late Interaction**: Query and document token embeddings interact only at scoring time, offering higher precision in retrieval than a single pooled vector.

### Practical Considerations

1. **Storage Requirements**: Multi-vector embeddings require more storage than single-vector embeddings. Each token embedding adds data, so ensure your database can handle the additional storage requirements.

2. **Query Performance**: Multi-vector queries are more computationally intensive. Indexing, caching, and query optimization can help maintain acceptable performance.

3. **Adjustable Dimensionality**: If storage is a concern, consider reducing the dimensionality (e.g., to 64 or 32 dimensions) to lower storage requirements, though this may reduce search accuracy.

---

### Summary

Using ColBERT-style multi-vector embeddings with `pgvector` in Django enables a high-resolution search experience, ideal for complex corpora like code repositories or large document collections.
+ +- **Define Multi-Vector Storage**: Use `ArrayField` and `VectorField` to store arrays of vectors, representing token-level embeddings. +- **Efficient Querying**: Use Cosine similarity with late interaction to retrieve relevant results based on fine-grained semantic matches. +- **Performance and Storage Management**: Balance dimensionality, indexing, and storage to optimize for both accuracy and performance. + +This setup provides a robust foundation for implementing advanced search capabilities within a Django application, leveraging the power of multi-vector embeddings and `pgvector`. \ No newline at end of file diff --git a/md/notes/practical-embeddings-tutorial.md b/md/notes/practical-embeddings-tutorial.md new file mode 100644 index 0000000..d5ae117 --- /dev/null +++ b/md/notes/practical-embeddings-tutorial.md @@ -0,0 +1,102 @@ +Here’s an updated, accurate tutorial on embeddings with a focus on using `text-embedding-3-small` at 1536 dimensions, trade-offs in embedding size, practical strategies for corpora like books and code repositories, and how to manage dimensionality effectively. + +--- + +## Comprehensive Guide to Embeddings in High-Quality Search Systems + +Embeddings transform text or code into dense vectors, capturing semantic meaning for search and retrieval. For a high-quality application like Corpora, we’re focusing on accurate representations, optimized for both contextual retrieval and future flexibility in vector dimensions. + +This guide covers: +1. **Model Dimensions and Configurations** +2. **Effective Embedding Strategies for Different Corpora** +3. **Trade-offs in Dimensionality and Storage** + +## Model Dimensions and Configurations + +### Overview of `text-embedding-3-small` and Other Models + +- **`text-embedding-3-small`**: Generates a **1536-dimensional** embedding by default. This provides rich contextual data, suitable for complex retrieval tasks. OpenAI allows for reduced dimensions if needed by specifying a custom `dimensions` parameter, but the full dimensionality is recommended for applications focused on high semantic fidelity. + +- **Higher-Dimensional Options**: OpenAI’s larger models, such as `text-embedding-3-large`, can offer dimensions up to **3072**. These are ideal for highly nuanced tasks but come with increased storage and computational costs. + +- **Other Common Models**: + - **BERT**: Often used at 768 dimensions, strong for general-purpose contextual similarity. + - **Sentence Transformers**: Available at dimensions from 384 to 1024, flexible and widely used for semantic search. + +Given Corpora’s focus, using the **full 1536 dimensions** of `text-embedding-3-small` offers an optimal balance of detail and performance. + +### Setting up 1536 Dimensions in Django with pgvector + +To store 1536-dimensional vectors, configure your `VectorField` as follows: + +```python +# Define a 1536-dimensional VectorField for embeddings in Django with pgvector +from pgvector.django import VectorField + +vector = VectorField(dimensions=1536, null=True, blank=True) +``` + +This setup ensures high-quality embeddings, capturing detailed semantic nuances without any dimensionality reduction. + +## Effective Embedding Strategies for Different Corpora + +To make the most of embeddings, consider the structure and purpose of each corpus type: + +### 1. Books or Long Documents + - **Chunking for Contextual Search**: For long texts, break each document into chunks (e.g., 300–500 words). 
Embed each chunk individually, allowing for fine-grained search and retrieval of relevant passages. + - **Hierarchical Search**: Store and retrieve passages by chunk-level embeddings, then apply a secondary ranking if needed, based on the entire document. + +### 2. Code Repositories + - **Function-Level Embeddings**: For large files, embedding each function or class provides focused representations that are ideal for retrieving specific code snippets or analyzing code structure. + - **File-Level Embeddings for Smaller Files**: For small code files, embedding the entire file at once can be effective, offering a holistic view of the code’s purpose. + +In both cases, keeping context manageable (within 8191 tokens for `text-embedding-3-small`) ensures that embeddings maintain accuracy and relevance. + +### Strategies for Balancing Dimensionality and Retrieval Goals + +With high-dimensional embeddings like 1536, retrieval tasks are highly accurate but can be resource-intensive. Here are some tips for balancing dimensionality with search performance: + +1. **Use Primary Embedding (1536) for Critical Similarity Searches**: For high-stakes applications, prioritize retrieval with the full-dimensionality embedding. This setup will yield the best results but requires indexing and query optimization. + +2. **Store Multiple Embeddings for Configurable Retrieval**: If different levels of granularity are needed, consider storing multiple embeddings per record, such as: + - **`embedding_1536`**: Main field for detailed retrieval. + - **`embedding_300`**: Secondary field for lightweight, approximate similarity searches. + + This approach provides flexibility, allowing you to choose the embedding based on the retrieval context. + +3. **pgvector and Indexing**: Ensure pgvector is properly indexed for the primary vector field. For high-dimensional vectors, **Cosine Similarity** or **Inner Product** are commonly used to find nearest neighbors. + +```python +from django.db.models import F +from pgvector.django import CosineDistance + +def find_similar_documents(query_text): + query_embedding = client.get_embedding(query_text) + return Document.objects.annotate( + similarity=CosineDistance(F("embedding_1536"), query_embedding) + ).order_by("similarity")[:10] +``` + +This ensures efficient, high-quality retrieval across complex document corpora. + +## Trade-offs in Dimensionality and Storage + +Choosing a higher dimensionality, like 1536, brings significant advantages but also some considerations: + +1. **Storage Costs**: Higher dimensions require more storage. With a 1536-dimensional vector, each entry will consume significantly more database space compared to lower-dimensional vectors (e.g., 300 or 512). However, for high-value applications like Corpora, this trade-off is worthwhile. + +2. **Performance and Latency**: Queries with high-dimensional vectors can be slower, especially with larger datasets. Indexing and caching strategies, along with a well-tuned database, help mitigate these effects. + +3. **Dimensionality Reduction (Optional)**: If certain use cases warrant lower dimensions, consider techniques like **Principal Component Analysis (PCA)** on the 1536-dimensional vectors to produce 300- or 512-dimensional approximations for faster, approximate searches. 
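That optional reduction step can be sketched concretely. Below, scikit-learn's PCA (an assumed dependency) derives 300-dimensional approximations from the stored 1536-dimensional vectors; `Document`, `embedding_1536`, and `embedding_300` are the illustrative names used above:

```python
import numpy as np
from sklearn.decomposition import PCA

# Fit PCA on the full-fidelity vectors (requires at least 300 documents)
docs = list(Document.objects.exclude(embedding_1536=None))
matrix = np.array([doc.embedding_1536 for doc in docs])  # (n_docs, 1536)

pca = PCA(n_components=300)
reduced = pca.fit_transform(matrix)  # (n_docs, 300)

# Store the approximations alongside the originals
for doc, vec in zip(docs, reduced):
    doc.embedding_300 = vec.tolist()
    doc.save(update_fields=["embedding_300"])

# Queries must pass through the same fitted transform, e.g.:
# query_300 = pca.transform([query_embedding_1536])[0]
```

Persist the fitted `pca` object (e.g., with `joblib.dump`) so that future documents and query vectors are projected into the same 300-dimensional space.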
+ +--- + +## Summary + +For a world-class application like Corpora, the 1536-dimensional configuration of `text-embedding-3-small` strikes an ideal balance, providing: + +- **High Semantic Fidelity**: Essential for detailed contextual search across books and code. +- **Scalable Retrieval Strategies**: Options to chunk and store various embedding levels for optimal search performance. +- **Configurable Dimensionality**: Storing primary high-dimension embeddings while allowing secondary, smaller dimensions if required. + +By following these strategies, Corpora will be equipped for high-quality, contextually rich search capabilities that meet the standards of a world-class application. diff --git a/py/packages/corpora/admin.py b/py/packages/corpora/admin.py index aa39fd8..7c291c2 100644 --- a/py/packages/corpora/admin.py +++ b/py/packages/corpora/admin.py @@ -23,8 +23,11 @@ class CorpusTextFileAdmin(admin.ModelAdmin): readonly_fields = ("checksum", "created_at", "updated_at") fieldsets = ( (None, {"fields": ("corpus", "path", "content")}), - ("AI Summary", {"fields": ("ai_summary", "vector_of_summary")}), - ("Metadata", {"fields": ("checksum", "created_at", "updated_at")}), + ("AI Summary", {"fields": ("ai_summary",)}), + ( + "Metadata", + {"fields": ("checksum", "created_at", "updated_at")}, + ), ) @@ -34,10 +37,9 @@ class SplitAdmin(admin.ModelAdmin): list_display = ("file", "order", "content_preview", "metadata") search_fields = ("file__path", "content") ordering = ("file", "order") - readonly_fields = ("vector",) fieldsets = ( (None, {"fields": ("file", "order", "content")}), - ("Vector Data", {"fields": ("vector", "metadata")}), + ("Meta", {"fields": ("metadata",)}), ) def content_preview(self, obj): diff --git a/py/packages/corpora/migrations/0007_alter_split_vector.py b/py/packages/corpora/migrations/0007_alter_split_vector.py new file mode 100644 index 0000000..911b9f9 --- /dev/null +++ b/py/packages/corpora/migrations/0007_alter_split_vector.py @@ -0,0 +1,24 @@ +# Generated by Django 5.1.2 on 2024-11-04 01:41 + +import pgvector.django.vector +from django.db import migrations + + +class Migration(migrations.Migration): + + dependencies = [ + ("corpora", "0006_rename_file_corpustextfile"), + ] + + operations = [ + migrations.AlterField( + model_name="split", + name="vector", + field=pgvector.django.vector.VectorField( + blank=True, + dimensions=1536, + help_text="text-embedding-3-small vector of the content", + null=True, + ), + ), + ] diff --git a/py/packages/corpora/migrations/0008_alter_corpustextfile_vector_of_summary.py b/py/packages/corpora/migrations/0008_alter_corpustextfile_vector_of_summary.py new file mode 100644 index 0000000..0e64f27 --- /dev/null +++ b/py/packages/corpora/migrations/0008_alter_corpustextfile_vector_of_summary.py @@ -0,0 +1,24 @@ +# Generated by Django 5.1.2 on 2024-11-04 02:58 + +import pgvector.django.vector +from django.db import migrations + + +class Migration(migrations.Migration): + + dependencies = [ + ("corpora", "0007_alter_split_vector"), + ] + + operations = [ + migrations.AlterField( + model_name="corpustextfile", + name="vector_of_summary", + field=pgvector.django.vector.VectorField( + blank=True, + dimensions=1536, + help_text="text-embedding-3-small vector of the content", + null=True, + ), + ), + ] diff --git a/py/packages/corpora/models.py b/py/packages/corpora/models.py index 317bb8b..a6388df 100644 --- a/py/packages/corpora/models.py +++ b/py/packages/corpora/models.py @@ -1,9 +1,15 @@ +import os import uuid + from django.db import 
models from django.utils import timezone from django.contrib.auth import get_user_model + +# from django.contrib.postgres.fields import ArrayField from pgvector.django import VectorField +from corpora_ai.split import get_text_splitter + User = get_user_model() @@ -54,7 +60,13 @@ class CorpusTextFile(models.Model): path = models.CharField(max_length=1024) content = models.TextField(blank=True) ai_summary = models.TextField(blank=True) - vector_of_summary = VectorField(dimensions=300, null=True, blank=True) + vector_of_summary = VectorField( + dimensions=1536, + null=True, + blank=True, + help_text="text-embedding-3-small vector of the content", + editable=False, + ) checksum = models.CharField( max_length=40, editable=False, @@ -70,6 +82,44 @@ class Meta: def __str__(self): return f"{self.corpus.name}:{self.path}" + def get_and_save_summary(self): + from corpora_ai.provider_loader import load_llm_provider + + llm = load_llm_provider() + summary = llm.get_summary(self._get_text_representation()) + self.ai_summary = summary + self.save(update_fields=["ai_summary"]) + + def _get_text_representation(self): + return f"{self.corpus.name}:{self.path}\n\n{self.content}" + + def get_and_save_vector_of_summary(self): + from corpora_ai.provider_loader import load_llm_provider + + llm = load_llm_provider() + vector = llm.get_embedding(self.ai_summary) + self.vector_of_summary = vector + self.save(update_fields=["vector_of_summary"]) + + def split_content(self): + """ + Splits the content of the file into smaller parts using an appropriate text splitter. + Returns a list of Split instances. + """ + file_name = os.path.basename(self.path) + splitter = get_text_splitter(file_name) + + # Split content into parts + parts = splitter.split_text(self.content) + splits = [] + + # Create Split instances for each part + for order, part in enumerate(parts): + split = Split.objects.create(file=self, order=order, content=part) + splits.append(split) + + return splits + class Split(models.Model): id = models.UUIDField(primary_key=True, default=uuid.uuid4, editable=False) @@ -78,7 +128,20 @@ class Split(models.Model): ) order = models.PositiveIntegerField() content = models.TextField(blank=True) - vector = VectorField(dimensions=300, null=True, blank=True) + vector = VectorField( + dimensions=1536, + null=True, + blank=True, + help_text="text-embedding-3-small vector of the content", + editable=False, + ) + # # Multivector: https://huggingface.co/colbert-ir/colbertv2.0 + # https://github.com/pgvector/pgvector/issues/640 + # https://docs.djangoproject.com/en/5.1/ref/contrib/postgres/fields/#arrayfield + # colbert_embeddings = ArrayField( + # base_field=VectorField(dimensions=128), + # size=None, # Set to None for variable-length arrays + # ) metadata = models.JSONField(default=dict, blank=True) class Meta: @@ -87,3 +150,17 @@ class Meta: def __str__(self): return f"{self.file.corpus.name}:{self.file.path}:{self.order}" + + def get_and_save_vector(self): + from corpora_ai.provider_loader import load_llm_provider + + llm = load_llm_provider() + vector = llm.get_embedding(self.content) + self.vector = vector + self.save(update_fields=["vector"]) + + # # Optionally, for multi-vector storage + # def get_and_save_colbert_vectors(self): + # colbert_vectors = generate_colbert_vectors(self.content) # e.g., a list of 128-dim vectors + # self.colbert_embeddings = colbert_vectors + # self.save(update_fields=["colbert_embeddings"]) diff --git a/py/packages/corpora/tasks.py b/py/packages/corpora/tasks.py index 674284d..503a2e5 
100644 --- a/py/packages/corpora/tasks.py +++ b/py/packages/corpora/tasks.py @@ -3,34 +3,58 @@ from celery import shared_task from .lib.files import compute_checksum -from .models import Corpus, CorpusTextFile +from .models import Corpus, CorpusTextFile, Split @shared_task def process_tarball(corpus_id: str, tarball: bytes) -> None: - """ - Process a tarball by extracting each file and creating a `CorpusFile` - entry for each extracted file in the database. - """ - print(f"Processing tarball... {corpus_id}") corpus = Corpus.objects.get(id=corpus_id) - with tarfile.open(fileobj=io.BytesIO(tarball), mode="r:gz") as tar: for member in tar.getmembers(): if member.isfile(): - file_content = tar.extractfile(member).read() + file_content = ( + tar.extractfile(member).read().decode("utf-8", errors="replace") + ) checksum = compute_checksum(file_content) - print(f"{member.name}") - print(f"{checksum}") - # Save each extracted file as a `CorpusFile` entry - cf = CorpusTextFile.objects.create( + # Create a CorpusTextFile and kick off further tasks + corpus_file = CorpusTextFile.objects.create( corpus=corpus, path=member.name, - content=file_content.decode("utf-8", errors="replace"), + content=file_content, checksum=checksum, ) - print(f"Created file: {cf}") + generate_summary_task.delay(corpus_file.id) + split_file_task.delay(corpus_file.id) + + +@shared_task +def generate_summary_task(corpus_file_id: str) -> None: + corpus_file = CorpusTextFile.objects.get(id=corpus_file_id) + corpus_file.get_and_save_summary() + corpus_file.get_and_save_vector_of_summary() + + +@shared_task +def split_file_task(corpus_file_id: str) -> None: + corpus_file = CorpusTextFile.objects.get(id=corpus_file_id) + splits = corpus_file.split_content() + for split in splits: + # no need to chain + generate_vector_task.delay(split.id) + # generate_colbert_vectors_task.delay(split.id) + + +@shared_task +def generate_vector_task(split_id: str) -> None: + split = Split.objects.get(id=split_id) + split.get_and_save_vector() + + +# @shared_task +# def generate_colbert_vectors_task(split_id: str) -> None: +# split = Split.objects.get(id=split_id) +# split.get_and_save_colbert_vectors() @shared_task diff --git a/py/packages/corpora_ai/README.md b/py/packages/corpora_ai/README.md index cd514d5..08fe35e 100644 --- a/py/packages/corpora_ai/README.md +++ b/py/packages/corpora_ai/README.md @@ -36,6 +36,6 @@ print(response) ### Generating an Embedding ```python -embedding = llm.generate_embedding("Sample text for embedding") +embedding = llm.get_embedding("Sample text for embedding") print(embedding) ``` diff --git a/py/packages/corpora_ai/count_tokens.py b/py/packages/corpora_ai/count_tokens.py new file mode 100644 index 0000000..876589d --- /dev/null +++ b/py/packages/corpora_ai/count_tokens.py @@ -0,0 +1,20 @@ +import tiktoken + + +def count_tokens(text: str, model: str = "gpt-3.5-turbo") -> int: + """ + Counts the number of tokens in a given text string for a specific model. + + Args: + text (str): The text to count tokens for. + model (str): The model to base the tokenization on. Default is "gpt-3.5-turbo". + + Returns: + int: The number of tokens in the text. 
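+
+    Example:
+        # e.g., guard text against text-embedding-3-small's 8191-token input limit
+        if count_tokens(chunk, model="gpt-3.5-turbo") > 8191:
+            ...  # split further before embedding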
+ """ + # Load the tokenizer for the specified model + encoding = tiktoken.encoding_for_model(model) + + # Encode the text and count the tokens + tokens = encoding.encode(text) + return len(tokens) diff --git a/py/packages/corpora_ai/llm_interface.py b/py/packages/corpora_ai/llm_interface.py index ddcd656..6638379 100644 --- a/py/packages/corpora_ai/llm_interface.py +++ b/py/packages/corpora_ai/llm_interface.py @@ -2,6 +2,8 @@ from dataclasses import dataclass from typing import List +from corpora_ai.prompts import SUMMARIZE_SYSTEM_MESSAGE + @dataclass class ChatCompletionTextMessage: @@ -32,7 +34,7 @@ def get_text_completion(self, messages: List[ChatCompletionTextMessage]) -> str: pass @abstractmethod - def generate_embedding(self, text: str) -> List[float]: + def get_embedding(self, text: str) -> List[float]: """ Generates an embedding vector for the input text, suitable for a pgvector VectorField. @@ -43,3 +45,26 @@ def generate_embedding(self, text: str) -> List[float]: List[float]: The embedding vector. """ pass + + def get_summary(self, text: str) -> str: + """ + Generates a summary of the input text. + + Args: + text (str): The text to summarize. + + Returns: + str: The generated summary text. + """ + return self.get_text_completion( + [ + ChatCompletionTextMessage( + role="system", + content=SUMMARIZE_SYSTEM_MESSAGE, + ), + ChatCompletionTextMessage( + role="user", + content=f"Summarize the following:\n {text}", + ), + ] + ) diff --git a/py/packages/corpora_ai/prompts.py b/py/packages/corpora_ai/prompts.py new file mode 100644 index 0000000..787e6ea --- /dev/null +++ b/py/packages/corpora_ai/prompts.py @@ -0,0 +1,9 @@ +SUMMARIZE_SYSTEM_MESSAGE = ( + "You are a highly analytical assistant trained to produce concise summaries. " + "For the input text, summarize its main ideas while preserving key terminology, " + "concepts, and any domain-specific vocabulary, whether from programming, technical " + "documents, or general language. Retain proper nouns, unique terms, and relevant " + "phrases that capture the essential meaning and context. The summary should be " + "short, cohesive, and representative of the original text's core message, making " + "it suitable for semantic search and relevance matching." +) diff --git a/py/packages/corpora_ai/split.py b/py/packages/corpora_ai/split.py new file mode 100644 index 0000000..a261b20 --- /dev/null +++ b/py/packages/corpora_ai/split.py @@ -0,0 +1,41 @@ +import os +from typing import Union + +from langchain_text_splitters import ( + PythonCodeTextSplitter, + # MarkdownHeaderTextSplitter, + MarkdownTextSplitter, + CharacterTextSplitter, +) + + +def get_text_splitter( + file_name: str, + chunk_size: int = 5000, # number of characters + chunk_overlap: int = 0, # number of characters +) -> Union[PythonCodeTextSplitter, CharacterTextSplitter]: + """ + Returns an appropriate text splitter based on the file extension or name. 
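+
+    Example:
+        get_text_splitter("models.py")  # -> PythonCodeTextSplitter
+        get_text_splitter("NOTES.md")   # -> MarkdownTextSplitter
+        get_text_splitter("Makefile")   # -> CharacterTextSplitter (default)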
+ """ + # Mapping specific extensions to splitters + extension_to_splitter = { + ".py": PythonCodeTextSplitter, + ".md": MarkdownTextSplitter, + # Add more mappings as needed + } + + # Extract the extension (if available) and lower-case for consistency + _, ext = os.path.splitext(file_name) + ext = ext.lower() + + # Handle files without extensions using a default splitter + splitter_class = extension_to_splitter.get(ext, None) + + # For files with defined splitters, return the configured splitter instance + if splitter_class: + return splitter_class(chunk_size=chunk_size, chunk_overlap=chunk_overlap) + + # For text-based formats or unknown extensions, use CharacterTextSplitter with `\n\n` + return CharacterTextSplitter( + separator="\n\n", chunk_size=chunk_size, chunk_overlap=chunk_overlap + ) diff --git a/py/packages/corpora_ai/test_provider_loader.py b/py/packages/corpora_ai/test_provider_loader.py index ccd0f21..5e4307b 100644 --- a/py/packages/corpora_ai/test_provider_loader.py +++ b/py/packages/corpora_ai/test_provider_loader.py @@ -24,7 +24,7 @@ def test_load_openai_provider_success(self, MockOpenAIClient): self.assertIsInstance(provider, LLMBaseInterface) self.assertEqual(provider, mock_client_instance) - @patch.dict(os.environ, {"LLM_PROVIDER": "openai"}) + @patch.dict(os.environ, {"LLM_PROVIDER": "openai", "OPENAI_API_KEY": ""}) @patch("corpora_ai.provider_loader.OpenAIClient") def test_missing_openai_api_key(self, MockOpenAIClient): """ diff --git a/py/packages/corpora_ai_openai/README.md b/py/packages/corpora_ai_openai/README.md index 4724c50..62832cb 100644 --- a/py/packages/corpora_ai_openai/README.md +++ b/py/packages/corpora_ai_openai/README.md @@ -33,7 +33,7 @@ print(response) ### Generating an Embedding ```python -embedding = llm.generate_embedding("Sample text for embedding") +embedding = llm.get_embedding("Sample text for embedding") print(embedding) ``` diff --git a/py/packages/corpora_ai_openai/llm_client.py b/py/packages/corpora_ai_openai/llm_client.py index 3c80f7a..69bf23b 100644 --- a/py/packages/corpora_ai_openai/llm_client.py +++ b/py/packages/corpora_ai_openai/llm_client.py @@ -25,7 +25,7 @@ def get_text_completion(self, messages: List[ChatCompletionTextMessage]) -> str: ) return response.choices[0].message.content - def generate_embedding(self, text: str) -> List[float]: + def get_embedding(self, text: str) -> List[float]: if not text: raise ValueError("Input text must not be empty.") response = self.client.embeddings.create(input=text, model=self.embedding_model) diff --git a/py/packages/corpora_ai_openai/test_llm_client.py b/py/packages/corpora_ai_openai/test_llm_client.py index 8b9ec63..4cb7747 100644 --- a/py/packages/corpora_ai_openai/test_llm_client.py +++ b/py/packages/corpora_ai_openai/test_llm_client.py @@ -37,9 +37,9 @@ def test_get_text_completion_success(self): model="gpt-4o", messages=[{"role": "user", "content": "Tell me a joke."}] ) - def test_generate_embedding_success(self): + def test_get_embedding_success(self): """ - Test that generate_embedding returns the correct embedding vector. + Test that get_embedding returns the correct embedding vector. 
""" # Mock response from OpenAI API mock_response = MagicMock() @@ -47,8 +47,8 @@ def test_generate_embedding_success(self): mock_response.data[0].embedding = [0.1, 0.2, 0.3] self.mock_openai_client.embeddings.create.return_value = mock_response - # Call generate_embedding and assert response - response = self.client.generate_embedding("Sample text for embedding") + # Call get_embedding and assert response + response = self.client.get_embedding("Sample text for embedding") self.assertEqual(response, [0.1, 0.2, 0.3]) # Ensure OpenAI API was called with correct parameters @@ -63,12 +63,12 @@ def test_get_text_completion_empty_messages(self): with self.assertRaises(ValueError): self.client.get_text_completion([]) - def test_generate_embedding_empty_text(self): + def test_get_embedding_empty_text(self): """ - Test that generate_embedding raises an error when text is empty. + Test that get_embedding raises an error when text is empty. """ with self.assertRaises(ValueError): - self.client.generate_embedding("") + self.client.get_embedding("") if __name__ == "__main__": diff --git a/py/packages/corpora_cli/commands/corpus.py b/py/packages/corpora_cli/commands/corpus.py index 5b5bc48..2b8d4c8 100644 --- a/py/packages/corpora_cli/commands/corpus.py +++ b/py/packages/corpora_cli/commands/corpus.py @@ -45,7 +45,7 @@ def delete(ctx: typer.Context): c.console.print(f"Deleting corpus: {corpus_name}") try: c.api_client.corpora_api_delete_corpus(corpus_name) - c.console.print("Corpus deleted.", style="green") + c.console.print(f"{corpus_name} deleted", style="green") except ApiException as e: if e.status == 404: c.console.print("Corpus not found.", style="red") diff --git a/py/requirements.txt b/py/requirements.txt index 6b80a53..66af576 100644 --- a/py/requirements.txt +++ b/py/requirements.txt @@ -4,4 +4,7 @@ -r packages/corpora_client/test-requirements.txt -r packages/corpora_proj/requirements.txt -r packages/corpora_ai_openai/requirements.txt +# TODO: decide if these should be isolated +langchain-text-splitters==0.3.2 +tiktoken==0.8.0 -r requirements-dev.txt