diff --git a/.corpora.yaml b/.corpora.yaml index 3df22f9..3337573 100644 --- a/.corpora.yaml +++ b/.corpora.yaml @@ -1,6 +1,6 @@ # .corpora.yaml -name: "corpora" +name: "corpora2" url: "https://github.com/skyl/corpora" server: diff --git a/.devcontainer/setup.sh b/.devcontainer/setup.sh index 1ba990a..305103c 100755 --- a/.devcontainer/setup.sh +++ b/.devcontainer/setup.sh @@ -4,4 +4,4 @@ # echo "starting zsh..." echo 'autoload -Uz add-zsh-hook; append_history() { fc -W }; add-zsh-hook precmd append_history; export HISTFILE=/home/vscode/.corpora.zsh_history/.zsh_history' >> ~/.zshrc echo alias tree="tree -I '.venv|node_modules|__pycache__|.git|.pytest_cache' -a" >> ~/.zshrc -zsh +# zsh diff --git a/.gitignore b/.gitignore index 3e91ef8..e71c493 100644 --- a/.gitignore +++ b/.gitignore @@ -1,3 +1,5 @@ +/.env + # macOS system files .DS_Store diff --git a/TODO.md b/TODO.md index d131d69..bc7da67 100644 --- a/TODO.md +++ b/TODO.md @@ -1,5 +1,11 @@ # TODO +- compare vector of summary versus vector of splits, create search endpoint(s) test adhoc queries + +- test full init with chained tasks + +- Python split with AST - langchain one is kinda' lame. + - for small corpora, we could get away with a mega-task but let's break into many single responsibilities - finish ingest for real - Start building the real records for the `corpora` corpus @@ -20,7 +26,7 @@ - agents ... function calling ... maybe we have a menu of function `mkdir`, `create file`, `rewrite file` * Take input. Analyze problem. Choose sequence of agents. Let agents execute in order. Report back to user. -- rm unused imports, for instance (on save, in CI..) +- more CI checks, local tools - rm unused imports, for instance (on save, in CI..) # Next @@ -60,4 +66,7 @@ - collect corpus, upload tarball to API - introduce celery (?) or similar for async tasks - handle the file (sync/async) upload in the API +- vector length in models versus oai implementation +- langchain-text-splitters - split based on type, cohesive approach, python and md + diff --git a/docker-compose.yaml b/docker-compose.yaml index 86c2dd6..ccaadd6 100644 --- a/docker-compose.yaml +++ b/docker-compose.yaml @@ -9,6 +9,9 @@ services: environment: PYTHONPATH: "/workspace/py/packages" REDIS_URL: "redis://redis:6379/0" + OPENAI_API_KEY: "${OPENAI_API_KEY}" + command: python manage.py runserver 0.0.0.0:8000 + working_dir: /workspace/py/packages/corpora_proj depends_on: - db - redis @@ -49,6 +52,7 @@ services: environment: PYTHONPATH: "/workspace/py/packages" REDIS_URL: "redis://redis:6379/0" + OPENAI_API_KEY: "${OPENAI_API_KEY}" depends_on: redis: condition: service_healthy diff --git a/md/notes/celery-tasks.md b/md/notes/celery-tasks.md new file mode 100644 index 0000000..8eba3b3 --- /dev/null +++ b/md/notes/celery-tasks.md @@ -0,0 +1,217 @@ +> Understanding Celery's task methods, such as `s`, `si`, `delay`, and `chain`, is essential for managing asynchronous workflows effectively. Each method has its own purpose, especially when working with task dependencies, immutability, and argument handling. +> +> Here’s a breakdown of the key Celery task methods and techniques, along with practical examples to illustrate their use cases. + +--- + +## 1. `delay`: The Simplest Way to Trigger a Task + +`delay` is a shortcut for calling a Celery task asynchronously with positional arguments. It’s easy to use and is equivalent to `.apply_async()` but with simplified syntax. 
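Keyword arguments are forwarded as well; with the `add` task defined in the example just below, a quick sketch:

```python
# delay(*args, **kwargs) maps directly onto apply_async(args, kwargs)
add.delay(4, y=5)  # same call as add.apply_async(args=(4,), kwargs={"y": 5})
```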
+ +### Example + +```python +@shared_task +def add(x, y): + return x + y + +# Calling the task asynchronously with delay +result = add.delay(4, 5) # Runs add(4, 5) asynchronously +``` + +### Usage Notes + +- **Simplifies Syntax**: You don’t need to use `.apply_async((args, kwargs), ...)`. +- **Fire and Forget**: `delay` is perfect for simple, one-off tasks. + +--- + +## 2. `.apply_async()`: Full Control over Task Execution + +While `delay` is convenient, `.apply_async()` provides more control. You can pass additional options such as `countdown`, `eta`, and `retry` policies. + +### Example + +```python +@shared_task +def send_email(to_address, subject, body): + # Send email logic here + return "Email sent" + +# Schedule task with additional options +result = send_email.apply_async( + args=("user@example.com", "Subject", "Body text"), + countdown=60 # Delay execution by 60 seconds +) +``` + +### Usage Notes + +- **Control Timing**: You can schedule tasks to run at specific times or after delays. +- **Pass Custom Options**: Supports retry policies, timeouts, and more. + +--- + +## 3. `s`: Signature for Building Task Chains and Groups + +The `s` method, short for “signature,” is used to define a task and its arguments without executing it. You can think of it as preparing a “task blueprint” for use in chains, groups, or chords. + +### Example + +```python +@shared_task +def process_data(data): + # Process data logic + return "Processed" + +# Define a task signature +task_signature = process_data.s("sample data") + +# Use the signature to run the task +task_signature.delay() +``` + +### Usage Notes + +- **Reusable**: Allows you to define tasks with arguments that can be reused in other contexts. +- **Works in Chains**: Essential for creating task chains, groups, and chords. + +--- + +## 4. `si`: Immutable Signature + +The `si` method, short for “signature immutable,” is similar to `s` but creates an **immutable** signature. This means that any result from a previous task in a chain will not be passed to this task, even if it’s part of a chain. + +### Example + +```python +@shared_task +def step_one(): + return "Result from step one" + +@shared_task +def step_two(data): + return f"Received: {data}" + +# Immutable signature - it will not receive output from step_one +chain(step_one.s(), step_two.si("Custom data")).apply_async() +``` + +In this example, `step_two` will receive `"Custom data"` as its input, ignoring the output of `step_one`. + +### Usage Notes + +- **Useful in Chains**: If a task should always receive specific arguments, use `si` to prevent it from receiving output from prior tasks. +- **Avoids Argument Mismatch Errors**: Prevents issues when chaining tasks that don’t accept the output of previous tasks. + +--- + +## 5. `chain`: Creating Sequential Task Pipelines + +`chain` is used to create a sequence of tasks where each task runs after the previous one completes. The result of each task is passed as the input to the next task in the chain (unless `si` is used). + +### Example + +```python +@shared_task +def add(x, y): + return x + y + +@shared_task +def multiply(result): + return result * 10 + +# Chain tasks together +result = chain(add.s(4, 5), multiply.s()).apply_async() +``` + +In this example: +- `add` runs first with arguments `(4, 5)` and returns `9`. +- `multiply` then receives `9` as its input and returns `90`. + +### Usage Notes + +- **Sequential Dependency**: Use `chain` when each task depends on the output of the previous task. 
+- **Error Handling**: If any task in the chain fails, the subsequent tasks will not run. + +--- + +## 6. `group`: Running Tasks in Parallel + +`group` lets you run multiple tasks in parallel and collect their results. It’s useful when tasks can run independently, and you want to aggregate their outputs. + +### Example + +```python +@shared_task +def add(x, y): + return x + y + +# Run tasks in parallel +result = group(add.s(2, 2), add.s(4, 4), add.s(6, 6)).apply_async() +``` + +In this example: +- Each `add` task runs independently. +- The results are collected as a list once all tasks complete (e.g., `[4, 8, 12]`). + +### Usage Notes + +- **Parallel Execution**: Ideal for tasks that don’t depend on each other. +- **Collect Results**: Results are returned as a list after all tasks complete. + +--- + +## 7. `chord`: Combining Groups with a Final Callback + +A `chord` is a combination of a `group` and a callback. All tasks in the group run in parallel, and once they complete, their results are passed to the callback task. + +### Example + +```python +@shared_task +def add(x, y): + return x + y + +@shared_task +def summarize(results): + return sum(results) + +# Define a chord +result = chord([add.s(2, 2), add.s(4, 4), add.s(6, 6)], summarize.s()).apply_async() +``` + +In this example: +- Each `add` task runs in parallel. +- The results `[4, 8, 12]` are passed to `summarize`, which returns `24`. + +### Usage Notes + +- **Useful for Aggregating Results**: The callback task can aggregate results from multiple tasks. +- **Sequential Logic**: Ensures that the callback runs only after all tasks in the group complete. + +--- + +## Summary Table + +| Method | Purpose | Usage Example | +|------------------|------------------------------------------------------------------------------------------|-----------------------------------| +| `delay` | Shortcut for asynchronously calling a task | `task.delay(args)` | +| `apply_async` | Full control over task execution with additional options | `task.apply_async(args, options)` | +| `s` | Signature for creating task definitions without execution | `task.s(args)` | +| `si` | Immutable signature to prevent prior task results from being passed | `task.si(args)` | +| `chain` | Sequential pipeline where each task depends on the output of the previous task | `chain(task1.s(), task2.s())` | +| `group` | Parallel execution of multiple tasks, with results collected as a list | `group([task1.s(), task2.s()])` | +| `chord` | Combines a group of parallel tasks with a callback that receives the group’s results | `chord(group, callback.s())` | + +--- + +### Choosing the Right Method + +- **For simple, independent tasks**: Use `delay` or `apply_async`. +- **For sequential tasks with dependencies**: Use `chain`. +- **For parallel tasks with independent execution**: Use `group`. +- **For combining parallel tasks with a final aggregation**: Use `chord`. +- **For tasks in a chain where you need a fixed input**: Use `si` to make the signature immutable. + +This understanding gives you the flexibility to create robust, efficient Celery workflows that handle task dependencies, sequencing, and parallelism as needed. 
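---

### Putting It Together

As a closing sketch, these primitives compose: below, each chunk gets a small `chain` (clean, then embed), the chains run in parallel as the header of a `chord`, and the callback receives every embedding once all chains finish. The helpers `get_embedding` and `save_vectors` are hypothetical stand-ins:

```python
from celery import chain, chord, shared_task

@shared_task
def clean(text):
    # Normalize whitespace before embedding
    return " ".join(text.split())

@shared_task
def embed(text):
    # Hypothetical embedding helper (e.g., an LLM provider call)
    return get_embedding(text)

@shared_task
def store(vectors):
    # Chord callback: receives the list of all embed results
    return save_vectors(vectors)

chunks = ["first chunk", "second chunk", "third chunk"]

# One clean -> embed chain per chunk, run in parallel; store fires once at the end
workflow = chord(
    [chain(clean.s(c), embed.s()) for c in chunks],
    store.s(),
)
workflow.apply_async()
```

This is the shape of a split-then-embed pipeline: sequential preprocessing within each chunk, parallel embedding across chunks, and a single aggregation step at the end.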
diff --git a/md/notes/practical-embeddings-tutorial-2-multivector.md b/md/notes/practical-embeddings-tutorial-2-multivector.md new file mode 100644 index 0000000..969cff2 --- /dev/null +++ b/md/notes/practical-embeddings-tutorial-2-multivector.md @@ -0,0 +1,140 @@ +## Comprehensive Guide to Multi-Vector Embeddings with ColBERT and pgvector in Django + +Multi-vector embeddings allow for fine-grained semantic search by storing multiple vectors per document, representing token-level or segment-level information. This tutorial will cover: + +1. **Overview of Multi-Vector Models and ColBERT** +2. **Setting Up Multi-Vector Embeddings in Django with pgvector** +3. **Practical Storage and Retrieval with Multi-Vectors** + +--- + +### 1. Overview of Multi-Vector Models and ColBERT + +**ColBERT** (Contextualized Late Interaction over BERT) is a multi-vector model that generates a vector for each token in a document, enabling high-resolution semantic matching. This token-level approach allows for detailed similarity matching between query terms and document terms. + +- **Dimensionality**: ColBERT commonly uses **128-dimensional embeddings per token**, balancing semantic accuracy with manageable storage. +- **Use Case**: Ideal for applications where granular matching (e.g., term-to-term or passage-to-passage) is needed, such as search in code repositories, books, or large document corpora. + +With ColBERT, each document is represented by an array of vectors, capturing the context of individual tokens, which can improve search precision. + +### 2. Setting Up Multi-Vector Embeddings in Django with pgvector + +#### Database Schema Design + +To store multi-vector embeddings in PostgreSQL with `pgvector`, you can leverage PostgreSQL’s array functionality. Each document will have an array of 128-dimensional vectors, one per token. + +#### Step-by-Step Implementation + +1. **Install pgvector and Set Up PostgreSQL** + + Make sure PostgreSQL has `pgvector` installed. If not, install it with: + ```sql + CREATE EXTENSION IF NOT EXISTS vector; + ``` + +2. **Define Django Model for Multi-Vector Embeddings** + + Use Django’s `ArrayField` along with `VectorField` from `pgvector.django`. Here’s how to define a model that stores multiple 128-dimensional vectors for each document: + + ```python + from django.db import models + from django.contrib.postgres.fields import ArrayField + from pgvector.django import VectorField + + class Document(models.Model): + content = models.TextField() + embeddings = ArrayField( + base_field=VectorField(dimensions=128), # 128 dimensions per token vector + size=None, # None allows for variable-length arrays + ) + ``` + + - **`content`**: Stores the raw text of the document. + - **`embeddings`**: An array field where each element is a 128-dimensional vector representing a token embedding. + +3. **Generate and Store Multi-Vector Embeddings** + + Use ColBERT to generate an embedding for each token in a document. 
Here’s a sample function that could generate and store these embeddings. Note that Hugging Face `transformers` ships no ColBERT-specific classes (the official checkpoints are typically driven through the `colbert-ai` library), so this sketch stands in plain BERT token embeddings and truncates them to 128 dimensions where real ColBERT applies a learned linear projection:

   ```python
   import torch
   from transformers import AutoModel, AutoTokenizer

   # Stand-in for a real ColBERT encoder (e.g., the colbert-ir/colbertv2.0
   # checkpoint used via the colbert-ai library)
   tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")
   model = AutoModel.from_pretrained("bert-base-uncased")

   def generate_and_store_embeddings(document_text):
       # Tokenize and obtain a contextual embedding for each token
       inputs = tokenizer(document_text, return_tensors="pt", truncation=True)
       with torch.no_grad():
           token_embeddings = model(**inputs).last_hidden_state.squeeze(0)  # (tokens, 768)

       # Keep the first 128 dimensions of each token vector -- a crude
       # placeholder for ColBERT's learned 768 -> 128 projection
       embeddings_list = token_embeddings[:, :128].tolist()

       # Store in Django
       document = Document(content=document_text, embeddings=embeddings_list)
       document.save()
   ```

   After generation, one 128-dimensional vector per token is stored in the `embeddings` array field.

#### Indexing for Efficient Retrieval

pgvector’s `ivfflat` (and `hnsw`) indexes apply to single `vector` columns, not to arrays of vectors, so the `embeddings` array above cannot be indexed directly (see pgvector issue #640). One workaround is a side table with one row per token vector, which can be indexed normally:

```sql
-- Hypothetical side table holding one 128-dim token vector per row
CREATE INDEX ON document_token_embedding USING ivfflat (vector vector_cosine_ops);
```

An indexed single-vector column (for example, a mean-pooled document vector) also works well as a fast pre-filter before exact multi-vector scoring.

### 3. Practical Storage and Retrieval with Multi-Vectors

With multi-vector embeddings stored, we can now perform searches that take advantage of the fine-grained information embedded in each token’s vector.

#### Example Query for Similarity Search with Late Interaction

When performing a search query, you generate embeddings for the query terms and compare them against the token-level embeddings of each document. Because pgvector cannot compute distances over an array-of-vectors column, the late-interaction (MaxSim) step below runs in Python over a pre-filtered candidate set; the `get_embeddings_for_text` helper and the candidate pre-filter are assumptions:

```python
import numpy as np

def find_similar_documents(query_text, candidates):
    # Multi-vector embeddings for the query, shape (q_tokens, 128); vectors are
    # assumed L2-normalized so the dot product equals cosine similarity
    query_embeddings = np.array(get_embeddings_for_text(query_text))

    def maxsim(document):
        doc_embeddings = np.array(document.embeddings)  # (d_tokens, 128)
        sims = query_embeddings @ doc_embeddings.T      # (q_tokens, d_tokens)
        # Each query token takes its best-matching document token
        return sims.max(axis=1).sum()

    return sorted(candidates, key=maxsim, reverse=True)[:10]
```

In this example:
- **MaxSim Scoring**: Each query token is matched to its most similar document token, and the per-token maxima are summed into the document’s score, allowing for granular matching.
- **Late Interaction**: Query and document token embeddings interact only at scoring time, offering higher precision in retrieval than a single pooled vector.

### Practical Considerations

1. **Storage Requirements**: Multi-vector embeddings require more storage than single-vector embeddings. Each token embedding adds data, so ensure your database can handle the additional storage requirements.

2. **Query Performance**: Multi-vector queries are more computationally intensive. Indexing, caching, and query optimization can help maintain acceptable performance.

3. **Adjustable Dimensionality**: If storage is a concern, consider reducing the dimensionality (e.g., to 64 or 32 dimensions) to lower storage requirements, though this may reduce search accuracy.

---

### Summary

Using ColBERT-style multi-vector embeddings with `pgvector` in Django enables a high-resolution search experience, ideal for complex corpora like code repositories or large document collections.
+ +- **Define Multi-Vector Storage**: Use `ArrayField` and `VectorField` to store arrays of vectors, representing token-level embeddings. +- **Efficient Querying**: Use Cosine similarity with late interaction to retrieve relevant results based on fine-grained semantic matches. +- **Performance and Storage Management**: Balance dimensionality, indexing, and storage to optimize for both accuracy and performance. + +This setup provides a robust foundation for implementing advanced search capabilities within a Django application, leveraging the power of multi-vector embeddings and `pgvector`. \ No newline at end of file diff --git a/md/notes/practical-embeddings-tutorial.md b/md/notes/practical-embeddings-tutorial.md new file mode 100644 index 0000000..d5ae117 --- /dev/null +++ b/md/notes/practical-embeddings-tutorial.md @@ -0,0 +1,102 @@ +Here’s an updated, accurate tutorial on embeddings with a focus on using `text-embedding-3-small` at 1536 dimensions, trade-offs in embedding size, practical strategies for corpora like books and code repositories, and how to manage dimensionality effectively. + +--- + +## Comprehensive Guide to Embeddings in High-Quality Search Systems + +Embeddings transform text or code into dense vectors, capturing semantic meaning for search and retrieval. For a high-quality application like Corpora, we’re focusing on accurate representations, optimized for both contextual retrieval and future flexibility in vector dimensions. + +This guide covers: +1. **Model Dimensions and Configurations** +2. **Effective Embedding Strategies for Different Corpora** +3. **Trade-offs in Dimensionality and Storage** + +## Model Dimensions and Configurations + +### Overview of `text-embedding-3-small` and Other Models + +- **`text-embedding-3-small`**: Generates a **1536-dimensional** embedding by default. This provides rich contextual data, suitable for complex retrieval tasks. OpenAI allows for reduced dimensions if needed by specifying a custom `dimensions` parameter, but the full dimensionality is recommended for applications focused on high semantic fidelity. + +- **Higher-Dimensional Options**: OpenAI’s larger models, such as `text-embedding-3-large`, can offer dimensions up to **3072**. These are ideal for highly nuanced tasks but come with increased storage and computational costs. + +- **Other Common Models**: + - **BERT**: Often used at 768 dimensions, strong for general-purpose contextual similarity. + - **Sentence Transformers**: Available at dimensions from 384 to 1024, flexible and widely used for semantic search. + +Given Corpora’s focus, using the **full 1536 dimensions** of `text-embedding-3-small` offers an optimal balance of detail and performance. + +### Setting up 1536 Dimensions in Django with pgvector + +To store 1536-dimensional vectors, configure your `VectorField` as follows: + +```python +# Define a 1536-dimensional VectorField for embeddings in Django with pgvector +from pgvector.django import VectorField + +vector = VectorField(dimensions=1536, null=True, blank=True) +``` + +This setup ensures high-quality embeddings, capturing detailed semantic nuances without any dimensionality reduction. + +## Effective Embedding Strategies for Different Corpora + +To make the most of embeddings, consider the structure and purpose of each corpus type: + +### 1. Books or Long Documents + - **Chunking for Contextual Search**: For long texts, break each document into chunks (e.g., 300–500 words). 
Embed each chunk individually, allowing for fine-grained search and retrieval of relevant passages. + - **Hierarchical Search**: Store and retrieve passages by chunk-level embeddings, then apply a secondary ranking if needed, based on the entire document. + +### 2. Code Repositories + - **Function-Level Embeddings**: For large files, embedding each function or class provides focused representations that are ideal for retrieving specific code snippets or analyzing code structure. + - **File-Level Embeddings for Smaller Files**: For small code files, embedding the entire file at once can be effective, offering a holistic view of the code’s purpose. + +In both cases, keeping context manageable (within 8191 tokens for `text-embedding-3-small`) ensures that embeddings maintain accuracy and relevance. + +### Strategies for Balancing Dimensionality and Retrieval Goals + +With high-dimensional embeddings like 1536, retrieval tasks are highly accurate but can be resource-intensive. Here are some tips for balancing dimensionality with search performance: + +1. **Use Primary Embedding (1536) for Critical Similarity Searches**: For high-stakes applications, prioritize retrieval with the full-dimensionality embedding. This setup will yield the best results but requires indexing and query optimization. + +2. **Store Multiple Embeddings for Configurable Retrieval**: If different levels of granularity are needed, consider storing multiple embeddings per record, such as: + - **`embedding_1536`**: Main field for detailed retrieval. + - **`embedding_300`**: Secondary field for lightweight, approximate similarity searches. + + This approach provides flexibility, allowing you to choose the embedding based on the retrieval context. + +3. **pgvector and Indexing**: Ensure pgvector is properly indexed for the primary vector field. For high-dimensional vectors, **Cosine Similarity** or **Inner Product** are commonly used to find nearest neighbors. + +```python +from django.db.models import F +from pgvector.django import CosineDistance + +def find_similar_documents(query_text): + query_embedding = client.get_embedding(query_text) + return Document.objects.annotate( + similarity=CosineDistance(F("embedding_1536"), query_embedding) + ).order_by("similarity")[:10] +``` + +This ensures efficient, high-quality retrieval across complex document corpora. + +## Trade-offs in Dimensionality and Storage + +Choosing a higher dimensionality, like 1536, brings significant advantages but also some considerations: + +1. **Storage Costs**: Higher dimensions require more storage. With a 1536-dimensional vector, each entry will consume significantly more database space compared to lower-dimensional vectors (e.g., 300 or 512). However, for high-value applications like Corpora, this trade-off is worthwhile. + +2. **Performance and Latency**: Queries with high-dimensional vectors can be slower, especially with larger datasets. Indexing and caching strategies, along with a well-tuned database, help mitigate these effects. + +3. **Dimensionality Reduction (Optional)**: If certain use cases warrant lower dimensions, consider techniques like **Principal Component Analysis (PCA)** on the 1536-dimensional vectors to produce 300- or 512-dimensional approximations for faster, approximate searches. 
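That optional reduction step can be sketched concretely. Below, scikit-learn's PCA (an assumed dependency) derives 300-dimensional approximations from the stored 1536-dimensional vectors; `Document`, `embedding_1536`, and `embedding_300` are the illustrative names used above:

```python
import numpy as np
from sklearn.decomposition import PCA

# Fit PCA on the full-fidelity vectors (requires at least 300 documents)
docs = list(Document.objects.exclude(embedding_1536=None))
matrix = np.array([doc.embedding_1536 for doc in docs])  # (n_docs, 1536)

pca = PCA(n_components=300)
reduced = pca.fit_transform(matrix)  # (n_docs, 300)

# Store the approximations alongside the originals
for doc, vec in zip(docs, reduced):
    doc.embedding_300 = vec.tolist()
    doc.save(update_fields=["embedding_300"])

# Queries must pass through the same fitted transform, e.g.:
# query_300 = pca.transform([query_embedding_1536])[0]
```

Persist the fitted `pca` object (e.g., with `joblib.dump`) so that future documents and query vectors are projected into the same 300-dimensional space.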
+ +--- + +## Summary + +For a world-class application like Corpora, the 1536-dimensional configuration of `text-embedding-3-small` strikes an ideal balance, providing: + +- **High Semantic Fidelity**: Essential for detailed contextual search across books and code. +- **Scalable Retrieval Strategies**: Options to chunk and store various embedding levels for optimal search performance. +- **Configurable Dimensionality**: Storing primary high-dimension embeddings while allowing secondary, smaller dimensions if required. + +By following these strategies, Corpora will be equipped for high-quality, contextually rich search capabilities that meet the standards of a world-class application. diff --git a/py/packages/corpora/admin.py b/py/packages/corpora/admin.py index aa39fd8..7c291c2 100644 --- a/py/packages/corpora/admin.py +++ b/py/packages/corpora/admin.py @@ -23,8 +23,11 @@ class CorpusTextFileAdmin(admin.ModelAdmin): readonly_fields = ("checksum", "created_at", "updated_at") fieldsets = ( (None, {"fields": ("corpus", "path", "content")}), - ("AI Summary", {"fields": ("ai_summary", "vector_of_summary")}), - ("Metadata", {"fields": ("checksum", "created_at", "updated_at")}), + ("AI Summary", {"fields": ("ai_summary",)}), + ( + "Metadata", + {"fields": ("checksum", "created_at", "updated_at")}, + ), ) @@ -34,10 +37,9 @@ class SplitAdmin(admin.ModelAdmin): list_display = ("file", "order", "content_preview", "metadata") search_fields = ("file__path", "content") ordering = ("file", "order") - readonly_fields = ("vector",) fieldsets = ( (None, {"fields": ("file", "order", "content")}), - ("Vector Data", {"fields": ("vector", "metadata")}), + ("Meta", {"fields": ("metadata",)}), ) def content_preview(self, obj): diff --git a/py/packages/corpora/migrations/0007_alter_split_vector.py b/py/packages/corpora/migrations/0007_alter_split_vector.py new file mode 100644 index 0000000..911b9f9 --- /dev/null +++ b/py/packages/corpora/migrations/0007_alter_split_vector.py @@ -0,0 +1,24 @@ +# Generated by Django 5.1.2 on 2024-11-04 01:41 + +import pgvector.django.vector +from django.db import migrations + + +class Migration(migrations.Migration): + + dependencies = [ + ("corpora", "0006_rename_file_corpustextfile"), + ] + + operations = [ + migrations.AlterField( + model_name="split", + name="vector", + field=pgvector.django.vector.VectorField( + blank=True, + dimensions=1536, + help_text="text-embedding-3-small vector of the content", + null=True, + ), + ), + ] diff --git a/py/packages/corpora/migrations/0008_alter_corpustextfile_vector_of_summary.py b/py/packages/corpora/migrations/0008_alter_corpustextfile_vector_of_summary.py new file mode 100644 index 0000000..0e64f27 --- /dev/null +++ b/py/packages/corpora/migrations/0008_alter_corpustextfile_vector_of_summary.py @@ -0,0 +1,24 @@ +# Generated by Django 5.1.2 on 2024-11-04 02:58 + +import pgvector.django.vector +from django.db import migrations + + +class Migration(migrations.Migration): + + dependencies = [ + ("corpora", "0007_alter_split_vector"), + ] + + operations = [ + migrations.AlterField( + model_name="corpustextfile", + name="vector_of_summary", + field=pgvector.django.vector.VectorField( + blank=True, + dimensions=1536, + help_text="text-embedding-3-small vector of the content", + null=True, + ), + ), + ] diff --git a/py/packages/corpora/models.py b/py/packages/corpora/models.py index 317bb8b..a6388df 100644 --- a/py/packages/corpora/models.py +++ b/py/packages/corpora/models.py @@ -1,9 +1,15 @@ +import os import uuid + from django.db import 
models from django.utils import timezone from django.contrib.auth import get_user_model + +# from django.contrib.postgres.fields import ArrayField from pgvector.django import VectorField +from corpora_ai.split import get_text_splitter + User = get_user_model() @@ -54,7 +60,13 @@ class CorpusTextFile(models.Model): path = models.CharField(max_length=1024) content = models.TextField(blank=True) ai_summary = models.TextField(blank=True) - vector_of_summary = VectorField(dimensions=300, null=True, blank=True) + vector_of_summary = VectorField( + dimensions=1536, + null=True, + blank=True, + help_text="text-embedding-3-small vector of the content", + editable=False, + ) checksum = models.CharField( max_length=40, editable=False, @@ -70,6 +82,44 @@ class Meta: def __str__(self): return f"{self.corpus.name}:{self.path}" + def get_and_save_summary(self): + from corpora_ai.provider_loader import load_llm_provider + + llm = load_llm_provider() + summary = llm.get_summary(self._get_text_representation()) + self.ai_summary = summary + self.save(update_fields=["ai_summary"]) + + def _get_text_representation(self): + return f"{self.corpus.name}:{self.path}\n\n{self.content}" + + def get_and_save_vector_of_summary(self): + from corpora_ai.provider_loader import load_llm_provider + + llm = load_llm_provider() + vector = llm.get_embedding(self.ai_summary) + self.vector_of_summary = vector + self.save(update_fields=["vector_of_summary"]) + + def split_content(self): + """ + Splits the content of the file into smaller parts using an appropriate text splitter. + Returns a list of Split instances. + """ + file_name = os.path.basename(self.path) + splitter = get_text_splitter(file_name) + + # Split content into parts + parts = splitter.split_text(self.content) + splits = [] + + # Create Split instances for each part + for order, part in enumerate(parts): + split = Split.objects.create(file=self, order=order, content=part) + splits.append(split) + + return splits + class Split(models.Model): id = models.UUIDField(primary_key=True, default=uuid.uuid4, editable=False) @@ -78,7 +128,20 @@ class Split(models.Model): ) order = models.PositiveIntegerField() content = models.TextField(blank=True) - vector = VectorField(dimensions=300, null=True, blank=True) + vector = VectorField( + dimensions=1536, + null=True, + blank=True, + help_text="text-embedding-3-small vector of the content", + editable=False, + ) + # # Multivector: https://huggingface.co/colbert-ir/colbertv2.0 + # https://github.com/pgvector/pgvector/issues/640 + # https://docs.djangoproject.com/en/5.1/ref/contrib/postgres/fields/#arrayfield + # colbert_embeddings = ArrayField( + # base_field=VectorField(dimensions=128), + # size=None, # Set to None for variable-length arrays + # ) metadata = models.JSONField(default=dict, blank=True) class Meta: @@ -87,3 +150,17 @@ class Meta: def __str__(self): return f"{self.file.corpus.name}:{self.file.path}:{self.order}" + + def get_and_save_vector(self): + from corpora_ai.provider_loader import load_llm_provider + + llm = load_llm_provider() + vector = llm.get_embedding(self.content) + self.vector = vector + self.save(update_fields=["vector"]) + + # # Optionally, for multi-vector storage + # def get_and_save_colbert_vectors(self): + # colbert_vectors = generate_colbert_vectors(self.content) # e.g., a list of 128-dim vectors + # self.colbert_embeddings = colbert_vectors + # self.save(update_fields=["colbert_embeddings"]) diff --git a/py/packages/corpora/tasks.py b/py/packages/corpora/tasks.py index 674284d..503a2e5 
100644 --- a/py/packages/corpora/tasks.py +++ b/py/packages/corpora/tasks.py @@ -3,34 +3,58 @@ from celery import shared_task from .lib.files import compute_checksum -from .models import Corpus, CorpusTextFile +from .models import Corpus, CorpusTextFile, Split @shared_task def process_tarball(corpus_id: str, tarball: bytes) -> None: - """ - Process a tarball by extracting each file and creating a `CorpusFile` - entry for each extracted file in the database. - """ - print(f"Processing tarball... {corpus_id}") corpus = Corpus.objects.get(id=corpus_id) - with tarfile.open(fileobj=io.BytesIO(tarball), mode="r:gz") as tar: for member in tar.getmembers(): if member.isfile(): - file_content = tar.extractfile(member).read() + file_content = ( + tar.extractfile(member).read().decode("utf-8", errors="replace") + ) checksum = compute_checksum(file_content) - print(f"{member.name}") - print(f"{checksum}") - # Save each extracted file as a `CorpusFile` entry - cf = CorpusTextFile.objects.create( + # Create a CorpusTextFile and kick off further tasks + corpus_file = CorpusTextFile.objects.create( corpus=corpus, path=member.name, - content=file_content.decode("utf-8", errors="replace"), + content=file_content, checksum=checksum, ) - print(f"Created file: {cf}") + generate_summary_task.delay(corpus_file.id) + split_file_task.delay(corpus_file.id) + + +@shared_task +def generate_summary_task(corpus_file_id: str) -> None: + corpus_file = CorpusTextFile.objects.get(id=corpus_file_id) + corpus_file.get_and_save_summary() + corpus_file.get_and_save_vector_of_summary() + + +@shared_task +def split_file_task(corpus_file_id: str) -> None: + corpus_file = CorpusTextFile.objects.get(id=corpus_file_id) + splits = corpus_file.split_content() + for split in splits: + # no need to chain + generate_vector_task.delay(split.id) + # generate_colbert_vectors_task.delay(split.id) + + +@shared_task +def generate_vector_task(split_id: str) -> None: + split = Split.objects.get(id=split_id) + split.get_and_save_vector() + + +# @shared_task +# def generate_colbert_vectors_task(split_id: str) -> None: +# split = Split.objects.get(id=split_id) +# split.get_and_save_colbert_vectors() @shared_task diff --git a/py/packages/corpora_ai/README.md b/py/packages/corpora_ai/README.md index cd514d5..08fe35e 100644 --- a/py/packages/corpora_ai/README.md +++ b/py/packages/corpora_ai/README.md @@ -36,6 +36,6 @@ print(response) ### Generating an Embedding ```python -embedding = llm.generate_embedding("Sample text for embedding") +embedding = llm.get_embedding("Sample text for embedding") print(embedding) ``` diff --git a/py/packages/corpora_ai/count_tokens.py b/py/packages/corpora_ai/count_tokens.py new file mode 100644 index 0000000..876589d --- /dev/null +++ b/py/packages/corpora_ai/count_tokens.py @@ -0,0 +1,20 @@ +import tiktoken + + +def count_tokens(text: str, model: str = "gpt-3.5-turbo") -> int: + """ + Counts the number of tokens in a given text string for a specific model. + + Args: + text (str): The text to count tokens for. + model (str): The model to base the tokenization on. Default is "gpt-3.5-turbo". + + Returns: + int: The number of tokens in the text. 
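+
+    Example:
+        # e.g., guard text against text-embedding-3-small's 8191-token input limit
+        if count_tokens(chunk, model="gpt-3.5-turbo") > 8191:
+            ...  # split further before embedding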
+ """ + # Load the tokenizer for the specified model + encoding = tiktoken.encoding_for_model(model) + + # Encode the text and count the tokens + tokens = encoding.encode(text) + return len(tokens) diff --git a/py/packages/corpora_ai/llm_interface.py b/py/packages/corpora_ai/llm_interface.py index ddcd656..6638379 100644 --- a/py/packages/corpora_ai/llm_interface.py +++ b/py/packages/corpora_ai/llm_interface.py @@ -2,6 +2,8 @@ from dataclasses import dataclass from typing import List +from corpora_ai.prompts import SUMMARIZE_SYSTEM_MESSAGE + @dataclass class ChatCompletionTextMessage: @@ -32,7 +34,7 @@ def get_text_completion(self, messages: List[ChatCompletionTextMessage]) -> str: pass @abstractmethod - def generate_embedding(self, text: str) -> List[float]: + def get_embedding(self, text: str) -> List[float]: """ Generates an embedding vector for the input text, suitable for a pgvector VectorField. @@ -43,3 +45,26 @@ def generate_embedding(self, text: str) -> List[float]: List[float]: The embedding vector. """ pass + + def get_summary(self, text: str) -> str: + """ + Generates a summary of the input text. + + Args: + text (str): The text to summarize. + + Returns: + str: The generated summary text. + """ + return self.get_text_completion( + [ + ChatCompletionTextMessage( + role="system", + content=SUMMARIZE_SYSTEM_MESSAGE, + ), + ChatCompletionTextMessage( + role="user", + content=f"Summarize the following:\n {text}", + ), + ] + ) diff --git a/py/packages/corpora_ai/prompts.py b/py/packages/corpora_ai/prompts.py new file mode 100644 index 0000000..787e6ea --- /dev/null +++ b/py/packages/corpora_ai/prompts.py @@ -0,0 +1,9 @@ +SUMMARIZE_SYSTEM_MESSAGE = ( + "You are a highly analytical assistant trained to produce concise summaries. " + "For the input text, summarize its main ideas while preserving key terminology, " + "concepts, and any domain-specific vocabulary, whether from programming, technical " + "documents, or general language. Retain proper nouns, unique terms, and relevant " + "phrases that capture the essential meaning and context. The summary should be " + "short, cohesive, and representative of the original text's core message, making " + "it suitable for semantic search and relevance matching." +) diff --git a/py/packages/corpora_ai/split.py b/py/packages/corpora_ai/split.py new file mode 100644 index 0000000..a261b20 --- /dev/null +++ b/py/packages/corpora_ai/split.py @@ -0,0 +1,41 @@ +import os +from typing import Union + +from langchain_text_splitters import ( + PythonCodeTextSplitter, + # MarkdownHeaderTextSplitter, + MarkdownTextSplitter, + CharacterTextSplitter, +) + + +def get_text_splitter( + file_name: str, + chunk_size: int = 5000, # number of characters + chunk_overlap: int = 0, # number of characters +) -> Union[PythonCodeTextSplitter, CharacterTextSplitter]: + """ + Returns an appropriate text splitter based on the file extension or name. 
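+
+    Example:
+        get_text_splitter("models.py")  # -> PythonCodeTextSplitter
+        get_text_splitter("NOTES.md")   # -> MarkdownTextSplitter
+        get_text_splitter("Makefile")   # -> CharacterTextSplitter (default)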
+ """ + # Mapping specific extensions to splitters + extension_to_splitter = { + ".py": PythonCodeTextSplitter, + ".md": MarkdownTextSplitter, + # Add more mappings as needed + } + + # Extract the extension (if available) and lower-case for consistency + _, ext = os.path.splitext(file_name) + ext = ext.lower() + + # Handle files without extensions using a default splitter + splitter_class = extension_to_splitter.get(ext, None) + + # For files with defined splitters, return the configured splitter instance + if splitter_class: + return splitter_class(chunk_size=chunk_size, chunk_overlap=chunk_overlap) + + # For text-based formats or unknown extensions, use CharacterTextSplitter with `\n\n` + return CharacterTextSplitter( + separator="\n\n", chunk_size=chunk_size, chunk_overlap=chunk_overlap + ) diff --git a/py/packages/corpora_ai/test_provider_loader.py b/py/packages/corpora_ai/test_provider_loader.py index ccd0f21..5e4307b 100644 --- a/py/packages/corpora_ai/test_provider_loader.py +++ b/py/packages/corpora_ai/test_provider_loader.py @@ -24,7 +24,7 @@ def test_load_openai_provider_success(self, MockOpenAIClient): self.assertIsInstance(provider, LLMBaseInterface) self.assertEqual(provider, mock_client_instance) - @patch.dict(os.environ, {"LLM_PROVIDER": "openai"}) + @patch.dict(os.environ, {"LLM_PROVIDER": "openai", "OPENAI_API_KEY": ""}) @patch("corpora_ai.provider_loader.OpenAIClient") def test_missing_openai_api_key(self, MockOpenAIClient): """ diff --git a/py/packages/corpora_ai_openai/README.md b/py/packages/corpora_ai_openai/README.md index 4724c50..62832cb 100644 --- a/py/packages/corpora_ai_openai/README.md +++ b/py/packages/corpora_ai_openai/README.md @@ -33,7 +33,7 @@ print(response) ### Generating an Embedding ```python -embedding = llm.generate_embedding("Sample text for embedding") +embedding = llm.get_embedding("Sample text for embedding") print(embedding) ``` diff --git a/py/packages/corpora_ai_openai/llm_client.py b/py/packages/corpora_ai_openai/llm_client.py index 3c80f7a..69bf23b 100644 --- a/py/packages/corpora_ai_openai/llm_client.py +++ b/py/packages/corpora_ai_openai/llm_client.py @@ -25,7 +25,7 @@ def get_text_completion(self, messages: List[ChatCompletionTextMessage]) -> str: ) return response.choices[0].message.content - def generate_embedding(self, text: str) -> List[float]: + def get_embedding(self, text: str) -> List[float]: if not text: raise ValueError("Input text must not be empty.") response = self.client.embeddings.create(input=text, model=self.embedding_model) diff --git a/py/packages/corpora_ai_openai/test_llm_client.py b/py/packages/corpora_ai_openai/test_llm_client.py index 8b9ec63..4cb7747 100644 --- a/py/packages/corpora_ai_openai/test_llm_client.py +++ b/py/packages/corpora_ai_openai/test_llm_client.py @@ -37,9 +37,9 @@ def test_get_text_completion_success(self): model="gpt-4o", messages=[{"role": "user", "content": "Tell me a joke."}] ) - def test_generate_embedding_success(self): + def test_get_embedding_success(self): """ - Test that generate_embedding returns the correct embedding vector. + Test that get_embedding returns the correct embedding vector. 
""" # Mock response from OpenAI API mock_response = MagicMock() @@ -47,8 +47,8 @@ def test_generate_embedding_success(self): mock_response.data[0].embedding = [0.1, 0.2, 0.3] self.mock_openai_client.embeddings.create.return_value = mock_response - # Call generate_embedding and assert response - response = self.client.generate_embedding("Sample text for embedding") + # Call get_embedding and assert response + response = self.client.get_embedding("Sample text for embedding") self.assertEqual(response, [0.1, 0.2, 0.3]) # Ensure OpenAI API was called with correct parameters @@ -63,12 +63,12 @@ def test_get_text_completion_empty_messages(self): with self.assertRaises(ValueError): self.client.get_text_completion([]) - def test_generate_embedding_empty_text(self): + def test_get_embedding_empty_text(self): """ - Test that generate_embedding raises an error when text is empty. + Test that get_embedding raises an error when text is empty. """ with self.assertRaises(ValueError): - self.client.generate_embedding("") + self.client.get_embedding("") if __name__ == "__main__": diff --git a/py/packages/corpora_cli/commands/corpus.py b/py/packages/corpora_cli/commands/corpus.py index 5b5bc48..2b8d4c8 100644 --- a/py/packages/corpora_cli/commands/corpus.py +++ b/py/packages/corpora_cli/commands/corpus.py @@ -45,7 +45,7 @@ def delete(ctx: typer.Context): c.console.print(f"Deleting corpus: {corpus_name}") try: c.api_client.corpora_api_delete_corpus(corpus_name) - c.console.print("Corpus deleted.", style="green") + c.console.print(f"{corpus_name} deleted", style="green") except ApiException as e: if e.status == 404: c.console.print("Corpus not found.", style="red") diff --git a/py/requirements.txt b/py/requirements.txt index 6b80a53..66af576 100644 --- a/py/requirements.txt +++ b/py/requirements.txt @@ -4,4 +4,7 @@ -r packages/corpora_client/test-requirements.txt -r packages/corpora_proj/requirements.txt -r packages/corpora_ai_openai/requirements.txt +# TODO: decide if these should be isolated +langchain-text-splitters==0.3.2 +tiktoken==0.8.0 -r requirements-dev.txt