Skip to content

Commit

Permalink
refactor(py): use ruff instead of black (#62)
Browse files Browse the repository at this point in the history
  • Loading branch information
skyl authored Nov 24, 2024
1 parent ab7b60b commit 1f7cfe9
Show file tree
Hide file tree
Showing 112 changed files with 1,799 additions and 1,389 deletions.
24 changes: 8 additions & 16 deletions .devcontainer/devcontainer.json
Original file line number Diff line number Diff line change
Expand Up @@ -6,21 +6,13 @@
"customizations": {
"vscode": {
"settings": {
"python.formatting.provider": "black",
"python.formatting.formatOnSave": true,
"python.formatting.blackArgs": [
"--line-length",
"88",
"--config",
"/workspace/py/pyproject.toml"
],
// TODO: you have to uninstall autopep8 in devcontainer
// because of Microsoft defaults.
// ... if the editor is formatting wrongly
// then uninstall autopep8 extension.
// some day Microsoft will fix this?
"python.formatting.autopep8Path": "",
"editor.formatOnSave": true,
"[python]": {
"editor.defaultFormatter": "charliermarsh.ruff",
"editor.formatOnSave": true,
"editor.codeActionsOnSave": {
"source.fixAll": "always"
}
},
"python.linting.enabled": true,
"python.pythonPath": "/usr/local/bin/python",
"python.analysis.autoImportCompletions": true,
Expand All @@ -29,7 +21,7 @@
"extensions": [
// py
"ms-python.python",
"ms-python.black-formatter",
"charliermarsh.ruff",
// github
"GitHub.copilot",
"GitHub.copilot-chat",
Expand Down
5 changes: 3 additions & 2 deletions .github/workflows/ci-python.yml
Original file line number Diff line number Diff line change
Expand Up @@ -46,8 +46,9 @@ jobs:
python -m pip install --upgrade pip
pip install -r py/requirements.txt
- name: Black
run: black --check py/
- name: Ruff
working-directory: py/
run: ruff check

- name: Run tests with pytest
env:
Expand Down
1 change: 1 addition & 0 deletions docker-compose.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -77,6 +77,7 @@ services:
CORPORA_CLIENT_ID: "${CORPORA_CLIENT_ID}"
CORPORA_CLIENT_SECRET: "${CORPORA_CLIENT_SECRET}"
GITHUB_TOKEN: "${GITHUB_TOKEN}"
# OPENAI_API_KEY: "${OPENAI_API_KEY}"
command: sleep infinity
working_dir: /workspace
networks:
Expand Down
2 changes: 1 addition & 1 deletion py/README.md
Original file line number Diff line number Diff line change
Expand Up @@ -34,4 +34,4 @@ Each package serves a distinct purpose:
## Development Workflow

- **API Development**: Add endpoints in `corpora`. Use `./genall.sh` to update `corpora_client` and `corpora_cli`.
- **Testing and Formatting**: Use `black` for code formatting and `pytest` for tests.
- **Testing and Formatting**: Use `ruff` for linting and code formatting, and `pytest` for tests.
2 changes: 1 addition & 1 deletion py/genall.sh
Original file line number Diff line number Diff line change
Expand Up @@ -20,5 +20,5 @@ cp gen/corpora_client/setup.py packages/corpora_client/setup.py
cp gen/corpora_client/requirements.txt packages/corpora_client/requirements.txt
cp gen/corpora_client/test-requirements.txt packages/corpora_client/test-requirements.txt
rm -rf gen/corpora_client
black .
ruff check --fix
pytest
3 changes: 3 additions & 0 deletions py/packages/corpora/admin.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,5 @@
from django.contrib import admin

from .models import Corpus, CorpusTextFile, Split


Expand All @@ -8,9 +9,11 @@ class CorpusAdmin(admin.ModelAdmin):
search_fields = ("name", "id", "url")
ordering = ("-updated_at",)
readonly_fields = ("id", "created_at", "updated_at")
autocomplete_fields = ("owner",)
fieldsets = (
(None, {"fields": ("name", "id", "url")}),
("Timestamps", {"fields": ("created_at", "updated_at")}),
("Owner", {"fields": ("owner",)}),
)


Expand Down
2 changes: 1 addition & 1 deletion py/packages/corpora/auth.py
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
from asgiref.sync import sync_to_async
from django.utils import timezone
from ninja.security import HttpBearer
from oauth2_provider.models import AccessToken
from django.utils import timezone


class BearerAuth(HttpBearer):
Expand Down
1 change: 1 addition & 0 deletions py/packages/corpora/lib/dj/decorators.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,5 @@
from functools import wraps

from django.core.exceptions import ObjectDoesNotExist
from django.http import Http404

Expand Down
3 changes: 2 additions & 1 deletion py/packages/corpora/lib/dj/test_decorators.py
Original file line number Diff line number Diff line change
@@ -1,7 +1,8 @@
import pytest
from django.core.exceptions import ObjectDoesNotExist
from django.http import Http404

from corpora.lib.dj.decorators import async_raise_not_found
from django.core.exceptions import ObjectDoesNotExist


# A sample async function to simulate normal and failing behavior
Expand Down
4 changes: 2 additions & 2 deletions py/packages/corpora/lib/files.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
from typing import Union
import hashlib
from typing import Union


def compute_checksum(content: Union[bytes, str]) -> str:
Expand All @@ -13,5 +13,5 @@ def compute_checksum(content: Union[bytes, str]) -> str:
content = content.encode("utf-8")

size = str(len(content))
sha = hashlib.sha1(f"blob {size}\0".encode("utf-8") + content).hexdigest()
sha = hashlib.sha1(f"blob {size}\0".encode() + content).hexdigest()
return sha
4 changes: 3 additions & 1 deletion py/packages/corpora/lib/test_files.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,7 @@
import pytest
import hashlib

import pytest

from corpora.lib.files import compute_checksum


Expand Down
9 changes: 5 additions & 4 deletions py/packages/corpora/migrations/0002_initial.py
Original file line number Diff line number Diff line change
@@ -1,9 +1,10 @@
# Generated by Django 5.1.2 on 2024-10-27 17:12

import uuid

import django.db.models.deletion
import django.utils.timezone
import pgvector.django.vector
import uuid
from django.db import migrations, models


Expand Down Expand Up @@ -75,14 +76,14 @@ class Migration(migrations.Migration):
(
"vector_of_summary",
pgvector.django.vector.VectorField(
blank=True, dimensions=300, null=True
blank=True, dimensions=300, null=True,
),
),
("checksum", models.CharField(editable=False, max_length=32)),
(
"created_at",
models.DateTimeField(
default=django.utils.timezone.now, editable=False
default=django.utils.timezone.now, editable=False,
),
),
("updated_at", models.DateTimeField(auto_now=True)),
Expand Down Expand Up @@ -117,7 +118,7 @@ class Migration(migrations.Migration):
(
"vector",
pgvector.django.vector.VectorField(
blank=True, dimensions=300, null=True
blank=True, dimensions=300, null=True,
),
),
("metadata", models.JSONField(blank=True, default=dict)),
Expand Down
69 changes: 39 additions & 30 deletions py/packages/corpora/models.py
Original file line number Diff line number Diff line change
@@ -1,22 +1,26 @@
import logging
import os
import uuid

from corpora_ai.split import get_text_splitter
from django.contrib.auth import get_user_model
from django.db import models
from django.db.models.manager import BaseManager
from django.utils import timezone
from django.contrib.auth import get_user_model

# from django.contrib.postgres.fields import ArrayField
from pgvector.django import VectorField, CosineDistance
from pgvector.django import CosineDistance, VectorField


from corpora_ai.split import get_text_splitter
# TODO: This loads too early and makes it hard to mock
# from corpora_ai.provider_loader import load_llm_provider

User = get_user_model()

logger = logging.getLogger(__name__)


class Corpus(models.Model):
"""
Represents a unique corpus, often corresponding to a specific repository
"""Represents a unique corpus, often corresponding to a specific repository
or collection of documents. A corpus can have an associated URL for
tracking its origin, such as a GitHub repository.
"""
Expand All @@ -30,7 +34,6 @@ class Corpus(models.Model):
help_text="User who owns the corpus.",
)
url = models.URLField(
null=True,
blank=True,
help_text="Optional URL associated with the corpus, e.g., a GitHub repository.",
)
Expand All @@ -40,7 +43,8 @@ class Corpus(models.Model):
help_text="Timestamp indicating when the corpus was created.",
)
updated_at = models.DateTimeField(
auto_now=True, help_text="Timestamp indicating the last update of the corpus."
auto_now=True,
help_text="Timestamp indicating the last update of the corpus.",
)

class Meta:
Expand All @@ -50,10 +54,12 @@ class Meta:
def __str__(self):
return self.name

def get_relevant_splits(self, text: str, limit: int = 10):
"""
Given a text query, return the most relevant splits from this corpus.
"""
def get_relevant_splits(
self,
text: str,
limit: int = 10,
) -> BaseManager["Split"]:
"""Given a text query, return the most relevant splits from this corpus."""
from corpora_ai.provider_loader import load_llm_provider

llm = load_llm_provider()
Expand All @@ -70,38 +76,37 @@ def get_relevant_splits(self, text: str, limit: int = 10):
.order_by("similarity")[:limit]
)

def get_relevant_splits_context(self, text: str, limit: int = 10):
"""
Given a text query, return the most relevant splits from this corpus
def get_relevant_splits_context(self, text: str, limit: int = 10) -> str:
"""Given a text query, return the most relevant splits from this corpus
along with the context of the split.
"""
splits = self.get_relevant_splits(text, limit)
split_context = ""
for split in splits:
split_context += f"\n\n{split.file.path}:\n```\n{split.content}\n```\n\n"
split_context += (
f"\n\n{split.file.path}:\n```\n{split.content}\n```\n\n"
)
return split_context

def get_file_hashes(self) -> dict:
"""
Retrieve a map of file paths to their hashes for this Corpus.
"""
"""Retrieve a map of file paths to their hashes for this Corpus."""
# TODO: narrow the bare `dict` return annotation (maps file path -> checksum).
return {file.path: file.checksum for file in self.files.all()}

def delete_files(self, files: list):
"""
Delete files from this Corpus by path.
"""
def delete_files(self, files: list) -> None:
"""Delete files from this Corpus by path."""
self.files.filter(path__in=files).delete()


class CorpusTextFile(models.Model):
"""
A file with UTF-8 text content associated with a Corpus.
"""
"""A file with UTF-8 text content associated with a Corpus."""

id = models.UUIDField(primary_key=True, default=uuid.uuid4, editable=False)
corpus = models.ForeignKey(Corpus, on_delete=models.CASCADE, related_name="files")
corpus = models.ForeignKey(
Corpus,
on_delete=models.CASCADE,
related_name="files",
)
path = models.CharField(max_length=1024)
content = models.TextField(blank=True)
ai_summary = models.TextField(blank=True)
Expand Down Expand Up @@ -147,8 +152,7 @@ def get_and_save_vector_of_summary(self):
self.save(update_fields=["vector_of_summary"])

def split_content(self):
"""
Splits the content of the file into smaller parts using an appropriate text splitter.
"""Splits the content of the file into smaller parts using an appropriate text splitter.
Returns a list of Split instances.
"""
file_name = os.path.basename(self.path)
Expand All @@ -169,7 +173,9 @@ def split_content(self):
class Split(models.Model):
id = models.UUIDField(primary_key=True, default=uuid.uuid4, editable=False)
file = models.ForeignKey(
CorpusTextFile, on_delete=models.CASCADE, related_name="splits"
CorpusTextFile,
on_delete=models.CASCADE,
related_name="splits",
)
order = models.PositiveIntegerField()
content = models.TextField(blank=True)
Expand Down Expand Up @@ -197,6 +203,9 @@ def __str__(self):
return f"{self.file.corpus.name}:{self.file.path}:{self.order}"

def get_and_save_vector(self):
logger.info(
f"{self.file.path}: {self.content[:10]} ... {self.content[-10:]}",
)
from corpora_ai.provider_loader import load_llm_provider

llm = load_llm_provider()
Expand Down
3 changes: 1 addition & 2 deletions py/packages/corpora/router.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,11 +3,10 @@
from .auth import BearerAuth
from .routers.corpus import corpus_router
from .routers.corpustextfile import file_router
from .routers.split import split_router
from .routers.plan import plan_router
from .routers.split import split_router
from .routers.workon import workon_router


api = Router(tags=["corpora"], auth=BearerAuth())

api.add_router("corpus", corpus_router)
Expand Down
Loading

0 comments on commit 1f7cfe9

Please sign in to comment.