Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

feat: add CLI command to run OCR on past images #572

Merged
merged 1 commit into from
Nov 21, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 4 additions & 0 deletions Makefile
Original file line number Diff line number Diff line change
Expand Up @@ -156,6 +156,10 @@ migrate-db:
@echo "🥫 Migrating database …"
${DOCKER_COMPOSE} run --rm --no-deps api python3 manage.py migrate

# Run a Django management command (manage.py) in a one-off `api` container.
# The command and its flags are passed through the `args` variable,
# e.g. `make cli args='run_ocr --override'`.
# NOTE(review): `guard-args` is presumably a guard that requires `args` to be set — confirm.
cli: guard-args
	${DOCKER_COMPOSE} run --rm --no-deps api python3 manage.py ${args}


# TODO: migrate to Django
add-db-revision: guard-message
${DOCKER_COMPOSE} run --rm --no-deps api alembic revision --autogenerate -m "${message}"
Expand Down
15 changes: 15 additions & 0 deletions docs/maintenance.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,15 @@
# Maintenance

## How to launch OCR on previously uploaded images

OCR (through Google Cloud Vision) is launched on every new proof image. However, if you want to launch OCR on previously uploaded images, you can do so by running the following command:

```bash
make cli args='run_ocr'
```

To override existing OCR results, add the `--override` flag:

```bash
make cli args='run_ocr --override'
```
31 changes: 31 additions & 0 deletions open_prices/proofs/management/commands/run_ocr.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,31 @@
import argparse
import glob

import tqdm
from django.conf import settings
from django.core.management.base import BaseCommand

from open_prices.proofs.utils import fetch_and_save_ocr_data


class Command(BaseCommand):
    """Management command that runs OCR over files found under
    ``settings.IMAGES_DIR`` and reports how many OCR results were saved.
    """

    help = "Run OCR on images with missing OCR files."

    def add_arguments(self, parser: argparse.ArgumentParser) -> None:
        """Register the optional ``--override`` switch on the command parser.

        :param parser: the argument parser supplied by Django
        """
        parser.add_argument(
            "--override",
            action="store_true",
            help="Override existing OCR data.",
        )

    def handle(self, *args, **options) -> None:  # type: ignore
        """Call ``fetch_and_save_ocr_data`` on every globbed path and print
        the number of calls that returned a truthy result.

        :param options: parsed command-line options; only ``override`` is read
        """
        self.stdout.write("Starting OCR processing...")
        force = options["override"]
        images_root = settings.IMAGES_DIR

        # NOTE(review): `recursive=True` is not passed, so `**` is treated
        # like `*` here and only entries exactly one directory below
        # IMAGES_DIR are visited -- confirm this is the intended depth.
        relative_paths = glob.iglob("**/*", root_dir=images_root)

        # Each call's result is coerced with int(), so the sum is the number
        # of truthy results.
        saved_count = sum(
            int(fetch_and_save_ocr_data(images_root / relative, override=force))
            for relative in tqdm.tqdm(relative_paths, desc="images")
        )

        self.stdout.write("%d OCR saved" % saved_count)
12 changes: 11 additions & 1 deletion open_prices/proofs/tests.py
Original file line number Diff line number Diff line change
Expand Up @@ -323,7 +323,8 @@ def test_fetch_and_save_ocr_data_success(self):
image_path = Path(f"{tmpdirname}/test.jpg")
with image_path.open("w") as f:
f.write("test")
fetch_and_save_ocr_data(image_path)
output = fetch_and_save_ocr_data(image_path)
self.assertTrue(output)
mock_run_ocr_on_image.assert_called_once_with(
image_path, "test_api_key"
)
Expand All @@ -339,3 +340,12 @@ def test_fetch_and_save_ocr_data_success(self):
self.assertEqual(
actual_data["responses"], response_data["responses"]
)

def test_fetch_and_save_ocr_data_invalid_extension(self):
    """A ``.bin`` file must make ``fetch_and_save_ocr_data`` return a falsy
    value.
    """
    with self.settings(GOOGLE_CLOUD_VISION_API_KEY="test_api_key"):
        with tempfile.TemporaryDirectory() as tmpdirname:
            unsupported_file = Path(f"{tmpdirname}/test.bin")
            # Create the file so the outcome is driven by the suffix,
            # not by a missing path.
            unsupported_file.write_text("test")
            self.assertFalse(fetch_and_save_ocr_data(unsupported_file))
17 changes: 12 additions & 5 deletions open_prices/proofs/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -178,7 +178,7 @@ def run_ocr_on_image(image_path: Path | str, api_key: str) -> dict[str, Any] | N
return r.json()


def fetch_and_save_ocr_data(image_path: Path | str, override: bool = False) -> None:
def fetch_and_save_ocr_data(image_path: Path | str, override: bool = False) -> bool:
"""Run OCR on the image stored at the given path and save the result to a
JSON file.

Expand All @@ -187,28 +187,35 @@ def fetch_and_save_ocr_data(image_path: Path | str, override: bool = False) -> N

:param image_path: the path to the image
:param override: whether to override existing OCR data, default to False
:return: True if the OCR data was saved, False otherwise
"""
image_path = Path(image_path)

if image_path.suffix not in (".jpg", ".jpeg", ".png", ".webp"):
logger.debug("Skipping %s, not a supported image type", image_path)
return False

api_key = settings.GOOGLE_CLOUD_VISION_API_KEY

if api_key is None:
if not api_key:
logger.error("No Google Cloud Vision API key found")
return
return False

ocr_json_path = image_path.with_suffix(".json.gz")

if ocr_json_path.exists() and not override:
logger.info("OCR data already exists for %s", image_path)
return
return False

data = run_ocr_on_image(image_path, api_key)

if data is None:
return
return False

data["created_at"] = int(time.time())

with gzip.open(ocr_json_path, "wt") as f:
f.write(json.dumps(data))

logger.debug("OCR data saved to %s", ocr_json_path)
return True