Create a standardized form testing dataset (#287)
* created dataset import script

* formatting

* edited script to run all dataset migration on one click

* edited readme

* added lock file

* edited read me
arinkulshi-skylight authored Oct 8, 2024
1 parent d639651 commit a3e10c3
Showing 5 changed files with 953 additions and 10 deletions.
8 changes: 7 additions & 1 deletion .gitignore
@@ -3,6 +3,11 @@ tmp/
temp/
*.tmp

# Datasets
OCR/ocr/reportvision-dataset-1/ground_truth/
OCR/ocr/reportvision-dataset-1/images/


# macOS
.DS_Store

@@ -411,4 +416,5 @@ sketch
## Terraform ##
.terraform
*.tfplan*
*.tfstate*

24 changes: 20 additions & 4 deletions OCR/README.md
@@ -7,77 +7,93 @@ pipx install poetry
```

Activate the virtual environment and install dependencies. All subsequent commands assume you are in the virtual env.

```shell
poetry shell
poetry install
```

Run unit tests

```shell
poetry run pytest
```

Run benchmark tests

```shell
cd tests
poetry run pytest benchmark_test.py -v
```


Run main (we hope to convert this to a CLI at some point)

```shell
poetry run main
```

To build the OCR service into an executable artifact

```shell
poetry run build
```

Adding new dependencies

```shell
poetry add package-name
```

To manually update the poetry lock file

```shell
poetry lock
```

To view installed packages in the virtual env

```shell
poetry show
```

To lint your files using ruff

```shell
ruff check --fix
```

To format your files using ruff

```shell
ruff format
```

To run the API in dev mode with reload

```shell
fastapi dev ocr/api.py
```

To run the API in prod mode

```shell
poetry run api
```

### Test Data Sets

You can also run the script `reportvision-dataset-1/medical_report_import.py` to pull in all relevant test data.
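
For example (a sketch, assuming dependencies are installed via poetry and the command is run from the repository root so the script's relative output paths resolve):

```shell
python OCR/ocr/reportvision-dataset-1/medical_report_import.py
```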

### Dockerized Development

It is also possible to run the entire project in a collection of docker containers. This is useful for development and testing purposes as it doesn't require any additional dependencies to be installed on your local machine.

To start the containers, run the following command:

```shell
docker compose -f dev-env.yaml up
```

This will start the following containers:
@@ -87,4 +103,4 @@ This will start the following containers:

The frontend container will automatically reload when changes are made to the frontend code. To access the frontend, navigate to http://localhost:5173 in your browser.

The OCR service container will restart automatically when changes are made to the OCR code. To access the API, navigate to http://localhost:8000/ in your browser.
58 changes: 58 additions & 0 deletions OCR/ocr/reportvision-dataset-1/medical_report_import.py
@@ -0,0 +1,58 @@
import os
import json
from datasets import load_dataset

# Define the destination folder for the standardized dataset
destination_base_folder = "OCR/ocr/reportvision-dataset-1"

# Hugging Face datasets to import, and the column holding each example's ground-truth JSON
datasets_info = [
    {"name": "singhsays/fake-w2-us-tax-form-dataset", "json_key": "ground_truth"},
    {"name": "AnubhutiBhardwaj/medical-reports-demo", "json_key": "json"},
    {"name": "Technoculture/medical-prescriptions", "json_key": "json"},
]


images_folder = os.path.join(destination_base_folder, "images")
json_folder = os.path.join(destination_base_folder, "ground_truth")

os.makedirs(images_folder, exist_ok=True)
os.makedirs(json_folder, exist_ok=True)


def process_dataset(dataset_name, json_key):
    print(f"Processing dataset: {dataset_name}")
    dataset = load_dataset(dataset_name)

    for split in dataset.keys():
        split_data = dataset[split]
        for idx, example in enumerate(split_data):
            unique_id = f"{split}_{idx}"

            # Save image
            image = example["image"]
            image_filename = f"report_{unique_id}.png"
            image_path = os.path.join(images_folder, image_filename)

            image.save(image_path)

            # Parse the ground truth JSON data
            if json_key == "json":
                ground_truth_data = json.loads(example[json_key])
            elif json_key == "ground_truth":
                ground_truth_data = json.loads(example[json_key])["gt_parse"]

            json_filename = f"report_{unique_id}.json"
            json_path = os.path.join(json_folder, json_filename)

            with open(json_path, "w") as f:
                json.dump(ground_truth_data, f, indent=4)

            print(f"Saved {image_filename} and {json_filename}")

    print(f"Finished processing dataset: {dataset_name}")


# Process all datasets
for dataset_info in datasets_info:
    process_dataset(dataset_info["name"], dataset_info["json_key"])

print(f"All datasets have been successfully saved to {destination_base_folder}")
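
The script pairs each `report_<split>_<idx>.png` in `images/` with a matching JSON file in `ground_truth/`. A minimal sketch of reading one pair back, assuming Pillow is available and that the chosen ID (here the illustrative `report_train_0`) exists in both folders:

```python
import json
import os

from PIL import Image

base = "OCR/ocr/reportvision-dataset-1"
sample_id = "report_train_0"  # illustrative ID; use any file pair present in the folders

# Load the image and its ground-truth JSON saved by the import script
image = Image.open(os.path.join(base, "images", f"{sample_id}.png"))
with open(os.path.join(base, "ground_truth", f"{sample_id}.json")) as f:
    ground_truth = json.load(f)

print(image.size, list(ground_truth.keys()))
```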