Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

fix error when extracting exif metadata features from images in RAI Vision dashboard #2461

Merged
merged 1 commit into from
Dec 15, 2023
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
Expand Up @@ -15,7 +15,6 @@
create_text_classification_pipeline,
load_covid19_emergency_event_dataset,
load_emotion_dataset)
from huggingface_hub.utils._validators import HFValidationError
from rai_text_insights_validator import validate_rai_text_insights

from responsibleai._internal.constants import ManagerNames
Expand Down Expand Up @@ -134,8 +133,8 @@ def test_loading_rai_insights_without_model_file(self):
match_msg = 'Can\'t load the configuration'
expected_error = OSError
else:
match_msg = 'Repo id must'
expected_error = HFValidationError
match_msg = 'local folder'
expected_error = OSError
with pytest.raises(expected_error, match=match_msg):
without_model_rai_insights = RAITextInsights.load(save_path)
assert without_model_rai_insights.model is None
Expand Down
3 changes: 2 additions & 1 deletion responsibleai_vision/requirements-dev.txt
Original file line number Diff line number Diff line change
Expand Up @@ -28,4 +28,5 @@ opencv-python

fastai
mlflow
pydantic<2.0.0
pydantic<2.0.0
piexif
12 changes: 12 additions & 0 deletions responsibleai_vision/responsibleai_vision/common/constants.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,12 @@
from enum import Enum


class ExtractedFeatures(str, Enum):
    """Provide constants related to the extracted image features.

    Members mix in ``str``, so they compare equal to their plain
    string values.
    """

    # Name of the mean pixel value feature.
    MEAN_PIXEL_VALUE = 'mean_pixel_value'


class ModelTask(str, Enum):
"""Provide model task constants.

Expand All @@ -28,6 +34,12 @@ class ImageColumns(str, Enum):
IMAGE_DETAILS = 'image_details'


class ImageModes(str, Enum):
    """Provide constants related to the image modes.

    Members mix in ``str``, so they compare equal to their plain
    string values.
    """

    # Mode string for RGB images.
    RGB = 'RGB'


class ExplainabilityLiterals:
"""Parameters for explainability method names."""

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -37,7 +37,7 @@
from responsibleai.serialization_utilities import serialize_json_safe
from responsibleai_vision.common.constants import (CommonTags,
ExplainabilityDefaults,
ImageColumns,
ImageColumns, ImageModes,
MLFlowSchemaLiterals,
ModelTask)
from responsibleai_vision.managers.error_analysis_manager import \
Expand Down Expand Up @@ -135,7 +135,7 @@ def __init__(self, model: Any,
classes: Optional[np.ndarray] = None,
serializer: Optional[Any] = None,
maximum_rows_for_test: int = 5000,
image_mode: str = "RGB",
image_mode: str = ImageModes.RGB,
test_data_path: Optional[str] = None,
transformations: Optional[Any] = None,
image_downloader: Optional[Any] = None,
Expand Down Expand Up @@ -267,7 +267,7 @@ def __init__(self, model: Any,
serializer)

ext_test, ext_features = extract_features(
self.test, self.target_column, self.task_type,
self.test, self.target_column,
self.image_mode,
self._feature_metadata)
self._ext_test = ext_test
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,7 @@

"""Defines the feature extractors."""

import warnings
from typing import Optional

import pandas as pd
Expand All @@ -11,13 +12,17 @@
from tqdm import tqdm

from responsibleai.feature_metadata import FeatureMetadata
from responsibleai_vision.common.constants import ExtractedFeatures
from responsibleai_vision.utils.image_reader import (
get_all_exif_feature_names, get_image_from_path,
get_image_pointer_from_path)

MEAN_PIXEL_VALUE = ExtractedFeatures.MEAN_PIXEL_VALUE.value
MAX_CUSTOM_LEN = 100


def extract_features(image_dataset: pd.DataFrame,
target_column: str, task_type: str,
target_column: str,
image_mode: str = None,
feature_metadata: Optional[FeatureMetadata] = None):
'''Extract tabular data features from the image dataset.
Expand All @@ -27,8 +32,6 @@ def extract_features(image_dataset: pd.DataFrame,
:param target_column: The name of the label column or list of columns.
This is a list of columns for multilabel models.
:type target_column: str or list[str]
:param task_type: The type of task to be performed.
:type task_type: str
:param image_mode: The mode to open the image in.
See pillow documentation for all modes:
https://pillow.readthedocs.io/en/stable/handbook/concepts.html
Expand All @@ -45,7 +48,7 @@ def extract_features(image_dataset: pd.DataFrame,
if feature_metadata and feature_metadata.categorical_features is None:
feature_metadata.categorical_features = []
exif_feature_names = get_all_exif_feature_names(image_dataset)
feature_names = ["mean_pixel_value"] + exif_feature_names
feature_names = [MEAN_PIXEL_VALUE] + exif_feature_names

# append all feature names other than target column and label
column_names = image_dataset.columns
Expand All @@ -58,6 +61,7 @@ def extract_features(image_dataset: pd.DataFrame,
continue
feature_names.append(column_names[j])

# Tags that were already reported as missing from the feature names.
# This must be a set: tags are recorded with .add() further down, and
# an empty {} literal creates a dict, which has no .add() method.
blacklisted_tags = set()
# append all features
for i in tqdm(range(image_dataset.shape[0])):
image = image_dataset.iloc[i, 0]
Expand All @@ -81,9 +85,26 @@ def extract_features(image_dataset: pd.DataFrame,
# decode bytes
if isinstance(data, bytes):
data = data.decode()
if len(data) > MAX_CUSTOM_LEN:
data = data[:MAX_CUSTOM_LEN] + '...'
if isinstance(data, str):
feature_metadata.categorical_features.append(str(tag))
row_feature_values[feature_names.index(tag)] = data
if not feature_metadata:
feature_metadata = FeatureMetadata()
feature_metadata.categorical_features = []
if tag in feature_names:
feature_metadata.categorical_features.append(
str(tag))
tag_index = feature_names.index(tag)
row_feature_values[tag_index] = data
else:
# in theory this should now never happen with
# latest code, but adding this check for safety
if tag not in blacklisted_tags:
blacklisted_tags.add(tag)
warnings.warn(
f'Exif tag {tag} could not be found '
'in the feature names. Ignoring tag '
'from extracted metadata.')
elif isinstance(data, int) or isinstance(data, float):
row_feature_values[feature_names.index(tag)] = data

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -94,7 +94,8 @@ def get_all_exif_feature_names(image_dataset):
data = exifdata.get(tag_id)
if isinstance(data, str) or \
isinstance(data, int) or \
isinstance(data, float):
isinstance(data, float) or \
isinstance(data, bytes):
exif_feature_names.add(tag)
return list(exif_feature_names)

Expand Down
11 changes: 10 additions & 1 deletion responsibleai_vision/tests/common_vision_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,6 +12,7 @@

import numpy as np
import pandas as pd
import piexif
import shap
import torch
import torch.nn as nn
Expand Down Expand Up @@ -172,7 +173,7 @@ def retrieve_unzip_file(download_url, data_file):
os.remove(data_file)


def load_fridge_dataset():
def load_fridge_dataset(add_extra_mixed_metadata=False):
# create the data folder if it doesn't exist.
os.makedirs("data", exist_ok=True)

Expand All @@ -186,6 +187,14 @@ def load_fridge_dataset():
for folder in os.listdir("./data/fridgeObjects"):
for file in os.listdir("./data/fridgeObjects/" + folder):
image_path = "./data/fridgeObjects/" + folder + "/" + file
if add_extra_mixed_metadata and file.endswith("1.jpg"):
with Image.open(image_path) as im:
exif_dict = piexif.load(im.info['exif'])
comment = 'Extra metadata for {}'.format(file).encode()
exif_dict['0th'][piexif.ImageIFD.XPComment] = comment
exif_dict['1st'][piexif.ImageIFD.XPComment] = comment
exif_bytes = piexif.dump(exif_dict)
im.save(image_path, exif=exif_bytes)
data = data.append({IMAGE: image_path, LABEL: folder},
ignore_index=True)
return data
Expand Down
69 changes: 69 additions & 0 deletions responsibleai_vision/tests/test_feature_extractors.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,69 @@
# Copyright (c) Microsoft Corporation
# Licensed under the MIT License.

from common_vision_utils import (load_flowers_dataset, load_fridge_dataset,
load_fridge_object_detection_dataset,
load_imagenet_dataset)

from responsibleai_vision.common.constants import (ExtractedFeatures,
ImageColumns, ImageModes)
from responsibleai_vision.utils.feature_extractors import extract_features

MEAN_PIXEL_VALUE = ExtractedFeatures.MEAN_PIXEL_VALUE.value
FRIDGE_METADATA_FEATURES = [
'Make', 'ResolutionUnit', 'ImageLength', 'ExifOffset', 'Model',
'GPSInfo', 'ImageWidth', 'DateTime', 'YCbCrPositioning',
'Software', 'Orientation'
]


def validate_extracted_features(extracted_features, feature_names,
                                expected_feature_names, data):
    """Validate extracted features against the expected feature names.

    The first feature name must match the first expected name exactly.
    The remaining names may appear in any order, but must form the same
    collection as the expected names, with no duplicates.

    :param extracted_features: The extracted feature rows, one per
        row of data.
    :type extracted_features: list
    :param feature_names: The feature names returned by the extractor.
    :type feature_names: list
    :param expected_feature_names: The feature names the extractor is
        expected to return.
    :type expected_feature_names: list
    :param data: The dataset the features were extracted from; only
        its length is used.
    :raises AssertionError: If any of the checks fails.
    """
    assert len(extracted_features) == len(data)
    assert feature_names[0] == expected_feature_names[0]
    for i in range(1, len(feature_names)):
        assert feature_names[i] in expected_feature_names
    assert len(feature_names) == len(expected_feature_names)
    # Without this check a duplicated name could stand in for a missing
    # expected name and still satisfy the membership and length checks.
    assert len(set(feature_names)) == len(feature_names)
    # Every row must have one value per feature, not only the first row.
    for row in extracted_features:
        assert len(row) == len(feature_names)


def extract_dataset_features(data):
    """Extract features from data using the label column and RGB mode.

    No feature metadata is passed to the extractor.

    :param data: The image dataset to extract features from.
    :return: Whatever extract_features returns for the dataset.
    """
    return extract_features(
        data,
        target_column=ImageColumns.LABEL,
        image_mode=ImageModes.RGB,
        feature_metadata=None)


class TestFeatureExtractors(object):
    """Tests for feature extraction on the sample image datasets."""

    @staticmethod
    def _extract_and_validate(dataset, metadata_names):
        """Extract features from dataset and validate the feature names.

        :param dataset: The image dataset to extract features from.
        :param metadata_names: The expected feature names that follow
            the mean pixel value feature.
        :type metadata_names: list[str]
        """
        features, names = extract_dataset_features(dataset)
        expected = [MEAN_PIXEL_VALUE] + metadata_names
        validate_extracted_features(features, names, expected, dataset)

    def test_extract_features_fridge_object_detection(self):
        dataset = load_fridge_object_detection_dataset(automl_format=False)
        self._extract_and_validate(dataset, FRIDGE_METADATA_FEATURES)

    def test_extract_features_fridge_metadata(self):
        dataset = load_fridge_dataset()
        self._extract_and_validate(dataset, FRIDGE_METADATA_FEATURES)

    def test_extract_features_imagenet_metadata(self):
        # only the mean pixel value feature is expected here
        dataset = load_imagenet_dataset()
        self._extract_and_validate(dataset, [])

    def test_extract_features_flowers_metadata(self):
        # only the mean pixel value feature is expected here
        dataset = load_flowers_dataset(upscale=False)
        self._extract_and_validate(dataset, [])

    def test_extract_features_mixed_exif_XPComment_metadata(self):
        dataset = load_fridge_dataset(add_extra_mixed_metadata=True)
        self._extract_and_validate(
            dataset, ['XPComment'] + FRIDGE_METADATA_FEATURES)
5 changes: 4 additions & 1 deletion responsibleai_vision/tests/test_image_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -63,6 +63,7 @@ def test_transform_object_detection_labels(self):
assert label_j[5] == o_label_j[IS_CROWD]

def test_retry_sessions_match_domain_count(self):
sessions_before_test = len(image_reader_requests_sessions)
urls = [f"https://{i}.com/image.png" for i in range(10)]
duplicates = urls.copy()
urls.extend(duplicates)
Expand All @@ -72,7 +73,9 @@ def test_retry_sessions_match_domain_count(self):
for url in urls:
image_reader_get_retry_session(url)

assert len(image_reader_requests_sessions) == domain_unique_count
new_session_count = len(image_reader_requests_sessions)
new_session_count -= sessions_before_test
assert new_session_count == domain_unique_count

@patch("urllib3.connectionpool.HTTPConnectionPool._make_request")
def test_retry_sessions_retries_on_conn_failure(self, request_mock):
Expand Down
9 changes: 5 additions & 4 deletions responsibleai_vision/tests/test_rai_vision_insights.py
Original file line number Diff line number Diff line change
Expand Up @@ -25,7 +25,7 @@
from responsibleai.feature_metadata import FeatureMetadata
from responsibleai_vision import ModelTask, RAIVisionInsights
from responsibleai_vision.common.constants import (ExplainabilityDefaults,
ImageColumns)
ImageColumns, ImageModes)

DEFAULT_MAX_EVALS = ExplainabilityDefaults.DEFAULT_MAX_EVALS
DEFAULT_NUM_MASKS = ExplainabilityDefaults.DEFAULT_NUM_MASKS
Expand All @@ -40,7 +40,7 @@ def test_rai_insights_image_classification_imagenet(self):
task_type = ModelTask.IMAGE_CLASSIFICATION
class_names = load_imagenet_labels()
run_rai_insights(pred, data[:3], ImageColumns.LABEL,
task_type, class_names, image_mode='RGB')
task_type, class_names, image_mode=ImageModes.RGB)

@pytest.mark.parametrize('max_evals', [None, 10, 200])
def test_rai_insights_image_classification_max_evals(self, max_evals):
Expand All @@ -51,7 +51,7 @@ def test_rai_insights_image_classification_max_evals(self, max_evals):
# run on a single image to avoid running out of memory on
# test machines
run_rai_insights(pred, data[:1], ImageColumns.LABEL,
task_type, class_names, image_mode='RGB',
task_type, class_names, image_mode=ImageModes.RGB,
test_explainer=True, max_evals=max_evals)

@pytest.mark.parametrize('max_evals', [-100, -1, 0])
Expand All @@ -63,7 +63,8 @@ def test_rai_insights_invalid_max_evals(self, max_evals):
with pytest.raises(ValueError,
match="max_evals must be greater than 0"):
run_rai_insights(pred, data[:1], ImageColumns.LABEL,
task_type, class_names, image_mode='RGB',
task_type, class_names,
image_mode=ImageModes.RGB,
test_explainer=True, max_evals=max_evals)

def test_rai_insights_image_classification_fridge(self):
Expand Down
Loading