Skip to content

Commit

Permalink
exif metadata extension
Browse files Browse the repository at this point in the history
  • Loading branch information
Advitya17 committed Feb 22, 2024
1 parent d665107 commit 3c95c72
Show file tree
Hide file tree
Showing 2 changed files with 46 additions and 39 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -7,22 +7,22 @@
from typing import Optional

import pandas as pd
from PIL import Image
from PIL import Image, ExifTags
from PIL.ExifTags import TAGS
from PIL.TiffImagePlugin import IFDRational
from tqdm import tqdm

from responsibleai.feature_metadata import FeatureMetadata
from responsibleai_vision.common.constants import (ExtractedFeatures,
ImageColumns)
from responsibleai_vision.utils.image_reader import (
get_all_exif_feature_names, get_image_from_path,
get_image_pointer_from_path)
get_image_pointer_from_path, IFD_CODE_LOOKUP)

# The `.value` of ExtractedFeatures.MEAN_PIXEL_VALUE
# (presumably the name of the mean-pixel feature column - confirm).
MEAN_PIXEL_VALUE = ExtractedFeatures.MEAN_PIXEL_VALUE.value
# Byte-string EXIF values are decoded and, when longer than this many
# characters, cut to this length and suffixed with '...'.
MAX_CUSTOM_LEN = 100
# The `.value` of ImageColumns.IMAGE_DETAILS.
IMAGE_DETAILS = ImageColumns.IMAGE_DETAILS.value


def extract_features(image_dataset: pd.DataFrame,
target_column: str,
image_mode: str = None,
Expand Down Expand Up @@ -91,35 +91,37 @@ def extract_features(image_dataset: pd.DataFrame,
return results, feature_names


def append_exif_features(image, row_feature_values, feature_names,
blacklisted_tags, feature_metadata):
def process_data(data, tag, feature_names, feature_metadata,
                 row_feature_values, blacklisted_tags):
    """Store one EXIF value in the feature row of the current image.

    The value is first normalised: ``bytes`` are decoded (undecodable
    bytes are replaced) and truncated to ``MAX_CUSTOM_LEN`` characters
    followed by ``'...'``; ``IFDRational`` values are converted to a
    float. Only ``str``, ``int`` and ``float`` values are stored; any
    other type is silently ignored.

    :param data: The raw EXIF value.
    :param tag: The feature name (EXIF tag name or id) of the value.
    :param feature_names: The ordered feature names; the position of
        ``tag`` in it selects the slot of ``row_feature_values`` to set.
    :param feature_metadata: Object whose ``categorical_features`` list
        gets ``tag`` appended when it is not already present.
    :param row_feature_values: The feature values of the current image,
        updated in place.
    :param blacklisted_tags: Set of tags already reported as missing
        from ``feature_names``; used to warn only once per tag.
    """
    if isinstance(data, bytes):
        data = data.decode(errors='replace')
        if len(data) > MAX_CUSTOM_LEN:
            data = data[:MAX_CUSTOM_LEN] + '...'
    elif isinstance(data, IFDRational):
        # A zero denominator (e.g. an unset 0/0 rational) would raise
        # ZeroDivisionError and abort extraction; record NaN instead.
        denominator = data.denominator
        if denominator:
            data = data.numerator / denominator
        else:
            data = float('nan')

    if not isinstance(data, (str, int, float)):
        return

    if tag in feature_names:
        # NOTE(review): numeric values are registered as categorical
        # here as well as strings - confirm that this is intended.
        if tag not in feature_metadata.categorical_features:
            feature_metadata.categorical_features.append(tag)
        row_feature_values[feature_names.index(tag)] = data
    elif tag not in blacklisted_tags:
        # warn only the first time an unknown tag is encountered
        blacklisted_tags.add(tag)
        warnings.warn(
            f'Exif tag {tag} could not be found '
            'in the feature names. Ignoring tag '
            'from extracted metadata.')

def append_exif_features(image, row_feature_values, feature_names,
                         blacklisted_tags, feature_metadata):
    """Add the EXIF metadata of an image to its row of feature values.

    Nothing is done unless ``image`` is a string. Each EXIF tag is
    handed to ``process_data`` exactly once; tags whose id is a key of
    ``IFD_CODE_LOOKUP`` are expanded and their nested entries are
    processed instead of the tag itself.

    :param image: The image; only string values are read.
    :param row_feature_values: The feature values of the current image,
        updated in place.
    :param feature_names: The ordered feature names.
    :param blacklisted_tags: Set of tags already reported as missing
        from ``feature_names``.
    :param feature_metadata: Object whose ``categorical_features`` list
        is extended by ``process_data``.
    """
    if isinstance(image, str):
        image_pointer_path = get_image_pointer_from_path(image)
        with Image.open(image_pointer_path) as im:
            exifdata = im.getexif()
            for tag_id in exifdata:
                if tag_id in IFD_CODE_LOOKUP:
                    # nested IFD block: process every entry inside it
                    ifd_data = exifdata.get_ifd(tag_id)
                    for nested_tag_id, data in ifd_data.items():
                        # Same name resolution order as the lookup in
                        # get_all_exif_feature_names, so that the
                        # names match the collected feature names.
                        tag = (ExifTags.GPSTAGS.get(nested_tag_id, None)
                               or ExifTags.TAGS.get(nested_tag_id, None)
                               or nested_tag_id)
                        process_data(data, tag, feature_names,
                                     feature_metadata,
                                     row_feature_values,
                                     blacklisted_tags)
                else:
                    # get the tag name, instead of human unreadable
                    # tag id
                    tag = str(TAGS.get(tag_id, tag_id))
                    data = exifdata.get(tag_id)
                    process_data(data, tag, feature_names,
                                 feature_metadata,
                                 row_feature_values,
                                 blacklisted_tags)
23 changes: 14 additions & 9 deletions responsibleai_vision/responsibleai_vision/utils/image_reader.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,7 +10,7 @@

import requests
from numpy import asarray
from PIL import Image
from PIL import Image, ExifTags
from PIL.ExifTags import TAGS
from requests.adapters import HTTPAdapter, Retry

Expand All @@ -20,6 +20,7 @@
# domain mapped session for reuse
_requests_sessions = {}

# Maps the value of every ExifTags.IFD member to that member's name.
# Membership of an EXIF tag id in this mapping is used to decide
# whether the tag is expanded with Exif.get_ifd().
IFD_CODE_LOOKUP = {t.value: t.name for t in ExifTags.IFD}

def _get_retry_session(url):
domain = urlparse(url.lower()).netloc
Expand Down Expand Up @@ -88,15 +89,19 @@ def get_all_exif_feature_names(image_dataset):
with Image.open(image_pointer_path) as im:
exifdata = im.getexif()
for tag_id in exifdata:
# get the tag name, instead of human unreadable tag id
tag = TAGS.get(tag_id, tag_id)
if tag not in image_dataset.columns:
data = exifdata.get(tag_id)
if isinstance(data, str) or \
isinstance(data, int) or \
isinstance(data, float) or \
isinstance(data, bytes):
# nesting for IFD block tags
if tag_id in IFD_CODE_LOOKUP:
ifd_data = exifdata.get_ifd(tag_id)

for nested_tag_id in ifd_data:
nested_tag = ExifTags.GPSTAGS.get(nested_tag_id, None) or ExifTags.TAGS.get(nested_tag_id, None) or nested_tag_id
exif_feature_names.add(nested_tag)
else:
# get the tag name, instead of human unreadable tag id
tag = TAGS.get(tag_id, tag_id)
if tag not in image_dataset.columns:
exif_feature_names.add(tag)

return list(exif_feature_names)


Expand Down

0 comments on commit 3c95c72

Please sign in to comment.