Skip to content

Commit

Permalink
use surrogateescaping for non-utf8 encodable attribute values in hsload - fix for #130 (#138)
Browse files Browse the repository at this point in the history
  • Loading branch information
jreadey authored Jan 25, 2023
1 parent c7e3446 commit 46a5165
Show file tree
Hide file tree
Showing 4 changed files with 27 additions and 6 deletions.
14 changes: 13 additions & 1 deletion h5pyd/_apps/utillib.py
Original file line number Diff line number Diff line change
Expand Up @@ -314,6 +314,18 @@ def copy_attribute(desobj, name, srcobj, ctx):

tgtarr = None
data = srcobj.attrs[name]
if data.dtype.kind == "S" and isinstance(data, bytes):
# check that this is actually utf-encodable
try:
data.decode("utf-8")
except UnicodeDecodeError:
msg = f"byte value for attribute {name} in {srcobj.name} "
msg += "is not utf8 encodable - using surrogateescaping"
logging.warning(msg)
if ctx["verbose"]:
print(msg)
data = data.decode("utf-8", errors="surrogateescape")

src_dt = None
try:
src_dt = data.dtype
Expand All @@ -340,7 +352,7 @@ def copy_attribute(desobj, name, srcobj, ctx):
desobj.attrs.create(name, tgtarr)
except (IOError, TypeError) as e:
msg = f"ERROR: failed to create attribute {name} "
msg += f"of object {desobj.naame} -- {e}"
msg += f"of object {desobj.name} -- {e}"
logging.error(msg)
if not ctx["ignore_error"]:
raise IOError(msg)
Expand Down
11 changes: 10 additions & 1 deletion h5pyd/_hl/attrs.py
Original file line number Diff line number Diff line change
Expand Up @@ -154,7 +154,16 @@ def __getitem__(self, name):
arr = jsonToArray(shape, htype, value_json)

if len(arr.shape) == 0:
return arr[()]
v = arr[()]
if isinstance(v, str):
# if this is not utf-8, return bytes instead
try:
v.encode("utf-8")
except UnicodeEncodeError:
self._parent.log.debug("converting utf8 unencodable string as bytes")
v = v.encode("utf-8", errors="surrogateescape")
return v

return arr

def __setitem__(self, name, value):
Expand Down
1 change: 1 addition & 0 deletions h5pyd/_hl/base.py
Original file line number Diff line number Diff line change
Expand Up @@ -284,6 +284,7 @@ def jsonToArray(data_shape, data_dtype, data_json):
converted_data = toTuple(np_shape_rank, data_json)
data_json = converted_data


arr = np.array(data_json, dtype=data_dtype)
# raise an exception if the array shape doesn't match the selection shape
# allow if the array is a scalar and the selection shape is one element,
Expand Down
7 changes: 3 additions & 4 deletions test/hl/test_file.py
Original file line number Diff line number Diff line change
Expand Up @@ -109,7 +109,6 @@ def test_create(self):
self.assertEqual(f.id.id, 0)

for mode in ("w-", "x"):
print(mode)
try:
# re-open in exclusive mode (should fail)
h5py.File(filename, mode)
Expand Down Expand Up @@ -137,8 +136,8 @@ def test_create(self):

# re-open as read-only
if is_hsds:
wait_time = 90
print("waiting {} seconds for root scan sync".format(wait_time))
wait_time = 1 # change to >90 to test async updates
#print("waiting {} seconds for root scan sync".format(wait_time))
time.sleep(wait_time) # let async process update obj number
f = h5py.File(filename, 'r')
self.assertEqual(f.filename, filename)
Expand Down Expand Up @@ -172,7 +171,7 @@ def test_create(self):
# check properties that are only available for h5pyd
# Note: num_groups won't reflect current state since the
# data is being updated asynchronously
if is_hsds:
if is_hsds and wait_time >= 90:
self.assertEqual(f.num_objects, 2)
self.assertEqual(f.num_groups, 2)
else:
Expand Down

0 comments on commit 46a5165

Please sign in to comment.