From f08baf036ea67428bfc0d9f51bb5e129d0348870 Mon Sep 17 00:00:00 2001 From: Luc Grosheintz Date: Sat, 2 Nov 2024 14:28:12 +0100 Subject: [PATCH 1/5] Add test for a supposedly null-terminated string. --- tests/unit/test_string.cpp | 30 ++++++++++++++++++++++++++++++ 1 file changed, 30 insertions(+) diff --git a/tests/unit/test_string.cpp b/tests/unit/test_string.cpp index ed6a4a5bf..fb29a913f 100644 --- a/tests/unit/test_string.cpp +++ b/tests/unit/test_string.cpp @@ -206,6 +206,36 @@ void check_multiple_string(File file, size_t string_length) { } } +void check_supposedly_nullterm() { + auto file = HighFive::File("not_null_terminated.h5", HighFive::File::Truncate); + auto dspace = HighFive::DataSpace::Scalar(); + auto dtype = HighFive::FixedLengthStringType(5, HighFive::StringPadding::NullTerminated); + auto dset = file.createDataSet("dset", dspace, dtype); + auto attr = file.createAttribute("attr", dspace, dtype); + + // Creates a 5 byte, "null-terminated", fixed-length string. The first five + // bytes are filled with "GROUP". Clearly, this isn't null-terminated. However, + // h5py will read it back as "GROUP", HDF5 allows us to create these; and they're + // found in the wild. + std::string value = "GROUP"; + dset.write_raw(value.c_str(), dtype); + attr.write_raw(value.c_str(), dtype); + + SECTION("dset") { + auto actual = dset.read(); + REQUIRE(actual == value); + } + + SECTION("attr") { + auto actual = attr.read(); + REQUIRE(actual == value); + } +} + +TEST_CASE("HighFiveSTDString (nullterm cornercase)") { + check_supposedly_nullterm(); +} + TEST_CASE("HighFiveSTDString (dataset, single, short)") { File file("std_string_dataset_single_short.h5", File::Truncate); check_single_string(file, 3); From 3162eb0bf28197767e1ef9875ea152f1b184997c Mon Sep 17 00:00:00 2001 From: Luc Grosheintz Date: Sat, 2 Nov 2024 14:38:42 +0100 Subject: [PATCH 2/5] Fix. --- include/highfive/bits/H5Converter_misc.hpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/include/highfive/bits/H5Converter_misc.hpp b/include/highfive/bits/H5Converter_misc.hpp index 5fcbafb5e..56ebb35a5 100644 --- a/include/highfive/bits/H5Converter_misc.hpp +++ b/include/highfive/bits/H5Converter_misc.hpp @@ -229,7 +229,7 @@ struct StringBuffer { /// `length() + 1` bytes long. size_t length() const { if (buffer.isNullTerminated()) { - return char_buffer_size(data(), buffer.string_length); + return char_buffer_size(data(), buffer.string_size); } else { return buffer.string_length; } From bbbea3e10e62145ce6fc26c39ba4cc2ae412f5da Mon Sep 17 00:00:00 2001 From: Luc Grosheintz Date: Sat, 2 Nov 2024 16:10:04 +0100 Subject: [PATCH 3/5] Align documentation and variable names. --- include/highfive/H5DataType.hpp | 18 ++++++++++-------- include/highfive/bits/H5Converter_misc.hpp | 8 ++++---- 2 files changed, 14 insertions(+), 12 deletions(-) diff --git a/include/highfive/H5DataType.hpp b/include/highfive/H5DataType.hpp index 985eb6dd4..b7e7d78e1 100644 --- a/include/highfive/H5DataType.hpp +++ b/include/highfive/H5DataType.hpp @@ -153,16 +153,18 @@ class FixedLengthStringType: public StringType { /// UTF8. In particular, a string with `n` UFT8 characters in general /// requires `4*n` bytes. /// - /// The string padding is subtle, essentially it's just a hint. A - /// null-terminated string is guaranteed to have one `'\0'` which marks the - /// semantic end of the string. The length of the buffer must be at least - /// `size` bytes regardless. HDF5 will read or write `size` bytes, - /// irrespective of the when the `\0` occurs. + /// The string padding is subtle, essentially it's just a hint. While + /// commonly, a null-terminated string is guaranteed to have one `'\0'` + /// which marks the semantic end of the string, this is not enforced by + /// HDF5. In fact, there are HDF5 files that contain strings that claim to + /// be null-terminated but aren't. The length of the buffer must be at + /// least `size` bytes regardless of the padding. HDF5 will read or write + /// `size` bytes, irrespective of when (if at all) the `\0` occurs. /// - /// Note that when writing passing `StringPadding::NullTerminated` is a + /// Note that when writing, passing `StringPadding::NullTerminated` is a /// guarantee to the reader that it contains a `\0`. Therefore, make sure - /// that the string really is nullterminated. Otherwise prefer a - /// null-padded string which only means states that the buffer is filled up + /// that the string really is null-terminated. Otherwise prefer a + /// null-padded string. This mearly states that the buffer is filled up /// with 0 or more `\0`. FixedLengthStringType(size_t size, StringPadding padding, diff --git a/include/highfive/bits/H5Converter_misc.hpp b/include/highfive/bits/H5Converter_misc.hpp index 56ebb35a5..12d0d0d27 100644 --- a/include/highfive/bits/H5Converter_misc.hpp +++ b/include/highfive/bits/H5Converter_misc.hpp @@ -190,7 +190,7 @@ struct StringBuffer { } else if (buffer.isFixedLengthString()) { // If the buffer is fixed-length and null-terminated, then // `buffer.string_length` doesn't include the null-character. - if (length > buffer.string_length) { + if (length > buffer.string_max_length) { throw std::invalid_argument("String length too big."); } @@ -231,7 +231,7 @@ struct StringBuffer { if (buffer.isNullTerminated()) { return char_buffer_size(data(), buffer.string_size); } else { - return buffer.string_length; + return buffer.string_max_length; } } @@ -272,7 +272,7 @@ struct StringBuffer { : file_datatype(_file_datatype.asStringType()) , padding(file_datatype.getPadding()) , string_size(file_datatype.isVariableStr() ? size_t(-1) : file_datatype.getSize()) - , string_length(string_size - size_t(isNullTerminated())) + , string_max_length(string_size - size_t(isNullTerminated())) , dims(_dims) { if (string_size == 0 && isNullTerminated()) { throw DataTypeException( @@ -324,7 +324,7 @@ struct StringBuffer { StringPadding padding; size_t string_size; // Size of buffer required to store the string. // Meaningful for fixed length strings only. - size_t string_length; // Semantic length of string. + size_t string_max_length; // Maximum length of string. std::vector dims; std::vector fixed_length_buffer; From 3e345b577fdce7ba568b362b4ec9febd122887f2 Mon Sep 17 00:00:00 2001 From: Luc Grosheintz Date: Sat, 2 Nov 2024 18:31:05 +0100 Subject: [PATCH 4/5] Appease formatter. --- include/highfive/bits/H5Converter_misc.hpp | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/include/highfive/bits/H5Converter_misc.hpp b/include/highfive/bits/H5Converter_misc.hpp index 12d0d0d27..a89cb132a 100644 --- a/include/highfive/bits/H5Converter_misc.hpp +++ b/include/highfive/bits/H5Converter_misc.hpp @@ -322,9 +322,11 @@ struct StringBuffer { private: StringType file_datatype; StringPadding padding; - size_t string_size; // Size of buffer required to store the string. - // Meaningful for fixed length strings only. - size_t string_max_length; // Maximum length of string. + // Size of buffer required to store the string. + // Meaningful for fixed length strings only. + size_t string_size; + // Maximum length of string. + size_t string_max_length; std::vector dims; std::vector fixed_length_buffer; From f3885caa865166ec9898e307f3c9495348320cb2 Mon Sep 17 00:00:00 2001 From: Luc Grosheintz Date: Sun, 3 Nov 2024 10:23:15 +0100 Subject: [PATCH 5/5] Refactor test. --- tests/unit/test_string.cpp | 33 +++++++++++++++------------------ 1 file changed, 15 insertions(+), 18 deletions(-) diff --git a/tests/unit/test_string.cpp b/tests/unit/test_string.cpp index fb29a913f..2b0d5afc2 100644 --- a/tests/unit/test_string.cpp +++ b/tests/unit/test_string.cpp @@ -206,34 +206,31 @@ void check_multiple_string(File file, size_t string_length) { } } -void check_supposedly_nullterm() { - auto file = HighFive::File("not_null_terminated.h5", HighFive::File::Truncate); - auto dspace = HighFive::DataSpace::Scalar(); - auto dtype = HighFive::FixedLengthStringType(5, HighFive::StringPadding::NullTerminated); - auto dset = file.createDataSet("dset", dspace, dtype); - auto attr = file.createAttribute("attr", dspace, dtype); +template +void check_supposedly_nullterm(HighFive::File& file) { + auto dataspace = HighFive::DataSpace::Scalar(); + auto datatype = HighFive::FixedLengthStringType(5, HighFive::StringPadding::NullTerminated); + auto obj = CreateTraits::create(file, "not_null_terminated", dataspace, datatype); // Creates a 5 byte, "null-terminated", fixed-length string. The first five // bytes are filled with "GROUP". Clearly, this isn't null-terminated. However, // h5py will read it back as "GROUP", HDF5 allows us to create these; and they're // found in the wild. std::string value = "GROUP"; - dset.write_raw(value.c_str(), dtype); - attr.write_raw(value.c_str(), dtype); + obj.write_raw(value.c_str(), datatype); - SECTION("dset") { - auto actual = dset.read(); - REQUIRE(actual == value); - } + auto actual = obj.template read(); + REQUIRE(actual == value); +} - SECTION("attr") { - auto actual = attr.read(); - REQUIRE(actual == value); - } +TEST_CASE("HighFiveSTDString (attribute, nullterm cornercase)") { + auto file = HighFive::File("not_null_terminated_attribute.h5", HighFive::File::Truncate); + check_supposedly_nullterm(file); } -TEST_CASE("HighFiveSTDString (nullterm cornercase)") { - check_supposedly_nullterm(); +TEST_CASE("HighFiveSTDString (dataset, nullterm cornercase)") { + auto file = HighFive::File("not_null_terminated_dataset.h5", HighFive::File::Truncate); + check_supposedly_nullterm(file); } TEST_CASE("HighFiveSTDString (dataset, single, short)") {