forked from apache/arrow
-
Notifications
You must be signed in to change notification settings - Fork 6
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
apacheGH-42146: [MATLAB] Add IPC `RecordBatchFileReader` and `RecordBatchFileWriter` MATLAB classes (apache#42201)
### Rationale for this change To enable initial IPC I/O support in the MATLAB interface, we should add a `RecordBatchFileReader` class and a `RecordBatchFileWriter` class. ### What changes are included in this PR? 1. Added a new `arrow.io.ipc.RecordBatchFileWriter` class. 2. Added a new `arrow.io.ipc.RecordBatchFileReader` class. **Example** ```matlab >> city = ["Boston" "Seattle" "Denver" "Juno" "Anchorage" "Chicago"]'; >> daylength = duration(["15:17:01" "15:59:16" "14:59:14" "19:21:23" "14:18:24" "15:13:39"])'; >> matlabTable = table(city, daylength, VariableNames=["City", "DayLength"]); >> recordBatch1 = arrow.recordBatch(matlabTable(1:4, :)) >> recordBatch2 = arrow.recordBatch(matlabTable(5:end, :)); >> writer = arrow.io.ipc.RecordBatchFileWriter("daylight.arrow", recordBatch1.Schema); >> writer.writeRecordBatch(recordBatch1); >> writer.writeRecordBatch(recordBatch2); >> writer.close(); >> reader = arrow.io.ipc.RecordBatchFileReader("daylight.arrow"); reader = RecordBatchFileReader with properties: NumRecordBatches: 2 Schema: [1×1 arrow.tabular.Schema] >> reader.Schema ans = Arrow Schema with 2 fields: City: String | DayLength: Time64 >> rb1 = reader.read(1); >> isequal(rb1, recordBatch1) ans = logical 1 >> rb2 = reader.read(2); >> isequal(rb2, recordBatch2) ans = logical 1 ``` ### Are these changes tested? Yes. Added two new test files: 1. `arrow/matlab/test/io/ipc/tRecordBatchFileWriter.m` 2. `arrow/matlab/test/io/ipc/tRecordBatchFileReader.m` ### Are there any user-facing changes? Yes. Users can now serialize `RecordBatch`es and `Table`s to files using the Arrow IPC data format as well as read in `RecordBatch`es from Arrow IPC data files. ### Future Directions 1. Add `RecordBatchStreamWriter` and `RecordBatchStreamReader` 2. Expose options for [controlling](https://github.com/apache/arrow/blob/main/cpp/src/arrow/ipc/options.h) IPC reading and writing in MATLAB. 3.
Add more methods to `RecordBatchFileReader` to read in multiple record batches at once as well as importing the data as an Arrow `Table`. * GitHub Issue: apache#42146 Authored-by: Sarah Gilmore <[email protected]> Signed-off-by: Sarah Gilmore <[email protected]>
- Loading branch information
1 parent
89d6354
commit d261a82
Showing
11 changed files
with
871 additions
and
1 deletion.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
128 changes: 128 additions & 0 deletions
128
matlab/src/cpp/arrow/matlab/io/ipc/proxy/record_batch_file_reader.cc
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,128 @@ | ||
// Licensed to the Apache Software Foundation (ASF) under one | ||
// or more contributor license agreements. See the NOTICE file | ||
// distributed with this work for additional information | ||
// regarding copyright ownership. The ASF licenses this file | ||
// to you under the Apache License, Version 2.0 (the | ||
// "License"); you may not use this file except in compliance | ||
// with the License. You may obtain a copy of the License at | ||
// | ||
// http://www.apache.org/licenses/LICENSE-2.0 | ||
// | ||
// Unless required by applicable law or agreed to in writing, | ||
// software distributed under the License is distributed on an | ||
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY | ||
// KIND, either express or implied. See the License for the | ||
// specific language governing permissions and limitations | ||
// under the License. | ||
|
||
#include "arrow/matlab/io/ipc/proxy/record_batch_file_reader.h" | ||
#include "arrow/io/file.h" | ||
#include "arrow/matlab/error/error.h" | ||
#include "arrow/matlab/tabular/proxy/record_batch.h" | ||
#include "arrow/matlab/tabular/proxy/schema.h" | ||
#include "arrow/util/utf8.h" | ||
|
||
#include "libmexclass/proxy/ProxyManager.h" | ||
|
||
namespace arrow::matlab::io::ipc::proxy { | ||
|
||
namespace { | ||
libmexclass::error::Error makeInvalidNumericIndexError(const int32_t matlab_index, | ||
const int32_t num_batches) { | ||
std::stringstream error_message_stream; | ||
error_message_stream << "Invalid record batch index: "; | ||
error_message_stream << matlab_index; | ||
error_message_stream | ||
<< ". Record batch index must be between 1 and the number of record batches ("; | ||
error_message_stream << num_batches; | ||
error_message_stream << ")."; | ||
return libmexclass::error::Error{error::IPC_RECORD_BATCH_READ_INVALID_INDEX, | ||
error_message_stream.str()}; | ||
} | ||
} // namespace | ||
|
||
// Takes ownership of an already-opened Arrow IPC file reader and registers the
// three proxy methods (getNumRecordBatches, getSchema, readRecordBatchAtIndex)
// through REGISTER_METHOD.
//
// The parameter is deliberately a non-const by-value shared_ptr: std::move on a
// const shared_ptr selects the copy constructor, which costs an extra atomic
// reference-count increment/decrement instead of transferring ownership.
RecordBatchFileReader::RecordBatchFileReader(
    std::shared_ptr<arrow::ipc::RecordBatchFileReader> reader)
    : reader{std::move(reader)} {
  REGISTER_METHOD(RecordBatchFileReader, getNumRecordBatches);
  REGISTER_METHOD(RecordBatchFileReader, getSchema);
  REGISTER_METHOD(RecordBatchFileReader, readRecordBatchAtIndex);
}
|
||
// Factory for the reader proxy.
//
// Expects constructor_arguments[0] to be a struct array whose first element has
// a "Filename" string field. The filename is converted from UTF-16 to UTF-8,
// the file is opened for reading, and an Arrow IPC file reader is opened on top
// of it. Each failing step returns early with the corresponding error ID via
// MATLAB_ASSIGN_OR_ERROR; on success a new proxy owning the reader is returned.
libmexclass::proxy::MakeResult RecordBatchFileReader::make(
    const libmexclass::proxy::FunctionArguments& constructor_arguments) {
  namespace mda = ::matlab::data;
  using RecordBatchFileReaderProxy = arrow::matlab::io::ipc::proxy::RecordBatchFileReader;

  const mda::StructArray opts = constructor_arguments[0];

  const mda::StringArray filename_mda = opts[0]["Filename"];
  const auto filename_utf16 = std::u16string(filename_mda[0]);
  MATLAB_ASSIGN_OR_ERROR(const auto filename_utf8,
                         arrow::util::UTF16StringToUTF8(filename_utf16),
                         error::UNICODE_CONVERSION_ERROR_ID);

  // The file is opened for *reading*, so a failure here must be reported with
  // the read error identifier rather than the write one.
  // NOTE(review): assumes error::FAILED_TO_OPEN_FILE_FOR_READ is declared in
  // arrow/matlab/error/error.h next to FAILED_TO_OPEN_FILE_FOR_WRITE - confirm.
  MATLAB_ASSIGN_OR_ERROR(auto input_stream, arrow::io::ReadableFile::Open(filename_utf8),
                         error::FAILED_TO_OPEN_FILE_FOR_READ);

  MATLAB_ASSIGN_OR_ERROR(auto reader,
                         arrow::ipc::RecordBatchFileReader::Open(input_stream),
                         error::IPC_RECORD_BATCH_READER_OPEN_FAILED);

  return std::make_shared<RecordBatchFileReaderProxy>(std::move(reader));
}
|
||
// Stores the number of record batches reported by the underlying reader as a
// scalar in context.outputs[0].
void RecordBatchFileReader::getNumRecordBatches(
    libmexclass::proxy::method::Context& context) {
  ::matlab::data::ArrayFactory array_factory;
  context.outputs[0] = array_factory.createScalar(reader->num_record_batches());
}
|
||
// Wraps the reader's schema in a Schema proxy, hands that proxy to the
// ProxyManager, and stores the resulting proxy ID as a scalar in
// context.outputs[0].
void RecordBatchFileReader::getSchema(libmexclass::proxy::method::Context& context) {
  using SchemaProxy = arrow::matlab::tabular::proxy::Schema;

  auto proxy = std::make_shared<SchemaProxy>(reader->schema());
  const auto proxy_id = libmexclass::proxy::ProxyManager::manageProxy(proxy);

  ::matlab::data::ArrayFactory array_factory;
  context.outputs[0] = array_factory.createScalar(proxy_id);
}
|
||
// Reads a single record batch from the file and returns it as a managed proxy.
//
// Expects context.inputs[0] to be a struct array whose first element has an
// int32 "Index" field holding a 1-based record batch index. An index outside
// [1, num_record_batches] sets context.error to the invalid-index error and
// returns without reading. A failed read is reported with
// IPC_RECORD_BATCH_READ_FAILED. On success the ID of the new RecordBatch proxy
// is stored as a scalar in context.outputs[0].
void RecordBatchFileReader::readRecordBatchAtIndex(
    libmexclass::proxy::method::Context& context) {
  namespace mda = ::matlab::data;
  using RecordBatchProxy = arrow::matlab::tabular::proxy::RecordBatch;

  mda::StructArray opts = context.inputs[0];
  const mda::TypedArray<int32_t> matlab_index_mda = opts[0]["Index"];

  // Materialize the element as a plain int32_t instead of keeping the array's
  // element reference type around.
  const int32_t matlab_index = matlab_index_mda[0];
  const auto num_record_batches = reader->num_record_batches();
  if (matlab_index < 1 || matlab_index > num_record_batches) {
    context.error = makeInvalidNumericIndexError(matlab_index, num_record_batches);
    return;
  }
  // Arrow indexes record batches from 0; the incoming index starts at 1.
  const auto arrow_index = matlab_index - 1;

  // Not declared const: the batch is moved into the proxy below, and moving
  // from a const shared_ptr would silently copy it instead.
  MATLAB_ASSIGN_OR_ERROR_WITH_CONTEXT(auto record_batch,
                                      reader->ReadRecordBatch(arrow_index), context,
                                      error::IPC_RECORD_BATCH_READ_FAILED);

  auto record_batch_proxy = std::make_shared<RecordBatchProxy>(std::move(record_batch));
  const auto record_batch_proxy_id =
      libmexclass::proxy::ProxyManager::manageProxy(record_batch_proxy);

  mda::ArrayFactory factory;
  context.outputs[0] = factory.createScalar(record_batch_proxy_id);
}
|
||
} // namespace arrow::matlab::io::ipc::proxy |
44 changes: 44 additions & 0 deletions
44
matlab/src/cpp/arrow/matlab/io/ipc/proxy/record_batch_file_reader.h
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,44 @@ | ||
// Licensed to the Apache Software Foundation (ASF) under one | ||
// or more contributor license agreements. See the NOTICE file | ||
// distributed with this work for additional information | ||
// regarding copyright ownership. The ASF licenses this file | ||
// to you under the Apache License, Version 2.0 (the | ||
// "License"); you may not use this file except in compliance | ||
// with the License. You may obtain a copy of the License at | ||
// | ||
// http://www.apache.org/licenses/LICENSE-2.0 | ||
// | ||
// Unless required by applicable law or agreed to in writing, | ||
// software distributed under the License is distributed on an | ||
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY | ||
// KIND, either express or implied. See the License for the | ||
// specific language governing permissions and limitations | ||
// under the License. | ||
|
||
#pragma once | ||
|
||
#include "arrow/ipc/reader.h" | ||
#include "libmexclass/proxy/Proxy.h" | ||
|
||
namespace arrow::matlab::io::ipc::proxy { | ||
|
||
class RecordBatchFileReader : public libmexclass::proxy::Proxy { | ||
public: | ||
RecordBatchFileReader(std::shared_ptr<arrow::ipc::RecordBatchFileReader> reader); | ||
|
||
~RecordBatchFileReader() = default; | ||
|
||
static libmexclass::proxy::MakeResult make( | ||
const libmexclass::proxy::FunctionArguments& constructor_arguments); | ||
|
||
protected: | ||
std::shared_ptr<arrow::ipc::RecordBatchFileReader> reader; | ||
|
||
void getNumRecordBatches(libmexclass::proxy::method::Context& context); | ||
|
||
void getSchema(libmexclass::proxy::method::Context& context); | ||
|
||
void readRecordBatchAtIndex(libmexclass::proxy::method::Context& context); | ||
}; | ||
|
||
} // namespace arrow::matlab::io::ipc::proxy |
107 changes: 107 additions & 0 deletions
107
matlab/src/cpp/arrow/matlab/io/ipc/proxy/record_batch_file_writer.cc
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,107 @@ | ||
// Licensed to the Apache Software Foundation (ASF) under one | ||
// or more contributor license agreements. See the NOTICE file | ||
// distributed with this work for additional information | ||
// regarding copyright ownership. The ASF licenses this file | ||
// to you under the Apache License, Version 2.0 (the | ||
// "License"); you may not use this file except in compliance | ||
// with the License. You may obtain a copy of the License at | ||
// | ||
// http://www.apache.org/licenses/LICENSE-2.0 | ||
// | ||
// Unless required by applicable law or agreed to in writing, | ||
// software distributed under the License is distributed on an | ||
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY | ||
// KIND, either express or implied. See the License for the | ||
// specific language governing permissions and limitations | ||
// under the License. | ||
|
||
#include "arrow/matlab/io/ipc/proxy/record_batch_file_writer.h" | ||
#include "arrow/io/file.h" | ||
#include "arrow/matlab/error/error.h" | ||
#include "arrow/matlab/tabular/proxy/record_batch.h" | ||
#include "arrow/matlab/tabular/proxy/schema.h" | ||
#include "arrow/matlab/tabular/proxy/table.h" | ||
#include "arrow/util/utf8.h" | ||
|
||
#include "libmexclass/proxy/ProxyManager.h" | ||
|
||
namespace arrow::matlab::io::ipc::proxy { | ||
|
||
// Takes ownership of an Arrow IPC record batch writer and registers the three
// proxy methods (close, writeRecordBatch, writeTable) through REGISTER_METHOD.
//
// The parameter is deliberately a non-const by-value shared_ptr: std::move on a
// const shared_ptr selects the copy constructor, which costs an extra atomic
// reference-count increment/decrement instead of transferring ownership.
RecordBatchFileWriter::RecordBatchFileWriter(
    std::shared_ptr<arrow::ipc::RecordBatchWriter> writer)
    : writer{std::move(writer)} {
  REGISTER_METHOD(RecordBatchFileWriter, close);
  REGISTER_METHOD(RecordBatchFileWriter, writeRecordBatch);
  REGISTER_METHOD(RecordBatchFileWriter, writeTable);
}
|
||
// Factory for the writer proxy.
//
// Reads the "Filename" and "SchemaProxyID" fields from the first element of
// constructor_arguments[0], converts the filename from UTF-16 to UTF-8, looks
// up the Schema proxy by ID, opens a file output stream, and creates an IPC
// file writer for that schema. Each fallible step returns early with its error
// ID via MATLAB_ASSIGN_OR_ERROR.
libmexclass::proxy::MakeResult RecordBatchFileWriter::make(
    const libmexclass::proxy::FunctionArguments& constructor_arguments) {
  namespace mda = ::matlab::data;
  using SchemaProxy = arrow::matlab::tabular::proxy::Schema;

  const mda::StructArray args = constructor_arguments[0];

  // Filename: UTF-16 in, UTF-8 out.
  const mda::StringArray filename_array = args[0]["Filename"];
  const auto path_utf16 = std::u16string(filename_array[0]);
  MATLAB_ASSIGN_OR_ERROR(const auto path_utf8,
                         arrow::util::UTF16StringToUTF8(path_utf16),
                         error::UNICODE_CONVERSION_ERROR_ID);

  // Schema: resolve the proxy ID to the wrapped arrow schema.
  const mda::TypedArray<uint64_t> schema_id_array = args[0]["SchemaProxyID"];
  auto schema_proxy = std::static_pointer_cast<SchemaProxy>(
      libmexclass::proxy::ProxyManager::getProxy(schema_id_array[0]));
  auto schema = schema_proxy->unwrap();

  MATLAB_ASSIGN_OR_ERROR(auto sink, arrow::io::FileOutputStream::Open(path_utf8),
                         error::FAILED_TO_OPEN_FILE_FOR_WRITE);

  MATLAB_ASSIGN_OR_ERROR(auto ipc_writer, arrow::ipc::MakeFileWriter(sink, schema),
                         "arrow:matlab:MakeFailed");

  return std::make_shared<RecordBatchFileWriter>(std::move(ipc_writer));
}
|
||
// Writes one record batch through the underlying writer.
//
// Reads the uint64 "RecordBatchProxyID" field from the first element of
// context.inputs[0], resolves it to a RecordBatch proxy, and writes the wrapped
// batch. A failed write is reported with IPC_RECORD_BATCH_WRITE_FAILED.
void RecordBatchFileWriter::writeRecordBatch(
    libmexclass::proxy::method::Context& context) {
  namespace mda = ::matlab::data;
  using RecordBatchProxy = ::arrow::matlab::tabular::proxy::RecordBatch;

  mda::StructArray args = context.inputs[0];
  const mda::TypedArray<uint64_t> batch_id_array = args[0]["RecordBatchProxyID"];
  const uint64_t batch_id = batch_id_array[0];

  auto batch_proxy = std::static_pointer_cast<RecordBatchProxy>(
      libmexclass::proxy::ProxyManager::getProxy(batch_id));
  auto batch = batch_proxy->unwrap();

  MATLAB_ERROR_IF_NOT_OK_WITH_CONTEXT(writer->WriteRecordBatch(*batch), context,
                                      error::IPC_RECORD_BATCH_WRITE_FAILED);
}
|
||
// Writes a whole table through the underlying writer.
//
// Reads the uint64 "TableProxyID" field from the first element of
// context.inputs[0], resolves it to a Table proxy, and writes the wrapped
// table. A failed write is reported with IPC_RECORD_BATCH_WRITE_FAILED.
void RecordBatchFileWriter::writeTable(libmexclass::proxy::method::Context& context) {
  namespace mda = ::matlab::data;
  using TableProxy = ::arrow::matlab::tabular::proxy::Table;

  mda::StructArray args = context.inputs[0];
  const mda::TypedArray<uint64_t> table_id_array = args[0]["TableProxyID"];
  const uint64_t table_id = table_id_array[0];

  auto arrow_table_proxy = std::static_pointer_cast<TableProxy>(
      libmexclass::proxy::ProxyManager::getProxy(table_id));
  auto arrow_table = arrow_table_proxy->unwrap();

  MATLAB_ERROR_IF_NOT_OK_WITH_CONTEXT(writer->WriteTable(*arrow_table), context,
                                      error::IPC_RECORD_BATCH_WRITE_FAILED);
}
|
||
// Closes the underlying writer; a non-OK status is reported through the
// context with IPC_RECORD_BATCH_WRITE_CLOSE_FAILED.
void RecordBatchFileWriter::close(libmexclass::proxy::method::Context& context) {
  MATLAB_ERROR_IF_NOT_OK_WITH_CONTEXT(
      writer->Close(), context, error::IPC_RECORD_BATCH_WRITE_CLOSE_FAILED);
}
|
||
} // namespace arrow::matlab::io::ipc::proxy |
42 changes: 42 additions & 0 deletions
42
matlab/src/cpp/arrow/matlab/io/ipc/proxy/record_batch_file_writer.h
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,42 @@ | ||
// Licensed to the Apache Software Foundation (ASF) under one | ||
// or more contributor license agreements. See the NOTICE file | ||
// distributed with this work for additional information | ||
// regarding copyright ownership. The ASF licenses this file | ||
// to you under the Apache License, Version 2.0 (the | ||
// "License"); you may not use this file except in compliance | ||
// with the License. You may obtain a copy of the License at | ||
// | ||
// http://www.apache.org/licenses/LICENSE-2.0 | ||
// | ||
// Unless required by applicable law or agreed to in writing, | ||
// software distributed under the License is distributed on an | ||
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY | ||
// KIND, either express or implied. See the License for the | ||
// specific language governing permissions and limitations | ||
// under the License. | ||
|
||
// Guard against multiple inclusion; without it, including this header twice in
// one translation unit redefines the class.
#pragma once

#include "arrow/ipc/writer.h"
#include "libmexclass/proxy/Proxy.h"

namespace arrow::matlab::io::ipc::proxy {

/// Proxy that exposes an arrow::ipc::RecordBatchWriter through libmexclass.
///
/// Instances are created by make(); the protected methods below are the ones
/// the constructor registers with REGISTER_METHOD.
class RecordBatchFileWriter : public libmexclass::proxy::Proxy {
 public:
  /// Takes ownership of an already-created IPC writer.
  /// `explicit` prevents a shared_ptr from converting to a proxy implicitly.
  explicit RecordBatchFileWriter(std::shared_ptr<arrow::ipc::RecordBatchWriter> writer);

  ~RecordBatchFileWriter() = default;

  /// Factory: builds a proxy from the constructor arguments (a struct with
  /// "Filename" and "SchemaProxyID" fields) or returns an error describing the
  /// step that failed.
  static libmexclass::proxy::MakeResult make(
      const libmexclass::proxy::FunctionArguments& constructor_arguments);

 protected:
  /// Underlying Arrow IPC writer that every method delegates to.
  std::shared_ptr<arrow::ipc::RecordBatchWriter> writer;

  /// Writes the record batch identified by the "RecordBatchProxyID" input.
  void writeRecordBatch(libmexclass::proxy::method::Context& context);

  /// Writes the table identified by the "TableProxyID" input.
  void writeTable(libmexclass::proxy::method::Context& context);

  /// Closes the underlying writer.
  void close(libmexclass::proxy::method::Context& context);
};

}  // namespace arrow::matlab::io::ipc::proxy
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Oops, something went wrong.