Skip to content

Commit

Permalink
apacheGH-42146: [MATLAB] Add IPC RecordBatchFileReader and `RecordB…
Browse files Browse the repository at this point in the history
…atchFileWriter` MATLAB classes (apache#42201)

### Rationale for this change

To enable initial IPC I/O support in the MATLAB interface, we should add a `RecordBatchFileReader` class and a `RecordBatchFileWriter` class.

### What changes are included in this PR?

1. Added a new `arrow.io.ipc.RecordBatchFileWriter` class.
2. Added a new `arrow.io.ipc.RecordBatchFileReader` class.

**Example**

```matlab
>> city = ["Boston" "Seattle" "Denver" "Juneau" "Anchorage" "Chicago"]';
>> daylength = duration(["15:17:01" "15:59:16" "14:59:14" "19:21:23" "14:18:24" "15:13:39"])';
>> matlabTable = table(city, daylength, VariableNames=["City", "DayLength"]);
>> recordBatch1 = arrow.recordBatch(matlabTable(1:4, :));
>> recordBatch2 = arrow.recordBatch(matlabTable(5:end, :));

>> writer = arrow.io.ipc.RecordBatchFileWriter("daylight.arrow", recordBatch1.Schema);
>> writer.writeRecordBatch(recordBatch1);
>> writer.writeRecordBatch(recordBatch2);
>> writer.close();

>> reader = arrow.io.ipc.RecordBatchFileReader("daylight.arrow")

reader = 

  RecordBatchFileReader with properties:

    NumRecordBatches: 2
              Schema: [1×1 arrow.tabular.Schema]

>> reader.Schema

ans = 

  Arrow Schema with 2 fields:

    City: String | DayLength: Time64

>> rb1 = reader.read(1);
>> isequal(rb1, recordBatch1)

ans =

  logical

   1

>> rb2 = reader.read(2);
>> isequal(rb2, recordBatch2)

ans =

  logical

   1

```

### Are these changes tested?

Yes.  Added two new test files:

1. `arrow/matlab/test/io/ipc/tRecordBatchFileWriter.m`
2. `arrow/matlab/test/io/ipc/tRecordBatchFileReader.m`

### Are there any user-facing changes?

Yes. Users can now serialize `RecordBatch`es and `Table`s to files using the Arrow IPC data format as well as read in `RecordBatch`es from Arrow IPC data files.

### Future Directions

1. Add `RecordBatchStreamWriter` and `RecordBatchStreamReader`
2. Expose options for [controlling](https://github.com/apache/arrow/blob/main/cpp/src/arrow/ipc/options.h)  IPC reading and writing in MATLAB.
3. Add more methods to `RecordBatchFileReader` to read in multiple record batches at once, as well as to import the data as an Arrow `Table`.

* GitHub Issue: apache#42146

Authored-by: Sarah Gilmore <[email protected]>
Signed-off-by: Sarah Gilmore <[email protected]>
  • Loading branch information
sgilmore10 authored Jun 20, 2024
1 parent 89d6354 commit d261a82
Show file tree
Hide file tree
Showing 11 changed files with 871 additions and 1 deletion.
7 changes: 7 additions & 0 deletions matlab/src/cpp/arrow/matlab/error/error.h
Original file line number Diff line number Diff line change
Expand Up @@ -242,5 +242,12 @@ static const char* ARRAY_SLICE_FAILED_TO_CREATE_ARRAY_PROXY =
"arrow:array:slice:FailedToCreateArrayProxy";
static const char* C_EXPORT_FAILED = "arrow:c:export:ExportFailed";
static const char* C_IMPORT_FAILED = "arrow:c:import:ImportFailed";
// Error IDs reported by the Arrow IPC RecordBatchFileReader /
// RecordBatchFileWriter proxies.

// A RecordBatchWriter::WriteRecordBatch or WriteTable call returned a non-OK
// status.
static const char* IPC_RECORD_BATCH_WRITE_FAILED =
"arrow:io:ipc:FailedToWriteRecordBatch";
// RecordBatchWriter::Close returned a non-OK status.
static const char* IPC_RECORD_BATCH_WRITE_CLOSE_FAILED = "arrow:io:ipc:CloseFailed";
// arrow::ipc::RecordBatchFileReader::Open failed on the input file.
static const char* IPC_RECORD_BATCH_READER_OPEN_FAILED =
"arrow:io:ipc:FailedToOpenRecordBatchReader";
// The requested 1-based record batch index is outside
// [1, number of record batches].
static const char* IPC_RECORD_BATCH_READ_INVALID_INDEX = "arrow:io:ipc:InvalidIndex";
// RecordBatchFileReader::ReadRecordBatch failed for an in-range index.
static const char* IPC_RECORD_BATCH_READ_FAILED = "arrow:io:ipc:ReadFailed";

} // namespace arrow::matlab::error
128 changes: 128 additions & 0 deletions matlab/src/cpp/arrow/matlab/io/ipc/proxy/record_batch_file_reader.cc
Original file line number Diff line number Diff line change
@@ -0,0 +1,128 @@
// Licensed to the Apache Software Foundation (ASF) under one
// or more contributor license agreements. See the NOTICE file
// distributed with this work for additional information
// regarding copyright ownership. The ASF licenses this file
// to you under the Apache License, Version 2.0 (the
// "License"); you may not use this file except in compliance
// with the License. You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing,
// software distributed under the License is distributed on an
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
// KIND, either express or implied. See the License for the
// specific language governing permissions and limitations
// under the License.

#include "arrow/matlab/io/ipc/proxy/record_batch_file_reader.h"
#include "arrow/io/file.h"
#include "arrow/matlab/error/error.h"
#include "arrow/matlab/tabular/proxy/record_batch.h"
#include "arrow/matlab/tabular/proxy/schema.h"
#include "arrow/util/utf8.h"

#include "libmexclass/proxy/ProxyManager.h"

namespace arrow::matlab::io::ipc::proxy {

namespace {
// Builds the error reported when a 1-based record batch index is out of range.
//
// matlab_index: the index that was requested.
// num_batches:  the number of record batches available; quoted in the message
//               as the upper bound of the valid range.
//
// Returns an Error carrying the IPC_RECORD_BATCH_READ_INVALID_INDEX ID and a
// message naming both values.
libmexclass::error::Error makeInvalidNumericIndexError(const int32_t matlab_index,
                                                       const int32_t num_batches) {
  std::stringstream message;
  message << "Invalid record batch index: " << matlab_index
          << ". Record batch index must be between 1 and the number of record batches ("
          << num_batches << ").";
  return libmexclass::error::Error{error::IPC_RECORD_BATCH_READ_INVALID_INDEX,
                                   message.str()};
}
} // namespace

// Constructs the proxy around an already-opened Arrow IPC file reader and
// registers its three proxy methods through REGISTER_METHOD.
//
// The parameter is taken by non-const value on purpose: with a const
// parameter, std::move() yields a const rvalue and the member is
// copy-constructed (an extra atomic reference-count increment/decrement)
// instead of being moved into. Top-level const on a by-value parameter is not
// part of the function's signature, so this still matches the declaration in
// the header.
RecordBatchFileReader::RecordBatchFileReader(
    std::shared_ptr<arrow::ipc::RecordBatchFileReader> reader)
    : reader{std::move(reader)} {
  REGISTER_METHOD(RecordBatchFileReader, getNumRecordBatches);
  REGISTER_METHOD(RecordBatchFileReader, getSchema);
  REGISTER_METHOD(RecordBatchFileReader, readRecordBatchAtIndex);
}

// Factory for RecordBatchFileReader proxies.
//
// Reads the "Filename" field from the first element of the options struct in
// constructor_arguments[0], converts it from UTF-16 to UTF-8, opens it as an
// arrow::io::ReadableFile, and opens an arrow::ipc::RecordBatchFileReader on
// top of that file. Each fallible step goes through MATLAB_ASSIGN_OR_ERROR,
// tagged with the error ID passed as the macro's last argument.
//
// Returns a shared pointer to the newly created proxy on success.
libmexclass::proxy::MakeResult RecordBatchFileReader::make(
    const libmexclass::proxy::FunctionArguments& constructor_arguments) {
  namespace mda = ::matlab::data;

  const mda::StructArray args = constructor_arguments[0];
  const mda::StringArray filename_array = args[0]["Filename"];

  // MATLAB strings are UTF-16; Arrow's file API is given a UTF-8 path.
  const std::u16string utf16_path = std::u16string(filename_array[0]);
  MATLAB_ASSIGN_OR_ERROR(const auto utf8_path,
                         arrow::util::UTF16StringToUTF8(utf16_path),
                         error::UNICODE_CONVERSION_ERROR_ID);

  // NOTE(review): this open-for-read failure is tagged with the
  // FAILED_TO_OPEN_FILE_FOR_WRITE ID. That looks like a copy/paste from the
  // writer; confirm whether a read-specific error ID should be used here.
  MATLAB_ASSIGN_OR_ERROR(auto file, arrow::io::ReadableFile::Open(utf8_path),
                         error::FAILED_TO_OPEN_FILE_FOR_WRITE);

  MATLAB_ASSIGN_OR_ERROR(auto ipc_reader,
                         arrow::ipc::RecordBatchFileReader::Open(file),
                         error::IPC_RECORD_BATCH_READER_OPEN_FAILED);

  // Inside this member function the unqualified class name refers to the
  // proxy class itself, not to arrow::ipc::RecordBatchFileReader.
  return std::make_shared<RecordBatchFileReader>(std::move(ipc_reader));
}

// Writes the number of record batches reported by the underlying IPC file
// reader into context.outputs[0] as a scalar MATLAB Data array.
void RecordBatchFileReader::getNumRecordBatches(
    libmexclass::proxy::method::Context& context) {
  ::matlab::data::ArrayFactory array_factory;
  context.outputs[0] = array_factory.createScalar(reader->num_record_batches());
}

// Wraps the reader's schema in a Schema proxy, hands that proxy to the
// ProxyManager, and writes the resulting proxy ID into context.outputs[0] as
// a scalar.
void RecordBatchFileReader::getSchema(libmexclass::proxy::method::Context& context) {
  using SchemaProxy = arrow::matlab::tabular::proxy::Schema;

  auto proxy = std::make_shared<SchemaProxy>(reader->schema());
  const auto proxy_id = libmexclass::proxy::ProxyManager::manageProxy(proxy);

  ::matlab::data::ArrayFactory array_factory;
  context.outputs[0] = array_factory.createScalar(proxy_id);
}

// Reads a single record batch from the IPC file and returns it as a
// RecordBatch proxy.
//
// Input:  context.inputs[0] is a struct whose "Index" field holds the 1-based
//         (MATLAB-style) int32 index of the record batch to read.
// Output: context.outputs[0] receives the ID of the new RecordBatch proxy.
// Errors: sets context.error with IPC_RECORD_BATCH_READ_INVALID_INDEX when the
//         index is outside [1, num_record_batches]; a ReadRecordBatch failure
//         is reported with IPC_RECORD_BATCH_READ_FAILED.
void RecordBatchFileReader::readRecordBatchAtIndex(
    libmexclass::proxy::method::Context& context) {
  namespace mda = ::matlab::data;
  using RecordBatchProxy = arrow::matlab::tabular::proxy::RecordBatch;

  mda::StructArray opts = context.inputs[0];
  const mda::TypedArray<int32_t> matlab_index_mda = opts[0]["Index"];

  // Convert the element reference to a plain int32_t once, up front, so the
  // comparisons and arithmetic below operate on an ordinary integer.
  const int32_t matlab_index = matlab_index_mda[0];
  const auto num_record_batches = reader->num_record_batches();
  if (matlab_index < 1 || matlab_index > num_record_batches) {
    context.error = makeInvalidNumericIndexError(matlab_index, num_record_batches);
    return;
  }
  // MATLAB indices are 1-based; Arrow's are 0-based.
  const auto arrow_index = matlab_index - 1;

  // Deliberately non-const: a const shared_ptr cannot be moved from, so the
  // std::move() below would otherwise degrade to a reference-counted copy.
  MATLAB_ASSIGN_OR_ERROR_WITH_CONTEXT(auto record_batch,
                                      reader->ReadRecordBatch(arrow_index), context,
                                      error::IPC_RECORD_BATCH_READ_FAILED);

  auto record_batch_proxy = std::make_shared<RecordBatchProxy>(std::move(record_batch));
  const auto record_batch_proxy_id =
      libmexclass::proxy::ProxyManager::manageProxy(record_batch_proxy);

  mda::ArrayFactory factory;
  const auto record_batch_proxy_id_mda = factory.createScalar(record_batch_proxy_id);
  context.outputs[0] = record_batch_proxy_id_mda;
}

} // namespace arrow::matlab::io::ipc::proxy
Original file line number Diff line number Diff line change
@@ -0,0 +1,44 @@
// Licensed to the Apache Software Foundation (ASF) under one
// or more contributor license agreements. See the NOTICE file
// distributed with this work for additional information
// regarding copyright ownership. The ASF licenses this file
// to you under the Apache License, Version 2.0 (the
// "License"); you may not use this file except in compliance
// with the License. You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing,
// software distributed under the License is distributed on an
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
// KIND, either express or implied. See the License for the
// specific language governing permissions and limitations
// under the License.

#pragma once

#include "arrow/ipc/reader.h"
#include "libmexclass/proxy/Proxy.h"

namespace arrow::matlab::io::ipc::proxy {

class RecordBatchFileReader : public libmexclass::proxy::Proxy {
public:
RecordBatchFileReader(std::shared_ptr<arrow::ipc::RecordBatchFileReader> reader);

~RecordBatchFileReader() = default;

static libmexclass::proxy::MakeResult make(
const libmexclass::proxy::FunctionArguments& constructor_arguments);

protected:
std::shared_ptr<arrow::ipc::RecordBatchFileReader> reader;

void getNumRecordBatches(libmexclass::proxy::method::Context& context);

void getSchema(libmexclass::proxy::method::Context& context);

void readRecordBatchAtIndex(libmexclass::proxy::method::Context& context);
};

} // namespace arrow::matlab::io::ipc::proxy
107 changes: 107 additions & 0 deletions matlab/src/cpp/arrow/matlab/io/ipc/proxy/record_batch_file_writer.cc
Original file line number Diff line number Diff line change
@@ -0,0 +1,107 @@
// Licensed to the Apache Software Foundation (ASF) under one
// or more contributor license agreements. See the NOTICE file
// distributed with this work for additional information
// regarding copyright ownership. The ASF licenses this file
// to you under the Apache License, Version 2.0 (the
// "License"); you may not use this file except in compliance
// with the License. You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing,
// software distributed under the License is distributed on an
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
// KIND, either express or implied. See the License for the
// specific language governing permissions and limitations
// under the License.

#include "arrow/matlab/io/ipc/proxy/record_batch_file_writer.h"
#include "arrow/io/file.h"
#include "arrow/matlab/error/error.h"
#include "arrow/matlab/tabular/proxy/record_batch.h"
#include "arrow/matlab/tabular/proxy/schema.h"
#include "arrow/matlab/tabular/proxy/table.h"
#include "arrow/util/utf8.h"

#include "libmexclass/proxy/ProxyManager.h"

namespace arrow::matlab::io::ipc::proxy {

// Constructs the proxy around an already-created Arrow IPC record batch writer
// and registers its three proxy methods through REGISTER_METHOD.
//
// The parameter is taken by non-const value on purpose: with a const
// parameter, std::move() yields a const rvalue and the member is
// copy-constructed (an extra atomic reference-count increment/decrement)
// instead of being moved into. Top-level const on a by-value parameter is not
// part of the function's signature, so this still matches the declaration in
// the header.
RecordBatchFileWriter::RecordBatchFileWriter(
    std::shared_ptr<arrow::ipc::RecordBatchWriter> writer)
    : writer{std::move(writer)} {
  REGISTER_METHOD(RecordBatchFileWriter, close);
  REGISTER_METHOD(RecordBatchFileWriter, writeRecordBatch);
  REGISTER_METHOD(RecordBatchFileWriter, writeTable);
}

// Factory for RecordBatchFileWriter proxies.
//
// Reads two fields from the first element of the options struct in
// constructor_arguments[0]:
//   "Filename"      - path of the file to create (converted UTF-16 -> UTF-8).
//   "SchemaProxyID" - uint64 ID of the Schema proxy describing the data.
// It then opens an arrow::io::FileOutputStream on the path and creates an IPC
// file writer for the schema. Each fallible step goes through
// MATLAB_ASSIGN_OR_ERROR, tagged with the error ID passed as the macro's last
// argument.
//
// Returns a shared pointer to the newly created proxy on success.
libmexclass::proxy::MakeResult RecordBatchFileWriter::make(
    const libmexclass::proxy::FunctionArguments& constructor_arguments) {
  namespace mda = ::matlab::data;
  using SchemaProxy = arrow::matlab::tabular::proxy::Schema;

  const mda::StructArray args = constructor_arguments[0];

  // MATLAB strings are UTF-16; Arrow's file API is given a UTF-8 path.
  const mda::StringArray filename_array = args[0]["Filename"];
  const std::u16string utf16_path = std::u16string(filename_array[0]);
  MATLAB_ASSIGN_OR_ERROR(const auto utf8_path,
                         arrow::util::UTF16StringToUTF8(utf16_path),
                         error::UNICODE_CONVERSION_ERROR_ID);

  // NOTE(review): the cast is unchecked; this assumes the ID always refers to
  // a live Schema proxy - confirm the MATLAB caller guarantees that.
  const mda::TypedArray<uint64_t> schema_id_array = args[0]["SchemaProxyID"];
  auto schema_proxy = std::static_pointer_cast<SchemaProxy>(
      libmexclass::proxy::ProxyManager::getProxy(schema_id_array[0]));
  auto schema = schema_proxy->unwrap();

  MATLAB_ASSIGN_OR_ERROR(auto sink, arrow::io::FileOutputStream::Open(utf8_path),
                         error::FAILED_TO_OPEN_FILE_FOR_WRITE);

  // NOTE(review): the error ID here is a string literal, unlike the other
  // steps, which use constants from the error namespace.
  MATLAB_ASSIGN_OR_ERROR(auto ipc_writer, arrow::ipc::MakeFileWriter(sink, schema),
                         "arrow:matlab:MakeFailed");

  // Inside this member function the unqualified class name refers to the
  // proxy class itself.
  return std::make_shared<RecordBatchFileWriter>(std::move(ipc_writer));
}

// Appends one record batch to the IPC file.
//
// Input:  context.inputs[0] is a struct whose "RecordBatchProxyID" field holds
//         the uint64 ID of the RecordBatch proxy to write.
// Errors: a non-OK status from WriteRecordBatch is reported with
//         IPC_RECORD_BATCH_WRITE_FAILED.
void RecordBatchFileWriter::writeRecordBatch(
    libmexclass::proxy::method::Context& context) {
  namespace mda = ::matlab::data;
  using RecordBatchProxy = ::arrow::matlab::tabular::proxy::RecordBatch;

  mda::StructArray args = context.inputs[0];
  const mda::TypedArray<uint64_t> id_array = args[0]["RecordBatchProxyID"];
  const uint64_t batch_id = id_array[0];

  // Resolve the ID to its proxy, then to the underlying Arrow record batch.
  auto batch_proxy = std::static_pointer_cast<RecordBatchProxy>(
      libmexclass::proxy::ProxyManager::getProxy(batch_id));
  auto batch = batch_proxy->unwrap();

  MATLAB_ERROR_IF_NOT_OK_WITH_CONTEXT(writer->WriteRecordBatch(*batch), context,
                                      error::IPC_RECORD_BATCH_WRITE_FAILED);
}

// Writes an entire Arrow table to the IPC file.
//
// Input:  context.inputs[0] is a struct whose "TableProxyID" field holds the
//         uint64 ID of the Table proxy to write.
// Errors: a non-OK status from WriteTable is reported with
//         IPC_RECORD_BATCH_WRITE_FAILED (the same ID used by writeRecordBatch).
void RecordBatchFileWriter::writeTable(libmexclass::proxy::method::Context& context) {
  namespace mda = ::matlab::data;
  using TableProxy = ::arrow::matlab::tabular::proxy::Table;

  mda::StructArray args = context.inputs[0];
  const mda::TypedArray<uint64_t> id_array = args[0]["TableProxyID"];
  const uint64_t table_id = id_array[0];

  // Resolve the ID to its proxy, then to the underlying Arrow table.
  auto arrow_table_proxy = std::static_pointer_cast<TableProxy>(
      libmexclass::proxy::ProxyManager::getProxy(table_id));
  auto arrow_table = arrow_table_proxy->unwrap();

  MATLAB_ERROR_IF_NOT_OK_WITH_CONTEXT(writer->WriteTable(*arrow_table), context,
                                      error::IPC_RECORD_BATCH_WRITE_FAILED);
}

// Closes the underlying arrow::ipc::RecordBatchWriter. The status returned by
// Close() is passed to MATLAB_ERROR_IF_NOT_OK_WITH_CONTEXT together with the
// IPC_RECORD_BATCH_WRITE_CLOSE_FAILED error ID.
void RecordBatchFileWriter::close(libmexclass::proxy::method::Context& context) {
MATLAB_ERROR_IF_NOT_OK_WITH_CONTEXT(writer->Close(), context,
error::IPC_RECORD_BATCH_WRITE_CLOSE_FAILED);
}

} // namespace arrow::matlab::io::ipc::proxy
Original file line number Diff line number Diff line change
@@ -0,0 +1,42 @@
// Licensed to the Apache Software Foundation (ASF) under one
// or more contributor license agreements. See the NOTICE file
// distributed with this work for additional information
// regarding copyright ownership. The ASF licenses this file
// to you under the Apache License, Version 2.0 (the
// "License"); you may not use this file except in compliance
// with the License. You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing,
// software distributed under the License is distributed on an
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
// KIND, either express or implied. See the License for the
// specific language governing permissions and limitations
// under the License.

#pragma once

#include "arrow/ipc/writer.h"
#include "libmexclass/proxy/Proxy.h"

namespace arrow::matlab::io::ipc::proxy {

class RecordBatchFileWriter : public libmexclass::proxy::Proxy {
public:
RecordBatchFileWriter(std::shared_ptr<arrow::ipc::RecordBatchWriter> writer);

~RecordBatchFileWriter() = default;

static libmexclass::proxy::MakeResult make(
const libmexclass::proxy::FunctionArguments& constructor_arguments);

protected:
std::shared_ptr<arrow::ipc::RecordBatchWriter> writer;

void writeRecordBatch(libmexclass::proxy::method::Context& context);

void writeTable(libmexclass::proxy::method::Context& context);

void close(libmexclass::proxy::method::Context& context);
};

} // namespace arrow::matlab::io::ipc::proxy
4 changes: 4 additions & 0 deletions matlab/src/cpp/arrow/matlab/proxy/factory.cc
Original file line number Diff line number Diff line change
Expand Up @@ -34,6 +34,8 @@
#include "arrow/matlab/io/csv/proxy/table_writer.h"
#include "arrow/matlab/io/feather/proxy/reader.h"
#include "arrow/matlab/io/feather/proxy/writer.h"
#include "arrow/matlab/io/ipc/proxy/record_batch_file_reader.h"
#include "arrow/matlab/io/ipc/proxy/record_batch_file_writer.h"
#include "arrow/matlab/tabular/proxy/record_batch.h"
#include "arrow/matlab/tabular/proxy/schema.h"
#include "arrow/matlab/tabular/proxy/table.h"
Expand Down Expand Up @@ -107,6 +109,8 @@ libmexclass::proxy::MakeResult Factory::make_proxy(
REGISTER_PROXY(arrow.c.proxy.ArrayImporter , arrow::matlab::c::proxy::ArrayImporter);
REGISTER_PROXY(arrow.c.proxy.Schema , arrow::matlab::c::proxy::Schema);
REGISTER_PROXY(arrow.c.proxy.RecordBatchImporter , arrow::matlab::c::proxy::RecordBatchImporter);
REGISTER_PROXY(arrow.io.ipc.proxy.RecordBatchFileReader , arrow::matlab::io::ipc::proxy::RecordBatchFileReader);
REGISTER_PROXY(arrow.io.ipc.proxy.RecordBatchFileWriter , arrow::matlab::io::ipc::proxy::RecordBatchFileWriter);
// clang-format on

return libmexclass::error::Error{error::UNKNOWN_PROXY_ERROR_ID,
Expand Down
Loading

0 comments on commit d261a82

Please sign in to comment.