DeepRec-AI · 2sin18 · Aug 3, 2022 · Aug 3, 2022
diff --git a/hybridbackend/__init__.py b/hybridbackend/__init__.py
@@ -20,6 +20,6 @@
 from __future__ import division
 from __future__ import print_function
 
-__version__ = '0.5.4'
+__version__ = '0.5.4post1'
 __author__ = 'Alibaba Group Holding Limited'
 __copyright__ = '2021 Alibaba Group Holding Limited'
diff --git a/hybridbackend/cpp/tensorflow/arrow/arrow.cc b/hybridbackend/cpp/tensorflow/arrow/arrow.cc
@@ -195,6 +195,12 @@ class RaggedTensorBuilder : public ::arrow::ArrayVisitor {
     if (!st.ok()) {
       return st;
     }
+
+    // Follow RaggedTensor-style ordering: V, Sn, Sn-1, ..., S0
+    if (ragged_tensor_.size() > 1) {
+      std::reverse(std::next(ragged_tensor_.begin()), ragged_tensor_.end());
+    }
+
     output_tensors->insert(output_tensors->end(), ragged_tensor_.begin(),
                            ragged_tensor_.end());
     return ::arrow::Status::OK();

diff --git a/hybridbackend/tensorflow/data/dataframe.py b/hybridbackend/tensorflow/data/dataframe.py
@@ -163,6 +163,16 @@ def __new__(cls, values, nested_row_splits=None):
     def __repr__(self):
       return f'{{{self.values}, splits={self.nested_row_splits}}}'
 
+    def to_list(self):
+      r'''Rebuild nested Python lists from values and nested row splits.'''
+      nested = self.values.tolist()
+      # Regroup with the last (innermost) splits first, then move outwards.
+      for splits in reversed(self.nested_row_splits):
+        nested = [
+          nested[begin:end]
+          for begin, end in zip(splits[:-1], splits[1:])]
+      return nested
+
     def to_sparse(self, name=None):
       if len(self.nested_row_splits) == 0:
         return self.values

diff --git a/tests/tensorflow/data/parquet_dataset_ragged_nested_test.py b/tests/tensorflow/data/parquet_dataset_ragged_nested_test.py
@@ -0,0 +1,82 @@
+# Copyright 2021 Alibaba Group Holding Limited. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# =============================================================================
+
+r'''Parquet batch dataset nested ragged tensors test.
+'''
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import os
+import tempfile
+import unittest
+
+import numpy as np
+import pyarrow as pa
+import pyarrow.parquet as pq
+import tensorflow as tf
+
+import hybridbackend.tensorflow as hb
+from tests.tensorflow.spawn import register
+
+
+# pylint: disable=missing-docstring
+class ParquetDatasetRaggedNestedTest(unittest.TestCase):
+  def setUp(self):  # pylint: disable=invalid-name
+    # Write a single-column parquet file of 2-level nested int64 lists.
+    self._workspace = tempfile.mkdtemp()
+    self._filename = os.path.join(
+      self._workspace, 'ragged_test_pyarrow.parquet')
+    self._data = pa.array(
+      [[[1], [2, 3]], [[4], [5]]], pa.list_(pa.list_(pa.int64())))
+    table = pa.Table.from_arrays([self._data], ['A'])
+    pq.write_table(table, self._filename, compression='ZSTD')
+
+  def tearDown(self):  # pylint: disable=invalid-name
+    os.remove(self._filename)
+    os.rmdir(self._workspace)  # mkdtemp() dirs are not auto-removed.
+
+  def test_read(self):
+    # Reading one batch must reproduce the nested lists that were written.
+    with tf.Graph().as_default() as graph:
+      ds = hb.data.ParquetDataset([self._filename], batch_size=2)
+      ds = ds.prefetch(4)
+      batch = hb.data.make_one_shot_iterator(ds).get_next()
+
+    with tf.Session(graph=graph) as sess:
+      actual = sess.run(batch)['A'].to_list()
+      expected = self._data.to_pylist()
+      np.testing.assert_equal(actual, expected)
+
+  def test_apply_to_sparse(self):
+    # Sparse conversion must match tf.ragged.constant(...).to_sparse().
+    with tf.Graph().as_default() as graph:
+      ds = hb.data.ParquetDataset([self._filename], batch_size=2)
+      ds = ds.apply(hb.data.to_sparse())
+      batch = hb.data.make_one_shot_iterator(ds).get_next()['A']
+      baseline = tf.ragged.constant(self._data.to_pylist()).to_sparse()
+
+    with tf.Session(graph=graph) as sess:
+      actual, expected = sess.run([batch, baseline])
+      np.testing.assert_equal(actual.indices, expected.indices)
+      np.testing.assert_equal(actual.values, expected.values)
+      np.testing.assert_equal(actual.dense_shape, expected.dense_shape)
+
+
+if __name__ == '__main__':
+  register(['cpu', 'data'])
+  os.environ['CUDA_VISIBLE_DEVICES'] = ''  # Presumably hides GPUs; confirm.
+  unittest.main()