Skip to content

Commit

Permalink
Add Dataset.tables_columns and tables_rows (#113)
Browse files Browse the repository at this point in the history
* Fix missing </tr>

* Add Dataset.tables_rows

* Fix expected test templates

* Add Dataset.tables_columns

* Make font smaller

* Improve caching of table stats

* Avoid loops in test

* Avoid loop in test, specify results

* Include columns and rows in cached table props

* Improve docstring of _tables_stats()

* Simplify _tables_stats()

* Extend tests with edge cases

* Simplify description test
  • Loading branch information
hagenw authored Oct 22, 2024
1 parent e79768d commit 0401183
Show file tree
Hide file tree
Showing 6 changed files with 163 additions and 45 deletions.
78 changes: 70 additions & 8 deletions audbcards/core/dataset.py
Original file line number Diff line number Diff line change
Expand Up @@ -22,7 +22,9 @@ class _Dataset:
_table_related_cached_properties = [
"segment_durations",
"segments",
"tables_columns",
"tables_preview",
"tables_rows",
]
"""Cached properties relying on table data.
Expand Down Expand Up @@ -510,6 +512,22 @@ def tables(self) -> typing.List[str]:
tables = list(db)
return tables

@functools.cached_property
def tables_columns(self) -> typing.Dict[str, int]:
"""Number of columns for each table of the dataset.
Returns:
dictionary with table IDs as keys
and number of columns as values
Examples:
>>> ds = Dataset("emodb", "1.4.1")
>>> ds.tables_columns["speaker"]
3
"""
return {table: stats["columns"] for table, stats in self._tables_stats.items()}

@functools.cached_property
def tables_preview(self) -> typing.Dict[str, typing.List[typing.List[str]]]:
"""Table preview for each table of the dataset.
Expand Down Expand Up @@ -540,21 +558,32 @@ def tables_preview(self) -> typing.Dict[str, typing.List[typing.List[str]]]:
"""
preview = {}
for table in list(self.header):
df = audb.load_table(
self.name,
table,
version=self.version,
verbose=False,
)
for table, stats in self._tables_stats.items():
df = stats["preview"]
df = df.reset_index()
header = [df.columns.tolist()]
body = df.head(5).astype("string").values.tolist()
body = df.astype("string").values.tolist()
# Remove unwanted chars and limit length of each entry
body = [[self._parse_text(column) for column in row] for row in body]
preview[table] = header + body
return preview

@functools.cached_property
def tables_rows(self) -> typing.Dict[str, int]:
"""Number of rows for each table of the dataset.
Returns:
dictionary with table IDs as keys
and number of rows as values
Examples:
>>> ds = Dataset("emodb", "1.4.1")
>>> ds.tables_rows["speaker"]
10
"""
return {table: stats["rows"] for table, stats in self._tables_stats.items()}

@functools.cached_property
def tables_table(self) -> typing.List[str]:
"""Tables of the dataset."""
Expand Down Expand Up @@ -751,6 +780,39 @@ def _segments(self) -> pd.MultiIndex:
index = audformat.utils.union([index, df.index])
return index

@functools.cached_property
def _tables_stats(self) -> typing.Dict[str, dict]:
    """Table information of tables in the dataset.

    Caches table information to improve performance
    of multiple table-related properties.
    Each table is loaded exactly once
    and its statistics are stored,
    so repeated access to table properties
    does not trigger repeated downloads or computations.

    Returns:
        A dictionary with table names as keys and dictionaries containing:

        * ``"columns"``: number of columns
        * ``"rows"``: number of rows
        * ``"preview"``: dataframe preview (first 5 rows)

    """

    def _stats(table_id: str) -> dict:
        # Load the full table once; every statistic is derived from it
        df = audb.load_table(
            self.name,
            table_id,
            version=self.version,
            verbose=False,
        )
        return {
            "columns": len(df.columns),
            "rows": len(df),
            "preview": df.head(5),
        }

    return {table_id: _stats(table_id) for table_id in list(self.header)}

@staticmethod
def _map_iso_languages(languages: typing.List[str]) -> typing.List[str]:
r"""Calculate ISO languages for a list of languages.
Expand Down
2 changes: 2 additions & 0 deletions audbcards/core/templates/datacard_tables.j2
Original file line number Diff line number Diff line change
Expand Up @@ -41,8 +41,10 @@ Tables
{% for column in row %}
<td><p>{{ column }}</p></td>
{% endfor %}
</tr>
{% endif %}
{% endfor %}
<tr><td><p class="table-statistic">{{ tables_rows[row[0]] }} {% if tables_rows[row[0]] == 1 %}row{% else %}rows{% endif %} x {{ tables_columns[row[0]] }} {% if tables_columns[row[0]] == 1 %}column{% else %}columns{% endif %}</p></td></tr>
</tbody>
</table>

Expand Down
4 changes: 4 additions & 0 deletions audbcards/sphinx/table-preview.css
Original file line number Diff line number Diff line change
Expand Up @@ -34,6 +34,10 @@ table.preview td {
border-top: none;
border-bottom: none;
}
table.preview td p.table-statistic {
/* Make "N rows x M columns" smaller */
font-size: 90%;
}
table.clickable td:not(.expanded-row-content),
table.clickable th {
/* Allow centering cell content with `margin: auto` */
Expand Down
27 changes: 19 additions & 8 deletions tests/test_data/rendered_templates/medium_db.rst
Original file line number Diff line number Diff line change
Expand Up @@ -73,10 +73,13 @@ Tables
<tr>
<td><p>data/f0.wav</p></td>
<td><p>0</p></td>
<tr>
</tr>
<tr>
<td><p>data/f1.wav</p></td>
<td><p>1</p></td>
</tbody>
</tr>
<tr><td><p class="table-statistic">2 rows x 1 column</p></td></tr>
</tbody>
</table>


Expand Down Expand Up @@ -104,22 +107,27 @@ Tables
<td><p>0 days 00:00:00</p></td>
<td><p>0 days 00:00:00.500000</p></td>
<td><p>neutral</p></td>
<tr>
</tr>
<tr>
<td><p>data/f0.wav</p></td>
<td><p>0 days 00:00:00.500000</p></td>
<td><p>0 days 00:00:01</p></td>
<td><p>neutral</p></td>
<tr>
</tr>
<tr>
<td><p>data/f1.wav</p></td>
<td><p>0 days 00:00:00</p></td>
<td><p>0 days 00:02:30</p></td>
<td><p>happy</p></td>
<tr>
</tr>
<tr>
<td><p>data/f1.wav</p></td>
<td><p>0 days 00:02:30</p></td>
<td><p>0 days 00:05:01</p></td>
<td><p>angry</p></td>
</tbody>
</tr>
<tr><td><p class="table-statistic">4 rows x 1 column</p></td></tr>
</tbody>
</table>


Expand All @@ -145,11 +153,14 @@ Tables
<td><p>0</p></td>
<td><p>23</p></td>
<td><p>female</p></td>
<tr>
</tr>
<tr>
<td><p>1</p></td>
<td><p>49</p></td>
<td><p>male</p></td>
</tbody>
</tr>
<tr><td><p class="table-statistic">2 rows x 2 columns</p></td></tr>
</tbody>
</table>


Expand Down
4 changes: 3 additions & 1 deletion tests/test_data/rendered_templates/minimal_db.rst
Original file line number Diff line number Diff line change
Expand Up @@ -58,7 +58,9 @@ Tables
<tr>
<td><p>f0.wav</p></td>
<td><p>0</p></td>
</tbody>
</tr>
<tr><td><p class="table-statistic">1 row x 1 column</p></td></tr>
</tbody>
</table>


Expand Down
93 changes: 65 additions & 28 deletions tests/test_dataset.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,6 @@

import audb
import audeer
import audformat
import audiofile

import audbcards
Expand Down Expand Up @@ -50,12 +49,67 @@ def test_dataset_property_scope(tmpdir, db, request):


@pytest.mark.parametrize(
"db",
"db, "
"expected_description, "
"expected_schemes_table, "
"expected_tables_table, "
"expected_tables_columns, "
"expected_tables_rows, "
"expected_segment_durations",
[
"medium_db",
(
"bare_db",
"",
[[]],
[["ID", "Type", "Columns"]],
{},
{},
[],
),
(
"minimal_db",
"Minimal database.",
[[]],
[["ID", "Type", "Columns"], ["files", "filewise", "speaker"]],
{"files": 1},
{"files": 1},
[],
),
(
"medium_db",
"Medium database. | Some description |.",
[
["ID", "Dtype", "Min", "Labels", "Mappings"],
["age", "int", 0, "", ""],
["emotion", "str", "", "angry, happy, neutral", ""],
["gender", "str", "", "female, male", ""],
["speaker", "int", "", "0, 1", "age, gender"],
],
[
["ID", "Type", "Columns"],
["files", "filewise", "speaker"],
["segments", "segmented", "emotion"],
["speaker", "misc", "age, gender"],
],
{"files": 1, "segments": 1, "speaker": 2},
{"files": 2, "segments": 4, "speaker": 2},
[0.5, 0.5, 150, 151],
),
],
)
def test_dataset(audb_cache, tmpdir, repository, db, request):
def test_dataset(
audb_cache,
tmpdir,
repository,
request,
db,
expected_description,
expected_schemes_table,
expected_tables_table,
expected_tables_columns,
expected_tables_rows,
expected_segment_durations,
):
r"""Test audbcards.Dataset object and all its properties."""
db = request.getfixturevalue(db)

Expand Down Expand Up @@ -115,7 +169,7 @@ def test_dataset(audb_cache, tmpdir, repository, db, request):

# duration
expected_duration = db.files_duration(db.files).sum()
assert dataset.duration == expected_duration
assert dataset.duration == pd.to_timedelta(expected_duration)

# files
expected_files = len(db.files)
Expand Down Expand Up @@ -175,46 +229,29 @@ def test_dataset(audb_cache, tmpdir, repository, db, request):
assert dataset.schemes == expected_schemes

# schemes_table
expected_schemes_table = [
["ID", "Dtype", "Min", "Labels", "Mappings"],
["age", "int", 0, "", ""],
["emotion", "str", "", "angry, happy, neutral", ""],
["gender", "str", "", "female, male", ""],
["speaker", "int", "", "0, 1", "age, gender"],
]
assert dataset.schemes_table == expected_schemes_table

# segment_durations
expected_segment_durations = [0.5, 0.5, 150, 151]
assert dataset.segment_durations == expected_segment_durations

# segments
expected_segments = str(len(db.segments))
assert dataset.segments == expected_segments

# short_description
max_desc_length = 150
expected_description = (
db.description
if (len(db.description) < max_desc_length)
else f"{db.description[:max_desc_length - 3]}..."
)
assert dataset.short_description == expected_description

# tables
expected_tables = list(db)
assert dataset.tables == expected_tables

# tables_columns
assert dataset.tables_columns == expected_tables_columns

# tables_rows
assert dataset.tables_rows == expected_tables_rows

# tables_table
expected_tables_table = [["ID", "Type", "Columns"]]
for table_id in list(db):
table = db[table_id]
if isinstance(table, audformat.MiscTable):
table_type = "misc"
else:
table_type = table.type
columns = ", ".join(list(table.columns))
expected_tables_table.append([table_id, table_type, columns])
assert dataset.tables_table == expected_tables_table

# version
Expand Down

0 comments on commit 0401183

Please sign in to comment.