Skip to content

Commit

Permalink
Add Dataset.tables_columns and tables_rows (#113)
Browse files Browse the repository at this point in the history
* Fix missing </tr>

* Add Dataset.tables_rows

* Fix expected test templates

* Add Dataset.tables_columns

* Make font smaller

* Improve caching of table stats

* Avoid loops in test

* Avoid loop in test, specify results

* Include columns and rows in cached table props

* Improve docstring of _tables_stats()

* Simplify _tables_stats()

* Extend tests with edge cases

* Simplify description test
  • Loading branch information
hagenw authored Oct 22, 2024
1 parent e79768d commit 0401183
Show file tree
Hide file tree
Showing 6 changed files with 163 additions and 45 deletions.
78 changes: 70 additions & 8 deletions audbcards/core/dataset.py
Original file line number Diff line number Diff line change
Expand Up @@ -22,7 +22,9 @@ class _Dataset:
_table_related_cached_properties = [
"segment_durations",
"segments",
"tables_columns",
"tables_preview",
"tables_rows",
]
"""Cached properties relying on table data.
Expand Down Expand Up @@ -510,6 +512,22 @@ def tables(self) -> typing.List[str]:
tables = list(db)
return tables

@functools.cached_property
def tables_columns(self) -> typing.Dict[str, int]:
"""Number of columns for each table of the dataset.
Returns:
dictionary with table IDs as keys
and number of columns as values
Examples:
>>> ds = Dataset("emodb", "1.4.1")
>>> ds.tables_columns["speaker"]
3
"""
return {table: stats["columns"] for table, stats in self._tables_stats.items()}

@functools.cached_property
def tables_preview(self) -> typing.Dict[str, typing.List[typing.List[str]]]:
"""Table preview for each table of the dataset.
Expand Down Expand Up @@ -540,21 +558,32 @@ def tables_preview(self) -> typing.Dict[str, typing.List[typing.List[str]]]:
"""
preview = {}
for table in list(self.header):
df = audb.load_table(
self.name,
table,
version=self.version,
verbose=False,
)
for table, stats in self._tables_stats.items():
df = stats["preview"]
df = df.reset_index()
header = [df.columns.tolist()]
body = df.head(5).astype("string").values.tolist()
body = df.astype("string").values.tolist()
# Remove unwanted chars and limit length of each entry
body = [[self._parse_text(column) for column in row] for row in body]
preview[table] = header + body
return preview

@functools.cached_property
def tables_rows(self) -> typing.Dict[str, int]:
"""Number of rows for each table of the dataset.
Returns:
dictionary with table IDs as keys
and number of rows as values
Examples:
>>> ds = Dataset("emodb", "1.4.1")
>>> ds.tables_rows["speaker"]
10
"""
return {table: stats["rows"] for table, stats in self._tables_stats.items()}

@functools.cached_property
def tables_table(self) -> typing.List[str]:
"""Tables of the dataset."""
Expand Down Expand Up @@ -751,6 +780,39 @@ def _segments(self) -> pd.MultiIndex:
index = audformat.utils.union([index, df.index])
return index

@functools.cached_property
def _tables_stats(self) -> typing.Dict[str, dict]:
    """Table information of tables in the dataset.

    Caches table information to improve performance
    of multiple table-related properties.
    Each table is loaded exactly once
    and its statistics are stored,
    so repeated access to table properties
    does not trigger repeated downloads or computations.

    Returns:
        A dictionary with table names as keys and dictionaries containing:

        * ``"columns"``: number of columns
        * ``"rows"``: number of rows
        * ``"preview"``: dataframe preview (first 5 rows)

    """

    def _stats(table_id: str) -> dict:
        # Load the full table once; every statistic is derived from it
        df = audb.load_table(
            self.name,
            table_id,
            version=self.version,
            verbose=False,
        )
        return {
            "columns": len(df.columns),
            "rows": len(df),
            "preview": df.head(5),
        }

    return {table_id: _stats(table_id) for table_id in list(self.header)}

@staticmethod
def _map_iso_languages(languages: typing.List[str]) -> typing.List[str]:
r"""Calculate ISO languages for a list of languages.
Expand Down
2 changes: 2 additions & 0 deletions audbcards/core/templates/datacard_tables.j2
Original file line number Diff line number Diff line change
Expand Up @@ -41,8 +41,10 @@ Tables
{% for column in row %}
<td><p>{{ column }}</p></td>
{% endfor %}
</tr>
{% endif %}
{% endfor %}
<tr><td><p class="table-statistic">{{ tables_rows[row[0]] }} {% if tables_rows[row[0]] == 1 %}row{% else %}rows{% endif %} x {{ tables_columns[row[0]] }} {% if tables_columns[row[0]] == 1 %}column{% else %}columns{% endif %}</p></td></tr>
</tbody>
</table>

Expand Down
4 changes: 4 additions & 0 deletions audbcards/sphinx/table-preview.css
Original file line number Diff line number Diff line change
Expand Up @@ -34,6 +34,10 @@ table.preview td {
border-top: none;
border-bottom: none;
}
table.preview td p.table-statistic {
/* Make "N rows x M columns" smaller */
font-size: 90%;
}
table.clickable td:not(.expanded-row-content),
table.clickable th {
/* Allow centering cell content with `margin: auto` */
Expand Down
27 changes: 19 additions & 8 deletions tests/test_data/rendered_templates/medium_db.rst
Original file line number Diff line number Diff line change
Expand Up @@ -73,10 +73,13 @@ Tables
<tr>
<td><p>data/f0.wav</p></td>
<td><p>0</p></td>
<tr>
</tr>
<tr>
<td><p>data/f1.wav</p></td>
<td><p>1</p></td>
</tbody>
</tr>
<tr><td><p class="table-statistic">2 rows x 1 column</p></td></tr>
</tbody>
</table>


Expand Down Expand Up @@ -104,22 +107,27 @@ Tables
<td><p>0 days 00:00:00</p></td>
<td><p>0 days 00:00:00.500000</p></td>
<td><p>neutral</p></td>
<tr>
</tr>
<tr>
<td><p>data/f0.wav</p></td>
<td><p>0 days 00:00:00.500000</p></td>
<td><p>0 days 00:00:01</p></td>
<td><p>neutral</p></td>
<tr>
</tr>
<tr>
<td><p>data/f1.wav</p></td>
<td><p>0 days 00:00:00</p></td>
<td><p>0 days 00:02:30</p></td>
<td><p>happy</p></td>
<tr>
</tr>
<tr>
<td><p>data/f1.wav</p></td>
<td><p>0 days 00:02:30</p></td>
<td><p>0 days 00:05:01</p></td>
<td><p>angry</p></td>
</tbody>
</tr>
<tr><td><p class="table-statistic">4 rows x 1 column</p></td></tr>
</tbody>
</table>


Expand All @@ -145,11 +153,14 @@ Tables
<td><p>0</p></td>
<td><p>23</p></td>
<td><p>female</p></td>
<tr>
</tr>
<tr>
<td><p>1</p></td>
<td><p>49</p></td>
<td><p>male</p></td>
</tbody>
</tr>
<tr><td><p class="table-statistic">2 rows x 2 columns</p></td></tr>
</tbody>
</table>


Expand Down
4 changes: 3 additions & 1 deletion tests/test_data/rendered_templates/minimal_db.rst
Original file line number Diff line number Diff line change
Expand Up @@ -58,7 +58,9 @@ Tables
<tr>
<td><p>f0.wav</p></td>
<td><p>0</p></td>
</tbody>
</tr>
<tr><td><p class="table-statistic">1 row x 1 column</p></td></tr>
</tbody>
</table>


Expand Down
93 changes: 65 additions & 28 deletions tests/test_dataset.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,6 @@

import audb
import audeer
import audformat
import audiofile

import audbcards
Expand Down Expand Up @@ -50,12 +49,67 @@ def test_dataset_property_scope(tmpdir, db, request):


@pytest.mark.parametrize(
"db",
"db, "
"expected_description, "
"expected_schemes_table, "
"expected_tables_table, "
"expected_tables_columns, "
"expected_tables_rows, "
"expected_segment_durations",
[
"medium_db",
(
"bare_db",
"",
[[]],
[["ID", "Type", "Columns"]],
{},
{},
[],
),
(
"minimal_db",
"Minimal database.",
[[]],
[["ID", "Type", "Columns"], ["files", "filewise", "speaker"]],
{"files": 1},
{"files": 1},
[],
),
(
"medium_db",
"Medium database. | Some description |.",
[
["ID", "Dtype", "Min", "Labels", "Mappings"],
["age", "int", 0, "", ""],
["emotion", "str", "", "angry, happy, neutral", ""],
["gender", "str", "", "female, male", ""],
["speaker", "int", "", "0, 1", "age, gender"],
],
[
["ID", "Type", "Columns"],
["files", "filewise", "speaker"],
["segments", "segmented", "emotion"],
["speaker", "misc", "age, gender"],
],
{"files": 1, "segments": 1, "speaker": 2},
{"files": 2, "segments": 4, "speaker": 2},
[0.5, 0.5, 150, 151],
),
],
)
def test_dataset(audb_cache, tmpdir, repository, db, request):
def test_dataset(
audb_cache,
tmpdir,
repository,
request,
db,
expected_description,
expected_schemes_table,
expected_tables_table,
expected_tables_columns,
expected_tables_rows,
expected_segment_durations,
):
r"""Test audbcards.Dataset object and all its properties."""
db = request.getfixturevalue(db)

Expand Down Expand Up @@ -115,7 +169,7 @@ def test_dataset(audb_cache, tmpdir, repository, db, request):

# duration
expected_duration = db.files_duration(db.files).sum()
assert dataset.duration == expected_duration
assert dataset.duration == pd.to_timedelta(expected_duration)

# files
expected_files = len(db.files)
Expand Down Expand Up @@ -175,46 +229,29 @@ def test_dataset(audb_cache, tmpdir, repository, db, request):
assert dataset.schemes == expected_schemes

# schemes_table
expected_schemes_table = [
["ID", "Dtype", "Min", "Labels", "Mappings"],
["age", "int", 0, "", ""],
["emotion", "str", "", "angry, happy, neutral", ""],
["gender", "str", "", "female, male", ""],
["speaker", "int", "", "0, 1", "age, gender"],
]
assert dataset.schemes_table == expected_schemes_table

# segment_durations
expected_segment_durations = [0.5, 0.5, 150, 151]
assert dataset.segment_durations == expected_segment_durations

# segments
expected_segments = str(len(db.segments))
assert dataset.segments == expected_segments

# short_description
max_desc_length = 150
expected_description = (
db.description
if (len(db.description) < max_desc_length)
else f"{db.description[:max_desc_length - 3]}..."
)
assert dataset.short_description == expected_description

# tables
expected_tables = list(db)
assert dataset.tables == expected_tables

# tables_columns
assert dataset.tables_columns == expected_tables_columns

# tables_rows
assert dataset.tables_rows == expected_tables_rows

# tables_table
expected_tables_table = [["ID", "Type", "Columns"]]
for table_id in list(db):
table = db[table_id]
if isinstance(table, audformat.MiscTable):
table_type = "misc"
else:
table_type = table.type
columns = ", ".join(list(table.columns))
expected_tables_table.append([table_id, table_type, columns])
assert dataset.tables_table == expected_tables_table

# version
Expand Down

0 comments on commit 0401183

Please sign in to comment.