From 31203e4817be1626f3266aa5eab3cc6bcf811bd8 Mon Sep 17 00:00:00 2001
From: Andreas Ahlenstorf
Date: Mon, 25 May 2020 17:40:29 +0200
Subject: [PATCH] Add section about related work in joss paper

---
 paper/paper.bib | 48 ++++++++++++++++++++++++++++++++++++++++++++++++
 paper/paper.md  | 16 +++++++++++++++-
 2 files changed, 63 insertions(+), 1 deletion(-)

diff --git a/paper/paper.bib b/paper/paper.bib
index 91d6d11..c3028ef 100644
--- a/paper/paper.bib
+++ b/paper/paper.bib
@@ -60,3 +60,51 @@ @misc{ardila2019common
   archivePrefix={arXiv},
   primaryClass={cs.CL}
 }
+
+@misc{torchaudio,
+  title = {torchaudio},
+  howpublished = {\url{https://pytorch.org/audio/}},
+  note = {Accessed: 2020-05-25},
+  year = {2020}
+}
+
+@misc{dataloaders,
+  title = {dataloaders},
+  howpublished = {\url{https://github.com/juliagusak/dataloaders}},
+  note = {Accessed: 2020-05-25},
+  year = {2020}
+}
+
+@misc{audiodatasets,
+  title = {Audio Datasets},
+  howpublished = {\url{https://github.com/mcfletch/audiodatasets}},
+  note = {Accessed: 2020-05-25},
+  year = {2020}
+}
+
+@misc{mirdata,
+  title = {mirdata},
+  howpublished = {\url{https://github.com/mir-dataset-loaders/mirdata}},
+  note = {Accessed: 2020-05-25},
+  year = {2020}
+}
+
+@misc{speechcorpusdownloader,
+  title = {Speech Corpus Downloader},
+  howpublished = {\url{https://github.com/mdangschat/speech-corpus-dl}},
+  note = {Accessed: 2020-05-25},
+  year = {2020}
+}
+
+@inproceedings{tensorflow,
+  author = {Mart{\'\i}n Abadi and Paul Barham and Jianmin Chen and Zhifeng Chen and Andy Davis and Jeffrey Dean and Matthieu Devin and Sanjay Ghemawat and Geoffrey Irving and Michael Isard and Manjunath Kudlur and Josh Levenberg and Rajat Monga and Sherry Moore and Derek G. Murray and Benoit Steiner and Paul Tucker and Vijay Vasudevan and Pete Warden and Martin Wicke and Yuan Yu and Xiaoqiang Zheng},
+  title = {TensorFlow: A System for Large-Scale Machine Learning},
+  booktitle = {12th {USENIX} Symposium on Operating Systems Design and Implementation ({OSDI} 16)},
+  year = {2016},
+  isbn = {978-1-931971-33-1},
+  address = {Savannah, GA},
+  pages = {265--283},
+  url = {https://www.usenix.org/conference/osdi16/technical-sessions/presentation/abadi},
+  publisher = {{USENIX} Association},
+  month = nov,
+}
\ No newline at end of file
diff --git a/paper/paper.md b/paper/paper.md
index e7f2c54..796e66d 100644
--- a/paper/paper.md
+++ b/paper/paper.md
@@ -78,7 +78,7 @@ Assume that the task is to train a neural network to detect segments in audio st
 MUSAN [@musan2015] and GTZAN [@GTZAN] are two suitable datasets for this task because they provide a wide selection of music, speech, and noise samples.
 In the example below, we first download MUSAN and GTZAN to the local disk before creating `Loader` instances for each format that allow Audiomate to access both datasets using a unified interface.
 Then, we instruct Audiomate to merge both datasets.
 Afterwards, we use a `Splitter` to partition the merged dataset into a train and test set.
-By merely creating views, Audiomate avoids creating unnecessary disk I/O and is therefore ideally suited to work with large datasets in the range of tens or hundreds of gigabytes.
+By merely creating views, Audiomate avoids creating unnecessary disk I/O and is therefore ideally suited to work with large datasets in the range of tens or hundreds of gigabytes.
 Ultimately, we load the samples and labels by iterating over all utterances.
 Alternatively, it is possible to load the samples in batches, which is ideal for feeding them to a deep learning toolkit like PyTorch.
@@ -129,4 +129,18 @@ Usually, `Reader` and `Downloader` are implemented for datasets, while `Writer`
 
 Audiomate supports more than a dozen datasets and half as many toolkits.
 
+# Related Work
+
+A variety of frameworks and tools offer functionality similar to Audiomate.
+
+**Data loaders.** Data loaders are libraries that focus on downloading and preprocessing datasets to make them easily accessible without requiring a specific tool or framework.
+In contrast to Audiomate, they can neither convert between formats nor split or merge datasets.
+Examples of libraries in this category are mirdata [@mirdata], Speech Corpus Downloader [@speechcorpusdownloader], and Audio Datasets [@audiodatasets].
+Furthermore, some of these libraries focus on a particular kind of data, such as music, and offer no support for speech datasets.
+
+**Tools for specific frameworks.** Some machine learning tools and deep learning frameworks include the infrastructure needed to make a wide range of datasets readily available to their users.
+One notable example is TensorFlow [@tensorflow], which includes data loaders for different kinds of data, including image, speech, and music datasets such as Common Voice [@ardila2019common].
+Another is torchaudio [@torchaudio] for PyTorch, which not only offers data loaders but can also convert between various formats.
+However, each of these tools is tied to a specific machine learning or deep learning framework (TensorFlow or PyTorch, respectively), whereas Audiomate is framework-agnostic.
+
 # References
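
For orientation, the following is a minimal sketch of the workflow that the paper.md excerpt in this patch describes: download MUSAN and GTZAN, load each through a format-specific reader, merge the corpora, split the merged corpus into train/test views, and iterate over the utterances. It is not the listing from the paper itself; the class and method names used here (`io.MusanDownloader`, `Corpus.load`, `Corpus.merge_corpora`, `subset.Splitter`, `Utterance.read_samples`) and the local paths are assumptions based on Audiomate's documentation and may differ between versions.

```python
# Hedged sketch of the workflow described in the paper excerpt above.
# API names and paths are assumptions, not the paper's own listing.
import audiomate
from audiomate.corpus import io, subset

# Download both datasets to the local disk (paths are placeholders).
io.MusanDownloader().download('/data/musan')
io.GtzanDownloader().download('/data/gtzan')

# Load each dataset through its format-specific reader into a unified corpus.
musan = audiomate.Corpus.load('/data/musan', reader='musan')
gtzan = audiomate.Corpus.load('/data/gtzan', reader='gtzan')

# Merge the two corpora into a single corpus.
merged = audiomate.Corpus.merge_corpora([musan, gtzan])

# Partition the merged corpus into train/test views; only views are created,
# so no audio data is copied on disk.
splitter = subset.Splitter(merged, random_seed=42)
subviews = splitter.split(proportions={'train': 0.8, 'test': 0.2})

# Iterate over utterances to obtain samples and labels.
for utterance in subviews['train'].utterances.values():
    samples = utterance.read_samples()
    labels = utterance.label_lists
```

Because the split only produces views onto the merged corpus, no audio is duplicated on disk, which is the property the patched paragraph highlights for working with datasets of tens or hundreds of gigabytes.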