Readers: report an empty file shard instead of silently producing nothing.

BaseDiskReader-style `run` now fetches its file shard into a local variable
before reading. When the shard is empty, rank 0 raises RuntimeError and every
other rank only logs a warning and carries on with the (empty) shard.

NOTE(review): this patch was recovered from a copy whose line breaks and
leading whitespace had been stripped. Line structure was rebuilt to match the
hunk header (6 old lines / 12 new lines); indentation assumes a method at
4 spaces with an 8-space body -- confirm against the target file before
applying.

diff --git a/src/datatrove/pipeline/readers/base.py b/src/datatrove/pipeline/readers/base.py
index f8507b4b..71bc63f0 100644
--- a/src/datatrove/pipeline/readers/base.py
+++ b/src/datatrove/pipeline/readers/base.py
@@ -82,6 +82,12 @@ def read_files_shard(self, shard):
     def run(self, data: DocumentsPipeline = None, rank: int = 0, world_size: int = 1) -> DocumentsPipeline:
         if data:
             yield from data
-        for doc in self.read_files_shard(self.data_folder.get_files_shard(rank, world_size)):
+        files_shard = self.data_folder.get_files_shard(rank, world_size)
+        if len(files_shard) == 0:
+            if rank == 0:
+                raise RuntimeError(f"No files found on {self.data_folder.path}!")
+            # otherwise just a warning
+            logger.warning(f"No files found on {self.data_folder.path} for {rank=}")
+        for doc in self.read_files_shard(files_shard):
             self.update_doc_stats(doc)
             yield doc