Skip to content

Commit

Permalink
added check on rank 0 for input files on readers
Browse files Browse the repository at this point in the history
  • Loading branch information
guipenedo committed Dec 6, 2023
1 parent 3e3f0c8 commit c834995
Showing 1 changed file with 7 additions and 1 deletion.
8 changes: 7 additions & 1 deletion src/datatrove/pipeline/readers/base.py
Original file line number Diff line number Diff line change
Expand Up @@ -82,6 +82,12 @@ def read_files_shard(self, shard):
def run(self, data: DocumentsPipeline = None, rank: int = 0, world_size: int = 1) -> DocumentsPipeline:
    """Yield upstream documents, then the documents read from this rank's file shard.

    This is a generator: nothing below runs (including the empty-shard check)
    until the caller starts iterating.

    Args:
        data: optional documents from a previous step; when truthy they are
            passed through unchanged before any file is read.
        rank: rank of this worker, forwarded to ``data_folder.get_files_shard``.
        world_size: total number of workers, forwarded to
            ``data_folder.get_files_shard``.

    Yields:
        Every item of ``data`` (if any), followed by each document produced by
        ``read_files_shard`` for this rank's shard.

    Raises:
        RuntimeError: if the shard for rank 0 is empty.
    """
    if data:
        yield from data
    # Resolve the shard once so it can be validated before reading starts.
    files_shard = self.data_folder.get_files_shard(rank, world_size)
    if len(files_shard) == 0:
        # NOTE(review): presumably an empty shard on rank 0 means there are no
        # input files at all, hence the hard failure — confirm against the
        # sharding logic in the data folder implementation.
        if rank == 0:
            raise RuntimeError(f"No files found on {self.data_folder.path}!")
        # otherwise just a warning
        logger.warning(f"No files found on {self.data_folder.path} for {rank=}")
    for doc in self.read_files_shard(files_shard):
        self.update_doc_stats(doc)
        yield doc

0 comments on commit c834995

Please sign in to comment.