Allow custom parquet schema #330

Merged · 3 commits · Jan 30, 2025
src/datatrove/pipeline/writers/huggingface.py (4 changes: 3 additions & 1 deletion)
@@ -2,7 +2,7 @@
 import random
 import tempfile
 import time
-from typing import Callable, Literal
+from typing import Any, Callable, Literal
 
 from huggingface_hub import (
     CommitOperationAdd,
@@ -36,6 +36,7 @@ def __init__(
         cleanup: bool = True,
         expand_metadata: bool = True,
         max_file_size: int = round(4.5 * 2**30),  # 4.5GB, leave some room for the last batch
+        schema: Any = None,
     ):
         """
         This class is intended to upload VERY LARGE datasets. Consider using `push_to_hub` or just using a
@@ -73,6 +74,7 @@ def __init__(
             adapter=adapter,
             expand_metadata=expand_metadata,
             max_file_size=max_file_size,
+            schema=schema,
         )
         self.operations = []
         self._repo_init = False
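For reference, a minimal usage sketch of the forwarded argument, not taken from the PR itself: the constructor call below assumes a dataset repo id and a simple two-field schema purely for illustration.

import pyarrow as pa

from datatrove.pipeline.writers.huggingface import HuggingFaceDatasetWriter

# Hypothetical schema; the field names are illustrative assumptions, not from the PR.
schema = pa.schema([("text", pa.string()), ("id", pa.string())])

writer = HuggingFaceDatasetWriter(
    dataset="username/my-dataset",  # placeholder repo id
    schema=schema,  # new in this PR: passed through to the ParquetWriter base class
)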
src/datatrove/pipeline/writers/parquet.py (8 changes: 5 additions & 3 deletions)
@@ -1,5 +1,5 @@
 from collections import Counter, defaultdict
-from typing import IO, Callable, Literal
+from typing import IO, Any, Callable, Literal
 
 from datatrove.io import DataFolderLike
 from datatrove.pipeline.writers.disk_base import DiskWriter
@@ -19,6 +19,7 @@ def __init__(
         batch_size: int = 1000,
         expand_metadata: bool = False,
         max_file_size: int = 5 * 2**30,  # 5GB
+        schema: Any = None,
     ):
         # Validate the compression setting
         if compression not in {"snappy", "gzip", "brotli", "lz4", "zstd", None}:
@@ -40,6 +41,7 @@ def __init__(
         self._file_counter = Counter()
         self.compression = compression
         self.batch_size = batch_size
+        self.schema = schema
 
     def _on_file_switch(self, original_name, old_filename, new_filename):
         """
@@ -59,7 +61,7 @@ def _write_batch(self, filename):
         import pyarrow as pa
 
         # prepare batch
-        batch = pa.RecordBatch.from_pylist(self._batches.pop(filename))
+        batch = pa.RecordBatch.from_pylist(self._batches.pop(filename), schema=self.schema)
         # write batch
         self._writers[filename].write_batch(batch)
 
@@ -70,7 +72,7 @@ def _write(self, document: dict, file_handler: IO, filename: str):
         if filename not in self._writers:
             self._writers[filename] = pq.ParquetWriter(
                 file_handler,
-                schema=pa.RecordBatch.from_pylist([document]).schema,
+                schema=self.schema if self.schema is not None else pa.RecordBatch.from_pylist([document]).schema,
                 compression=self.compression,
             )
         self._batches[filename].append(document)
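The fallback in `_write` still infers the schema from the first document written to each file, which can misfire when that document carries a null or otherwise atypical value. A small standalone pyarrow sketch (not from the PR) of why an explicit schema helps:

import pyarrow as pa

# Inference from a single record: a None value becomes a null-typed column,
# so later batches with real floats in "score" would no longer match the writer's schema.
inferred = pa.RecordBatch.from_pylist([{"text": "hello", "score": None}]).schema
print(inferred.field("score").type)  # null

# With an explicit schema, the same record keeps the intended column type.
explicit = pa.schema([("text", pa.string()), ("score", pa.float64())])
batch = pa.RecordBatch.from_pylist([{"text": "hello", "score": None}], schema=explicit)
print(batch.schema.field("score").type)  # double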
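Putting it together, one plausible end-to-end use of the new argument; the output path and metadata layout are assumptions for illustration rather than anything prescribed by the PR:

import pyarrow as pa

from datatrove.pipeline.writers.parquet import ParquetWriter

# Fix the Parquet schema up front instead of letting each output file infer its own.
schema = pa.schema(
    [
        ("text", pa.string()),
        ("id", pa.string()),
        ("metadata", pa.map_(pa.string(), pa.string())),  # assumed metadata layout
    ]
)

writer = ParquetWriter(
    output_folder="parquet_output/",  # placeholder output location
    schema=schema,
    compression="snappy",
)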