Skip to content

Commit

Permalink
samples limit and full dataset epoch
Browse files Browse the repository at this point in the history
  • Loading branch information
mitya52 authored and JegernOUTT committed May 22, 2024
1 parent a28005a commit 8e53a9a
Show file tree
Hide file tree
Showing 2 changed files with 29 additions and 18 deletions.
45 changes: 28 additions & 17 deletions refact_data_pipeline/finetune_datasource.py
Original file line number Diff line number Diff line change
Expand Up @@ -193,31 +193,42 @@ def _build_pipeline(self, files: List[Dict[str, Any]]):


# Pipeline node that opens JSONL files one at a time and yields their records
# together with the source path and running counters.
# NOTE(review): this listing is a commit-diff rendering with indentation and
# +/- markers stripped, so the pre-commit loop and its post-commit replacement
# appear back to back. Any nesting described in the comments below is inferred
# from statement order -- confirm against the real file.
class ReadJSONLFileByFile(ReadFileByFile):
# Forwards all constructor arguments to the base class, then caches the
# optional "samples_limit" data option. It defaults to 0, which the limit
# check in __iter__ treats as "no limit" because 0 is falsy.
def __init__(self, *args, **kwargs):
super().__init__(*args, **kwargs)
self._samples_limit = self.dataopts.get("samples_limit", 0)

# Generator: for every record with a truthy "middle" field, yields the record's
# own keys plus "path" (the file it came from) and a "stats" dict holding the
# current sample_num, file_num and epoch counters.
def __iter__(self):
sample_num = 0
file_num = 0
epoch = 0

# NOTE(review): the next 16 lines (from this `for` through the first
# `file_num += 1`) look like the pre-commit loop that the
# `while not quit_flag` version further down replaces -- confirm.
for j in self.inner_filter:
with jsonlines.open(os.path.join(self.basedir, j["path"])) as r:
for data in r:
if not data["middle"]:
continue
yield {
**data,
"path": j["path"],
"stats": {
"sample_num": sample_num,
"file_num": file_num,
"epoch": epoch,
},
}
sample_num += 1
file_num += 1
# quit_flag ends the outer loop; it is set when the sample limit is hit or
# when the epoch counter equals self.quit_on_epoch.
quit_flag = False
while not quit_flag:
# Each item of inner_filter supplies a "path" key that is joined onto
# self.basedir to locate the JSONL file.
for j in self.inner_filter:
with jsonlines.open(os.path.join(self.basedir, j["path"])) as r:
for data in r:
# Skip records whose "middle" field is empty or otherwise falsy.
if not data["middle"]:
continue
# "path" and "stats" come after **data, so they override same-named keys
# already present in the record.
yield {
**data,
"path": j["path"],
"stats": {
"sample_num": sample_num,
"file_num": file_num,
"epoch": epoch,
},
}
# Only yielded records are counted; skipped ones hit `continue` first.
sample_num += 1
# Stop reading once the limit is reached; a limit of 0 disables this check.
if self._samples_limit and sample_num >= self._samples_limit:
quit_flag = True
break
file_num += 1
# Propagate a sample-limit stop out of the per-file loop as well.
if quit_flag:
break
# presumably runs once per full pass over inner_filter (the commit title
# mentions "full dataset epoch"); indentation is not visible here -- confirm.
epoch += 1
self.epoch_callback(epoch)
if epoch == self.quit_on_epoch:
# NOTE(review): this `break` appears to be the pre-commit line that the
# `quit_flag = True` below supersedes -- confirm.
break
quit_flag = True


class RAGFIM(PipelineNode):
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -9,7 +9,7 @@
# Dataset pipeline settings, presumably for the FIM test split (judging by the
# name). "ds_opts" is a comma-separated key=value option string built from
# adjacent string literals and contains an {n_ctx} placeholder, presumably
# filled in by the caller; "ds_name" is set to "RefactRAGFIMDataset".
_fim_test_ds_pipeline = {
"ds_opts": "n_ctx={n_ctx},debug=0,seed=42,shuffle_depth=0,quit_on_epoch=1,"
"fim_probability=0.9,fim_drop_residual=1,random_trim_context_prob=0.01,"
# NOTE(review): the two lines below differ only by the trailing
# ",samples_limit=16". In this diff rendering they appear to be the pre- and
# post-commit versions of the same line, not two entries -- confirm.
"pack_single=1,pack_complete=0,pack_buffer_size=50",
"pack_single=1,pack_complete=0,pack_buffer_size=50,samples_limit=16",
"ds_name": "RefactRAGFIMDataset"
}
_bigcode_tokenizer_mapping = {
Expand Down

0 comments on commit 8e53a9a

Please sign in to comment.