Skip to content

Commit

Permalink
PR Changes
Browse files Browse the repository at this point in the history
Signed-off-by: Abhishek <[email protected]>
  • Loading branch information
Abhishek-TAMU committed Feb 4, 2025
1 parent 0e9ad3f commit 8d3e77f
Show file tree
Hide file tree
Showing 3 changed files with 7 additions and 4 deletions.
1 change: 0 additions & 1 deletion tests/data/test_data_handlers.py
Original file line number Diff line number Diff line change
Expand Up @@ -16,7 +16,6 @@
# https://spdx.dev/learn/handling-license-info/

# Third Party
from jinja2.exceptions import TemplateSyntaxError
from transformers import AutoTokenizer
import datasets
import pytest
Expand Down
8 changes: 6 additions & 2 deletions tuning/data/data_handlers.py
Original file line number Diff line number Diff line change
Expand Up @@ -23,7 +23,7 @@
from transformers import AutoTokenizer

# Local
from tuning.utils.config_utils import transform_placeholders
from tuning.utils.config_utils import process_jinja_placeholders


### Utils for custom masking / manipulating input / output strs, etc
Expand Down Expand Up @@ -112,6 +112,8 @@ def apply_custom_data_formatting_template(
Expects to be run as a HF Map API function.
Args:
element: the HF Dataset element loaded from a JSON or DatasetDict object.
tokenizer: Tokenizer to be used for the EOS token, which will be appended
when formatting the data into a single sequence. Defaults to empty.
template: Template to format data with. Features of Dataset
should be referred to by {{key}}
formatted_dataset_field: Dataset_text_field
Expand Down Expand Up @@ -152,6 +154,8 @@ def apply_custom_data_formatting_jinja_template(
Expects to be run as a HF Map API function.
Args:
element: the HF Dataset element loaded from a JSON or DatasetDict object.
tokenizer: Tokenizer whose EOS token is appended to the template, so that
each formatted sequence ends with the EOS token.
dataset_text_field: formatted_dataset_field.
template: Template to format data with. Features of Dataset
should be referred to by {{key}}.
Expand All @@ -160,7 +164,7 @@ def apply_custom_data_formatting_jinja_template(
"""

template += tokenizer.eos_token
template = transform_placeholders(template)
template = process_jinja_placeholders(template)
env = Environment(undefined=StrictUndefined)
jinja_template = env.from_string(template)

Expand Down
2 changes: 1 addition & 1 deletion tuning/utils/config_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -138,7 +138,7 @@ def txt_to_obj(txt):
return pickle.loads(message_bytes)


def transform_placeholders(template: str) -> str:
def process_jinja_placeholders(template: str) -> str:
"""
Function to detect all placeholders of the form {{...}}.
- If the inside has a space (e.g. {{Tweet text}}),
Expand Down

0 comments on commit 8d3e77f

Please sign in to comment.