Add granite documents format #1566

Open · wants to merge 2 commits into main

Changes from 1 commit
6 changes: 6 additions & 0 deletions prepare/formats/models/granite.py
@@ -0,0 +1,6 @@
from unitxt.catalog import add_to_catalog
from unitxt.formats import GraniteDocumentsFormat

format = GraniteDocumentsFormat(model="ibm-granite/granite-3.1-8b-instruct")

add_to_catalog(format, "formats.models.granite_3_1_documents", overwrite=True)
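Once registered, the entry can be referenced by its catalog name wherever a format is accepted. A minimal usage sketch, assuming any RAG-style card that exposes question/context fields (the card name below is illustrative, not part of this PR):

from unitxt.api import load_dataset

# Reference the format registered above by its catalog name.
# The card name is a hypothetical example; any task exposing
# 'question' and 'context'/'contexts' fields works.
dataset = load_dataset(
    card="cards.rag.response_generation.clapnq",  # illustrative card name
    template_card_index=0,
    format="formats.models.granite_3_1_documents",
    split="test",
    max_test_instances=10,
)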
4 changes: 4 additions & 0 deletions src/unitxt/catalog/formats/models/granite_3_1_documents.json
@@ -0,0 +1,4 @@
{
    "__type__": "granite_documents_format",
    "model": "ibm-granite/granite-3.1-8b-instruct"
}
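This JSON is just the serialized artifact; fetching it by name should reconstruct the class instance. A quick sketch:

from unitxt.artifact import fetch_artifact

# Resolve the catalog name back into the GraniteDocumentsFormat instance defined above.
fmt, _ = fetch_artifact("formats.models.granite_3_1_documents")
print(type(fmt).__name__)  # GraniteDocumentsFormat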
50 changes: 50 additions & 0 deletions src/unitxt/formats.py
@@ -13,6 +13,7 @@

from .dataclass import OptionalField
from .dict_utils import dict_get
from .error_utils import UnitxtError
from .image_operators import image_to_data_url
from .operator import InstanceOperator
from .settings_utils import get_constants
@@ -25,6 +26,55 @@ class Format(InstanceOperator):
    pass


class GraniteDocumentsFormat(Format):
    model: str = "ibm-granite/granite-3.1-8b-instruct"
    citations: bool = True
    length: str = "long"

    _requirements_list = ["transformers"]

    def prepare(self):
        super().prepare()
        from transformers import AutoTokenizer

        self.tokenizer = AutoTokenizer.from_pretrained(self.model)

    def process(
        self, instance: Dict[str, Any], stream_name: Optional[str] = None
    ) -> Dict[str, Any]:
        inputs = instance["input_fields"]
        if "question" not in inputs:
            raise UnitxtError(
                "GraniteDocumentsFormat works only for tasks with field: 'question'"
            )
        if "context" not in inputs and "contexts" not in inputs:
            raise UnitxtError(
                "GraniteDocumentsFormat works only for tasks with field: 'context' or 'contexts'"
            )

        if "context" in inputs:
            texts = [inputs["context"]]
        if "contexts" in inputs:
            texts = inputs["contexts"]

        documents = []
        for text in texts:
            documents.append({"title": "", "text": text})

        question = inputs["question"]

        instance["source"] = self.tokenizer.apply_chat_template(
            [
                {"role": "user", "content": question},
            ],
            documents=documents,
            controls={"citations": self.citations, "length": self.length},
            add_generation_prompt=True,
            tokenize=False,
        )
        return instance
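For orientation, roughly how the operator behaves in isolation; a sketch only, since the tokenizer download needs network access and the instance dict here is hand-built:

# Hand-built instance in the shape GraniteDocumentsFormat expects.
fmt = GraniteDocumentsFormat()  # defaults to ibm-granite/granite-3.1-8b-instruct
fmt.prepare()  # normally invoked by the framework; loads the HF tokenizer

instance = {
    "input_fields": {
        "question": "what is love?",
        "contexts": ["love is love"],
    }
}
print(fmt.process(instance)["source"])  # the fully rendered Granite prompt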


def apply_capital_new_line_notation(text: str) -> str:
r"""Transforms a given string by applying the Capital New Line Notation.

141 changes: 139 additions & 2 deletions tests/library/test_formats.py
@@ -1,18 +1,29 @@
from datetime import datetime

from unitxt.api import load_dataset
from unitxt.card import TaskCard
from unitxt.formats import ChatAPIFormat, HFSystemFormat, SystemFormat
from unitxt.collections_operators import Wrap
from unitxt.formats import (
    ChatAPIFormat,
    GraniteDocumentsFormat,
    HFSystemFormat,
    SystemFormat,
)
from unitxt.loaders import LoadFromDictionary
from unitxt.operators import Rename, Set
from unitxt.settings_utils import get_constants
from unitxt.standard import DatasetRecipe
from unitxt.system_prompts import TextualSystemPrompt
from unitxt.task import Task
from unitxt.templates import InputOutputTemplate
from unitxt.templates import InputOutputTemplate, MultiReferenceTemplate, TemplatesDict
from unitxt.test_utils.operators import (
    check_operator,
)

from tests.library.test_image_operators import create_random_jpeg_image
from tests.utils import UnitxtTestCase

constants = get_constants()


@@ -327,6 +338,132 @@ def test_hf_system_format(self):
            tester=self,
        )

    def test_granite_documents_format(self):
        inputs = [
            {
                "input_fields": {
                    "question": "what is love?",
                    "contexts": ["love is love"],
                },
            },
            {
                "input_fields": {
                    "question": "what is love?",
                    "context": "love is love",
                },
            },
        ]

        system_format = GraniteDocumentsFormat()

        today = datetime.today().strftime("%B %d, %Y")
        targets = [
            {
                "input_fields": {
                    "question": "what is love?",
                    "contexts": ["love is love"],
                },
                "source": "<|start_of_role|>system<|end_of_role|>Knowledge Cutoff Date: April 2024.\nToday's Date: "
                + today
                + '.\nYou are Granite, developed by IBM. Write the response to the user\'s input by strictly aligning with the facts in the provided documents. If the information needed to answer the question is not available in the documents, inform the user that the question cannot be answered based on the available data.\n\nIn your response, use the symbols <co> and </co> to indicate when a fact comes from a document in the search result, e.g <co>0</co> for a fact from document 0. Afterwards, list all the citations with their corresponding documents in an ordered list.<|end_of_text|>\n<|start_of_role|>documents<|end_of_role|>Document 0\nlove is love<|end_of_text|>\n<|start_of_role|>user<|end_of_role|>what is love?<|end_of_text|>\n<|start_of_role|>assistant {"citations": true, "length": "long"}<|end_of_role|>',
            },
            {
                "input_fields": {
                    "question": "what is love?",
                    "context": "love is love",
                },
                "source": "<|start_of_role|>system<|end_of_role|>Knowledge Cutoff Date: April 2024.\nToday's Date: "
                + today
                + '.\nYou are Granite, developed by IBM. Write the response to the user\'s input by strictly aligning with the facts in the provided documents. If the information needed to answer the question is not available in the documents, inform the user that the question cannot be answered based on the available data.\n\nIn your response, use the symbols <co> and </co> to indicate when a fact comes from a document in the search result, e.g <co>0</co> for a fact from document 0. Afterwards, list all the citations with their corresponding documents in an ordered list.<|end_of_text|>\n<|start_of_role|>documents<|end_of_role|>Document 0\nlove is love<|end_of_text|>\n<|start_of_role|>user<|end_of_role|>what is love?<|end_of_text|>\n<|start_of_role|>assistant {"citations": true, "length": "long"}<|end_of_role|>',
            },
        ]

        check_operator(
            operator=system_format,
            inputs=inputs,
            targets=targets,
            tester=self,
        )

        data = {
            "test": [
                {
                    "query": "What city is the largest in Texas?",
                    "extracted_chunks": "Austin is the capital of Texas.\nHouston is the largest city in Texas but not the capital of it. ",
                    "expected_answer": "Houston",
                },
                {
                    "query": "What city is the capital of Texas?",
                    "extracted_chunks": "Houston is the largest city in Texas but not the capital of it. ",
                    "expected_answer": "Austin",
                },
            ]
        }

        card = TaskCard(
            # The data dictionary above contains three fields per instance:
            # query (string), extracted_chunks (string), expected_answer (string).
            loader=LoadFromDictionary(data=data),
            # Map these fields to the fields of the tasks.rag.response_generation task.
            # See https://www.unitxt.ai/en/latest/catalog/catalog.tasks.rag.response_generation.html
            preprocess_steps=[
                Rename(field_to_field={"query": "question"}),
                Wrap(field="extracted_chunks", inside="list", to_field="contexts"),
                Wrap(
                    field="expected_answer", inside="list", to_field="reference_answers"
                ),
                Set(
                    fields={
                        "contexts_ids": [],
                    }
                ),
            ],
            # Specify the task and the desired metrics (note that these are part of the default
            # metrics for the task, so the metrics selection can be omitted).
            task="tasks.rag.response_generation",
            # Specify a default template.
            templates=TemplatesDict(
                {
                    "simple": MultiReferenceTemplate(
                        instruction="Answer the question based on the information provided in the document given below.\n\n",
                        input_format="Document: {contexts}\nQuestion: {question}",
                        references_field="reference_answers",
                    ),
                }
            ),
        )

        # Select recommended metrics according to your available resources.
        metrics = [
            "metrics.rag.response_generation.recommended.cpu_only.all",
            # "metrics.rag.response_generation.recommended.small_llm.all",
            # "metrics.rag.response_generation.recommended.llmaj_watsonx.all",
            # "metrics.rag.response_generation.recommended.llmaj_rits.all",
            # "metrics.rag.response_generation.recommended.llmaj_azure.all",
        ]

        # Verbalize the dataset using the template.
        dataset = load_dataset(
            card=card,
            template_card_index="simple",
            format=GraniteDocumentsFormat(),

Collaborator comment: set the model name explicitly, in case things change in the future and there are multiple Granite models.

            split="test",
            max_test_instances=10,
            metrics=metrics,
        )

        self.assertListEqual(
            dataset["source"],
            [
                "<|start_of_role|>system<|end_of_role|>Knowledge Cutoff Date: April 2024.\nToday's Date: "
                + today
                + '.\nYou are Granite, developed by IBM. Write the response to the user\'s input by strictly aligning with the facts in the provided documents. If the information needed to answer the question is not available in the documents, inform the user that the question cannot be answered based on the available data.\n\nIn your response, use the symbols <co> and </co> to indicate when a fact comes from a document in the search result, e.g <co>0</co> for a fact from document 0. Afterwards, list all the citations with their corresponding documents in an ordered list.<|end_of_text|>\n<|start_of_role|>documents<|end_of_role|>Document 0\nAustin is the capital of Texas.\nHouston is the largest city in Texas but not the capital of it. <|end_of_text|>\n<|start_of_role|>user<|end_of_role|>What city is the largest in Texas?<|end_of_text|>\n<|start_of_role|>assistant {"citations": true, "length": "long"}<|end_of_role|>',
                "<|start_of_role|>system<|end_of_role|>Knowledge Cutoff Date: April 2024.\nToday's Date: "
                + today
                + '.\nYou are Granite, developed by IBM. Write the response to the user\'s input by strictly aligning with the facts in the provided documents. If the information needed to answer the question is not available in the documents, inform the user that the question cannot be answered based on the available data.\n\nIn your response, use the symbols <co> and </co> to indicate when a fact comes from a document in the search result, e.g <co>0</co> for a fact from document 0. Afterwards, list all the citations with their corresponding documents in an ordered list.<|end_of_text|>\n<|start_of_role|>documents<|end_of_role|>Document 0\nHouston is the largest city in Texas but not the capital of it. <|end_of_text|>\n<|start_of_role|>user<|end_of_role|>What city is the capital of Texas?<|end_of_text|>\n<|start_of_role|>assistant {"citations": true, "length": "long"}<|end_of_role|>',
            ],
        )
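In line with the collaborator's note above, pinning the checkpoint explicitly (the value shown is simply the current class default) would look like:

# Pin the model so the rendered prompt stays stable even if the class
# default changes when future Granite checkpoints are added.
format = GraniteDocumentsFormat(model="ibm-granite/granite-3.1-8b-instruct")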

    def test_system_format(self):
        instruction = "solve the math exercises"
