From 4f196bb6a396cb30c457ae3f1e63ad3bed30ba85 Mon Sep 17 00:00:00 2001
From: Sarath S <47180054+sarathsgvr@users.noreply.github.com>
Date: Wed, 29 Jan 2025 19:15:58 +0530
Subject: [PATCH 1/2] Add CollateInstancesByField operator to group data by specific field (#1546)

* Add GroupByProcessor operator to group data and apply custom functions

Signed-off-by: Sarath-S

* Add CollateInstancesByField operator to group and aggregate data by field name

Signed-off-by: Sarath-S

* Add CollateInstancesByField operator to group and aggregate data by field name

Signed-off-by: Sarath-S

* Add consistency validation and test cases

Signed-off-by: Sarath-S

* fix for test case failures

Signed-off-by: Sarath-S

* fix for test case failures

Signed-off-by: Sarath-S

* Added more tests and clarified error messages

Signed-off-by: Yoav Katz

* More tests

Signed-off-by: Yoav Katz

* Added checks and fixed bug in data classification policy handling

Signed-off-by: Yoav Katz

* Added more tests and error message and remove doc_id default.

Signed-off-by: Yoav Katz

* Improved documentation

Signed-off-by: Yoav Katz

* Fix for handling data_classification_policy None cases

Signed-off-by: Sarath-S

---------

Signed-off-by: Sarath-S
Signed-off-by: Yoav Katz
Co-authored-by: Sarath-S
Co-authored-by: Yoav Katz
Co-authored-by: Yoav Katz <68273864+yoavkatz@users.noreply.github.com>
---
 src/unitxt/operators.py         |  97 ++++++++++++++++++++++++++++
 tests/library/test_operators.py | 110 ++++++++++++++++++++++++++++++++
 2 files changed, 207 insertions(+)

diff --git a/src/unitxt/operators.py b/src/unitxt/operators.py
index 6b9e444a83..999a188efd 100644
--- a/src/unitxt/operators.py
+++ b/src/unitxt/operators.py
@@ -67,6 +67,7 @@
 from .dataclass import NonPositionalField, OptionalField
 from .deprecation_utils import deprecation
 from .dict_utils import dict_delete, dict_get, dict_set, is_subpath
+from .error_utils import UnitxtError
 from .generator_utils import ReusableGenerator
 from .operator import (
     InstanceOperator,
@@ -2243,6 +2244,102 @@ def verify(self):
         )
 
 
+class CollateInstancesByField(StreamOperator):
+    """Groups a list of instances by a specified field, aggregates specified fields into lists, and ensures consistency for all other non-aggregated fields.
+
+    Args:
+        by_field (str): the name of the field to group data by.
+        aggregate_fields (List[str]): the field names to aggregate into lists.
+
+    Returns:
+        A stream of instances grouped and aggregated by the specified field.
+
+    Raises:
+        UnitxtError: If the 'by_field' or any of the 'aggregate_fields' are missing from an instance.
+        ValueError: If non-aggregated fields have inconsistent values within a group.
+
+    Example:
+        Collate the instances based on field "category" and aggregate fields "value" and "id".
+
+        CollateInstancesByField(by_field="category", aggregate_fields=["value", "id"])
+
+        given input:
+        [
+            {"id": 1, "category": "A", "value": 10, "flag": True},
+            {"id": 2, "category": "B", "value": 20, "flag": False},
+            {"id": 3, "category": "A", "value": 30, "flag": True},
+            {"id": 4, "category": "B", "value": 40, "flag": False}
+        ]
+
+        the output is:
+        [
+            {"category": "A", "id": [1, 3], "value": [10, 30], "flag": True},
+            {"category": "B", "id": [2, 4], "value": [20, 40], "flag": False}
+        ]
+
+        Note that the "flag" field is not aggregated, and must be the same
+        in all instances in the same category, or an error is raised.
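+
+        In addition, if instances carry a "data_classification_policy" list,
+        the policies of all instances collated into the same group are merged
+        into a single sorted, de-duplicated list (see the implementation and
+        tests below): grouping a ["public"] instance with a ["private"]
+        instance yields ["private", "public"].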
+    """
+
+    by_field: str = NonPositionalField(required=True)
+    aggregate_fields: List[str] = NonPositionalField(required=True)
+
+    def prepare(self):
+        super().prepare()
+
+    def verify(self):
+        super().verify()
+        if not isinstance(self.by_field, str):
+            raise UnitxtError(
+                f"The 'by_field' value is not a string but '{type(self.by_field)}'"
+            )
+
+        if not isinstance(self.aggregate_fields, list):
+            raise UnitxtError(
+                f"The 'aggregate_fields' value is not a list but '{type(self.aggregate_fields)}'"
+            )
+
+    def process(self, stream: Stream, stream_name: Optional[str] = None):
+        grouped_data = {}
+
+        for instance in stream:
+            if self.by_field not in instance:
+                raise UnitxtError(
+                    f"The field '{self.by_field}' specified by CollateInstancesByField's 'by_field' argument is not found in instance."
+                )
+            for k in self.aggregate_fields:
+                if k not in instance:
+                    raise UnitxtError(
+                        f"The field '{k}' specified in CollateInstancesByField's 'aggregate_fields' argument is not found in instance."
+                    )
+            key = instance[self.by_field]
+
+            if key not in grouped_data:
+                grouped_data[key] = {
+                    k: v for k, v in instance.items() if k not in self.aggregate_fields
+                }
+                # Add empty lists for fields to aggregate
+                for agg_field in self.aggregate_fields:
+                    if agg_field in instance:
+                        grouped_data[key][agg_field] = []
+
+            for k, v in instance.items():
+                # Merge classification policy lists across instances with the same key
+                if k == "data_classification_policy" and instance[k]:
+                    grouped_data[key][k] = sorted(set(grouped_data[key][k] + v))
+                # Check consistency for all non-aggregated fields
+                elif k != self.by_field and k not in self.aggregate_fields:
+                    if k in grouped_data[key] and grouped_data[key][k] != v:
+                        raise ValueError(
+                            f"Inconsistent value for field '{k}' in group '{key}': "
+                            f"'{grouped_data[key][k]}' vs '{v}'. Ensure that all non-aggregated fields in CollateInstancesByField are consistent across all instances."
+ ) + # Aggregate fields + elif k in self.aggregate_fields: + grouped_data[key][k].append(instance[k]) + + yield from grouped_data.values() + + class WikipediaFetcher(FieldOperator): mode: Literal["summary", "text"] = "text" _requirements_list = ["Wikipedia-API"] diff --git a/tests/library/test_operators.py b/tests/library/test_operators.py index af038dd654..9fb9b688b0 100644 --- a/tests/library/test_operators.py +++ b/tests/library/test_operators.py @@ -12,6 +12,7 @@ ApplyStreamOperatorsField, CastFields, CollateInstances, + CollateInstancesByField, Copy, Deduplicate, DeterministicBalancer, @@ -2653,6 +2654,115 @@ def test_collate_instance(self): tester=self, ) + def test_collate_instances_by_field(self): + inputs = [ + {"id": 1, "category": "A", "value": 10}, + {"id": 1, "category": "A", "value": 20}, + {"id": 2, "category": "B", "value": 30}, + {"id": 2, "category": "B", "value": 40}, + ] + + targets = [ + {"category": "A", "id": 1, "value": [10, 20]}, + {"category": "B", "id": 2, "value": [30, 40]}, + ] + + check_operator( + operator=CollateInstancesByField( + by_field="category", aggregate_fields=["value"] + ), + inputs=inputs, + targets=targets, + tester=self, + ) + + inputs = [ + { + "id": 1, + "category": "A", + "value": 10, + "data_classification_policy": ["public"], + }, + { + "id": 2, + "category": "A", + "value": 20, + "data_classification_policy": ["public"], + }, + { + "id": 3, + "category": "B", + "value": 30, + "data_classification_policy": ["public"], + }, + { + "id": 4, + "category": "B", + "value": 40, + "data_classification_policy": ["private"], + }, + ] + + targets = [ + { + "category": "A", + "id": [1, 2], + "value": [10, 20], + "data_classification_policy": ["public"], + }, + { + "category": "B", + "id": [3, 4], + "value": [30, 40], + "data_classification_policy": ["private", "public"], + }, + ] + + check_operator( + operator=CollateInstancesByField( + by_field="category", aggregate_fields=["value", "id"] + ), + inputs=inputs, + targets=targets, + tester=self, + ) + + exception_texts = [ + "Inconsistent value for field 'id' in group 'A': '1' vs '2'. Ensure that all non-aggregated fields in CollateInstancesByField are consistent across all instances.", + ] + check_operator_exception( + operator=CollateInstancesByField( + by_field="category", aggregate_fields=["value"] + ), + inputs=inputs, + exception_texts=exception_texts, + tester=self, + ) + + exception_texts = [ + "The field 'not_exist' specified by CollateInstancesByField's 'by_field' argument is not found in instance." + ] + check_operator_exception( + operator=CollateInstancesByField( + by_field="not_exist", aggregate_fields=["value"] + ), + inputs=inputs, + exception_texts=exception_texts, + tester=self, + ) + + exception_texts = [ + "The field 'not_exist' specified in CollateInstancesByField's 'aggregate_fields' argument is not found in instance." 
+ ] + check_operator_exception( + operator=CollateInstancesByField( + by_field="category", aggregate_fields=["id", "value", "not_exist"] + ), + inputs=inputs, + exception_texts=exception_texts, + tester=self, + ) + class TestApplyMetric(UnitxtTestCase): def _test_apply_metric( From bc65c5c864623287bb14b6394526519785e5d2ce Mon Sep 17 00:00:00 2001 From: ShirApp <58909189+ShirApp@users.noreply.github.com> Date: Wed, 29 Jan 2025 16:59:35 +0200 Subject: [PATCH 2/2] Fix prompts table benchmark (#1565) * add metrics to scigen Signed-off-by: ShirApp * improve the instructions in table benchmark datasets + add postprocessing for using first line Signed-off-by: ShirApp * rerun prepare Signed-off-by: ShirApp --------- Signed-off-by: ShirApp --- prepare/cards/fin_qa.py | 7 +++++-- prepare/cards/scigen.py | 2 +- prepare/cards/tab_fact.py | 6 ++++-- prepare/cards/tablebench_data_analysis.py | 5 ++++- prepare/cards/tablebench_fact_checking.py | 5 ++++- prepare/cards/tablebench_numerical_reasoning.py | 5 ++++- prepare/cards/turl_col_type.py | 4 +++- prepare/cards/wikitq.py | 6 ++++-- prepare/tasks/generation.py | 2 +- prepare/templates/generation/generation.py | 3 ++- prepare/templates/qa/with_context.py | 3 ++- src/unitxt/catalog/cards/fin_qa.json | 6 ++++-- src/unitxt/catalog/cards/scigen.json | 2 +- src/unitxt/catalog/cards/tab_fact.json | 2 +- src/unitxt/catalog/cards/tablebench_data_analysis.json | 3 ++- src/unitxt/catalog/cards/tablebench_fact_checking.json | 3 ++- .../catalog/cards/tablebench_numerical_reasoning.json | 3 ++- src/unitxt/catalog/cards/turl_col_type.json | 2 +- src/unitxt/catalog/cards/wikitq.json | 3 ++- src/unitxt/catalog/tasks/generation/from_pair.json | 2 +- .../catalog/templates/generation/from_pair/default.json | 2 +- src/unitxt/catalog/templates/qa/with_context/qtsumm.json | 2 +- 22 files changed, 52 insertions(+), 26 deletions(-) diff --git a/prepare/cards/fin_qa.py b/prepare/cards/fin_qa.py index 60c3bb3845..7c25dca954 100644 --- a/prepare/cards/fin_qa.py +++ b/prepare/cards/fin_qa.py @@ -50,7 +50,8 @@ ["table-average", "table header", "number", "the average of one table row"], ["table-max", "table header", "number", "the maximum number of one table row"], ["table-min", "table header", "number", "the minimum number of one table row"]] - Answer with only the program, without any additional explanation. + \nAnswer with only the program, without any additional explanation or introductory text. + \nHere are some input-output examples. Read the examples carefully to figure out the mapping. The output of the last example is not given, and your job is to figure out what it is. 
""", input_format="""Pre-table text: {pre_text} Table: {table} @@ -59,7 +60,9 @@ Program: """, output_format="{program_re}", - postprocessors=[], + postprocessors=[ + "processors.take_first_non_empty_line", + ], ), ], __description__=( diff --git a/prepare/cards/scigen.py b/prepare/cards/scigen.py index 7ccdfa03ed..9f2c2f4f29 100644 --- a/prepare/cards/scigen.py +++ b/prepare/cards/scigen.py @@ -28,7 +28,7 @@ } ), ], - task="tasks.generation.from_pair[metrics=[metrics.llm_as_judge.rating.llama_3_1_70b_instruct_cross_provider_template_table2text_single_turn_with_reference]]", + task="tasks.generation.from_pair[metrics=[metrics.rouge,metrics.bert_score.bert_base_uncased,metrics.bleu,metrics.meteor,metrics.llm_as_judge.rating.llama_3_1_70b_instruct_cross_provider_template_table2text_single_turn_with_reference]]", templates=[ "templates.generation.from_pair.default[postprocessors=[processors.lower_case]]" ], diff --git a/prepare/cards/tab_fact.py b/prepare/cards/tab_fact.py index d9c4ba9a5a..fdba6a4087 100644 --- a/prepare/cards/tab_fact.py +++ b/prepare/cards/tab_fact.py @@ -32,7 +32,9 @@ task="tasks.classification.multi_class.relation", templates=[ InputOutputTemplate( - instruction="Given a {text_a_type} and {text_b_type} classify the {type_of_relation} of the {text_b_type} to one of {classes}. You should only output the result. Do not add any explanation or other information.", + instruction="Given a {text_a_type} and {text_b_type} classify the {type_of_relation} of the {text_b_type} to one of {classes}." + + "\nOutput only the final answer without any explanations, extra information, or introductory text." + + "\nHere are some input-output examples. Read the examples carefully to figure out the mapping. The output of the last example is not given, and your job is to figure out what it is.", input_format="{text_a_type}: {text_a}\n{text_b_type}: {text_b} ", output_format="{label}", postprocessors=[ @@ -52,5 +54,5 @@ ), ) - test_card(card) + test_card(card, num_demos=2, demos_pool_size=20) add_to_catalog(card, "cards.tab_fact", overwrite=True) diff --git a/prepare/cards/tablebench_data_analysis.py b/prepare/cards/tablebench_data_analysis.py index fff42a7509..7916377aef 100644 --- a/prepare/cards/tablebench_data_analysis.py +++ b/prepare/cards/tablebench_data_analysis.py @@ -51,11 +51,14 @@ ), templates=[ InputOutputTemplate( - instruction="You are a table analyst. Your task is to answer questions based on the table content. {answer_formatter}", + instruction="You are a table analyst. Your task is to answer questions based on the table content. {answer_formatter}" + + "\nOutput only the final answer without any explanations, extra information, or introductory text." + + "\nHere are some input-output examples. Read the examples carefully to figure out the mapping. The output of the last example is not given, and your job is to figure out what it is.", input_format="{context_type}: {context} \nQuestion: {question}", target_prefix="Final Answer: ", output_format="{answers}", postprocessors=[ + "processors.take_first_non_empty_line", "processors.lower_case", "processors.remove_punctuations", "processors.remove_articles", diff --git a/prepare/cards/tablebench_fact_checking.py b/prepare/cards/tablebench_fact_checking.py index 31a462f8f9..fe8747d733 100644 --- a/prepare/cards/tablebench_fact_checking.py +++ b/prepare/cards/tablebench_fact_checking.py @@ -56,11 +56,14 @@ ), templates=[ InputOutputTemplate( - instruction="You are a table analyst. 
Your task is to answer questions based on the table content. {answer_formatter}", + instruction="You are a table analyst. Your task is to answer questions based on the table content. {answer_formatter}" + + "\nOutput only the final answer without any explanations, extra information, or introductory text." + + "\nHere are some input-output examples. Read the examples carefully to figure out the mapping. The output of the last example is not given, and your job is to figure out what it is.", input_format="{context_type}: {context} \nQuestion: {question}", target_prefix="Final Answer: ", output_format="{answers}", postprocessors=[ + "processors.take_first_non_empty_line", "processors.lower_case", "processors.remove_punctuations", "processors.remove_articles", diff --git a/prepare/cards/tablebench_numerical_reasoning.py b/prepare/cards/tablebench_numerical_reasoning.py index ff930264e1..11c55dc23b 100644 --- a/prepare/cards/tablebench_numerical_reasoning.py +++ b/prepare/cards/tablebench_numerical_reasoning.py @@ -51,11 +51,14 @@ ), templates=[ InputOutputTemplate( - instruction="You are a table analyst. Your task is to answer questions based on the table content. {answer_formatter}", + instruction="You are a table analyst. Your task is to answer questions based on the table content. {answer_formatter}" + + "\nOutput only the final answer without any explanations, extra information, or introductory text." + + "\nHere are some input-output examples. Read the examples carefully to figure out the mapping. The output of the last example is not given, and your job is to figure out what it is.", input_format="{context_type}: {context} \nQuestion: {question}", target_prefix="Final Answer: ", output_format="{answers}", postprocessors=[ + "processors.take_first_non_empty_line", "processors.lower_case", "processors.remove_punctuations", "processors.remove_articles", diff --git a/prepare/cards/turl_col_type.py b/prepare/cards/turl_col_type.py index 1723095644..e283bfb3a3 100644 --- a/prepare/cards/turl_col_type.py +++ b/prepare/cards/turl_col_type.py @@ -44,7 +44,9 @@ InputOutputTemplate( instruction=""" This is a column type annotation task. The goal of this task is to choose the correct types for one selected column of the given input table from the given candidate types. The Wikipedia page, section and table caption (if any) provide important information for choosing the correct column types. - Candidate Types: {vocab} \nOutput only the correct column types for the mentioned from the candidate types. + Candidate Types: {vocab} + \nOutput only the correct column types from the candidate list for the mentioned columns. Do not include any explanations, extra information, or introductory text—only the final answer. + \nHere are some input-output examples. Read the examples carefully to figure out the mapping. The output of the last example is not given, and your job is to figure out what it is. """.strip(), input_format="\nColumn name: {colname}" "\nPage Title: {page_title} " diff --git a/prepare/cards/wikitq.py b/prepare/cards/wikitq.py index 424bd1e60a..05644f5915 100644 --- a/prepare/cards/wikitq.py +++ b/prepare/cards/wikitq.py @@ -25,10 +25,12 @@ task="tasks.qa.extractive[metrics=[metrics.f1_strings, metrics.unsorted_list_exact_match]]", templates=[ MultiReferenceTemplate( - instruction="Answer the question based on the provided table. You should only output the final answer. 
Do not add any explanation or other information.", - input_format="\nQuestion: {question}\nTable: {context}\nAnswer: ", + instruction="Answer the question based on the provided table. Extract and output only the final answer—the exact phrase or data from the table that directly answers the question. Do not include any alterations, explanations, or introductory text" + + "\nHere are some input-output examples. Read the examples carefully to figure out the mapping. The output of the last example is not given, and your job is to figure out what it is.", + input_format="\nQuestion: {question}" "\nTable: {context}" "\nAnswer: ", references_field="answers", postprocessors=[ + "processors.take_first_non_empty_line", "processors.to_list_by_comma_space", "processors.str_to_float_format", ], diff --git a/prepare/tasks/generation.py b/prepare/tasks/generation.py index 09eeb81d2e..5b40acef5d 100644 --- a/prepare/tasks/generation.py +++ b/prepare/tasks/generation.py @@ -29,9 +29,9 @@ reference_fields={"output": str}, prediction_type=str, metrics=[ - "metrics.bleu", "metrics.rouge", "metrics.bert_score.bert_base_uncased", + "metrics.bleu", "metrics.meteor", ], augmentable_inputs=["input_a", "input_b"], diff --git a/prepare/templates/generation/generation.py b/prepare/templates/generation/generation.py index 9983175bcb..3e00596560 100644 --- a/prepare/templates/generation/generation.py +++ b/prepare/templates/generation/generation.py @@ -33,7 +33,8 @@ add_to_catalog( InputOutputTemplate( - instruction="Given the following {type_of_input_a} and {type_of_input_b}, generate the corresponding {type_of_output}.", + instruction="Given the following {type_of_input_a} and {type_of_input_b}, generate the corresponding {type_of_output}." + + "\nHere are some input-output examples. Read the examples carefully to figure out the mapping. The output of the last example is not given, and your job is to figure out what it is.", input_format="{type_of_input_a}: \n{input_a} \n{type_of_input_b}: \n{input_b} \n{type_of_output}:", output_format="{output}", postprocessors=[ diff --git a/prepare/templates/qa/with_context.py b/prepare/templates/qa/with_context.py index cc926904b1..70fac33893 100644 --- a/prepare/templates/qa/with_context.py +++ b/prepare/templates/qa/with_context.py @@ -92,7 +92,8 @@ add_to_catalog( MultiReferenceTemplate( - instruction="Using the information from the {context_type} given below, summarize a paragraph-long response to the following user query.", + instruction="Using the information from the {context_type} given below, summarize a paragraph-long response to the following user query." + + "\nHere are some input-output examples. Read the examples carefully to figure out the mapping. The output of the last example is not given, and your job is to figure out what it is.", input_format="{context_type}:\n{context}\nQuery:\n{question}", output_format="{answers}", target_prefix="Answer:\n", diff --git a/src/unitxt/catalog/cards/fin_qa.json b/src/unitxt/catalog/cards/fin_qa.json index 7394e589a0..ba9d16cad3 100644 --- a/src/unitxt/catalog/cards/fin_qa.json +++ b/src/unitxt/catalog/cards/fin_qa.json @@ -59,10 +59,12 @@ "templates": [ { "__type__": "input_output_template", - "instruction": "Presented with a financial report consisting of textual contents and a structured table, given a question, generate the reasoning program in the domain specific language (DSL) that will be executed to get the answer. \nThe DSL consists of mathematical operations and table operations as executable programs. 
The program consists of a sequence of operations. Each operation takes a list of arguments. \nThere are 6 mathematical operations: add, subtract, multiply, divide, greater, exp, and 4 table aggregation operations table-max, table-min, table-sum, table-average, that apply aggregation operations on table rows. The mathematical operations take arguments of either numbers from the given reports, or a numerical result from a previous step.\nThe table operations take arguments of table row names. We use the special token #n to denote the result from the nth step. \nFor example, in the example \"divide(9413, 20.01), divide(8249, 9.48), subtract(#0, #1)\", the program consists of 3 steps; The first and the second division steps take arguments from the table and the text, respectively, then the third step subtracts the results from the two previous steps.\n Definitions of all operations:\n [[\"Name\", \"Arguments\", \"Output\", \"Description\"],\n [\"add\", \"number1, number2\", \"number\", \"add two numbers: number1 + number2\"],\n [\"subtract\", \"number1, number2\", \"number\", \"subtract two numbers: number1 - number2\"],\n [\"multiply\", \"number1, number2\", \"number\", \"multiply two numbers: number1 * number2\"],\n [\"divide\", \"number1, number2\", \"number\", \"multiply two numbers: number1 / number2\"],\n [\"exp\", \"number1, number2\", \"number\", \"exponential: number1 ^ number2\"],\n [\"greater\", \"number1, number2\", \"bool\", \"comparison: number1 > number2\"],\n [\"table-sum\", \"table header\", \"number\", \"the summation of one table row\"],\n [\"table-average\", \"table header\", \"number\", \"the average of one table row\"],\n [\"table-max\", \"table header\", \"number\", \"the maximum number of one table row\"],\n [\"table-min\", \"table header\", \"number\", \"the minimum number of one table row\"]]\n Answer with only the program, without any additional explanation.\n ", + "instruction": "Presented with a financial report consisting of textual contents and a structured table, given a question, generate the reasoning program in the domain specific language (DSL) that will be executed to get the answer. \nThe DSL consists of mathematical operations and table operations as executable programs. The program consists of a sequence of operations. Each operation takes a list of arguments. \nThere are 6 mathematical operations: add, subtract, multiply, divide, greater, exp, and 4 table aggregation operations table-max, table-min, table-sum, table-average, that apply aggregation operations on table rows. The mathematical operations take arguments of either numbers from the given reports, or a numerical result from a previous step.\nThe table operations take arguments of table row names. We use the special token #n to denote the result from the nth step. 
\nFor example, in the example \"divide(9413, 20.01), divide(8249, 9.48), subtract(#0, #1)\", the program consists of 3 steps; The first and the second division steps take arguments from the table and the text, respectively, then the third step subtracts the results from the two previous steps.\n Definitions of all operations:\n [[\"Name\", \"Arguments\", \"Output\", \"Description\"],\n [\"add\", \"number1, number2\", \"number\", \"add two numbers: number1 + number2\"],\n [\"subtract\", \"number1, number2\", \"number\", \"subtract two numbers: number1 - number2\"],\n [\"multiply\", \"number1, number2\", \"number\", \"multiply two numbers: number1 * number2\"],\n [\"divide\", \"number1, number2\", \"number\", \"multiply two numbers: number1 / number2\"],\n [\"exp\", \"number1, number2\", \"number\", \"exponential: number1 ^ number2\"],\n [\"greater\", \"number1, number2\", \"bool\", \"comparison: number1 > number2\"],\n [\"table-sum\", \"table header\", \"number\", \"the summation of one table row\"],\n [\"table-average\", \"table header\", \"number\", \"the average of one table row\"],\n [\"table-max\", \"table header\", \"number\", \"the maximum number of one table row\"],\n [\"table-min\", \"table header\", \"number\", \"the minimum number of one table row\"]]\n \nAnswer with only the program, without any additional explanation or introductory text.\n \nHere are some input-output examples. Read the examples carefully to figure out the mapping. The output of the last example is not given, and your job is to figure out what it is.\n ", "input_format": "Pre-table text: {pre_text}\n Table: {table}\n Post-table text: {post_text}\n Question: {question}\n Program:\n ", "output_format": "{program_re}", - "postprocessors": [] + "postprocessors": [ + "processors.take_first_non_empty_line" + ] } ], "__description__": "FINQA is an expert-annotated QA dataset that aims to tackle numerical reasoning over real-world financial data.", diff --git a/src/unitxt/catalog/cards/scigen.json b/src/unitxt/catalog/cards/scigen.json index 1cf59681ce..ca35ff667c 100644 --- a/src/unitxt/catalog/cards/scigen.json +++ b/src/unitxt/catalog/cards/scigen.json @@ -39,7 +39,7 @@ } } ], - "task": "tasks.generation.from_pair[metrics=[metrics.llm_as_judge.rating.llama_3_1_70b_instruct_cross_provider_template_table2text_single_turn_with_reference]]", + "task": "tasks.generation.from_pair[metrics=[metrics.rouge,metrics.bert_score.bert_base_uncased,metrics.bleu,metrics.meteor,metrics.llm_as_judge.rating.llama_3_1_70b_instruct_cross_provider_template_table2text_single_turn_with_reference]]", "templates": [ "templates.generation.from_pair.default[postprocessors=[processors.lower_case]]" ], diff --git a/src/unitxt/catalog/cards/tab_fact.json b/src/unitxt/catalog/cards/tab_fact.json index e421bd07ac..28c3631fed 100644 --- a/src/unitxt/catalog/cards/tab_fact.json +++ b/src/unitxt/catalog/cards/tab_fact.json @@ -42,7 +42,7 @@ "templates": [ { "__type__": "input_output_template", - "instruction": "Given a {text_a_type} and {text_b_type} classify the {type_of_relation} of the {text_b_type} to one of {classes}. You should only output the result. Do not add any explanation or other information.", + "instruction": "Given a {text_a_type} and {text_b_type} classify the {type_of_relation} of the {text_b_type} to one of {classes}.\nOutput only the final answer without any explanations, extra information, or introductory text.\nHere are some input-output examples. Read the examples carefully to figure out the mapping. 
The output of the last example is not given, and your job is to figure out what it is.", "input_format": "{text_a_type}: {text_a}\n{text_b_type}: {text_b} ", "output_format": "{label}", "postprocessors": [ diff --git a/src/unitxt/catalog/cards/tablebench_data_analysis.json b/src/unitxt/catalog/cards/tablebench_data_analysis.json index edee8a390b..0b22c5ccca 100644 --- a/src/unitxt/catalog/cards/tablebench_data_analysis.json +++ b/src/unitxt/catalog/cards/tablebench_data_analysis.json @@ -84,11 +84,12 @@ "templates": [ { "__type__": "input_output_template", - "instruction": "You are a table analyst. Your task is to answer questions based on the table content. {answer_formatter}", + "instruction": "You are a table analyst. Your task is to answer questions based on the table content. {answer_formatter}\nOutput only the final answer without any explanations, extra information, or introductory text.\nHere are some input-output examples. Read the examples carefully to figure out the mapping. The output of the last example is not given, and your job is to figure out what it is.", "input_format": "{context_type}: {context} \nQuestion: {question}", "target_prefix": "Final Answer: ", "output_format": "{answers}", "postprocessors": [ + "processors.take_first_non_empty_line", "processors.lower_case", "processors.remove_punctuations", "processors.remove_articles", diff --git a/src/unitxt/catalog/cards/tablebench_fact_checking.json b/src/unitxt/catalog/cards/tablebench_fact_checking.json index 546d620a4e..a5cd12bda3 100644 --- a/src/unitxt/catalog/cards/tablebench_fact_checking.json +++ b/src/unitxt/catalog/cards/tablebench_fact_checking.json @@ -84,11 +84,12 @@ "templates": [ { "__type__": "input_output_template", - "instruction": "You are a table analyst. Your task is to answer questions based on the table content. {answer_formatter}", + "instruction": "You are a table analyst. Your task is to answer questions based on the table content. {answer_formatter}\nOutput only the final answer without any explanations, extra information, or introductory text.\nHere are some input-output examples. Read the examples carefully to figure out the mapping. The output of the last example is not given, and your job is to figure out what it is.", "input_format": "{context_type}: {context} \nQuestion: {question}", "target_prefix": "Final Answer: ", "output_format": "{answers}", "postprocessors": [ + "processors.take_first_non_empty_line", "processors.lower_case", "processors.remove_punctuations", "processors.remove_articles", diff --git a/src/unitxt/catalog/cards/tablebench_numerical_reasoning.json b/src/unitxt/catalog/cards/tablebench_numerical_reasoning.json index cb5c020855..a88e2262c8 100644 --- a/src/unitxt/catalog/cards/tablebench_numerical_reasoning.json +++ b/src/unitxt/catalog/cards/tablebench_numerical_reasoning.json @@ -84,11 +84,12 @@ "templates": [ { "__type__": "input_output_template", - "instruction": "You are a table analyst. Your task is to answer questions based on the table content. {answer_formatter}", + "instruction": "You are a table analyst. Your task is to answer questions based on the table content. {answer_formatter}\nOutput only the final answer without any explanations, extra information, or introductory text.\nHere are some input-output examples. Read the examples carefully to figure out the mapping. 
The output of the last example is not given, and your job is to figure out what it is.", "input_format": "{context_type}: {context} \nQuestion: {question}", "target_prefix": "Final Answer: ", "output_format": "{answers}", "postprocessors": [ + "processors.take_first_non_empty_line", "processors.lower_case", "processors.remove_punctuations", "processors.remove_articles", diff --git a/src/unitxt/catalog/cards/turl_col_type.json b/src/unitxt/catalog/cards/turl_col_type.json index aaa55c3493..c225332482 100644 --- a/src/unitxt/catalog/cards/turl_col_type.json +++ b/src/unitxt/catalog/cards/turl_col_type.json @@ -38,7 +38,7 @@ "templates": [ { "__type__": "input_output_template", - "instruction": "This is a column type annotation task. The goal of this task is to choose the correct types for one selected column of the given input table from the given candidate types. The Wikipedia page, section and table caption (if any) provide important information for choosing the correct column types.\n Candidate Types: {vocab} \nOutput only the correct column types for the mentioned from the candidate types.", + "instruction": "This is a column type annotation task. The goal of this task is to choose the correct types for one selected column of the given input table from the given candidate types. The Wikipedia page, section and table caption (if any) provide important information for choosing the correct column types.\n Candidate Types: {vocab}\n \nOutput only the correct column types from the candidate list for the mentioned columns. Do not include any explanations, extra information, or introductory text—only the final answer.\n \nHere are some input-output examples. Read the examples carefully to figure out the mapping. The output of the last example is not given, and your job is to figure out what it is.", "input_format": "\nColumn name: {colname}\nPage Title: {page_title} \nSection Title: {section_title} \nTable caption: {table_caption} \nTable: \n{table} \nSelected Column: {colname} ", "output_format": "{annotations}", "postprocessors": [ diff --git a/src/unitxt/catalog/cards/wikitq.json b/src/unitxt/catalog/cards/wikitq.json index 652c77e0a0..5e15ff954d 100644 --- a/src/unitxt/catalog/cards/wikitq.json +++ b/src/unitxt/catalog/cards/wikitq.json @@ -37,10 +37,11 @@ "templates": [ { "__type__": "multi_reference_template", - "instruction": "Answer the question based on the provided table. You should only output the final answer. Do not add any explanation or other information.", + "instruction": "Answer the question based on the provided table. Extract and output only the final answer—the exact phrase or data from the table that directly answers the question. Do not include any alterations, explanations, or introductory text\nHere are some input-output examples. Read the examples carefully to figure out the mapping. 
The output of the last example is not given, and your job is to figure out what it is.", "input_format": "\nQuestion: {question}\nTable: {context}\nAnswer: ", "references_field": "answers", "postprocessors": [ + "processors.take_first_non_empty_line", "processors.to_list_by_comma_space", "processors.str_to_float_format" ] diff --git a/src/unitxt/catalog/tasks/generation/from_pair.json b/src/unitxt/catalog/tasks/generation/from_pair.json index 8474b002b4..392860655d 100644 --- a/src/unitxt/catalog/tasks/generation/from_pair.json +++ b/src/unitxt/catalog/tasks/generation/from_pair.json @@ -12,9 +12,9 @@ }, "prediction_type": "str", "metrics": [ - "metrics.bleu", "metrics.rouge", "metrics.bert_score.bert_base_uncased", + "metrics.bleu", "metrics.meteor" ], "augmentable_inputs": [ diff --git a/src/unitxt/catalog/templates/generation/from_pair/default.json b/src/unitxt/catalog/templates/generation/from_pair/default.json index 218406ac20..791072927a 100644 --- a/src/unitxt/catalog/templates/generation/from_pair/default.json +++ b/src/unitxt/catalog/templates/generation/from_pair/default.json @@ -1,6 +1,6 @@ { "__type__": "input_output_template", - "instruction": "Given the following {type_of_input_a} and {type_of_input_b}, generate the corresponding {type_of_output}.", + "instruction": "Given the following {type_of_input_a} and {type_of_input_b}, generate the corresponding {type_of_output}.\nHere are some input-output examples. Read the examples carefully to figure out the mapping. The output of the last example is not given, and your job is to figure out what it is.", "input_format": "{type_of_input_a}: \n{input_a} \n{type_of_input_b}: \n{input_b} \n{type_of_output}:", "output_format": "{output}", "postprocessors": [ diff --git a/src/unitxt/catalog/templates/qa/with_context/qtsumm.json b/src/unitxt/catalog/templates/qa/with_context/qtsumm.json index c62164dddf..172d1644d7 100644 --- a/src/unitxt/catalog/templates/qa/with_context/qtsumm.json +++ b/src/unitxt/catalog/templates/qa/with_context/qtsumm.json @@ -1,6 +1,6 @@ { "__type__": "multi_reference_template", - "instruction": "Using the information from the {context_type} given below, summarize a paragraph-long response to the following user query.", + "instruction": "Using the information from the {context_type} given below, summarize a paragraph-long response to the following user query.\nHere are some input-output examples. Read the examples carefully to figure out the mapping. The output of the last example is not given, and your job is to figure out what it is.", "input_format": "{context_type}:\n{context}\nQuery:\n{question}", "output_format": "{answers}", "target_prefix": "Answer:\n",