theopenconversationkit · assouktim · Dec 5, 2024 · Jan 20, 2025
diff --git a/.gitignore b/.gitignore
@@ -46,5 +46,6 @@ scripts/connector-messenger/ngrok.exe
 # Python
 **/requirements.txt
 **/.venv/
+**/.env
 **/.python-version
 gen-ai/orchestrator-server/src/main/python/tock-llm-indexing-tools/**/*.json
diff --git a/gen-ai/orchestrator-server/src/main/python/tock-llm-indexing-tools/README.md b/gen-ai/orchestrator-server/src/main/python/tock-llm-indexing-tools/README.md
@@ -240,28 +240,43 @@ To configure the default vector store, you can use the following environment var
 
 ### generate_dataset.py
 
-Generates a testing dataset based on an input file. The input file should have the correct format (see generate_datset_input.xlsx for sample). The generated dataset can be saved on filesystem, using the --csv-output option, on langsmith, using the --langsmith-dataset-name option, on langfuse using the --langfuse-dataset-name option, or both.
+Dataset Generator: Generate CSV, Langsmith or Langfuse datasets from an Excel file.
 
 ```
 Usage:
-    generate_dataset.py [-v] <input_excel> --range=<s> [--csv-output=<path>] [ --langsmith-dataset-name=<name> ] [ --langfuse-dataset-name=<name> ] [--locale=<locale>] [--no-answer=<na>]
-    generate_dataset.py [-v] <input_excel> --sheet=<n>... [--csv-output=<path>] [ --langsmith-dataset-name=<name> ] [ --langfuse-dataset-name=<name> ] [--locale=<locale>] [--no-answer=<na>]
+    generate_dataset.py [-v] --input-excel=<ie> [--csv-output=<co>] [--langsmith-dataset-name=<lsdn>] [--langfuse-dataset-name=<lfdn>] [--locale=<l>] [--no-answer=<na>]
+
+Description:
+    This script processes an input Excel file to generate a testing dataset. The output can be saved as a CSV file,
+    uploaded to Langsmith, or uploaded to Langfuse. The input Excel file must follow the specified format
+    (see examples/generate_dataset_input.example.xlsx).
 
 Arguments:
-    input_excel path to the input excel file
+    --input-excel=<ie>              Path to the input Excel file. This is a required argument.
 
 Options:
-    --range=<s>                     Range of sheet to be parsed. The expected format is X,Y where X is the first sheet to be included, and Y is the last. Indices are 0-indexed.
-    --sheet=<n>                     Sheet numbers to be parsed. Indices are 0-indexed.
-    --csv-output=<path>             Output path of csv file to be generated.
-    --langsmith-dataset-name=<name> Name of the dataset to be saved on langsmith.
-    --langfuse-dataset-name=<name> Name of the dataset to be saved on langfuse.
-    --locale=<locale>               Locale to be included in de dataset. [default: French]
-    --no-answer=<na>                Label of no_answer to be included in the dataset. [default: NO_RAG_SENTENCE]
-    -h --help                       Show this screen
-    --version                       Show version
-    -v                              Verbose output for debugging (without this option, script will be silent but for errors)
-Generates a testing dataset based on an input file. The input file should have the correct format (see generate_datset_input.xlsx for sample). The generated dataset can be saved on filesystem, using the --csv-output option, on langsmith, using the --langsmith-dataset-name option, on langfuse using the --langfuse-dataset-name option, or both.
+    --csv-output=<co>               Path to save the generated dataset as a CSV file. Optional.
+    --langsmith-dataset-name=<lsdn> Name of the dataset to be uploaded to Langsmith. Optional.
+    --langfuse-dataset-name=<lfdn>  Name of the dataset to be uploaded to Langfuse. Optional.
+    --locale=<l>                    Locale information to include in the dataset. Defaults to "French". Optional.
+    --no-answer=<na>                Label of no-answer to include in the dataset. Defaults to "NO_RAG_SENTENCE". Optional.
+    -v                              Enable verbose output for debugging purposes. If not set, the script runs silently except for errors.
+    -h, --help                      Display this help message and exit.
+    --version                       Display the version of the script.
+
+Examples:
+    1. Generate a CSV dataset:
+        python generate_dataset.py --input-excel=path/to/input.xlsx --csv-output=path/to/output.csv
+
+    2. Generate and upload a dataset to Langfuse:
+        python generate_dataset.py --input-excel=path/to/input.xlsx --langfuse-dataset-name=my_dataset
+
+    3. Generate a CSV dataset with a specified locale and verbose mode:
+        python generate_dataset.py --input-excel=path/to/input.xlsx --csv-output=path/to/output.csv --locale=English -v
+
+Notes:
+    - The input Excel file must adhere to the required format. Check examples/generate_dataset_input.example.xlsx for reference.
+    - You can simultaneously save the dataset locally (as a CSV) and upload it to Langsmith or Langfuse by providing the respective options.
 ```
 
 ### rag_testing_tool.py

diff --git a/...rver/src/main/python/tock-llm-indexing-tools/examples/generate_dataset_input.example.xlsx b/...rver/src/main/python/tock-llm-indexing-tools/examples/generate_dataset_input.example.xlsx
diff --git a/gen-ai/orchestrator-server/src/main/python/tock-llm-indexing-tools/export_run_results.py b/gen-ai/orchestrator-server/src/main/python/tock-llm-indexing-tools/export_run_results.py
@@ -37,6 +37,8 @@
 import requests
 from docopt import docopt
 from dotenv import load_dotenv
+from openpyxl.reader.excel import load_workbook
+from openpyxl.utils import get_column_letter
 
 from generate_dataset import init_langfuse
 
@@ -369,6 +371,33 @@ def check_environment_variables(provider):
             logging.error('Cannot proceed: LANGCHAIN_API_KEY is not defined.')
             sys.exit(1)
 
+def create_excel_output(iterations: list[str], dataset_items, output_file):
+    """Fill the bundled Excel template with exported run results and save it.
+
+    The workbook is loaded from ``examples/generate_dataset_input.example.xlsx``
+    (resolved relative to this script) and its ``Template_Suivi_Recette`` sheet
+    is populated in place before the workbook is written to ``output_file``.
+
+    Layout written by this function:
+      - each iteration owns a band of 6 rows, the first band starting at row 7;
+        its label is written in column B, merged over the whole band;
+      - each dataset item owns one column, the first item going to column I;
+        rows 3, 4 and 5 receive the item's first three values (topic, question,
+        expected response), then the first two rows of every iteration band
+        receive the next pair of values (response, response sources).
+
+    Args:
+        iterations: Iteration labels, one band per entry (the caller passes
+            the session or run ids).
+        dataset_items: Sequence of indexable rows; each row must hold at least
+            ``3 + 2 * len(iterations)`` values laid out as described above.
+        output_file: Destination path of the generated workbook.
+    """
+    first_band_row = 7  # first row of the first iteration band
+    band_height = 6  # number of rows reserved for each iteration
+    first_item_column = 9  # column I
+
+    # Load the template shipped next to this script (no new workbook is created)
+    script_dir = os.path.dirname(os.path.abspath(__file__))
+    wb = load_workbook(os.path.join(script_dir, "examples/generate_dataset_input.example.xlsx"))
+
+    sheet = wb['Template_Suivi_Recette']
+
+    # First row of each iteration band, in iteration order
+    band_rows = [first_band_row + band_height * index for index in range(len(iterations))]
+
+    for band_row, iteration in zip(band_rows, iterations):
+        # Merge the label cell over the whole band (B7:B12 for the first iteration)
+        sheet.merge_cells(start_row=band_row, start_column=2, end_row=band_row + band_height - 1, end_column=2)
+        sheet[f"B{band_row}"] = iteration
+
+    for offset, item in enumerate(dataset_items):
+        col_letter = get_column_letter(first_item_column + offset)
+        sheet[f"{col_letter}3"] = item[0]  # Topic
+        sheet[f"{col_letter}4"] = item[1]  # Question
+        sheet[f"{col_letter}5"] = item[2]  # Expected response
+        for position, band_row in enumerate(band_rows):
+            # Values 3.. of a row come in (response, sources) pairs, one pair per iteration
+            value_index = 3 + 2 * position
+            sheet[f"{col_letter}{band_row}"] = item[value_index]  # Response for this iteration
+            sheet[f"{col_letter}{band_row + 1}"] = item[value_index + 1]  # Response sources for this iteration
+
+    wb.save(output_file)
 
 if __name__ == '__main__':
     start_time = time.time()
@@ -381,34 +410,38 @@ def check_environment_variables(provider):
     logging.basicConfig(level=logging.DEBUG if cli_args['-v'] else logging.INFO, format=log_format)
 
     check_environment_variables(provider)  # Check environment variables based on provider
+    session_or_run_ids = cli_args['<session_or_run_ids>']
 
     csv_lines = []
     if provider == 'langfuse':
         dataset_name = cli_args['<dataset_id_or_name>']
-        runs_names = cli_args['<session_or_run_ids>']
         client = init_langfuse()
         dataset = client.get_dataset(name=dataset_name)
-        csv_lines = [create_csv_header(runs_names, provider, dataset_name)]
+        csv_lines = [create_csv_header(session_or_run_ids, provider, dataset_name)]
         for item in dataset.items:
-            csv_lines.append(append_runs_langfuse(item, runs_names))
+            csv_lines.append(append_runs_langfuse(item, session_or_run_ids))
 
     elif provider == 'langsmith':
         # The LangSmith API base url
         base_url = 'https://api.smith.langchain.com/api/v1'
         # Get LangSmith API key from environment
         _LANGSMITH_API_KEY = os.environ["LANGCHAIN_API_KEY"]
         dataset_id = cli_args['<dataset_id_or_name>']
-        session_ids = cli_args['<session_or_run_ids>']
         dataset_info = get_sessions(dataset_id)
         examples = get_dataset_examples(len(dataset_info), dataset_id)
-        csv_lines = [create_csv_header(session_ids, provider, dataset_id)]
+        csv_lines = [create_csv_header(session_or_run_ids, provider, dataset_id)]
         for example in examples:
-            csv_lines.append(append_runs_langsmith(example, session_ids))
+            csv_lines.append(append_runs_langsmith(example, session_or_run_ids))
 
-    output_csv_file = f"export_run_result_{provider}_{int(time.time())}.csv"
+    output_file = f"export_run_result_{provider}_{int(time.time())}"
+    output_csv_file = f"{output_file}.csv"
     with open(output_csv_file, 'w', newline='') as csv_file:
         writer = csv.writer(csv_file, delimiter='|')
         writer.writerows(csv_lines)
-
     logging.info(f"CSV file successfully generated: {output_csv_file}")
+
+    output_xlsx_file = f"{output_file}.xlsx"
+    create_excel_output(session_or_run_ids, csv_lines[1:], output_xlsx_file)
+    logging.info(f"Xls file successfully generated: {output_xlsx_file}")
+
     logging.info(f"Total execution time: {time.time() - start_time:.2f} seconds")