add materials

pfnet-research · Jun 24, 2024 · 0966dbb · 0966dbb
commit 0966dbb
Show file tree

Hide file tree

Showing 38 changed files with 10,949 additions and 0 deletions.
diff --git a/.env.tmp b/.env.tmp
@@ -0,0 +1,5 @@
+OPENAI_API_BASE=
+OPENAI_API_TYPE=openai/azure
+OPENAI_API_VERSION=
+OPENAI_API_KEY=
+ANTHROPIC_API_KEY=
diff --git a/.gitignore b/.gitignore
@@ -0,0 +1,166 @@
+# Byte-compiled / optimized / DLL files
+__pycache__/
+*.py[cod]
+*$py.class
+
+# C extensions
+*.so
+
+# Distribution / packaging
+.Python
+build/
+develop-eggs/
+dist/
+downloads/
+eggs/
+.eggs/
+lib/
+lib64/
+parts/
+sdist/
+var/
+wheels/
+share/python-wheels/
+*.egg-info/
+.installed.cfg
+*.egg
+MANIFEST
+
+# PyInstaller
+#  Usually these files are written by a python script from a template
+#  before PyInstaller builds the exe, so as to inject date/other infos into it.
+*.manifest
+*.spec
+
+# Installer logs
+pip-log.txt
+pip-delete-this-directory.txt
+
+# Unit test / coverage reports
+htmlcov/
+.tox/
+.nox/
+.coverage
+.coverage.*
+.cache
+nosetests.xml
+coverage.xml
+*.cover
+*.py,cover
+.hypothesis/
+.pytest_cache/
+cover/
+
+# Translations
+*.mo
+*.pot
+
+# Django stuff:
+*.log
+local_settings.py
+db.sqlite3
+db.sqlite3-journal
+
+# Flask stuff:
+instance/
+.webassets-cache
+
+# Scrapy stuff:
+.scrapy
+
+# Sphinx documentation
+docs/_build/
+
+# PyBuilder
+.pybuilder/
+target/
+
+# Jupyter Notebook
+.ipynb_checkpoints
+
+# IPython
+profile_default/
+ipython_config.py
+
+# pyenv
+#   For a library or package, you might want to ignore these files since the code is
+#   intended to run in multiple environments; otherwise, check them in:
+# .python-version
+
+# pipenv
+#   According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
+#   However, in case of collaboration, if having platform-specific dependencies or dependencies
+#   having no cross-platform support, pipenv may install dependencies that don't work, or not
+#   install all needed dependencies.
+#Pipfile.lock
+
+# poetry
+#   Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control.
+#   This is especially recommended for binary packages to ensure reproducibility, and is more
+#   commonly ignored for libraries.
+#   https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control
+#poetry.lock
+
+# pdm
+#   Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control.
+#pdm.lock
+#   pdm stores project-wide configurations in .pdm.toml, but it is recommended to not include it
+#   in version control.
+#   https://pdm.fming.dev/latest/usage/project/#working-with-version-control
+.pdm.toml
+.pdm-python
+.pdm-build/
+
+# PEP 582; used by e.g. github.com/David-OConnor/pyflow and github.com/pdm-project/pdm
+__pypackages__/
+
+# Celery stuff
+celerybeat-schedule
+celerybeat.pid
+
+# SageMath parsed files
+*.sage.py
+
+# Environments
+.env
+.venv
+env/
+venv/
+ENV/
+env.bak/
+venv.bak/
+
+# Spyder project settings
+.spyderproject
+.spyproject
+
+# Rope project settings
+.ropeproject
+
+# mkdocs documentation
+/site
+
+# mypy
+.mypy_cache/
+.dmypy.json
+dmypy.json
+
+# Pyre type checker
+.pyre/
+
+# pytype static type analyzer
+.pytype/
+
+# Cython debug symbols
+cython_debug/
+
+# PyCharm
+#  JetBrains specific template is maintained in a separate JetBrains.gitignore that can
+#  be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore
+#  and can be added to the global gitignore or merged into this file.  For a more nuclear
+#  option (not recommended) you can uncomment the following to ignore the entire idea folder.
+#.idea/
+.idea
+.vscode
+.env
+poetry.lock
diff --git a/LICENSE b/LICENSE
@@ -0,0 +1,28 @@
+All files in this repository except the following files/directories are licensed under the MIT License.
+ - data/pfmt_bench/question.json ([Anthropic](https://www.anthropic.com/) license)
+ - data/pfmt_bench/reference_answer/gpt-40+human.jsonl ([OpenAI](https://openai.com/) license)
+ - data/pfmt_bench/model_answer/ (depends on each model's license)
+ - data/pfmt_bench/model_judgment/ (depends on each model's license)
+
+====================
+MIT License
+
+Copyright (c) 2024 Preferred Networks, Inc.
+
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in all
+copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+SOFTWARE.
diff --git a/Makefile b/Makefile
@@ -0,0 +1,25 @@
+# Developer tasks; every tool below is invoked through `poetry run`.
+RUN := poetry run
+
+.PHONY: check lint lint-black lint-isort lint-flake8 mypy
+.PHONY: format format-black format-isort
+
+# Aggregates. `check` stays the first target, so it remains the default goal.
+check: lint mypy
+lint: lint-black lint-isort lint-flake8
+format: format-black format-isort
+
+# Checks: black and isort are run with --check here.
+lint-black:
+	$(RUN) black --check --diff --quiet .
+lint-isort:
+	$(RUN) isort --check --quiet .
+lint-flake8:
+	$(RUN) pflake8 .
+mypy:
+	$(RUN) mypy .
+# Formatters: the same black and isort invocations without --check.
+format-black:
+	$(RUN) black --quiet .
+format-isort:
+	$(RUN) isort --quiet .
diff --git a/README.md b/README.md
@@ -0,0 +1,68 @@
+# pfmt-bench-fin-ja: Preferred Multi-turn Benchmark for Finance in Japanese
+
+This is a benchmark measuring the generation quality of LLMs for financial conversations in Japanese. The benchmark consists of 360 dialogues, each containing 2 turns. There are 12 types of tasks: writing, roleplay, knowledge, extraction, reasoning, math, coding, idea, translation, ethics, trustworthiness, and ESGs.
+This benchmark aims to evaluate the generation quality of LLMs in financial conversations in Japanese.
+
+Originally, [MT-bench](https://github.com/lm-sys/FastChat/tree/main/fastchat/llm_judge) contains writing, roleplay, reasoning, math, coding, extraction, STEM, and humanities tasks.
+Instead of the STEM and humanities tasks, we added a knowledge task for financial conversations.
+In addition, we newly introduced idea, translation, ethics, trustworthiness, and ESGs tasks.
+
+Answers are evaluated on a 10-grade scale, and the evaluation is done by gpt-4o.
+
+<img src="top_image.png" width="80%"></img>
+
+## How to use
+The usages are almost the same as [MT-bench](https://github.com/lm-sys/FastChat/tree/main/fastchat/llm_judge)
+
+1. Set the environment variables using a `.env` file (see `.env.tmp` for the expected keys).
+2. generate models' answers:
+```
+# for HF models
+python gen_model_answers.py --model-path rinna/nekomata-7b rinna/nekomata-7b-instruction ...
+# for api models (currently, openai, anthropic are supported)
+python gen_api_answer.py --model gpt-35-turbo --parallel 10
+```
+For `gen_model_answers.py`, you can set `--disable-strict-injection-check` to disable the strict injection check.
+The strict injection check is a function that checks whether the model's answer contains generated follow-up questions and answers, which could be a problem when the model outputs are evaluated in the next step.
+If you do not use this option (the default), outputs that contain the special Q&A markers, such as `###ユーザー` and `###アシスタント`, will be removed.
+
+3. generate judgments using gpt-4o:
+```
+# highly recommended to evaluate models using single mode
+python gen_judgment.py [--model-list gpt-35-turbo rinna/nekomata-7b-instruction] --mode single --parallel 10
+# for pairwise evaluation (not recommended because a lot of requests are required)
+python gen_judgment.py [--model-list gpt-35-turbo rinna/nekomata-7b-instruction] --mode pairwise-all --parallel 10
+```
+If you don't set `--model-list`, all models for which answers have been generated are evaluated.
+
+4. getting aggregated results:
+```
+python make_leaderboard.py
+```
+for getting aggregated results, which contain scores of each task.
+```
+python show_result.py
+```
+for calculating the results of each task on each turn.
+
+# Citation
+TBD.
+```
+@misc{Hirano2024-pfmt,
+    title={{pfmt-bench-fin-ja: Preferred Multi-turn Benchmark for Finance in Japanese}},
+    author={Masanori Hirano and Kentaro Imajo},
+    year={2024},
+    url = {https://github.com/pfnet-research/pfmt-bench-fin-ja}
+}
+```
+
+# Contribution
+This project is created by [Masanori Hirano](https://mhirano.jp) and [Kentaro Imajo](https://imoz.jp/), owned by [Preferred Networks](https://www.preferred.jp) and maintained by [Masanori Hirano](https://mhirano.jp).
+
+# Note
+Models that are tuned based on this benchmark should not be evaluated by this benchmark.
+For example, model merging should not be performed based on this benchmark.
+
+# License
+This benchmark code is licensed under the MIT License.
+However, the questions and some result files are released under each model's license (see `LICENSE` for details).