From d5a0266df521e63737ed6ae1f471ab0e1b59a856 Mon Sep 17 00:00:00 2001 From: "Yao, Qing" Date: Fri, 6 Sep 2024 11:10:41 +0800 Subject: [PATCH] Add README for codegen acc test. Signed-off-by: Yao, Qing --- .../api_evaluator.py | 3 + examples/CodeGen/README.md | 92 +++++++++++++++++++ 2 files changed, 95 insertions(+) create mode 100644 examples/CodeGen/README.md diff --git a/evals/evaluation/bigcode_evaluation_harness/api_evaluator.py b/evals/evaluation/bigcode_evaluation_harness/api_evaluator.py index e6078764..b6faa5b1 100644 --- a/evals/evaluation/bigcode_evaluation_harness/api_evaluator.py +++ b/evals/evaluation/bigcode_evaluation_harness/api_evaluator.py @@ -16,6 +16,9 @@ def generate_text(self, task_name, intermediate_generations=None): dataset = task.get_dataset() # if args.limit is None, use all samples # if args.limit is used, make sure args.limit_start + args.limit <= len(dataset) + + # TODO: Only running the task in its entirety is supported for now; + # using the limit or limit_start parameters will produce inaccurate results. n_tasks = min(self.args.limit, len(dataset) - self.args.limit_start) if self.args.limit else len(dataset) print(n_tasks) # when args.limit is None diff --git a/examples/CodeGen/README.md b/examples/CodeGen/README.md new file mode 100644 index 00000000..5d118967 --- /dev/null +++ b/examples/CodeGen/README.md @@ -0,0 +1,92 @@ +# CodeGen Accuracy Evaluation + +## Evaluation Framework +We evaluate accuracy using [bigcode-evaluation-harness](https://github.com/bigcode-project/bigcode-evaluation-harness). It is a framework for the evaluation of code generation models. + + +## Evaluation FAQs + +### Launch CodeGen microservice +Please refer to [CodeGen Examples](https://github.com/opea-project/GenAIExamples/tree/main/CodeGen) and follow the guide to deploy the CodeGen megaservice. 
+ +Use the following cURL command to test the CodeGen service and ensure that it has started properly: +```bash +export CODEGEN_ENDPOINT="http://${your_ip}:7778/v1/codegen" +curl $CODEGEN_ENDPOINT \ + -H "Content-Type: application/json" \ + -d '{"messages": "Implement a high-level API for a TODO list application. The API takes as input an operation request and updates the TODO list in place. If the request is invalid, raise an exception."}' + +``` + + +### Generation and Evaluation + +For evaluating the models on coding tasks or specifically coding LLMs, we follow the [bigcode-evaluation-harness](https://github.com/bigcode-project/bigcode-evaluation-harness) and provide the command line usage and function call usage. [HumanEval](https://huggingface.co/datasets/openai_humaneval), [HumanEval+](https://huggingface.co/datasets/evalplus/humanevalplus), [InstructHumanEval](https://huggingface.co/datasets/codeparrot/instructhumaneval), [APPS](https://huggingface.co/datasets/codeparrot/apps), [MBPP](https://huggingface.co/datasets/mbpp), [MBPP+](https://huggingface.co/datasets/evalplus/mbppplus), and [DS-1000](https://github.com/HKUNLP/DS-1000/) for both completion (left-to-right) and insertion (FIM) mode are available. +#### Command line usage + +```shell +cd evals/evaluation/bigcode_evaluation_harness/examples +python main.py --model Qwen/CodeQwen1.5-7B-Chat \ + --tasks humaneval \ + --codegen_url $CODEGEN_ENDPOINT \ + --max_length_generation 2048 \ + --batch_size 1 \ + --save_generations \ + --save_references \ + --allow_code_execution +``` + +***Note:*** Currently, our framework is designed to execute tasks in full. To ensure the accuracy of results, we advise against using the 'limit' or 'limit_start' parameters to restrict the number of test samples. 
+ + +### Accuracy Result +Here is the tested result for your reference: +```json +{ + "humaneval": { + "pass@1": 0.7195121951219512 + }, + "config": { + "prefix": "", + "do_sample": true, + "temperature": 0.2, + "top_k": 0, + "top_p": 0.95, + "n_samples": 1, + "eos": "<|endoftext|>", + "seed": 0, + "model": "Qwen/CodeQwen1.5-7B-Chat", + "modeltype": "causal", + "peft_model": null, + "revision": null, + "use_auth_token": false, + "trust_remote_code": false, + "tasks": "humaneval", + "instruction_tokens": null, + "batch_size": 1, + "max_length_generation": 2048, + "precision": "fp32", + "load_in_8bit": false, + "load_in_4bit": false, + "left_padding": false, + "limit": null, + "limit_start": 0, + "save_every_k_tasks": -1, + "postprocess": true, + "allow_code_execution": true, + "generation_only": false, + "load_generations_path": null, + "load_data_path": null, + "metric_output_path": "evaluation_results.json", + "save_generations": true, + "load_generations_intermediate_paths": null, + "save_generations_path": "generations.json", + "save_references": true, + "save_references_path": "references.json", + "prompt": "prompt", + "max_memory_per_gpu": null, + "check_references": false, + "codegen_url": "http://192.168.123.104:31234/v1/codegen" + } +} +```