Commit c33eb93

Add MMLU benchmarks.

knc6 committed Feb 5, 2024
1 parent f6f699a commit c33eb93
Showing 16 changed files with 468 additions and 0 deletions.
Binary file not shown.
@@ -0,0 +1,18 @@
{
    "model_name": "itsliupeng_llama2_7b_zh",
    "project_url": "https://huggingface.co/itsliupeng/llama2_7b_zh",
    "date_submitted": "01-30-2024",
    "author_email": "[email protected]",
    "database_version": "12-12-2022",
    "team_name": "ChemNLP",
    "time_taken_seconds": {
        "AI-TextClass-quiz-mmlu_test-test-acc.csv.zip": ""
    },
    "language": "python",
    "os": "linux",
    "software_used": "jarvis-tools,numpy,scipy,torch,alignn",
    "hardware_used": "nisaba-cluster at NIST, V100 Tesla GPU",
    "git_url": [
        "https://huggingface.co/itsliupeng/llama2_7b_zh"
    ]
}
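
The metadata above follows the jarvis_leaderboard contribution layout. A minimal sanity check for such a file might look like the following sketch (the file name "metadata.json" and the required-field list are assumptions, not part of this commit):

import json

# Load a contribution's metadata and check the fields the leaderboard expects.
with open("metadata.json") as fh:
    meta = json.load(fh)
for key in ("model_name", "project_url", "team_name", "time_taken_seconds"):
    assert key in meta, f"missing field: {key}"
print("metadata OK for", meta["model_name"])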
99 changes: 99 additions & 0 deletions jarvis_leaderboard/contributions/itsliupeng_llama2_7b_zh/run.py
@@ -0,0 +1,99 @@
# conda activate chemdata
import numpy as np
import torch
from tqdm import tqdm
from transformers import (
    AutoModelForSeq2SeqLM,
    AutoTokenizer,
    AutoModelForCausalLM,
)
from jarvis.db.jsonutils import loadjson

d = loadjson("mmlu_test.json")
device = "cpu"
if torch.cuda.is_available():
    device = torch.device("cuda")

# Models evaluated with this script; the one for this contribution is active.
# model_name = "mistralai/Mistral-7B-v0.1"
# model_name = "itsliupeng/llama2_70b_mmlu"
# model_name = "meta-llama/Llama-2-7b"
# model_name = "meta-llama/Llama-2-7b-hf"
# model_name = "meta-llama/Llama-2-13b-hf"
# model_name = "meta-llama/Llama-2-7b-chat-hf"
model_name = "itsliupeng/llama2_7b_zh"

if "t5" in model_name:
    model = AutoModelForSeq2SeqLM.from_pretrained(model_name)
    tokenizer = AutoTokenizer.from_pretrained(model_name)
else:
    model = AutoModelForCausalLM.from_pretrained(
        model_name,
        torch_dtype=torch.bfloat16,
        load_in_8bit=False,
        low_cpu_mem_usage=True,
        device_map="auto",
    )
    tokenizer = AutoTokenizer.from_pretrained(model_name)

# One CSV per contribution directory, named after the benchmark.
f = open("AI-TextClass-quiz-mmlu_test-test-acc.csv", "w")
f.write("id,target,prediction\n")
for i in tqdm(d):
    prompt = i["prompt"]
    label = i["answer"]
    input_ids = tokenizer(prompt, return_tensors="pt").input_ids.to(
        model.device
    )
    # Logits at the last position; the model's relative preference among
    # the answer tokens "A"-"D" is taken as its prediction.
    with torch.no_grad():
        logits = model(input_ids=input_ids).logits[0, -1]
    # .float() before softmax so the result can be converted to numpy
    # (numpy has no bfloat16).
    probs = (
        torch.nn.functional.softmax(
            torch.tensor(
                [
                    logits[tokenizer("A").input_ids[-1]],
                    logits[tokenizer("B").input_ids[-1]],
                    logits[tokenizer("C").input_ids[-1]],
                    logits[tokenizer("D").input_ids[-1]],
                ]
            ).float(),
            dim=0,
        )
        .cpu()
        .numpy()
    )
    pred = {0: "A", 1: "B", 2: "C", 3: "D"}[np.argmax(probs)]
    f.write(i["id"] + "," + label + "," + pred + "\n")
    # Free per-question tensors to keep GPU memory flat.
    del input_ids
    del logits
    del probs
f.close()
# zip AI-TextClass-quiz-mmlu_test-test-acc.csv.zip AI-TextClass-quiz-mmlu_test-test-acc.csv
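
The run.py above writes an id,target,prediction CSV. A quick way to score it locally might be the following sketch (pandas is assumed to be available; the leaderboard computes the official accuracy from the zipped CSV):

import pandas as pd

# Fraction of questions where the predicted letter matches the target.
df = pd.read_csv("AI-TextClass-quiz-mmlu_test-test-acc.csv")
acc = (df["target"] == df["prediction"]).mean()
print(f"MMLU accuracy: {acc:.4f} over {len(df)} questions")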
@@ -0,0 +1 @@
python run.py
Binary file not shown.
@@ -0,0 +1,18 @@
{
    "model_name": "meta-llama_Llama-2-7b-chat-hf",
    "project_url": "https://huggingface.co/meta-llama/Llama-2-7b-chat-hf",
    "date_submitted": "01-30-2024",
    "author_email": "[email protected]",
    "database_version": "12-12-2022",
    "team_name": "ChemNLP",
    "time_taken_seconds": {
        "AI-TextClass-quiz-mmlu_test-test-acc.csv.zip": ""
    },
    "language": "python",
    "os": "linux",
    "software_used": "jarvis-tools,numpy,scipy,torch,alignn",
    "hardware_used": "nisaba-cluster at NIST, V100 Tesla GPU",
    "git_url": [
        "https://huggingface.co/meta-llama/Llama-2-7b-chat-hf"
    ]
}
99 changes: 99 additions & 0 deletions jarvis_leaderboard/contributions/meta-llama_Llama-2-7b-chat-hf/run.py
@@ -0,0 +1,99 @@
# conda activate chemdata
import numpy as np
import torch
from tqdm import tqdm
from transformers import (
    AutoModelForSeq2SeqLM,
    AutoTokenizer,
    AutoModelForCausalLM,
)
from jarvis.db.jsonutils import loadjson

d = loadjson("mmlu_test.json")
device = "cpu"
if torch.cuda.is_available():
    device = torch.device("cuda")

# Models evaluated with this script; the one for this contribution is active.
# model_name = "mistralai/Mistral-7B-v0.1"
# model_name = "itsliupeng/llama2_70b_mmlu"
# model_name = "meta-llama/Llama-2-7b"
# model_name = "itsliupeng/llama2_7b_zh"
# model_name = "meta-llama/Llama-2-7b-hf"
# model_name = "meta-llama/Llama-2-13b-hf"
model_name = "meta-llama/Llama-2-7b-chat-hf"

if "t5" in model_name:
    model = AutoModelForSeq2SeqLM.from_pretrained(model_name)
    tokenizer = AutoTokenizer.from_pretrained(model_name)
else:
    model = AutoModelForCausalLM.from_pretrained(
        model_name,
        torch_dtype=torch.bfloat16,
        load_in_8bit=False,
        low_cpu_mem_usage=True,
        device_map="auto",
    )
    tokenizer = AutoTokenizer.from_pretrained(model_name)

# One CSV per contribution directory, named after the benchmark.
f = open("AI-TextClass-quiz-mmlu_test-test-acc.csv", "w")
f.write("id,target,prediction\n")
for i in tqdm(d):
    prompt = i["prompt"]
    label = i["answer"]
    input_ids = tokenizer(prompt, return_tensors="pt").input_ids.to(
        model.device
    )
    # Logits at the last position; the model's relative preference among
    # the answer tokens "A"-"D" is taken as its prediction.
    with torch.no_grad():
        logits = model(input_ids=input_ids).logits[0, -1]
    # .float() before softmax so the result can be converted to numpy
    # (numpy has no bfloat16).
    probs = (
        torch.nn.functional.softmax(
            torch.tensor(
                [
                    logits[tokenizer("A").input_ids[-1]],
                    logits[tokenizer("B").input_ids[-1]],
                    logits[tokenizer("C").input_ids[-1]],
                    logits[tokenizer("D").input_ids[-1]],
                ]
            ).float(),
            dim=0,
        )
        .cpu()
        .numpy()
    )
    pred = {0: "A", 1: "B", 2: "C", 3: "D"}[np.argmax(probs)]
    f.write(i["id"] + "," + label + "," + pred + "\n")
    # Free per-question tensors to keep GPU memory flat.
    del input_ids
    del logits
    del probs
f.close()
# zip AI-TextClass-quiz-mmlu_test-test-acc.csv.zip AI-TextClass-quiz-mmlu_test-test-acc.csv
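
The commented zip command at the end of run.py packages the CSV for submission. A Python equivalent, as a sketch using the standard-library zipfile module:

import zipfile

# Create AI-TextClass-quiz-mmlu_test-test-acc.csv.zip next to the CSV.
name = "AI-TextClass-quiz-mmlu_test-test-acc.csv"
with zipfile.ZipFile(name + ".zip", "w", zipfile.ZIP_DEFLATED) as zf:
    zf.write(name)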
@@ -0,0 +1 @@
python run.py
Binary file not shown.
@@ -0,0 +1,18 @@
{
    "model_name": "meta-llama_Llama-2-7b-hf",
    "project_url": "https://huggingface.co/meta-llama/Llama-2-7b-hf",
    "date_submitted": "01-30-2024",
    "author_email": "[email protected]",
    "database_version": "12-12-2022",
    "team_name": "ChemNLP",
    "time_taken_seconds": {
        "AI-TextClass-quiz-mmlu_test-test-acc.csv.zip": ""
    },
    "language": "python",
    "os": "linux",
    "software_used": "jarvis-tools,numpy,scipy,torch,alignn",
    "hardware_used": "nisaba-cluster at NIST, V100 Tesla GPU",
    "git_url": [
        "https://huggingface.co/meta-llama/Llama-2-7b-hf"
    ]
}
99 changes: 99 additions & 0 deletions jarvis_leaderboard/contributions/meta-llama_Llama-2-7b-hf/run.py
@@ -0,0 +1,99 @@
# conda activate chemdata
import numpy as np
import torch
from tqdm import tqdm
from transformers import (
    AutoModelForSeq2SeqLM,
    AutoTokenizer,
    AutoModelForCausalLM,
)
from jarvis.db.jsonutils import loadjson

d = loadjson("mmlu_test.json")
device = "cpu"
if torch.cuda.is_available():
    device = torch.device("cuda")

# Models evaluated with this script; the one matching this contribution
# directory (meta-llama_Llama-2-7b-hf) is kept active.
# model_name = "mistralai/Mistral-7B-v0.1"
# model_name = "itsliupeng/llama2_70b_mmlu"
# model_name = "meta-llama/Llama-2-7b"
# model_name = "itsliupeng/llama2_7b_zh"
# model_name = "meta-llama/Llama-2-7b-chat-hf"
# model_name = "meta-llama/Llama-2-13b-hf"
model_name = "meta-llama/Llama-2-7b-hf"

if "t5" in model_name:
    model = AutoModelForSeq2SeqLM.from_pretrained(model_name)
    tokenizer = AutoTokenizer.from_pretrained(model_name)
else:
    model = AutoModelForCausalLM.from_pretrained(
        model_name,
        torch_dtype=torch.bfloat16,
        load_in_8bit=False,
        low_cpu_mem_usage=True,
        device_map="auto",
    )
    tokenizer = AutoTokenizer.from_pretrained(model_name)

# One CSV per contribution directory, named after the benchmark.
f = open("AI-TextClass-quiz-mmlu_test-test-acc.csv", "w")
f.write("id,target,prediction\n")
for i in tqdm(d):
    prompt = i["prompt"]
    label = i["answer"]
    input_ids = tokenizer(prompt, return_tensors="pt").input_ids.to(
        model.device
    )
    # Logits at the last position; the model's relative preference among
    # the answer tokens "A"-"D" is taken as its prediction.
    with torch.no_grad():
        logits = model(input_ids=input_ids).logits[0, -1]
    # .float() before softmax so the result can be converted to numpy
    # (numpy has no bfloat16).
    probs = (
        torch.nn.functional.softmax(
            torch.tensor(
                [
                    logits[tokenizer("A").input_ids[-1]],
                    logits[tokenizer("B").input_ids[-1]],
                    logits[tokenizer("C").input_ids[-1]],
                    logits[tokenizer("D").input_ids[-1]],
                ]
            ).float(),
            dim=0,
        )
        .cpu()
        .numpy()
    )
    pred = {0: "A", 1: "B", 2: "C", 3: "D"}[np.argmax(probs)]
    f.write(i["id"] + "," + label + "," + pred + "\n")
    # Free per-question tensors to keep GPU memory flat.
    del input_ids
    del logits
    del probs
f.close()
# zip AI-TextClass-quiz-mmlu_test-test-acc.csv.zip AI-TextClass-quiz-mmlu_test-test-acc.csv
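
A note on the scoring step shared by these scripts: tokenizer("A").input_ids[-1] is recomputed on every question, though the four answer-token ids never change. They can be hoisted out of the loop, and because softmax is monotonic, an argmax over the raw logits gives the same prediction as the softmax-then-argmax above. A sketch (the checkpoint name is just an example):

from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("meta-llama/Llama-2-7b-hf")
choices = ["A", "B", "C", "D"]
# The last id of each encoding corresponds to the answer letter
# (preceding ids are special tokens such as BOS).
choice_ids = [tokenizer(c).input_ids[-1] for c in choices]
# Inside the loop: pred = choices[int(logits[choice_ids].argmax())]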
@@ -0,0 +1 @@
python run.py
Binary file not shown.
@@ -0,0 +1,18 @@
{
    "model_name": "mistralai_Mistral-7B-v0.1",
    "project_url": "https://huggingface.co/mistralai/Mistral-7B-v0.1",
    "date_submitted": "01-30-2024",
    "author_email": "[email protected]",
    "database_version": "12-12-2022",
    "team_name": "ChemNLP",
    "time_taken_seconds": {
        "AI-TextClass-quiz-mmlu_test-test-acc.csv.zip": ""
    },
    "language": "python",
    "os": "linux",
    "software_used": "jarvis-tools,numpy,scipy,torch,alignn",
    "hardware_used": "nisaba-cluster at NIST, V100 Tesla GPU",
    "git_url": [
        "https://huggingface.co/mistralai/Mistral-7B-v0.1"
    ]
}