From 949fcb87242f686eb874f180eca0ecbc3ce67ad7 Mon Sep 17 00:00:00 2001 From: sahel Date: Wed, 27 Mar 2024 17:35:08 -0400 Subject: [PATCH 1/3] add raw retrieval --- .../large/eval/inbatch/my_retrieval.yaml | 33 +++++++++++++++++++ 1 file changed, 33 insertions(+) create mode 100644 src/models/uniir_clip/clip_scorefusion/configs_scripts/large/eval/inbatch/my_retrieval.yaml diff --git a/src/models/uniir_clip/clip_scorefusion/configs_scripts/large/eval/inbatch/my_retrieval.yaml b/src/models/uniir_clip/clip_scorefusion/configs_scripts/large/eval/inbatch/my_retrieval.yaml new file mode 100644 index 0000000..5b034a2 --- /dev/null +++ b/src/models/uniir_clip/clip_scorefusion/configs_scripts/large/eval/inbatch/my_retrieval.yaml @@ -0,0 +1,33 @@ +experiment: + description: ${model.name} ${model.size} ${experiment.instruct_status} ${experiment.exp_name} + exp_name: InBatch + instruct_status: Instruct + path_suffix: ${model.short_name}/${model.size}/${experiment.instruct_status}/${experiment.exp_name}/ +model: + name: CLIPScoreFusion + short_name: CLIP_SF + size: Large +retrieval_config: + candidate_dir_name: cand_pool/global + embed_dir_name: embed + index_dir_name: index + query_dir_name: query + results_dir_name: retrieval_results + test_datasets_config: + correspond_cand_pools_name: + - UNION + - UNION + datasets_name: + - mscoco_task0 + - mscoco_task3 + enable_retrieve: true + train_datasets_config: + correspond_cand_pools_name: null + datasets_name: null + enable_retrieve: false + val_datasets_config: + correspond_cand_pools_name: null + correspond_qrels_name: null + datasets_name: null + enable_retrieve: false + write_to_tsv: false From 416ee347d280b1decee145ac6f407bafb28095a9 Mon Sep 17 00:00:00 2001 From: sahel Date: Wed, 27 Mar 2024 17:35:41 -0400 Subject: [PATCH 2/3] add raw retrieval --- src/common/mbeir_retriever.py | 120 +++++++++++++++++++++++++++++++++- 1 file changed, 119 insertions(+), 1 deletion(-) diff --git a/src/common/mbeir_retriever.py b/src/common/mbeir_retriever.py index 1c84961..0ffa5aa 100644 --- a/src/common/mbeir_retriever.py +++ b/src/common/mbeir_retriever.py @@ -11,6 +11,7 @@ import numpy as np import csv import gc +import json import faiss import pickle @@ -302,7 +303,7 @@ def run_retrieval(config): run_id = f"mbeir_{dataset_name}_union_pool_{split}_k{k}" else: run_id = f"mbeir_{dataset_name}_single_pool_{split}_k{k}" - run_file_name = f"{run_id}_run.txt" + run_file_name = f"{run_id}_run_{datetime.now()}.txt" run_file_path = os.path.join(exp_run_file_dir, run_file_name) with open(run_file_path, 'w') as run_file: for idx, (distances, indices) in enumerate(zip(retrieved_cand_dist, retrieved_indices)): @@ -421,6 +422,119 @@ def run_retrieval(config): print(f"Retriever: Results saved to {tsv_file_path}") +def run_raw_retrieval(config): + """This script runs retrieval on the faiss index""" + uniir_dir = config.uniir_dir + mbeir_data_dir = config.mbeir_data_dir + retrieval_config = config.retrieval_config + embed_dir_name = retrieval_config.embed_dir_name + index_dir_name = retrieval_config.index_dir_name + query_dir_name = retrieval_config.query_dir_name + candidate_dir_name = retrieval_config.candidate_dir_name + expt_dir_name = config.experiment.path_suffix + + # Create results directory if it doesn't exist + results_dir_name = retrieval_config.results_dir_name + exp_results_dir = os.path.join(uniir_dir, results_dir_name, expt_dir_name) + os.makedirs(exp_results_dir, exist_ok=True) + exp_run_file_dir = os.path.join(exp_results_dir, "run_files") + os.makedirs(exp_run_file_dir, exist_ok=True) + + splits = [] + # Load the dataset splits to embed + dataset_types = ["train", "val", "test"] + for split_name in dataset_types: + retrieval_dataset_config = getattr(retrieval_config, f"{split_name}_datasets_config", None) + if retrieval_dataset_config and retrieval_dataset_config.enable_retrieve: + dataset_name_list = getattr(retrieval_dataset_config, "datasets_name", None) + cand_pool_name_list = getattr(retrieval_dataset_config, "correspond_cand_pools_name", None) + dataset_embed_dir = os.path.join(uniir_dir, embed_dir_name, expt_dir_name, split_name) + splits.append((split_name, dataset_embed_dir, dataset_name_list, cand_pool_name_list)) + assert len(dataset_name_list) == len(cand_pool_name_list), "Mismatch between datasets and candidate pools." + + # Pretty Print dataset to index + print("-" * 30) + for split_name, dataset_embed_dir, dataset_name_list, cand_pool_name_list in splits: + print(f"Split: {split_name}, Retrieval Datasets: {dataset_name_list}, Candidate Pools: {cand_pool_name_list})") + print("-" * 30) + + cand_index_dir = os.path.join(uniir_dir, index_dir_name, expt_dir_name, "cand_pool") + for split, dataset_embed_dir, dataset_name_list, cand_pool_name_list in splits: + for dataset_name, cand_pool_name in zip(dataset_name_list, cand_pool_name_list): + print("\n" + "-" * 30) + print(f"Retriever: Retrieving for query:{dataset_name} | split:{split} | from cand_pool:{cand_pool_name}") + + dataset_name = dataset_name.lower() + cand_pool_name = cand_pool_name.lower() + + # Load query Hashed IDs + embed_query_id_path = os.path.join(dataset_embed_dir, f"mbeir_{dataset_name}_{split}_ids.npy") + hashed_query_ids = np.load(embed_query_id_path) + + # Load query embeddings + embed_query_path = os.path.join(dataset_embed_dir, f"mbeir_{dataset_name}_{split}_embed.npy") + + # Load the candidate pool index + cand_index_path = os.path.join(cand_index_dir, f"mbeir_{cand_pool_name}_cand_pool.index") + + # Search the index + # TODO: make k configurable + k = 10 + print(f"Retriever: Searching with k={k}") + retrieved_cand_dist, retrieved_indices = search_index( + embed_query_path, cand_index_path, batch_size=hashed_query_ids.shape[0], num_cand_to_retrieve=k + ) # Shape: (number_of_queries, k) + + # Load raw queries + queries_path = os.path.join(mbeir_data_dir, query_dir_name, f"{split}/mbeir_{dataset_name}_{split}.jsonl") + qid_to_queries = {} + with open(queries_path, 'r') as f: + for l in f: + q = json.loads(l.strip()) + assert q["qid"] not in qid_to_queries, "qids must be unique" + qid_to_queries[q["qid"]] = q + + # Load raw candidates + candidate_file_name = f"mbeir_{cand_pool_name}_{split}_cand_pool.jsonl" + candidates_path = os.path.join(mbeir_data_dir, candidate_dir_name, candidate_file_name) + did_to_candidates = {} + with open(candidates_path, 'r') as f: + for l in f: + c = json.loads(l.strip()) + assert c["did"] not in did_to_candidates, "dids must be unique" + did_to_candidates[c["did"]] = c + + # Open a file to write the run results + if cand_pool_name == "union": + run_id = f"mbeir_{dataset_name}_union_pool_{split}_k{k}" + else: + run_id = f"mbeir_{dataset_name}_single_pool_{split}_k{k}" + dt = datetime.now() + run_file_name = f"{run_id}_run_{dt}.txt" + run_file_path = os.path.join(exp_run_file_dir, run_file_name) + retrieved_cands_file_name = f"{run_id}_run_{dt}.jsonl" + retrieved_cands_file_path = os.path.join(exp_run_file_dir, retrieved_cands_file_name) + with open(run_file_path, 'w') as run_file: + with open(retrieved_cands_file_path, 'w') as cand_file: + for idx, (distances, indices) in enumerate(zip(retrieved_cand_dist, retrieved_indices)): + cands = [] + qid = unhash_qid(hashed_query_ids[idx]) + query = qid_to_queries[qid] + task_id = qid_to_queries[qid]["task_id"] + for rank, (hashed_doc_id, score) in enumerate(zip(indices, distances), start=1): + # Format: query-id Q0 document-id rank score run-id task_id + # We can remove task_id if we don't need it later using a helper + # Note: since we are using the cosine similarity, we don't need to invert the scores. + doc_id = unhash_did(hashed_doc_id) + cands.append(did_to_candidates[doc_id]) + run_file_line = f"{qid} Q0 {doc_id} {rank} {score} {run_id} {task_id}\n" + run_file.write(run_file_line) + json.dump({"query": query, "candidates": cands}, cand_file) + cand_file.write('\n') + print(f"Retriever: Run file saved to {run_file_path}") + print(f"Retriever: Retrieved candidates saved to {retrieved_cands_file_path}") + + def run_hard_negative_mining(config): uniir_dir = config.uniir_dir mbeir_data_dir = config.mbeir_data_dir @@ -526,6 +640,7 @@ def parse_arguments(): parser.add_argument("--enable_create_index", action="store_true", help="Enable create index") parser.add_argument("--enable_hard_negative_mining", action="store_true", help="Enable hard negative mining") parser.add_argument("--enable_retrieval", action="store_true", help="Enable retrieval") + parser.add_argument("--enable_raw_retrieval", action="store_true", help="Enable raw retrieval which skips metrics calculation, and stores retrieved candidates.") return parser.parse_args() @@ -546,6 +661,9 @@ def main(): if args.enable_retrieval: run_retrieval(config) + if args.enable_raw_retrieval: + run_raw_retrieval(config) + if __name__ == "__main__": main() From 0f639797f958c4206c435386b830ce6263d08a90 Mon Sep 17 00:00:00 2001 From: sahel Date: Wed, 27 Mar 2024 18:12:03 -0400 Subject: [PATCH 3/3] run raw eval --- .../eval/inbatch/run_eval_pipeline_inbatch.sh | 72 ++++++++++--------- 1 file changed, 37 insertions(+), 35 deletions(-) diff --git a/src/models/uniir_clip/clip_scorefusion/configs_scripts/large/eval/inbatch/run_eval_pipeline_inbatch.sh b/src/models/uniir_clip/clip_scorefusion/configs_scripts/large/eval/inbatch/run_eval_pipeline_inbatch.sh index 1f67536..2838923 100644 --- a/src/models/uniir_clip/clip_scorefusion/configs_scripts/large/eval/inbatch/run_eval_pipeline_inbatch.sh +++ b/src/models/uniir_clip/clip_scorefusion/configs_scripts/large/eval/inbatch/run_eval_pipeline_inbatch.sh @@ -3,17 +3,17 @@ set -e # Exit immediately if a command exits with a non-zero status # Initialize Conda -source /home/miniconda3/etc/profile.d/conda.sh # <--- Change this to the path of your conda.sh +source /opt/anaconda3/etc/profile.d/conda.sh # <--- Change this to the path of your conda.sh # Path to the codebase and config file -SRC="$HOME/UniIR/src" # Absolute path to codebse /UniIR/src # <--- Change this to the path of your UniIR/src +SRC="/store2/scratch/s8sharif/UniIR/src" # Absolute path to codebse /UniIR/src # <--- Change this to the path of your UniIR/src # Path to common dir COMMON_DIR="$SRC/common" # Path to MBEIR data and UniIR directory where we store the checkpoints, embeddings, etc. -UNIIR_DIR="/data/UniIR/" # <--- Change this to the UniIR directory -MBEIR_DATA_DIR="/data/UniIR/M-BEIR/" # <--- Change this to the MBEIR data directory you download from HF page +UNIIR_DIR="/store2/scratch/s8sharif/UniIR/data/UniIR/" # <--- Change this to the UniIR directory +MBEIR_DATA_DIR="/mnt/users/s8sharif/M-BEIR/" # <--- Change this to the MBEIR data directory you download from HF page # Path to config dir MODEL="uniir_clip/clip_scorefusion" # <--- Change this to the model you want to run @@ -24,8 +24,8 @@ EXP_NAME="inbatch" CONFIG_DIR="$MODEL_DIR/configs_scripts/$SIZE/$MODE/$EXP_NAME" # Set CUDA devices and PYTHONPATH -export CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7 # <--- Change this to the CUDA devices you want to use -NPROC=8 # <--- Change this to the number of GPUs you want to use +export CUDA_VISIBLE_DEVICES=5 # <--- Change this to the CUDA devices you want to use +NPROC=1 # <--- Change this to the number of GPUs you want to use export PYTHONPATH=$SRC echo "PYTHONPATH: $PYTHONPATH" echo "CUDA_VISIBLE_DEVICES: $CUDA_VISIBLE_DEVICES" @@ -38,43 +38,44 @@ cd $COMMON_DIR conda activate uniir # <--- Change this to the name of your conda environment # Run Embedding command -CONFIG_PATH="$CONFIG_DIR/embed.yaml" -SCRIPT_NAME="mbeir_embedder.py" -echo "CONFIG_PATH: $CONFIG_PATH" -echo "SCRIPT_NAME: $SCRIPT_NAME" +# CONFIG_PATH="$CONFIG_DIR/embed.yaml" +# SCRIPT_NAME="mbeir_embedder.py" +# echo "CONFIG_PATH: $CONFIG_PATH" +# echo "SCRIPT_NAME: $SCRIPT_NAME" -python config_updater.py \ - --update_mbeir_yaml_instruct_status \ - --mbeir_yaml_file_path $CONFIG_PATH \ - --enable_instruct True +# python config_updater.py \ +# --update_mbeir_yaml_instruct_status \ +# --mbeir_yaml_file_path $CONFIG_PATH \ +# --enable_instruct True -python -m torch.distributed.run --nproc_per_node=$NPROC $SCRIPT_NAME \ - --config_path "$CONFIG_PATH" \ - --uniir_dir "$UNIIR_DIR" \ - --mbeir_data_dir "$MBEIR_DATA_DIR" +# python -m torch.distributed.run --nproc_per_node=$NPROC $SCRIPT_NAME \ +# --config_path "$CONFIG_PATH" \ +# --uniir_dir "$UNIIR_DIR" \ +# --mbeir_data_dir "$MBEIR_DATA_DIR" # Activate faiss environment conda activate faiss # <--- Change this to the name of your conda environment # Run Index command -CONFIG_PATH="$CONFIG_DIR/index.yaml" -SCRIPT_NAME="mbeir_retriever.py" -echo "CONFIG_PATH: $CONFIG_PATH" -echo "SCRIPT_NAME: $SCRIPT_NAME" - -python config_updater.py \ - --update_mbeir_yaml_instruct_status \ - --mbeir_yaml_file_path $CONFIG_PATH \ - --enable_instruct True - -python $SCRIPT_NAME \ - --config_path "$CONFIG_PATH" \ - --uniir_dir "$UNIIR_DIR" \ - --mbeir_data_dir "$MBEIR_DATA_DIR" \ - --enable_create_index +# CONFIG_PATH="$CONFIG_DIR/index.yaml" +# SCRIPT_NAME="mbeir_retriever.py" +# echo "CONFIG_PATH: $CONFIG_PATH" +# echo "SCRIPT_NAME: $SCRIPT_NAME" + +# python config_updater.py \ +# --update_mbeir_yaml_instruct_status \ +# --mbeir_yaml_file_path $CONFIG_PATH \ +# --enable_instruct True + +# python $SCRIPT_NAME \ +# --config_path "$CONFIG_PATH" \ +# --uniir_dir "$UNIIR_DIR" \ +# --mbeir_data_dir "$MBEIR_DATA_DIR" \ +# --enable_create_index # Run retrieval command -CONFIG_PATH="$CONFIG_DIR/retrieval.yaml" +#CONFIG_PATH="$CONFIG_DIR/retrieval.yaml" +CONFIG_PATH="$CONFIG_DIR/my_retrieval.yaml" SCRIPT_NAME="mbeir_retriever.py" echo "CONFIG_PATH: $CONFIG_PATH" echo "SCRIPT_NAME: $SCRIPT_NAME" @@ -88,4 +89,5 @@ python $SCRIPT_NAME \ --config_path "$CONFIG_PATH" \ --uniir_dir "$UNIIR_DIR" \ --mbeir_data_dir "$MBEIR_DATA_DIR" \ - --enable_retrieval \ No newline at end of file + --enable_raw_retrieval + #--enable_retrieval \ No newline at end of file