Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Raw retrieval #3

Open
wants to merge 4 commits into
base: main
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
120 changes: 119 additions & 1 deletion src/common/mbeir_retriever.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,7 @@
import numpy as np
import csv
import gc
import json

import faiss
import pickle
Expand Down Expand Up @@ -302,7 +303,7 @@ def run_retrieval(config):
run_id = f"mbeir_{dataset_name}_union_pool_{split}_k{k}"
else:
run_id = f"mbeir_{dataset_name}_single_pool_{split}_k{k}"
run_file_name = f"{run_id}_run.txt"
run_file_name = f"{run_id}_run_{datetime.now()}.txt"
run_file_path = os.path.join(exp_run_file_dir, run_file_name)
with open(run_file_path, 'w') as run_file:
for idx, (distances, indices) in enumerate(zip(retrieved_cand_dist, retrieved_indices)):
Expand Down Expand Up @@ -421,6 +422,119 @@ def run_retrieval(config):
print(f"Retriever: Results saved to {tsv_file_path}")


def _load_jsonl_by_key(path, key):
    """Load a JSONL file into a dict keyed by field *key*.

    Args:
        path: Path to a JSONL file, one JSON object per line.
        key: Field name whose value keys the returned dict (e.g. "qid", "did").

    Returns:
        dict mapping record[key] -> record.

    Raises:
        ValueError: if two lines share the same *key* value.
    """
    records = {}
    with open(path, "r") as f:
        for line in f:
            record = json.loads(line)
            if record[key] in records:
                raise ValueError(f"Duplicate {key} '{record[key]}' in {path}")
            records[record[key]] = record
    return records


def run_raw_retrieval(config):
    """Run retrieval on the FAISS index and store raw results.

    Unlike ``run_retrieval``, no metrics are computed. For every enabled
    split/dataset pair this searches the candidate-pool FAISS index and
    writes two files per dataset:

    * a TREC-style run file (``qid Q0 did rank score run_id task_id``), and
    * a JSONL file pairing each raw query with its retrieved raw candidates.

    Args:
        config: Experiment config providing ``uniir_dir``, ``mbeir_data_dir``,
            ``retrieval_config`` and ``experiment.path_suffix``.
    """
    uniir_dir = config.uniir_dir
    mbeir_data_dir = config.mbeir_data_dir
    retrieval_config = config.retrieval_config
    embed_dir_name = retrieval_config.embed_dir_name
    index_dir_name = retrieval_config.index_dir_name
    query_dir_name = retrieval_config.query_dir_name
    candidate_dir_name = retrieval_config.candidate_dir_name
    expt_dir_name = config.experiment.path_suffix

    # Create the results / run-file directories if they don't exist.
    results_dir_name = retrieval_config.results_dir_name
    exp_results_dir = os.path.join(uniir_dir, results_dir_name, expt_dir_name)
    os.makedirs(exp_results_dir, exist_ok=True)
    exp_run_file_dir = os.path.join(exp_results_dir, "run_files")
    os.makedirs(exp_run_file_dir, exist_ok=True)

    # Gather the splits (train/val/test) that have retrieval enabled.
    splits = []
    for split_name in ["train", "val", "test"]:
        retrieval_dataset_config = getattr(retrieval_config, f"{split_name}_datasets_config", None)
        if retrieval_dataset_config and retrieval_dataset_config.enable_retrieve:
            dataset_name_list = getattr(retrieval_dataset_config, "datasets_name", None)
            cand_pool_name_list = getattr(retrieval_dataset_config, "correspond_cand_pools_name", None)
            # Raise (not assert) so the check survives `python -O`.
            if len(dataset_name_list) != len(cand_pool_name_list):
                raise ValueError("Mismatch between datasets and candidate pools.")
            dataset_embed_dir = os.path.join(uniir_dir, embed_dir_name, expt_dir_name, split_name)
            splits.append((split_name, dataset_embed_dir, dataset_name_list, cand_pool_name_list))

    # Pretty-print what will be retrieved.
    print("-" * 30)
    for split_name, dataset_embed_dir, dataset_name_list, cand_pool_name_list in splits:
        print(f"Split: {split_name}, Retrieval Datasets: {dataset_name_list}, Candidate Pools: {cand_pool_name_list}")
    print("-" * 30)

    cand_index_dir = os.path.join(uniir_dir, index_dir_name, expt_dir_name, "cand_pool")
    for split, dataset_embed_dir, dataset_name_list, cand_pool_name_list in splits:
        for dataset_name, cand_pool_name in zip(dataset_name_list, cand_pool_name_list):
            print("\n" + "-" * 30)
            print(f"Retriever: Retrieving for query:{dataset_name} | split:{split} | from cand_pool:{cand_pool_name}")

            dataset_name = dataset_name.lower()
            cand_pool_name = cand_pool_name.lower()

            # Load query hashed IDs and query embeddings produced by the embedder.
            embed_query_id_path = os.path.join(dataset_embed_dir, f"mbeir_{dataset_name}_{split}_ids.npy")
            hashed_query_ids = np.load(embed_query_id_path)
            embed_query_path = os.path.join(dataset_embed_dir, f"mbeir_{dataset_name}_{split}_embed.npy")

            # Candidate pool FAISS index (built via --enable_create_index).
            cand_index_path = os.path.join(cand_index_dir, f"mbeir_{cand_pool_name}_cand_pool.index")

            # Search the index.
            # TODO: make k configurable
            k = 10
            print(f"Retriever: Searching with k={k}")
            # All queries are searched in a single batch (batch_size = #queries).
            retrieved_cand_dist, retrieved_indices = search_index(
                embed_query_path, cand_index_path, batch_size=hashed_query_ids.shape[0], num_cand_to_retrieve=k
            )  # Shape: (number_of_queries, k)

            # Load raw queries and raw candidates keyed by their unique IDs.
            queries_path = os.path.join(mbeir_data_dir, query_dir_name, f"{split}/mbeir_{dataset_name}_{split}.jsonl")
            qid_to_queries = _load_jsonl_by_key(queries_path, "qid")
            candidate_file_name = f"mbeir_{cand_pool_name}_{split}_cand_pool.jsonl"
            candidates_path = os.path.join(mbeir_data_dir, candidate_dir_name, candidate_file_name)
            did_to_candidates = _load_jsonl_by_key(candidates_path, "did")

            # Open files to write the run results. The timestamp keeps reruns
            # from clobbering each other; strftime avoids characters (spaces,
            # colons) that are unsafe in filenames on some filesystems.
            if cand_pool_name == "union":
                run_id = f"mbeir_{dataset_name}_union_pool_{split}_k{k}"
            else:
                run_id = f"mbeir_{dataset_name}_single_pool_{split}_k{k}"
            timestamp = datetime.now().strftime("%Y-%m-%dT%H-%M-%S")
            run_file_path = os.path.join(exp_run_file_dir, f"{run_id}_run_{timestamp}.txt")
            retrieved_cands_file_path = os.path.join(exp_run_file_dir, f"{run_id}_run_{timestamp}.jsonl")
            with open(run_file_path, "w") as run_file, open(retrieved_cands_file_path, "w") as cand_file:
                for idx, (distances, indices) in enumerate(zip(retrieved_cand_dist, retrieved_indices)):
                    cands = []
                    qid = unhash_qid(hashed_query_ids[idx])
                    query = qid_to_queries[qid]
                    task_id = query["task_id"]
                    for rank, (hashed_doc_id, score) in enumerate(zip(indices, distances), start=1):
                        # Format: query-id Q0 document-id rank score run-id task_id
                        # We can remove task_id if we don't need it later using a helper.
                        # Note: since we are using the cosine similarity, we don't need to invert the scores.
                        doc_id = unhash_did(hashed_doc_id)
                        cands.append(did_to_candidates[doc_id])
                        run_file.write(f"{qid} Q0 {doc_id} {rank} {score} {run_id} {task_id}\n")
                    json.dump({"query": query, "candidates": cands}, cand_file)
                    cand_file.write("\n")
            print(f"Retriever: Run file saved to {run_file_path}")
            print(f"Retriever: Retrieved candidates saved to {retrieved_cands_file_path}")


def run_hard_negative_mining(config):
uniir_dir = config.uniir_dir
mbeir_data_dir = config.mbeir_data_dir
Expand Down Expand Up @@ -526,6 +640,7 @@ def parse_arguments():
parser.add_argument("--enable_create_index", action="store_true", help="Enable create index")
parser.add_argument("--enable_hard_negative_mining", action="store_true", help="Enable hard negative mining")
parser.add_argument("--enable_retrieval", action="store_true", help="Enable retrieval")
parser.add_argument("--enable_raw_retrieval", action="store_true", help="Enable raw retrieval which skips metrics calculation, and stores retrieved candidates.")
return parser.parse_args()


Expand All @@ -546,6 +661,9 @@ def main():
if args.enable_retrieval:
run_retrieval(config)

if args.enable_raw_retrieval:
run_raw_retrieval(config)


if __name__ == "__main__":
main()
Original file line number Diff line number Diff line change
@@ -0,0 +1,33 @@
# Retrieval configuration for raw retrieval (no metric computation).
# NOTE(review): nesting reconstructed from key names and from how
# run_raw_retrieval reads retrieval_config — confirm against the original file.
experiment:
  description: ${model.name} ${model.size} ${experiment.instruct_status} ${experiment.exp_name}
  exp_name: InBatch
  instruct_status: Instruct
  path_suffix: ${model.short_name}/${model.size}/${experiment.instruct_status}/${experiment.exp_name}/
model:
  name: CLIPScoreFusion
  short_name: CLIP_SF
  size: Large
retrieval_config:
  # Directory names are resolved relative to uniir_dir / mbeir_data_dir.
  candidate_dir_name: cand_pool/global
  embed_dir_name: embed
  index_dir_name: index
  query_dir_name: query
  results_dir_name: retrieval_results
  test_datasets_config:
    # One candidate pool per dataset, matched by position.
    correspond_cand_pools_name:
      - UNION
      - UNION
    datasets_name:
      - mscoco_task0
      - mscoco_task3
    enable_retrieve: true
  train_datasets_config:
    correspond_cand_pools_name: null
    datasets_name: null
    enable_retrieve: false
  val_datasets_config:
    correspond_cand_pools_name: null
    correspond_qrels_name: null
    datasets_name: null
    enable_retrieve: false
  write_to_tsv: false
Original file line number Diff line number Diff line change
Expand Up @@ -3,17 +3,17 @@
set -e # Exit immediately if a command exits with a non-zero status

# Initialize Conda
source /home/miniconda3/etc/profile.d/conda.sh # <--- Change this to the path of your conda.sh
source /opt/anaconda3/etc/profile.d/conda.sh # <--- Change this to the path of your conda.sh

# Path to the codebase and config file
SRC="$HOME/UniIR/src" # Absolute path to codebse /UniIR/src # <--- Change this to the path of your UniIR/src
SRC="/store2/scratch/s8sharif/UniIR/src" # Absolute path to codebse /UniIR/src # <--- Change this to the path of your UniIR/src

# Path to common dir
COMMON_DIR="$SRC/common"

# Path to MBEIR data and UniIR directory where we store the checkpoints, embeddings, etc.
UNIIR_DIR="/data/UniIR/" # <--- Change this to the UniIR directory
MBEIR_DATA_DIR="/data/UniIR/M-BEIR/" # <--- Change this to the MBEIR data directory you download from HF page
UNIIR_DIR="/store2/scratch/s8sharif/UniIR/data/UniIR/" # <--- Change this to the UniIR directory
MBEIR_DATA_DIR="/mnt/users/s8sharif/M-BEIR/" # <--- Change this to the MBEIR data directory you download from HF page

# Path to config dir
MODEL="uniir_clip/clip_scorefusion" # <--- Change this to the model you want to run
Expand All @@ -24,8 +24,8 @@ EXP_NAME="inbatch"
CONFIG_DIR="$MODEL_DIR/configs_scripts/$SIZE/$MODE/$EXP_NAME"

# Set CUDA devices and PYTHONPATH
export CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7 # <--- Change this to the CUDA devices you want to use
NPROC=8 # <--- Change this to the number of GPUs you want to use
export CUDA_VISIBLE_DEVICES=5 # <--- Change this to the CUDA devices you want to use
NPROC=1 # <--- Change this to the number of GPUs you want to use
export PYTHONPATH=$SRC
echo "PYTHONPATH: $PYTHONPATH"
echo "CUDA_VISIBLE_DEVICES: $CUDA_VISIBLE_DEVICES"
Expand All @@ -38,43 +38,44 @@ cd $COMMON_DIR
conda activate uniir # <--- Change this to the name of your conda environment

# Run Embedding command
CONFIG_PATH="$CONFIG_DIR/embed.yaml"
SCRIPT_NAME="mbeir_embedder.py"
echo "CONFIG_PATH: $CONFIG_PATH"
echo "SCRIPT_NAME: $SCRIPT_NAME"
# CONFIG_PATH="$CONFIG_DIR/embed.yaml"
# SCRIPT_NAME="mbeir_embedder.py"
# echo "CONFIG_PATH: $CONFIG_PATH"
# echo "SCRIPT_NAME: $SCRIPT_NAME"

python config_updater.py \
--update_mbeir_yaml_instruct_status \
--mbeir_yaml_file_path $CONFIG_PATH \
--enable_instruct True
# python config_updater.py \
# --update_mbeir_yaml_instruct_status \
# --mbeir_yaml_file_path $CONFIG_PATH \
# --enable_instruct True

python -m torch.distributed.run --nproc_per_node=$NPROC $SCRIPT_NAME \
--config_path "$CONFIG_PATH" \
--uniir_dir "$UNIIR_DIR" \
--mbeir_data_dir "$MBEIR_DATA_DIR"
# python -m torch.distributed.run --nproc_per_node=$NPROC $SCRIPT_NAME \
# --config_path "$CONFIG_PATH" \
# --uniir_dir "$UNIIR_DIR" \
# --mbeir_data_dir "$MBEIR_DATA_DIR"

# Activate faiss environment
conda activate faiss # <--- Change this to the name of your conda environment

# Run Index command
CONFIG_PATH="$CONFIG_DIR/index.yaml"
SCRIPT_NAME="mbeir_retriever.py"
echo "CONFIG_PATH: $CONFIG_PATH"
echo "SCRIPT_NAME: $SCRIPT_NAME"

python config_updater.py \
--update_mbeir_yaml_instruct_status \
--mbeir_yaml_file_path $CONFIG_PATH \
--enable_instruct True

python $SCRIPT_NAME \
--config_path "$CONFIG_PATH" \
--uniir_dir "$UNIIR_DIR" \
--mbeir_data_dir "$MBEIR_DATA_DIR" \
--enable_create_index
# CONFIG_PATH="$CONFIG_DIR/index.yaml"
# SCRIPT_NAME="mbeir_retriever.py"
# echo "CONFIG_PATH: $CONFIG_PATH"
# echo "SCRIPT_NAME: $SCRIPT_NAME"

# python config_updater.py \
# --update_mbeir_yaml_instruct_status \
# --mbeir_yaml_file_path $CONFIG_PATH \
# --enable_instruct True

# python $SCRIPT_NAME \
# --config_path "$CONFIG_PATH" \
# --uniir_dir "$UNIIR_DIR" \
# --mbeir_data_dir "$MBEIR_DATA_DIR" \
# --enable_create_index

# Run retrieval command
CONFIG_PATH="$CONFIG_DIR/retrieval.yaml"
#CONFIG_PATH="$CONFIG_DIR/retrieval.yaml"
CONFIG_PATH="$CONFIG_DIR/my_retrieval.yaml"
SCRIPT_NAME="mbeir_retriever.py"
echo "CONFIG_PATH: $CONFIG_PATH"
echo "SCRIPT_NAME: $SCRIPT_NAME"
Expand All @@ -88,4 +89,5 @@ python $SCRIPT_NAME \
--config_path "$CONFIG_PATH" \
--uniir_dir "$UNIIR_DIR" \
--mbeir_data_dir "$MBEIR_DATA_DIR" \
--enable_retrieval
--enable_raw_retrieval
#--enable_retrieval