Added batch equivalent of computeQueryDocumentScore #1882

Open · wants to merge 4 commits into master
127 changes: 126 additions & 1 deletion src/main/java/io/anserini/index/IndexReaderUtils.java
@@ -20,6 +20,8 @@
import io.anserini.search.SearchArgs;
import io.anserini.search.query.BagOfWordsQueryGenerator;
import io.anserini.search.query.PhraseQueryGenerator;
import org.apache.logging.log4j.LogManager;
import org.apache.logging.log4j.Logger;
Contributor Author: Note to @HAKSOAT: Remove these logger imports if not needed in the final implementation.

import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.document.Document;
import org.apache.lucene.index.DirectoryReader;
@@ -65,6 +67,7 @@
* This class provides a lot of functionality that is exposed in Python via Pyserini.
*/
public class IndexReaderUtils {
private static final Logger LOG = LogManager.getLogger(IndexReaderUtils.class);

/**
* An individual posting in a postings list. Note that this class is used primarily for inspecting
@@ -726,7 +729,129 @@ public static float computeQueryDocumentScoreWithSimilarityAndAnalyzer(
return rs.scoreDocs.length == 0 ? 0 : rs.scoreDocs[0].score - 1;
}

// TODO: Write a variant of computeQueryDocumentScore that takes a set of documents.
/**
* Computes the scores of a batch of documents with respect to a query, using the default BM25 scoring function and the default analyzer.
*
* @param reader index reader
* @param docids A list of docids of the documents to score
* @param q query
* @return a map of document ids to their scores with respect to the query
* @throws IOException if error encountered during query
*/
public static Map<String, Float> batchComputeQueryDocumentScore(
IndexReader reader, List<String> docids, String q)
throws IOException {

SearchArgs args = new SearchArgs();
return batchComputeQueryDocumentScoreWithSimilarityAndAnalyzer(reader, docids, q,
new BM25Similarity(Float.parseFloat(args.bm25_k1[0]), Float.parseFloat(args.bm25_b[0])),
IndexCollection.DEFAULT_ANALYZER);
}


/**
* Computes the scores of a batch of documents with respect to a query given a scoring function, using the default analyzer.
*
* @param reader index reader
* @param docids A list of docids of the documents to score
* @param q query
* @param similarity scoring function
* @return a map of document ids to their scores with respect to the query
* @throws IOException if error encountered during query
*/
public static Map<String, Float> batchComputeQueryDocumentScore(
IndexReader reader, List<String> docids, String q, Similarity similarity)
throws IOException {

return batchComputeQueryDocumentScoreWithSimilarityAndAnalyzer(reader, docids, q, similarity,
IndexCollection.DEFAULT_ANALYZER);
}


/**
* Computes the scores of a batch of documents with respect to a query given an analyzer, using the default BM25 scoring function.
*
* @param reader index reader
* @param docids A list of docids of the documents to score
* @param q query
* @param analyzer analyzer to use
* @return a map of document ids to their scores with respect to the query
* @throws IOException if error encountered during query
*/
public static Map<String, Float> batchComputeQueryDocumentScore(
IndexReader reader, List<String> docids, String q, Analyzer analyzer)
throws IOException {

SearchArgs args = new SearchArgs();
return batchComputeQueryDocumentScoreWithSimilarityAndAnalyzer(reader, docids, q,
new BM25Similarity(Float.parseFloat(args.bm25_k1[0]), Float.parseFloat(args.bm25_b[0])),
analyzer);
}


/**
* Computes the scores of a batch of documents with respect to a query given a scoring function and an analyzer.
*
* @param reader index reader
* @param docids A list of docids of the documents to score
* @param q query
* @param similarity scoring function
* @param analyzer analyzer to use
* @return a map of document ids to their scores with respect to the query
* @throws IOException if error encountered during query
*/
public static Map<String, Float> batchComputeQueryDocumentScore(
IndexReader reader, List<String> docids, String q, Similarity similarity, Analyzer analyzer)
throws IOException {
return batchComputeQueryDocumentScoreWithSimilarityAndAnalyzer(reader, docids, q, similarity, analyzer);
}


/**
* Computes the scores of a batch of documents with respect to a query given a scoring function and an analyzer.
*
* @param reader index reader
* @param docids A list of docids of the documents to score
* @param q query
* @param similarity scoring function
* @param analyzer analyzer to use
* @return a map of document ids to their scores with respect to the query
* @throws IOException if error encountered during query
*/
public static Map<String, Float> batchComputeQueryDocumentScoreWithSimilarityAndAnalyzer(
IndexReader reader, List<String> docids, String q, Similarity similarity, Analyzer analyzer)
throws IOException {
// We compute the query-document scores by issuing the query with additional filters that restrict
// consideration to the set of docids provided, and then returning the retrieval scores.

IndexSearcher searcher = new IndexSearcher(reader);
searcher.setSimilarity(similarity);

HashMap<String, Float> results = new HashMap<>();

Query query = new BagOfWordsQueryGenerator().buildQuery(IndexArgs.CONTENTS, analyzer, q);

BooleanQuery.Builder builder = new BooleanQuery.Builder();
for (String docid : docids) {
// Initialize every docid with a default score of 0.
results.put(docid, 0.0f);
Query filterQuery = new ConstantScoreQuery(new TermQuery(new Term(IndexArgs.ID, docid)));
builder.add(filterQuery, BooleanClause.Occur.SHOULD);
}
builder.add(query, BooleanClause.Occur.MUST);
Query finalQuery = builder.build();

TopDocs rs = searcher.search(finalQuery, docids.size());

for (int i = 0; i < rs.scoreDocs.length; i++) {
String docid = convertLuceneDocidToDocid(reader, rs.scoreDocs[i].doc);
// Subtract the 1.0 contributed by the ConstantScoreQuery filter.
float result = rs.scoreDocs[i].score - 1;
results.put(docid, result);
}

return results;
}

/**
* Converts a collection docid to a Lucene internal docid.
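As a quick illustration of the new API (not part of this PR's diff): a minimal, self-contained usage sketch. The index path, query string, and docids below are placeholders, and the three-argument overload falls back to the default BM25 parameters and analyzer as described above.

import io.anserini.index.IndexReaderUtils;
import org.apache.lucene.index.DirectoryReader;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.store.FSDirectory;

import java.nio.file.Paths;
import java.util.List;
import java.util.Map;

public class BatchScoreExample {
  public static void main(String[] args) throws Exception {
    // Placeholder path to an existing Anserini index.
    IndexReader reader = DirectoryReader.open(FSDirectory.open(Paths.get("indexes/sample-index")));

    // Score several candidate documents against one query in a single call;
    // this overload uses the default BM25 similarity and the default analyzer.
    List<String> docids = List.of("doc1", "doc2", "doc3");
    Map<String, Float> scores =
        IndexReaderUtils.batchComputeQueryDocumentScore(reader, docids, "text city");

    // Docids that don't match the query keep their default score of 0.0f.
    scores.forEach((docid, score) -> System.out.println(docid + "\t" + score));

    reader.close();
  }
}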
46 changes: 46 additions & 0 deletions src/test/java/io/anserini/index/IndexReaderUtilsTest.java
@@ -21,6 +21,7 @@
import io.anserini.analysis.DefaultEnglishAnalyzer;
import io.anserini.search.SearchArgs;
import io.anserini.search.SimpleSearcher;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.index.DirectoryReader;
import org.apache.lucene.index.FieldInfo;
import org.apache.lucene.index.IndexReader;
@@ -40,6 +41,7 @@

import java.io.ByteArrayOutputStream;
import java.io.PrintStream;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.Iterator;
import java.util.List;
@@ -535,6 +537,50 @@ public void testComputeQueryDocumentScore() throws Exception {
dir.close();
}

@Test
public void testBatchComputeQueryDocumentScore() throws Exception {
SimpleSearcher searcher1 = new SimpleSearcher(tempDir1.toString());
// Use an analyzer other than the default for the second searcher.
Analyzer stemAnalyzer = DefaultEnglishAnalyzer.newStemmingInstance("krovetz");
SimpleSearcher searcher2 = new SimpleSearcher(tempDir1.toString(), stemAnalyzer);
Directory dir = FSDirectory.open(tempDir1);
IndexReader reader = DirectoryReader.open(dir);
Similarity similarity = new BM25Similarity(0.9f, 0.4f);

// A bunch of test queries...
String[] queries = {"text city", "text", "city"};

for (String query: queries) {
SimpleSearcher.Result[] results1 = searcher1.search(query);

List<String> docids = new ArrayList<String>();
for (SimpleSearcher.Result result: results1){
docids.add(result.docid);
}

Map<String, Float> batchScore1 = IndexReaderUtils.batchComputeQueryDocumentScore(reader, docids, query, similarity);
for (SimpleSearcher.Result result: results1){
assertEquals(batchScore1.get(result.docid), result.score, 10e-5);
}

SimpleSearcher.Result[] results2 = searcher2.search(query);
Map<String, Float> batchScore2 = IndexReaderUtils.batchComputeQueryDocumentScore(reader, docids, query, similarity, stemAnalyzer);
for (SimpleSearcher.Result result: results2){
assertEquals(batchScore2.get(result.docid), result.score, 10e-5);
}

// This is hard-coded: doc3 isn't retrieved by any of the queries.
String fakeId = "doc3";
docids = List.of(fakeId);
Map<String, Float> batchScore = IndexReaderUtils.batchComputeQueryDocumentScore(
reader, docids, query, similarity);
assertEquals(0.0f, batchScore.get(fakeId), 10e-6);
}

reader.close();
dir.close();
}

@Test
public void testGetIndexStats() throws Exception {
Directory dir = FSDirectory.open(tempDir1);
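For reviewers unfamiliar with the scoring trick used in the implementation, here is a short standalone sketch (not part of the PR; it uses an in-memory Lucene index with made-up field names and documents) showing why 1 is subtracted: a ConstantScoreQuery contributes exactly 1.0 to each matching document's score, so subtracting 1 from the combined score recovers the score the content query would have produced on its own.

import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
import org.apache.lucene.document.StringField;
import org.apache.lucene.document.TextField;
import org.apache.lucene.index.DirectoryReader;
import org.apache.lucene.index.IndexWriter;
import org.apache.lucene.index.IndexWriterConfig;
import org.apache.lucene.index.Term;
import org.apache.lucene.search.BooleanClause;
import org.apache.lucene.search.BooleanQuery;
import org.apache.lucene.search.ConstantScoreQuery;
import org.apache.lucene.search.IndexSearcher;
import org.apache.lucene.search.Query;
import org.apache.lucene.search.TermQuery;
import org.apache.lucene.search.similarities.BM25Similarity;
import org.apache.lucene.store.ByteBuffersDirectory;

public class ConstantScoreTrickDemo {
  public static void main(String[] args) throws Exception {
    // Build a tiny in-memory index with one document.
    ByteBuffersDirectory dir = new ByteBuffersDirectory();
    IndexWriter writer = new IndexWriter(dir, new IndexWriterConfig(new StandardAnalyzer()));
    Document doc = new Document();
    doc.add(new StringField("id", "doc1", Field.Store.YES));
    doc.add(new TextField("contents", "some text about a city", Field.Store.YES));
    writer.addDocument(doc);
    writer.close();

    IndexSearcher searcher = new IndexSearcher(DirectoryReader.open(dir));
    searcher.setSimilarity(new BM25Similarity(0.9f, 0.4f));

    // Score of the content query on its own.
    Query contentQuery = new TermQuery(new Term("contents", "city"));
    float plainScore = searcher.search(contentQuery, 1).scoreDocs[0].score;

    // Same query combined with a constant-score docid clause, as in the batch method:
    // the SHOULD clause adds exactly 1.0 to the matching document's score.
    BooleanQuery combined = new BooleanQuery.Builder()
        .add(new ConstantScoreQuery(new TermQuery(new Term("id", "doc1"))), BooleanClause.Occur.SHOULD)
        .add(contentQuery, BooleanClause.Occur.MUST)
        .build();
    float combinedScore = searcher.search(combined, 1).scoreDocs[0].score;

    // combinedScore - 1 matches plainScore (up to floating-point error).
    System.out.println(plainScore + " == " + (combinedScore - 1));
  }
}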