From f8ba0b07d0993cd53334a99bfa24da4a70d8a6bb Mon Sep 17 00:00:00 2001 From: Habeeb Shopeju Date: Sat, 21 May 2022 01:00:33 +0100 Subject: [PATCH 1/4] Added batch equivalent of computeQueryDocumentScore --- .../io/anserini/index/IndexReaderUtils.java | 155 +++++++++++++++++- .../anserini/index/IndexReaderUtilsTest.java | 39 +++++ 2 files changed, 193 insertions(+), 1 deletion(-) diff --git a/src/main/java/io/anserini/index/IndexReaderUtils.java b/src/main/java/io/anserini/index/IndexReaderUtils.java index bdc6564759..1c1b093180 100755 --- a/src/main/java/io/anserini/index/IndexReaderUtils.java +++ b/src/main/java/io/anserini/index/IndexReaderUtils.java @@ -20,6 +20,8 @@ import io.anserini.search.SearchArgs; import io.anserini.search.query.BagOfWordsQueryGenerator; import io.anserini.search.query.PhraseQueryGenerator; +import org.apache.logging.log4j.LogManager; +import org.apache.logging.log4j.Logger; import org.apache.lucene.analysis.Analyzer; import org.apache.lucene.document.Document; import org.apache.lucene.index.DirectoryReader; @@ -59,12 +61,17 @@ import java.util.Iterator; import java.util.List; import java.util.Map; +import java.util.concurrent.ConcurrentHashMap; +import java.util.concurrent.Executors; +import java.util.concurrent.ThreadPoolExecutor; +import java.util.concurrent.TimeUnit; /** * Class containing a bunch of static helper methods for accessing a Lucene inverted index. * This class provides a lot of functionality that is exposed in Python via Pyserini. */ public class IndexReaderUtils { + private static final Logger LOG = LogManager.getLogger(IndexReaderUtils.class); /** * An individual posting in a postings list. Note that this class is used primarily for inspecting @@ -726,7 +733,153 @@ public static float computeQueryDocumentScoreWithSimilarityAndAnalyzer( return rs.scoreDocs.length == 0 ? 0 : rs.scoreDocs[0].score - 1; } - // TODO: Write a variant of computeQueryDocumentScore that takes a set of documents. + /** + * Computes the scores of a batch of documents with respect to a query given a scoring function and an analyzer. + * + * @param reader index reader + * @param docids A list of docids of the documents to score + * @param q query + * @param threads number of threads + * @return a map of document ids to their scores with respect to the query + * @throws IOException if error encountered during query + */ + public static Map batchComputeQueryDocumentScore( + IndexReader reader, List docids, String q, int threads) + throws IOException { + + SearchArgs args = new SearchArgs(); + return batchComputeQueryDocumentScoreWithSimilarityAndAnalyzer(reader, docids, q, + new BM25Similarity(Float.parseFloat(args.bm25_k1[0]), Float.parseFloat(args.bm25_b[0])), + IndexCollection.DEFAULT_ANALYZER, threads); + } + + + /** + * Computes the scores of a batch of documents with respect to a query given a scoring function and an analyzer. + * + * @param reader index reader + * @param docids A list of docids of the documents to score + * @param q query + * @param similarity scoring function + * @param threads number of threads + * @return a map of document ids to their scores with respect to the query + * @throws IOException if error encountered during query + */ + public static Map batchComputeQueryDocumentScore( + IndexReader reader, List docids, String q, Similarity similarity, int threads) + throws IOException { + + return batchComputeQueryDocumentScoreWithSimilarityAndAnalyzer(reader, docids, q, similarity, + IndexCollection.DEFAULT_ANALYZER, threads); + } + + + /** + * Computes the scores of a batch of documents with respect to a query given a scoring function and an analyzer. + * + * @param reader index reader + * @param docids A list of docids of the documents to score + * @param q query + * @param analyzer analyzer to use + * @param threads number of threads + * @return a map of document ids to their scores with respect to the query + * @throws IOException if error encountered during query + */ + public static Map batchComputeQueryDocumentScore( + IndexReader reader, List docids, String q, Analyzer analyzer, int threads) + throws IOException { + + SearchArgs args = new SearchArgs(); + return batchComputeQueryDocumentScoreWithSimilarityAndAnalyzer(reader, docids, q, + new BM25Similarity(Float.parseFloat(args.bm25_k1[0]), Float.parseFloat(args.bm25_b[0])), + analyzer, threads); + } + + + /** + * Computes the scores of a batch of documents with respect to a query given a scoring function and an analyzer. + * + * @param reader index reader + * @param docids A list of docids of the documents to score + * @param q query + * @param similarity scoring function + * @param analyzer analyzer to use + * @param threads number of threads + * @return a map of document ids to their scores with respect to the query + * @throws IOException if error encountered during query + */ + public static Map batchComputeQueryDocumentScore( + IndexReader reader, List docids, String q, Similarity similarity, Analyzer analyzer, int threads) + throws IOException { + return batchComputeQueryDocumentScoreWithSimilarityAndAnalyzer(reader, docids, q, similarity, analyzer, threads); + } + + + /** + * Computes the scores of a batch of documents with respect to a query given a scoring function and an analyzer. + * + * @param reader index reader + * @param docids A list of docids of the documents to score + * @param q query + * @param similarity scoring function + * @param analyzer analyzer to use + * @param threads number of threads + * @return a map of document ids to their scores with respect to the query + * @throws IOException if error encountered during query + */ + public static Map batchComputeQueryDocumentScoreWithSimilarityAndAnalyzer( + IndexReader reader, List docids, String q, Similarity similarity, Analyzer analyzer, int threads) + throws IOException { + // We compute the query-document score by issuing the query with an additional filter clause that restricts + // consideration to only the docid in question, and then returning the retrieval score. + // + // This implementation is inefficient, but as the advantage of using the existing Lucene similarity, which means + // that we don't need to copy the scoring function and keep it in sync wrt code updates. + + IndexSearcher searcher = new IndexSearcher(reader); + searcher.setSimilarity(similarity); + + ThreadPoolExecutor executor = (ThreadPoolExecutor) Executors.newFixedThreadPool(threads); + ConcurrentHashMap results = new ConcurrentHashMap<>(); + + for (String docid: docids) { + executor.execute(() -> { + try { + Query query = new BagOfWordsQueryGenerator().buildQuery(IndexArgs.CONTENTS, analyzer, q); + + Query filterQuery = new ConstantScoreQuery(new TermQuery(new Term(IndexArgs.ID, docid))); + BooleanQuery.Builder builder = new BooleanQuery.Builder(); + builder.add(filterQuery, BooleanClause.Occur.MUST); + builder.add(query, BooleanClause.Occur.MUST); + Query finalQuery = builder.build(); + + TopDocs rs = searcher.search(finalQuery, 1); + + // We want the score of the first (and only) hit, but remember to remove 1 for the ConstantScoreQuery. + // If we get zero results, indicates that term isn't found in the document. + float result = rs.scoreDocs.length == 0 ? 0 : rs.scoreDocs[0].score - 1; + results.put(docid, result); + } catch (Exception e){} + }); + } + + executor.shutdown(); + + try { + // Wait for existing tasks to terminate + while (!executor.awaitTermination(1, TimeUnit.MINUTES)) { + LOG.info(String.format("%.2f percent completed", + (double) executor.getCompletedTaskCount() / docids.size() * 100.0d)); + } + } catch (InterruptedException ie) { + // (Re-)Cancel if current thread also interrupted + executor.shutdownNow(); + // Preserve interrupt status + Thread.currentThread().interrupt(); + } + + return results; + } /** * Converts a collection docid to a Lucene internal docid. diff --git a/src/test/java/io/anserini/index/IndexReaderUtilsTest.java b/src/test/java/io/anserini/index/IndexReaderUtilsTest.java index a785235c40..520d00f80d 100755 --- a/src/test/java/io/anserini/index/IndexReaderUtilsTest.java +++ b/src/test/java/io/anserini/index/IndexReaderUtilsTest.java @@ -40,6 +40,7 @@ import java.io.ByteArrayOutputStream; import java.io.PrintStream; +import java.util.ArrayList; import java.util.HashMap; import java.util.Iterator; import java.util.List; @@ -535,6 +536,44 @@ public void testComputeQueryDocumentScore() throws Exception { dir.close(); } + @Test + public void testBatchComputeQueryDocumentScore() throws Exception { + SimpleSearcher searcher = new SimpleSearcher(tempDir1.toString()); + Directory dir = FSDirectory.open(tempDir1); + IndexReader reader = DirectoryReader.open(dir); + Similarity similarity = new BM25Similarity(0.9f, 0.4f); + + // A bunch of test queries... + String[] queries = {"text city", "text", "city"}; + + for (String query: queries) { + SimpleSearcher.Result[] results = searcher.search(query); + + // Strategy is to loop over the results, compute query-document score individually, and compare. + List docids = new ArrayList(); + for (SimpleSearcher.Result result: results){ + docids.add(result.docid); + } + + Map batchScore = IndexReaderUtils.batchComputeQueryDocumentScore(reader, docids, query, similarity, 2); + for (SimpleSearcher.Result result: results){ + assertEquals(batchScore.get(result.docid), result.score, 10e-5); + } + + + // This is hard coded - doc3 isn't retrieved by any of the queries. + String fakeId = "doc3"; + docids = List.of(fakeId); + batchScore = IndexReaderUtils.batchComputeQueryDocumentScore( + reader, docids, query, similarity, 2); + assertEquals(0.0f, batchScore.get(fakeId), 10e-6); + } + + reader.close(); + dir.close(); + } + + @Test public void testGetIndexStats() throws Exception { Directory dir = FSDirectory.open(tempDir1); From abdc1c3c9690bf6fa40343dc42701a2bf0b61180 Mon Sep 17 00:00:00 2001 From: Habeeb Shopeju Date: Sat, 21 May 2022 02:35:25 +0100 Subject: [PATCH 2/4] Added test for computing query score with non-default analyzer --- .../anserini/index/IndexReaderUtilsTest.java | 24 ++++++++++++------- 1 file changed, 16 insertions(+), 8 deletions(-) diff --git a/src/test/java/io/anserini/index/IndexReaderUtilsTest.java b/src/test/java/io/anserini/index/IndexReaderUtilsTest.java index 520d00f80d..836266cc29 100755 --- a/src/test/java/io/anserini/index/IndexReaderUtilsTest.java +++ b/src/test/java/io/anserini/index/IndexReaderUtilsTest.java @@ -21,6 +21,7 @@ import io.anserini.analysis.DefaultEnglishAnalyzer; import io.anserini.search.SearchArgs; import io.anserini.search.SimpleSearcher; +import org.apache.lucene.analysis.Analyzer; import org.apache.lucene.index.DirectoryReader; import org.apache.lucene.index.FieldInfo; import org.apache.lucene.index.IndexReader; @@ -538,7 +539,10 @@ public void testComputeQueryDocumentScore() throws Exception { @Test public void testBatchComputeQueryDocumentScore() throws Exception { - SimpleSearcher searcher = new SimpleSearcher(tempDir1.toString()); + SimpleSearcher searcher1 = new SimpleSearcher(tempDir1.toString()); + // Using analyzer asides the default for second searcher. + Analyzer stemAnalyzer = DefaultEnglishAnalyzer.newStemmingInstance("krovertz"); + SimpleSearcher searcher2 = new SimpleSearcher(tempDir1.toString(), stemAnalyzer); Directory dir = FSDirectory.open(tempDir1); IndexReader reader = DirectoryReader.open(dir); Similarity similarity = new BM25Similarity(0.9f, 0.4f); @@ -547,24 +551,28 @@ public void testBatchComputeQueryDocumentScore() throws Exception { String[] queries = {"text city", "text", "city"}; for (String query: queries) { - SimpleSearcher.Result[] results = searcher.search(query); + SimpleSearcher.Result[] results1 = searcher1.search(query); - // Strategy is to loop over the results, compute query-document score individually, and compare. List docids = new ArrayList(); - for (SimpleSearcher.Result result: results){ + for (SimpleSearcher.Result result: results1){ docids.add(result.docid); } - Map batchScore = IndexReaderUtils.batchComputeQueryDocumentScore(reader, docids, query, similarity, 2); - for (SimpleSearcher.Result result: results){ - assertEquals(batchScore.get(result.docid), result.score, 10e-5); + Map batchScore1 = IndexReaderUtils.batchComputeQueryDocumentScore(reader, docids, query, similarity, 2); + for (SimpleSearcher.Result result: results1){ + assertEquals(batchScore1.get(result.docid), result.score, 10e-5); } + SimpleSearcher.Result[] results2 = searcher2.search(query); + Map batchScore2 = IndexReaderUtils.batchComputeQueryDocumentScore(reader, docids, query, similarity, stemAnalyzer, 2); + for (SimpleSearcher.Result result: results2){ + assertEquals(batchScore2.get(result.docid), result.score, 10e-5); + } // This is hard coded - doc3 isn't retrieved by any of the queries. String fakeId = "doc3"; docids = List.of(fakeId); - batchScore = IndexReaderUtils.batchComputeQueryDocumentScore( + Map batchScore = IndexReaderUtils.batchComputeQueryDocumentScore( reader, docids, query, similarity, 2); assertEquals(0.0f, batchScore.get(fakeId), 10e-6); } From 9c716f99b468e9e5dc5bd871d72ca58a82b90aa1 Mon Sep 17 00:00:00 2001 From: Habeeb Shopeju Date: Fri, 3 Jun 2022 00:31:49 +0100 Subject: [PATCH 3/4] Changed batch implementation for compute query document score --- .../io/anserini/index/IndexReaderUtils.java | 59 ++++++------------- 1 file changed, 18 insertions(+), 41 deletions(-) diff --git a/src/main/java/io/anserini/index/IndexReaderUtils.java b/src/main/java/io/anserini/index/IndexReaderUtils.java index 1c1b093180..3a716edce3 100755 --- a/src/main/java/io/anserini/index/IndexReaderUtils.java +++ b/src/main/java/io/anserini/index/IndexReaderUtils.java @@ -61,10 +61,6 @@ import java.util.Iterator; import java.util.List; import java.util.Map; -import java.util.concurrent.ConcurrentHashMap; -import java.util.concurrent.Executors; -import java.util.concurrent.ThreadPoolExecutor; -import java.util.concurrent.TimeUnit; /** * Class containing a bunch of static helper methods for accessing a Lucene inverted index. @@ -830,52 +826,33 @@ public static Map batchComputeQueryDocumentScore( public static Map batchComputeQueryDocumentScoreWithSimilarityAndAnalyzer( IndexReader reader, List docids, String q, Similarity similarity, Analyzer analyzer, int threads) throws IOException { - // We compute the query-document score by issuing the query with an additional filter clause that restricts - // consideration to only the docid in question, and then returning the retrieval score. - // - // This implementation is inefficient, but as the advantage of using the existing Lucene similarity, which means - // that we don't need to copy the scoring function and keep it in sync wrt code updates. + // We compute the query-document score by issuing the query with additional filters that restricts + // consideration to the set of docids provided, and then returning the retrieval score. IndexSearcher searcher = new IndexSearcher(reader); searcher.setSimilarity(similarity); - ThreadPoolExecutor executor = (ThreadPoolExecutor) Executors.newFixedThreadPool(threads); - ConcurrentHashMap results = new ConcurrentHashMap<>(); - - for (String docid: docids) { - executor.execute(() -> { - try { - Query query = new BagOfWordsQueryGenerator().buildQuery(IndexArgs.CONTENTS, analyzer, q); - - Query filterQuery = new ConstantScoreQuery(new TermQuery(new Term(IndexArgs.ID, docid))); - BooleanQuery.Builder builder = new BooleanQuery.Builder(); - builder.add(filterQuery, BooleanClause.Occur.MUST); - builder.add(query, BooleanClause.Occur.MUST); - Query finalQuery = builder.build(); + HashMap results = new HashMap<>(); - TopDocs rs = searcher.search(finalQuery, 1); + Query query = new BagOfWordsQueryGenerator().buildQuery(IndexArgs.CONTENTS, analyzer, q); - // We want the score of the first (and only) hit, but remember to remove 1 for the ConstantScoreQuery. - // If we get zero results, indicates that term isn't found in the document. - float result = rs.scoreDocs.length == 0 ? 0 : rs.scoreDocs[0].score - 1; - results.put(docid, result); - } catch (Exception e){} - }); + BooleanQuery.Builder builder = new BooleanQuery.Builder(); + for (String docid: docids){ + // Setting default result value for all docids. + results.put(docid, 0.0f); + Query filterQuery = new ConstantScoreQuery(new TermQuery(new Term(IndexArgs.ID, docid))); + builder.add(filterQuery, BooleanClause.Occur.SHOULD); } + builder.add(query, BooleanClause.Occur.MUST); + Query finalQuery = builder.build(); - executor.shutdown(); + TopDocs rs = searcher.search(finalQuery, docids.size()); - try { - // Wait for existing tasks to terminate - while (!executor.awaitTermination(1, TimeUnit.MINUTES)) { - LOG.info(String.format("%.2f percent completed", - (double) executor.getCompletedTaskCount() / docids.size() * 100.0d)); - } - } catch (InterruptedException ie) { - // (Re-)Cancel if current thread also interrupted - executor.shutdownNow(); - // Preserve interrupt status - Thread.currentThread().interrupt(); + for (int i=0; i < rs.scoreDocs.length; i++){ + String docid = convertLuceneDocidToDocid(reader, rs.scoreDocs[i].doc); + // Removing 1 for the ConstantScoreQuery. + float result = rs.scoreDocs[i].score -1; + results.put(docid, result); } return results; From 3e06d73322e03d80869d7eba24c7ec2e3eaa90d8 Mon Sep 17 00:00:00 2001 From: Habeeb Shopeju Date: Fri, 3 Jun 2022 02:44:54 +0100 Subject: [PATCH 4/4] Removed thread parameters and arguments --- .../io/anserini/index/IndexReaderUtils.java | 23 ++++++++----------- .../anserini/index/IndexReaderUtilsTest.java | 7 +++--- 2 files changed, 12 insertions(+), 18 deletions(-) diff --git a/src/main/java/io/anserini/index/IndexReaderUtils.java b/src/main/java/io/anserini/index/IndexReaderUtils.java index 3a716edce3..c46b40064d 100755 --- a/src/main/java/io/anserini/index/IndexReaderUtils.java +++ b/src/main/java/io/anserini/index/IndexReaderUtils.java @@ -735,18 +735,17 @@ public static float computeQueryDocumentScoreWithSimilarityAndAnalyzer( * @param reader index reader * @param docids A list of docids of the documents to score * @param q query - * @param threads number of threads * @return a map of document ids to their scores with respect to the query * @throws IOException if error encountered during query */ public static Map batchComputeQueryDocumentScore( - IndexReader reader, List docids, String q, int threads) + IndexReader reader, List docids, String q) throws IOException { SearchArgs args = new SearchArgs(); return batchComputeQueryDocumentScoreWithSimilarityAndAnalyzer(reader, docids, q, new BM25Similarity(Float.parseFloat(args.bm25_k1[0]), Float.parseFloat(args.bm25_b[0])), - IndexCollection.DEFAULT_ANALYZER, threads); + IndexCollection.DEFAULT_ANALYZER); } @@ -757,16 +756,15 @@ public static Map batchComputeQueryDocumentScore( * @param docids A list of docids of the documents to score * @param q query * @param similarity scoring function - * @param threads number of threads * @return a map of document ids to their scores with respect to the query * @throws IOException if error encountered during query */ public static Map batchComputeQueryDocumentScore( - IndexReader reader, List docids, String q, Similarity similarity, int threads) + IndexReader reader, List docids, String q, Similarity similarity) throws IOException { return batchComputeQueryDocumentScoreWithSimilarityAndAnalyzer(reader, docids, q, similarity, - IndexCollection.DEFAULT_ANALYZER, threads); + IndexCollection.DEFAULT_ANALYZER); } @@ -777,18 +775,17 @@ public static Map batchComputeQueryDocumentScore( * @param docids A list of docids of the documents to score * @param q query * @param analyzer analyzer to use - * @param threads number of threads * @return a map of document ids to their scores with respect to the query * @throws IOException if error encountered during query */ public static Map batchComputeQueryDocumentScore( - IndexReader reader, List docids, String q, Analyzer analyzer, int threads) + IndexReader reader, List docids, String q, Analyzer analyzer) throws IOException { SearchArgs args = new SearchArgs(); return batchComputeQueryDocumentScoreWithSimilarityAndAnalyzer(reader, docids, q, new BM25Similarity(Float.parseFloat(args.bm25_k1[0]), Float.parseFloat(args.bm25_b[0])), - analyzer, threads); + analyzer); } @@ -800,14 +797,13 @@ public static Map batchComputeQueryDocumentScore( * @param q query * @param similarity scoring function * @param analyzer analyzer to use - * @param threads number of threads * @return a map of document ids to their scores with respect to the query * @throws IOException if error encountered during query */ public static Map batchComputeQueryDocumentScore( - IndexReader reader, List docids, String q, Similarity similarity, Analyzer analyzer, int threads) + IndexReader reader, List docids, String q, Similarity similarity, Analyzer analyzer) throws IOException { - return batchComputeQueryDocumentScoreWithSimilarityAndAnalyzer(reader, docids, q, similarity, analyzer, threads); + return batchComputeQueryDocumentScoreWithSimilarityAndAnalyzer(reader, docids, q, similarity, analyzer); } @@ -819,12 +815,11 @@ public static Map batchComputeQueryDocumentScore( * @param q query * @param similarity scoring function * @param analyzer analyzer to use - * @param threads number of threads * @return a map of document ids to their scores with respect to the query * @throws IOException if error encountered during query */ public static Map batchComputeQueryDocumentScoreWithSimilarityAndAnalyzer( - IndexReader reader, List docids, String q, Similarity similarity, Analyzer analyzer, int threads) + IndexReader reader, List docids, String q, Similarity similarity, Analyzer analyzer) throws IOException { // We compute the query-document score by issuing the query with additional filters that restricts // consideration to the set of docids provided, and then returning the retrieval score. diff --git a/src/test/java/io/anserini/index/IndexReaderUtilsTest.java b/src/test/java/io/anserini/index/IndexReaderUtilsTest.java index 836266cc29..c31236a6d5 100755 --- a/src/test/java/io/anserini/index/IndexReaderUtilsTest.java +++ b/src/test/java/io/anserini/index/IndexReaderUtilsTest.java @@ -558,13 +558,13 @@ public void testBatchComputeQueryDocumentScore() throws Exception { docids.add(result.docid); } - Map batchScore1 = IndexReaderUtils.batchComputeQueryDocumentScore(reader, docids, query, similarity, 2); + Map batchScore1 = IndexReaderUtils.batchComputeQueryDocumentScore(reader, docids, query, similarity); for (SimpleSearcher.Result result: results1){ assertEquals(batchScore1.get(result.docid), result.score, 10e-5); } SimpleSearcher.Result[] results2 = searcher2.search(query); - Map batchScore2 = IndexReaderUtils.batchComputeQueryDocumentScore(reader, docids, query, similarity, stemAnalyzer, 2); + Map batchScore2 = IndexReaderUtils.batchComputeQueryDocumentScore(reader, docids, query, similarity, stemAnalyzer); for (SimpleSearcher.Result result: results2){ assertEquals(batchScore2.get(result.docid), result.score, 10e-5); } @@ -573,7 +573,7 @@ public void testBatchComputeQueryDocumentScore() throws Exception { String fakeId = "doc3"; docids = List.of(fakeId); Map batchScore = IndexReaderUtils.batchComputeQueryDocumentScore( - reader, docids, query, similarity, 2); + reader, docids, query, similarity); assertEquals(0.0f, batchScore.get(fakeId), 10e-6); } @@ -581,7 +581,6 @@ public void testBatchComputeQueryDocumentScore() throws Exception { dir.close(); } - @Test public void testGetIndexStats() throws Exception { Directory dir = FSDirectory.open(tempDir1);