Make BP work on indexes that have blocks. (#13125)
The current logic for reordering splits a slice of doc IDs into a left side and
a right side, and for each document it computes the expected gain of moving it to
the other side. It then swaps documents across the two sides as long as the sum of
the gains of moving the left doc to the right and the right doc to the left is
positive.
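
Schematically, the swap pass looks like this (a minimal sketch with hypothetical
names, not the actual Lucene code); both halves are sorted by descending gain, and
the top pairs are swapped while the combined gain stays positive:

    static void swapPass(int[] left, int[] right, float[] leftGains, float[] rightGains) {
      // leftGains[i] is the expected gain of moving left[i] to the right side, and
      // vice versa for rightGains[i]; both arrays are sorted in descending order.
      for (int i = 0; i < Math.min(left.length, right.length); ++i) {
        if (leftGains[i] + rightGains[i] <= 0) {
          break; // no later pair can have a positive combined gain
        }
        int tmp = left[i];
        left[i] = right[i];
        right[i] = tmp;
      }
    }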

This works well, but I would like to extend BP reordering to also work with
blocks, and the swapping logic is challenging to modify as two parent documents
may have different numbers of children.

One of the follow-up papers on BP suggested using different logic, where one
computes a bias for every document that is negative when the document is attracted
to the left and positive otherwise. Then we only need to partition doc IDs around
the midpoint, e.g. with quickselect.
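
As a minimal sketch (hypothetical helper, not the committed code; the real
implementation partitions in place with an IntroSelector instead of sorting):

    import java.util.stream.IntStream;

    // Order doc IDs by bias so that the midpoint separates documents attracted to
    // the left (negative bias) from documents attracted to the right (positive bias).
    static int[] orderByBias(int[] docs, float[] biases) {
      return IntStream.range(0, docs.length)
          .boxed()
          .sorted((i, j) -> Float.compare(biases[i], biases[j]))
          .mapToInt(i -> docs[i])
          .toArray();
    }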

A benefit of this change is that it will make it easier to generalize BP
reordering to indexes that have blocks, e.g. by using a stable sort on biases.
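
The diff below does this by summing biases per block; as a sketch (hypothetical
name, assuming the block occupies the slice [blockStart, blockEnd)):

    // Give every document in a block the block's summed bias. Block members then
    // share a bias and have consecutive doc IDs, so a sort that breaks ties by doc
    // ID keeps each block contiguous and in order while whole blocks move around.
    static void spreadBlockBias(float[] biases, int blockStart, int blockEnd) {
      float sum = 0;
      for (int i = blockStart; i < blockEnd; ++i) {
        sum += biases[i];
      }
      java.util.Arrays.fill(biases, blockStart, blockEnd, sum);
    }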
jpountz committed Mar 13, 2024
1 parent f6d7093 commit cb318af
Showing 3 changed files with 237 additions and 10 deletions.
4 changes: 3 additions & 1 deletion lucene/CHANGES.txt
@@ -12,7 +12,9 @@ API Changes

New Features
---------------------
(No changes)

* GITHUB#13125: Recursive graph bisection is now supported on indexes that have blocks, as long as
they configure a parent field via `IndexWriterConfig#setParentField`. (Adrien Grand)
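
  For reference, opting in looks like this (minimal sketch; `dir` and the block
  variable are hypothetical, the field name is arbitrary):

      IndexWriterConfig config = new IndexWriterConfig().setParentField("parent");
      try (IndexWriter writer = new IndexWriter(dir, config)) {
        writer.addDocuments(childDocsFollowedByParentDoc); // index blocks as usual
      }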

Improvements
---------------------
lucene/misc/src/java/org/apache/lucene/misc/index/BPIndexReorderer.java
@@ -26,6 +26,7 @@
import java.util.concurrent.RecursiveAction;
import org.apache.lucene.codecs.CodecUtil;
import org.apache.lucene.index.CodecReader;
import org.apache.lucene.index.DocValues;
import org.apache.lucene.index.FieldInfo;
import org.apache.lucene.index.IndexOptions;
import org.apache.lucene.index.IndexWriter;
@@ -46,10 +47,12 @@
import org.apache.lucene.store.RandomAccessInput;
import org.apache.lucene.store.TrackingDirectoryWrapper;
import org.apache.lucene.util.ArrayUtil;
import org.apache.lucene.util.BitSet;
import org.apache.lucene.util.BytesRef;
import org.apache.lucene.util.CloseableThreadLocal;
import org.apache.lucene.util.IOUtils;
import org.apache.lucene.util.IntroSelector;
import org.apache.lucene.util.IntroSorter;
import org.apache.lucene.util.IntsRef;
import org.apache.lucene.util.packed.PackedInts;

@@ -253,16 +256,19 @@ private class IndexReorderingTask extends BaseRecursiveAction {
private final IntsRef docIDs;
private final float[] biases;
private final CloseableThreadLocal<PerThreadState> threadLocal;
private final BitSet parents;

IndexReorderingTask(
IntsRef docIDs,
float[] biases,
CloseableThreadLocal<PerThreadState> threadLocal,
BitSet parents,
int depth) {
super(depth);
this.docIDs = docIDs;
this.biases = biases;
this.threadLocal = threadLocal;
this.parents = parents;
}

private void computeDocFreqs(IntsRef docs, ForwardIndex forwardIndex, int[] docFreqs) {
@@ -292,6 +298,7 @@ protected void compute() {
} else {
assert sorted(docIDs);
}
assert assertParentStructure();

int halfLength = docIDs.length / 2;
if (halfLength < minPartitionSize) {
@@ -315,7 +322,14 @@ protected void compute() {
try {
moved =
shuffle(
forwardIndex, docIDs, right.offset, leftDocFreqs, rightDocFreqs, biases, iter);
forwardIndex,
docIDs,
right.offset,
leftDocFreqs,
rightDocFreqs,
biases,
parents,
iter);
} catch (IOException e) {
throw new UncheckedIOException(e);
}
@@ -324,11 +338,33 @@ protected void compute() {
}
}

if (parents != null) {
// Make sure we split just after a parent doc
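// Worked example (hypothetical doc IDs): if the left half ends at doc 8 and
// doc 8's parent is doc 10, then parents.nextSetBit(8) - 8 == 2, so the split
// moves two slots to the right and docs 9 and 10 stay with their block.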
int lastLeftDocID = docIDs.ints[right.offset - 1];
int split = right.offset + parents.nextSetBit(lastLeftDocID) - lastLeftDocID;

if (split == docIDs.offset + docIDs.length) {
// No good split on the right side, look on the left side then.
split = right.offset - (lastLeftDocID - parents.prevSetBit(lastLeftDocID));
if (split == docIDs.offset) {
// No good split on the left side either: this slice has a single parent document, no
// reordering is possible. Stop recursing.
return;
}
}

assert parents.get(docIDs.ints[split - 1]);

left = new IntsRef(docIDs.ints, docIDs.offset, split - docIDs.offset);
right = new IntsRef(docIDs.ints, split, docIDs.offset + docIDs.length - split);
}

// It is fine for all tasks to share the same docs / biases array since they all work on
// different slices of the array at a given point in time.
IndexReorderingTask leftTask = new IndexReorderingTask(left, biases, threadLocal, depth + 1);
IndexReorderingTask leftTask =
new IndexReorderingTask(left, biases, threadLocal, parents, depth + 1);
IndexReorderingTask rightTask =
new IndexReorderingTask(right, biases, threadLocal, depth + 1);
new IndexReorderingTask(right, biases, threadLocal, parents, depth + 1);

if (shouldFork(docIDs.length, docIDs.ints.length)) {
invokeAll(leftTask, rightTask);
@@ -338,6 +374,28 @@ protected void compute() {
}
}

// used for asserts
private boolean assertParentStructure() {
if (parents == null) {
return true;
}
int i = docIDs.offset;
final int end = docIDs.offset + docIDs.length;
while (i < end) {
final int firstChild = docIDs.ints[i];
final int parent = parents.nextSetBit(firstChild);
final int numChildren = parent - firstChild;
assert i + numChildren < end;
for (int j = 1; j <= numChildren; ++j) {
assert docIDs.ints[i + j] == firstChild + j : "Parent structure has not been preserved";
}
i += numChildren + 1;
}
assert i == end : "Last doc ID must be a parent doc";

return true;
}

/**
* Shuffle doc IDs across both partitions so that each partition has lower gaps between
* consecutive postings.
@@ -349,6 +407,7 @@ private boolean shuffle(
int[] leftDocFreqs,
int[] rightDocFreqs,
float[] biases,
BitSet parents,
int iter)
throws IOException {

@@ -366,6 +425,23 @@
depth)
.compute();

if (parents != null) {
for (int i = docIDs.offset, end = docIDs.offset + docIDs.length; i < end; ) {
final int firstChild = docIDs.ints[i];
final int numChildren = parents.nextSetBit(firstChild) - firstChild;
assert parents.get(docIDs.ints[i + numChildren]);
double cumulativeBias = 0;
for (int j = 0; j <= numChildren; ++j) {
cumulativeBias += biases[i + j];
}
// Give all docs from the same block the same bias, which is the sum of biases of all
// documents in the block. This helps ensure that the follow-up sort() call preserves the
// block structure.
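// For example (hypothetical values): child biases 0.5 and -0.2 and a parent bias
// of 0.1 all become 0.4, so the sort cannot interleave this block with another.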
Arrays.fill(biases, i, i + numChildren + 1, (float) cumulativeBias);
i += numChildren + 1;
}
}

float maxLeftBias = Float.NEGATIVE_INFINITY;
for (int i = docIDs.offset; i < midPoint; ++i) {
maxLeftBias = Math.max(maxLeftBias, biases[i]);
@@ -383,19 +459,19 @@
return false;
}

new IntroSelector() {
class Selector extends IntroSelector {

int pivotDoc;
float pivotBias;

@Override
protected void setPivot(int i) {
public void setPivot(int i) {
pivotDoc = docIDs.ints[i];
pivotBias = biases[i];
}

@Override
protected int comparePivot(int j) {
public int comparePivot(int j) {
int cmp = Float.compare(pivotBias, biases[j]);
if (cmp == 0) {
// Tie break on the doc ID to preserve doc ID ordering as much as possible
@@ -405,7 +481,7 @@ protected int comparePivot(int j) {
}

@Override
protected void swap(int i, int j) {
public void swap(int i, int j) {
float tmpBias = biases[i];
biases[i] = biases[j];
biases[j] = tmpBias;
@@ -426,7 +502,32 @@ protected void swap(int i, int j) {
}
}
}
}.select(docIDs.offset, docIDs.offset + docIDs.length, midPoint);
}

Selector selector = new Selector();

if (parents == null) {
selector.select(docIDs.offset, docIDs.offset + docIDs.length, midPoint);
} else {
// When we have parents, we need to do a full sort to make sure we're not breaking the
// parent structure.
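// Ties on bias are broken by doc ID (see Selector#comparePivot above), and the
// documents of one block share a bias and have consecutive doc IDs, so the sort
// keeps every block contiguous and in its original order.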
new IntroSorter() {
@Override
protected void setPivot(int i) {
selector.setPivot(i);
}

@Override
protected int comparePivot(int j) {
return selector.comparePivot(j);
}

@Override
protected void swap(int i, int j) {
selector.swap(i, j);
}
}.sort(docIDs.offset, docIDs.offset + docIDs.length);
}

return true;
}
@@ -809,6 +910,12 @@ private int[] computePermutation(CodecReader reader, Set<String> fields, Directo
sortedDocs[i] = i;
}

BitSet parents = null;
String parentField = reader.getFieldInfos().getParentField();
if (parentField != null) {
parents = BitSet.of(DocValues.getNumeric(reader, parentField), maxDoc);
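// The parent field carries a doc value on parent documents only, and each parent
// is the last doc of its block, so this bit set marks where every block ends.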
}

try (CloseableThreadLocal<PerThreadState> threadLocal =
new CloseableThreadLocal<>() {
@Override
@@ -817,7 +924,8 @@ protected PerThreadState initialValue() {
}
}) {
IntsRef docs = new IntsRef(sortedDocs, 0, sortedDocs.length);
IndexReorderingTask task = new IndexReorderingTask(docs, new float[maxDoc], threadLocal, 0);
IndexReorderingTask task =
new IndexReorderingTask(docs, new float[maxDoc], threadLocal, parents, 0);
if (forkJoinPool != null) {
forkJoinPool.execute(task);
task.join();
lucene/misc/src/test/org/apache/lucene/misc/index/TestBPIndexReorderer.java
@@ -137,6 +137,123 @@ public void doTestSingleTerm(ForkJoinPool pool) throws IOException {
dir.close();
}

public void testSingleTermWithBlocks() throws IOException {
doTestSingleTermWithBlocks(null);
}

public void testSingleTermWithBlocksAndForkJoinPool() throws IOException {
int concurrency = TestUtil.nextInt(random(), 1, 8);
// The default ForkJoinPool implementation uses a thread factory that removes all permissions on
// threads, so we need to create our own to avoid tests failing with FS-based directories.
ForkJoinPool pool =
new ForkJoinPool(
concurrency, p -> new ForkJoinWorkerThread(p) {}, null, random().nextBoolean());
try {
doTestSingleTermWithBlocks(pool);
} finally {
pool.shutdown();
}
}

private void doTestSingleTermWithBlocks(ForkJoinPool pool) throws IOException {
Directory dir = newDirectory();
IndexWriter w =
new IndexWriter(
dir,
newIndexWriterConfig()
.setParentField("parent")
.setMergePolicy(newLogMergePolicy(random().nextBoolean())));

w.addDocuments(createBlock("1", "lucene", "search", "lucene")); // 0-2
w.addDocuments(createBlock("2", "lucene")); // 3
w.addDocuments(createBlock("3", "search", "lucene", "search")); // 4-6
w.addDocuments(createBlock("4", "lucene", "lucene", "search")); // 7-9
w.addDocuments(createBlock("5", "lucene", "lucene", "lucene", "lucene")); // 10-13
w.addDocuments(createBlock("6", "search", "search", "search")); // 14-16
w.addDocuments(createBlock("7", "search", "lucene", "search")); // 17-19
w.addDocuments(createBlock("8", "search", "search")); // 20-21

w.forceMerge(1);

DirectoryReader reader = DirectoryReader.open(w);
LeafReader leafReader = getOnlyLeafReader(reader);
CodecReader codecReader = SlowCodecReaderWrapper.wrap(leafReader);

BPIndexReorderer reorderer = new BPIndexReorderer();
reorderer.setForkJoinPool(pool);
reorderer.setMinDocFreq(2);
reorderer.setMinPartitionSize(1);
reorderer.setMaxIters(10);
CodecReader reordered = reorderer.reorder(codecReader, dir);
StoredFields storedFields = reordered.storedFields();

assertEquals("2", storedFields.document(0).get("id"));

assertEquals("1", storedFields.document(1).get("parent_id"));
assertEquals("0", storedFields.document(1).get("child_id"));
assertEquals("1", storedFields.document(2).get("parent_id"));
assertEquals("1", storedFields.document(2).get("child_id"));
assertEquals("1", storedFields.document(3).get("id"));

assertEquals("4", storedFields.document(4).get("parent_id"));
assertEquals("0", storedFields.document(4).get("child_id"));
assertEquals("4", storedFields.document(5).get("parent_id"));
assertEquals("1", storedFields.document(5).get("child_id"));
assertEquals("4", storedFields.document(6).get("id"));

assertEquals("5", storedFields.document(7).get("parent_id"));
assertEquals("0", storedFields.document(7).get("child_id"));
assertEquals("5", storedFields.document(8).get("parent_id"));
assertEquals("1", storedFields.document(8).get("child_id"));
assertEquals("5", storedFields.document(9).get("parent_id"));
assertEquals("2", storedFields.document(9).get("child_id"));
assertEquals("5", storedFields.document(10).get("id"));

assertEquals("3", storedFields.document(11).get("parent_id"));
assertEquals("0", storedFields.document(11).get("child_id"));
assertEquals("3", storedFields.document(12).get("parent_id"));
assertEquals("1", storedFields.document(12).get("child_id"));
assertEquals("3", storedFields.document(13).get("id"));

assertEquals("7", storedFields.document(14).get("parent_id"));
assertEquals("0", storedFields.document(14).get("child_id"));
assertEquals("7", storedFields.document(15).get("parent_id"));
assertEquals("1", storedFields.document(15).get("child_id"));
assertEquals("7", storedFields.document(16).get("id"));

assertEquals("8", storedFields.document(17).get("parent_id"));
assertEquals("0", storedFields.document(17).get("child_id"));
assertEquals("8", storedFields.document(18).get("id"));

assertEquals("6", storedFields.document(19).get("parent_id"));
assertEquals("0", storedFields.document(19).get("child_id"));
assertEquals("6", storedFields.document(20).get("parent_id"));
assertEquals("1", storedFields.document(20).get("child_id"));
assertEquals("6", storedFields.document(21).get("id"));

reader.close();
w.close();
dir.close();
}

private List<Document> createBlock(String parentID, String... values) {
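// All values but the last become child documents; the final value goes to the
// parent document, which must come last in the block.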
List<Document> docs = new ArrayList<>();
for (int i = 0; i < values.length - 1; ++i) {
Document doc = new Document();
doc.add(new StoredField("parent_id", parentID));
doc.add(new StoredField("child_id", Integer.toString(i)));
doc.add(new StringField("body", values[i], Store.NO));
docs.add(doc);
}

Document parentDoc = new Document();
parentDoc.add(new StoredField("id", parentID));
parentDoc.add(new StringField("body", values[values.length - 1], Store.NO));
docs.add(parentDoc);

return docs;
}

public void testMultiTerm() throws IOException {
Directory dir = newDirectory();
IndexWriter w =
