refactor: index collection per learning_package
As the get_all_collections API was refactored [1] to only support fetching
collections by learning_package, update the reindex function to fetch
collections per learning_package.

1. openedx/openedx-learning#216
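
In short: where the old code pulled every collection on the instance in one query (and paginated it), the reindexer now asks for each library's collections via its learning package. A minimal sketch of the call-site change, not the library's code; the import path is the one edx-platform uses for the openedx-learning authoring API, and the post-refactor get_collections signature is assumed per [1]:

    # Sketch only: assumes the openedx-learning authoring API import used by
    # edx-platform and the post-[1] signature of get_collections.
    from openedx_learning.api import authoring as authoring_api

    def collections_for_library(learning_package_id: int):
        # Before [1], callers fetched every collection on the instance:
        #   authoring_api.get_collections(enabled=True)
        # After [1], collections are scoped to one learning package:
        return authoring_api.get_collections(learning_package_id)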
navinkarkera committed Sep 4, 2024
1 parent ba883fa commit 0511d51
Showing 1 changed file with 38 additions and 43 deletions.
openedx/core/djangoapps/content/search/api.py
@@ -289,23 +289,29 @@ def rebuild_index(status_cb: Callable[[str], None] | None = None) -> None:

# Get the lists of libraries
status_cb("Counting libraries...")
-    lib_keys = [lib.library_key for lib in lib_api.ContentLibrary.objects.select_related('org').only('org', 'slug')]
+    lib_keys = [
+        (lib.library_key, lib.learning_package_id)
+        for lib in lib_api.ContentLibrary.objects.select_related(
+            'org',
+            'learning_package'
+        ).only(
+            'org',
+            'slug',
+            'learning_package_id'
+        )
+    ]
num_libraries = len(lib_keys)

# Get the list of courses
status_cb("Counting courses...")
num_courses = CourseOverview.objects.count()

-    # Get the list of collections
-    status_cb("Counting collections...")
-    num_collections = authoring_api.get_collections().count()
-
# Some counters so we can track our progress as indexing progresses:
-    num_contexts = num_courses + num_libraries + num_collections
+    num_contexts = num_courses + num_libraries
num_contexts_done = 0 # How many courses/libraries we've indexed
num_blocks_done = 0 # How many individual components/XBlocks we've indexed

status_cb(f"Found {num_courses} courses, {num_libraries} libraries and {num_collections} collections.")
status_cb(f"Found {num_courses} courses and {num_libraries} libraries.")
with _using_temp_index(status_cb) as temp_index_name:
############## Configure the index ##############

@@ -369,7 +375,26 @@ def rebuild_index(status_cb: Callable[[str], None] | None = None) -> None:
])

############## Libraries ##############
status_cb("Indexing libraries...")
status_cb("Indexing libraries and collections...")

+        def index_collection_batch(batch) -> list:
+            docs = []
+            for collection in batch:
+                try:
+                    doc = searchable_doc_for_collection(collection)
+                    # Uncomment the line below once collections are tagged.
+                    # doc.update(searchable_doc_tags(collection.id))
+                    docs.append(doc)
+                except Exception as err:  # pylint: disable=broad-except
+                    status_cb(f"Error indexing collection {collection}: {err}")
+
+            if docs:
+                try:
+                    # Add docs in one batch (usually faster than adding them one at a time):
+                    _wait_for_meili_task(client.index(temp_index_name).add_documents(docs))
+                except (TypeError, KeyError, MeilisearchError) as err:
+                    status_cb(f"Error indexing collection batch {batch}: {err}")
+            return docs

def index_library(lib_key: str) -> list:
docs = []
@@ -390,10 +415,13 @@ def index_library(lib_key: str) -> list:
status_cb(f"Error indexing library {lib_key}: {err}")
return docs

-        for lib_key in lib_keys:
+        for lib_key, learning_package_id in lib_keys:
status_cb(f"{num_contexts_done + 1}/{num_contexts}. Now indexing library {lib_key}")
lib_docs = index_library(lib_key)
num_blocks_done += len(lib_docs)
+            collections = authoring_api.get_collections(learning_package_id)
+            collection_docs = index_collection_batch(collections)
+            num_blocks_done += len(collection_docs)
num_contexts_done += 1

############## Courses ##############
@@ -430,40 +458,7 @@ def add_with_children(block):
num_contexts_done += 1
num_blocks_done += len(course_docs)

-        ############## Collections ##############
-        status_cb("Indexing collections...")
-
-        def index_collection_batch(batch, num_contexts_done) -> int:
-            docs = []
-            for collection in batch:
-                status_cb(
-                    f"{num_contexts_done + 1}/{num_contexts}. "
-                    f"Now indexing collection {collection.title} ({collection.id})"
-                )
-                try:
-                    doc = searchable_doc_for_collection(collection)
-                    # Uncomment below line once collections are tagged.
-                    # doc.update(searchable_doc_tags(collection.id))
-                    docs.append(doc)
-                except Exception as err:  # pylint: disable=broad-except
-                    status_cb(f"Error indexing collection {collection}: {err}")
-                finally:
-                    num_contexts_done += 1
-
-            if docs:
-                try:
-                    # Add docs in batch of 100 at once (usually faster than adding one at a time):
-                    _wait_for_meili_task(client.index(temp_index_name).add_documents(docs))
-                except (TypeError, KeyError, MeilisearchError) as err:
-                    status_cb(f"Error indexing collection batch {p}: {err}")
-            return num_contexts_done
-
-        # To reduce memory usage on large instances, split up the Collections into pages of 100 collections:
-        paginator = Paginator(authoring_api.get_collections(enabled=True), 100)
-        for p in paginator.page_range:
-            num_contexts_done = index_collection_batch(paginator.page(p).object_list, num_contexts_done)

status_cb(f"Done! {num_blocks_done} blocks indexed across {num_contexts_done} courses, collections and libraries.")
status_cb(f"Done! {num_blocks_done} blocks indexed across {num_contexts_done} courses and collections.")


def upsert_xblock_index_doc(usage_key: UsageKey, recursive: bool = True) -> None:
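For reference, the rewritten path can be exercised directly; a minimal sketch using rebuild_index's signature from the diff above (in practice a Studio management command drives a full rebuild — the exact command name is not shown in this commit):

    # Sketch: drive a full rebuild with print() as the status callback, matching
    # rebuild_index(status_cb: Callable[[str], None] | None = None) from the diff.
    from openedx.core.djangoapps.content.search import api as search_api

    search_api.rebuild_index(status_cb=print)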
