refactor: index collection per learning_package
As the get_all_collections API was refactored [1] to only support fetching
collections by learning_package, update the reindex function to fetch
collections per learning_package.

1. openedx/openedx-learning#216
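
In short: where the old code pulled every collection on the instance in one query (and paginated it), the reindexer now asks for each library's collections via its learning package. A minimal sketch of the call-site change, not the library's code; the import path is the one edx-platform uses for the openedx-learning authoring API, and the post-refactor get_collections signature is assumed per [1]:

    # Sketch only: assumes the openedx-learning authoring API import used by
    # edx-platform and the post-[1] signature of get_collections.
    from openedx_learning.api import authoring as authoring_api

    def collections_for_library(learning_package_id: int):
        # Before [1], callers fetched every collection on the instance:
        #   authoring_api.get_collections(enabled=True)
        # After [1], collections are scoped to one learning package:
        return authoring_api.get_collections(learning_package_id)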
navinkarkera committed Sep 4, 2024
1 parent ba883fa commit 0511d51
Showing 1 changed file with 38 additions and 43 deletions.
openedx/core/djangoapps/content/search/api.py
@@ -289,23 +289,29 @@ def rebuild_index(status_cb: Callable[[str], None] | None = None) -> None:

# Get the lists of libraries
status_cb("Counting libraries...")
-    lib_keys = [lib.library_key for lib in lib_api.ContentLibrary.objects.select_related('org').only('org', 'slug')]
+    lib_keys = [
+        (lib.library_key, lib.learning_package_id)
+        for lib in lib_api.ContentLibrary.objects.select_related(
+            'org',
+            'learning_package'
+        ).only(
+            'org',
+            'slug',
+            'learning_package_id'
+        )
+    ]
num_libraries = len(lib_keys)

# Get the list of courses
status_cb("Counting courses...")
num_courses = CourseOverview.objects.count()

-    # Get the list of collections
-    status_cb("Counting collections...")
-    num_collections = authoring_api.get_collections().count()
-
# Some counters so we can track our progress as indexing progresses:
-    num_contexts = num_courses + num_libraries + num_collections
+    num_contexts = num_courses + num_libraries
num_contexts_done = 0 # How many courses/libraries we've indexed
num_blocks_done = 0 # How many individual components/XBlocks we've indexed

status_cb(f"Found {num_courses} courses, {num_libraries} libraries and {num_collections} collections.")
status_cb(f"Found {num_courses} courses and {num_libraries} libraries.")
with _using_temp_index(status_cb) as temp_index_name:
############## Configure the index ##############

@@ -369,7 +375,26 @@ def rebuild_index(status_cb: Callable[[str], None] | None = None) -> None:
])

############## Libraries ##############
status_cb("Indexing libraries...")
status_cb("Indexing libraries and collections...")

+        def index_collection_batch(batch) -> list:
+            docs = []
+            for collection in batch:
+                try:
+                    doc = searchable_doc_for_collection(collection)
+                    # Uncomment the line below once collections are tagged.
+                    # doc.update(searchable_doc_tags(collection.id))
+                    docs.append(doc)
+                except Exception as err:  # pylint: disable=broad-except
+                    status_cb(f"Error indexing collection {collection}: {err}")
+
+            if docs:
+                try:
+                    # Add docs in one batch (usually faster than adding them one at a time):
+                    _wait_for_meili_task(client.index(temp_index_name).add_documents(docs))
+                except (TypeError, KeyError, MeilisearchError) as err:
+                    status_cb(f"Error indexing collection batch {batch}: {err}")
+            return docs

def index_library(lib_key: str) -> list:
docs = []
@@ -390,10 +415,13 @@ def index_library(lib_key: str) -> list:
status_cb(f"Error indexing library {lib_key}: {err}")
return docs

-        for lib_key in lib_keys:
+        for lib_key, learning_package_id in lib_keys:
status_cb(f"{num_contexts_done + 1}/{num_contexts}. Now indexing library {lib_key}")
lib_docs = index_library(lib_key)
num_blocks_done += len(lib_docs)
+            collections = authoring_api.get_collections(learning_package_id)
+            collection_docs = index_collection_batch(collections)
+            num_blocks_done += len(collection_docs)
num_contexts_done += 1

############## Courses ##############
@@ -430,40 +458,7 @@ def add_with_children(block):
num_contexts_done += 1
num_blocks_done += len(course_docs)

-        ############## Collections ##############
-        status_cb("Indexing collections...")
-
-        def index_collection_batch(batch, num_contexts_done) -> int:
-            docs = []
-            for collection in batch:
-                status_cb(
-                    f"{num_contexts_done + 1}/{num_contexts}. "
-                    f"Now indexing collection {collection.title} ({collection.id})"
-                )
-                try:
-                    doc = searchable_doc_for_collection(collection)
-                    # Uncomment below line once collections are tagged.
-                    # doc.update(searchable_doc_tags(collection.id))
-                    docs.append(doc)
-                except Exception as err:  # pylint: disable=broad-except
-                    status_cb(f"Error indexing collection {collection}: {err}")
-                finally:
-                    num_contexts_done += 1
-
-            if docs:
-                try:
-                    # Add docs in batch of 100 at once (usually faster than adding one at a time):
-                    _wait_for_meili_task(client.index(temp_index_name).add_documents(docs))
-                except (TypeError, KeyError, MeilisearchError) as err:
-                    status_cb(f"Error indexing collection batch {p}: {err}")
-            return num_contexts_done
-
-        # To reduce memory usage on large instances, split up the Collections into pages of 100 collections:
-        paginator = Paginator(authoring_api.get_collections(enabled=True), 100)
-        for p in paginator.page_range:
-            num_contexts_done = index_collection_batch(paginator.page(p).object_list, num_contexts_done)

status_cb(f"Done! {num_blocks_done} blocks indexed across {num_contexts_done} courses, collections and libraries.")
status_cb(f"Done! {num_blocks_done} blocks indexed across {num_contexts_done} courses and collections.")


def upsert_xblock_index_doc(usage_key: UsageKey, recursive: bool = True) -> None:
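For reference, the rewritten path can be exercised directly; a minimal sketch using rebuild_index's signature from the diff above (in practice a Studio management command drives a full rebuild — the exact command name is not shown in this commit):

    # Sketch: drive a full rebuild with print() as the status callback, matching
    # rebuild_index(status_cb: Callable[[str], None] | None = None) from the diff.
    from openedx.core.djangoapps.content.search import api as search_api

    search_api.rebuild_index(status_cb=print)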
