Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Bszabo/course optimizer tests #36175

Merged
merged 2 commits into from
Jan 31, 2025
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
40 changes: 27 additions & 13 deletions cms/djangoapps/contentstore/tasks.py
Original file line number Diff line number Diff line change
Expand Up @@ -1204,31 +1204,43 @@ async def _validate_urls_access_in_batches(url_list, course_key, batch_size=100)

for i in range(0, url_count, batch_size):
batch = url_list[i:i + batch_size]
async with aiohttp.ClientSession() as session:
tasks = [_validate_url_access(session, url_data, course_key) for url_data in batch]
batch_results = await asyncio.gather(*tasks)
responses.extend(batch_results)
LOGGER.debug(f'[Link Check] request batch {i // batch_size + 1} of {url_count // batch_size + 1}')
batch_results = await _validate_batch(batch, course_key)
responses.extend(batch_results)
LOGGER.debug(f'[Link Check] request batch {i // batch_size + 1} of {url_count // batch_size + 1}')
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Should we leave this logging statement here?
Or should we update this logging statement to be recorded on datadog in stage and prod?

Copy link
Contributor Author

@bszabo bszabo Jan 28, 2025

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

@schenedx What's involved in having it be recorded on Datadog?


return responses


async def _validate_batch(batch, course_key):
async with aiohttp.ClientSession() as session:
tasks = [_validate_url_access(session, url_data, course_key) for url_data in batch]
batch_results = await asyncio.gather(*tasks)
return batch_results

def _retry_validation(url_list, course_key, retry_count=3):
"""Retry urls that failed due to connection error."""
"""Retry urls that failed due to connection error.
returns URLs that could not be validated either because locked, or because of persistent connection problems
"""
results = []
retry_list = url_list
for i in range(0, retry_count):
if retry_list:
LOGGER.debug(f'[Link Check] retry attempt #{i + 1}')
bszabo marked this conversation as resolved.
Show resolved Hide resolved
validated_url_list = asyncio.run(
_validate_urls_access_in_batches(retry_list, course_key, batch_size=100)
)
filetered_url_list, retry_list = _filter_by_status(validated_url_list)
results.extend(filetered_url_list)

retry_list = _retry_validation_and_filter(course_key, results, retry_list)
results.extend(retry_list)

return results


def _retry_validation_and_filter(course_key, results, retry_list):
validated_url_list = asyncio.run(
_validate_urls_access_in_batches(retry_list, course_key, batch_size=100)
)
filtered_url_list, retry_list = _filter_by_status(validated_url_list)
results.extend(filtered_url_list)
return retry_list


def _filter_by_status(results):
"""
Filter results by status.
Expand Down Expand Up @@ -1265,6 +1277,9 @@ def _write_broken_links_to_file(broken_or_locked_urls, broken_links_file):
json.dump(broken_or_locked_urls, file, indent=4)

def _check_broken_links(task_instance, user_id, course_key_string, language):
"""
Checks for broken links in a course. Store the results in a file.
"""
user = _validate_user(task_instance, user_id, language)

task_instance.status.set_state('Scanning')
Expand Down Expand Up @@ -1300,7 +1315,6 @@ def _check_broken_links(task_instance, user_id, course_key_string, language):
task_instance.status.fail({'raw_error_msg': str(e)})
return


@shared_task(base=CourseLinkCheckTask, bind=True)
def check_broken_links(self, user_id, course_key_string, language):
"""
Expand Down
225 changes: 171 additions & 54 deletions cms/djangoapps/contentstore/tests/test_tasks.py
Original file line number Diff line number Diff line change
@@ -1,32 +1,37 @@
"""
Unit tests for course import and export Celery tasks
"""


import asyncio
import copy
import json
import os
from tempfile import NamedTemporaryFile
from unittest import mock, TestCase
import logging
import pprint
from unittest import mock
from unittest.mock import AsyncMock, patch, MagicMock
from uuid import uuid4

import pytest as pytest
from django.conf import settings
from django.contrib.auth.models import User # lint-amnesty, pylint: disable=imported-auth-user
from django.test import TestCase
from django.test.utils import override_settings
from django.core.files import File
from edx_toggles.toggles.testutils import override_waffle_flag
from opaque_keys.edx.keys import CourseKey
from opaque_keys.edx.locator import CourseLocator
from organizations.models import OrganizationCourse
from organizations.tests.factories import OrganizationFactory
from user_tasks.models import UserTaskArtifact, UserTaskStatus

from cms.djangoapps.contentstore.tasks import (
logging = logging.getLogger(__name__)

from ..tasks import (
export_olx,
update_special_exams_and_publish,
rerun_course,
_convert_to_standard_url,
_validate_urls_access_in_batches,
_filter_by_status,
_get_urls,
_check_broken_links,
_is_studio_url,
_scan_course_for_links
Expand All @@ -37,7 +42,7 @@
from common.djangoapps.student.tests.factories import UserFactory
from openedx.core.djangoapps.course_apps.toggles import EXAMS_IDA
from openedx.core.djangoapps.embargo.models import Country, CountryAccessRule, RestrictedCourse
from xmodule.modulestore import ModuleStoreEnum # lint-amnesty, pylint: disable=wrong-import-order
from xmodule.modulestore import ModuleStoreEnum
from xmodule.modulestore.django import modulestore # lint-amnesty, pylint: disable=wrong-import-order
from xmodule.modulestore.tests.django_utils import TEST_DATA_SPLIT_MODULESTORE, ModuleStoreTestCase
from xmodule.modulestore.tests.factories import CourseFactory # lint-amnesty, pylint: disable=wrong-import-order
Expand Down Expand Up @@ -113,7 +118,6 @@ def _assert_failed(self, task_result, error_message):
self.assertEqual(error.name, 'Error')
self.assertEqual(error.text, error_message)


@override_settings(CONTENTSTORE=TEST_DATA_CONTENTSTORE)
class ExportLibraryTestCase(LibraryTestCase):
"""
Expand Down Expand Up @@ -221,6 +225,7 @@ class MockCourseLinkCheckTask(Task):
def __init__(self):
self.status = mock.Mock()

############## Course Optimizer tests ##############

class CheckBrokenLinksTaskTest(ModuleStoreTestCase):
def setUp(self):
Expand All @@ -239,7 +244,6 @@ def setUp(self):
]

@mock.patch('cms.djangoapps.contentstore.tasks.UserTaskArtifact', autospec=True)
@mock.patch('cms.djangoapps.contentstore.tasks.UserTaskStatus', autospec=True)
@mock.patch('cms.djangoapps.contentstore.tasks._scan_course_for_links')
@mock.patch('cms.djangoapps.contentstore.tasks._save_broken_links_file', autospec=True)
@mock.patch('cms.djangoapps.contentstore.tasks._write_broken_links_to_file', autospec=True)
Expand All @@ -248,15 +252,17 @@ def test_check_broken_links_stores_broken_and_locked_urls(
mock_write_broken_links_to_file,
mock_save_broken_links_file,
mock_scan_course_for_links,
_mock_user_task_status,
mock_user_task_artifact
):
'''
The test should verify that the check_broken_links task correctly
identifies and stores broken or locked URLs in the course.
The expected behavior is that the task scans the course,
validates the URLs, filters the results, and stores them in a
The test verifies that the check_broken_links task correctly
stores broken or locked URLs in the course.
The expected behavior is that the after scanning the course,
validating the URLs, and filtering the results, the task stores the results in a
JSON file.

Note that this test mocks all validation functions and therefore
does not test link validation or any of its support functions.
'''
# Arrange
mock_user = UserFactory.create(username='student', password='password')
Expand All @@ -277,17 +283,22 @@ def test_check_broken_links_stores_broken_and_locked_urls(
### Check that _save_broken_links_file was called with the correct arguments
mock_save_broken_links_file.assert_called_once_with(mock_user_task_artifact.return_value, mock.ANY)

def test_user_does_not_exist_raises_exception(self):
raise NotImplementedError

def test_no_course_access_raises_exception(self):
raise NotImplementedError

def test_hash_tags_stripped_from_url_lists(self):
raise NotImplementedError
NUM_HASH_TAG_LINES = 2
url_list = '''
href='#' # 1 of 2 lines that will be stripped
href='http://google.com'
src='#' # 2 of 2 lines that will be stripped
href='https://microsoft.com'
src="/static/resource_name"
'''

original_lines = len(url_list.splitlines()) - 2 # Correct for the two carriage returns surrounding the ''' marks
processed_url_list = _get_urls(url_list)
processed_lines = len(processed_url_list)

def test_urls_out_count_equals_urls_in_count_when_no_hashtags(self):
raise NotImplementedError
assert processed_lines == original_lines - NUM_HASH_TAG_LINES, \
f'Processed URL list lines = {processed_lines}; expected {original_lines - 2}'

def test_http_url_not_recognized_as_studio_url_scheme(self):
self.assertFalse(_is_studio_url(f'http://www.google.com'))
Expand All @@ -307,18 +318,6 @@ def test_container_url_without_url_base_is_recognized_as_studio_url_scheme(self)
def test_slash_url_without_url_base_is_recognized_as_studio_url_scheme(self):
self.assertTrue(_is_studio_url(f'/static/test'))

@pytest.mark.parametrize("url, course_key, post_substitution_url",
["/static/anything_goes_here?raw", "1", "2"])
def test_url_substitution_on_static_prefixes(self, url, course_key, post_substitution_url):
with_substitution = _convert_to_standard_url(url, course_key)
assert with_substitution == post_substitution_url, f'{with_substitution} expected to be {post_substitution_url}'

def test_url_substitution_on_forward_slash_prefixes(self):
raise NotImplementedError

def test_url_subsitution_on_containers(self):
raise NotImplementedError

@mock.patch('cms.djangoapps.contentstore.tasks.ModuleStoreEnum', autospec=True)
@mock.patch('cms.djangoapps.contentstore.tasks.modulestore', autospec=True)
def test_course_scan_occurs_on_published_version(self, mock_modulestore, mock_module_store_enum):
Expand All @@ -340,30 +339,148 @@ def test_course_scan_occurs_on_published_version(self, mock_modulestore, mock_mo

@mock.patch('cms.djangoapps.contentstore.tasks._get_urls', autospec=True)
def test_number_of_scanned_blocks_equals_blocks_in_course(self, mock_get_urls):
"""_scan_course_for_links should call _get_urls once per block in course"""
mock_get_urls = mock.Mock()
result = self.store.get_items(self.test_course.id)
"""
_scan_course_for_links should call _get_urls once per block in course.
"""
expected_blocks = self.store.get_items(self.test_course.id)

_scan_course_for_links(self.test_course.id)
self.assertEqual(len(result), mock_get_urls.call_count)
self.assertEqual(len(expected_blocks), mock_get_urls.call_count)

def test_every_detected_link_is_validated(self):
raise NotImplementedError

def test_link_validation_is_batched(self):
raise NotImplementedError
@pytest.mark.asyncio
async def test_every_detected_link_is_validated(self):
'''
The call to _validate_urls_access_in_batches() should call _validate_batch() three times, once for each
of the three batches of length 2 in url_list. The lambda function supplied for _validate_batch will
simply return the set of urls fed to _validate_batch(), and _validate_urls_access_in_batches() will
aggregate these into a list identical to the original url_list.

def test_all_links_in_link_list_longer_than_batch_size_are_validated(self):
raise NotImplementedError
What this shows is that each url submitted to _validate_urls_access_in_batches() is ending up as an argument
to one of the generated _validate_batch() calls, and that no input URL is left unprocessed.
'''
url_list = ['1', '2', '3', '4', '5']
course_key = 'course-v1:edX+DemoX+Demo_Course'
batch_size = 2
with patch("cms.djangoapps.contentstore.tasks._validate_batch", new_callable=AsyncMock) as mock_validate_batch:
mock_validate_batch.side_effect = lambda x, y: x
validated_urls = await _validate_urls_access_in_batches(url_list, course_key, batch_size)
mock_validate_batch.assert_called()
assert mock_validate_batch.call_count == 3 # two full batches and one partial batch
assert validated_urls == url_list, \
f"List of validated urls {validated_urls} is not identical to sourced urls {url_list}"

@pytest.mark.asyncio
async def test_all_links_are_validated_with_batch_validation(self):
'''
Here the focus is not on batching, but rather that when validation occurs it does so on the intended
URL strings
'''
with patch("cms.djangoapps.contentstore.tasks._validate_url_access", new_callable=AsyncMock) as mock_validate:
mock_validate.return_value = {"status": 200}

url_list = ['1', '2', '3', '4', '5']
course_key = 'course-v1:edX+DemoX+Demo_Course'
batch_size=2
await _validate_urls_access_in_batches(url_list, course_key, batch_size)
args_list = mock_validate.call_args_list
urls = [call_args.args[1] for call_args in args_list] # The middle argument in each of the function calls
for i in range(1,len(url_list)+1):
assert str(i) in urls, f'{i} not supplied as a url for validation in batches function'

def test_no_retries_on_403_access_denied_links(self):
raise NotImplementedError
'''
No mocking required here. Will populate "filtering_input" with simulated results for link checks where
some links time out, some links receive 403 errors, and some receive 200 success. This test then
ensures that "_filter_by_status()" tallies the three categories as expected, and formats the result
as expected.
'''
url_list = ['1', '2', '3', '4', '5']
filtering_input = []
for i in range(1, len(url_list)+1): # Notch out one of the URLs, having it return a '403' status code
filtering_input.append(
{'block_id': f'block_{i}',
'url': str(i),
'status': 200},
)
filtering_input[2]['status'] = 403
filtering_input[3]['status'] = 500
filtering_input[4]['status'] = None

broken_or_locked_urls, retry_list = _filter_by_status(filtering_input)
assert len(broken_or_locked_urls) == 2 # The inputs with status = 403 and 500
assert len(retry_list) == 1 # The input with status = None
assert retry_list[0][1] == '5' # The only URL fit for a retry operation (status == None)

@patch("cms.djangoapps.contentstore.tasks._validate_user", return_value=MagicMock())
@patch("cms.djangoapps.contentstore.tasks._scan_course_for_links", return_value=["url1", "url2"])
@patch("cms.djangoapps.contentstore.tasks._validate_urls_access_in_batches",
return_value=[{"url": "url1", "status": "ok"}])
@patch("cms.djangoapps.contentstore.tasks._filter_by_status",
return_value=(["block_1", "url1", True], ["block_2", "url2"]))
@patch("cms.djangoapps.contentstore.tasks._retry_validation",
return_value=['block_2', 'url2'])
def test_check_broken_links_calls_expected_support_functions(self,
mock_retry_validation,
mock_filter,
mock_validate_urls,
mock_scan_course,
mock_validate_user):
# Parameters for the function
user_id = 1234
language = "en"
course_key_string = "course-v1:edX+DemoX+2025"

# Mocking self and status attributes for the test
class MockStatus:
def __init__(self):
self.state = "READY"

def set_state(self, state):
self.state = state

def increment_completed_steps(self):
pass

def fail(self, error_details):
self.state = "FAILED"

class MockSelf:
def __init__(self):
self.status = MockStatus()

mock_self = MockSelf()

_check_broken_links(mock_self, user_id, course_key_string, language)

# Prepare expected results based on mock settings
url_list = mock_scan_course.return_value
validated_url_list = mock_validate_urls.return_value
broken_or_locked_urls, retry_list = mock_filter.return_value
course_key = CourseKey.from_string(course_key_string)

if retry_list:
retry_results = mock_retry_validation.return_value
broken_or_locked_urls.extend(retry_results)

# Perform verifications
try:
mock_self.status.increment_completed_steps()
mock_retry_validation.assert_called_once_with(
mock_filter.return_value[1], course_key, retry_count=3
)
except Exception as e:
logging.exception("Error checking links for course %s", course_key_string, exc_info=True)
if mock_self.status.state != "FAILED":
mock_self.status.fail({"raw_error_msg": str(e)})
assert False, "Exception should not occur"

# Assertions to confirm patched calls were invoked
mock_validate_user.assert_called_once_with(mock_self, user_id, language)
mock_scan_course.assert_called_once_with(course_key)
mock_validate_urls.assert_called_once_with(url_list, course_key, batch_size=100)
mock_filter.assert_called_once_with(validated_url_list)
if retry_list:
mock_retry_validation.assert_called_once_with(retry_list, course_key, retry_count=3)

def test_retries_attempted_on_connection_errors(self):
raise NotImplementedError

def test_max_number_of_retries_is_respected(self):
raise NotImplementedError

def test_scan_generates_file_named_by_course_key(self):
raise NotImplementedError
Loading