From 33bdb33af081adf2259e68c8c08047eff3975e51 Mon Sep 17 00:00:00 2001 From: Richard Tibbles Date: Sun, 27 Oct 2024 09:01:01 -0700 Subject: [PATCH] Add range request support to the zipcontent endpoint. --- kolibri/core/content/test/test_zipcontent.py | 105 ++++++++++++++++ kolibri/core/content/zip_wsgi.py | 123 ++++++++++++++++--- 2 files changed, 212 insertions(+), 16 deletions(-) diff --git a/kolibri/core/content/test/test_zipcontent.py b/kolibri/core/content/test/test_zipcontent.py index becf0d5e993..8218be04b6d 100644 --- a/kolibri/core/content/test/test_zipcontent.py +++ b/kolibri/core/content/test/test_zipcontent.py @@ -279,6 +279,111 @@ def test_delete_not_allowed(self): response = self._get_file(self.test_name_1, REQUEST_METHOD="DELETE") self.assertEqual(response.status_code, 405) + def test_range_request_full_file(self): + """Ensure normal request works with Accept-Ranges header""" + response = self._get_file(self.test_name_1) + self.assertEqual(next(response.streaming_content).decode(), self.test_str_1) + self.assertEqual(response.headers["Accept-Ranges"], "bytes") + self.assertEqual(response.status_code, 200) + + def test_range_request_partial_file(self): + """Test successful range request for partial file""" + response = self._get_file(self.test_name_1, HTTP_RANGE="bytes=2-5") + self.assertEqual( + next(response.streaming_content).decode(), self.test_str_1[2:6] + ) + self.assertEqual(response.status_code, 206) + self.assertEqual( + response.headers["Content-Range"], f"bytes 2-5/{len(self.test_str_1)}" + ) + self.assertEqual(response.headers["Content-Length"], "4") + self.assertEqual(response.headers["Accept-Ranges"], "bytes") + + def test_range_request_end_of_file(self): + """Test range request for last few bytes of file""" + response = self._get_file(self.test_name_1, HTTP_RANGE="bytes=-4") + self.assertEqual( + next(response.streaming_content).decode(), self.test_str_1[-4:] + ) + self.assertEqual(response.status_code, 206) + + def test_range_request_beyond_eof(self): + """Test range request with start beyond file size""" + response = self._get_file( + self.test_name_1, + HTTP_RANGE=f"bytes={len(self.test_str_1) + 1}-{len(self.test_str_1) + 4}", + ) + self.assertEqual(response.status_code, 200) # Should return full file + self.assertEqual(next(response.streaming_content).decode(), self.test_str_1) + + def test_range_request_malformed(self): + """Test malformed range header""" + response = self._get_file(self.test_name_1, HTTP_RANGE="bytes=invalid") + self.assertEqual(response.status_code, 200) # Should return full file + self.assertEqual(next(response.streaming_content).decode(), self.test_str_1) + + def test_range_request_multiple_ranges(self): + """Test multiple ranges - should return full file as we don't support multipart responses""" + response = self._get_file(self.test_name_1, HTTP_RANGE="bytes=0-2,4-6") + self.assertEqual(response.status_code, 200) # Should return full file + self.assertEqual(next(response.streaming_content).decode(), self.test_str_1) + + def test_range_request_html_file(self): + """Test range requests on HTML files that get modified - should return full file""" + response = self._get_file(self.script_name, HTTP_RANGE="bytes=0-10") + # Should return full modified file, not range + content = ( + "{}".format( + hashi_injection + ) + ) + self.assertEqual(response.status_code, 200) + self.assertEqual(response.content.decode("utf-8"), content) + + def test_range_request_large_file(self): + """Test range request on a larger file to verify streaming""" + large_str = "Large text file " * 1024 # ~16KB file + large_file = "large.txt" + + with zipfile.ZipFile(self.zip_path, "a") as zf: + zf.writestr(large_file, large_str) + + # Request middle section of file + start = 1024 + end = 2048 + response = self._get_file(large_file, HTTP_RANGE=f"bytes={start}-{end}") + + content = next(response.streaming_content).decode() + self.assertEqual(content, large_str[start : end + 1]) + self.assertEqual(response.status_code, 206) + self.assertEqual(response.headers["Content-Length"], str(end - start + 1)) + + def test_options_request_accept_ranges_html(self): + """Test OPTIONS request for HTML file returns Accept-Ranges: none""" + response = self._get_file(self.script_name, REQUEST_METHOD="OPTIONS") + self.assertEqual(response.headers["Accept-Ranges"], "none") + self.assertEqual(response.status_code, 200) + self.assertEqual(response.content.decode(), "") + + def test_options_request_accept_ranges_binary(self): + """Test OPTIONS request for non-HTML file returns Accept-Ranges: bytes""" + response = self._get_file( + self.test_name_1, REQUEST_METHOD="OPTIONS" # This is a .txt file + ) + self.assertEqual(response.headers["Accept-Ranges"], "bytes") + self.assertEqual(response.status_code, 200) + self.assertEqual(response.content.decode(), "") + + def test_accept_ranges_header_html(self): + """Test Accept-Ranges header is 'none' for HTML files""" + response = self._get_file(self.script_name) # HTML file + self.assertEqual(response.headers["Accept-Ranges"], "none") + + def test_accept_ranges_header_binary(self): + """Test Accept-Ranges header is 'bytes' for non-HTML files""" + response = self._get_file(self.test_name_1) # txt file + self.assertEqual(response.headers["Accept-Ranges"], "bytes") + @override_option("Deployment", "ZIP_CONTENT_URL_PATH_PREFIX", "prefix_test/") class UrlPrefixZipContentTestCase(ZipContentTestCase): diff --git a/kolibri/core/content/zip_wsgi.py b/kolibri/core/content/zip_wsgi.py index 1411ac246f0..23127aae509 100644 --- a/kolibri/core/content/zip_wsgi.py +++ b/kolibri/core/content/zip_wsgi.py @@ -2,6 +2,7 @@ import mimetypes import os import re +import sys import time import zipfile from urllib.parse import unquote @@ -20,6 +21,7 @@ from django.utils.cache import patch_response_headers from django.utils.encoding import force_str from django.utils.http import http_date +from whitenoise.responders import StaticFile from kolibri.core.content.errors import InvalidStorageFilenameError from kolibri.core.content.utils.paths import get_content_storage_file_path @@ -32,6 +34,61 @@ logger = logging.getLogger(__name__) +def parse_byte_range(range_header, file_size): + """Parse Range header using whitenoise's implementation""" + try: + start, end = StaticFile.parse_byte_range(range_header) + if start >= file_size: + # If start is beyond EOF, return None to trigger full file response + return None + + if end is None: + end = file_size - 1 + else: + end = min(end, file_size - 1) + + return (start if start >= 0 else file_size + start, end - start + 1) + except ValueError: + return None + + +class RangeZipFileObjectWrapper: + """ + A wrapper for a zip file object that supports byte range requests. + This is implemented for compatibility with Python 3.6, which does not + support seeking in file objects extracted from zip files. + This can be removed once Python 3.6 support is dropped. + """ + + def __init__(self, file_object, start=0, length=None): + self.file_object = file_object + self.remaining = length + # Python 3.7+ zipfile has seek support + if sys.version_info >= (3, 7): + self.file_object.seek(start) + else: + # Read and discard data until we reach start position + while start > 0: + chunk_size = min(start, 8192) + self.file_object.read(chunk_size) + start -= chunk_size + + def __iter__(self): + return self + + def __next__(self): + if self.remaining is not None and self.remaining <= 0: + raise StopIteration() + chunk = self.file_object.read( + min(8192, self.remaining if self.remaining is not None else 8192) + ) + if not chunk: + raise StopIteration() + if self.remaining is not None: + self.remaining -= len(chunk) + return chunk + + def add_security_headers(request, response): response.headers["Access-Control-Allow-Origin"] = "*" response.headers["Access-Control-Allow-Methods"] = "GET, OPTIONS" @@ -127,7 +184,9 @@ def parse_html(content): return content -def get_embedded_file(zipped_path, zipped_filename, embedded_filepath): +def get_embedded_file( + zipped_path, zipped_filename, embedded_filepath, range_header=None +): with zipfile.ZipFile(zipped_path) as zf: # if no path, or a directory, is being referenced, look for an index.html file if not embedded_filepath or embedded_filepath.endswith("/"): @@ -143,26 +202,51 @@ def get_embedded_file(zipped_path, zipped_filename, embedded_filepath): ) ) - # file size - file_size = 0 - # try to guess the MIME type of the embedded file being referenced content_type = ( mimetypes.guess_type(embedded_filepath)[0] or "application/octet-stream" ) - if embedded_filepath.endswith("htm") or embedded_filepath.endswith("html"): - content = zf.open(info).read() + zipped_file_object = zf.open(info) + + is_html = embedded_filepath.lower().endswith(("html", "htm")) + + if is_html: + content = zipped_file_object.read() html = parse_html(content) response = HttpResponse(html, content_type=content_type) file_size = len(response.content) else: # generate a streaming response object, pulling data from within the zip file - response = FileResponse(zf.open(info), content_type=content_type) + status = 200 file_size = info.file_size + range_response_header = None + + # handle byte-range requests + if range_header: + range_tuple = parse_byte_range(range_header, file_size) + if range_tuple: + start, length = range_tuple + zipped_file_object = RangeZipFileObjectWrapper( + zipped_file_object, start, length + ) + status = 206 + # Use the total file size of the object for the Content-Range header + range_response_header = ( + f"bytes {start}-{start + length - 1}/{file_size}" + ) + # Update the file size to the length of the requested range + file_size = length + response = FileResponse( + zipped_file_object, content_type=content_type, status=status + ) + if range_response_header: + response.headers["Content-Range"] = range_response_header + + # Only accept byte ranges for files that are not HTML + response.headers["Accept-Ranges"] = "none" if is_html else "bytes" # set the content-length header to the size of the embedded file - if file_size: - response.headers["Content-Length"] = file_size + response.headers["Content-Length"] = file_size return response @@ -208,11 +292,17 @@ def _zip_content_from_request(request): # noqa: C901 "Path not found: {path}".format(path=request.path_info) ) - if request.method == "OPTIONS": - return HttpResponse() - remote_baseurl, zipped_filename, embedded_filepath = match.groups() + if request.method == "OPTIONS": + response = HttpResponse() + # If path ends with html/htm, set Accept-Ranges to none + if embedded_filepath.lower().endswith(("html", "htm")): + response.headers["Accept-Ranges"] = "none" + else: + response.headers["Accept-Ranges"] = "bytes" + return response + try: # calculate the local file path to the zip file zipped_path = get_content_storage_file_path(zipped_filename) @@ -277,8 +367,12 @@ def _zip_content_from_request(request): # noqa: C901 if cached_response is not None: return cached_response + range_header = request.META.get("HTTP_RANGE") + try: - response = get_embedded_file(zipped_path, zipped_filename, embedded_filepath) + response = get_embedded_file( + zipped_path, zipped_filename, embedded_filepath, range_header=range_header + ) except Exception: if remote_baseurl: return create_error_response( @@ -288,9 +382,6 @@ def _zip_content_from_request(request): # noqa: C901 ) raise - # ensure the browser knows not to try byte-range requests, as we don't support them here - response.headers["Accept-Ranges"] = "none" - response.headers["Last-Modified"] = http_date(time.time()) patch_response_headers(response, cache_timeout=YEAR_IN_SECONDS)