From 33bdb33af081adf2259e68c8c08047eff3975e51 Mon Sep 17 00:00:00 2001
From: Richard Tibbles <richard@learningequality.org>
Date: Sun, 27 Oct 2024 09:01:01 -0700
Subject: [PATCH] Add range request support to the zipcontent endpoint.

---
 kolibri/core/content/test/test_zipcontent.py | 105 ++++++++++++++++
 kolibri/core/content/zip_wsgi.py             | 123 ++++++++++++++++---
 2 files changed, 212 insertions(+), 16 deletions(-)
diff --git a/kolibri/core/content/test/test_zipcontent.py b/kolibri/core/content/test/test_zipcontent.py
index becf0d5e993..8218be04b6d 100644
--- a/kolibri/core/content/test/test_zipcontent.py
+++ b/kolibri/core/content/test/test_zipcontent.py
@@ -279,6 +279,111 @@ def test_delete_not_allowed(self):
         response = self._get_file(self.test_name_1, REQUEST_METHOD="DELETE")
         self.assertEqual(response.status_code, 405)
 
+    def test_range_request_full_file(self):
+        """Ensure normal request works with Accept-Ranges header"""
+        response = self._get_file(self.test_name_1)
+        self.assertEqual(next(response.streaming_content).decode(), self.test_str_1)
+        self.assertEqual(response.headers["Accept-Ranges"], "bytes")
+        self.assertEqual(response.status_code, 200)
+
+    def test_range_request_partial_file(self):
+        """Test successful range request for partial file"""
+        response = self._get_file(self.test_name_1, HTTP_RANGE="bytes=2-5")
+        self.assertEqual(
+            next(response.streaming_content).decode(), self.test_str_1[2:6]
+        )
+        self.assertEqual(response.status_code, 206)
+        self.assertEqual(
+            response.headers["Content-Range"], f"bytes 2-5/{len(self.test_str_1)}"
+        )
+        self.assertEqual(response.headers["Content-Length"], "4")
+        self.assertEqual(response.headers["Accept-Ranges"], "bytes")
+
+    def test_range_request_end_of_file(self):
+        """Test range request for last few bytes of file"""
+        response = self._get_file(self.test_name_1, HTTP_RANGE="bytes=-4")
+        self.assertEqual(
+            next(response.streaming_content).decode(), self.test_str_1[-4:]
+        )
+        self.assertEqual(response.status_code, 206)
+
+    def test_range_request_beyond_eof(self):
+        """Test range request with start beyond file size"""
+        response = self._get_file(
+            self.test_name_1,
+            HTTP_RANGE=f"bytes={len(self.test_str_1) + 1}-{len(self.test_str_1) + 4}",
+        )
+        self.assertEqual(response.status_code, 200)  # Should return full file
+        self.assertEqual(next(response.streaming_content).decode(), self.test_str_1)
+
+    def test_range_request_malformed(self):
+        """Test malformed range header"""
+        response = self._get_file(self.test_name_1, HTTP_RANGE="bytes=invalid")
+        self.assertEqual(response.status_code, 200)  # Should return full file
+        self.assertEqual(next(response.streaming_content).decode(), self.test_str_1)
+
+    def test_range_request_multiple_ranges(self):
+        """Test multiple ranges - should return full file as we don't support multipart responses"""
+        response = self._get_file(self.test_name_1, HTTP_RANGE="bytes=0-2,4-6")
+        self.assertEqual(response.status_code, 200)  # Should return full file
+        self.assertEqual(next(response.streaming_content).decode(), self.test_str_1)
+
+    def test_range_request_html_file(self):
+        """Test range requests on HTML files that get modified - should return full file"""
+        response = self._get_file(self.script_name, HTTP_RANGE="bytes=0-10")
+        # Should return full modified file, not range
+        content = (
+            "<html><head>{}<script>test</script></head><body></body></html>".format(
+                hashi_injection
+            )
+        )
+        self.assertEqual(response.status_code, 200)
+        self.assertEqual(response.content.decode("utf-8"), content)
+
+    def test_range_request_large_file(self):
+        """Test range request on a larger file to verify streaming"""
+        large_str = "Large text file " * 1024  # ~16KB file
+        large_file = "large.txt"
+
+        with zipfile.ZipFile(self.zip_path, "a") as zf:
+            zf.writestr(large_file, large_str)
+
+        # Request middle section of file
+        start = 1024
+        end = 2048
+        response = self._get_file(large_file, HTTP_RANGE=f"bytes={start}-{end}")
+
+        content = next(response.streaming_content).decode()
+        self.assertEqual(content, large_str[start : end + 1])
+        self.assertEqual(response.status_code, 206)
+        self.assertEqual(response.headers["Content-Length"], str(end - start + 1))
+
+    def test_options_request_accept_ranges_html(self):
+        """Test OPTIONS request for HTML file returns Accept-Ranges: none"""
+        response = self._get_file(self.script_name, REQUEST_METHOD="OPTIONS")
+        self.assertEqual(response.headers["Accept-Ranges"], "none")
+        self.assertEqual(response.status_code, 200)
+        self.assertEqual(response.content.decode(), "")
+
+    def test_options_request_accept_ranges_binary(self):
+        """Test OPTIONS request for non-HTML file returns Accept-Ranges: bytes"""
+        response = self._get_file(
+            self.test_name_1, REQUEST_METHOD="OPTIONS"  # This is a .txt file
+        )
+        self.assertEqual(response.headers["Accept-Ranges"], "bytes")
+        self.assertEqual(response.status_code, 200)
+        self.assertEqual(response.content.decode(), "")
+
+    def test_accept_ranges_header_html(self):
+        """Test Accept-Ranges header is 'none' for HTML files"""
+        response = self._get_file(self.script_name)  # HTML file
+        self.assertEqual(response.headers["Accept-Ranges"], "none")
+
+    def test_accept_ranges_header_binary(self):
+        """Test Accept-Ranges header is 'bytes' for non-HTML files"""
+        response = self._get_file(self.test_name_1)  # txt file
+        self.assertEqual(response.headers["Accept-Ranges"], "bytes")
+
 
 @override_option("Deployment", "ZIP_CONTENT_URL_PATH_PREFIX", "prefix_test/")
 class UrlPrefixZipContentTestCase(ZipContentTestCase):
diff --git a/kolibri/core/content/zip_wsgi.py b/kolibri/core/content/zip_wsgi.py
index 1411ac246f0..23127aae509 100644
--- a/kolibri/core/content/zip_wsgi.py
+++ b/kolibri/core/content/zip_wsgi.py
@@ -2,6 +2,7 @@
 import mimetypes
 import os
 import re
+import sys
 import time
 import zipfile
 from urllib.parse import unquote
@@ -20,6 +21,7 @@
 from django.utils.cache import patch_response_headers
 from django.utils.encoding import force_str
 from django.utils.http import http_date
+from whitenoise.responders import StaticFile
 
 from kolibri.core.content.errors import InvalidStorageFilenameError
 from kolibri.core.content.utils.paths import get_content_storage_file_path
@@ -32,6 +34,61 @@
 logger = logging.getLogger(__name__)
 
 
+def parse_byte_range(range_header, file_size):
+    """Parse Range header using whitenoise's implementation"""
+    try:
+        start, end = StaticFile.parse_byte_range(range_header)
+        if start >= file_size:
+            # If start is beyond EOF, return None to trigger full file response
+            return None
+
+        if end is None:
+            end = file_size - 1
+        else:
+            end = min(end, file_size - 1)
+
+        return (start if start >= 0 else file_size + start, end - start + 1)
+    except ValueError:
+        return None
+
+
+class RangeZipFileObjectWrapper:
+    """
+    A wrapper for a zip file object that supports byte range requests.
+    This is implemented for compatibility with Python 3.6, which does not
+    support seeking in file objects extracted from zip files.
+    This can be removed once Python 3.6 support is dropped.
+    """
+
+    def __init__(self, file_object, start=0, length=None):
+        self.file_object = file_object
+        self.remaining = length
+        # Python 3.7+ zipfile has seek support
+        if sys.version_info >= (3, 7):
+            self.file_object.seek(start)
+        else:
+            # Read and discard data until we reach start position
+            while start > 0:
+                chunk_size = min(start, 8192)
+                self.file_object.read(chunk_size)
+                start -= chunk_size
+
+    def __iter__(self):
+        return self
+
+    def __next__(self):
+        if self.remaining is not None and self.remaining <= 0:
+            raise StopIteration()
+        chunk = self.file_object.read(
+            min(8192, self.remaining if self.remaining is not None else 8192)
+        )
+        if not chunk:
+            raise StopIteration()
+        if self.remaining is not None:
+            self.remaining -= len(chunk)
+        return chunk
+
+
 def add_security_headers(request, response):
     response.headers["Access-Control-Allow-Origin"] = "*"
     response.headers["Access-Control-Allow-Methods"] = "GET, OPTIONS"
@@ -127,7 +184,9 @@ def parse_html(content):
         return content
 
 
-def get_embedded_file(zipped_path, zipped_filename, embedded_filepath):
+def get_embedded_file(
+    zipped_path, zipped_filename, embedded_filepath, range_header=None
+):
     with zipfile.ZipFile(zipped_path) as zf:
         # if no path, or a directory, is being referenced, look for an index.html file
         if not embedded_filepath or embedded_filepath.endswith("/"):
@@ -143,26 +202,51 @@ def get_embedded_file(zipped_path, zipped_filename, embedded_filepath):
                 )
             )
 
-        # file size
-        file_size = 0
-
         # try to guess the MIME type of the embedded file being referenced
         content_type = (
             mimetypes.guess_type(embedded_filepath)[0] or "application/octet-stream"
         )
-        if embedded_filepath.endswith("htm") or embedded_filepath.endswith("html"):
-            content = zf.open(info).read()
+        zipped_file_object = zf.open(info)
+
+        is_html = embedded_filepath.lower().endswith(("html", "htm"))
+
+        if is_html:
+            content = zipped_file_object.read()
             html = parse_html(content)
             response = HttpResponse(html, content_type=content_type)
             file_size = len(response.content)
         else:
             # generate a streaming response object, pulling data from within the zip file
-            response = FileResponse(zf.open(info), content_type=content_type)
+            status = 200
             file_size = info.file_size
+            range_response_header = None
+
+            # handle byte-range requests
+            if range_header:
+                range_tuple = parse_byte_range(range_header, file_size)
+                if range_tuple:
+                    start, length = range_tuple
+                    zipped_file_object = RangeZipFileObjectWrapper(
+                        zipped_file_object, start, length
+                    )
+                    status = 206
+                    # Use the total file size of the object for the Content-Range header
+                    range_response_header = (
+                        f"bytes {start}-{start + length - 1}/{file_size}"
+                    )
+                    # Update the file size to the length of the requested range
+                    file_size = length
 
+            response = FileResponse(
+                zipped_file_object, content_type=content_type, status=status
+            )
+            if range_response_header:
+                response.headers["Content-Range"] = range_response_header
+
+        # Only accept byte ranges for files that are not HTML
+        response.headers["Accept-Ranges"] = "none" if is_html else "bytes"
         # set the content-length header to the size of the embedded file
-        if file_size:
-            response.headers["Content-Length"] = file_size
+        response.headers["Content-Length"] = file_size
         return response
 
 
@@ -208,11 +292,17 @@ def _zip_content_from_request(request):  # noqa: C901
             "Path not found: {path}".format(path=request.path_info)
         )
 
-    if request.method == "OPTIONS":
-        return HttpResponse()
-
     remote_baseurl, zipped_filename, embedded_filepath = match.groups()
 
+    if request.method == "OPTIONS":
+        response = HttpResponse()
+        # If path ends with html/htm, set Accept-Ranges to none
+        if embedded_filepath.lower().endswith(("html", "htm")):
+            response.headers["Accept-Ranges"] = "none"
+        else:
+            response.headers["Accept-Ranges"] = "bytes"
+        return response
+
     try:
         # calculate the local file path to the zip file
         zipped_path = get_content_storage_file_path(zipped_filename)
@@ -277,8 +367,12 @@ def _zip_content_from_request(request):  # noqa: C901
     if cached_response is not None:
         return cached_response
 
+    range_header = request.META.get("HTTP_RANGE")
+
     try:
-        response = get_embedded_file(zipped_path, zipped_filename, embedded_filepath)
+        response = get_embedded_file(
+            zipped_path, zipped_filename, embedded_filepath, range_header=range_header
+        )
     except Exception:
         if remote_baseurl:
             return create_error_response(
@@ -288,9 +382,6 @@ def _zip_content_from_request(request):  # noqa: C901
             )
         raise
 
-    # ensure the browser knows not to try byte-range requests, as we don't support them here
-    response.headers["Accept-Ranges"] = "none"
-
     response.headers["Last-Modified"] = http_date(time.time())
 
     patch_response_headers(response, cache_timeout=YEAR_IN_SECONDS)