Skip to content

Commit

Permalink
Add range request support to the zipcontent endpoint.
Browse files Browse the repository at this point in the history
  • Loading branch information
rtibbles committed Jan 13, 2025
1 parent ad48dd8 commit 33bdb33
Show file tree
Hide file tree
Showing 2 changed files with 212 additions and 16 deletions.
105 changes: 105 additions & 0 deletions kolibri/core/content/test/test_zipcontent.py
Original file line number Diff line number Diff line change
Expand Up @@ -279,6 +279,111 @@ def test_delete_not_allowed(self):
response = self._get_file(self.test_name_1, REQUEST_METHOD="DELETE")
self.assertEqual(response.status_code, 405)

def test_range_request_full_file(self):
"""Ensure normal request works with Accept-Ranges header"""
response = self._get_file(self.test_name_1)
self.assertEqual(next(response.streaming_content).decode(), self.test_str_1)
self.assertEqual(response.headers["Accept-Ranges"], "bytes")
self.assertEqual(response.status_code, 200)

def test_range_request_partial_file(self):
"""Test successful range request for partial file"""
response = self._get_file(self.test_name_1, HTTP_RANGE="bytes=2-5")
self.assertEqual(
next(response.streaming_content).decode(), self.test_str_1[2:6]
)
self.assertEqual(response.status_code, 206)
self.assertEqual(
response.headers["Content-Range"], f"bytes 2-5/{len(self.test_str_1)}"
)
self.assertEqual(response.headers["Content-Length"], "4")
self.assertEqual(response.headers["Accept-Ranges"], "bytes")

def test_range_request_end_of_file(self):
"""Test range request for last few bytes of file"""
response = self._get_file(self.test_name_1, HTTP_RANGE="bytes=-4")
self.assertEqual(
next(response.streaming_content).decode(), self.test_str_1[-4:]
)
self.assertEqual(response.status_code, 206)

def test_range_request_beyond_eof(self):
"""Test range request with start beyond file size"""
response = self._get_file(
self.test_name_1,
HTTP_RANGE=f"bytes={len(self.test_str_1) + 1}-{len(self.test_str_1) + 4}",
)
self.assertEqual(response.status_code, 200) # Should return full file
self.assertEqual(next(response.streaming_content).decode(), self.test_str_1)

def test_range_request_malformed(self):
"""Test malformed range header"""
response = self._get_file(self.test_name_1, HTTP_RANGE="bytes=invalid")
self.assertEqual(response.status_code, 200) # Should return full file
self.assertEqual(next(response.streaming_content).decode(), self.test_str_1)

def test_range_request_multiple_ranges(self):
"""Test multiple ranges - should return full file as we don't support multipart responses"""
response = self._get_file(self.test_name_1, HTTP_RANGE="bytes=0-2,4-6")
self.assertEqual(response.status_code, 200) # Should return full file
self.assertEqual(next(response.streaming_content).decode(), self.test_str_1)

def test_range_request_html_file(self):
"""Test range requests on HTML files that get modified - should return full file"""
response = self._get_file(self.script_name, HTTP_RANGE="bytes=0-10")
# Should return full modified file, not range
content = (
"<html><head>{}<script>test</script></head><body></body></html>".format(
hashi_injection
)
)
self.assertEqual(response.status_code, 200)
self.assertEqual(response.content.decode("utf-8"), content)

def test_range_request_large_file(self):
"""Test range request on a larger file to verify streaming"""
large_str = "Large text file " * 1024 # ~16KB file
large_file = "large.txt"

with zipfile.ZipFile(self.zip_path, "a") as zf:
zf.writestr(large_file, large_str)

# Request middle section of file
start = 1024
end = 2048
response = self._get_file(large_file, HTTP_RANGE=f"bytes={start}-{end}")

content = next(response.streaming_content).decode()
self.assertEqual(content, large_str[start : end + 1])
self.assertEqual(response.status_code, 206)
self.assertEqual(response.headers["Content-Length"], str(end - start + 1))

def test_options_request_accept_ranges_html(self):
"""Test OPTIONS request for HTML file returns Accept-Ranges: none"""
response = self._get_file(self.script_name, REQUEST_METHOD="OPTIONS")
self.assertEqual(response.headers["Accept-Ranges"], "none")
self.assertEqual(response.status_code, 200)
self.assertEqual(response.content.decode(), "")

def test_options_request_accept_ranges_binary(self):
"""Test OPTIONS request for non-HTML file returns Accept-Ranges: bytes"""
response = self._get_file(
self.test_name_1, REQUEST_METHOD="OPTIONS" # This is a .txt file
)
self.assertEqual(response.headers["Accept-Ranges"], "bytes")
self.assertEqual(response.status_code, 200)
self.assertEqual(response.content.decode(), "")

def test_accept_ranges_header_html(self):
"""Test Accept-Ranges header is 'none' for HTML files"""
response = self._get_file(self.script_name) # HTML file
self.assertEqual(response.headers["Accept-Ranges"], "none")

def test_accept_ranges_header_binary(self):
"""Test Accept-Ranges header is 'bytes' for non-HTML files"""
response = self._get_file(self.test_name_1) # txt file
self.assertEqual(response.headers["Accept-Ranges"], "bytes")


@override_option("Deployment", "ZIP_CONTENT_URL_PATH_PREFIX", "prefix_test/")
class UrlPrefixZipContentTestCase(ZipContentTestCase):
Expand Down
123 changes: 107 additions & 16 deletions kolibri/core/content/zip_wsgi.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,7 @@
import mimetypes
import os
import re
import sys
import time
import zipfile
from urllib.parse import unquote
Expand All @@ -20,6 +21,7 @@
from django.utils.cache import patch_response_headers
from django.utils.encoding import force_str
from django.utils.http import http_date
from whitenoise.responders import StaticFile

from kolibri.core.content.errors import InvalidStorageFilenameError
from kolibri.core.content.utils.paths import get_content_storage_file_path
Expand All @@ -32,6 +34,61 @@
logger = logging.getLogger(__name__)


def parse_byte_range(range_header, file_size):
"""Parse Range header using whitenoise's implementation"""
try:
start, end = StaticFile.parse_byte_range(range_header)
if start >= file_size:
# If start is beyond EOF, return None to trigger full file response
return None

if end is None:
end = file_size - 1
else:
end = min(end, file_size - 1)

return (start if start >= 0 else file_size + start, end - start + 1)
except ValueError:
return None


class RangeZipFileObjectWrapper:
"""
A wrapper for a zip file object that supports byte range requests.
This is implemented for compatibility with Python 3.6, which does not
support seeking in file objects extracted from zip files.
This can be removed once Python 3.6 support is dropped.
"""

def __init__(self, file_object, start=0, length=None):
self.file_object = file_object
self.remaining = length
# Python 3.7+ zipfile has seek support
if sys.version_info >= (3, 7):
self.file_object.seek(start)
else:
# Read and discard data until we reach start position
while start > 0:
chunk_size = min(start, 8192)
self.file_object.read(chunk_size)
start -= chunk_size

def __iter__(self):
return self

def __next__(self):
if self.remaining is not None and self.remaining <= 0:
raise StopIteration()
chunk = self.file_object.read(
min(8192, self.remaining if self.remaining is not None else 8192)
)
if not chunk:
raise StopIteration()
if self.remaining is not None:
self.remaining -= len(chunk)
return chunk


def add_security_headers(request, response):
response.headers["Access-Control-Allow-Origin"] = "*"
response.headers["Access-Control-Allow-Methods"] = "GET, OPTIONS"
Expand Down Expand Up @@ -127,7 +184,9 @@ def parse_html(content):
return content


def get_embedded_file(zipped_path, zipped_filename, embedded_filepath):
def get_embedded_file(
zipped_path, zipped_filename, embedded_filepath, range_header=None
):
with zipfile.ZipFile(zipped_path) as zf:
# if no path, or a directory, is being referenced, look for an index.html file
if not embedded_filepath or embedded_filepath.endswith("/"):
Expand All @@ -143,26 +202,51 @@ def get_embedded_file(zipped_path, zipped_filename, embedded_filepath):
)
)

# file size
file_size = 0

# try to guess the MIME type of the embedded file being referenced
content_type = (
mimetypes.guess_type(embedded_filepath)[0] or "application/octet-stream"
)
if embedded_filepath.endswith("htm") or embedded_filepath.endswith("html"):
content = zf.open(info).read()
zipped_file_object = zf.open(info)

is_html = embedded_filepath.lower().endswith(("html", "htm"))

if is_html:
content = zipped_file_object.read()
html = parse_html(content)
response = HttpResponse(html, content_type=content_type)
file_size = len(response.content)
else:
# generate a streaming response object, pulling data from within the zip file
response = FileResponse(zf.open(info), content_type=content_type)
status = 200
file_size = info.file_size
range_response_header = None

# handle byte-range requests
if range_header:
range_tuple = parse_byte_range(range_header, file_size)
if range_tuple:
start, length = range_tuple
zipped_file_object = RangeZipFileObjectWrapper(
zipped_file_object, start, length
)
status = 206
# Use the total file size of the object for the Content-Range header
range_response_header = (
f"bytes {start}-{start + length - 1}/{file_size}"
)
# Update the file size to the length of the requested range
file_size = length

response = FileResponse(
zipped_file_object, content_type=content_type, status=status
)
if range_response_header:
response.headers["Content-Range"] = range_response_header

# Only accept byte ranges for files that are not HTML
response.headers["Accept-Ranges"] = "none" if is_html else "bytes"
# set the content-length header to the size of the embedded file
if file_size:
response.headers["Content-Length"] = file_size
response.headers["Content-Length"] = file_size
return response


Expand Down Expand Up @@ -208,11 +292,17 @@ def _zip_content_from_request(request): # noqa: C901
"Path not found: {path}".format(path=request.path_info)
)

if request.method == "OPTIONS":
return HttpResponse()

remote_baseurl, zipped_filename, embedded_filepath = match.groups()

if request.method == "OPTIONS":
response = HttpResponse()
# If path ends with html/htm, set Accept-Ranges to none
if embedded_filepath.lower().endswith(("html", "htm")):
response.headers["Accept-Ranges"] = "none"
else:
response.headers["Accept-Ranges"] = "bytes"
return response

try:
# calculate the local file path to the zip file
zipped_path = get_content_storage_file_path(zipped_filename)
Expand Down Expand Up @@ -277,8 +367,12 @@ def _zip_content_from_request(request): # noqa: C901
if cached_response is not None:
return cached_response

range_header = request.META.get("HTTP_RANGE")

try:
response = get_embedded_file(zipped_path, zipped_filename, embedded_filepath)
response = get_embedded_file(
zipped_path, zipped_filename, embedded_filepath, range_header=range_header
)
except Exception:
if remote_baseurl:
return create_error_response(
Expand All @@ -288,9 +382,6 @@ def _zip_content_from_request(request): # noqa: C901
)
raise

# ensure the browser knows not to try byte-range requests, as we don't support them here
response.headers["Accept-Ranges"] = "none"

response.headers["Last-Modified"] = http_date(time.time())

patch_response_headers(response, cache_timeout=YEAR_IN_SECONDS)
Expand Down

0 comments on commit 33bdb33

Please sign in to comment.