diff --git a/backend/btrixcloud/background_jobs.py b/backend/btrixcloud/background_jobs.py index 1e0da1e28f..507949604e 100644 --- a/backend/btrixcloud/background_jobs.py +++ b/backend/btrixcloud/background_jobs.py @@ -22,6 +22,7 @@ DeleteReplicaJob, DeleteOrgJob, RecalculateOrgStatsJob, + ReAddOrgPagesJob, PaginatedBackgroundJobResponse, AnyJob, StorageRef, @@ -301,8 +302,6 @@ async def create_delete_org_job( try: job_id = await self.crawl_manager.run_delete_org_job( oid=str(org.id), - backend_image=os.environ.get("BACKEND_IMAGE", ""), - pull_policy=os.environ.get("BACKEND_IMAGE_PULL_POLICY", ""), existing_job_id=existing_job_id, ) if existing_job_id: @@ -346,8 +345,6 @@ async def create_recalculate_org_stats_job( try: job_id = await self.crawl_manager.run_recalculate_org_stats_job( oid=str(org.id), - backend_image=os.environ.get("BACKEND_IMAGE", ""), - pull_policy=os.environ.get("BACKEND_IMAGE_PULL_POLICY", ""), existing_job_id=existing_job_id, ) if existing_job_id: @@ -381,6 +378,52 @@ async def create_recalculate_org_stats_job( print(f"warning: recalculate org stats job could not be started: {exc}") return None + async def create_re_add_org_pages_job( + self, + oid: UUID, + crawl_type: Optional[str] = None, + existing_job_id: Optional[str] = None, + ): + """Create job to (re)add all pages in an org, optionally filtered by crawl type""" + + try: + job_id = await self.crawl_manager.run_re_add_org_pages_job( + oid=str(oid), + crawl_type=crawl_type, + existing_job_id=existing_job_id, + ) + if existing_job_id: + readd_pages_job = await self.get_background_job(existing_job_id, oid) + previous_attempt = { + "started": readd_pages_job.started, + "finished": readd_pages_job.finished, + } + if readd_pages_job.previousAttempts: + readd_pages_job.previousAttempts.append(previous_attempt) + else: + readd_pages_job.previousAttempts = [previous_attempt] + readd_pages_job.started = dt_now() + readd_pages_job.finished = None + readd_pages_job.success = None + else: + readd_pages_job = ReAddOrgPagesJob( + id=job_id, + oid=oid, + crawl_type=crawl_type, + started=dt_now(), + ) + + await self.jobs.find_one_and_update( + {"_id": job_id}, {"$set": readd_pages_job.to_dict()}, upsert=True + ) + + return job_id + # pylint: disable=broad-exception-caught + except Exception as exc: + # pylint: disable=raise-missing-from + print(f"warning: re-add org pages job could not be started: {exc}") + return None + async def job_finished( self, job_id: str, @@ -430,7 +473,11 @@ async def job_finished( async def get_background_job( self, job_id: str, oid: Optional[UUID] = None ) -> Union[ - CreateReplicaJob, DeleteReplicaJob, DeleteOrgJob, RecalculateOrgStatsJob + CreateReplicaJob, + DeleteReplicaJob, + DeleteOrgJob, + RecalculateOrgStatsJob, + ReAddOrgPagesJob, ]: """Get background job""" query: dict[str, object] = {"_id": job_id} @@ -454,6 +501,9 @@ def _get_job_by_type_from_data(self, data: dict[str, object]): if data["type"] == BgJobType.RECALCULATE_ORG_STATS: return RecalculateOrgStatsJob.from_dict(data) + if data["type"] == BgJobType.READD_ORG_PAGES: + return ReAddOrgPagesJob.from_dict(data) + return DeleteOrgJob.from_dict(data) async def list_background_jobs( @@ -595,6 +645,13 @@ async def retry_background_job( existing_job_id=job_id, ) + if job.type == BgJobType.READD_ORG_PAGES: + await self.create_re_add_org_pages_job( + org.id, + job.crawl_type, + existing_job_id=job_id, + ) + return {"success": True} async def retry_failed_background_jobs( diff --git a/backend/btrixcloud/basecrawls.py 
b/backend/btrixcloud/basecrawls.py index d913d3362b..01e700f8b5 100644 --- a/backend/btrixcloud/basecrawls.py +++ b/backend/btrixcloud/basecrawls.py @@ -1,6 +1,5 @@ """ base crawl type """ -import os from datetime import timedelta from typing import Optional, List, Union, Dict, Any, Type, TYPE_CHECKING, cast, Tuple from uuid import UUID @@ -29,6 +28,7 @@ UpdatedResponse, DeletedResponseQuota, CrawlSearchValuesResponse, + PRESIGN_DURATION_SECONDS, ) from .pagination import paginated_format, DEFAULT_PAGE_SIZE from .utils import dt_now, date_to_str @@ -47,11 +47,6 @@ CrawlConfigOps = UserManager = OrgOps = CollectionOps = PageOps = object StorageOps = EventWebhookOps = BackgroundJobOps = object -# Presign duration must be less than 604800 seconds (one week), -# so set this one minute short of a week. -PRESIGN_MINUTES_MAX = 10079 -PRESIGN_MINUTES_DEFAULT = PRESIGN_MINUTES_MAX - # ============================================================================ # pylint: disable=too-many-instance-attributes, too-many-public-methods, too-many-lines @@ -93,16 +88,8 @@ def __init__( self.background_job_ops = background_job_ops self.page_ops = cast(PageOps, None) - presign_duration_minutes = int( - os.environ.get("PRESIGN_DURATION_MINUTES") or PRESIGN_MINUTES_DEFAULT - ) - - self.presign_duration_seconds = ( - min(presign_duration_minutes, PRESIGN_MINUTES_MAX) * 60 - ) - # renew when <25% of time remaining - self.expire_at_duration_seconds = int(self.presign_duration_seconds * 0.75) + self.expire_at_duration_seconds = int(PRESIGN_DURATION_SECONDS * 0.75) def set_page_ops(self, page_ops): """set page ops reference""" @@ -336,8 +323,9 @@ async def delete_crawls( status_code=400, detail=f"Error Stopping Crawl: {exc}" ) + await self.page_ops.delete_crawl_pages(crawl_id, org.id) + if type_ == "crawl": - await self.page_ops.delete_crawl_pages(crawl_id, org.id) await self.delete_all_crawl_qa_files(crawl_id, org) crawl_size = await self._delete_crawl_files(crawl, org) @@ -382,7 +370,7 @@ async def _delete_crawl_files( size = 0 for file_ in crawl.files: size += file_.size - if not await self.storage_ops.delete_crawl_file_object(org, file_): + if not await self.storage_ops.delete_file_object(org, file_): raise HTTPException(status_code=400, detail="file_deletion_error") # Not replicating QA run WACZs yet if not isinstance(crawl, QARun): @@ -474,7 +462,7 @@ async def resolve_signed_urls( ): exp = now + delta presigned_url = await self.storage_ops.get_presigned_url( - org, file_, self.presign_duration_seconds + org, file_, PRESIGN_DURATION_SECONDS ) prefix = "files" diff --git a/backend/btrixcloud/colls.py b/backend/btrixcloud/colls.py index d0fdd43a7c..8bc02e689a 100644 --- a/backend/btrixcloud/colls.py +++ b/backend/btrixcloud/colls.py @@ -2,14 +2,20 @@ Collections API """ +# pylint: disable=too-many-lines + from collections import Counter from uuid import UUID, uuid4 -from typing import Optional, List, TYPE_CHECKING, cast, Dict +from typing import Optional, List, TYPE_CHECKING, cast, Dict, Tuple, Any, Union +import os +import re +import urllib.parse import asyncio import pymongo from fastapi import Depends, HTTPException, Response from fastapi.responses import StreamingResponse +from starlette.requests import Request from .pagination import DEFAULT_PAGE_SIZE, paginated_format from .models import ( @@ -29,7 +35,21 @@ EmptyResponse, UpdatedResponse, SuccessResponse, + AddedResponse, + DeletedResponse, CollectionSearchValuesResponse, + OrgPublicCollections, + PublicOrgDetails, + CollAccessType, + PageUrlCount, + 
PageIdTimestamp, + PaginatedPageUrlCountResponse, + UpdateCollHomeUrl, + User, + ImageFile, + ImageFilePreparer, + MIN_UPLOAD_PART_SIZE, + PublicCollOut, ) from .utils import dt_now @@ -42,11 +62,14 @@ OrgOps = StorageOps = EventWebhookOps = CrawlOps = object +THUMBNAIL_MAX_SIZE = 2_000_000 + + # ============================================================================ class CollectionOps: """ops for working with named collections of crawls""" - # pylint: disable=too-many-arguments + # pylint: disable=too-many-arguments, too-many-instance-attributes, too-many-public-methods orgs: OrgOps storage_ops: StorageOps @@ -57,6 +80,7 @@ def __init__(self, mdb, storage_ops, orgs, event_webhook_ops): self.collections = mdb["collections"] self.crawls = mdb["crawls"] self.crawl_configs = mdb["crawl_configs"] + self.pages = mdb["pages"] self.crawl_ops = cast(CrawlOps, None) self.orgs = orgs @@ -67,6 +91,11 @@ def set_crawl_ops(self, ops): """set crawl ops""" self.crawl_ops = ops + def set_page_ops(self, ops): + """set page ops""" + # pylint: disable=attribute-defined-outside-init + self.page_ops = ops + async def init_index(self): """init lookup index""" await self.collections.create_index( @@ -88,8 +117,11 @@ async def add_collection(self, oid: UUID, coll_in: CollIn): oid=oid, name=coll_in.name, description=coll_in.description, + caption=coll_in.caption, modified=modified, - isPublic=coll_in.isPublic, + access=coll_in.access, + defaultThumbnailName=coll_in.defaultThumbnailName, + allowPublicDownload=coll_in.allowPublicDownload, ) try: await self.collections.insert_one(coll.to_dict()) @@ -97,6 +129,7 @@ async def add_collection(self, oid: UUID, coll_in: CollIn): if crawl_ids: await self.crawl_ops.add_to_collection(crawl_ids, coll_id, org) await self.update_collection_counts_and_tags(coll_id) + await self.update_collection_dates(coll_id) asyncio.create_task( self.event_webhook_ops.create_added_to_collection_notification( crawl_ids, coll_id, org @@ -150,6 +183,7 @@ async def add_crawls_to_collection( raise HTTPException(status_code=404, detail="collection_not_found") await self.update_collection_counts_and_tags(coll_id) + await self.update_collection_dates(coll_id) asyncio.create_task( self.event_webhook_ops.create_added_to_collection_notification( @@ -157,7 +191,7 @@ async def add_crawls_to_collection( ) ) - return await self.get_collection(coll_id, org) + return await self.get_collection_out(coll_id, org) async def remove_crawls_from_collection( self, coll_id: UUID, crawl_ids: List[str], org: Organization @@ -174,6 +208,7 @@ async def remove_crawls_from_collection( raise HTTPException(status_code=404, detail="collection_not_found") await self.update_collection_counts_and_tags(coll_id) + await self.update_collection_dates(coll_id) asyncio.create_task( self.event_webhook_ops.create_removed_from_collection_notification( @@ -181,51 +216,106 @@ async def remove_crawls_from_collection( ) ) - return await self.get_collection(coll_id, org) + return await self.get_collection_out(coll_id, org) - async def get_collection( - self, coll_id: UUID, org: Organization, resources=False, public_only=False - ) -> CollOut: - """Get collection by id""" + async def get_collection_raw( + self, coll_id: UUID, public_or_unlisted_only: bool = False + ) -> Dict[str, Any]: + """Get collection by id as dict from database""" query: dict[str, object] = {"_id": coll_id} - if public_only: - query["isPublic"] = True + if public_or_unlisted_only: + query["access"] = {"$in": ["public", "unlisted"]} result = await 
self.collections.find_one(query) if not result: raise HTTPException(status_code=404, detail="collection_not_found") + return result + + async def get_collection( + self, coll_id: UUID, public_or_unlisted_only: bool = False + ) -> Collection: + """Get collection by id""" + result = await self.get_collection_raw(coll_id, public_or_unlisted_only) + return Collection.from_dict(result) + + async def get_collection_out( + self, + coll_id: UUID, + org: Organization, + resources=False, + public_or_unlisted_only=False, + ) -> CollOut: + """Get CollOut by id""" + result = await self.get_collection_raw(coll_id, public_or_unlisted_only) + if resources: - result["resources"] = await self.get_collection_crawl_resources( - coll_id, org + result["resources"] = await self.get_collection_crawl_resources(coll_id) + + thumbnail = result.get("thumbnail") + if thumbnail: + image_file = ImageFile(**thumbnail) + result["thumbnail"] = await image_file.get_image_file_out( + org, self.storage_ops ) + return CollOut.from_dict(result) + async def get_public_collection_out( + self, coll_id: UUID, org: Organization, allow_unlisted: bool = False + ) -> PublicCollOut: + """Get PublicCollOut by id""" + result = await self.get_collection_raw(coll_id) + + allowed_access = [CollAccessType.PUBLIC] + if allow_unlisted: + allowed_access.append(CollAccessType.UNLISTED) + + if result.get("access") not in allowed_access: + raise HTTPException(status_code=404, detail="collection_not_found") + + result["resources"] = await self.get_collection_crawl_resources(coll_id) + + thumbnail = result.get("thumbnail") + if thumbnail: + image_file = ImageFile(**thumbnail) + result["thumbnail"] = await image_file.get_public_image_file_out( + org, self.storage_ops + ) + + return PublicCollOut.from_dict(result) + async def list_collections( self, - oid: UUID, + org: Organization, + public_colls_out: bool = False, page_size: int = DEFAULT_PAGE_SIZE, page: int = 1, sort_by: Optional[str] = None, sort_direction: int = 1, name: Optional[str] = None, name_prefix: Optional[str] = None, + access: Optional[str] = None, ): """List all collections for org""" - # pylint: disable=too-many-locals, duplicate-code + # pylint: disable=too-many-locals, duplicate-code, too-many-branches # Zero-index page for query page = page - 1 skip = page * page_size - match_query: dict[str, object] = {"oid": oid} + match_query: dict[str, object] = {"oid": org.id} if name: match_query["name"] = name - elif name_prefix: regex_pattern = f"^{name_prefix}" match_query["name"] = {"$regex": regex_pattern, "$options": "i"} + if public_colls_out: + match_query["access"] = CollAccessType.PUBLIC + elif access: + match_query["access"] = access + aggregate = [{"$match": match_query}] if sort_by: @@ -262,15 +352,35 @@ async def list_collections( except (IndexError, ValueError): total = 0 - collections = [CollOut.from_dict(res) for res in items] + collections: List[Union[CollOut, PublicCollOut]] = [] + + for res in items: + res["resources"] = await self.get_collection_crawl_resources(res["_id"]) + + thumbnail = res.get("thumbnail") + if thumbnail: + image_file = ImageFile(**thumbnail) + + if public_colls_out: + res["thumbnail"] = await image_file.get_public_image_file_out( + org, self.storage_ops + ) + else: + res["thumbnail"] = await image_file.get_image_file_out( + org, self.storage_ops + ) + + if public_colls_out: + collections.append(PublicCollOut.from_dict(res)) + else: + collections.append(CollOut.from_dict(res)) return collections, total - async def get_collection_crawl_resources(self, 
coll_id: UUID, org: Organization): + async def get_collection_crawl_resources(self, coll_id: UUID): """Return pre-signed resources for all collection crawl files.""" - coll = await self.get_collection(coll_id, org) - if not coll: - raise HTTPException(status_code=404, detail="collection_not_found") + # Ensure collection exists + _ = await self.get_collection_raw(coll_id) all_files = [] @@ -305,6 +415,17 @@ async def get_collection_search_values(self, org: Organization): names = [name for name in names if name] return {"names": names} + async def get_collection_crawl_ids(self, coll_id: UUID) -> List[str]: + """Return list of crawl ids in collection""" + crawl_ids = [] + async for crawl_raw in self.crawls.find( + {"collectionIds": coll_id}, projection=["_id"] + ): + crawl_id = crawl_raw.get("_id") + if crawl_id: + crawl_ids.append(crawl_id) + return crawl_ids + async def delete_collection(self, coll_id: UUID, org: Organization): """Delete collection and remove from associated crawls.""" await self.crawl_ops.remove_collection_from_all_crawls(coll_id) @@ -321,7 +442,7 @@ async def delete_collection(self, coll_id: UUID, org: Organization): async def download_collection(self, coll_id: UUID, org: Organization): """Download all WACZs in collection as streaming nested WACZ""" - coll = await self.get_collection(coll_id, org, resources=True) + coll = await self.get_collection_out(coll_id, org, resources=True) metadata = { "type": "collection", @@ -339,6 +460,15 @@ async def download_collection(self, coll_id: UUID, org: Organization): resp, headers=headers, media_type="application/wacz+zip" ) + async def recalculate_org_collection_counts_tags(self, org: Organization): + """Recalculate counts and tags for collections in org""" + collections, _ = await self.list_collections( + org, + page_size=100_000, + ) + for coll in collections: + await self.update_collection_counts_and_tags(coll.id) + async def update_collection_counts_and_tags(self, collection_id: UUID): """Set current crawl info in config when crawl begins""" crawl_count = 0 @@ -346,6 +476,9 @@ async def update_collection_counts_and_tags(self, collection_id: UUID): total_size = 0 tags = [] + coll = await self.get_collection(collection_id) + org = await self.orgs.get_org_by_id(coll.oid) + async for crawl_raw in self.crawls.find({"collectionIds": collection_id}): crawl = BaseCrawl.from_dict(crawl_raw) if crawl.state not in SUCCESSFUL_STATES: @@ -354,8 +487,16 @@ async def update_collection_counts_and_tags(self, collection_id: UUID): files = crawl.files or [] for file in files: total_size += file.size - if crawl.stats: - page_count += crawl.stats.done + + try: + _, crawl_pages = await self.page_ops.list_pages( + crawl.id, org, page_size=1_000_000 + ) + page_count += crawl_pages + # pylint: disable=broad-exception-caught + except Exception: + pass + if crawl.tags: tags.extend(crawl.tags) @@ -373,6 +514,55 @@ async def update_collection_counts_and_tags(self, collection_id: UUID): }, ) + async def recalculate_org_collection_dates(self, org: Organization): + """Recalculate earliest and latest dates for collections in org""" + collections, _ = await self.list_collections( + org, + page_size=100_000, + ) + for coll in collections: + await self.update_collection_dates(coll.id) + + async def update_collection_dates(self, coll_id: UUID): + """Update collection earliest and latest dates from page timestamps""" + coll = await self.get_collection(coll_id) + crawl_ids = await self.get_collection_crawl_ids(coll_id) + + earliest_ts = None + latest_ts = None + + 
match_query = { + "oid": coll.oid, + "crawl_id": {"$in": crawl_ids}, + "ts": {"$ne": None}, + } + + cursor = self.pages.find(match_query).sort("ts", 1).limit(1) + pages = await cursor.to_list(length=1) + try: + earliest_page = pages[0] + earliest_ts = earliest_page.get("ts") + except IndexError: + pass + + cursor = self.pages.find(match_query).sort("ts", -1).limit(1) + pages = await cursor.to_list(length=1) + try: + latest_page = pages[0] + latest_ts = latest_page.get("ts") + except IndexError: + pass + + await self.collections.find_one_and_update( + {"_id": coll_id}, + { + "$set": { + "dateEarliest": earliest_ts, + "dateLatest": latest_ts, + } + }, + ) + async def update_crawl_collections(self, crawl_id: str): """Update counts and tags for all collections in crawl""" crawl = await self.crawls.find_one({"_id": crawl_id}) @@ -391,10 +581,231 @@ async def add_successful_crawl_to_collections(self, crawl_id: str, cid: UUID): ) await self.update_crawl_collections(crawl_id) + async def get_org_public_collections( + self, + org_slug: str, + page_size: int = DEFAULT_PAGE_SIZE, + page: int = 1, + sort_by: Optional[str] = None, + sort_direction: int = 1, + ): + """List public collections for org""" + try: + org = await self.orgs.get_org_by_slug(org_slug) + # pylint: disable=broad-exception-caught + except Exception: + # pylint: disable=raise-missing-from + raise HTTPException(status_code=404, detail="public_profile_not_found") + + if not org.enablePublicProfile: + raise HTTPException(status_code=404, detail="public_profile_not_found") + + collections, _ = await self.list_collections( + org, + page_size=page_size, + page=page, + sort_by=sort_by, + sort_direction=sort_direction, + public_colls_out=True, + ) + + public_org_details = PublicOrgDetails( + name=org.name, + description=org.publicDescription or "", + url=org.publicUrl or "", + ) + + return OrgPublicCollections(org=public_org_details, collections=collections) + + async def list_urls_in_collection( + self, + coll_id: UUID, + oid: UUID, + url_prefix: Optional[str] = None, + page_size: int = DEFAULT_PAGE_SIZE, + page: int = 1, + ) -> Tuple[List[PageUrlCount], int]: + """List all URLs in collection sorted desc by snapshot count""" + # pylint: disable=duplicate-code, too-many-locals, too-many-branches, too-many-statements + # Zero-index page for query + page = page - 1 + skip = page_size * page + + crawl_ids = await self.get_collection_crawl_ids(coll_id) + + match_query: dict[str, object] = {"oid": oid, "crawl_id": {"$in": crawl_ids}} + + if url_prefix: + url_prefix = urllib.parse.unquote(url_prefix) + regex_pattern = f"^{re.escape(url_prefix)}" + match_query["url"] = {"$regex": regex_pattern, "$options": "i"} + + aggregate = [{"$match": match_query}] + + aggregate.extend( + [ + { + "$group": { + "_id": "$url", + "pages": {"$push": "$$ROOT"}, + "count": {"$sum": 1}, + }, + }, + {"$sort": {"count": -1}}, + {"$set": {"url": "$_id"}}, + { + "$facet": { + "items": [ + {"$skip": skip}, + {"$limit": page_size}, + ], + "total": [{"$count": "count"}], + } + }, + ] + ) + + # Get total + cursor = self.pages.aggregate(aggregate) + results = await cursor.to_list(length=1) + result = results[0] + items = result["items"] + + try: + total = int(result["total"][0]["count"]) + except (IndexError, ValueError): + total = 0 + + return [ + PageUrlCount( + url=data.get("url", ""), + count=data.get("count", 0), + snapshots=[ + PageIdTimestamp( + pageId=p["_id"], ts=p.get("ts"), status=p.get("status", 200) + ) + for p in data.get("pages", []) + ], + ) + for data in items 
+ ], total + + async def set_home_url( + self, coll_id: UUID, update: UpdateCollHomeUrl, org: Organization + ) -> Dict[str, bool]: + """Set home URL for collection and save thumbnail to database""" + if update.pageId: + page = await self.page_ops.get_page(update.pageId, org.id) + update_query = { + "homeUrl": page.url, + "homeUrlTs": page.ts, + "homeUrlPageId": page.id, + } + else: + update_query = { + "homeUrl": None, + "homeUrlTs": None, + "homeUrlPageId": None, + } + + await self.collections.find_one_and_update( + {"_id": coll_id, "oid": org.id}, + {"$set": update_query}, + ) + + return {"updated": True} + + async def upload_thumbnail_stream( + self, stream, filename: str, coll_id: UUID, org: Organization, user: User + ) -> Dict[str, bool]: + """Upload file as stream to use as collection thumbnail""" + coll = await self.get_collection(coll_id) + + _, extension = os.path.splitext(filename) + + image_filename = f"thumbnail-{str(coll_id)}{extension}" + + prefix = org.storage.get_storage_extra_path(str(org.id)) + "images/" + + file_prep = ImageFilePreparer( + prefix, + image_filename, + original_filename=filename, + user=user, + created=dt_now(), + ) + + async def stream_iter(): + """iterate over each chunk and compute and digest + total size""" + async for chunk in stream: + file_prep.add_chunk(chunk) + yield chunk + + print("Collection thumbnail stream upload starting", flush=True) + + if not await self.storage_ops.do_upload_multipart( + org, + file_prep.upload_name, + stream_iter(), + MIN_UPLOAD_PART_SIZE, + ): + print("Collection thumbnail stream upload failed", flush=True) + raise HTTPException(status_code=400, detail="upload_failed") + + print("Collection thumbnail stream upload complete", flush=True) + + thumbnail_file = file_prep.get_image_file(org.storage) + + if thumbnail_file.size > THUMBNAIL_MAX_SIZE: + print( + "Collection thumbnail stream upload failed: max size (2 MB) exceeded", + flush=True, + ) + await self.storage_ops.delete_file_object(org, thumbnail_file) + raise HTTPException( + status_code=400, + detail="max_thumbnail_size_2_mb_exceeded", + ) + + if coll.thumbnail: + if not await self.storage_ops.delete_file_object(org, coll.thumbnail): + print( + f"Unable to delete previous collection thumbnail: {coll.thumbnail.filename}" + ) + + coll.thumbnail = thumbnail_file + + # Update entire document to avoid bson.errors.InvalidDocument exception + await self.collections.find_one_and_update( + {"_id": coll_id, "oid": org.id}, + {"$set": coll.to_dict()}, + ) + + return {"added": True} + + async def delete_thumbnail(self, coll_id: UUID, org: Organization): + """Delete collection thumbnail""" + coll = await self.get_collection(coll_id) + + if not coll.thumbnail: + raise HTTPException(status_code=404, detail="thumbnail_not_found") + + if not await self.storage_ops.delete_file_object(org, coll.thumbnail): + print(f"Unable to delete collection thumbnail: {coll.thumbnail.filename}") + raise HTTPException(status_code=400, detail="file_deletion_error") + + # Delete from database + await self.collections.find_one_and_update( + {"_id": coll_id, "oid": org.id}, + {"$set": {"thumbnail": None}}, + ) + + return {"deleted": True} + # ============================================================================ # pylint: disable=too-many-locals -def init_collections_api(app, mdb, orgs, storage_ops, event_webhook_ops): +def init_collections_api(app, mdb, orgs, storage_ops, event_webhook_ops, user_dep): """init collections api""" # pylint: disable=invalid-name, unused-argument, too-many-arguments 
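For reference, the new collection thumbnail endpoints registered above can be exercised roughly as follows. This is a minimal sketch rather than part of the diff: the base URL, org/collection IDs, and bearer token are placeholders, and authentication follows the existing Browsertrix API conventions.

import requests  # assumed HTTP client for this sketch

API = "https://app.example.com/api"            # placeholder deployment URL
HEADERS = {"Authorization": "Bearer <token>"}  # placeholder crawler-role JWT
oid = "<org-id>"
coll_id = "<collection-id>"

# Stream a thumbnail; the body must stay under THUMBNAIL_MAX_SIZE (2 MB)
with open("thumbnail.jpg", "rb") as fh:
    resp = requests.put(
        f"{API}/orgs/{oid}/collections/{coll_id}/thumbnail",
        params={"filename": "thumbnail.jpg"},
        data=fh,  # request body is streamed and stored via do_upload_multipart
        headers=HEADERS,
    )
    print(resp.json())  # -> {'added': True} on success

# Remove the thumbnail again
resp = requests.delete(
    f"{API}/orgs/{oid}/collections/{coll_id}/thumbnail", headers=HEADERS
)
print(resp.json())  # -> {'deleted': True} on success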
@@ -427,15 +838,17 @@ async def list_collection_all( sortDirection: int = 1, name: Optional[str] = None, namePrefix: Optional[str] = None, + access: Optional[str] = None, ): collections, total = await colls.list_collections( - org.id, + org, page_size=pageSize, page=page, sort_by=sortBy, sort_direction=sortDirection, name=name, name_prefix=namePrefix, + access=access, ) return paginated_format(collections, total, page, pageSize) @@ -447,10 +860,10 @@ async def list_collection_all( async def get_collection_all(org: Organization = Depends(org_viewer_dep)): results = {} try: - all_collections, _ = await colls.list_collections(org.id, page_size=10_000) + all_collections, _ = await colls.list_collections(org, page_size=10_000) for collection in all_collections: results[collection.name] = await colls.get_collection_crawl_resources( - collection.id, org + collection.id ) except Exception as exc: # pylint: disable=raise-missing-from @@ -478,7 +891,7 @@ async def get_collection_search_values( async def get_collection( coll_id: UUID, org: Organization = Depends(org_viewer_dep) ): - return await colls.get_collection(coll_id, org) + return await colls.get_collection_out(coll_id, org) @app.get( "/orgs/{oid}/collections/{coll_id}/replay.json", @@ -488,7 +901,7 @@ async def get_collection( async def get_collection_replay( coll_id: UUID, org: Organization = Depends(org_viewer_dep) ): - return await colls.get_collection(coll_id, org, resources=True) + return await colls.get_collection_out(coll_id, org, resources=True) @app.get( "/orgs/{oid}/collections/{coll_id}/public/replay.json", @@ -500,8 +913,8 @@ async def get_collection_public_replay( coll_id: UUID, org: Organization = Depends(org_public), ): - coll = await colls.get_collection( - coll_id, org, resources=True, public_only=True + coll = await colls.get_collection_out( + coll_id, org, resources=True, public_or_unlisted_only=True ) response.headers["Access-Control-Allow-Origin"] = "*" response.headers["Access-Control-Allow-Headers"] = "*" @@ -576,4 +989,127 @@ async def download_collection( ): return await colls.download_collection(coll_id, org) + @app.get( + "/public/orgs/{org_slug}/collections", + tags=["collections", "public"], + response_model=OrgPublicCollections, + ) + async def get_org_public_collections( + org_slug: str, + pageSize: int = DEFAULT_PAGE_SIZE, + page: int = 1, + sortBy: Optional[str] = None, + sortDirection: int = 1, + ): + return await colls.get_org_public_collections( + org_slug, + page_size=pageSize, + page=page, + sort_by=sortBy, + sort_direction=sortDirection, + ) + + @app.get( + "/public/orgs/{org_slug}/collections/{coll_id}", + tags=["collections", "public"], + response_model=PublicCollOut, + ) + async def get_public_collection( + org_slug: str, + coll_id: UUID, + ): + try: + org = await colls.orgs.get_org_by_slug(org_slug) + # pylint: disable=broad-exception-caught + except Exception: + # pylint: disable=raise-missing-from + raise HTTPException(status_code=404, detail="collection_not_found") + + return await colls.get_public_collection_out(coll_id, org, allow_unlisted=True) + + @app.get( + "/public/orgs/{org_slug}/collections/{coll_id}/download", + tags=["collections", "public"], + response_model=bytes, + ) + async def download_public_collection( + org_slug: str, + coll_id: UUID, + ): + try: + org = await colls.orgs.get_org_by_slug(org_slug) + # pylint: disable=broad-exception-caught + except Exception: + # pylint: disable=raise-missing-from + raise HTTPException(status_code=404, detail="collection_not_found") + + # Make 
sure collection exists and is public/unlisted
+        coll = await colls.get_collection(coll_id, public_or_unlisted_only=True)
+
+        if coll.allowPublicDownload is False:
+            raise HTTPException(status_code=403, detail="not_allowed")
+
+        return await colls.download_collection(coll_id, org)
+
+    @app.get(
+        "/orgs/{oid}/collections/{coll_id}/urls",
+        tags=["collections"],
+        response_model=PaginatedPageUrlCountResponse,
+    )
+    async def get_collection_url_list(
+        coll_id: UUID,
+        oid: UUID,
+        urlPrefix: Optional[str] = None,
+        pageSize: int = DEFAULT_PAGE_SIZE,
+        page: int = 1,
+    ):
+        """Retrieve paginated list of urls in collection sorted by snapshot count"""
+        pages, total = await colls.list_urls_in_collection(
+            coll_id=coll_id,
+            oid=oid,
+            url_prefix=urlPrefix,
+            page_size=pageSize,
+            page=page,
+        )
+        return paginated_format(pages, total, page, pageSize)
+
+    @app.post(
+        "/orgs/{oid}/collections/{coll_id}/home-url",
+        tags=["collections"],
+        response_model=UpdatedResponse,
+    )
+    async def set_collection_home_url(
+        update: UpdateCollHomeUrl,
+        coll_id: UUID,
+        org: Organization = Depends(org_crawl_dep),
+    ):
+        return await colls.set_home_url(coll_id, update, org)
+
+    @app.put(
+        "/orgs/{oid}/collections/{coll_id}/thumbnail",
+        tags=["collections"],
+        response_model=AddedResponse,
+    )
+    async def upload_thumbnail_stream(
+        request: Request,
+        filename: str,
+        coll_id: UUID,
+        org: Organization = Depends(org_crawl_dep),
+        user: User = Depends(user_dep),
+    ):
+        return await colls.upload_thumbnail_stream(
+            request.stream(), filename, coll_id, org, user
+        )
+
+    @app.delete(
+        "/orgs/{oid}/collections/{coll_id}/thumbnail",
+        tags=["collections"],
+        response_model=DeletedResponse,
+    )
+    async def delete_thumbnail_stream(
+        coll_id: UUID,
+        org: Organization = Depends(org_crawl_dep),
+    ):
+        return await colls.delete_thumbnail(coll_id, org)
+
     return colls
diff --git a/backend/btrixcloud/crawlmanager.py b/backend/btrixcloud/crawlmanager.py
index d91e6de76d..8047d663eb 100644
--- a/backend/btrixcloud/crawlmanager.py
+++ b/backend/btrixcloud/crawlmanager.py
@@ -118,8 +118,6 @@ async def run_replica_job(
     async def run_delete_org_job(
         self,
         oid: str,
-        backend_image: str,
-        pull_policy: str,
         existing_job_id: Optional[str] = None,
     ) -> str:
         """run job to delete org and all of its data"""
@@ -130,14 +128,12 @@ async def run_delete_org_job(
             job_id = f"delete-org-{oid}-{secrets.token_hex(5)}"

         return await self._run_bg_job_with_ops_classes(
-            oid, backend_image, pull_policy, job_id, job_type=BgJobType.DELETE_ORG.value
+            oid, job_id, job_type=BgJobType.DELETE_ORG.value
         )

     async def run_recalculate_org_stats_job(
         self,
         oid: str,
-        backend_image: str,
-        pull_policy: str,
         existing_job_id: Optional[str] = None,
     ) -> str:
         """run job to recalculate storage stats for the org"""
@@ -149,19 +145,32 @@ async def run_recalculate_org_stats_job(
         return await self._run_bg_job_with_ops_classes(
             oid,
-            backend_image,
-            pull_policy,
             job_id,
             job_type=BgJobType.RECALCULATE_ORG_STATS.value,
         )

-    async def _run_bg_job_with_ops_classes(
+    async def run_re_add_org_pages_job(
         self,
         oid: str,
-        backend_image: str,
-        pull_policy: str,
-        job_id: str,
-        job_type: str,
+        crawl_type: Optional[str] = None,
+        existing_job_id: Optional[str] = None,
+    ) -> str:
+        """run job to re-add all pages in an org, optionally filtered by crawl type"""
+
+        if existing_job_id:
+            job_id = existing_job_id
+        else:
+            job_id = f"org-pages-{oid}-{secrets.token_hex(5)}"
+
+        return await self._run_bg_job_with_ops_classes(
+            oid,
+            job_id,
+            job_type=BgJobType.READD_ORG_PAGES.value,
+
crawl_type=crawl_type, + ) + + async def _run_bg_job_with_ops_classes( + self, oid: str, job_id: str, job_type: str, **kwargs ) -> str: """run background job with access to ops classes""" @@ -169,8 +178,9 @@ async def _run_bg_job_with_ops_classes( "id": job_id, "oid": oid, "job_type": job_type, - "backend_image": backend_image, - "pull_policy": pull_policy, + "backend_image": os.environ.get("BACKEND_IMAGE", ""), + "pull_policy": os.environ.get("BACKEND_IMAGE_PULL_POLICY", ""), + **kwargs, } data = self.templates.env.get_template("background_job.yaml").render(params) diff --git a/backend/btrixcloud/crawls.py b/backend/btrixcloud/crawls.py index 5a0994fe70..539c408ee6 100644 --- a/backend/btrixcloud/crawls.py +++ b/backend/btrixcloud/crawls.py @@ -918,7 +918,7 @@ async def delete_crawl_qa_run_files( """delete crawl qa wacz files""" qa_run = await self.get_qa_run(crawl_id, qa_run_id, org) for file_ in qa_run.files: - if not await self.storage_ops.delete_crawl_file_object(org, file_): + if not await self.storage_ops.delete_file_object(org, file_): raise HTTPException(status_code=400, detail="file_deletion_error") # Not replicating QA run WACZs yet # await self.background_job_ops.create_delete_replica_jobs( diff --git a/backend/btrixcloud/db.py b/backend/btrixcloud/db.py index 91b2bc7057..0723258e40 100644 --- a/backend/btrixcloud/db.py +++ b/backend/btrixcloud/db.py @@ -17,7 +17,7 @@ from .migrations import BaseMigration -CURR_DB_VERSION = "0035" +CURR_DB_VERSION = "0037" # ============================================================================ @@ -82,6 +82,7 @@ async def update_and_prepare_db( invite_ops, storage_ops, page_ops, + background_job_ops, db_inited, ): """Prepare database for application. @@ -94,7 +95,7 @@ async def update_and_prepare_db( """ await ping_db(mdb) print("Database setup started", flush=True) - if await run_db_migrations(mdb, user_manager, page_ops): + if await run_db_migrations(mdb, user_manager, background_job_ops, page_ops): await drop_indexes(mdb) await create_indexes( org_ops, @@ -113,7 +114,7 @@ async def update_and_prepare_db( # ============================================================================ -async def run_db_migrations(mdb, user_manager, page_ops): +async def run_db_migrations(mdb, user_manager, background_job_ops, page_ops): """Run database migrations.""" # if first run, just set version and exit @@ -145,7 +146,9 @@ async def run_db_migrations(mdb, user_manager, page_ops): assert spec.loader migration_module = importlib.util.module_from_spec(spec) spec.loader.exec_module(migration_module) - migration = migration_module.Migration(mdb, page_ops=page_ops) + migration = migration_module.Migration( + mdb, background_job_ops=background_job_ops, page_ops=page_ops + ) if await migration.run(): migrations_run = True except ImportError as err: diff --git a/backend/btrixcloud/main.py b/backend/btrixcloud/main.py index f6b678cb82..927a03dcb8 100644 --- a/backend/btrixcloud/main.py +++ b/backend/btrixcloud/main.py @@ -223,7 +223,9 @@ def main() -> None: profiles, ) - coll_ops = init_collections_api(app, mdb, org_ops, storage_ops, event_webhook_ops) + coll_ops = init_collections_api( + app, mdb, org_ops, storage_ops, event_webhook_ops, current_active_user + ) base_crawl_init = ( app, @@ -243,14 +245,15 @@ def main() -> None: crawls = init_crawls_api(crawl_manager, *base_crawl_init) + upload_ops = init_uploads_api(*base_crawl_init) + page_ops = init_pages_api( - app, mdb, crawls, org_ops, storage_ops, current_active_user + app, mdb, crawls, org_ops, 
storage_ops, background_job_ops, current_active_user ) base_crawl_ops.set_page_ops(page_ops) crawls.set_page_ops(page_ops) - - init_uploads_api(*base_crawl_init) + upload_ops.set_page_ops(page_ops) org_ops.set_ops(base_crawl_ops, profiles, coll_ops, background_job_ops) @@ -260,6 +263,8 @@ def main() -> None: crawl_config_ops.set_coll_ops(coll_ops) + coll_ops.set_page_ops(page_ops) + # run only in first worker if run_once_lock("btrix-init-db"): asyncio.create_task( @@ -273,6 +278,7 @@ def main() -> None: invites, storage_ops, page_ops, + background_job_ops, db_inited, ) ) diff --git a/backend/btrixcloud/main_bg.py b/backend/btrixcloud/main_bg.py index 2fba05e53f..709139d8d2 100644 --- a/backend/btrixcloud/main_bg.py +++ b/backend/btrixcloud/main_bg.py @@ -12,6 +12,7 @@ job_type = os.environ.get("BG_JOB_TYPE") oid = os.environ.get("OID") +crawl_type = os.environ.get("CRAWL_TYPE") # ============================================================================ @@ -27,7 +28,7 @@ async def main(): ) return 1 - (org_ops, _, _, _, _, _, _, _, _, _, user_manager) = init_ops() + (org_ops, _, _, _, _, page_ops, coll_ops, _, _, _, _, user_manager) = init_ops() if not oid: print("Org id missing, quitting") @@ -57,6 +58,17 @@ async def main(): traceback.print_exc() return 1 + if job_type == BgJobType.READD_ORG_PAGES: + try: + await page_ops.re_add_all_crawl_pages(org, crawl_type=crawl_type) + await coll_ops.recalculate_org_collection_dates(org) + await coll_ops.recalculate_org_collection_counts_tags(org) + return 0 + # pylint: disable=broad-exception-caught + except Exception: + traceback.print_exc() + return 1 + print(f"Provided job type {job_type} not currently supported") return 1 diff --git a/backend/btrixcloud/main_op.py b/backend/btrixcloud/main_op.py index a6f6654be3..af7a2d0956 100644 --- a/backend/btrixcloud/main_op.py +++ b/backend/btrixcloud/main_op.py @@ -31,6 +31,7 @@ def main(): crawl_config_ops, _, crawl_ops, + _, page_ops, coll_ops, _, diff --git a/backend/btrixcloud/migrations/migration_0036_coll_visibility.py b/backend/btrixcloud/migrations/migration_0036_coll_visibility.py new file mode 100644 index 0000000000..48b97ce314 --- /dev/null +++ b/backend/btrixcloud/migrations/migration_0036_coll_visibility.py @@ -0,0 +1,49 @@ +""" +Migration 0036 -- collection access +""" + +from btrixcloud.migrations import BaseMigration + + +MIGRATION_VERSION = "0036" + + +class Migration(BaseMigration): + """Migration class.""" + + # pylint: disable=unused-argument + def __init__(self, mdb, **kwargs): + super().__init__(mdb, migration_version=MIGRATION_VERSION) + + async def migrate_up(self): + """Perform migration up. 
+
+        Move from Collection.isPublic bool to Collection.access enum
+        """
+        colls_mdb = self.mdb["collections"]
+
+        # Set non-public collections to private
+        try:
+            await colls_mdb.update_many(
+                {"isPublic": False},
+                {"$set": {"access": "private"}, "$unset": {"isPublic": 1}},
+            )
+        # pylint: disable=broad-exception-caught
+        except Exception as err:
+            print(
+                f"Error migrating private collections: {err}",
+                flush=True,
+            )
+
+        # Set public collections to unlisted
+        try:
+            await colls_mdb.update_many(
+                {"isPublic": True},
+                {"$set": {"access": "unlisted"}, "$unset": {"isPublic": 1}},
+            )
+        # pylint: disable=broad-exception-caught
+        except Exception as err:
+            print(
+                f"Error migrating public unlisted collections: {err}",
+                flush=True,
+            )
diff --git a/backend/btrixcloud/migrations/migration_0037_upload_pages.py b/backend/btrixcloud/migrations/migration_0037_upload_pages.py
new file mode 100644
index 0000000000..62bfe98237
--- /dev/null
+++ b/backend/btrixcloud/migrations/migration_0037_upload_pages.py
@@ -0,0 +1,72 @@
+"""
+Migration 0037 -- upload pages
+"""
+
+from uuid import UUID
+
+from btrixcloud.migrations import BaseMigration
+
+
+MIGRATION_VERSION = "0037"
+
+
+class Migration(BaseMigration):
+    """Migration class."""
+
+    # pylint: disable=unused-argument
+    def __init__(self, mdb, **kwargs):
+        super().__init__(mdb, migration_version=MIGRATION_VERSION)
+
+        self.background_job_ops = kwargs.get("background_job_ops")
+        self.page_ops = kwargs.get("page_ops")
+
+    async def org_upload_pages_already_added(self, oid: UUID) -> bool:
+        """Check if upload pages have already been added for this org"""
+        if self.page_ops is None:
+            print(
+                f"page_ops missing, assuming pages need to be added for org {oid}",
+                flush=True,
+            )
+            return False
+
+        mdb_crawls = self.mdb["crawls"]
+        async for upload in mdb_crawls.find({"oid": oid, "type": "upload"}):
+            upload_id = upload["_id"]
+            _, total = await self.page_ops.list_pages(upload_id)
+            if total > 0:
+                return True
+        return False
+
+    async def migrate_up(self):
+        """Perform migration up.
+
+        Start background jobs to parse uploads and add their pages to db
+        """
+        if self.background_job_ops is None:
+            print(
+                "Unable to start background job, missing background_job_ops", flush=True
+            )
+            return
+
+        mdb_orgs = self.mdb["organizations"]
+        async for org in mdb_orgs.find():
+            oid = org["_id"]
+
+            pages_already_added = await self.org_upload_pages_already_added(oid)
+
+            if pages_already_added:
+                print(
+                    f"Skipping org {oid}, upload pages already added to db", flush=True
+                )
+                continue
+
+            try:
+                await self.background_job_ops.create_re_add_org_pages_job(
+                    oid, crawl_type="upload"
+                )
+            # pylint: disable=broad-exception-caught
+            except Exception as err:
+                print(
+                    f"Error starting background job to add upload pages to org {oid}: {err}",
+                    flush=True,
+                )
diff --git a/backend/btrixcloud/models.py b/backend/btrixcloud/models.py
index 2209c5f79e..807628fdc3 100644
--- a/backend/btrixcloud/models.py
+++ b/backend/btrixcloud/models.py
@@ -5,6 +5,9 @@
 from datetime import datetime
 from enum import Enum, IntEnum
 from uuid import UUID
+import base64
+import hashlib
+import mimetypes
 import os
 from typing import Optional, List, Dict, Union, Literal, Any, get_args
@@ -21,6 +24,7 @@
     BeforeValidator,
     TypeAdapter,
 )
+from pathvalidate import sanitize_filename

 # from fastapi_users import models as fastapi_users_models

@@ -29,6 +33,20 @@
 # crawl scale for constraint
 MAX_CRAWL_SCALE = int(os.environ.get("MAX_CRAWL_SCALE", 3))

+# Presign duration must be less than 604800 seconds (one week),
+# so set this one minute short of a week
+PRESIGN_MINUTES_MAX = 10079
+PRESIGN_MINUTES_DEFAULT = PRESIGN_MINUTES_MAX
+
+# Expire duration seconds for presigned urls
+PRESIGN_DURATION_MINUTES = int(
+    os.environ.get("PRESIGN_DURATION_MINUTES") or PRESIGN_MINUTES_DEFAULT
+)
+PRESIGN_DURATION_SECONDS = min(PRESIGN_DURATION_MINUTES, PRESIGN_MINUTES_MAX) * 60
+
+# Minimum part size for file uploads
+MIN_UPLOAD_PART_SIZE = 10000000
+

 # annotated types
 # ============================================================================
@@ -779,6 +797,9 @@ class BaseCrawl(CoreCrawlable, BaseMongoModel):

     reviewStatus: ReviewStatus = None

+    filePageCount: Optional[int] = 0
+    errorPageCount: Optional[int] = 0
+

 # ============================================================================
 class CollIdName(BaseModel):
@@ -995,9 +1016,6 @@ class Crawl(BaseCrawl, CrawlConfigCore):
     qa: Optional[QARun] = None
     qaFinished: Optional[Dict[str, QARun]] = {}

-    filePageCount: Optional[int] = 0
-    errorPageCount: Optional[int] = 0
-

 # ============================================================================
 class CrawlCompleteIn(BaseModel):
@@ -1050,11 +1068,169 @@ class UpdateUpload(UpdateCrawl):
     """Update modal that also includes name"""

+# ============================================================================
+class FilePreparer:
+    """wrapper to compute digest / name for streaming upload"""
+
+    def __init__(self, prefix, filename):
+        self.upload_size = 0
+        self.upload_hasher = hashlib.sha256()
+        self.upload_name = prefix + self.prepare_filename(filename)
+
+    def add_chunk(self, chunk):
+        """add chunk for file"""
+        self.upload_size += len(chunk)
+        self.upload_hasher.update(chunk)
+
+    def get_crawl_file(self, storage: StorageRef):
+        """get crawl file"""
+        return CrawlFile(
+            filename=self.upload_name,
+            hash=self.upload_hasher.hexdigest(),
+            size=self.upload_size,
+            storage=storage,
+        )
+
+    def prepare_filename(self, filename):
+        """prepare filename by sanitizing and adding extra string
+        to avoid duplicates"""
+        name =
sanitize_filename(filename.rsplit("/", 1)[-1]) + parts = name.split(".") + randstr = base64.b32encode(os.urandom(5)).lower() + parts[0] += "-" + randstr.decode("utf-8") + return ".".join(parts) + + +# ============================================================================ + +### USER-UPLOADED IMAGES ### + + +# ============================================================================ +class ImageFileOut(BaseModel): + """output for user-upload imaged file (conformance to Data Resource Spec)""" + + name: str + path: str + hash: str + size: int + + originalFilename: str + mime: str + userid: UUID + userName: str + created: datetime + + +# ============================================================================ +class PublicImageFileOut(BaseModel): + """public output for user-upload imaged file (conformance to Data Resource Spec)""" + + name: str + path: str + hash: str + size: int + + mime: str + + +# ============================================================================ +class ImageFile(BaseFile): + """User-uploaded image file""" + + originalFilename: str + mime: str + userid: UUID + userName: str + created: datetime + + async def get_image_file_out(self, org, storage_ops) -> ImageFileOut: + """Get ImageFileOut with new presigned url""" + presigned_url = await storage_ops.get_presigned_url( + org, self, PRESIGN_DURATION_SECONDS + ) + + return ImageFileOut( + name=self.filename, + path=presigned_url or "", + hash=self.hash, + size=self.size, + originalFilename=self.originalFilename, + mime=self.mime, + userid=self.userid, + userName=self.userName, + created=self.created, + ) + + async def get_public_image_file_out(self, org, storage_ops) -> PublicImageFileOut: + """Get PublicImageFileOut with new presigned url""" + presigned_url = await storage_ops.get_presigned_url( + org, self, PRESIGN_DURATION_SECONDS + ) + + return PublicImageFileOut( + name=self.filename, + path=presigned_url or "", + hash=self.hash, + size=self.size, + mime=self.mime, + ) + + +# ============================================================================ +class ImageFilePreparer(FilePreparer): + """Wrapper for user image streaming uploads""" + + # pylint: disable=too-many-arguments, too-many-function-args + + def __init__( + self, + prefix, + filename, + original_filename: str, + user: User, + created: datetime, + ): + super().__init__(prefix, filename) + + self.original_filename = original_filename + self.mime, _ = mimetypes.guess_type(original_filename) or ("image/jpeg", None) + self.userid = user.id + self.user_name = user.name + self.created = created + + def get_image_file( + self, + storage: StorageRef, + ) -> ImageFile: + """get user-uploaded image file""" + return ImageFile( + filename=self.upload_name, + hash=self.upload_hasher.hexdigest(), + size=self.upload_size, + storage=storage, + originalFilename=self.original_filename, + mime=self.mime, + userid=self.userid, + userName=self.user_name, + created=self.created, + ) + + # ============================================================================ ### COLLECTIONS ### +# ============================================================================ +class CollAccessType(str, Enum): + """Collection access types""" + + PRIVATE = "private" + UNLISTED = "unlisted" + PUBLIC = "public" + + # ============================================================================ class Collection(BaseMongoModel): """Org collection structure""" @@ -1062,16 +1238,29 @@ class Collection(BaseMongoModel): name: str = Field(..., min_length=1) oid: UUID description: 
Optional[str] = None + caption: Optional[str] = None modified: Optional[datetime] = None crawlCount: Optional[int] = 0 pageCount: Optional[int] = 0 totalSize: Optional[int] = 0 + dateEarliest: Optional[datetime] = None + dateLatest: Optional[datetime] = None + # Sorted by count, descending tags: Optional[List[str]] = [] - isPublic: Optional[bool] = False + access: CollAccessType = CollAccessType.PRIVATE + + homeUrl: Optional[AnyHttpUrl] = None + homeUrlTs: Optional[datetime] = None + homeUrlPageId: Optional[UUID] = None + + thumbnail: Optional[ImageFile] = None + defaultThumbnailName: Optional[str] = None + + allowPublicDownload: Optional[bool] = True # ============================================================================ @@ -1080,16 +1269,74 @@ class CollIn(BaseModel): name: str = Field(..., min_length=1) description: Optional[str] = None + caption: Optional[str] = None crawlIds: Optional[List[str]] = [] - isPublic: bool = False + access: CollAccessType = CollAccessType.PRIVATE + + defaultThumbnailName: Optional[str] = None + allowPublicDownload: bool = True + + +# ============================================================================ +class CollOut(BaseMongoModel): + """Collection output model with annotations.""" + + name: str + oid: UUID + description: Optional[str] = None + caption: Optional[str] = None + modified: Optional[datetime] = None + + crawlCount: Optional[int] = 0 + pageCount: Optional[int] = 0 + totalSize: Optional[int] = 0 + + dateEarliest: Optional[datetime] = None + dateLatest: Optional[datetime] = None + + # Sorted by count, descending + tags: Optional[List[str]] = [] + + access: CollAccessType = CollAccessType.PRIVATE + + homeUrl: Optional[AnyHttpUrl] = None + homeUrlTs: Optional[datetime] = None + homeUrlPageId: Optional[UUID] = None + + resources: List[CrawlFileOut] = [] + thumbnail: Optional[ImageFileOut] = None + defaultThumbnailName: Optional[str] = None + + allowPublicDownload: bool = True # ============================================================================ -class CollOut(Collection): +class PublicCollOut(BaseMongoModel): """Collection output model with annotations.""" + name: str + oid: UUID + description: Optional[str] = None + caption: Optional[str] = None + + crawlCount: Optional[int] = 0 + pageCount: Optional[int] = 0 + totalSize: Optional[int] = 0 + + dateEarliest: Optional[datetime] = None + dateLatest: Optional[datetime] = None + + access: CollAccessType = CollAccessType.PUBLIC + + homeUrl: Optional[AnyHttpUrl] = None + homeUrlTs: Optional[datetime] = None + resources: List[CrawlFileOut] = [] + thumbnail: Optional[PublicImageFileOut] = None + defaultThumbnailName: Optional[str] = None + + allowPublicDownload: bool = True # ============================================================================ @@ -1098,7 +1345,17 @@ class UpdateColl(BaseModel): name: Optional[str] = None description: Optional[str] = None - isPublic: Optional[bool] = None + caption: Optional[str] = None + access: Optional[CollAccessType] = None + defaultThumbnailName: Optional[str] = None + allowPublicDownload: Optional[bool] = None + + +# ============================================================================ +class UpdateCollHomeUrl(BaseModel): + """Update home url for collection""" + + pageId: Optional[UUID] = None # ============================================================================ @@ -1143,6 +1400,24 @@ class RenameOrg(BaseModel): slug: Optional[str] = None +# ============================================================================ +class 
PublicOrgDetails(BaseModel): + """Model for org details that are available in public profile""" + + name: str + description: str = "" + url: str = "" + + +# ============================================================================ +class OrgPublicCollections(BaseModel): + """Model for listing public collections in org""" + + org: PublicOrgDetails + + collections: List[PublicCollOut] = [] + + # ============================================================================ class OrgStorageRefs(BaseModel): """Input model for setting primary storage + optional replicas""" @@ -1371,6 +1646,15 @@ class OrgReadOnlyUpdate(BaseModel): readOnlyReason: Optional[str] = None +# ============================================================================ +class OrgPublicProfileUpdate(BaseModel): + """Organization enablePublicProfile update""" + + enablePublicProfile: Optional[bool] = None + publicDescription: Optional[str] = None + publicUrl: Optional[str] = None + + # ============================================================================ class OrgWebhookUrls(BaseModel): """Organization webhook URLs""" @@ -1439,6 +1723,10 @@ class OrgOut(BaseMongoModel): allowedProxies: list[str] = [] crawlingDefaults: Optional[CrawlConfigDefaults] = None + enablePublicProfile: bool = False + publicDescription: str = "" + publicUrl: str = "" + # ============================================================================ class Organization(BaseMongoModel): @@ -1494,6 +1782,10 @@ class Organization(BaseMongoModel): allowedProxies: list[str] = [] crawlingDefaults: Optional[CrawlConfigDefaults] = None + enablePublicProfile: bool = False + publicDescription: Optional[str] = None + publicUrl: Optional[str] = None + def is_owner(self, user): """Check if user is owner""" return self._is_auth(user, UserRole.OWNER) @@ -2022,6 +2314,7 @@ class BgJobType(str, Enum): DELETE_REPLICA = "delete-replica" DELETE_ORG = "delete-org" RECALCULATE_ORG_STATS = "recalculate-org-stats" + READD_ORG_PAGES = "readd-org-pages" # ============================================================================ @@ -2075,6 +2368,14 @@ class RecalculateOrgStatsJob(BackgroundJob): type: Literal[BgJobType.RECALCULATE_ORG_STATS] = BgJobType.RECALCULATE_ORG_STATS +# ============================================================================ +class ReAddOrgPagesJob(BackgroundJob): + """Model for tracking jobs to readd an org's pages""" + + type: Literal[BgJobType.READD_ORG_PAGES] = BgJobType.READD_ORG_PAGES + crawl_type: Optional[str] = None + + # ============================================================================ # Union of all job types, for response model @@ -2085,6 +2386,7 @@ class RecalculateOrgStatsJob(BackgroundJob): BackgroundJob, DeleteOrgJob, RecalculateOrgStatsJob, + ReAddOrgPagesJob, ] ] @@ -2196,7 +2498,7 @@ class PageWithAllQA(Page): class PageOut(Page): """Model for pages output, no QA""" - status: Optional[int] = 200 + status: int = 200 # ============================================================================ @@ -2222,6 +2524,24 @@ class PageNoteUpdatedResponse(BaseModel): data: PageNote +# ============================================================================ +class PageIdTimestamp(BaseModel): + """Simplified model for page info to include in PageUrlCount""" + + pageId: UUID + ts: Optional[datetime] = None + status: int = 200 + + +# ============================================================================ +class PageUrlCount(BaseModel): + """Model for counting pages by URL""" + + url: AnyHttpUrl + count: int = 0 + 
snapshots: List[PageIdTimestamp] = []
+
+
 # ============================================================================
 ### GENERIC RESPONSE MODELS ###
@@ -2468,3 +2788,10 @@ class PaginatedUserEmailsResponse(PaginatedResponse):
     """Response model for user emails with org info"""

     items: List[UserEmailWithOrgInfo]
+
+
+# ============================================================================
+class PaginatedPageUrlCountResponse(PaginatedResponse):
+    """Response model for page count by url"""
+
+    items: List[PageUrlCount]
diff --git a/backend/btrixcloud/ops.py b/backend/btrixcloud/ops.py
index 23629de2aa..bee24d00c5 100644
--- a/backend/btrixcloud/ops.py
+++ b/backend/btrixcloud/ops.py
@@ -16,6 +16,7 @@
 from .pages import PageOps
 from .profiles import ProfileOps
 from .storages import StorageOps
+from .uploads import UploadOps
 from .users import UserManager
 from .webhooks import EventWebhookOps
@@ -26,6 +27,7 @@ def init_ops() -> Tuple[
     CrawlConfigOps,
     BaseCrawlOps,
     CrawlOps,
+    UploadOps,
     PageOps,
     CollectionOps,
     ProfileOps,
@@ -70,7 +72,7 @@ def init_ops() -> Tuple[
     coll_ops = CollectionOps(mdb, crawl_manager, org_ops, event_webhook_ops)

-    base_crawl_ops = BaseCrawlOps(
+    base_crawl_init = (
         mdb,
         user_manager,
         org_ops,
@@ -81,23 +83,17 @@ def init_ops() -> Tuple[
         background_job_ops,
     )

-    crawl_ops = CrawlOps(
-        crawl_manager,
-        mdb,
-        user_manager,
-        org_ops,
-        crawl_config_ops,
-        coll_ops,
-        storage_ops,
-        event_webhook_ops,
-        background_job_ops,
-    )
+    base_crawl_ops = BaseCrawlOps(*base_crawl_init)

-    page_ops = PageOps(mdb, crawl_ops, org_ops, storage_ops)
+    crawl_ops = CrawlOps(crawl_manager, *base_crawl_init)

-    base_crawl_ops.set_page_ops(page_ops)
+    upload_ops = UploadOps(*base_crawl_init)
+    page_ops = PageOps(mdb, crawl_ops, org_ops, storage_ops, background_job_ops)
+
+    base_crawl_ops.set_page_ops(page_ops)
     crawl_ops.set_page_ops(page_ops)
+    upload_ops.set_page_ops(page_ops)

     background_job_ops.set_ops(crawl_ops, profile_ops)

@@ -109,11 +105,14 @@ def init_ops() -> Tuple[
     crawl_config_ops.set_coll_ops(coll_ops)

+    coll_ops.set_page_ops(page_ops)
+
     return (
         org_ops,
         crawl_config_ops,
         base_crawl_ops,
         crawl_ops,
+        upload_ops,
         page_ops,
         coll_ops,
         profile_ops,
diff --git a/backend/btrixcloud/orgs.py b/backend/btrixcloud/orgs.py
index e5b1ae4751..8ba4d3508b 100644
--- a/backend/btrixcloud/orgs.py
+++ b/backend/btrixcloud/orgs.py
@@ -78,6 +78,7 @@
     RemovedResponse,
     OrgSlugsResponse,
     OrgImportResponse,
+    OrgPublicProfileUpdate,
 )
 from .pagination import DEFAULT_PAGE_SIZE, paginated_format
 from .utils import (
@@ -294,6 +295,14 @@ async def get_org_by_id(self, oid: UUID) -> Organization:

         return Organization.from_dict(res)

+    async def get_org_by_slug(self, slug: str) -> Organization:
+        """Get an org by slug"""
+        res = await self.orgs.find_one({"slug": slug})
+        if not res:
+            raise HTTPException(status_code=400, detail="invalid_org_slug")
+
+        return Organization.from_dict(res)
+
     async def get_default_org(self) -> Organization:
         """Get default organization"""
         res = await self.orgs.find_one({"default": True})
@@ -940,7 +949,7 @@ async def get_org_metrics(self, org: Organization) -> dict[str, int]:
         )
         collections_count = await self.colls_db.count_documents({"oid": org.id})
         public_collections_count = await self.colls_db.count_documents(
-            {"oid": org.id, "isPublic": True}
+            {"oid": org.id, "access": {"$in": ["public", "unlisted"]}}
         )

         return {
@@ -997,6 +1006,21 @@ async def update_read_only_on_cancel(
         )
         return res is not None

+    async def update_public_profile(
+        self, org: Organization, update:
OrgPublicProfileUpdate + ): + """Update or enable/disable organization's public profile""" + query = update.dict(exclude_unset=True) + + if len(query) == 0: + raise HTTPException(status_code=400, detail="no_update_data") + + res = await self.orgs.find_one_and_update( + {"_id": org.id}, + {"$set": query}, + ) + return res is not None + async def export_org( self, org: Organization, user_manager: UserManager ) -> StreamingResponse: @@ -1553,6 +1577,19 @@ async def update_read_only_on_cancel( return {"updated": True} + @router.post( + "/public-profile", + tags=["organizations", "collections"], + response_model=UpdatedResponse, + ) + async def update_public_profile( + update: OrgPublicProfileUpdate, + org: Organization = Depends(org_owner_dep), + ): + await ops.update_public_profile(org, update) + + return {"updated": True} + @router.post( "/event-webhook-urls", tags=["organizations"], response_model=UpdatedResponse ) diff --git a/backend/btrixcloud/pages.py b/backend/btrixcloud/pages.py index a980567c49..251d959be1 100644 --- a/backend/btrixcloud/pages.py +++ b/backend/btrixcloud/pages.py @@ -6,7 +6,7 @@ from typing import TYPE_CHECKING, Optional, Tuple, List, Dict, Any, Union from uuid import UUID, uuid4 -from fastapi import Depends, HTTPException +from fastapi import Depends, HTTPException, Request import pymongo from .models import ( @@ -24,6 +24,7 @@ PageNoteEdit, PageNoteDelete, QARunBucketStats, + StartedResponse, StartedResponseBool, UpdatedResponse, DeletedResponse, @@ -34,11 +35,12 @@ from .utils import str_to_date, str_list_to_bools, dt_now if TYPE_CHECKING: + from .background_jobs import BackgroundJobOps from .crawls import CrawlOps from .orgs import OrgOps from .storages import StorageOps else: - CrawlOps = StorageOps = OrgOps = object + CrawlOps = StorageOps = OrgOps = BackgroundJobOps = object # ============================================================================ @@ -49,18 +51,24 @@ class PageOps: crawl_ops: CrawlOps org_ops: OrgOps storage_ops: StorageOps + background_job_ops: BackgroundJobOps - def __init__(self, mdb, crawl_ops, org_ops, storage_ops): + def __init__(self, mdb, crawl_ops, org_ops, storage_ops, background_job_ops): self.pages = mdb["pages"] self.crawls = mdb["crawls"] self.crawl_ops = crawl_ops self.org_ops = org_ops self.storage_ops = storage_ops + self.background_job_ops = background_job_ops async def init_index(self): """init index for pages db collection""" await self.pages.create_index([("crawl_id", pymongo.HASHED)]) + async def set_ops(self, background_job_ops: BackgroundJobOps): + """Set ops classes as needed""" + self.background_job_ops = background_job_ops + async def add_crawl_pages_to_db_from_wacz(self, crawl_id: str, batch_size=100): """Add pages to database from WACZ files""" pages_buffer: List[Page] = [] @@ -94,9 +102,14 @@ def _get_page_from_dict( self, page_dict: Dict[str, Any], crawl_id: str, oid: UUID ) -> Page: """Return Page object from dict""" - page_id = page_dict.get("id") + page_id = page_dict.get("id", "") if not page_id: - print(f'Page {page_dict.get("url")} has no id - assigning UUID', flush=True) + page_id = uuid4() + else: + try: + UUID(page_id) + except ValueError: + page_id = uuid4() status = page_dict.get("status") if not status and page_dict.get("loadState"): @@ -199,10 +212,7 @@ async def update_crawl_file_and_error_counts( inc_query["errorPageCount"] = error_count await self.crawls.find_one_and_update( - { - "_id": crawl_id, - "type": "crawl", - }, + {"_id": crawl_id}, {"$inc": inc_query}, ) @@ -554,13 +564,17 @@ async def 
re_add_crawl_pages(self, crawl_id: str, oid: UUID): print(f"Deleted pages for crawl {crawl_id}", flush=True) await self.add_crawl_pages_to_db_from_wacz(crawl_id) - async def re_add_all_crawl_pages(self, oid: UUID): - """Re-add pages for all crawls in org""" - crawl_ids = await self.crawls.distinct( - "_id", {"type": "crawl", "finished": {"$ne": None}} - ) + async def re_add_all_crawl_pages( + self, org: Organization, crawl_type: Optional[str] = None + ): + """Re-add pages for all crawls and uploads in org""" + match_query: Dict[str, object] = {"finished": {"$ne": None}} + if crawl_type in ("crawl", "upload"): + match_query["type"] = crawl_type + + crawl_ids = await self.crawls.distinct("_id", match_query) for crawl_id in crawl_ids: - await self.re_add_crawl_pages(crawl_id, oid) + await self.re_add_crawl_pages(crawl_id, org.id) async def get_qa_run_aggregate_counts( self, @@ -630,47 +644,102 @@ async def get_qa_run_aggregate_counts( return sorted(return_data, key=lambda bucket: bucket.lowerBoundary) + def get_crawl_type_from_pages_route(self, request: Request): + """Get crawl type to filter on from request route""" + crawl_type = None + + try: + route_path = request.scope["route"].path + type_path = route_path.split("/")[4] + + if type_path == "uploads": + crawl_type = "upload" + if type_path == "crawls": + crawl_type = "crawl" + except (IndexError, AttributeError): + pass + + return crawl_type + # ============================================================================ # pylint: disable=too-many-arguments, too-many-locals, invalid-name, fixme -def init_pages_api(app, mdb, crawl_ops, org_ops, storage_ops, user_dep): +def init_pages_api( + app, mdb, crawl_ops, org_ops, storage_ops, background_job_ops, user_dep +): """init pages API""" # pylint: disable=invalid-name - ops = PageOps(mdb, crawl_ops, org_ops, storage_ops) + ops = PageOps(mdb, crawl_ops, org_ops, storage_ops, background_job_ops) org_crawl_dep = org_ops.org_crawl_dep @app.post( "/orgs/{oid}/crawls/all/pages/reAdd", - tags=["pages"], - response_model=StartedResponseBool, + tags=["pages", "crawls"], + response_model=StartedResponse, + ) + @app.post( + "/orgs/{oid}/uploads/all/pages/reAdd", + tags=["pages", "uploads"], + response_model=StartedResponse, + ) + @app.post( + "/orgs/{oid}/all-crawls/all/pages/reAdd", + tags=["pages", "all-crawls"], + response_model=StartedResponse, ) async def re_add_all_crawl_pages( - org: Organization = Depends(org_crawl_dep), user: User = Depends(user_dep) + request: Request, + org: Organization = Depends(org_crawl_dep), + user: User = Depends(user_dep), ): - """Re-add pages for all crawls in org (superuser only)""" + """Re-add pages for all crawls in org (superuser only, may delete page QA data!)""" if not user.is_superuser: raise HTTPException(status_code=403, detail="Not Allowed") - asyncio.create_task(ops.re_add_all_crawl_pages(org.id)) - return {"started": True} + crawl_type = ops.get_crawl_type_from_pages_route(request) + job_id = await ops.background_job_ops.create_re_add_org_pages_job( + org.id, crawl_type=crawl_type + ) + return {"started": job_id or ""} @app.post( "/orgs/{oid}/crawls/{crawl_id}/pages/reAdd", - tags=["pages"], + tags=["pages", "crawls"], + response_model=StartedResponseBool, + ) + @app.post( + "/orgs/{oid}/uploads/{crawl_id}/pages/reAdd", + tags=["pages", "uploads"], + response_model=StartedResponseBool, + ) + @app.post( + "/orgs/{oid}/all-crawls/{crawl_id}/pages/reAdd", + tags=["pages", "all-crawls"], response_model=StartedResponseBool, ) async def re_add_crawl_pages( - 
crawl_id: str, org: Organization = Depends(org_crawl_dep) + crawl_id: str, + org: Organization = Depends(org_crawl_dep), ): - """Re-add pages for crawl""" + """Re-add pages for crawl (may delete page QA data!)""" asyncio.create_task(ops.re_add_crawl_pages(crawl_id, org.id)) return {"started": True} @app.get( "/orgs/{oid}/crawls/{crawl_id}/pages/{page_id}", - tags=["pages"], + tags=["pages", "crawls"], + response_model=PageOut, + ) + @app.get( + "/orgs/{oid}/uploads/{crawl_id}/pages/{page_id}", + tags=["pages", "uploads"], + response_model=PageOut, + ) + @app.get( + "/orgs/{oid}/all-crawls/{crawl_id}/pages/{page_id}", + tags=["pages", "all-crawls"], response_model=PageOut, ) async def get_page( @@ -692,7 +761,7 @@ async def get_page_with_qa( page_id: UUID, org: Organization = Depends(org_crawl_dep), ): - """GET single page""" + """GET single page with QA details""" return await ops.get_page_out(page_id, org.id, crawl_id, qa_run_id=qa_run_id) @app.patch( @@ -753,12 +822,22 @@ async def delete_page_notes( delete: PageNoteDelete, org: Organization = Depends(org_crawl_dep), ): - """Edit page note""" + """Delete page note""" return await ops.delete_page_notes(page_id, org.id, delete, crawl_id) @app.get( "/orgs/{oid}/crawls/{crawl_id}/pages", - tags=["pages"], + tags=["pages", "crawls"], + response_model=PaginatedPageOutResponse, + ) + @app.get( + "/orgs/{oid}/uploads/{crawl_id}/pages", + tags=["pages", "uploads"], + response_model=PaginatedPageOutResponse, + ) + @app.get( + "/orgs/{oid}/all-crawls/{crawl_id}/pages", + tags=["pages", "all-crawls"], response_model=PaginatedPageOutResponse, ) async def get_pages_list( diff --git a/backend/btrixcloud/profiles.py b/backend/btrixcloud/profiles.py index ab72422472..9b8ae8da8f 100644 --- a/backend/btrixcloud/profiles.py +++ b/backend/btrixcloud/profiles.py @@ -426,7 +426,7 @@ async def delete_profile( # Delete file from storage if profile.resource: - await self.storage_ops.delete_crawl_file_object(org, profile.resource) + await self.storage_ops.delete_file_object(org, profile.resource) await self.orgs.inc_org_bytes_stored( org.id, -profile.resource.size, "profile" ) diff --git a/backend/btrixcloud/storages.py b/backend/btrixcloud/storages.py index 3c79648529..e167449eb5 100644 --- a/backend/btrixcloud/storages.py +++ b/backend/btrixcloud/storages.py @@ -475,9 +475,7 @@ async def get_presigned_url( return presigned_url - async def delete_crawl_file_object( - self, org: Organization, crawlfile: BaseFile - ) -> bool: + async def delete_file_object(self, org: Organization, crawlfile: BaseFile) -> bool: """delete crawl file from storage.""" return await self._delete_file(org, crawlfile.filename, crawlfile.storage) diff --git a/backend/btrixcloud/uploads.py b/backend/btrixcloud/uploads.py index ded0630719..e95b30427f 100644 --- a/backend/btrixcloud/uploads.py +++ b/backend/btrixcloud/uploads.py @@ -1,9 +1,6 @@ """ handle user uploads into browsertrix """ import uuid -import hashlib -import os -import base64 from urllib.parse import unquote from uuid import UUID @@ -13,7 +10,6 @@ from fastapi import Depends, UploadFile, File from fastapi import HTTPException from starlette.requests import Request -from pathvalidate import sanitize_filename from .basecrawls import BaseCrawlOps from .storages import CHUNK_SIZE @@ -27,18 +23,16 @@ Organization, PaginatedCrawlOutResponse, User, - StorageRef, UpdatedResponse, DeletedResponseQuota, AddedResponseIdQuota, + FilePreparer, + MIN_UPLOAD_PART_SIZE, ) from .pagination import paginated_format, DEFAULT_PAGE_SIZE from .utils 
import dt_now -MIN_UPLOAD_PART_SIZE = 10000000 - - # ============================================================================ class UploadOps(BaseCrawlOps): """upload ops""" @@ -105,9 +99,10 @@ async def stream_iter(): if prev_upload: try: await self._delete_crawl_files(prev_upload, org) + await self.page_ops.delete_crawl_pages(prev_upload.id, org.id) # pylint: disable=broad-exception-caught except Exception as exc: - print("replace file deletion failed", exc) + print(f"Error handling previous upload: {exc}", flush=True) return await self._create_upload( files, name, description, collections, tags, id_, org, user @@ -195,6 +190,8 @@ async def _create_upload( self.event_webhook_ops.create_upload_finished_notification(crawl_id, org.id) ) + asyncio.create_task(self.page_ops.add_crawl_pages_to_db_from_wacz(crawl_id)) + await self.orgs.inc_org_bytes_stored(org.id, file_size, "upload") quota_reached = self.orgs.storage_quota_reached(org) @@ -224,39 +221,6 @@ async def delete_uploads( return {"deleted": True, "storageQuotaReached": quota_reached} -# ============================================================================ -class FilePreparer: - """wrapper to compute digest / name for streaming upload""" - - def __init__(self, prefix, filename): - self.upload_size = 0 - self.upload_hasher = hashlib.sha256() - self.upload_name = prefix + self.prepare_filename(filename) - - def add_chunk(self, chunk): - """add chunk for file""" - self.upload_size += len(chunk) - self.upload_hasher.update(chunk) - - def get_crawl_file(self, storage: StorageRef): - """get crawl file""" - return CrawlFile( - filename=self.upload_name, - hash=self.upload_hasher.hexdigest(), - size=self.upload_size, - storage=storage, - ) - - def prepare_filename(self, filename): - """prepare filename by sanitizing and adding extra string - to avoid duplicates""" - name = sanitize_filename(filename.rsplit("/", 1)[-1]) - parts = name.split(".") - randstr = base64.b32encode(os.urandom(5)).lower() - parts[0] += "-" + randstr.decode("utf-8") - return ".".join(parts) - - # ============================================================================ class UploadFileReader(BufferedReader): """Compute digest on file upload""" @@ -446,3 +410,5 @@ async def delete_uploads( org: Organization = Depends(org_crawl_dep), ): return await ops.delete_uploads(delete_list, org, user) + + return ops diff --git a/backend/test/data/thumbnail.jpg b/backend/test/data/thumbnail.jpg new file mode 100644 index 0000000000..133cfae2b0 Binary files /dev/null and b/backend/test/data/thumbnail.jpg differ diff --git a/backend/test/test_collections.py b/backend/test/test_collections.py index d0210e24da..c2999bb5ad 100644 --- a/backend/test/test_collections.py +++ b/backend/test/test_collections.py @@ -1,10 +1,11 @@ import requests import os +from uuid import uuid4 from zipfile import ZipFile, ZIP_STORED from tempfile import TemporaryFile -from .conftest import API_PREFIX +from .conftest import API_PREFIX, NON_DEFAULT_ORG_NAME, NON_DEFAULT_ORG_SLUG from .utils import read_in_chunks COLLECTION_NAME = "Test collection" @@ -12,11 +13,24 @@ UPDATED_NAME = "Updated tést cöllection" SECOND_COLLECTION_NAME = "second-collection" DESCRIPTION = "Test description" +CAPTION = "Short caption" +UPDATED_CAPTION = "Updated caption" + +NON_PUBLIC_COLL_FIELDS = ( + "modified", + "tags", + "homeUrlPageId", +) +NON_PUBLIC_IMAGE_FIELDS = ("originalFilename", "userid", "userName", "created") + _coll_id = None _second_coll_id = None +_public_coll_id = None +_second_public_coll_id = None 
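For orientation before the public-collection tests added below: a minimal sketch of the flow they exercise, assuming a reachable deployment at `API_PREFIX`, an owner/admin bearer token, and an org id/slug — all placeholder values, not taken from this PR:

```python
import requests

API_PREFIX = "https://app.example.com/api"  # placeholder deployment URL
AUTH_HEADERS = {"Authorization": "Bearer <owner-or-admin-token>"}  # placeholder token
ORG_ID = "<org-uuid>"  # placeholder
ORG_SLUG = "<org-slug>"  # placeholder

# Enable the org's public profile (org owner/admin only)
r = requests.post(
    f"{API_PREFIX}/orgs/{ORG_ID}/public-profile",
    headers=AUTH_HEADERS,
    json={
        "enablePublicProfile": True,
        "publicDescription": "A short public description",
        "publicUrl": "https://example.com",
    },
)
assert r.json()["updated"]

# Public collections can now be listed by org slug with no auth at all
r = requests.get(f"{API_PREFIX}/public/orgs/{ORG_SLUG}/collections")
data = r.json()
print(data["org"]["name"], [coll["name"] for coll in data["collections"]])
```

If the profile is disabled or the slug is unknown, the same endpoint returns a 404 with `public_profile_not_found`, as the tests below verify.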
upload_id = None modified = None +default_org_slug = None curr_dir = os.path.dirname(os.path.realpath(__file__)) @@ -24,12 +38,16 @@ def test_create_collection( crawler_auth_headers, default_org_id, crawler_crawl_id, admin_crawl_id ): + default_thumbnail_name = "default-thumbnail.jpg" + r = requests.post( f"{API_PREFIX}/orgs/{default_org_id}/collections", headers=crawler_auth_headers, json={ "crawlIds": [crawler_crawl_id], "name": COLLECTION_NAME, + "caption": CAPTION, + "defaultThumbnailName": default_thumbnail_name, }, ) assert r.status_code == 200 @@ -48,6 +66,29 @@ def test_create_collection( assert _coll_id in r.json()["collectionIds"] assert r.json()["collections"] == [{"name": COLLECTION_NAME, "id": _coll_id}] + r = requests.get( + f"{API_PREFIX}/orgs/{default_org_id}/collections/{_coll_id}", + headers=crawler_auth_headers, + ) + assert r.status_code == 200 + data = r.json() + + assert data["id"] == _coll_id + assert data["name"] == COLLECTION_NAME + assert data["caption"] == CAPTION + assert data["crawlCount"] == 1 + assert data["pageCount"] > 0 + assert data["totalSize"] > 0 + modified = data["modified"] + assert modified + assert modified.endswith("Z") + + assert data["dateEarliest"] + assert data["dateLatest"] + + assert data["defaultThumbnailName"] == default_thumbnail_name + assert data["allowPublicDownload"] + def test_create_public_collection( crawler_auth_headers, default_org_id, crawler_crawl_id, admin_crawl_id @@ -58,7 +99,9 @@ def test_create_public_collection( json={ "crawlIds": [crawler_crawl_id], "name": PUBLIC_COLLECTION_NAME, - "isPublic": True, + "caption": CAPTION, + "access": "public", + "allowPublicDownload": False, }, ) assert r.status_code == 200 @@ -66,6 +109,7 @@ def test_create_public_collection( assert data["added"] assert data["name"] == PUBLIC_COLLECTION_NAME + global _public_coll_id _public_coll_id = data["id"] # Verify that it is public @@ -73,7 +117,7 @@ def test_create_public_collection( f"{API_PREFIX}/orgs/{default_org_id}/collections/{_public_coll_id}", headers=crawler_auth_headers, ) - assert r.json()["isPublic"] + assert r.json()["access"] == "public" def test_create_collection_taken_name( @@ -113,6 +157,7 @@ def test_update_collection( headers=crawler_auth_headers, json={ "description": DESCRIPTION, + "caption": UPDATED_CAPTION, }, ) assert r.status_code == 200 @@ -128,6 +173,7 @@ def test_update_collection( assert data["id"] == _coll_id assert data["name"] == COLLECTION_NAME assert data["description"] == DESCRIPTION + assert data["caption"] == UPDATED_CAPTION assert data["crawlCount"] == 1 assert data["pageCount"] > 0 assert data["totalSize"] > 0 @@ -135,6 +181,9 @@ def test_update_collection( modified = data["modified"] assert modified assert modified.endswith("Z") + assert data["dateEarliest"] + assert data["dateLatest"] + assert data["defaultThumbnailName"] def test_rename_collection( @@ -209,6 +258,8 @@ def test_add_remove_crawl_from_collection( assert data["totalSize"] > 0 assert data["modified"] >= modified assert data["tags"] == ["wr-test-2", "wr-test-1"] + assert data["dateEarliest"] + assert data["dateLatest"] # Verify it was added r = requests.get( @@ -231,6 +282,8 @@ def test_add_remove_crawl_from_collection( assert data["totalSize"] == 0 assert data["modified"] >= modified assert data.get("tags", []) == [] + assert data.get("dateEarliest") is None + assert data.get("dateLatest") is None # Verify they were removed r = requests.get( @@ -259,6 +312,8 @@ def test_add_remove_crawl_from_collection( assert data["totalSize"] > 0 assert 
data["modified"] >= modified assert data["tags"] == ["wr-test-2", "wr-test-1"] + assert data["dateEarliest"] + assert data["dateLatest"] def test_get_collection(crawler_auth_headers, default_org_id): @@ -272,11 +327,15 @@ def test_get_collection(crawler_auth_headers, default_org_id): assert data["name"] == UPDATED_NAME assert data["oid"] == default_org_id assert data["description"] == DESCRIPTION + assert data["caption"] == UPDATED_CAPTION assert data["crawlCount"] == 2 assert data["pageCount"] > 0 assert data["totalSize"] > 0 assert data["modified"] >= modified assert data["tags"] == ["wr-test-2", "wr-test-1"] + assert data["dateEarliest"] + assert data["dateLatest"] + assert data["defaultThumbnailName"] def test_get_collection_replay(crawler_auth_headers, default_org_id): @@ -290,11 +349,15 @@ def test_get_collection_replay(crawler_auth_headers, default_org_id): assert data["name"] == UPDATED_NAME assert data["oid"] == default_org_id assert data["description"] == DESCRIPTION + assert data["caption"] == UPDATED_CAPTION assert data["crawlCount"] == 2 assert data["pageCount"] > 0 assert data["totalSize"] > 0 assert data["modified"] >= modified assert data["tags"] == ["wr-test-2", "wr-test-1"] + assert data["dateEarliest"] + assert data["dateLatest"] + assert data["defaultThumbnailName"] resources = data["resources"] assert resources @@ -311,12 +374,31 @@ def test_collection_public(crawler_auth_headers, default_org_id): ) assert r.status_code == 404 - # make public + # make public and test replay headers r = requests.patch( f"{API_PREFIX}/orgs/{default_org_id}/collections/{_coll_id}", headers=crawler_auth_headers, json={ - "isPublic": True, + "access": "public", + }, + ) + assert r.status_code == 200 + assert r.json()["updated"] + + r = requests.get( + f"{API_PREFIX}/orgs/{default_org_id}/collections/{_coll_id}/public/replay.json", + headers=crawler_auth_headers, + ) + assert r.status_code == 200 + assert r.headers["Access-Control-Allow-Origin"] == "*" + assert r.headers["Access-Control-Allow-Headers"] == "*" + + # make unlisted and test replay headers + r = requests.patch( + f"{API_PREFIX}/orgs/{default_org_id}/collections/{_coll_id}", + headers=crawler_auth_headers, + json={ + "access": "unlisted", }, ) assert r.status_code == 200 @@ -335,7 +417,7 @@ def test_collection_public(crawler_auth_headers, default_org_id): f"{API_PREFIX}/orgs/{default_org_id}/collections/{_coll_id}", headers=crawler_auth_headers, json={ - "isPublic": False, + "access": "private", }, ) @@ -346,6 +428,24 @@ def test_collection_public(crawler_auth_headers, default_org_id): assert r.status_code == 404 +def test_collection_access_invalid_value(crawler_auth_headers, default_org_id): + r = requests.patch( + f"{API_PREFIX}/orgs/{default_org_id}/collections/{_coll_id}", + headers=crawler_auth_headers, + json={ + "access": "invalid", + }, + ) + assert r.status_code == 422 + + r = requests.get( + f"{API_PREFIX}/orgs/{default_org_id}/collections/{_coll_id}", + headers=crawler_auth_headers, + ) + assert r.status_code == 200 + assert r.json()["access"] == "private" + + def test_add_upload_to_collection(crawler_auth_headers, default_org_id): with open(os.path.join(curr_dir, "data", "example.wacz"), "rb") as fh: r = requests.put( @@ -374,6 +474,9 @@ def test_add_upload_to_collection(crawler_auth_headers, default_org_id): assert data["totalSize"] > 0 assert data["modified"] assert data["tags"] == ["wr-test-2", "wr-test-1"] + assert data["dateEarliest"] + assert data["dateLatest"] + assert data["defaultThumbnailName"] # Verify it 
was added r = requests.get( @@ -420,15 +523,20 @@ def test_list_collections( assert len(items) == 3 first_coll = [coll for coll in items if coll["name"] == UPDATED_NAME][0] - assert first_coll["id"] + assert first_coll["id"] == _coll_id assert first_coll["name"] == UPDATED_NAME assert first_coll["oid"] == default_org_id assert first_coll["description"] == DESCRIPTION + assert first_coll["caption"] == UPDATED_CAPTION assert first_coll["crawlCount"] == 3 assert first_coll["pageCount"] > 0 assert first_coll["totalSize"] > 0 assert first_coll["modified"] assert first_coll["tags"] == ["wr-test-2", "wr-test-1"] + assert first_coll["access"] == "private" + assert first_coll["dateEarliest"] + assert first_coll["dateLatest"] + assert first_coll["defaultThumbnailName"] second_coll = [coll for coll in items if coll["name"] == SECOND_COLLECTION_NAME][0] assert second_coll["id"] @@ -440,6 +548,9 @@ def test_list_collections( assert second_coll["totalSize"] > 0 assert second_coll["modified"] assert second_coll["tags"] == ["wr-test-2"] + assert second_coll["access"] == "private" + assert second_coll["dateEarliest"] + assert second_coll["dateLatest"] def test_remove_upload_from_collection(crawler_auth_headers, default_org_id): @@ -525,6 +636,26 @@ def test_filter_sort_collections( assert coll["oid"] == default_org_id assert coll.get("description") is None + # Test filtering by access + name_prefix = name_prefix.upper() + r = requests.get( + f"{API_PREFIX}/orgs/{default_org_id}/collections?access=public", + headers=crawler_auth_headers, + ) + assert r.status_code == 200 + data = r.json() + assert data["total"] == 1 + + items = data["items"] + assert len(items) == 1 + + coll = items[0] + assert coll["id"] + assert coll["name"] == PUBLIC_COLLECTION_NAME + assert coll["oid"] == default_org_id + assert coll.get("description") is None + assert coll["access"] == "public" + # Test sorting by name, ascending (default) r = requests.get( f"{API_PREFIX}/orgs/{default_org_id}/collections?sortBy=name", @@ -666,6 +797,582 @@ def test_filter_sort_collections( assert r.json()["detail"] == "invalid_sort_direction" +def test_list_public_collections( + crawler_auth_headers, + admin_auth_headers, + default_org_id, + non_default_org_id, + crawler_crawl_id, + admin_crawl_id, +): + # Create new public collection + r = requests.post( + f"{API_PREFIX}/orgs/{default_org_id}/collections", + headers=crawler_auth_headers, + json={ + "crawlIds": [crawler_crawl_id], + "name": "Second public collection", + "description": "Lorem ipsum", + "access": "public", + }, + ) + assert r.status_code == 200 + + global _second_public_coll_id + _second_public_coll_id = r.json()["id"] + + # Get default org slug + r = requests.get( + f"{API_PREFIX}/orgs/{default_org_id}", + headers=crawler_auth_headers, + ) + assert r.status_code == 200 + data = r.json() + + global default_org_slug + default_org_slug = data["slug"] + + org_name = data["name"] + + # Verify that public profile isn't enabled + assert data["enablePublicProfile"] is False + assert data["publicDescription"] == "" + assert data["publicUrl"] == "" + + # Try listing public collections without org public profile enabled + r = requests.get(f"{API_PREFIX}/public/orgs/{default_org_slug}/collections") + assert r.status_code == 404 + assert r.json()["detail"] == "public_profile_not_found" + + # Enable public profile on org + public_description = "This is a test public org!" 
+ public_url = "https://example.com" + + r = requests.post( + f"{API_PREFIX}/orgs/{default_org_id}/public-profile", + headers=admin_auth_headers, + json={ + "enablePublicProfile": True, + "publicDescription": public_description, + "publicUrl": public_url, + }, + ) + assert r.status_code == 200 + assert r.json()["updated"] + + r = requests.get( + f"{API_PREFIX}/orgs/{default_org_id}", + headers=admin_auth_headers, + ) + assert r.status_code == 200 + data = r.json() + assert data["enablePublicProfile"] + assert data["publicDescription"] == public_description + assert data["publicUrl"] == public_url + + # List public collections with no auth (no public profile) + r = requests.get(f"{API_PREFIX}/public/orgs/{default_org_slug}/collections") + assert r.status_code == 200 + data = r.json() + + org_data = data["org"] + assert org_data["name"] == org_name + assert org_data["description"] == public_description + assert org_data["url"] == public_url + + collections = data["collections"] + assert len(collections) == 2 + for collection in collections: + assert collection["id"] in (_public_coll_id, _second_public_coll_id) + assert collection["oid"] + assert collection["access"] == "public" + assert collection["name"] + assert collection["dateEarliest"] + assert collection["dateLatest"] + assert collection["crawlCount"] > 0 + assert collection["pageCount"] > 0 + assert collection["totalSize"] > 0 + + # Test non-existing slug - it should return a 404 but not reveal + # whether or not an org exists with that slug + r = requests.get(f"{API_PREFIX}/public/orgs/nonexistentslug/collections") + assert r.status_code == 404 + assert r.json()["detail"] == "public_profile_not_found" + + +def test_list_public_collections_no_colls(non_default_org_id, admin_auth_headers): + # Test existing org that's not public - should return same 404 as + # if org doesn't exist + r = requests.get(f"{API_PREFIX}/public/orgs/{NON_DEFAULT_ORG_SLUG}/collections") + assert r.status_code == 404 + assert r.json()["detail"] == "public_profile_not_found" + + # Enable public profile on org with zero public collections + r = requests.post( + f"{API_PREFIX}/orgs/{non_default_org_id}/public-profile", + headers=admin_auth_headers, + json={ + "enablePublicProfile": True, + }, + ) + assert r.status_code == 200 + assert r.json()["updated"] + + # List public collections with no auth - should still get profile even + # with no public collections + r = requests.get(f"{API_PREFIX}/public/orgs/{NON_DEFAULT_ORG_SLUG}/collections") + assert r.status_code == 200 + data = r.json() + assert data["org"]["name"] == NON_DEFAULT_ORG_NAME + assert data["collections"] == [] + + +def test_set_collection_home_url( + crawler_auth_headers, default_org_id, crawler_crawl_id +): + # Get a page id from crawler_crawl_id + r = requests.get( + f"{API_PREFIX}/orgs/{default_org_id}/crawls/{crawler_crawl_id}/pages", + headers=crawler_auth_headers, + ) + assert r.status_code == 200 + data = r.json() + assert data["total"] >= 1 + + page = data["items"][0] + assert page + + page_id = page["id"] + assert page_id + + page_url = page["url"] + page_ts = page["ts"] + + # Set page as home url + r = requests.post( + f"{API_PREFIX}/orgs/{default_org_id}/collections/{_public_coll_id}/home-url", + headers=crawler_auth_headers, + json={"pageId": page_id}, + ) + assert r.status_code == 200 + assert r.json()["updated"] + + # Check that fields were set in collection as expected + r = requests.get( + f"{API_PREFIX}/orgs/{default_org_id}/collections/{_public_coll_id}", + 
headers=crawler_auth_headers, + ) + assert r.status_code == 200 + data = r.json() + assert data["homeUrl"] == page_url + assert data["homeUrlTs"] == page_ts + assert data["homeUrlPageId"] == page_id + + +def test_collection_url_list(crawler_auth_headers, default_org_id): + r = requests.get( + f"{API_PREFIX}/orgs/{default_org_id}/collections/{_public_coll_id}/urls", + headers=crawler_auth_headers, + ) + assert r.status_code == 200 + data = r.json() + + assert data["total"] >= 1 + urls = data["items"] + assert urls + + for url in urls: + assert url["url"] + assert url["count"] >= 1 + + snapshots = url["snapshots"] + assert snapshots + + for snapshot in snapshots: + assert snapshot["pageId"] + assert snapshot["ts"] + assert snapshot["status"] + + +def test_upload_collection_thumbnail(crawler_auth_headers, default_org_id): + with open(os.path.join(curr_dir, "data", "thumbnail.jpg"), "rb") as fh: + r = requests.put( + f"{API_PREFIX}/orgs/{default_org_id}/collections/{_public_coll_id}/thumbnail?filename=thumbnail.jpg", + headers=crawler_auth_headers, + data=read_in_chunks(fh), + ) + assert r.status_code == 200 + assert r.json()["added"] + + r = requests.get( + f"{API_PREFIX}/orgs/{default_org_id}/collections/{_public_coll_id}", + headers=crawler_auth_headers, + ) + assert r.status_code == 200 + thumbnail = r.json()["thumbnail"] + + assert thumbnail["name"] + assert thumbnail["path"] + assert thumbnail["hash"] + assert thumbnail["size"] > 0 + + assert thumbnail["originalFilename"] == "thumbnail.jpg" + assert thumbnail["mime"] == "image/jpeg" + assert thumbnail["userid"] + assert thumbnail["userName"] + assert thumbnail["created"] + + +def test_set_collection_default_thumbnail(crawler_auth_headers, default_org_id): + default_thumbnail_name = "orange-default.avif" + + r = requests.patch( + f"{API_PREFIX}/orgs/{default_org_id}/collections/{_second_public_coll_id}", + headers=crawler_auth_headers, + json={"defaultThumbnailName": default_thumbnail_name}, + ) + assert r.status_code == 200 + assert r.json()["updated"] + + r = requests.get( + f"{API_PREFIX}/orgs/{default_org_id}/collections/{_second_public_coll_id}", + headers=crawler_auth_headers, + ) + assert r.status_code == 200 + data = r.json() + + assert data["id"] == _second_public_coll_id + assert data["defaultThumbnailName"] == default_thumbnail_name + + +def test_list_public_colls_home_url_thumbnail(): + # Check we get expected data for each public collection + # and nothing we don't expect + non_public_fields = ( + "oid", + "modified", + "crawlCount", + "pageCount", + "totalSize", + "tags", + "access", + "homeUrlPageId", + ) + non_public_image_fields = ("originalFilename", "userid", "userName", "created") + + r = requests.get(f"{API_PREFIX}/public/orgs/{default_org_slug}/collections") + assert r.status_code == 200 + collections = r.json()["collections"] + assert len(collections) == 2 + + for coll in collections: + assert coll["id"] in (_public_coll_id, _second_public_coll_id) + assert coll["oid"] + assert coll["access"] == "public" + assert coll["name"] + assert coll["resources"] + assert coll["dateEarliest"] + assert coll["dateLatest"] + assert coll["crawlCount"] > 0 + assert coll["pageCount"] > 0 + assert coll["totalSize"] > 0 + + for field in NON_PUBLIC_COLL_FIELDS: + assert field not in coll + + if coll["id"] == _public_coll_id: + assert coll["allowPublicDownload"] is False + + assert coll["caption"] == CAPTION + + assert coll["homeUrl"] + assert coll["homeUrlTs"] + + thumbnail = coll["thumbnail"] + assert thumbnail + + assert 
thumbnail["name"] + assert thumbnail["path"] + assert thumbnail["hash"] + assert thumbnail["size"] + assert thumbnail["mime"] + + for field in NON_PUBLIC_IMAGE_FIELDS: + assert field not in thumbnail + + if coll["id"] == _second_public_coll_id: + assert coll["description"] + assert coll["defaultThumbnailName"] == "orange-default.avif" + assert coll["allowPublicDownload"] + + +def test_get_public_collection(default_org_id): + r = requests.get( + f"{API_PREFIX}/public/orgs/{default_org_slug}/collections/{_public_coll_id}" + ) + assert r.status_code == 200 + coll = r.json() + + assert coll["id"] == _public_coll_id + assert coll["oid"] == default_org_id + assert coll["access"] == "public" + assert coll["name"] + assert coll["resources"] + assert coll["dateEarliest"] + assert coll["dateLatest"] + assert coll["crawlCount"] > 0 + assert coll["pageCount"] > 0 + assert coll["totalSize"] > 0 + + for field in NON_PUBLIC_COLL_FIELDS: + assert field not in coll + + assert coll["caption"] == CAPTION + + assert coll["homeUrl"] + assert coll["homeUrlTs"] + + assert coll["allowPublicDownload"] is False + + thumbnail = coll["thumbnail"] + assert thumbnail + + assert thumbnail["name"] + assert thumbnail["path"] + assert thumbnail["hash"] + assert thumbnail["size"] + assert thumbnail["mime"] + + for field in NON_PUBLIC_IMAGE_FIELDS: + assert field not in thumbnail + + # Invalid org slug - don't reveal whether org exists or not, use + # same exception as if collection doesn't exist + r = requests.get( + f"{API_PREFIX}/public/orgs/doesntexist/collections/{_public_coll_id}" + ) + assert r.status_code == 404 + assert r.json()["detail"] == "collection_not_found" + + # Invalid collection id + random_uuid = uuid4() + r = requests.get( + f"{API_PREFIX}/public/orgs/{default_org_slug}/collections/{random_uuid}" + ) + assert r.status_code == 404 + assert r.json()["detail"] == "collection_not_found" + + # Collection isn't public + r = requests.get( + f"{API_PREFIX}/public/orgs/{default_org_slug}/collections/{ _coll_id}" + ) + assert r.status_code == 404 + assert r.json()["detail"] == "collection_not_found" + + +def test_get_public_collection_unlisted(crawler_auth_headers, default_org_id): + # Make second public coll unlisted + r = requests.patch( + f"{API_PREFIX}/orgs/{default_org_id}/collections/{_second_public_coll_id}", + headers=crawler_auth_headers, + json={ + "access": "unlisted", + }, + ) + assert r.status_code == 200 + assert r.json()["updated"] + + # Verify single public collection GET endpoint works for unlisted collection + r = requests.get( + f"{API_PREFIX}/public/orgs/{default_org_slug}/collections/{_second_public_coll_id}" + ) + assert r.status_code == 200 + coll = r.json() + + assert coll["id"] == _second_public_coll_id + assert coll["oid"] == default_org_id + assert coll["access"] == "unlisted" + assert coll["name"] + assert coll["resources"] + assert coll["dateEarliest"] + assert coll["dateLatest"] + assert coll["crawlCount"] > 0 + assert coll["pageCount"] > 0 + assert coll["totalSize"] > 0 + assert coll["defaultThumbnailName"] == "orange-default.avif" + assert coll["allowPublicDownload"] + + for field in NON_PUBLIC_COLL_FIELDS: + assert field not in coll + + +def test_get_public_collection_unlisted_org_profile_disabled( + admin_auth_headers, default_org_id +): + # Disable org profile + r = requests.post( + f"{API_PREFIX}/orgs/{default_org_id}/public-profile", + headers=admin_auth_headers, + json={ + "enablePublicProfile": False, + }, + ) + assert r.status_code == 200 + assert r.json()["updated"] + + # 
Verify we can still get public details for unlisted collection + r = requests.get( + f"{API_PREFIX}/public/orgs/{default_org_slug}/collections/{_second_public_coll_id}" + ) + assert r.status_code == 200 + coll = r.json() + + assert coll["id"] == _second_public_coll_id + assert coll["oid"] == default_org_id + assert coll["access"] == "unlisted" + assert coll["name"] + assert coll["resources"] + assert coll["dateEarliest"] + assert coll["dateLatest"] + assert coll["crawlCount"] > 0 + assert coll["pageCount"] > 0 + assert coll["totalSize"] > 0 + assert coll["defaultThumbnailName"] == "orange-default.avif" + assert coll["allowPublicDownload"] + + for field in NON_PUBLIC_COLL_FIELDS: + assert field not in coll + + # Re-enable org profile + r = requests.post( + f"{API_PREFIX}/orgs/{default_org_id}/public-profile", + headers=admin_auth_headers, + json={ + "enablePublicProfile": True, + }, + ) + assert r.status_code == 200 + assert r.json()["updated"] + + +def test_delete_thumbnail(crawler_auth_headers, default_org_id): + r = requests.delete( + f"{API_PREFIX}/orgs/{default_org_id}/collections/{_public_coll_id}/thumbnail", + headers=crawler_auth_headers, + ) + assert r.status_code == 200 + assert r.json()["deleted"] + + r = requests.get( + f"{API_PREFIX}/orgs/{default_org_id}/collections/{_public_coll_id}", + headers=crawler_auth_headers, + ) + assert r.status_code == 200 + assert r.json().get("thumbnail") is None + + r = requests.delete( + f"{API_PREFIX}/orgs/{default_org_id}/collections/{_second_public_coll_id}/thumbnail", + headers=crawler_auth_headers, + ) + assert r.status_code == 404 + assert r.json()["detail"] == "thumbnail_not_found" + + +def test_unset_collection_home_url( + crawler_auth_headers, default_org_id, crawler_crawl_id +): + # Unset home url + r = requests.post( + f"{API_PREFIX}/orgs/{default_org_id}/collections/{_public_coll_id}/home-url", + headers=crawler_auth_headers, + json={"pageId": None}, + ) + assert r.status_code == 200 + assert r.json()["updated"] + + # Check that fields were set in collection as expected + r = requests.get( + f"{API_PREFIX}/orgs/{default_org_id}/collections/{_public_coll_id}", + headers=crawler_auth_headers, + ) + assert r.status_code == 200 + data = r.json() + assert data.get("homeUrl") is None + assert data.get("homeUrlTs") is None + assert data.get("homeUrlPageId") is None + + +def test_download_streaming_public_collection(crawler_auth_headers, default_org_id): + # Check that download is blocked if allowPublicDownload is False + with requests.get( + f"{API_PREFIX}/public/orgs/{default_org_slug}/collections/{_public_coll_id}/download", + stream=True, + ) as r: + assert r.status_code == 403 + + # Set allowPublicDownload to True and then check downloading works + r = requests.patch( + f"{API_PREFIX}/orgs/{default_org_id}/collections/{_public_coll_id}", + headers=crawler_auth_headers, + json={ + "allowPublicDownload": True, + }, + ) + assert r.status_code == 200 + assert r.json()["updated"] + + with TemporaryFile() as fh: + with requests.get( + f"{API_PREFIX}/public/orgs/{default_org_slug}/collections/{_public_coll_id}/download", + stream=True, + ) as r: + assert r.status_code == 200 + for chunk in r.iter_content(): + fh.write(chunk) + + fh.seek(0) + with ZipFile(fh, "r") as zip_file: + contents = zip_file.namelist() + + assert len(contents) == 2 + for filename in contents: + assert filename.endswith(".wacz") or filename == "datapackage.json" + assert zip_file.getinfo(filename).compress_type == ZIP_STORED + + +def 
test_download_streaming_public_collection_profile_disabled( + admin_auth_headers, default_org_id +): + # Disable org public profile and ensure download still works for public collection + r = requests.post( + f"{API_PREFIX}/orgs/{default_org_id}/public-profile", + headers=admin_auth_headers, + json={ + "enablePublicProfile": False, + }, + ) + assert r.status_code == 200 + assert r.json()["updated"] + + with TemporaryFile() as fh: + with requests.get( + f"{API_PREFIX}/public/orgs/{default_org_slug}/collections/{_public_coll_id}/download", + stream=True, + ) as r: + assert r.status_code == 200 + for chunk in r.iter_content(): + fh.write(chunk) + + fh.seek(0) + with ZipFile(fh, "r") as zip_file: + contents = zip_file.namelist() + + assert len(contents) == 2 + for filename in contents: + assert filename.endswith(".wacz") or filename == "datapackage.json" + assert zip_file.getinfo(filename).compress_type == ZIP_STORED + + def test_delete_collection(crawler_auth_headers, default_org_id, crawler_crawl_id): # Delete second collection r = requests.delete( diff --git a/backend/test/test_org.py b/backend/test/test_org.py index ab652aa266..a3de79c5cd 100644 --- a/backend/test/test_org.py +++ b/backend/test/test_org.py @@ -17,7 +17,7 @@ def test_ensure_only_one_default_org(admin_auth_headers): r = requests.get(f"{API_PREFIX}/orgs", headers=admin_auth_headers) data = r.json() - assert data["total"] == 1 + assert data["total"] == 2 orgs = data["items"] default_orgs = [org for org in orgs if org["default"]] diff --git a/backend/test/test_uploads.py b/backend/test/test_uploads.py index fb7543d0a2..3fb1c1c44b 100644 --- a/backend/test/test_uploads.py +++ b/backend/test/test_uploads.py @@ -232,6 +232,49 @@ def test_get_upload_replay_json_admin( assert "files" not in data +def test_get_upload_pages(admin_auth_headers, default_org_id, upload_id): + # Give time for pages to finish being uploaded + time.sleep(10) + + r = requests.get( + f"{API_PREFIX}/orgs/{default_org_id}/uploads/{upload_id}/pages", + headers=admin_auth_headers, + ) + assert r.status_code == 200 + data = r.json() + + assert data["total"] > 0 + + pages = data["items"] + for page in pages: + assert page["id"] + assert page["oid"] + assert page["crawl_id"] == upload_id + assert page["url"] + assert page["ts"] + assert page.get("title") or page.get("title") is None + + page_id = pages[0]["id"] + r = requests.get( + f"{API_PREFIX}/orgs/{default_org_id}/uploads/{upload_id}/pages/{page_id}", + headers=admin_auth_headers, + ) + assert r.status_code == 200 + page = r.json() + + assert page["id"] == page_id + assert page["oid"] + assert page["crawl_id"] + assert page["url"] + assert page["ts"] + assert page.get("title") or page.get("title") is None + + assert page["notes"] == [] + assert page.get("userid") is None + assert page.get("modified") is None + assert page.get("approved") is None + + def test_replace_upload( admin_auth_headers, default_org_id, uploads_collection_id, upload_id ): diff --git a/chart/app-templates/background_job.yaml b/chart/app-templates/background_job.yaml index 132d3bf8fe..f47dd2acfd 100644 --- a/chart/app-templates/background_job.yaml +++ b/chart/app-templates/background_job.yaml @@ -8,7 +8,7 @@ metadata: btrix.org: {{ oid }} spec: - ttlSecondsAfterFinished: 0 + ttlSecondsAfterFinished: 90 backoffLimit: 3 template: spec: @@ -38,6 +38,9 @@ spec: - name: OID value: {{ oid }} + - name: CRAWL_TYPE + value: {{ crawl_type }} + envFrom: - configMapRef: name: backend-env-config diff --git a/chart/templates/frontend.yaml 
b/chart/templates/frontend.yaml index 261b4fec0f..d90ddd7a88 100644 --- a/chart/templates/frontend.yaml +++ b/chart/templates/frontend.yaml @@ -62,9 +62,9 @@ spec: value: "{{ .Values.minio_local_bucket_name }}" {{- end }} - {{- if .Values.inject_analytics }} - - name: INJECT_ANALYTICS - value: {{ .Values.inject_analytics }} + {{- if .Values.inject_extra }} + - name: INJECT_EXTRA + value: {{ .Values.inject_extra }} {{- end }} resources: diff --git a/chart/values.yaml b/chart/values.yaml index d422f60a73..753858922b 100644 --- a/chart/values.yaml +++ b/chart/values.yaml @@ -451,9 +451,9 @@ ingress: ingress_class: nginx -# Optional: Analytics injection script -# This runs as a blocking script on the frontend, so usually you'll want to have it just add a single script tag to the page with the `defer` attribute. -# inject_analytics: // your analytics injection script here +# Optional: Front-end injected script +# This runs as a blocking script on the frontend, so usually you'll want to have it just add a single script tag to the page with the `defer` attribute. Useful for things like analytics and bug tracking. +# inject_extra: // your front-end injected script # Signing Options diff --git a/frontend/00-browsertrix-nginx-init.sh b/frontend/00-browsertrix-nginx-init.sh index c5c112d7d9..a833051d0e 100755 --- a/frontend/00-browsertrix-nginx-init.sh +++ b/frontend/00-browsertrix-nginx-init.sh @@ -5,7 +5,7 @@ rm /etc/nginx/conf.d/default.conf if [ -z "$LOCAL_MINIO_HOST" ]; then echo "no local minio, clearing out minio route" - echo "" > /etc/nginx/includes/minio.conf + echo "" >/etc/nginx/includes/minio.conf else echo "local minio: replacing \$LOCAL_MINIO_HOST with \"$LOCAL_MINIO_HOST\", \$LOCAL_BUCKET with \"$LOCAL_BUCKET\"" sed -i "s/\$LOCAL_MINIO_HOST/$LOCAL_MINIO_HOST/g" /etc/nginx/includes/minio.conf @@ -13,15 +13,15 @@ else fi # Add analytics script, if provided -if [ -z "$INJECT_ANALYTICS" ]; then +if [ -z "$INJECT_EXTRA" ]; then echo "analytics disabled, injecting blank script" - echo "" > /usr/share/nginx/html/extra.js + echo "" >/usr/share/nginx/html/extra.js else echo "analytics enabled, injecting script" - echo "$INJECT_ANALYTICS" > /usr/share/nginx/html/extra.js + echo "$INJECT_EXTRA" >/usr/share/nginx/html/extra.js fi mkdir -p /etc/nginx/resolvers/ -echo resolver $(grep -oP '(?<=nameserver\s)[^\s]+' /etc/resolv.conf | awk '{ if ($1 ~ /:/) { printf "[" $1 "] "; } else { printf $1 " "; } }') valid=10s ipv6=off";" > /etc/nginx/resolvers/resolvers.conf +echo resolver $(grep -oP '(?<=nameserver\s)[^\s]+' /etc/resolv.conf | awk '{ if ($1 ~ /:/) { printf "[" $1 "] "; } else { printf $1 " "; } }') valid=10s ipv6=off";" >/etc/nginx/resolvers/resolvers.conf cat /etc/nginx/resolvers/resolvers.conf diff --git a/frontend/config/define.js b/frontend/config/define.js new file mode 100644 index 0000000000..efe09eda5b --- /dev/null +++ b/frontend/config/define.js @@ -0,0 +1,28 @@ +/** + * Global constants to make available to build + * + * @TODO Consolidate webpack and web-test-runner esbuild configs + */ +const path = require("path"); + +const isDevServer = process.env.WEBPACK_SERVE; + +const dotEnvPath = path.resolve( + process.cwd(), + `.env${isDevServer ? `.local` : ""}`, +); +require("dotenv").config({ + path: dotEnvPath, +}); + +const WEBSOCKET_HOST = + isDevServer && process.env.API_BASE_URL + ? 
new URL(process.env.API_BASE_URL).host + : process.env.WEBSOCKET_HOST || ""; + +module.exports = { + "window.process.env.WEBSOCKET_HOST": JSON.stringify(WEBSOCKET_HOST), + "window.process.env.ANALYTICS_NAMESPACE": JSON.stringify( + process.env.ANALYTICS_NAMESPACE || "", + ), +}; diff --git a/frontend/docs/docs/deploy/customization.md b/frontend/docs/docs/deploy/customization.md index 54caa35cd0..199cadf6e4 100644 --- a/frontend/docs/docs/deploy/customization.md +++ b/frontend/docs/docs/deploy/customization.md @@ -208,16 +208,22 @@ Browsertrix has the ability to cryptographically sign WACZ files with [Authsign] You can enable sign-ups by setting `registration_enabled` to `"1"`. Once enabled, your users can register by visiting `/sign-up`. -## Analytics +## Inject Extra JavaScript -You can add a script to inject any sort of analytics into the frontend by setting `inject_analytics` to the script. If present, it will be injected as a blocking script tag into every page — so we recommend you create the script tags that handle your analytics from within this script. +You can add a script to inject analytics, bug reporting tools, etc. into the frontend by setting `inject_extra` to script contents of your choosing. If present, it will be injected as a blocking script tag that runs when the frontend web app is initialized. -For example, here's a script that adds Plausible Analytics tracking: +For example, enabling analytics and tracking might look like this: -```ts -const plausible = document.createElement("script"); -plausible.src = "https://plausible.io/js/script.js"; -plausible.defer = true; -plausible.dataset.domain = "app.browsertrix.com"; -document.head.appendChild(plausible); +```yaml +inject_extra: > + const analytics = document.createElement("script"); + analytics.src = "https://cdn.example.com/analytics.js"; + analytics.defer = true; + + document.head.appendChild(analytics); + + window.analytics = window.analytics + || function () { (window.analytics.q = window.analytics.q || []).push(arguments); }; ``` + +Note that the script will only run when the web app loads, i.e. the first time the app is loaded in the browser and on hard refresh. The script will not run again upon clicking a link in the web app. This shouldn't be an issue with most analytics libraries, which should listen for changes to [window history](https://developer.mozilla.org/en-US/docs/Web/API/History). If you have a custom script that needs to re-run when the frontend URL changes, you'll need to add an event listener for the [`popstate` event](https://developer.mozilla.org/en-US/docs/Web/API/Window/popstate_event). diff --git a/frontend/docs/docs/user-guide/crawl-workflows.md b/frontend/docs/docs/user-guide/crawl-workflows.md index f5396e020b..ecf039c0dc 100644 --- a/frontend/docs/docs/user-guide/crawl-workflows.md +++ b/frontend/docs/docs/user-guide/crawl-workflows.md @@ -8,7 +8,7 @@ You can create, view, search for, and run crawl workflows from the **Crawling** ## Create a Crawl Workflow -Create new crawl workflows from the **Crawling** page, or the _Create New ..._ shortcut from **Overview**. +Create new crawl workflows from the **Crawling** page, or the _Create New ..._ shortcut from **Dashboard**. ### Choose what to crawl @@ -38,4 +38,4 @@ Re-running a crawl workflow can be useful to capture a website as it changes ove ## Status -Finished crawl workflows inherit the [status of the last archived item they created](archived-items.md#status). 
Crawl workflows that are in progress maintain their [own statuses](./running-crawl.md#crawl-workflow-status). \ No newline at end of file +Finished crawl workflows inherit the [status of the last archived item they created](archived-items.md#status). Crawl workflows that are in progress maintain their [own statuses](./running-crawl.md#crawl-workflow-status). diff --git a/frontend/docs/docs/user-guide/getting-started.md b/frontend/docs/docs/user-guide/getting-started.md index c08d6bed55..d0b83a9ccb 100644 --- a/frontend/docs/docs/user-guide/getting-started.md +++ b/frontend/docs/docs/user-guide/getting-started.md @@ -12,7 +12,7 @@ To start crawling with hosted Browsertrix, you'll need a Browsertrix account. [S ## Starting the crawl -Once you've logged in you should see your org [overview](overview.md). If you land somewhere else, navigate to **Overview**. +Once you've logged in you should see your org [overview](overview.md). If you land somewhere else, navigate to **Dashboard**. 1. Tap the _Create New..._ shortcut and select **Crawl Workflow**. 2. Choose **Page List**. We'll get into the details of the options [later](./crawl-workflows.md), but this is a good starting point for a simple crawl. diff --git a/frontend/docs/docs/user-guide/org-settings.md b/frontend/docs/docs/user-guide/org-settings.md index caf842b5d6..5c5f50d37a 100644 --- a/frontend/docs/docs/user-guide/org-settings.md +++ b/frontend/docs/docs/user-guide/org-settings.md @@ -4,7 +4,17 @@ Settings that apply to the entire organization are found in the **Settings** pag ## General -Change your organization's name and URL identifier in this tab. Choose an org name that's unique and memorable, like the name of your company or organization. Org name and URLs are unique to each Browsertrix instance (for example, on browsertrix.com) and you may be asked to change the org name or URL identifier if either are already in use by another org. +### Name and URL + +Choose a display name for your org that's unique and memorable, like the name of your company, organization, or personal project. This name will be visible in the org's [public profile](#profile), if that page is enabled. + +The org URL is where you and other org members will go to view the dashboard, configure org settings, and manage all other org-related activities. Changing this URL will also update the URL of your org's public profile, if enabled. + +Org name and URLs are unique to each Browsertrix instance (for example, on `app.browsertrix.com`) and you may be prompted to change the org name or URL identifier if either are already in use by another org. + +### Profile + +Enable and configure a public profile page for your org. Once enabled, anyone on the internet with a link to your org's profile page will be able to view public information like the org name, description, and public collections. ## Billing diff --git a/frontend/docs/docs/user-guide/overview.md b/frontend/docs/docs/user-guide/overview.md index b67b5d6a08..63b55e7309 100644 --- a/frontend/docs/docs/user-guide/overview.md +++ b/frontend/docs/docs/user-guide/overview.md @@ -1,6 +1,6 @@ # View Usage Stats and Quotas -The **Overview** dashboard delivers key statistics about the org's resource usage. You can also create crawl workflows, upload archived items, create collections, and create browser profiles through the _Create New ..._ shortcut. +Your **Dashboard** delivers key statistics about the org's resource usage. 
You can also create crawl workflows, upload archived items, create collections, and create browser profiles through the _Create New ..._ shortcut. ## Storage diff --git a/frontend/index.d.ts b/frontend/index.d.ts index 57f96d4072..a711dad908 100644 --- a/frontend/index.d.ts +++ b/frontend/index.d.ts @@ -1,3 +1,4 @@ +declare module "*.avif"; declare module "*.svg"; declare module "*.webp"; declare module "*.css"; diff --git a/frontend/package.json b/frontend/package.json index 6e9c7f4aa5..66b68a2f5c 100644 --- a/frontend/package.json +++ b/frontend/package.json @@ -15,7 +15,7 @@ "@prettier/plugin-xml": "^3.4.1", "@rollup/plugin-commonjs": "^18.0.0", "@shoelace-style/localize": "^3.2.1", - "@shoelace-style/shoelace": "~2.15.1", + "@shoelace-style/shoelace": "~2.18.0", "@tailwindcss/container-queries": "^0.1.1", "@types/color": "^3.0.2", "@types/diff": "^5.0.9", diff --git a/frontend/sample.env.local b/frontend/sample.env.local index 74c36fce23..d5fb5c9396 100644 --- a/frontend/sample.env.local +++ b/frontend/sample.env.local @@ -3,3 +3,5 @@ DOCS_URL=https://docs.browsertrix.com/ E2E_USER_EMAIL= E2E_USER_PASSWORD= GLITCHTIP_DSN= +INJECT_EXTRA= +ANALYTICS_NAMESPACE= diff --git a/frontend/src/assets/images/thumbnails/thumbnail-cyan.avif b/frontend/src/assets/images/thumbnails/thumbnail-cyan.avif new file mode 100644 index 0000000000..05092899a4 Binary files /dev/null and b/frontend/src/assets/images/thumbnails/thumbnail-cyan.avif differ diff --git a/frontend/src/assets/images/thumbnails/thumbnail-green.avif b/frontend/src/assets/images/thumbnails/thumbnail-green.avif new file mode 100644 index 0000000000..d2e8ba7567 Binary files /dev/null and b/frontend/src/assets/images/thumbnails/thumbnail-green.avif differ diff --git a/frontend/src/assets/images/thumbnails/thumbnail-orange.avif b/frontend/src/assets/images/thumbnails/thumbnail-orange.avif new file mode 100644 index 0000000000..4f43f820d1 Binary files /dev/null and b/frontend/src/assets/images/thumbnails/thumbnail-orange.avif differ diff --git a/frontend/src/assets/images/thumbnails/thumbnail-yellow.avif b/frontend/src/assets/images/thumbnails/thumbnail-yellow.avif new file mode 100644 index 0000000000..40730cecd4 Binary files /dev/null and b/frontend/src/assets/images/thumbnails/thumbnail-yellow.avif differ diff --git a/frontend/src/components/index.ts b/frontend/src/components/index.ts index a3e01bfe52..40a362b8bc 100644 --- a/frontend/src/components/index.ts +++ b/frontend/src/components/index.ts @@ -7,3 +7,4 @@ import("./not-found"); import("./screencast"); import("./beta-badges"); import("./detail-page-title"); +import("./verified-badge"); diff --git a/frontend/src/components/not-found.ts b/frontend/src/components/not-found.ts index 6bb2fe74e2..3d346e62a6 100644 --- a/frontend/src/components/not-found.ts +++ b/frontend/src/components/not-found.ts @@ -1,17 +1,48 @@ import { localized, msg } from "@lit/localize"; -import { html, LitElement } from "lit"; +import { html, nothing } from "lit"; import { customElement } from "lit/decorators.js"; -@customElement("btrix-not-found") +import { BtrixElement } from "@/classes/BtrixElement"; + @localized() -export class NotFound extends LitElement { - createRenderRoot() { - return this; - } +@customElement("btrix-not-found") +export class NotFound extends BtrixElement { render() { return html` -
+ ${msg("Sorry, we couldn’t find that page")} +
++ ${msg("Check the URL to make sure you’ve entered it correctly.")} +
+
+ ${msg("Did you click a link to get here?")}
+
+ ${this.navigate.isPublicPage
+ ? nothing
+ : html`
+
+ ${msg("Or")}
+