From 612bbb6f42d9fcb2b0c31858f1c53776d452cf51 Mon Sep 17 00:00:00 2001 From: sua yoo Date: Wed, 25 Sep 2024 07:37:18 -0700 Subject: [PATCH] feat: Merge workflow job types (#2068) Resolves https://github.com/webrecorder/browsertrix/issues/2073 ### Changes - Removes "URL List" and "Seeded Crawl" job type distinction and adds as additional crawl scope types instead. - 'New Workflow' button defaults to Single Page - 'New Workflow' dropdown includes Page Crawl (Single Page, Page List, In-Page Links) and Site Crawl (Page in Same Directory, Page on Same Domain, + Subdomains and Custom Page Prefix) - Enables specifying `DOCS_URL` in `.env` - Additional follow-ups in #2090, #2091 --- docs/user-guide/crawl-workflows.md | 12 +- docs/user-guide/getting-started.md | 2 +- docs/user-guide/running-crawl.md | 2 +- docs/user-guide/workflow-setup.md | 105 ++- frontend/sample.env.local | 3 +- frontend/src/components/ui/config-details.ts | 217 +++--- .../crawl-workflows/new-workflow-dialog.ts | 93 +-- .../crawl-workflows/workflow-editor.ts | 637 +++++++++--------- frontend/src/index.ejs | 1 + frontend/src/index.ts | 64 +- .../archived-item-detail.ts | 1 - frontend/src/pages/org/dashboard.ts | 16 +- frontend/src/pages/org/index.ts | 25 +- frontend/src/pages/org/workflow-detail.ts | 12 +- frontend/src/pages/org/workflows-list.ts | 129 ++-- frontend/src/pages/org/workflows-new.ts | 137 ++-- frontend/src/routes.ts | 2 +- frontend/src/shoelace.ts | 1 + .../src/strings/crawl-workflows/scopeType.ts | 19 + .../src/strings/crawl-workflows/section.ts | 2 +- frontend/src/theme.stylesheet.css | 10 +- frontend/src/types/crawler.ts | 17 +- frontend/src/types/events.d.ts | 4 +- frontend/src/types/workflow.ts | 7 + frontend/src/utils/crawler.ts | 9 + frontend/src/utils/workflow.ts | 40 +- frontend/webpack.config.js | 5 + frontend/xliff/es.xlf | 247 +++---- 28 files changed, 911 insertions(+), 908 deletions(-) create mode 100644 frontend/src/strings/crawl-workflows/scopeType.ts create mode 100644 frontend/src/types/workflow.ts diff --git a/docs/user-guide/crawl-workflows.md b/docs/user-guide/crawl-workflows.md index 8b3620aa3b..f5396e020b 100644 --- a/docs/user-guide/crawl-workflows.md +++ b/docs/user-guide/crawl-workflows.md @@ -12,17 +12,17 @@ Create new crawl workflows from the **Crawling** page, or the _Create New ..._ ### Choose what to crawl -The first step in creating a new crawl workflow is to choose what you'd like to crawl. This determines whether the crawl type will be **Page List** or **Site Crawl**. Crawl types can't be changed after the workflow is created—you'll need to create a new crawl workflow. +The first step in creating a new crawl workflow is to choose what you'd like to crawl by defining a **Crawl Scope**. Crawl scopes are categorized as a **Page Crawl** or **Site Crawl**. -#### Page List +#### Page Crawl -Choose this option if you already know the URL of every page you'd like to crawl. The crawler will visit every URL specified in a list, and optionally every URL linked on those pages. +Choose one of these crawl scopes if you know the URL of every page you'd like to crawl and don't need to include any additional pages beyond one hop out. -A Page List workflow is simpler to configure, since you don't need to worry about configuring the workflow to exclude parts of the website that you may not want to archive. +A Page Crawl workflow is simpler to configure, since you don't need to worry about configuring the workflow to exclude parts of the website that you may not want to archive. 
#### Site Crawl
-Let the crawler automatically discover pages based on a domain or start page that you specify.
+Choose one of these crawl scopes to have the crawler automatically find pages based on a domain name, start page URL, or directory on a website.
Site Crawl workflows are great for advanced use cases where you don't need (or want) to know every single URL of the website that you're archiving.
@@ -34,7 +34,7 @@ Run a crawl workflow by clicking _Run Crawl_ in the actions menu of the workflow
While crawling, the **Watch Crawl** section displays a list of queued URLs that will be visited, and streams the current state of the browser windows as they visit pages from the queue. You can [modify the crawl live](./running-crawl.md) by adding URL exclusions or changing the number of crawling instances.
-Re-running a crawl workflow can be useful to capture a website as it changes over time, or to run with an updated [crawl scope](workflow-setup.md#crawl-scope).
+Re-running a crawl workflow can be useful to capture a website as it changes over time, or to run with an updated [crawl scope](workflow-setup.md#crawl-scope-options).
## Status
diff --git a/docs/user-guide/getting-started.md b/docs/user-guide/getting-started.md index a99974bef9..c08d6bed55 100644 --- a/docs/user-guide/getting-started.md +++ b/docs/user-guide/getting-started.md
@@ -28,7 +28,7 @@ Once you've logged in you should see your org [overview](overview.md). If you la
After running your first crawl, check out the following to learn more about Browsertrix's features:
- A detailed list of [crawl workflow setup](workflow-setup.md) options.
-- Adding [exclusions](workflow-setup.md#exclusions) to limit your crawl's scope and evading crawler traps by [editing exclusion rules while crawling](running-crawl.md#live-exclusion-editing).
+- Adding [exclusions](workflow-setup.md#exclude-pages) to limit your crawl's scope and evading crawler traps by [editing exclusion rules while crawling](running-crawl.md#live-exclusion-editing).
- Best practices for crawling with [browser profiles](browser-profiles.md) to capture content only available when logged in to a website.
- Managing archived items, including [uploading previously archived content](archived-items.md#uploading-web-archives).
- Organizing and combining archived items with [collections](collections.md) for sharing and export.
diff --git a/docs/user-guide/running-crawl.md b/docs/user-guide/running-crawl.md index f899f83fdd..6c5396545b 100644 --- a/docs/user-guide/running-crawl.md +++ b/docs/user-guide/running-crawl.md
@@ -17,7 +17,7 @@ A crawl workflow that is in progress can be in one of the following states:
## Live Exclusion Editing
-While [exclusions](workflow-setup.md#exclusions) can be set before running a crawl workflow, sometimes while crawling the crawler may find new parts of the site that weren't previously known about and shouldn't be crawled, or get stuck browsing parts of a website that automatically generate URLs known as ["crawler traps"](https://en.wikipedia.org/wiki/Spider_trap).
+While [exclusions](workflow-setup.md#exclude-pages) can be set before running a crawl workflow, sometimes while crawling the crawler may find new parts of the site that weren't previously known about and shouldn't be crawled, or get stuck browsing parts of a website that automatically generate URLs known as ["crawler traps"](https://en.wikipedia.org/wiki/Spider_trap).
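Exclusion rules are regular expressions matched against each URL in the crawl queue. As a hypothetical illustration (the patterns and helper below are not part of this patch), rules like the following would stop a calendar-style crawler trap:

```ts
// Hypothetical exclusion patterns, shown only to illustrate the feature.
// Any queued URL whose text matches one of these regexes is skipped.
const exclusions: string[] = [
  "\\?replytocom=", // per-comment reply permalinks that multiply each page
  "/calendar/\\d{4}/\\d{2}", // auto-generated month-by-month calendar pages
];

// Rough approximation of how a partial-match rule is applied to a URL.
const isExcluded = (url: string): boolean =>
  exclusions.some((pattern) => new RegExp(pattern).test(url));

console.log(isExcluded("https://example.com/calendar/2031/04")); // true
console.log(isExcluded("https://example.com/about")); // false
```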
If the crawl queue is filled with URLs that should not be crawled, use the _Edit Exclusions_ button on the Watch Crawl page to instruct the crawler what pages should be excluded from the queue. diff --git a/docs/user-guide/workflow-setup.md b/docs/user-guide/workflow-setup.md index 8a55393a32..a3fb93d8e0 100644 --- a/docs/user-guide/workflow-setup.md +++ b/docs/user-guide/workflow-setup.md @@ -6,83 +6,114 @@ Changes to a setting will only apply to subsequent crawls. Crawl settings are shown in the crawl workflow detail **Settings** tab and in the archived item **Crawl Settings** tab. -## Crawl Scope +## Scope -Specify the range and depth of your crawl. Different settings will be shown depending on whether you chose _URL List_ or _Site Crawl_ when creating a new workflow. +Specify the range and depth of your crawl. -??? example "Crawling with HTTP basic auth" - - Both Page List and Site Crawls support [HTTP Basic Auth](https://developer.mozilla.org/en-US/docs/Web/HTTP/Authentication) which can be provided as part of the URL, for example: `https://username:password@example.com`. - - **These credentials WILL BE WRITTEN into the archive.** We recommend exercising caution and only archiving with dedicated archival accounts, changing your password or deleting the account when finished. +Crawl scopes are categorized as a **Page Crawl** or **Site Crawl**: -### Crawl Type: Page List +_Page Crawl_ +: Choose one of these crawl scopes if you know the URL of every page you'd like to crawl and don't need to include any additional pages beyond one hop out. -#### Page URL(s) + A Page Crawl workflow can be simpler to configure, since you don't need to worry about configuring the workflow to exclude parts of the website that you may not want to archive. -A list of one or more URLs that the crawler should visit and capture. + ??? info "Page Crawl Use Cases" + - You want to archive a social media post (`Single Page`) + - You have a list of URLs that you can copy-and-paste (`List of Pages`) + - You want to include URLs with different domain names in the same crawl (`List of Pages`) -#### Include Any Linked Page +_Site Crawl_ +: Choose one of these crawl scopes to have the the crawler automatically find pages based on a domain name, start page URL, or directory on a website. -When enabled, the crawler will visit all the links it finds within each page defined in the _Crawl URL(s)_ field. + Site Crawl workflows are great for advanced use cases where you don't need (or want) to know every single URL of the website that you're archiving. -??? example "Crawling tags & search queries with Page List crawls" - This setting can be useful for crawling the content of specific tags or search queries. Specify the tag or search query URL(s) in the _Crawl URL(s)_ field, e.g: `https://example.com/search?q=tag`, and enable _Include Any Linked Page_ to crawl all the content present on that search query page. + ??? info "Site Crawl Use Cases" + - You're archiving a subset of a website, like everything under _website.com/your-username_ (`Pages in Same Directory`) + - You're archiving an entire website _and_ external pages linked to from the website (`Pages on Same Domain` + _Include Any Linked Page_ checked) -#### Fail Crawl on Failed URL +### Crawl Scope Options -When enabled, the crawler will fail the entire crawl if any of the provided URLs are invalid or unsuccessfully crawled. The resulting archived item will have a status of "Failed". 
+#### Page Crawl -### Crawl Type: Site Crawl +`Single Page` +: Crawls a single URL and does not include any linked pages. -#### Crawl Start URL +`List of Pages` +: Crawls only specified URLs and does not include any linked pages. -This is the first page that the crawler will visit. It's important to set _Crawl Start URL_ that accurately represents the scope of the pages you wish to crawl as the _Start URL Scope_ selection will depend on this field's contents. +`In-Page Links` +: Crawls only the specified URL and treats linked sections of the page as distinct pages. -You must specify the protocol (likely `http://` or `https://`) as a part of the URL entered into this field. + Any link that begins with the _Crawl Start URL_ followed by a hashtag symbol (`#`) and then a string is considered an in-page link. This is commonly used to link to a section of a page. For example, because the "Scope" section of this guide is linked by its heading as `/user-guide/workflow-setup/#scope` it would be treated as a separate page under the _In-Page Links_ scope. -#### Start URL Scope + This scope can also be useful for crawling websites that are single-page applications where each page has its own hash, such as `example.com/#/blog` and `example.com/#/about`. -`Hashtag Links Only` -: This scope will ignore links that lead to other addresses such as `example.com/path` and will instead instruct the crawler to visit hashtag links such as `example.com/#linkedsection`. +#### Site Crawl - This scope can be useful for crawling certain web apps that may not use unique URLs for their pages. - -`Pages in the Same Directory` +`Pages in Same Directory` : This scope will only crawl pages in the same directory as the _Crawl Start URL_. If `example.com/path` is set as the _Crawl Start URL_, `example.com/path/path2` will be crawled but `example.com/path3` will not. -`Pages on This Domain` +`Pages on Same Domain` : This scope will crawl all pages on the domain entered as the _Crawl Start URL_ however it will ignore subdomains such as `subdomain.example.com`. -`Pages on This Domain and Subdomains` +`Pages on Same Domain + Subdomains` : This scope will crawl all pages on the domain and any subdomains found. If `example.com` is set as the _Crawl Start URL_, both pages on `example.com` and `subdomain.example.com` will be crawled. `Custom Page Prefix` : This scope will crawl all pages that begin with the _Crawl Start URL_ as well as pages from any URL that begin with the URLs listed in `Extra URL Prefixes in Scope` -#### Max Depth +### Page URL(s) -Only shown with a _Start URL Scope_ of `Pages on This Domain` and above, the _Max Depth_ setting instructs the crawler to stop visiting new links past a specified depth. +One or more URLs of the page to crawl. URLs must follow [valid URL syntax](https://www.w3.org/Addressing/URL/url-spec.html). For example, if you're crawling a page that can be accessed on the public internet, your URL should start with `http://` or `https://`. -#### Extra URL Prefixes in Scope +??? example "Crawling with HTTP basic auth" -Only shown with a _Start URL Scope_ of `Custom Page Prefix`, this field accepts additional URLs or domains that will be crawled if URLs that lead to them are found. + All crawl scopes support [HTTP Basic Auth](https://developer.mozilla.org/en-US/docs/Web/HTTP/Authentication) which can be provided as part of the URL, for example: `https://username:password@example.com`. 
+ + **These credentials WILL BE WRITTEN into the archive.** We recommend exercising caution and only archiving with dedicated archival accounts, changing your password or deleting the account when finished. -This can be useful for crawling websites that span multiple domains such as `example.org` and `example.net` +### Crawl Start URL -#### Include Any Linked Page ("one hop out") +This is the first page that the crawler will visit. _Site Crawl_ scopes are based on this URL. -When enabled, the crawler will visit all the links it finds within each page, regardless of the _Start URL Scope_ setting. +### Include Any Linked Page + +When enabled, the crawler will visit all the links it finds within each page defined in the _Crawl URL(s)_ field. + +??? example "Crawling tags & search queries with Page List crawls" + This setting can be useful for crawling the content of specific tags or search queries. Specify the tag or search query URL(s) in the _Crawl URL(s)_ field, e.g: `https://example.com/search?q=tag`, and enable _Include Any Linked Page_ to crawl all the content present on that search query page. + +### Fail Crawl on Failed URL + +When enabled, the crawler will fail the entire crawl if any of the provided URLs are invalid or unsuccessfully crawled. The resulting archived item will have a status of "Failed". + +### Max Depth in Scope + +Instructs the crawler to stop visiting new links past a specified depth. + +### Extra URL Prefixes in Scope + +This field accepts additional URLs or domains that will be crawled if URLs that lead to them are found. + +This can be useful for crawling websites that span multiple domains such as `example.org` and `example.net`. + +### Include Any Linked Page ("one hop out") + +When enabled, the crawler bypasses the _Crawl Scope_ setting to visit links it finds in each page within scope. The crawler will not visit links it finds in the pages found outside of scope (hence only "one hop out".) This can be useful for capturing links on a page that lead outside the website that is being crawled but should still be included in the archive for context. -#### Check For Sitemap +### Check For Sitemap When enabled, the crawler will check for a sitemap at /sitemap.xml and use it to discover pages to crawl if found. It will not crawl pages found in the sitemap that do not meet the crawl's scope settings or limits. This can be useful for discovering and capturing pages on a website that aren't linked to from the seed and which might not otherwise be captured. -### Exclusions +### Additional Pages + +A list of page URLs outside of the _Crawl Scope_ to include in the crawl. + +### Exclude Pages The exclusions table will instruct the crawler to ignore links it finds on pages where all or part of the link matches an exclusion found in the table. The table is only available in Page List crawls when _Include Any Linked Page_ is enabled. 
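For reference, the scope options described above correspond roughly to the seed configuration fields that appear later in this patch (`scopeType`, `include`, `extraHops`, `depth`, `exclude`). The sketch below is illustrative only; the exact types live in the frontend code and may differ in detail:

```ts
// Illustrative sketch only: how the documented options might map onto a
// seed config. Field names mirror those used elsewhere in this patch.
type ScopeType = "page" | "page-spa" | "prefix" | "host" | "domain" | "custom";

interface SeedSketch {
  url: string; // Crawl Start URL or Page URL
  scopeType: ScopeType;
  include?: string[]; // Extra URL Prefixes in Scope (Custom Page Prefix)
  exclude?: string[]; // Exclude Pages rules
  extraHops?: number; // 1 enables "one hop out"
  depth?: number; // Max Depth in Scope
}

// "Pages in Same Directory" crawl, one hop out, at most 3 hops deep:
const siteCrawlSeed: SeedSketch = {
  url: "https://example.com/path/",
  scopeType: "prefix",
  extraHops: 1,
  depth: 3,
  exclude: ["\\?sort="],
};

// "Single Page" crawl that records exactly one URL:
const pageCrawlSeed: SeedSketch = {
  url: "https://example.com/post/123",
  scopeType: "page",
  extraHops: 0,
};
```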
diff --git a/frontend/sample.env.local b/frontend/sample.env.local index d8afed62dd..c8dc6e55ac 100644 --- a/frontend/sample.env.local +++ b/frontend/sample.env.local @@ -1,2 +1,3 @@ API_BASE_URL= -GLITCHTIP_DSN= \ No newline at end of file +DOCS_URL=https://docs.browsertrix.com/ +GLITCHTIP_DSN= diff --git a/frontend/src/components/ui/config-details.ts b/frontend/src/components/ui/config-details.ts index 157e536bc4..0ff17131e8 100644 --- a/frontend/src/components/ui/config-details.ts +++ b/frontend/src/components/ui/config-details.ts @@ -10,11 +10,13 @@ import RegexColorize from "regex-colorize"; import { RelativeDuration } from "./relative-duration"; import type { CrawlConfig, Seed, SeedConfig } from "@/pages/org/types"; +import scopeTypeLabel from "@/strings/crawl-workflows/scopeType"; import sectionStrings from "@/strings/crawl-workflows/section"; import type { Collection } from "@/types/collection"; +import { WorkflowScopeType } from "@/types/workflow"; import { isApiError } from "@/utils/api"; import { getAppSettings } from "@/utils/app"; -import { DEPTH_SUPPORTED_SCOPES } from "@/utils/crawler"; +import { DEPTH_SUPPORTED_SCOPES, isPageScopeType } from "@/utils/crawler"; import { humanizeSchedule } from "@/utils/cron"; import LiteElement, { html } from "@/utils/LiteElement"; import { formatNumber } from "@/utils/localization"; @@ -54,19 +56,6 @@ export class ConfigDetails extends LiteElement { @state() private collections: Collection[] = []; - private readonly scopeTypeLabels: Record< - NonNullable, - string - > = { - prefix: msg("Path Begins with This URL"), - host: msg("Pages on This Domain"), - domain: msg("Pages on This Domain & Subdomains"), - "page-spa": msg("Single Page App (In-Page Links Only)"), - page: msg("Page"), - custom: msg("Custom"), - any: msg("Any"), - }; - async connectedCallback() { super.connectedCallback(); void this.fetchAPIDefaults(); @@ -76,8 +65,6 @@ export class ConfigDetails extends LiteElement { render() { const crawlConfig = this.crawlConfig; const seedsConfig = crawlConfig?.config; - const exclusions = seedsConfig?.exclude || []; - const maxPages = this.seeds?.[0]?.limit ?? seedsConfig?.limit; const renderTimeLimit = ( valueSeconds?: number | null, fallbackValue?: number, @@ -124,41 +111,44 @@ export class ConfigDetails extends LiteElement { ${when( - crawlConfig?.jobType === "seed-crawl", - this.renderConfirmSeededSettings, - this.renderConfirmUrlListSettings, - )} - ${when( - exclusions.length, - () => html` -
- - -
+ seedsConfig, + (config) => html` + ${this.renderSetting( + msg("Crawl Scope"), + when(this.seeds, (seeds) => { + if (!config.scopeType) return; + if (isPageScopeType(config.scopeType) && seeds.length > 1) { + return scopeTypeLabel[WorkflowScopeType.PageList]; + } + return scopeTypeLabel[config.scopeType]; + }), + )} + ${isPageScopeType(config.scopeType) + ? this.renderConfirmUrlListSettings(config) + : this.renderConfirmSeededSettings(config)} `, - () => this.renderSetting(msg("Exclusions"), msg("None")), )}

${sectionStrings.perCrawlLimits}

${this.renderSetting( msg("Max Pages"), - when( - maxPages, - (val: number | string) => - `${formatNumber(+val)} ${pluralOf("pages", +val)}`, - () => - this.orgDefaults?.maxPagesPerCrawl - ? html` - ${formatNumber(this.orgDefaults.maxPagesPerCrawl)} - ${pluralOf("pages", this.orgDefaults.maxPagesPerCrawl)} - ${msg("(default)")}` - : undefined, - ), + when(seedsConfig && this.seeds, (seeds) => { + const primarySeed = seeds[0] as Seed | undefined; + const maxPages = primarySeed?.limit ?? seedsConfig?.limit; + + if (maxPages) { + return `${formatNumber(+maxPages)} ${pluralOf("pages", +maxPages)}`; + } + + if (this.orgDefaults?.maxPagesPerCrawl) { + return html` + ${formatNumber(this.orgDefaults.maxPagesPerCrawl)} + ${pluralOf("pages", this.orgDefaults.maxPagesPerCrawl)} + ${msg("(default)")}`; + } + }), )} ${this.renderSetting( msg("Crawl Time Limit"), @@ -331,51 +321,53 @@ export class ConfigDetails extends LiteElement { `; } - private readonly renderConfirmUrlListSettings = () => { - const crawlConfig = this.crawlConfig; - + private readonly renderConfirmUrlListSettings = ( + config: CrawlConfig["config"], + ) => { return html` ${this.renderSetting( - msg("Page URL(s)"), - html` -
    - ${this.seeds?.map( - (seed: Seed) => html` -
  • - ${seed.url} -
  • - `, - )} -
- `, + config.scopeType === WorkflowScopeType.Page + ? msg("Page URL") + : msg("Page URLs"), + this.seeds?.length + ? html` +
    + ${this.seeds.map( + (seed: Seed) => html` +
  • + ${seed.url} +
  • + `, + )} +
+ ` + : undefined, true, )} ${this.renderSetting( - msg("Include Any Linked Page"), - Boolean(crawlConfig?.config.extraHops), - )} - ${this.renderSetting( - msg("Fail Crawl On Failed URL"), - Boolean(crawlConfig?.config.failOnFailedSeed), + msg("Include Any Linked Page (“one hop out”)"), + Boolean(config.extraHops), )} `; }; - private readonly renderConfirmSeededSettings = () => { + private readonly renderConfirmSeededSettings = ( + config: CrawlConfig["config"], + ) => { if (!this.seeds) return; - const crawlConfig = this.crawlConfig!; - const seedsConfig = crawlConfig.config; const additionalUrlList = this.seeds.slice(1); const primarySeedConfig = this.seeds[0] as SeedConfig | Seed | undefined; const primarySeedUrl = (primarySeedConfig as Seed | undefined)?.url; - const includeUrlList = - primarySeedConfig?.include || seedsConfig.include || []; + const includeUrlList = primarySeedConfig?.include || config.include || []; + const exclusions = config.exclude || []; + const scopeType = config.scopeType!; + return html` ${this.renderSetting( msg("Crawl Start URL"), @@ -390,50 +382,42 @@ export class ConfigDetails extends LiteElement { : undefined, true, )} - ${this.renderSetting( - msg("Crawl Scope"), - this.scopeTypeLabels[ - primarySeedConfig!.scopeType || seedsConfig.scopeType! - ], - )} - ${this.renderSetting( - msg("Extra URL Prefixes in Scope"), - includeUrlList.length - ? html` -
    - ${includeUrlList.map( - (url: string) => - staticHtml`
  • ${unsafeStatic( - new RegexColorize().colorizeText(url) as string, - )}
  • `, - )} -
- ` - : msg("None"), - true, + ${when(scopeType === WorkflowScopeType.Prefix, () => + this.renderSetting( + msg("Extra URL Prefixes in Scope"), + includeUrlList.length + ? html` +
    + ${includeUrlList.map( + (url: string) => + staticHtml`
  • ${unsafeStatic( + new RegexColorize().colorizeText(url) as string, + )}
  • `, + )} +
+ ` + : msg("None"), + true, + ), )} - ${when( - DEPTH_SUPPORTED_SCOPES.includes( - primarySeedConfig!.scopeType || seedsConfig.scopeType!, + ${when(DEPTH_SUPPORTED_SCOPES.includes(scopeType), () => + this.renderSetting( + msg("Max Depth in Scope"), + primarySeedConfig && primarySeedConfig.depth !== null + ? msg(str`${primarySeedConfig.depth} hop(s)`) + : msg("Unlimited (default)"), ), - () => - this.renderSetting( - msg("Max Depth"), - primarySeedConfig && primarySeedConfig.depth !== null - ? msg(str`${primarySeedConfig.depth} hop(s)`) - : msg("Unlimited (default)"), - ), )} ${this.renderSetting( msg("Include Any Linked Page (“one hop out”)"), - Boolean(primarySeedConfig?.extraHops ?? seedsConfig.extraHops), + Boolean(primarySeedConfig?.extraHops ?? config.extraHops), )} ${this.renderSetting( msg("Check For Sitemap"), - Boolean(seedsConfig.useSitemap), + Boolean(config.useSitemap), )} ${this.renderSetting( - msg("List of Additional URLs"), + msg("Additional Page URLs"), additionalUrlList.length ? html`
    @@ -454,6 +438,19 @@ export class ConfigDetails extends LiteElement { : msg("None"), true, )} + ${when( + exclusions.length, + () => html` +
    + + +
    + `, + () => this.renderSetting(msg("Exclusions"), msg("None")), + )} `; }; diff --git a/frontend/src/features/crawl-workflows/new-workflow-dialog.ts b/frontend/src/features/crawl-workflows/new-workflow-dialog.ts index 3419914490..8e8a523eed 100644 --- a/frontend/src/features/crawl-workflows/new-workflow-dialog.ts +++ b/frontend/src/features/crawl-workflows/new-workflow-dialog.ts @@ -3,10 +3,11 @@ import { html } from "lit"; import { customElement, property } from "lit/decorators.js"; import { TailwindElement } from "@/classes/TailwindElement"; +import type { FormState as WorkflowFormState } from "@/utils/workflow"; import seededCrawlSvg from "~assets/images/new-crawl-config_Seeded-Crawl.svg"; import urlListSvg from "~assets/images/new-crawl-config_URL-List.svg"; -export type SelectJobTypeEvent = CustomEvent<"url-list" | "seed-crawl">; +export type SelectJobTypeEvent = CustomEvent; /** * @event select-job-type SelectJobTypeEvent @@ -25,14 +26,14 @@ export class NewWorkflowDialog extends TailwindElement { style="--width: 46rem" >

    ${msg( @@ -88,78 +91,6 @@ export class NewWorkflowDialog extends TailwindElement { - -

    - ${msg(html`Choose Page List if:`)} -

    -
      -
    • ${msg("You want to archive a single page on a website")}
    • -
    • - ${msg("You have a list of URLs that you can copy-and-paste")} -
    • -
    • - ${msg( - "You want to include URLs with different domain names in the same crawl", - )} -
    • -
    -

    - ${msg( - html`A Page List workflow is simpler to configure, since you don't - need to worry about configuring the workflow to exclude parts of - the website that you may not want to archive.`, - )} -

    -

    - ${msg(html`Choose Site Crawl if:`)} -

    -
      -
    • ${msg("You want to archive an entire website")}
    • -
    • - ${msg( - html`You're archiving a subset of a website, like everything - under website.com/your-username`, - )} -
    • -
    • - ${msg( - html`You're archiving a website and external pages - linked to from the website`, - )} -
    • -
    -

    - ${msg( - html`Site Crawl workflows are great for advanced use cases where - you don't need to know every single URL that you want to archive. - You can configure reasonable crawl limits and page limits so that - you don't crawl more than you need to.`, - )} -

    -

    - ${msg( - html`Once you choose a crawl type, you can't go back and change - it. Check out the - crawl workflow setup guide - if you still need help deciding on a crawl type, and try our - community help forum.`, - )} -

    - `; } diff --git a/frontend/src/features/crawl-workflows/workflow-editor.ts b/frontend/src/features/crawl-workflows/workflow-editor.ts index 1079ee0731..7c44877b61 100644 --- a/frontend/src/features/crawl-workflows/workflow-editor.ts +++ b/frontend/src/features/crawl-workflows/workflow-editor.ts @@ -14,6 +14,7 @@ import { mergeDeep } from "immutable"; import type { LanguageCode } from "iso-639-1"; import { html, + nothing, type LitElement, type PropertyValues, type TemplateResult, @@ -25,7 +26,6 @@ import { queryAsync, state, } from "lit/decorators.js"; -import { choose } from "lit/directives/choose.js"; import { ifDefined } from "lit/directives/if-defined.js"; import { map } from "lit/directives/map.js"; import { range } from "lit/directives/range.js"; @@ -47,24 +47,25 @@ import type { CollectionsChangeEvent } from "@/features/collections/collections- import type { QueueExclusionTable } from "@/features/crawl-workflows/queue-exclusion-table"; import { infoCol, inputCol } from "@/layouts/columns"; import infoTextStrings from "@/strings/crawl-workflows/infoText"; +import scopeTypeLabels from "@/strings/crawl-workflows/scopeType"; import sectionStrings from "@/strings/crawl-workflows/section"; -import type { - CrawlConfig, - JobType, - Seed, - WorkflowParams, +import { + ScopeType, + type CrawlConfig, + type Seed, + type WorkflowParams, } from "@/types/crawler"; +import { NewWorkflowOnlyScopeType } from "@/types/workflow"; import { isApiError, type Detail } from "@/utils/api"; -import { DEPTH_SUPPORTED_SCOPES } from "@/utils/crawler"; +import { DEPTH_SUPPORTED_SCOPES, isPageScopeType } from "@/utils/crawler"; import { getUTCSchedule, humanizeNextDate, humanizeSchedule, } from "@/utils/cron"; import { maxLengthValidator } from "@/utils/form"; -import { formatNumber, getLocale } from "@/utils/localization"; +import { getLocale } from "@/utils/localization"; import { isArchivingDisabled } from "@/utils/orgs"; -import { pluralOf } from "@/utils/pluralize"; import { regexEscape } from "@/utils/string"; import { tw } from "@/utils/tailwind"; import { @@ -110,6 +111,7 @@ const DEFAULT_BEHAVIORS = [ "autofetch", "siteSpecific", ]; +const MAX_ADDITIONAL_URLS = 100; const getDefaultProgressState = (hasConfigId = false): ProgressState => { let activeTab: StepName = "crawlSetup"; @@ -165,7 +167,7 @@ function validURL(url: string) { const trimArray = flow(uniq, compact); const urlListToArray = flow( - (str: string) => (str.length ? str.trim().split(/\s+/g) : []), + (str?: string) => (str?.length ? 
str.trim().split(/\s+/g) : []), trimArray, ); @@ -186,7 +188,7 @@ export class WorkflowEditor extends BtrixElement { configId?: string; @property({ type: String }) - jobType!: JobType; + initialScopeType?: FormState["scopeType"]; @property({ type: Object }) initialWorkflow?: WorkflowParams; @@ -245,19 +247,6 @@ export class WorkflowEditor extends BtrixElement { private readonly daysOfWeek = getLocalizedWeekDays(); - private readonly scopeTypeLabels: Record< - NonNullable, - string - > = { - prefix: msg("Pages in the Same Directory"), - host: msg("Pages on This Domain"), - domain: msg("Pages on This Domain & Subdomains"), - "page-spa": msg("Hashtag Links Only"), - page: msg("Page"), - custom: msg("Custom Page Prefix"), - any: msg("Any"), - }; - private readonly scheduleTypeLabels: Record< FormState["scheduleType"], string @@ -365,15 +354,18 @@ export class WorkflowEditor extends BtrixElement { private initializeEditor() { this.progressState = getDefaultProgressState(Boolean(this.configId)); - this.formState = getInitialFormState({ + const formState = getInitialFormState({ configId: this.configId, initialSeeds: this.initialSeeds, initialWorkflow: this.initialWorkflow, org: this.org, }); - if (!this.formState.exclusions?.length) { - this.formState.exclusions = [""]; // Add empty slot + + if (this.initialScopeType) { + formState.scopeType = this.initialScopeType; } + + this.formState = formState; } render() { @@ -429,16 +421,9 @@ export class WorkflowEditor extends BtrixElement { )} - ${this.renderPanelContent( - html` - ${choose(this.jobType, [ - ["url-list", () => this.renderUrlListSetup(false)], - ["seed-crawl", () => this.renderSeededCrawlSetup()], - ["custom", () => this.renderUrlListSetup(true)], - ])} - `, - { isFirst: true }, - )} + ${this.renderPanelContent(this.renderScope(), { + isFirst: true, + })} ${this.renderPanelContent(this.renderCrawlLimits())} @@ -637,12 +622,7 @@ export class WorkflowEditor extends BtrixElement { } return html` ${isFirst - ? html` - - - ${msg("Start Over")} - - ` + ? nothing : html` @@ -708,147 +688,203 @@ export class WorkflowEditor extends BtrixElement { return infoCol(content, padTop ? 
tw`md:pt-[2.35rem]` : tw`md:pt-1`); } - private readonly renderUrlListSetup = (isCustom = false) => { + private readonly renderScope = () => { + const exclusions = trimArray(this.formState.exclusions || []); + return html` ${inputCol(html` - { - if (e.key === "Enter") { - const inputEl = e.target as SlInput; - await inputEl.updateComplete; - if (!inputEl.value) return; - const { isValid, helpText } = this.validateUrlList(inputEl.value); - inputEl.helpText = helpText; - if (isValid) { - inputEl.setCustomValidity(""); - } else { - inputEl.setCustomValidity(helpText); - } - } - }} - @sl-input=${(e: CustomEvent) => { - const inputEl = e.target as SlInput; - if (!inputEl.value) { - inputEl.helpText = msg("At least 1 URL is required."); - } - }} - @sl-change=${async (e: CustomEvent) => { - const inputEl = e.target as SlInput; - if (!inputEl.value) return; - const { isValid, helpText } = this.validateUrlList(inputEl.value); - inputEl.helpText = helpText; - if (isValid) { - inputEl.setCustomValidity(""); - } else { - inputEl.setCustomValidity(helpText); - } - }} - > + + this.changeScopeType( + (e.target as HTMLSelectElement).value as FormState["scopeType"], + )} + > + ${msg("Page Crawl")} + ${scopeTypeLabels[ScopeType.Page]} + + ${scopeTypeLabels[NewWorkflowOnlyScopeType.PageList]} + + + ${scopeTypeLabels[ScopeType.SPA]} + + + ${msg("Site Crawl")} + + ${scopeTypeLabels[ScopeType.Prefix]} + + + ${scopeTypeLabels[ScopeType.Host]} + + + ${scopeTypeLabels[ScopeType.Domain]} + + + ${scopeTypeLabels[ScopeType.Custom]} + + `)} - ${this.renderHelpTextCol( - msg(str`The crawler will visit and record each URL listed in the order - defined here. You can enter a maximum of ${URL_LIST_MAX_URLS.toLocaleString()} URLs, separated by a new line.`), - )} - ${when( - isCustom, - () => html` - ${inputCol(html` - - this.updateFormState({ - scopeType: (e.target as HTMLSelectElement) - .value as FormState["scopeType"], - })} - > - - ${this.scopeTypeLabels["prefix"]} - - - ${this.scopeTypeLabels["host"]} - - - ${this.scopeTypeLabels["domain"]} - - - ${this.scopeTypeLabels["page-spa"]} - - - ${this.scopeTypeLabels["page"]} - - - ${this.scopeTypeLabels["custom"]} - - - ${this.scopeTypeLabels["any"]} - - - `)} - ${this.renderHelpTextCol( - msg(`Tells the crawler which pages it can visit.`), - )} - `, - )} - ${inputCol( - html`${msg(`Tells the crawler which pages it can visit.`)}

    + `)} + ${isPageScopeType(this.formState.scopeType) + ? this.renderPageScope() + : this.renderSiteScope()} + ${!isPageScopeType(this.formState.scopeType) || + this.formState.includeLinkedPages + ? html` +
    + 0}> + ${msg("Exclude Pages")} + ${exclusions.length + ? html`${exclusions.length}` + : ""} +
    + ${inputCol(html` + + `)} + ${this.renderHelpTextCol( + msg( + `Specify exclusion rules for what pages should not be visited.`, + ), + )} +
    +
    +
    + ` + : nothing} + `; + }; + + private readonly renderPageScope = () => { + return html` + ${this.formState.scopeType === ScopeType.Page + ? html` + ${inputCol(html` + { + const inputEl = e.target as SlInput; + await inputEl.updateComplete; + this.updateFormState( + { + urlList: inputEl.value, + }, + true, + ); + if (!inputEl.checkValidity() && validURL(inputEl.value)) { + inputEl.setCustomValidity(""); + inputEl.helpText = ""; + } + }} + @sl-blur=${async (e: Event) => { + const inputEl = e.target as SlInput; + await inputEl.updateComplete; + if (inputEl.value && !validURL(inputEl.value)) { + const text = msg("Please enter a valid URL."); + inputEl.helpText = text; + inputEl.setCustomValidity(text); + } + }} + > + + `)} + ${this.renderHelpTextCol(msg(str`The URL of the page to crawl.`))} + ` + : html` + ${inputCol(html` + { + if (e.key === "Enter") { + const inputEl = e.target as SlInput; + await inputEl.updateComplete; + if (!inputEl.value) return; + const { isValid, helpText } = this.validateUrlList( + inputEl.value, + MAX_ADDITIONAL_URLS, + ); + inputEl.helpText = helpText; + if (isValid) { + inputEl.setCustomValidity(""); + } else { + inputEl.setCustomValidity(helpText); + } + } + }} + @sl-input=${(e: CustomEvent) => { + const inputEl = e.target as SlInput; + if (!inputEl.value) { + inputEl.helpText = msg("At least 1 URL is required."); + } + }} + @sl-change=${async (e: CustomEvent) => { + const inputEl = e.target as SlInput; + if (!inputEl.value) return; + const { isValid, helpText } = this.validateUrlList( + inputEl.value, + MAX_ADDITIONAL_URLS, + ); + inputEl.helpText = helpText; + if (isValid) { + inputEl.setCustomValidity(""); + } else { + inputEl.setCustomValidity(helpText); + } + }} + > + `)} + ${this.renderHelpTextCol( + msg( + str`The crawler will visit and record each URL listed here. You can enter up to ${MAX_ADDITIONAL_URLS.toLocaleString()} URLs.`, + ), + )} + `} + ${inputCol(html` + - ${msg("Include any linked page")} - `, - )} - ${this.renderHelpTextCol( - msg(`If checked, the crawler will visit pages one link away from a Crawl - URL.`), - false, - )} - ${inputCol( - html` - ${msg("Fail crawl on failed URL")} - `, - )} + ${msg("Include any linked page (“one hop out”)")} +
    + `)} ${this.renderHelpTextCol( - msg( - `If checked, the crawler will fail the entire crawl if any of the provided URLs are invalid or unsuccessfully crawled.`, - ), + msg(`If checked, the crawler will visit pages one link away.`), false, )} - ${when( - this.formState.includeLinkedPages || this.jobType === "custom", - () => html` - ${inputCol(html` - - `)} - ${this.renderHelpTextCol(infoTextStrings["exclusions"])} - `, - )} `; }; - private readonly renderSeededCrawlSetup = () => { + private readonly renderSiteScope = () => { const urlPlaceholder = "https://example.com/path/page.html"; let exampleUrl = new URL(urlPlaceholder); if (this.formState.primarySeedUrl) { @@ -866,7 +902,7 @@ https://example.com/path`} let helpText: TemplateResult | string; switch (this.formState.scopeType) { - case "prefix": + case ScopeType.Prefix: helpText = msg( html`Will crawl all pages and paths in the same directory, e.g. `, ); break; - case "host": + case ScopeType.Host: helpText = msg( html`Will crawl all pages on ${exampleHost} and ignore pages on any subdomains.`, ); break; - case "domain": + case ScopeType.Domain: helpText = msg( html`Will crawl all pages on ${exampleHost} and subdomain.${exampleHost}.`, ); break; - case "page-spa": + case ScopeType.SPA: helpText = msg( - html`Will only visit - ${exampleDomain}${examplePathname} - hash anchor links, e.g. + html`Will crawl hash anchor links as pages. For example, ${exampleDomain}${examplePathname}#example-page`, + > + will be treated as a separate page.`, ); break; - case "custom": + case ScopeType.Custom: helpText = msg( html`Will crawl all page URLs that begin with - `)} - ${this.renderHelpTextCol(msg(`The starting point of your crawl.`))} - ${inputCol(html` - - this.updateFormState({ - scopeType: (e.target as HTMLSelectElement) - .value as FormState["scopeType"], - })} >
    ${helpText}
    - - ${this.scopeTypeLabels["page-spa"]} - - - ${this.scopeTypeLabels["prefix"]} - - ${this.scopeTypeLabels["host"]} - - ${this.scopeTypeLabels["domain"]} - - - ${this.scopeTypeLabels["custom"]} - -
    + `)} - ${this.renderHelpTextCol( - msg(`Tells the crawler which pages it can visit.`), + ${this.renderHelpTextCol(msg(`The starting point of your crawl.`))} + ${when( + this.formState.scopeType === ScopeType.Custom, + () => html` + ${inputCol(html` + + `)} + ${this.renderHelpTextCol( + msg(`If the crawler finds pages outside of the Crawl Scope they + will only be saved if they begin with URLs listed here.`), + )} + `, )} ${when( - DEPTH_SUPPORTED_SCOPES.includes(this.formState.scopeType!), + DEPTH_SUPPORTED_SCOPES.includes(this.formState.scopeType), () => html` ${inputCol(html` html` - ${inputCol(html` - - `)} - ${this.renderHelpTextCol( - msg(`If the crawler finds pages outside of the Start URL Scope they - will only be saved if they begin with URLs listed here.`), - )} - `, - )} ${inputCol(html` - 0}> - ${msg("Exclusions")} - ${exclusions.length - ? html`${exclusions.length}` - : ""} -
    - ${inputCol(html` - - `)} - ${this.renderHelpTextCol( - msg( - `Specify exclusion rules for what pages should not be visited.`, - ), - )} -
    -
    - ${msg("Additional URLs")} + ${msg("Additional Pages")} ${additionalUrlList.length ? html`${additionalUrlList.length}` : ""} @@ -1106,7 +1081,7 @@ https://example.net`} ${inputCol(html` `)} ${this.renderHelpTextCol( - msg(str`The crawler will visit and record each URL listed here. Other - links on these pages will not be crawled. You can enter up to ${maxAdditionalURls.toLocaleString()} URLs.`), + msg( + str`The crawler will visit and record each URL listed here. You can enter up to ${MAX_ADDITIONAL_URLS.toLocaleString()} URLs.`, + ), )}
    @@ -1167,7 +1143,7 @@ https://archiveweb.page/images/${"logo.svg"}`} const minPages = Math.max( 1, urlListToArray(this.formState.urlList).length + - (this.jobType === "seed-crawl" ? 1 : 0), + (isPageScopeType(this.formState.scopeType) ? 0 : 1), ); const onInputMinMax = async (e: CustomEvent) => { const inputEl = e.target as SlInput; @@ -1651,21 +1627,26 @@ https://archiveweb.page/images/${"logo.svg"}`} private readonly renderConfirmSettings = () => { const errorAlert = when(this.formHasError, () => { + const pageScope = isPageScopeType(this.formState.scopeType); const crawlSetupUrl = `${window.location.href.split("#")[0]}#crawlSetup`; const errorMessage = this.hasRequiredFields() ? msg( "There are issues with this Workflow. Please go through previous steps and fix all issues to continue.", ) - : msg( - html`There is an issue with this Crawl Workflow:

    Crawl - URL(s) required in - Crawl Setup.

    - Please fix to continue.`, - ); + : html` + ${msg("There is an issue with this Crawl Workflow:")}

    + ${msg( + html`${pageScope ? msg("Page URL(s)") : msg("Crawl Start URL")} + required in + Scope. `, + )} +

    + ${msg("Please fix to continue.")} + `; return this.renderErrorAlert(errorMessage); }); @@ -1696,11 +1677,38 @@ https://archiveweb.page/images/${"logo.svg"}`} `; }; + private changeScopeType(value: FormState["scopeType"]) { + const prevScopeType = this.formState.scopeType; + const formState: Partial = { + scopeType: value, + }; + const urls = urlListToArray(this.formState.urlList); + + const isPageScope = isPageScopeType(value); + const isPrevPageScope = isPageScopeType(prevScopeType); + + if (isPageScope === isPrevPageScope) { + if (isPageScope) { + formState.urlList = urls[0]; + } + } else { + if (isPrevPageScope) { + formState.primarySeedUrl = urls[0]; + formState.urlList = urls.slice(1).join("\n"); + } else if (isPageScope) { + formState.urlList = [this.formState.primarySeedUrl, ...urls].join("\n"); + } + } + + this.updateFormState(formState); + } + private hasRequiredFields(): boolean { - if (this.jobType === "seed-crawl") { - return Boolean(this.formState.primarySeedUrl); + if (isPageScopeType(this.formState.scopeType)) { + return Boolean(this.formState.urlList); } - return Boolean(this.formState.urlList); + + return Boolean(this.formState.primarySeedUrl); } private async scrollToPanelTop() { @@ -1712,30 +1720,6 @@ https://archiveweb.page/images/${"logo.svg"}`} } } - private getDefaultJobName() { - // Set default crawl name based on seed URLs - if (!this.formState.primarySeedUrl && !this.formState.urlList) { - return; - } - let jobName = ""; - if (this.jobType === "seed-crawl") { - jobName = this.formState.primarySeedUrl; - } else { - const urlList = urlListToArray(this.formState.urlList); - - const firstUrl = urlList[0].trim(); - if (urlList.length > 1) { - const remainder = urlList.length - 1; - jobName = msg( - str`${firstUrl} + ${formatNumber(remainder, { notation: "compact" })} more ${pluralOf("URLs", remainder)}`, - ); - } else { - jobName = firstUrl; - } - } - return jobName; - } - private async handleRemoveRegex(e: CustomEvent) { const { exclusions } = e.target as QueueExclusionTable; @@ -2091,7 +2075,8 @@ https://archiveweb.page/images/${"logo.svg"}`} private parseConfig(): NewCrawlConfigParams { const config: NewCrawlConfigParams = { - jobType: this.jobType, + // Job types are now merged into a single type + jobType: "custom", name: this.formState.jobName || "", description: this.formState.description, scale: this.formState.scale, @@ -2103,9 +2088,9 @@ https://archiveweb.page/images/${"logo.svg"}`} tags: this.formState.tags, autoAddCollections: this.formState.autoAddCollections, config: { - ...(this.jobType === "seed-crawl" - ? this.parseSeededConfig() - : this.parseUrlListConfig()), + ...(isPageScopeType(this.formState.scopeType) + ? this.parseUrlListConfig() + : this.parseSeededConfig()), behaviorTimeout: this.formState.behaviorTimeoutSeconds, pageLoadTimeout: this.formState.pageLoadTimeoutSeconds, pageExtraDelay: this.formState.pageExtraDelaySeconds, @@ -2132,10 +2117,10 @@ https://archiveweb.page/images/${"logo.svg"}`} > { const config = { seeds: urlListToArray(this.formState.urlList).map((seedUrl) => { - const newSeed: Seed = { url: seedUrl, scopeType: "page" }; + const newSeed: Seed = { url: seedUrl, scopeType: ScopeType.Page }; return newSeed; }), - scopeType: "page" as FormState["scopeType"], + scopeType: ScopeType.Page, extraHops: this.formState.includeLinkedPages ? 
1 : 0, useSitemap: false, failOnFailedSeed: this.formState.failOnFailedSeed, @@ -2154,7 +2139,7 @@ https://archiveweb.page/images/${"logo.svg"}`} : []; const additionalSeedUrlList = this.formState.urlList ? urlListToArray(this.formState.urlList).map((seedUrl) => { - const newSeed: Seed = { url: seedUrl, scopeType: "page" }; + const newSeed: Seed = { url: seedUrl, scopeType: ScopeType.Page }; return newSeed; }) : []; @@ -2163,23 +2148,23 @@ https://archiveweb.page/images/${"logo.svg"}`} // the 'custom' scope here indicates we have extra URLs, actually set to 'prefix' // scope on backend to ensure seed URL is also added as part of standard prefix scope scopeType: - this.formState.scopeType === "custom" - ? "prefix" - : this.formState.scopeType, + this.formState.scopeType === ScopeType.Custom + ? ScopeType.Prefix + : (this.formState.scopeType as ScopeType), include: - this.formState.scopeType === "custom" + this.formState.scopeType === ScopeType.Custom ? [...includeUrlList.map((url) => regexEscape(url))] : [], extraHops: this.formState.includeLinkedPages ? 1 : 0, }; - if (DEPTH_SUPPORTED_SCOPES.includes(this.formState.scopeType!)) { + if (DEPTH_SUPPORTED_SCOPES.includes(this.formState.scopeType)) { primarySeed.depth = this.formState.maxScopeDepth; } const config = { seeds: [primarySeed, ...additionalSeedUrlList], - scopeType: this.formState.scopeType, + scopeType: this.formState.scopeType as ScopeType, useSitemap: this.formState.useSitemap, failOnFailedSeed: false, }; diff --git a/frontend/src/index.ejs b/frontend/src/index.ejs index b8453a7816..81bfe6f4b6 100644 --- a/frontend/src/index.ejs +++ b/frontend/src/index.ejs @@ -38,6 +38,7 @@
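The new `frontend/src/strings/crawl-workflows/scopeType.ts` and `frontend/src/types/workflow.ts` modules, along with the `isPageScopeType` helper in `utils/crawler.ts`, are added by this patch but their bodies do not appear above. Judging only from how they are used in `config-details.ts` and `workflow-editor.ts`, they plausibly resemble the following sketch; this is an inference, not the actual file contents:

```ts
// Plausible sketch inferred from usage above; not the actual patch contents.
// (i18n msg() wrappers omitted for brevity.)

// types/workflow.ts: an editor-only "page list" scope merged with the
// crawler's scope types when labeling workflows.
export const NewWorkflowOnlyScopeType = { PageList: "page-list" } as const;

// utils/crawler.ts: page-oriented scopes use the simpler Page Crawl form.
export const isPageScopeType = (scope?: string): boolean =>
  scope === "page" || scope === NewWorkflowOnlyScopeType.PageList;

// strings/crawl-workflows/scopeType.ts: labels matching workflow-setup.md.
const scopeTypeLabels: Record<string, string> = {
  page: "Single Page",
  [NewWorkflowOnlyScopeType.PageList]: "List of Pages",
  "page-spa": "In-Page Links",
  prefix: "Pages in Same Directory",
  host: "Pages on Same Domain",
  domain: "Pages on Same Domain + Subdomains",
  custom: "Custom Page Prefix",
};

export default scopeTypeLabels;
```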