diff --git a/.changeset/beige-trains-sort.md b/.changeset/beige-trains-sort.md new file mode 100644 index 000000000..e46cae857 --- /dev/null +++ b/.changeset/beige-trains-sort.md @@ -0,0 +1,5 @@ +--- +'gitbook': patch +--- + +Support llms.txt diff --git a/bun.lockb b/bun.lockb index dd3a4e5dd..0e4e94a31 100755 Binary files a/bun.lockb and b/bun.lockb differ diff --git a/packages/gitbook/package.json b/packages/gitbook/package.json index edf7f5251..bad7ad451 100644 --- a/packages/gitbook/package.json +++ b/packages/gitbook/package.json @@ -40,6 +40,7 @@ "jsontoxml": "^1.0.1", "katex": "^0.16.9", "mathjax": "^3.2.2", + "mdast-util-to-markdown": "^2.1.2", "memoizee": "^0.4.15", "next": "14.2.15", "next-themes": "^0.2.1", @@ -74,6 +75,7 @@ "@types/js-cookie": "^3.0.6", "@types/jsontoxml": "^1.0.5", "@types/jsonwebtoken": "^9.0.6", + "@types/mdast": "^4.0.4", "@types/node": "^20", "@types/object-hash": "^3.0.6", "@types/parse-cache-control": "^1.0.4", diff --git a/packages/gitbook/src/app/middleware/(site)/(core)/llms.txt/route.ts b/packages/gitbook/src/app/middleware/(site)/(core)/llms.txt/route.ts new file mode 100644 index 000000000..1c40de78b --- /dev/null +++ b/packages/gitbook/src/app/middleware/(site)/(core)/llms.txt/route.ts @@ -0,0 +1,141 @@ +import { SiteSection, SiteSpace, SiteStructure } from '@gitbook/api'; +import assertNever from 'assert-never'; +import { Heading, ListItem, Paragraph, Root, RootContent } from 'mdast'; +import { toMarkdown } from 'mdast-util-to-markdown'; +import { NextRequest } from 'next/server'; + +import { getPublishedContentSite, getRevisionPages } from '@/lib/api'; +import { getAbsoluteHref } from '@/lib/links'; +import { getPagePath } from '@/lib/pages'; +import { joinPath } from '@/lib/paths'; +import { checkIsRootPointer, getSiteContentPointer } from '@/lib/pointer'; +import { getIndexablePages } from '@/lib/sitemap'; + +export const runtime = 'edge'; + +/** + * Generate a sitemap.xml for the current space. + */ +export async function GET(req: NextRequest) { + const pointer = await getSiteContentPointer(); + + const { structure: siteStructure, site } = await getPublishedContentSite({ + organizationId: pointer.organizationId, + siteId: pointer.siteId, + siteShareKey: pointer.siteShareKey, + }); + + // This sitemap is only available at root (/sitemap.xml). + if (!checkIsRootPointer(pointer, siteStructure)) { + return new Response('Not found', { status: 404 }); + } + + const tree: Root = { + type: 'root', + children: [ + { + type: 'heading', + depth: 1, + children: [{ type: 'text', value: site.title }], + }, + ...(await getNodesFromSiteStructure(siteStructure)), + ], + }; + + return new Response(toMarkdown(tree), { + headers: { + 'Content-Type': 'text/plain; charset=utf-8', + }, + }); +} + +/** + * Get Sitemap Nodes from site structure. + */ +async function getNodesFromSiteStructure(siteStructure: SiteStructure): Promise { + switch (siteStructure.type) { + case 'sections': + return getNodesFromSections(siteStructure.structure); + case 'siteSpaces': + return getNodesFromSiteSpaces(siteStructure.structure, { depth: 2 }); + default: + assertNever(siteStructure); + } +} + +/** + * Get Sitemap Nodes from site sections. + */ +async function getNodesFromSections(siteSections: SiteSection[]): Promise { + const all = await Promise.all( + siteSections.map(async (siteSection): Promise => { + const siteSpaceNodes = await getNodesFromSiteSpaces(siteSection.siteSpaces, { + depth: 3, + }); + return [ + { + type: 'heading', + depth: 2, + children: [{ type: 'text', value: siteSection.title }], + }, + ...siteSpaceNodes, + ]; + }), + ); + return all.flat(); +} + +/** + * Get Sitemap Nodes from site spaces. + */ +async function getNodesFromSiteSpaces( + siteSpaces: SiteSpace[], + options: { depth: Heading['depth'] }, +): Promise { + const all = await Promise.all( + siteSpaces.map(async (siteSpace): Promise => { + const siteSpaceUrl = siteSpace.urls.published; + if (!siteSpaceUrl) { + return []; + } + const rootPages = await getRevisionPages(siteSpace.space.id, siteSpace.space.revision, { + metadata: false, + }); + const pages = getIndexablePages(rootPages); + const listChildren = await Promise.all( + pages.map(async ({ page }): Promise => { + const url = await getAbsoluteHref( + joinPath(new URL(siteSpaceUrl).pathname, getPagePath(rootPages, page)), + true, + ); + const children: Paragraph['children'] = [ + { + type: 'link', + url, + children: [{ type: 'text', value: page.title }], + }, + ]; + if (page.description) { + children.push({ type: 'text', value: `: ${page.description}` }); + } + return { + type: 'listItem', + children: [{ type: 'paragraph', children }], + }; + }), + ); + return [ + { + type: 'heading', + depth: options.depth, + children: [{ type: 'text', value: siteSpace.title }], + }, + { + type: 'list', + children: listChildren, + }, + ]; + }), + ); + return all.flat(); +}