From 4e8c9d8cf300a87e9dddb92ec5420abf9dd70822 Mon Sep 17 00:00:00 2001 From: Roy Scheepens Date: Fri, 15 Nov 2024 10:35:51 +0100 Subject: [PATCH 1/3] fix: embeddings --- biome.json | 10 +- .../google-tag-manager.tsx | 0 lib/generate-embeddings.ts | 856 +++++++++--------- pages/_app.tsx | 27 +- 4 files changed, 449 insertions(+), 444 deletions(-) rename lib/GoogleTagManager.tsx => components/google-tag-manager.tsx (100%) diff --git a/biome.json b/biome.json index 47cdaaa..cbff4ed 100644 --- a/biome.json +++ b/biome.json @@ -1,7 +1,7 @@ { - "$schema": "./node_modules/@biomejs/biome/configuration_schema.json", - "extends": ["./node_modules/@onbeam/biome-config/biome.json"], - "files": { - "ignore": ["./styled-system"] - } + "$schema": "./node_modules/@biomejs/biome/configuration_schema.json", + "extends": ["./node_modules/@onbeam/biome-config/biome.json"], + "files": { + "ignore": ["./styled-system"] + } } diff --git a/lib/GoogleTagManager.tsx b/components/google-tag-manager.tsx similarity index 100% rename from lib/GoogleTagManager.tsx rename to components/google-tag-manager.tsx diff --git a/lib/generate-embeddings.ts b/lib/generate-embeddings.ts index 4fbeb5e..374064c 100644 --- a/lib/generate-embeddings.ts +++ b/lib/generate-embeddings.ts @@ -1,25 +1,25 @@ -import { createHash } from 'node:crypto'; -import { basename, dirname, join } from 'node:path'; -import { createClient } from '@supabase/supabase-js'; -import dotenv from 'dotenv'; -import { readFile, readdir, stat } from 'node:fs/promises'; -import GithubSlugger from 'github-slugger'; -import { Content, Root } from 'mdast'; -import { fromMarkdown } from 'mdast-util-from-markdown'; -import { frontmatterFromMarkdown } from 'mdast-util-frontmatter'; -import { mdxFromMarkdown } from 'mdast-util-mdx'; -import { toMarkdown } from 'mdast-util-to-markdown'; -import { toString as toStringUtil } from 'mdast-util-to-string'; -import { frontmatter } from 'micromark-extension-frontmatter'; -import { mdxjs } from 
'micromark-extension-mdxjs'; -import OpenAI from 'openai'; -import { u } from 'unist-builder'; -import { filter } from 'unist-util-filter'; -import yargs from 'yargs'; +import { createHash } from "node:crypto"; +import { readFile, readdir, stat } from "node:fs/promises"; +import { basename, dirname, join } from "node:path"; +import { createClient } from "@supabase/supabase-js"; +import dotenv from "dotenv"; +import GithubSlugger from "github-slugger"; +import { Content, Root } from "mdast"; +import { fromMarkdown } from "mdast-util-from-markdown"; +import { frontmatterFromMarkdown } from "mdast-util-frontmatter"; +import { mdxFromMarkdown } from "mdast-util-mdx"; +import { toMarkdown } from "mdast-util-to-markdown"; +import { toString as toStringUtil } from "mdast-util-to-string"; +import { frontmatter } from "micromark-extension-frontmatter"; +import { mdxjs } from "micromark-extension-mdxjs"; +import OpenAI from "openai"; +import { u } from "unist-builder"; +import { filter } from "unist-util-filter"; +import yargs from "yargs"; dotenv.config(); -const ignoredFiles = ['pages/_app.mdx', 'pages/index.mdx', 'pages/404.mdx']; +const ignoredFiles = ["pages/_app.mdx", "pages/index.mdx", "pages/404.mdx"]; /** * Splits a `mdast` tree into multiple trees based on @@ -29,35 +29,35 @@ const ignoredFiles = ['pages/_app.mdx', 'pages/index.mdx', 'pages/404.mdx']; * Useful to split a markdown file into smaller sections. 
*/ function splitTreeBy(tree: Root, predicate: (node: Content) => boolean) { - return tree.children.reduce((trees, node) => { - const [lastTree] = trees.slice(-1); + return tree.children.reduce((trees, node) => { + const [lastTree] = trees.slice(-1); - if (!lastTree || predicate(node)) { - const tree: Root = u('root', [node]); - return trees.concat(tree); - } + if (!lastTree || predicate(node)) { + const tree: Root = u("root", [node]); + return trees.concat(tree); + } - lastTree.children.push(node); - return trees; - }, []); + lastTree.children.push(node); + return trees; + }, []); } function extractMetaTags(mdxTree: Root) { - const metaTagsNode = mdxTree.children.find(({ type }) => type === 'yaml'); + const metaTagsNode = mdxTree.children.find(({ type }) => type === "yaml"); - if (!metaTagsNode) { - return {}; - } + if (!metaTagsNode) { + return {}; + } - const parsed = metaTagsNode.value.split(/\\r?\\n/).reduce((meta, line) => { - const [key, value] = line.split(': '); - return { - ...meta, - [key]: value, - }; - }, {}); + const parsed = metaTagsNode.value.split(/\\r?\\n/).reduce((meta, line) => { + const [key, value] = line.split(": "); + return { + ...meta, + [key]: value, + }; + }, {}); - return parsed; + return parsed; } /** @@ -66,28 +66,31 @@ function extractMetaTags(mdxTree: Root) { * @param slug * @returns */ + +// biome-ignore lint/suspicious/noExplicitAny: any are you ok? const parseMetaTitle = (meta: any, slug: string): string => { - if (!meta[slug]) return slug; + if (!meta[slug]) return slug; - if (typeof meta[slug] === 'object') { - return `${(meta[slug] as any).title}` ?? slug; - } + if (typeof meta[slug] === "object") { + // biome-ignore lint/suspicious/noExplicitAny: any are you ok? + return `${(meta[slug] as any).title}` ?? 
slug; + } - return meta[slug] as string; + return meta[slug] as string; }; type Meta = ReturnType; type Section = { - content: string; - heading?: string; - slug?: string; + content: string; + heading?: string; + slug?: string; }; type ProcessedMdx = { - checksum: string; - meta: Meta; - sections: Section[]; + checksum: string; + meta: Meta; + sections: Section[]; }; /** @@ -96,400 +99,403 @@ type ProcessedMdx = { * and splits it into sub-sections based on criteria. */ function processMdxForSearch(title: string, content: string): ProcessedMdx { - const checksum = createHash('sha256').update(content).digest('base64'); - - const mdxTree = fromMarkdown(content, { - extensions: [mdxjs(), frontmatter()], - mdastExtensions: [mdxFromMarkdown(), frontmatterFromMarkdown(['yaml'])], - }); - - // Extract meta tags from markdown - const meta = extractMetaTags(mdxTree); - if (!meta.title) meta.title = title; - - // Remove all MDX elements from markdown - const mdTree = filter( - mdxTree, - (node) => - ![ - 'mdxjsEsm', - 'mdxJsxFlowElement', - 'mdxJsxTextElement', - 'mdxFlowExpression', - 'mdxTextExpression', - ].includes(node.type), - ); - - if (!mdTree) { - return { - checksum, - meta, - sections: [], - }; - } - - const sectionTrees = splitTreeBy(mdTree, (node) => node.type === 'heading'); - - const slugger = new GithubSlugger(); - - const sections = sectionTrees - // Filter out trees that contain only the page's metadata - .filter(({ children }) => children[0]?.type !== 'yaml') - .map((tree) => { - const [firstNode] = tree.children; - - const heading = - firstNode.type === 'heading' ? toStringUtil(firstNode) : undefined; - const slug = heading ? 
slugger.slug(heading) : undefined; - - return { - content: toMarkdown(tree), - heading, - slug, - }; - }); - - return { - checksum, - meta, - sections, - }; + const checksum = createHash("sha256").update(content).digest("base64"); + + const mdxTree = fromMarkdown(content, { + extensions: [mdxjs(), frontmatter()], + mdastExtensions: [mdxFromMarkdown(), frontmatterFromMarkdown(["yaml"])], + }); + + // Extract meta tags from markdown + const meta = extractMetaTags(mdxTree); + if (!meta.title) meta.title = title; + + // Remove all MDX elements from markdown + const mdTree = filter( + mdxTree, + (node) => + ![ + "mdxjsEsm", + "mdxJsxFlowElement", + "mdxJsxTextElement", + "mdxFlowExpression", + "mdxTextExpression", + ].includes(node.type), + ); + + if (!mdTree) { + return { + checksum, + meta, + sections: [], + }; + } + + const sectionTrees = splitTreeBy(mdTree, (node) => node.type === "heading"); + + const slugger = new GithubSlugger(); + + const sections = sectionTrees + // Filter out trees that contain only the page's metadata + .filter(({ children }) => children[0]?.type !== "yaml") + .map((tree) => { + const [firstNode] = tree.children; + + const heading = + firstNode.type === "heading" ? toStringUtil(firstNode) : undefined; + const slug = heading ? 
slugger.slug(heading) : undefined; + + return { + content: toMarkdown(tree), + heading, + slug, + }; + }); + + return { + checksum, + meta, + sections, + }; } type WalkEntry = { - path: string; - parentPath?: string; + path: string; + parentPath?: string; }; async function walk(dir: string, parentPath?: string): Promise { - const immediateFiles = await readdir(dir); - - const recursiveFiles = await Promise.all( - immediateFiles.map(async (file) => { - const path = join(dir, file); - const stats = await stat(path); - if (stats.isDirectory()) { - // Keep track of document hierarchy (if this dir has corresponding doc file) - const docPath = `${basename(path)}.mdx`; - - return walk( - path, - immediateFiles.includes(docPath) - ? join(dirname(path), docPath) - : parentPath, - ); - } - if (stats.isFile()) { - return [ - { - path: path, - parentPath, - }, - ]; - } - return []; - }), - ); - - const flattenedFiles = recursiveFiles.reduce( - (all, folderContents) => all.concat(folderContents), - [], - ); - - return flattenedFiles.sort((a, b) => a.path.localeCompare(b.path)); + const immediateFiles = await readdir(dir); + + const recursiveFiles = await Promise.all( + immediateFiles.map(async (file) => { + const path = join(dir, file); + const stats = await stat(path); + if (stats.isDirectory()) { + // Keep track of document hierarchy (if this dir has corresponding doc file) + const docPath = `${basename(path)}.mdx`; + + return walk( + path, + immediateFiles.includes(docPath) + ? 
join(dirname(path), docPath) + : parentPath, + ); + } + if (stats.isFile()) { + return [ + { + path: path, + parentPath, + }, + ]; + } + return []; + }), + ); + + const flattenedFiles = recursiveFiles.reduce( + (all, folderContents) => all.concat(folderContents), + [], + ); + + return flattenedFiles.sort((a, b) => a.path.localeCompare(b.path)); } abstract class BaseEmbeddingSource { - checksum?: string; - meta?: Meta; - sections?: Section[]; - - constructor( - public source: string, - public path: string, - public parentPath?: string, - ) {} - - abstract load(): Promise<{ - checksum: string; - meta?: Meta; - sections: Section[]; - }>; + checksum?: string; + meta?: Meta; + sections?: Section[]; + + constructor( + public source: string, + public path: string, + public parentPath?: string, + ) {} + + abstract load(): Promise<{ + checksum: string; + meta?: Meta; + sections: Section[]; + }>; } class MarkdownEmbeddingSource extends BaseEmbeddingSource { - type = 'markdown' as const; - - constructor( - source: string, - public filePath: string, - public parentFilePath?: string, - ) { - const path = filePath.replace(/^pages/, '').replace(/\.mdx?$/, ''); - const parentPath = parentFilePath - ?.replace(/^pages/, '') - .replace(/\.mdx?$/, ''); - - super(source, path, parentPath); - } - - async load() { - const contents = await readFile(this.filePath, 'utf8'); - - const slug = this.filePath - .split('/') - .at(-1) - .replace(/\.mdx?$/, ''); - - const metaPath = this.filePath.replace(/[^/]+$/, '_meta.json'); - const metaJson = await readFile(metaPath, 'utf8'); - - const title = parseMetaTitle(JSON.parse(metaJson), slug); - - const { checksum, meta, sections } = processMdxForSearch(title, contents); - - this.checksum = checksum; - this.meta = meta; - this.sections = sections; - - return { - checksum, - meta, - sections, - }; - } + type = "markdown" as const; + + constructor( + source: string, + public filePath: string, + public parentFilePath?: string, + ) { + const path = 
filePath.replace(/^pages/, "").replace(/\.mdx?$/, ""); + const parentPath = parentFilePath + ?.replace(/^pages/, "") + .replace(/\.mdx?$/, ""); + + super(source, path, parentPath); + } + + async load() { + const contents = await readFile(this.filePath, "utf8"); + + const slug = this.filePath.split("/").at(-1)?.replace(/\.mdx?$/, "") ?? ""; + + const metaPath = join( + process.cwd(), + this.filePath.replace(/[^/]+$/, "_meta.ts"), + ); + + const metaFile = (await import(metaPath)).default; + + const title = parseMetaTitle(metaFile, slug); + + const { checksum, meta, sections } = processMdxForSearch(title, contents); + + this.checksum = checksum; + this.meta = meta; + this.sections = sections; + + return { + checksum, + meta, + sections, + }; + } } type EmbeddingSource = MarkdownEmbeddingSource; async function generateEmbeddings() { - const argv = await yargs.option('refresh', { - alias: 'r', - description: 'Refresh data', - type: 'boolean', - }).argv; - - const shouldRefresh = argv.refresh; - - if ( - !process.env.SUPABASE_URL || - !process.env.SUPABASE_SERVICE_ROLE_KEY || - !process.env.OPENAI_KEY - ) { - return console.info( - 'Environment variables SUPABASE_URL, SUPABASE_SERVICE_ROLE_KEY, and OPENAI_KEY are required: skipping embeddings generation', - ); - } - - const supabaseClient = createClient( - process.env.SUPABASE_URL, - process.env.SUPABASE_SERVICE_ROLE_KEY, - { - auth: { - persistSession: false, - autoRefreshToken: false, - }, - }, - ); - - const embeddingSources: EmbeddingSource[] = [ - ...(await walk('pages')) - .filter(({ path }) => /\.mdx?$/.test(path)) - .filter(({ path }) => !ignoredFiles.includes(path)) - .map( - (entry) => - new MarkdownEmbeddingSource('guide', entry.path, entry.parentPath), - ), - ]; - - console.info(`Discovered ${embeddingSources.length} pages`); - - if (!shouldRefresh) { - console.info('Checking which pages are new or have changed'); - } else console.info('Refresh flag set, re-generating all pages'); - - for (const 
embeddingSource of embeddingSources) { - const { type, source, path, parentPath } = embeddingSource; - - try { - const { checksum, meta, sections } = await embeddingSource.load(); - - // Check for existing page in DB and compare checksums - const { error: fetchPageError, data: existingPage } = await supabaseClient - .from('docs_page') - .select('id, path, checksum, parentPage:parent_page_id(id, path)') - .filter('path', 'eq', path) - .limit(1) - .maybeSingle(); - - if (fetchPageError) { - throw fetchPageError; - } - - type ParentPage = T extends any[] ? T[number] | null : T; - - // We use checksum to determine if this page & its sections need to be regenerated - if (!shouldRefresh && existingPage?.checksum === checksum) { - const existingParentPage = - existingPage?.parentPage as unknown as ParentPage< - typeof existingPage.parentPage - >; - - // If parent page changed, update it - if (existingParentPage?.path !== parentPath) { - console.info( - `[${path}] Parent page has changed. Updating to '${parentPath}'...`, - ); - const { error: fetchParentPageError, data: parentPage } = - await supabaseClient - .from('docs_page') - .select() - .filter('path', 'eq', parentPath) - .limit(1) - .maybeSingle(); - - if (fetchParentPageError) { - throw fetchParentPageError; - } - - const { error: updatePageError } = await supabaseClient - .from('docs_page') - .update({ parent_page_id: parentPage?.id }) - .filter('id', 'eq', existingPage.id); - - if (updatePageError) { - throw updatePageError; - } - } - continue; - } - - if (existingPage) { - if (!shouldRefresh) { - console.info( - `[${path}] Docs have changed, removing old page sections and their embeddings`, - ); - } else - console.info( - '[$path] Refresh flag set, removing old page sections and their embeddings', - ); - - const { error: deletePageSectionError } = await supabaseClient - .from('docs_page_section') - .delete() - .filter('page_id', 'eq', existingPage.id); - - if (deletePageSectionError) { - throw 
deletePageSectionError; - } - } - - const { error: fetchParentPageError, data: parentPage } = - await supabaseClient - .from('docs_page') - .select() - .filter('path', 'eq', parentPath) - .limit(1) - .maybeSingle(); - - if (fetchParentPageError) { - throw fetchParentPageError; - } - - // Create/update page record. Intentionally clear checksum until we - // have successfully generated all page sections. - const { error: upsertPageError, data: page } = await supabaseClient - .from('docs_page') - .upsert( - { - checksum: null, - path, - type, - source, - meta, - parent_page_id: parentPage?.id, - }, - { onConflict: 'path' }, - ) - .select() - .limit(1) - .single(); - - if (upsertPageError) { - throw upsertPageError; - } - - console.info( - `[${path}] Adding ${sections.length} page sections (with embeddings)`, - ); - for (const { slug, heading, content } of sections) { - // OpenAI recommends replacing newlines with spaces for best results (specific to embeddings) - const input = content.replace(/\n/g, ' '); - - try { - const openai = new OpenAI({ - apiKey: process.env.OPENAI_KEY, - }); - - const embeddingResponse = await openai.embeddings.create({ - model: 'text-embedding-ada-002', - input, - }); - - const [responseData] = embeddingResponse.data; - - const { error: insertPageSectionError } = await supabaseClient - .from('docs_page_section') - .insert({ - page_id: page.id, - slug, - heading, - content, - token_count: embeddingResponse.usage.total_tokens, - embedding: responseData.embedding, - }) - .select() - .limit(1) - .single(); - - if (insertPageSectionError) { - throw insertPageSectionError; - } - } catch (err) { - // TODO: decide how to better handle failed embeddings - console.error( - `Failed to generate embeddings for '${path}' page section starting with '${input.slice( - 0, - 40, - )}...'`, - ); - - throw err; - } - } - - // Set page checksum so that we know this page was stored successfully - const { error: updatePageError } = await supabaseClient - 
.from('docs_page') - .update({ checksum }) - .filter('id', 'eq', page.id); - - if (updatePageError) { - throw updatePageError; - } - } catch (err) { - console.error( - `Page '${path}' or one/multiple of its page sections failed to store properly. Page has been marked with null checksum to indicate that it needs to be re-generated.`, - ); - console.error(err); - } - } - - console.info('Embedding generation complete'); + // @ts-ignore + const argv = await yargs.option("refresh", { + alias: "r", + description: "Refresh data", + type: "boolean", + }).argv; + + const shouldRefresh = argv.refresh; + + if ( + !process.env.SUPABASE_URL || + !process.env.SUPABASE_SERVICE_ROLE_KEY || + !process.env.OPENAI_KEY + ) { + return console.info( + "Environment variables SUPABASE_URL, SUPABASE_SERVICE_ROLE_KEY, and OPENAI_KEY are required: skipping embeddings generation", + ); + } + + const supabaseClient = createClient( + process.env.SUPABASE_URL, + process.env.SUPABASE_SERVICE_ROLE_KEY, + { + auth: { + persistSession: false, + autoRefreshToken: false, + }, + }, + ); + + const embeddingSources: EmbeddingSource[] = [ + ...(await walk("pages")) + .filter(({ path }) => /\.mdx?$/.test(path)) + .filter(({ path }) => !ignoredFiles.includes(path)) + .map( + (entry) => + new MarkdownEmbeddingSource("guide", entry.path, entry.parentPath), + ), + ]; + + console.info(`Discovered ${embeddingSources.length} pages`); + + if (!shouldRefresh) { + console.info("Checking which pages are new or have changed"); + } else console.info("Refresh flag set, re-generating all pages"); + + for (const embeddingSource of embeddingSources) { + const { type, source, path, parentPath } = embeddingSource; + + try { + const { checksum, meta, sections } = await embeddingSource.load(); + + // Check for existing page in DB and compare checksums + const { error: fetchPageError, data: existingPage } = await supabaseClient + .from("docs_page") + .select("id, path, checksum, parentPage:parent_page_id(id, path)") + 
.filter("path", "eq", path) + .limit(1) + .maybeSingle(); + + if (fetchPageError) { + throw fetchPageError; + } + + // biome-ignore lint/suspicious/noExplicitAny: any are you ok? + type ParentPage = T extends any[] ? T[number] | null : T; + + // We use checksum to determine if this page & its sections need to be regenerated + if (!shouldRefresh && existingPage?.checksum === checksum) { + const existingParentPage = + existingPage?.parentPage as unknown as ParentPage< + typeof existingPage.parentPage + >; + + // If parent page changed, update it + if (existingParentPage?.path !== parentPath) { + console.info( + `[${path}] Parent page has changed. Updating to '${parentPath}'...`, + ); + const { error: fetchParentPageError, data: parentPage } = + await supabaseClient + .from("docs_page") + .select() + .filter("path", "eq", parentPath) + .limit(1) + .maybeSingle(); + + if (fetchParentPageError) { + throw fetchParentPageError; + } + + const { error: updatePageError } = await supabaseClient + .from("docs_page") + .update({ parent_page_id: parentPage?.id }) + .filter("id", "eq", existingPage.id); + + if (updatePageError) { + throw updatePageError; + } + } + continue; + } + + if (existingPage) { + if (!shouldRefresh) { + console.info( + `[${path}] Docs have changed, removing old page sections and their embeddings`, + ); + } else + console.info( + "[$path] Refresh flag set, removing old page sections and their embeddings", + ); + + const { error: deletePageSectionError } = await supabaseClient + .from("docs_page_section") + .delete() + .filter("page_id", "eq", existingPage.id); + + if (deletePageSectionError) { + throw deletePageSectionError; + } + } + + const { error: fetchParentPageError, data: parentPage } = + await supabaseClient + .from("docs_page") + .select() + .filter("path", "eq", parentPath) + .limit(1) + .maybeSingle(); + + if (fetchParentPageError) { + throw fetchParentPageError; + } + + // Create/update page record. 
Intentionally clear checksum until we + // have successfully generated all page sections. + const { error: upsertPageError, data: page } = await supabaseClient + .from("docs_page") + .upsert( + { + checksum: null, + path, + type, + source, + meta, + parent_page_id: parentPage?.id, + }, + { onConflict: "path" }, + ) + .select() + .limit(1) + .single(); + + if (upsertPageError) { + throw upsertPageError; + } + + console.info( + `[${path}] Adding ${sections.length} page sections (with embeddings)`, + ); + for (const { slug, heading, content } of sections) { + // OpenAI recommends replacing newlines with spaces for best results (specific to embeddings) + const input = content.replace(/\n/g, " "); + + try { + const openai = new OpenAI({ + apiKey: process.env.OPENAI_KEY, + }); + + const embeddingResponse = await openai.embeddings.create({ + model: "text-embedding-ada-002", + input, + }); + + const [responseData] = embeddingResponse.data; + + const { error: insertPageSectionError } = await supabaseClient + .from("docs_page_section") + .insert({ + page_id: page.id, + slug, + heading, + content, + token_count: embeddingResponse.usage.total_tokens, + embedding: responseData.embedding, + }) + .select() + .limit(1) + .single(); + + if (insertPageSectionError) { + throw insertPageSectionError; + } + } catch (err) { + // TODO: decide how to better handle failed embeddings + console.error( + `Failed to generate embeddings for '${path}' page section starting with '${input.slice( + 0, + 40, + )}...'`, + ); + + throw err; + } + } + + // Set page checksum so that we know this page was stored successfully + const { error: updatePageError } = await supabaseClient + .from("docs_page") + .update({ checksum }) + .filter("id", "eq", page.id); + + if (updatePageError) { + throw updatePageError; + } + } catch (err) { + console.error( + `Page '${path}' or one/multiple of its page sections failed to store properly. 
Page has been marked with null checksum to indicate that it needs to be re-generated.`, + ); + console.error(err); + } + } + + console.info("Embedding generation complete"); } async function main() { - await generateEmbeddings(); + await generateEmbeddings(); } main().catch((err) => console.error(err)); diff --git a/pages/_app.tsx b/pages/_app.tsx index a03c6e2..68b91ca 100644 --- a/pages/_app.tsx +++ b/pages/_app.tsx @@ -1,17 +1,16 @@ -import React from 'react'; -import { Analytics } from '@vercel/analytics/react'; -import { CookieConsentModal } from '@onbeam/features'; - -import '../styles.css'; -import { GoogleTagManager } from '../lib/GoogleTagManager'; +import { CookieConsentModal } from "@onbeam/features"; +import { Analytics } from "@vercel/analytics/react"; +import React from "react"; +import { GoogleTagManager } from "../components/google-tag-manager"; +import "../styles.css"; export default function Nextra({ Component, pageProps }) { - return ( - <> - - - - - - ); + return ( + <> + + + + + + ); } From a321e7d2370474426a13fe329c151f23907888b8 Mon Sep 17 00:00:00 2001 From: Roy Scheepens Date: Fri, 15 Nov 2024 10:46:16 +0100 Subject: [PATCH 2/3] fix: formatting --- biome.json | 10 +- lib/generate-embeddings.ts | 862 +++++++++++++++++++------------------ pages/_app.tsx | 26 +- 3 files changed, 451 insertions(+), 447 deletions(-) diff --git a/biome.json b/biome.json index cbff4ed..47cdaaa 100644 --- a/biome.json +++ b/biome.json @@ -1,7 +1,7 @@ { - "$schema": "./node_modules/@biomejs/biome/configuration_schema.json", - "extends": ["./node_modules/@onbeam/biome-config/biome.json"], - "files": { - "ignore": ["./styled-system"] - } + "$schema": "./node_modules/@biomejs/biome/configuration_schema.json", + "extends": ["./node_modules/@onbeam/biome-config/biome.json"], + "files": { + "ignore": ["./styled-system"] + } } diff --git a/lib/generate-embeddings.ts b/lib/generate-embeddings.ts index 374064c..416c6cd 100644 --- a/lib/generate-embeddings.ts +++ 
b/lib/generate-embeddings.ts @@ -1,25 +1,25 @@ -import { createHash } from "node:crypto"; -import { readFile, readdir, stat } from "node:fs/promises"; -import { basename, dirname, join } from "node:path"; -import { createClient } from "@supabase/supabase-js"; -import dotenv from "dotenv"; -import GithubSlugger from "github-slugger"; -import { Content, Root } from "mdast"; -import { fromMarkdown } from "mdast-util-from-markdown"; -import { frontmatterFromMarkdown } from "mdast-util-frontmatter"; -import { mdxFromMarkdown } from "mdast-util-mdx"; -import { toMarkdown } from "mdast-util-to-markdown"; -import { toString as toStringUtil } from "mdast-util-to-string"; -import { frontmatter } from "micromark-extension-frontmatter"; -import { mdxjs } from "micromark-extension-mdxjs"; -import OpenAI from "openai"; -import { u } from "unist-builder"; -import { filter } from "unist-util-filter"; -import yargs from "yargs"; +import { createHash } from 'node:crypto'; +import { readFile, readdir, stat } from 'node:fs/promises'; +import { basename, dirname, join } from 'node:path'; +import { createClient } from '@supabase/supabase-js'; +import dotenv from 'dotenv'; +import GithubSlugger from 'github-slugger'; +import { Content, Root } from 'mdast'; +import { fromMarkdown } from 'mdast-util-from-markdown'; +import { frontmatterFromMarkdown } from 'mdast-util-frontmatter'; +import { mdxFromMarkdown } from 'mdast-util-mdx'; +import { toMarkdown } from 'mdast-util-to-markdown'; +import { toString as toStringUtil } from 'mdast-util-to-string'; +import { frontmatter } from 'micromark-extension-frontmatter'; +import { mdxjs } from 'micromark-extension-mdxjs'; +import OpenAI from 'openai'; +import { u } from 'unist-builder'; +import { filter } from 'unist-util-filter'; +import yargs from 'yargs'; dotenv.config(); -const ignoredFiles = ["pages/_app.mdx", "pages/index.mdx", "pages/404.mdx"]; +const ignoredFiles = ['pages/_app.mdx', 'pages/index.mdx', 'pages/404.mdx']; /** * Splits a 
`mdast` tree into multiple trees based on @@ -29,35 +29,35 @@ const ignoredFiles = ["pages/_app.mdx", "pages/index.mdx", "pages/404.mdx"]; * Useful to split a markdown file into smaller sections. */ function splitTreeBy(tree: Root, predicate: (node: Content) => boolean) { - return tree.children.reduce((trees, node) => { - const [lastTree] = trees.slice(-1); + return tree.children.reduce((trees, node) => { + const [lastTree] = trees.slice(-1); - if (!lastTree || predicate(node)) { - const tree: Root = u("root", [node]); - return trees.concat(tree); - } + if (!lastTree || predicate(node)) { + const tree: Root = u('root', [node]); + return trees.concat(tree); + } - lastTree.children.push(node); - return trees; - }, []); + lastTree.children.push(node); + return trees; + }, []); } function extractMetaTags(mdxTree: Root) { - const metaTagsNode = mdxTree.children.find(({ type }) => type === "yaml"); + const metaTagsNode = mdxTree.children.find(({ type }) => type === 'yaml'); - if (!metaTagsNode) { - return {}; - } + if (!metaTagsNode) { + return {}; + } - const parsed = metaTagsNode.value.split(/\\r?\\n/).reduce((meta, line) => { - const [key, value] = line.split(": "); - return { - ...meta, - [key]: value, - }; - }, {}); + const parsed = metaTagsNode.value.split(/\\r?\\n/).reduce((meta, line) => { + const [key, value] = line.split(': '); + return { + ...meta, + [key]: value, + }; + }, {}); - return parsed; + return parsed; } /** @@ -69,28 +69,28 @@ function extractMetaTags(mdxTree: Root) { // biome-ignore lint/suspicious/noExplicitAny: any are you ok? const parseMetaTitle = (meta: any, slug: string): string => { - if (!meta[slug]) return slug; + if (!meta[slug]) return slug; - if (typeof meta[slug] === "object") { - // biome-ignore lint/suspicious/noExplicitAny: any are you ok? - return `${(meta[slug] as any).title}` ?? slug; - } + if (typeof meta[slug] === 'object') { + // biome-ignore lint/suspicious/noExplicitAny: any are you ok? 
+ return `${(meta[slug] as any).title}` ?? slug; + } - return meta[slug] as string; + return meta[slug] as string; }; type Meta = ReturnType; type Section = { - content: string; - heading?: string; - slug?: string; + content: string; + heading?: string; + slug?: string; }; type ProcessedMdx = { - checksum: string; - meta: Meta; - sections: Section[]; + checksum: string; + meta: Meta; + sections: Section[]; }; /** @@ -99,403 +99,407 @@ type ProcessedMdx = { * and splits it into sub-sections based on criteria. */ function processMdxForSearch(title: string, content: string): ProcessedMdx { - const checksum = createHash("sha256").update(content).digest("base64"); - - const mdxTree = fromMarkdown(content, { - extensions: [mdxjs(), frontmatter()], - mdastExtensions: [mdxFromMarkdown(), frontmatterFromMarkdown(["yaml"])], - }); - - // Extract meta tags from markdown - const meta = extractMetaTags(mdxTree); - if (!meta.title) meta.title = title; - - // Remove all MDX elements from markdown - const mdTree = filter( - mdxTree, - (node) => - ![ - "mdxjsEsm", - "mdxJsxFlowElement", - "mdxJsxTextElement", - "mdxFlowExpression", - "mdxTextExpression", - ].includes(node.type), - ); - - if (!mdTree) { - return { - checksum, - meta, - sections: [], - }; - } - - const sectionTrees = splitTreeBy(mdTree, (node) => node.type === "heading"); - - const slugger = new GithubSlugger(); - - const sections = sectionTrees - // Filter out trees that contain only the page's metadata - .filter(({ children }) => children[0]?.type !== "yaml") - .map((tree) => { - const [firstNode] = tree.children; - - const heading = - firstNode.type === "heading" ? toStringUtil(firstNode) : undefined; - const slug = heading ? 
slugger.slug(heading) : undefined; - - return { - content: toMarkdown(tree), - heading, - slug, - }; - }); - - return { - checksum, - meta, - sections, - }; + const checksum = createHash('sha256').update(content).digest('base64'); + + const mdxTree = fromMarkdown(content, { + extensions: [mdxjs(), frontmatter()], + mdastExtensions: [mdxFromMarkdown(), frontmatterFromMarkdown(['yaml'])], + }); + + // Extract meta tags from markdown + const meta = extractMetaTags(mdxTree); + if (!meta.title) meta.title = title; + + // Remove all MDX elements from markdown + const mdTree = filter( + mdxTree, + (node) => + ![ + 'mdxjsEsm', + 'mdxJsxFlowElement', + 'mdxJsxTextElement', + 'mdxFlowExpression', + 'mdxTextExpression', + ].includes(node.type), + ); + + if (!mdTree) { + return { + checksum, + meta, + sections: [], + }; + } + + const sectionTrees = splitTreeBy(mdTree, (node) => node.type === 'heading'); + + const slugger = new GithubSlugger(); + + const sections = sectionTrees + // Filter out trees that contain only the page's metadata + .filter(({ children }) => children[0]?.type !== 'yaml') + .map((tree) => { + const [firstNode] = tree.children; + + const heading = + firstNode.type === 'heading' ? toStringUtil(firstNode) : undefined; + const slug = heading ? 
slugger.slug(heading) : undefined; + + return { + content: toMarkdown(tree), + heading, + slug, + }; + }); + + return { + checksum, + meta, + sections, + }; } type WalkEntry = { - path: string; - parentPath?: string; + path: string; + parentPath?: string; }; async function walk(dir: string, parentPath?: string): Promise { - const immediateFiles = await readdir(dir); - - const recursiveFiles = await Promise.all( - immediateFiles.map(async (file) => { - const path = join(dir, file); - const stats = await stat(path); - if (stats.isDirectory()) { - // Keep track of document hierarchy (if this dir has corresponding doc file) - const docPath = `${basename(path)}.mdx`; - - return walk( - path, - immediateFiles.includes(docPath) - ? join(dirname(path), docPath) - : parentPath, - ); - } - if (stats.isFile()) { - return [ - { - path: path, - parentPath, - }, - ]; - } - return []; - }), - ); - - const flattenedFiles = recursiveFiles.reduce( - (all, folderContents) => all.concat(folderContents), - [], - ); - - return flattenedFiles.sort((a, b) => a.path.localeCompare(b.path)); + const immediateFiles = await readdir(dir); + + const recursiveFiles = await Promise.all( + immediateFiles.map(async (file) => { + const path = join(dir, file); + const stats = await stat(path); + if (stats.isDirectory()) { + // Keep track of document hierarchy (if this dir has corresponding doc file) + const docPath = `${basename(path)}.mdx`; + + return walk( + path, + immediateFiles.includes(docPath) + ? 
join(dirname(path), docPath) + : parentPath, + ); + } + if (stats.isFile()) { + return [ + { + path: path, + parentPath, + }, + ]; + } + return []; + }), + ); + + const flattenedFiles = recursiveFiles.reduce( + (all, folderContents) => all.concat(folderContents), + [], + ); + + return flattenedFiles.sort((a, b) => a.path.localeCompare(b.path)); } abstract class BaseEmbeddingSource { - checksum?: string; - meta?: Meta; - sections?: Section[]; - - constructor( - public source: string, - public path: string, - public parentPath?: string, - ) {} - - abstract load(): Promise<{ - checksum: string; - meta?: Meta; - sections: Section[]; - }>; + checksum?: string; + meta?: Meta; + sections?: Section[]; + + constructor( + public source: string, + public path: string, + public parentPath?: string, + ) {} + + abstract load(): Promise<{ + checksum: string; + meta?: Meta; + sections: Section[]; + }>; } class MarkdownEmbeddingSource extends BaseEmbeddingSource { - type = "markdown" as const; - - constructor( - source: string, - public filePath: string, - public parentFilePath?: string, - ) { - const path = filePath.replace(/^pages/, "").replace(/\.mdx?$/, ""); - const parentPath = parentFilePath - ?.replace(/^pages/, "") - .replace(/\.mdx?$/, ""); - - super(source, path, parentPath); - } - - async load() { - const contents = await readFile(this.filePath, "utf8"); - - const slug = this.filePath.split("/").at(-1)?.replace(/\.mdx?$/, "") ?? 
""; - - const metaPath = join( - process.cwd(), - this.filePath.replace(/[^/]+$/, "_meta.ts"), - ); - - const metaFile = (await import(metaPath)).default; - - const title = parseMetaTitle(metaFile, slug); - - const { checksum, meta, sections } = processMdxForSearch(title, contents); - - this.checksum = checksum; - this.meta = meta; - this.sections = sections; - - return { - checksum, - meta, - sections, - }; - } + type = 'markdown' as const; + + constructor( + source: string, + public filePath: string, + public parentFilePath?: string, + ) { + const path = filePath.replace(/^pages/, '').replace(/\.mdx?$/, ''); + const parentPath = parentFilePath + ?.replace(/^pages/, '') + .replace(/\.mdx?$/, ''); + + super(source, path, parentPath); + } + + async load() { + const contents = await readFile(this.filePath, 'utf8'); + + const slug = + this.filePath + .split('/') + .at(-1) + ?.replace(/\.mdx?$/, '') ?? ''; + + const metaPath = join( + process.cwd(), + this.filePath.replace(/[^/]+$/, '_meta.ts'), + ); + + const metaFile = (await import(metaPath)).default; + + const title = parseMetaTitle(metaFile, slug); + + const { checksum, meta, sections } = processMdxForSearch(title, contents); + + this.checksum = checksum; + this.meta = meta; + this.sections = sections; + + return { + checksum, + meta, + sections, + }; + } } type EmbeddingSource = MarkdownEmbeddingSource; async function generateEmbeddings() { - // @ts-ignore - const argv = await yargs.option("refresh", { - alias: "r", - description: "Refresh data", - type: "boolean", - }).argv; - - const shouldRefresh = argv.refresh; - - if ( - !process.env.SUPABASE_URL || - !process.env.SUPABASE_SERVICE_ROLE_KEY || - !process.env.OPENAI_KEY - ) { - return console.info( - "Environment variables SUPABASE_URL, SUPABASE_SERVICE_ROLE_KEY, and OPENAI_KEY are required: skipping embeddings generation", - ); - } - - const supabaseClient = createClient( - process.env.SUPABASE_URL, - process.env.SUPABASE_SERVICE_ROLE_KEY, - { - auth: { - 
persistSession: false, - autoRefreshToken: false, - }, - }, - ); - - const embeddingSources: EmbeddingSource[] = [ - ...(await walk("pages")) - .filter(({ path }) => /\.mdx?$/.test(path)) - .filter(({ path }) => !ignoredFiles.includes(path)) - .map( - (entry) => - new MarkdownEmbeddingSource("guide", entry.path, entry.parentPath), - ), - ]; - - console.info(`Discovered ${embeddingSources.length} pages`); - - if (!shouldRefresh) { - console.info("Checking which pages are new or have changed"); - } else console.info("Refresh flag set, re-generating all pages"); - - for (const embeddingSource of embeddingSources) { - const { type, source, path, parentPath } = embeddingSource; - - try { - const { checksum, meta, sections } = await embeddingSource.load(); - - // Check for existing page in DB and compare checksums - const { error: fetchPageError, data: existingPage } = await supabaseClient - .from("docs_page") - .select("id, path, checksum, parentPage:parent_page_id(id, path)") - .filter("path", "eq", path) - .limit(1) - .maybeSingle(); - - if (fetchPageError) { - throw fetchPageError; - } - - // biome-ignore lint/suspicious/noExplicitAny: any are you ok? - type ParentPage = T extends any[] ? T[number] | null : T; - - // We use checksum to determine if this page & its sections need to be regenerated - if (!shouldRefresh && existingPage?.checksum === checksum) { - const existingParentPage = - existingPage?.parentPage as unknown as ParentPage< - typeof existingPage.parentPage - >; - - // If parent page changed, update it - if (existingParentPage?.path !== parentPath) { - console.info( - `[${path}] Parent page has changed. 
Updating to '${parentPath}'...`, - ); - const { error: fetchParentPageError, data: parentPage } = - await supabaseClient - .from("docs_page") - .select() - .filter("path", "eq", parentPath) - .limit(1) - .maybeSingle(); - - if (fetchParentPageError) { - throw fetchParentPageError; - } - - const { error: updatePageError } = await supabaseClient - .from("docs_page") - .update({ parent_page_id: parentPage?.id }) - .filter("id", "eq", existingPage.id); - - if (updatePageError) { - throw updatePageError; - } - } - continue; - } - - if (existingPage) { - if (!shouldRefresh) { - console.info( - `[${path}] Docs have changed, removing old page sections and their embeddings`, - ); - } else - console.info( - "[$path] Refresh flag set, removing old page sections and their embeddings", - ); - - const { error: deletePageSectionError } = await supabaseClient - .from("docs_page_section") - .delete() - .filter("page_id", "eq", existingPage.id); - - if (deletePageSectionError) { - throw deletePageSectionError; - } - } - - const { error: fetchParentPageError, data: parentPage } = - await supabaseClient - .from("docs_page") - .select() - .filter("path", "eq", parentPath) - .limit(1) - .maybeSingle(); - - if (fetchParentPageError) { - throw fetchParentPageError; - } - - // Create/update page record. Intentionally clear checksum until we - // have successfully generated all page sections. 
- const { error: upsertPageError, data: page } = await supabaseClient - .from("docs_page") - .upsert( - { - checksum: null, - path, - type, - source, - meta, - parent_page_id: parentPage?.id, - }, - { onConflict: "path" }, - ) - .select() - .limit(1) - .single(); - - if (upsertPageError) { - throw upsertPageError; - } - - console.info( - `[${path}] Adding ${sections.length} page sections (with embeddings)`, - ); - for (const { slug, heading, content } of sections) { - // OpenAI recommends replacing newlines with spaces for best results (specific to embeddings) - const input = content.replace(/\n/g, " "); - - try { - const openai = new OpenAI({ - apiKey: process.env.OPENAI_KEY, - }); - - const embeddingResponse = await openai.embeddings.create({ - model: "text-embedding-ada-002", - input, - }); - - const [responseData] = embeddingResponse.data; - - const { error: insertPageSectionError } = await supabaseClient - .from("docs_page_section") - .insert({ - page_id: page.id, - slug, - heading, - content, - token_count: embeddingResponse.usage.total_tokens, - embedding: responseData.embedding, - }) - .select() - .limit(1) - .single(); - - if (insertPageSectionError) { - throw insertPageSectionError; - } - } catch (err) { - // TODO: decide how to better handle failed embeddings - console.error( - `Failed to generate embeddings for '${path}' page section starting with '${input.slice( - 0, - 40, - )}...'`, - ); - - throw err; - } - } - - // Set page checksum so that we know this page was stored successfully - const { error: updatePageError } = await supabaseClient - .from("docs_page") - .update({ checksum }) - .filter("id", "eq", page.id); - - if (updatePageError) { - throw updatePageError; - } - } catch (err) { - console.error( - `Page '${path}' or one/multiple of its page sections failed to store properly. 
Page has been marked with null checksum to indicate that it needs to be re-generated.`, - ); - console.error(err); - } - } - - console.info("Embedding generation complete"); + // @ts-ignore + const argv = await yargs.option('refresh', { + alias: 'r', + description: 'Refresh data', + type: 'boolean', + }).argv; + + const shouldRefresh = argv.refresh; + + if ( + !process.env.SUPABASE_URL || + !process.env.SUPABASE_SERVICE_ROLE_KEY || + !process.env.OPENAI_KEY + ) { + return console.info( + 'Environment variables SUPABASE_URL, SUPABASE_SERVICE_ROLE_KEY, and OPENAI_KEY are required: skipping embeddings generation', + ); + } + + const supabaseClient = createClient( + process.env.SUPABASE_URL, + process.env.SUPABASE_SERVICE_ROLE_KEY, + { + auth: { + persistSession: false, + autoRefreshToken: false, + }, + }, + ); + + const embeddingSources: EmbeddingSource[] = [ + ...(await walk('pages')) + .filter(({ path }) => /\.mdx?$/.test(path)) + .filter(({ path }) => !ignoredFiles.includes(path)) + .map( + (entry) => + new MarkdownEmbeddingSource('guide', entry.path, entry.parentPath), + ), + ]; + + console.info(`Discovered ${embeddingSources.length} pages`); + + if (!shouldRefresh) { + console.info('Checking which pages are new or have changed'); + } else console.info('Refresh flag set, re-generating all pages'); + + for (const embeddingSource of embeddingSources) { + const { type, source, path, parentPath } = embeddingSource; + + try { + const { checksum, meta, sections } = await embeddingSource.load(); + + // Check for existing page in DB and compare checksums + const { error: fetchPageError, data: existingPage } = await supabaseClient + .from('docs_page') + .select('id, path, checksum, parentPage:parent_page_id(id, path)') + .filter('path', 'eq', path) + .limit(1) + .maybeSingle(); + + if (fetchPageError) { + throw fetchPageError; + } + + // biome-ignore lint/suspicious/noExplicitAny: any are you ok? + type ParentPage = T extends any[] ? 
T[number] | null : T; + + // We use checksum to determine if this page & its sections need to be regenerated + if (!shouldRefresh && existingPage?.checksum === checksum) { + const existingParentPage = + existingPage?.parentPage as unknown as ParentPage< + typeof existingPage.parentPage + >; + + // If parent page changed, update it + if (existingParentPage?.path !== parentPath) { + console.info( + `[${path}] Parent page has changed. Updating to '${parentPath}'...`, + ); + const { error: fetchParentPageError, data: parentPage } = + await supabaseClient + .from('docs_page') + .select() + .filter('path', 'eq', parentPath) + .limit(1) + .maybeSingle(); + + if (fetchParentPageError) { + throw fetchParentPageError; + } + + const { error: updatePageError } = await supabaseClient + .from('docs_page') + .update({ parent_page_id: parentPage?.id }) + .filter('id', 'eq', existingPage.id); + + if (updatePageError) { + throw updatePageError; + } + } + continue; + } + + if (existingPage) { + if (!shouldRefresh) { + console.info( + `[${path}] Docs have changed, removing old page sections and their embeddings`, + ); + } else + console.info( + '[$path] Refresh flag set, removing old page sections and their embeddings', + ); + + const { error: deletePageSectionError } = await supabaseClient + .from('docs_page_section') + .delete() + .filter('page_id', 'eq', existingPage.id); + + if (deletePageSectionError) { + throw deletePageSectionError; + } + } + + const { error: fetchParentPageError, data: parentPage } = + await supabaseClient + .from('docs_page') + .select() + .filter('path', 'eq', parentPath) + .limit(1) + .maybeSingle(); + + if (fetchParentPageError) { + throw fetchParentPageError; + } + + // Create/update page record. Intentionally clear checksum until we + // have successfully generated all page sections. 
+ const { error: upsertPageError, data: page } = await supabaseClient + .from('docs_page') + .upsert( + { + checksum: null, + path, + type, + source, + meta, + parent_page_id: parentPage?.id, + }, + { onConflict: 'path' }, + ) + .select() + .limit(1) + .single(); + + if (upsertPageError) { + throw upsertPageError; + } + + console.info( + `[${path}] Adding ${sections.length} page sections (with embeddings)`, + ); + for (const { slug, heading, content } of sections) { + // OpenAI recommends replacing newlines with spaces for best results (specific to embeddings) + const input = content.replace(/\n/g, ' '); + + try { + const openai = new OpenAI({ + apiKey: process.env.OPENAI_KEY, + }); + + const embeddingResponse = await openai.embeddings.create({ + model: 'text-embedding-ada-002', + input, + }); + + const [responseData] = embeddingResponse.data; + + const { error: insertPageSectionError } = await supabaseClient + .from('docs_page_section') + .insert({ + page_id: page.id, + slug, + heading, + content, + token_count: embeddingResponse.usage.total_tokens, + embedding: responseData.embedding, + }) + .select() + .limit(1) + .single(); + + if (insertPageSectionError) { + throw insertPageSectionError; + } + } catch (err) { + // TODO: decide how to better handle failed embeddings + console.error( + `Failed to generate embeddings for '${path}' page section starting with '${input.slice( + 0, + 40, + )}...'`, + ); + + throw err; + } + } + + // Set page checksum so that we know this page was stored successfully + const { error: updatePageError } = await supabaseClient + .from('docs_page') + .update({ checksum }) + .filter('id', 'eq', page.id); + + if (updatePageError) { + throw updatePageError; + } + } catch (err) { + console.error( + `Page '${path}' or one/multiple of its page sections failed to store properly. 
Page has been marked with null checksum to indicate that it needs to be re-generated.`, + ); + console.error(err); + } + } + + console.info('Embedding generation complete'); } async function main() { - await generateEmbeddings(); + await generateEmbeddings(); } main().catch((err) => console.error(err)); diff --git a/pages/_app.tsx b/pages/_app.tsx index 68b91ca..0c5ad55 100644 --- a/pages/_app.tsx +++ b/pages/_app.tsx @@ -1,16 +1,16 @@ -import { CookieConsentModal } from "@onbeam/features"; -import { Analytics } from "@vercel/analytics/react"; -import React from "react"; -import { GoogleTagManager } from "../components/google-tag-manager"; -import "../styles.css"; +import { CookieConsentModal } from '@onbeam/features'; +import { Analytics } from '@vercel/analytics/react'; +import React from 'react'; +import { GoogleTagManager } from '../components/google-tag-manager'; +import '../styles.css'; export default function Nextra({ Component, pageProps }) { - return ( - <> - - - - - - ); + return ( + <> + + + + + + ); } From 7a094c6311f9148839a3024f809a1d1f666b19d4 Mon Sep 17 00:00:00 2001 From: Leon van der Noll Date: Fri, 15 Nov 2024 10:51:47 +0100 Subject: [PATCH 3/3] fix: remove biome-ignores --- lib/generate-embeddings.ts | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) diff --git a/lib/generate-embeddings.ts b/lib/generate-embeddings.ts index 416c6cd..b943044 100644 --- a/lib/generate-embeddings.ts +++ b/lib/generate-embeddings.ts @@ -67,13 +67,11 @@ function extractMetaTags(mdxTree: Root) { * @returns */ -// biome-ignore lint/suspicious/noExplicitAny: any are you ok? const parseMetaTitle = (meta: any, slug: string): string => { if (!meta[slug]) return slug; if (typeof meta[slug] === 'object') { - // biome-ignore lint/suspicious/noExplicitAny: any are you ok? - return `${(meta[slug] as any).title}` ?? slug; + return meta[slug]?.title ? 
`${meta[slug].title}` : slug;
 }
 
 return meta[slug] as string;
@@ -335,7 +333,6 @@ async function generateEmbeddings() {
 throw fetchPageError;
 }
 
- // biome-ignore lint/suspicious/noExplicitAny: any are you ok?
 type ParentPage = T extends any[] ? T[number] | null : T;
 
 // We use checksum to determine if this page & its sections need to be regenerated