Skip to content

Commit

Permalink
Add SDK support for web scraping (#30)
Browse files Browse the repository at this point in the history
  • Loading branch information
EvanBoyle authored Jul 15, 2024
1 parent 29ca780 commit b2783a4
Show file tree
Hide file tree
Showing 5 changed files with 215 additions and 8 deletions.
4 changes: 3 additions & 1 deletion .npmignore
Original file line number Diff line number Diff line change
Expand Up @@ -5,4 +5,6 @@ test_data
.prettier*
.eslint*
.nvm*
vitest*
vitest*
images
.github
29 changes: 26 additions & 3 deletions catalog.ts
Original file line number Diff line number Diff line change
Expand Up @@ -136,6 +136,8 @@ export class Catalog {
let hasText = false;
let hasFile = false;
let hasJson = false;
let hasUrl = false;
let hasSitemapUrl = false;
for (const doc of batch) {
switch (doc.contentType) {
case "markdown":
Expand All @@ -148,6 +150,12 @@ export class Catalog {
case "file":
hasFile = true;
break;
case "url":
hasUrl = true;
break;
case "sitemap-url":
hasSitemapUrl = true;
break;
default:
throw new Error(
// eslint-disable-next-line @typescript-eslint/no-explicit-any
Expand All @@ -156,9 +164,12 @@ export class Catalog {
}
}

if ([hasText, hasJson, hasFile].filter((v) => v).length > 1) {
if (
[hasText, hasJson, hasFile, hasUrl, hasSitemapUrl].filter((v) => v)
.length > 1
) {
throw new Error(
`cannot mix file, text, and json content in batch upsert. all documents in batch must have the same contentType.`,
`cannot mix file, text, url, sitemap-url, json content in batch upsert. all documents in batch must have the same contentType.`,
);
}

Expand All @@ -181,7 +192,7 @@ export class Catalog {
);
}

if (res.status !== 200) {
if (res.status > 202) {
throw new Error(`Failed to upsert documents: ${res.statusText}`);
}
}
Expand Down Expand Up @@ -278,6 +289,18 @@ const mapBatch = async (batch: DocumentBatch) => {
content: undefined,
});
break;
case "url":
documents.push({
documentId: doc.url,
contentType: "url",
});
break;
case "sitemap-url":
documents.push({
documentId: doc.sitemapUrl,
contentType: "sitemap-url",
});
break;
default:
throw new Error(
// eslint-disable-next-line @typescript-eslint/no-explicit-any
Expand Down
26 changes: 24 additions & 2 deletions document.ts
Original file line number Diff line number Diff line change
@@ -1,11 +1,28 @@
import { CortexApiClient } from "./api-client.js";
import { Catalog } from "./catalog.js";

export type UrlContentType = "url";
export type SitemapContentType = "sitemap-url";
export type TextContentType = "text" | "markdown";
export type JSONContentType = "json";
export type FileContentType = "file";

export type ContentType = FileContentType | TextContentType | JSONContentType;
export type ContentType =
| FileContentType
| TextContentType
| JSONContentType
| UrlContentType
| SitemapContentType;

/**
 * A document identified by a single web URL rather than by inline content.
 *
 * Only the URL and the content type are supplied by the caller; the upsert
 * mapping shown in catalog.ts sends `url` as the `documentId` and includes
 * no `content` field for this content type.
 */
export type UrlDocument = {
// Address of the page; also used as the documentId when the batch is mapped.
url: string;
// Discriminator; always the literal "url" for this document shape.
contentType: UrlContentType;
};

/**
 * A document identified by the URL of a sitemap rather than by inline content.
 *
 * The upsert mapping shown in catalog.ts sends `sitemapUrl` as the
 * `documentId` and includes no `content` field for this content type.
 * NOTE(review): presumably the service expands the sitemap into one document
 * per listed page (the scraping tests expect several documents from a single
 * sitemap entry) — confirm against the API.
 */
export type SitemapDocument = {
// Address of the sitemap; also used as the documentId when the batch is mapped.
sitemapUrl: string;
// Discriminator; always the literal "sitemap-url" for this document shape.
contentType: SitemapContentType;
};

export type JSONDocument = {
documentId: string;
Expand All @@ -31,7 +48,12 @@ export type FileDocument = {
imageUrl?: string;
};

export type DocumentBatch = TextDocument[] | JSONDocument[] | FileDocument[];
export type DocumentBatch =
| TextDocument[]
| JSONDocument[]
| FileDocument[]
| UrlDocument[]
| SitemapDocument[];

export type DocumentInput = {
documentId: string;
Expand Down
6 changes: 4 additions & 2 deletions package.json
Original file line number Diff line number Diff line change
Expand Up @@ -18,7 +18,7 @@
"url": "https://github.com/cortexclick/cortex-sdk",
"type": "git"
},
"version": "0.0.3",
"version": "0.0.4",
"type": "module",
"main": "index.js",
"scripts": {
Expand All @@ -34,7 +34,9 @@
"test:prod": "pulumi env run npm-test npm run test",
"test:fast:dev": "CORTEX_API_URL=http://localhost:3001 pulumi env run npm-test npm run test:fast",
"test:fast:prod": "pulumi env run npm-test npm run test:fast",
"test:indexers:dev": "CORTEX_API_URL=http://localhost:3001 pulumi env run npm-test vitest indexers.test.ts"
"test:indexers:dev": "CORTEX_API_URL=http://localhost:3001 pulumi env run npm-test vitest indexers.test.ts",
"test:scraping:dev": "RUN_SCRAPER_TESTS=true CORTEX_API_URL=https://api-dev.cortexclick.com pulumi env run npm-test vitest scraping.test.ts",
"test:scraping:prod": "RUN_SCRAPER_TESTS=true CORTEX_API_URL=https://api.cortexclick.com pulumi env run npm-test vitest scraping.test.ts"
},
"keywords": [],
"author": "",
Expand Down
158 changes: 158 additions & 0 deletions scraping.test.ts
Original file line number Diff line number Diff line change
@@ -0,0 +1,158 @@
import { expect, test } from "vitest";
import { CatalogConfig } from "./catalog";
import { SitemapDocument, UrlDocument } from "./document";
import { testClient } from "./vitest-test-client";

const runScraperTests = process.env.RUN_SCRAPER_TESTS === "true";

const expectedSitemapUrls = 4;

/**
 * Upserts a single "url" document into a fresh catalog and polls until the
 * catalog reports exactly one document.
 *
 * Polling is bounded so the loop stops on its own before the 60s test
 * timeout, and the catalog is deleted in a `finally` block so a failed or
 * slow scrape does not leave a stray catalog behind.
 */
test.skipIf(!runScraperTests)(
  "Test scraping single URL",
  { timeout: 60000 },
  async () => {
    const startedAt = Date.now();
    const pollIntervalMs = 5000;
    // Stop polling early enough that cleanup still fits inside the timeout.
    const giveUpAt = startedAt + 50000;

    const catalogName = `catalog-${Math.floor(Math.random() * 10000)}`;

    const config: CatalogConfig = {
      description: "foo bar",
      instructions: ["a", "b"],
    };

    const catalog = await testClient.configureCatalog(catalogName, config);

    try {
      const docs: UrlDocument[] = [
        {
          url: "https://www.cortexclick.com/",
          contentType: "url",
        },
      ];

      await catalog.upsertDocuments(docs);

      let docCount = await catalog.documentCount();
      // Only start another sleep if the next poll can finish before giveUpAt.
      while (docCount !== 1 && Date.now() + pollIntervalMs < giveUpAt) {
        console.log("no docs found. sleeping...");
        await sleep(pollIntervalMs);
        docCount = await catalog.documentCount();
      }

      expect(docCount).toBe(1);
    } finally {
      // Always remove the temporary catalog, even when the assertion fails.
      await catalog.delete();
    }
  },
);

/**
 * Upserts a single "sitemap-url" document into a fresh catalog and polls
 * until the catalog reports `expectedSitemapUrls` documents.
 *
 * Polling is bounded so the loop stops on its own before the 60s test
 * timeout, the final count is asserted explicitly so a failure reports the
 * observed count instead of a bare timeout, and the catalog is deleted in a
 * `finally` block so it is cleaned up on failure as well.
 */
test.skipIf(!runScraperTests)(
  "Test scraping sitemap",
  { timeout: 60000 },
  async () => {
    const startedAt = Date.now();
    const pollIntervalMs = 5000;
    // Stop polling early enough that cleanup still fits inside the timeout.
    const giveUpAt = startedAt + 50000;

    const catalogName = `catalog-${Math.floor(Math.random() * 10000)}`;

    const config: CatalogConfig = {
      description: "foo bar",
      instructions: ["a", "b"],
    };

    const catalog = await testClient.configureCatalog(catalogName, config);

    try {
      const docs: SitemapDocument[] = [
        {
          sitemapUrl: "https://www.cortexclick.com/sitemap.xml",
          contentType: "sitemap-url",
        },
      ];

      await catalog.upsertDocuments(docs);

      let docCount = await catalog.documentCount();
      // Only start another sleep if the next poll can finish before giveUpAt.
      while (
        docCount !== expectedSitemapUrls &&
        Date.now() + pollIntervalMs < giveUpAt
      ) {
        console.log(`${docCount} docs found. sleeping...`);
        await sleep(pollIntervalMs);
        docCount = await catalog.documentCount();
      }

      expect(docCount).toBe(expectedSitemapUrls);
    } finally {
      // Always remove the temporary catalog, even when the assertion fails.
      await catalog.delete();
    }
  },
);

/**
 * Upserts the same sitemap into five fresh catalogs at once and polls until
 * every catalog reports the expected number of documents, checking that
 * concurrent scrapes do not bleed into one another.
 *
 * The upsert calls are awaited together so a rejected upsert fails the test
 * instead of becoming an unhandled rejection. Polling is bounded so the loop
 * stops before the 60s test timeout, and all catalogs are deleted in a
 * `finally` block so they are cleaned up on failure as well.
 */
test.skipIf(!runScraperTests)(
  "Test isolation of scraping multiple catalogs at once",
  { timeout: 60000 },
  async () => {
    const startedAt = Date.now();
    const pollIntervalMs = 5000;
    // Stop polling early enough that cleanup still fits inside the timeout.
    const giveUpAt = startedAt + 50000;
    // Number of documents each catalog should end up with for the sitemap below.
    const expectedDocsPerCatalog = 4;

    const catalogName1 = `catalog-${Math.floor(Math.random() * 10000)}`;
    const catalogName2 = `catalog-${Math.floor(Math.random() * 10000)}`;
    const catalogName3 = `catalog-${Math.floor(Math.random() * 10000)}`;
    const catalogName4 = `catalog-${Math.floor(Math.random() * 10000)}`;
    const catalogName5 = `catalog-${Math.floor(Math.random() * 10000)}`;

    const config: CatalogConfig = {
      description: "foo bar",
      instructions: ["a", "b"],
    };

    const catalog1 = await testClient.configureCatalog(catalogName1, config);
    const catalog2 = await testClient.configureCatalog(catalogName2, config);
    const catalog3 = await testClient.configureCatalog(catalogName3, config);
    const catalog4 = await testClient.configureCatalog(catalogName4, config);
    const catalog5 = await testClient.configureCatalog(catalogName5, config);
    const catalogs = [catalog1, catalog2, catalog3, catalog4, catalog5];

    try {
      const docs: SitemapDocument[] = [
        {
          sitemapUrl: "https://www.cortexclick.com/sitemap.xml",
          contentType: "sitemap-url",
        },
      ];

      // Start all upserts concurrently, but wait for them so errors surface.
      await Promise.all(catalogs.map((catalog) => catalog.upsertDocuments(docs)));

      const countAll = () =>
        Promise.all(catalogs.map((catalog) => catalog.documentCount()));
      const allScraped = (counts: number[]) =>
        counts.every((count) => count === expectedDocsPerCatalog);

      let counts = await countAll();
      // Only start another sleep if the next poll can finish before giveUpAt.
      while (!allScraped(counts) && Date.now() + pollIntervalMs < giveUpAt) {
        await sleep(pollIntervalMs);
        counts = await countAll();
      }

      expect(counts).toEqual(catalogs.map(() => expectedDocsPerCatalog));
    } finally {
      // Always remove the temporary catalogs, even when the assertion fails.
      await Promise.all(catalogs.map((catalog) => catalog.delete()));
    }
  },
);

/**
 * Returns a promise that settles after roughly `ms` milliseconds.
 *
 * @param ms - Delay, in milliseconds, handed to `setTimeout`.
 * @returns A promise resolved (with no value) when the timer fires.
 */
function sleep(ms: number) {
  return new Promise((wake) => {
    // The timer invokes the resolver with no arguments once the delay elapses.
    setTimeout(wake, ms);
  });
}

0 comments on commit b2783a4

Please sign in to comment.