Skip to content

Commit

Permalink
refactor handling of max size for html/js/css (#525)
Browse files Browse the repository at this point in the history
- due to a typo (and lack of type-checking!) incorrectly passed in
matchFetchSize instead of maxFetchSize, resulting in text/css/js for
>5MB instead of >25MB not properly streamed back to the browser
- add type checking to AsyncFetcherOptions to avoid this in the future.
- refactor to avoid checking size altogether for 'essential resources',
html(document), js and css, instead always fetch them fully and continue
in the browser. Only apply rewriting if <25MB.
- fixes #522
  • Loading branch information
ikreymer authored Apr 3, 2024
1 parent 6e3ea7f commit a3f93ca
Show file tree
Hide file tree
Showing 2 changed files with 47 additions and 44 deletions.
89 changes: 46 additions & 43 deletions src/util/recorder.ts
Original file line number Diff line number Diff line change
Expand Up @@ -27,7 +27,7 @@ import { Crawler } from "../crawler.js";
import { WARCResourceWriter } from "./warcresourcewriter.js";

const MAX_BROWSER_DEFAULT_FETCH_SIZE = 5_000_000;
const MAX_BROWSER_TEXT_FETCH_SIZE = 25_000_000;
const MAX_TEXT_REWRITE_SIZE = 25_000_000;

const MAX_NETWORK_LOAD_SIZE = 200_000_000;

Expand Down Expand Up @@ -67,6 +67,25 @@ export type PageInfoRecord = {
};
};

// =================================================================
export type AsyncFetchOptions = {
tempdir: string;
reqresp: RequestResponseInfo;
expectedSize?: number;
// eslint-disable-next-line no-use-before-define
recorder: Recorder;
networkId: string;
filter?: (resp: Response) => boolean;
ignoreDupe?: boolean;
maxFetchSize?: number;
};

// =================================================================
export type ResponseStreamAsyncFetchOptions = AsyncFetchOptions & {
cdp: CDPSession;
requestId: string;
};

// =================================================================
export class Recorder {
workerid: WorkerId;
Expand Down Expand Up @@ -307,7 +326,7 @@ export class Recorder {
if (!this.shouldSkip(headers, url, method, type)) {
const reqresp = this.pendingReqResp(requestId);
if (reqresp) {
reqresp.fillRequest(request);
reqresp.fillRequest(request, type || "");
}
}
}
Expand Down Expand Up @@ -577,23 +596,20 @@ export class Recorder {

let streamingConsume = false;

const contentType = this._getContentType(responseHeaders);

// set max fetch size higher for HTML responses for current page
const matchFetchSize = this.allowLargeContent(contentType)
? MAX_BROWSER_TEXT_FETCH_SIZE
: MAX_BROWSER_DEFAULT_FETCH_SIZE;

if (contentLen < 0 || contentLen > matchFetchSize) {
const opts = {
// if contentLength is large or unknown, do streaming, unless its an essential resource
// in which case, need to do a full fetch either way
if (
(contentLen < 0 || contentLen > MAX_BROWSER_DEFAULT_FETCH_SIZE) &&
!this.isEssentialResource(reqresp.resourceType)
) {
const opts: ResponseStreamAsyncFetchOptions = {
tempdir: this.tempdir,
reqresp,
expectedSize: contentLen,
recorder: this,
networkId,
cdp,
requestId,
matchFetchSize,
};

// fetching using response stream, await here and then either call fulFill, or if not started, return false
Expand Down Expand Up @@ -652,7 +668,7 @@ export class Recorder {
}
}

const rewritten = await this.rewriteResponse(reqresp, contentType);
const rewritten = await this.rewriteResponse(reqresp, responseHeaders);

// if in service worker, serialize here
// as won't be getting a loadingFinished message
Expand Down Expand Up @@ -861,17 +877,20 @@ export class Recorder {

async rewriteResponse(
reqresp: RequestResponseInfo,
contentType: string | null,
responseHeaders?: Protocol.Fetch.HeaderEntry[],
) {
const { url, extraOpts, payload } = reqresp;

if (!payload || !payload.length) {
// don't rewrite if payload is missing or too big
if (!payload || !payload.length || payload.length > MAX_TEXT_REWRITE_SIZE) {
return false;
}

let newString = null;
let string = null;

const contentType = this._getContentType(responseHeaders);

switch (contentType) {
case "application/x-mpegURL":
case "application/vnd.apple.mpegurl":
Expand Down Expand Up @@ -915,20 +934,10 @@ export class Recorder {
} else {
return false;
}

//return Buffer.from(newString).toString("base64");
}

allowLargeContent(contentType: string | null) {
const allowLargeCTs = [
"text/html",
"application/json",
"text/javascript",
"application/javascript",
"application/x-javascript",
];

return allowLargeCTs.includes(contentType || "");
isEssentialResource(resourceType: string | undefined) {
return ["document", "stylesheet", "script"].includes(resourceType || "");
}

_getContentType(
Expand Down Expand Up @@ -1127,16 +1136,7 @@ class AsyncFetcher {
filter = undefined,
ignoreDupe = false,
maxFetchSize = MAX_BROWSER_DEFAULT_FETCH_SIZE,
}: {
tempdir: string;
reqresp: RequestResponseInfo;
expectedSize?: number;
recorder: Recorder;
networkId: string;
filter?: (resp: Response) => boolean;
ignoreDupe?: boolean;
maxFetchSize?: number;
}) {
}: AsyncFetchOptions) {
this.reqresp = reqresp;
this.reqresp.expectedSize = expectedSize;
this.reqresp.asyncLoading = true;
Expand Down Expand Up @@ -1248,9 +1248,16 @@ class AsyncFetcher {
if (externalBuffer) {
const { currSize, buffers, fh } = externalBuffer;

// if fully buffered in memory, then populate the payload to return to browser
if (buffers && buffers.length && !fh) {
reqresp.payload = Buffer.concat(buffers, currSize);
externalBuffer.buffers = [reqresp.payload];
} else if (fh) {
logger.warn(
"Large streamed written to WARC, but not returned to browser, requires reading into memory",
{ url, actualSize: reqresp.readSize, maxSize: this.maxFetchSize },
"recorder",
);
}
}

Expand Down Expand Up @@ -1405,9 +1412,7 @@ class ResponseStreamAsyncFetcher extends AsyncFetcher {
cdp: CDPSession;
requestId: string;

// TODO: Fix this the next time the file is edited.
// eslint-disable-next-line @typescript-eslint/no-explicit-any
constructor(opts: any) {
constructor(opts: ResponseStreamAsyncFetchOptions) {
super(opts);
this.cdp = opts.cdp;
this.requestId = opts.requestId;
Expand All @@ -1430,9 +1435,7 @@ class ResponseStreamAsyncFetcher extends AsyncFetcher {
class NetworkLoadStreamAsyncFetcher extends AsyncFetcher {
cdp: CDPSession;

// TODO: Fix this the next time the file is edited.
// eslint-disable-next-line @typescript-eslint/no-explicit-any
constructor(opts: any) {
constructor(opts: ResponseStreamAsyncFetchOptions) {
super(opts);
this.cdp = opts.cdp;
}
Expand Down
2 changes: 1 addition & 1 deletion src/util/reqresp.ts
Original file line number Diff line number Diff line change
Expand Up @@ -86,7 +86,7 @@ export class RequestResponseInfo {
this.frameId = params.frameId;
}

fillRequest(request: Protocol.Network.Request, resourceType?: string) {
fillRequest(request: Protocol.Network.Request, resourceType: string) {
this.url = request.url;
this.method = request.method;
if (!this.requestHeaders) {
Expand Down

0 comments on commit a3f93ca

Please sign in to comment.