Invert "does this entry match the filter" logic
Before this change, we turned filters into a list of path prefixes and
then checked entry paths against those prefixes. Rough pseudocode:

```javascript
function doesEntryMatchFilter({ path }, filter) {
  const pathPrefixes = pathPrefixesFromFilter(filter)
  return pathPrefixes.some((prefix) => path.startsWith(prefix))
}
```

For performance and simplicity, I think it's cleaner if we "look up"
entry paths in the existing filter object. Rough pseudocode:

```javascript
function doesEntryMatchFilter({ path }, filter) {
  const { type, variant } = parsePath(path)
  return filter[type]?.includes(variant)
}
```
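
Here `parsePath` is just shorthand in the pseudocode; the committed `filePathMatchesFilter` below inlines the split. A minimal sketch of what it stands for, assuming blob paths shaped like `/<type>/<variant>/<name>` (as in the tests added in this commit):

```javascript
// Sketch only: the committed code in src/blob-store/utils.js inlines this logic.
function parsePath(path) {
  // "/photo/original/foo.jpg".split("/", 4) -> ["", "photo", "original", "foo.jpg"]
  const [leading, type, variant] = path.split('/', 4)
  // Well-formed blob paths start with "/", so the first piece must be empty.
  if (leading !== '') return { type: undefined, variant: undefined }
  return { type, variant }
}
```

With that shape, `filter[type]?.includes(variant)` comes out falsy for anything outside the filter.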

I think this has two advantages:

- It's less code. We don't need to worry about de-duping paths (e.g.,
  `/photos/original` versus `/photos/`). We don't need to worry about
  sketchy paths (e.g., `/photos/garbage/../original`).
- It's faster (at least in theory). Rather than iterating over every path
  prefix, we only need to iterate over the variants registered for the
  path's type. (This could be further optimized with a `Set`; see the
  sketch after this list.)
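
For illustration, the `Set` idea could look something like the sketch below (not part of this commit); it precomputes one `Set` of variants per type so a lookup avoids scanning an array:

```javascript
// Sketch of the Set optimization mentioned above; not what this commit ships.
function makeMatcher(filter) {
  // One Set of allowed variants per blob type,
  // e.g. { photo: ['a', 'b'] } -> Map { 'photo' => Set { 'a', 'b' } }
  const variantSets = new Map(
    Object.entries(filter).map(([type, variants]) => [type, new Set(variants)])
  )
  return (filePath) => {
    const parts = filePath.split('/', 4)
    const [leading, type, variant] = parts
    if (leading !== '' || !type) return false
    const allowed = variantSets.get(type)
    if (!allowed) return false
    // An empty variant list means "any variant of this type".
    if (allowed.size === 0) return parts.length >= 3
    return parts.length >= 4 && allowed.has(variant)
  }
}
```

Keying on a `Map` also sidesteps inherited keys like `hasOwnProperty`, which the committed code guards against with `Object.hasOwn`.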
EvanHahn committed Oct 30, 2024
1 parent 074710a commit e59b670
Showing 3 changed files with 80 additions and 53 deletions.
16 changes: 7 additions & 9 deletions src/blob-store/downloader.js
@@ -1,8 +1,9 @@
import { TypedEmitter } from 'tiny-typed-emitter'
import { createEntriesStream } from './entries-stream.js'
import { pathPrefixesFromFilter } from './utils.js'
import { filePathMatchesFilter } from './utils.js'

/** @import Hyperdrive from 'hyperdrive' */
/** @import { BlobFilter } from '../types.js' */

/**
* Like hyperdrive.download() but 'live', and for multiple drives.
@@ -32,7 +33,7 @@ export class Downloader extends TypedEmitter {
#entriesStream
#processEntriesPromise
#ac = new AbortController()
#pathPrefixes
#shouldDownloadFile

/**
* @param {import('./index.js').THyperdriveIndex} driveIndex
@@ -41,9 +42,12 @@
*/
constructor(driveIndex, { filter } = {}) {
super()
this.#pathPrefixes = filter ? pathPrefixesFromFilter(filter) : []
this.#driveIndex = driveIndex

this.#shouldDownloadFile = filter
? filePathMatchesFilter.bind(null, filter)
: () => true

this.#entriesStream = createEntriesStream(driveIndex, { live: true })
this.#entriesStream.once('error', this.#handleError)

@@ -78,12 +82,6 @@
throw new Error('Entries stream ended unexpectedly')
}

/** @param {string} filePath */
#shouldDownloadFile(filePath) {
if (!this.#pathPrefixes.length) return true
return this.#pathPrefixes.some((prefix) => filePath.startsWith(prefix))
}

/**
* Update state and queue missing entries for download
*
67 changes: 23 additions & 44 deletions src/blob-store/utils.js
@@ -8,35 +8,38 @@
import { Transform } from 'node:stream'

/**
* Convert a filter to an array of path prefixes that match the filter. These
* path prefixes can be used to filter entries by
* `entry.key.startsWith(pathPrefix)`.
*
* @param {GenericBlobFilter} filter
* @returns {readonly string[]} array of folders that match the filter
* @param {string} filePath
* @returns {boolean}
*/
export function pathPrefixesFromFilter(filter) {
const pathPrefixes = []
for (const [type, variants] of Object.entries(filter)) {
if (variants.length === 0) {
pathPrefixes.push(`/${type}/`)
continue
}
const dedupedVariants = new Set(variants)
for (const variant of dedupedVariants) {
pathPrefixes.push(`/${type}/${variant}/`)
}
export function filePathMatchesFilter(filter, filePath) {
const pathParts = filePath.split('/', 4)
const [shouldBeEmpty, type, variant] = pathParts

if (typeof shouldBeEmpty !== 'string' || shouldBeEmpty) return false

if (!type) return false
if (!Object.hasOwn(filter, type)) return false

const allowedVariants = filter[type] ?? []
if (allowedVariants.length === 0) {
return pathParts.length >= 3
} else {
return (
pathParts.length >= 4 &&
typeof variant === 'string' &&
allowedVariants.includes(variant)
)
}
return filterSubfoldersAndDuplicates(pathPrefixes)
}

/** @type {import("../types.js").BlobStoreEntriesStream} */
export class FilterEntriesStream extends Transform {
#pathPrefixes
#isIncludedInFilter
/** @param {GenericBlobFilter} filter */
constructor(filter) {
super({ objectMode: true })
this.#pathPrefixes = pathPrefixesFromFilter(filter)
this.#isIncludedInFilter = filePathMatchesFilter.bind(null, filter)
}
/**
* @param {import("hyperdrive").HyperdriveEntry} entry
@@ -45,31 +48,7 @@ export class FilterEntriesStream extends Transform {
*/
_transform(entry, _, callback) {
const { key: filePath } = entry
const isIncludedInFilter = this.#pathPrefixes.some((pathPrefix) =>
filePath.startsWith(pathPrefix)
)
if (isIncludedInFilter) this.push(entry)
if (this.#isIncludedInFilter(filePath)) this.push(entry)
callback()
}
}

/**
* Take an array of folders, remove any folders that are duplicates or subfolders of another
*
* @param {readonly string[]} folders
* @returns {readonly string[]}
*/
function filterSubfoldersAndDuplicates(folders) {
/** @type {Set<string>} */
const filtered = new Set()
for (let i = 0; i < folders.length; i++) {
const isSubfolderOfAnotherFolder = !!folders.find((folder, index) => {
if (index === i) return false
// Deduping is done by the Set, if we do it here we don't get either
if (folder === folders[i]) return true
return folders[i].startsWith(folder)
})
if (!isSubfolderOfAnotherFolder) filtered.add(folders[i])
}
return Array.from(filtered)
}
50 changes: 50 additions & 0 deletions test/blob-store/utils.js
@@ -0,0 +1,50 @@
import test from 'node:test'
import assert from 'node:assert/strict'
import { filePathMatchesFilter } from '../../src/blob-store/utils.js'

test('filePathMatchesFilter', () => {
const filter = {
photo: ['a', 'b'],
video: [],
}

const shouldMatch = [
'/photo/a/foo.jpg',
'/photo/b/foo.jpg',
'/photo/a/',
'/video/foo.mp4',
'/video/foo/bar.mp4',
'/video/',
'/video///',
]
for (const filePath of shouldMatch) {
assert(
filePathMatchesFilter(filter, filePath),
`${filePath} matches filter`
)
}

const shouldntMatch = [
'/photo/c/foo.jpg',
'/photo/c/',
'/photo/a',
'/photo/ax/foo.jpg',
'/photo/c/../a/foo.jpg',
'/photo',
'/photo/',
'/photo//',
'/PHOTO/a/foo.jpg',
'/audio/a/foo.mp3',
'photo/a/foo.jpg',
'//photo/a/foo.jpg',
' /photo/a/foo.jpg',
'/hasOwnProperty/',
'/hasOwnProperty/a/foo.jpg',
]
for (const filePath of shouldntMatch) {
assert(
!filePathMatchesFilter(filter, filePath),
`${filePath} doesn't match filter`
)
}
})
