diff --git a/README.md b/README.md index bda725f..25900bc 100644 --- a/README.md +++ b/README.md @@ -160,6 +160,25 @@ Default: `true` Indicates whether [Google AMP pages](https://www.ampproject.org/) should be ignored and not be added to the sitemap. +### onAdd(url) + +Modify the URL before it's added to the sitemap. If the function returns a falsy value (for example `false` or `undefined`), the URL is ignored and will not be added to the sitemap. This is useful for crawling locally before hosting, e.g. to replace the local host with the production host. + +Type: `function` +Default: `null` + +Example: + +```JavaScript +// create generator +const generator = SitemapGenerator(host, { + onAdd: (url) => { + // Replace host in sitemap. + return url.replace(host, 'https://example.com') + } +}); +``` + ### lastMod Type: `boolean` diff --git a/src/index.js b/src/index.js index 43ee7c2..f4ca05a 100644 --- a/src/index.js +++ b/src/index.js @@ -29,7 +29,8 @@ module.exports = function SitemapGenerator(uri, opts) { changeFreq: '', priorityMap: [], ignoreAMP: true, - ignore: null + ignore: null, + onAdd: null }; if (!uri) { @@ -95,14 +96,15 @@ module.exports = function SitemapGenerator(uri, opts) { // fetch complete event crawler.on('fetchcomplete', (queueItem, page) => { - const { url, depth } = queueItem; + const { depth } = queueItem; + const url = opts.onAdd ? opts.onAdd(queueItem.url) : queueItem.url; if ( - (opts.ignore && opts.ignore(url)) || + (!url || opts.ignore && opts.ignore(url)) || /(]+noindex).*?>)/.test(page) || // check if robots noindex is present (options.ignoreAMP && /]+(amp|⚡)[^>]*>/.test(page)) // check if it's an amp page ) { - emitter.emit('ignore', url); + emitter.emit('ignore', queueItem.url); } else { emitter.emit('add', url);