forked from simplecrawler/simplecrawler
-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathcache-backend-fs.js
232 lines (193 loc) · 6.68 KB
/
cache-backend-fs.js
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
// Simplecrawler - FS cache backend
// Tries to ensure a local 'cache' of a website is as close as possible to a mirror of the website itself.
// The idea is that it is then possible to re-serve the website just using the cache.
var fs = require("fs");
var crypto = require("crypto");
// Filesystem cache backend constructor.
// `loadParameter` names the cache directory. Anything other than a non-empty
// string falls back to "<current working directory>/cache/". The stored
// location always ends with a "/" so file names can be appended directly.
var backend = function backend(loadParameter) {
    var hasCustomLocation = typeof loadParameter === "string" && loadParameter.length > 0;
    var cacheRoot = hasCustomLocation ? loadParameter : process.cwd() + "/cache/";

    // The constructor only sets up in-memory state; the index starts empty.
    this.loaded = false;
    this.index = [];

    if (cacheRoot.charAt(cacheRoot.length - 1) !== "/") {
        cacheRoot += "/";
    }
    this.location = cacheRoot;
};
// Function for sanitising paths
// We try to get the most understandable, file-system friendly paths we can.
// An extension is added if not present or inappropriate - if a better one can be determined.
// Querystrings are hashed to truncate without (hopefully) collision.
//
// path        - URL path of the resource (may include a querystring).
// queueObject - queue item; `stateData.headers["content-type"]` is consulted
//               when choosing an extension. Missing headers are tolerated.
// Returns the sanitised relative path (no leading slash).
function sanitisePath(path,queueObject) {
    var headers = (queueObject && queueObject.stateData && queueObject.stateData.headers) || {};
    var contentType = headers["content-type"];
    var isHTML = Boolean(contentType && contentType.match(/text\/html/i));

    // Remove first slash (as we set one later.)
    var relativePath = path.replace(/^\//,"");

    // Trim trailing whitespace. If no path is present - assume index.html.
    var sanitisedPath = relativePath.length ? relativePath.replace(/\s*$/ig,"") : "index.html";

    // Replace the querystring with its SHA-1 digest so the name stays bounded.
    if (sanitisedPath.match(/\?/)) {
        var sanitisedPathParts = sanitisedPath.split(/\?/g);
        var resource = sanitisedPathParts.shift();
        var hashedQS = crypto.createHash("sha1").update(sanitisedPathParts.join("?")).digest("hex");
        sanitisedPath = resource + "?" + hashedQS;
    }

    // Any single path segment of 250 characters or more is replaced by its hash.
    sanitisedPath = sanitisedPath.split(/\//g).map(function(pathChunk) {
        if (pathChunk.length >= 250) {
            return crypto.createHash("sha1").update(pathChunk).digest("hex");
        }
        return pathChunk;
    }).join("/");

    // Try to get a file extension for the file - for ease of identification
    // We run through this if we either:
    // 1) haven't got a file extension at all, or:
    // 2) have an HTML file without an HTML file extension (might be .php, .aspx, .do, or some other server-processed type)
    var hasExtension = Boolean(sanitisedPath.match(/\.[a-z0-9]{1,6}$/i));
    var hasHTMLExtension = Boolean(sanitisedPath.match(/\.htm[l]?$/i));

    if (!hasExtension || (isHTML && !hasHTMLExtension)) {
        if (isHTML) {
            // A trailing slash denotes a directory listing - store it as its index page.
            sanitisedPath += sanitisedPath.match(/\/$/) ? "index.html" : ".html";
        } else {
            var mimeParts = contentType && contentType.match(/(image|video|audio|application)\/([a-z0-9]+)/i);
            if (mimeParts) {
                // Use the MIME subtype (e.g. "png") as the extension.
                sanitisedPath += "." + mimeParts[2];
            }
        }
    }

    return sanitisedPath;
}
// Reports whether `location` can be stat'ed.
// Returns true when fs.statSync succeeds and false when it throws for any reason.
backend.prototype.fileExists = function(location) {
    var found = true;

    try {
        fs.statSync(location);
    } catch (statError) {
        found = false;
    }

    return found;
};
// Reports whether `location` is a directory.
// Returns false when it is some other kind of entry, or when fs.statSync
// throws (for example because the path does not exist).
backend.prototype.isDirectory = function(location) {
    try {
        var stats = fs.statSync(location);
        return Boolean(stats.isDirectory());
    } catch (statError) {
        return false;
    }
};
// Loads the cache index ("cacheindex.json" inside the cache location) into
// memory and registers a process 'exit' listener that writes it back out.
//
// Throws if the cache location does not exist or is not a directory, or if
// the index file exists but cannot be read or parsed. A missing or empty
// index file is treated as a brand new cache.
backend.prototype.load = function() {
    var backend = this;

    // isDirectory() is also false for paths that do not exist, so this single
    // check covers both "missing" and "not a directory".
    if (!this.isDirectory(this.location)) {
        throw new Error("Unable to verify cache location exists.");
    }

    try {
        var fileData = fs.readFileSync(this.location + "cacheindex.json");
        if (fileData.length) {
            this.index = JSON.parse(fileData.toString("utf8"));
        }
    } catch(error) {
        // ENOENT: cache index doesn't exist. Assume this is a new cache and
        // just leave the memory index empty for now. Anything else is fatal.
        if (error.code !== "ENOENT") {
            throw error;
        }
    }

    this.loaded = true;

    // Flush store to disk when closing.
    // 'exit' listeners may only perform synchronous work - an asynchronous
    // write queued here would be abandoned - so the index is written with
    // writeFileSync rather than through the asynchronous flushToDisk().
    process.on("exit",function() {
        try {
            fs.writeFileSync(backend.location + "cacheindex.json", JSON.stringify(backend.index));
        } catch (flushError) {
            // Best effort: report the failure without altering the exit.
            console.error("Unable to flush cache index to disk: " + flushError.message);
        }
    });
};
// Asynchronously writes the in-memory index as JSON to "cacheindex.json"
// inside the cache location.
//
// callback - optional function(error), invoked when the write finishes.
//            When omitted, a write failure is thrown instead.
backend.prototype.flushToDisk = function(callback) {
    // fs.writeFile rejects a non-function callback on current Node releases,
    // so substitute one that surfaces errors rather than passing undefined.
    callback = callback instanceof Function ? callback : function(error) {
        if (error) throw error;
    };

    fs.writeFile(this.location + "cacheindex.json", JSON.stringify(this.index), callback);
};
// Stores a fetched resource in the cache.
//
// The data is written to <location>/<protocol>/<domain>/<port>/<sanitised path>,
// creating intermediate directories as needed, and the queue item itself is
// written alongside it as "<data file>.cacheData.json". The in-memory index
// entry for the URL is then replaced (or appended if there was none).
//
// queueObject - queue item; protocol, domain, port, path, url and
//               stateData.headers are read from it.
// data        - resource body handed to fs.writeFile.
// callback    - optional function(cacheObject), called once both files are
//               written and the index has been updated.
//
// Throws synchronously if an intermediate path segment already exists as a
// regular file. Write failures are thrown from inside the fs callbacks.
backend.prototype.setItem = function(queueObject,data,callback) {
    callback = callback instanceof Function ? callback : function(){};

    var backend = this;
    var pathStack = [queueObject.protocol, queueObject.domain, queueObject.port];
    pathStack = pathStack.concat(sanitisePath(queueObject.path,queueObject).split(/\/+/g));

    // Position of the first index entry for this URL, or -1 if there is none.
    var findIndexEntry = function() {
        for (var position = 0; position < backend.index.length; position++) {
            if (backend.index[position].url === queueObject.url) {
                return position;
            }
        }
        return -1;
    };

    var writeFileData = function(currentPath,data) {
        fs.writeFile(currentPath,data,function(error) {
            if (error) throw error;

            fs.writeFile(currentPath + ".cacheData.json",JSON.stringify(queueObject),function(error) {
                if (error) throw error;

                var cacheObject = {
                    url: queueObject.url,
                    etag: queueObject.stateData.headers['etag'],
                    lastModified: queueObject.stateData.headers['last-modified'],
                    dataFile: currentPath,
                    metaFile: currentPath + ".cacheData.json"
                };

                // Look the entry up only now, so the index reflects anything
                // stored while the writes were in flight.
                var existingPosition = findIndexEntry();
                if (existingPosition !== -1) {
                    backend.index[existingPosition] = cacheObject;
                } else {
                    backend.index.push(cacheObject);
                }

                callback(cacheObject);
            });
        });
    };

    pathStack.forEach(function(pathChunk,count) {
        var currentPath = backend.location + pathStack.slice(0,count+1).join("/");
        var isLastChunk = count === pathStack.length - 1;

        if (!backend.fileExists(currentPath)) {
            if (isLastChunk) {
                // Write the file data in
                writeFileData(currentPath,data);
            } else {
                fs.mkdirSync(currentPath);
            }
            return;
        }

        if (backend.isDirectory(currentPath)) {
            // Existing directory: nothing to create.
            // NOTE(review): when this is the final segment nothing is written
            // and the callback is never invoked - behaviour kept as-is.
            return;
        }

        if (!isLastChunk) {
            throw new Error("Cache storage of resource (" + queueObject.url + ") blocked by file: " + currentPath);
        }

        // Just overwrite the file...
        writeFileData(currentPath,data);
    });
};
// Looks up the cached entry for queueObject.url in the in-memory index.
//
// callback receives null when there is no entry, otherwise an object with:
//   url, etag, lastModified - values recorded in the index
//   getData(cb)             - reads the data file; cb(error) or cb(null, buffer)
//   getMetadata(cb)         - reads and JSON-parses the metadata file;
//                             cb(error) or cb(null, object)
// Always returns false.
backend.prototype.getItem = function(queueObject,callback) {
    var cacheItem = null;

    // First matching entry wins.
    for (var position = 0; position < this.index.length; position++) {
        if (this.index[position].url === queueObject.url) {
            cacheItem = this.index[position];
            break;
        }
    }

    if (!cacheItem) {
        callback(null);
        return false;
    }

    callback({
        "url": cacheItem.url,
        "etag": cacheItem.etag,
        "lastModified": cacheItem.lastModified,
        "getData": function(callback) {
            fs.readFile(cacheItem.dataFile,function(error,data) {
                if (error) {
                    callback(error);
                    return false;
                }

                callback(null,data);
            });
        },
        "getMetadata": function(callback) {
            fs.readFile(cacheItem.metaFile,function(error,data) {
                if (error) {
                    callback(error);
                    return false;
                }

                // Parse separately from the callback invocation: a corrupt
                // metadata file is reported through the callback, while
                // exceptions raised by the callback itself are not caught here.
                var metadata;
                try {
                    metadata = JSON.parse(data.toString("utf8"));
                } catch (parseError) {
                    callback(parseError);
                    return false;
                }

                callback(null,metadata);
            });
        }
    });

    return false;
};
// Expose the filesystem cache backend constructor as `backend` on this module's exports.
exports.backend = backend;