-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathindex.js
260 lines (227 loc) · 7.42 KB
/
index.js
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
const debug = require('debug')('w3c-xml-validator')
const htmlParser = require('node-html-parser')
const request = require('got')
/**
 * Ensures the value handed to this library can be submitted for validation.
 *
 * Both strings and buffers are accepted, as long as they are not empty.
 *
 * @param  {any} input  The value to check.
 *
 * @return {undefined}
 *
 * @throws {Error}  If the value is neither a string nor a buffer, or if it
 *                  has a length of zero.
 */
function validateInput (input) {
  const isBuffer = Buffer.isBuffer(input)

  // The type test runs first, so `input.length` is only read from a string or
  // a buffer. An empty buffer is rejected just like an empty string.
  if ((!isBuffer && typeof input !== 'string') || input.length === 0) {
    throw new Error('The XML input is required and must be a non-empty string value (or buffer).')
  }

  debug(isBuffer ? 'input value is a buffer' : 'input value is a non-zero length string')
}
/**
 * Builds the "multipart/form-data" request body that is submitted to W3C,
 * together with the boundary value that separates its parts.
 *
 * @param  {Buffer|String} xml  The XML to submit to W3C. It must reference a
 *                              publicly-available DTD.
 *
 * @return {Object}  A dictionary with the keys `boundary` (the boundary value,
 *                   without the leading hyphens) and `data` (the assembled
 *                   request body, with lines separated by CRLF).
 */
function createPayloadForW3C (xml) {
  /**
   * The multipart/form-data boundary value (without the preceding hyphen
   * characters). The current timestamp is used as its suffix.
   * See https://www.rfc-editor.org/rfc/rfc2046#section-5.1 for more info.
   * @type {String}
   */
  const boundary = `W3CFormBoundary_${new Date().getTime()}`

  /**
   * The form fields (and their values), in the order they are written to the
   * request body. This set was determined by inspecting a request submitted
   * manually via the W3C web site.
   * @type {Array}
   */
  const fields = [
    ['fragment', xml],
    ['prefill', '0'],
    ['doctype', 'Inline'],
    ['prefill_doctype', 'html401'],
    ['group', '0']
  ]

  // The body opens with an empty line, so the joined output starts with a
  // line break ahead of the first boundary.
  const lines = ['']

  debug('creating payload for multipart HTTP request...')

  for (const [name, value] of fields) {
    debug(` - adding value for "${name}"`)
    lines.push(
      `--${boundary}`,
      `Content-Disposition: form-data; name="${name}"`,
      '',
      value
    )
  }

  // The closing delimiter carries two extra trailing hyphens.
  lines.push(`--${boundary}--`)
  debug(`payload is comprised of ${lines.length} lines`)

  return {
    boundary,
    data: lines.join('\r\n')
  }
}
/**
 * Sends the assembled form data to the W3C validator with an HTTP POST and
 * returns the pending response.
 *
 * @param  {Object} payload  A dictionary containing the keys `boundary`
 *                           (the multipart boundary value, sans hyphens) and
 *                           `data` (the assembled multipart request body).
 *
 * @return {Promise}  Whatever the HTTP client (`request`) returns for the call.
 */
function submitRequestToW3C (payload) {
  /**
   * This is the web address that the form data will be submitted to.
   * @type {String}
   */
  const w3cValidatorUrl = 'https://validator.w3.org/check'

  debug('submitting request to "%s"...', w3cValidatorUrl)

  const headers = {
    // The boundary announced here must match the one used inside the body.
    'content-type': `multipart/form-data; boundary=${payload.boundary}`,
    'user-agent': 'w3c-xml-validator'
  }

  const options = {
    url: w3cValidatorUrl,
    method: 'POST',
    headers: headers,
    body: payload.data,
    // Error status codes should not reject the promise (see the HTTP client's
    // documentation for this option).
    throwHttpErrors: false
  }

  return request(options)
}
/**
 * Parses the returned HTML document to determine which DTD was identified by
 * W3C.
 *
 * The DTD is taken to be the last word of the sentence found in the results
 * container, once the whitespace has been normalized and the trailing
 * exclamation mark has been removed.
 *
 * @param  {Object} htmlDocument  The HTML returned from W3C, represented as a
 *                                basic DOM.
 *
 * @return {String}  The identified DTD, or an empty string when the expected
 *                   element could not be found in the document.
 */
function getDoctypeFromResponse (htmlDocument) {
  const container = htmlDocument.querySelector('#results_container')

  /**
   * The DOM element containing the DTD (as text). Its position (fifth child
   * node of the container) was determined by manually inspecting a returned
   * HTML document.
   * @type {Object}
   */
  const resultsElem = (container == null || container.childNodes == null)
    ? undefined
    : container.childNodes[4]

  const textNode = (resultsElem == null || resultsElem.childNodes == null)
    ? undefined
    : resultsElem.childNodes[0]

  // A page with an unexpected layout must not crash the caller with an opaque
  // TypeError; report "unknown" instead.
  if (textNode == null || typeof textNode.rawText !== 'string') {
    debug('when parsing the returned HTML for the doctype, the results element could not be found')
    return ''
  }

  // Collapse extra whitespace, drop any leading/trailing whitespace (so that
  // the punctuation mark really is the last character), and then remove the
  // trailing punctuation mark.
  const sentence = textNode.rawText
    .replace(/\s+/g, ' ')
    .trim()
    .replace(/!$/, '')

  // Return the last word in the sentence.
  return sentence.substring(sentence.lastIndexOf(' ') + 1)
}
/**
 * Parses the returned HTML document to determine which warnings were
 * identified by W3C.
 *
 * @param  {Object} htmlDocument  The HTML returned from W3C, represented as a
 *                                basic DOM.
 *
 * @return {Array}  List of strings (each of which describes a single warning).
 *                  The list is empty when the document has no warnings list.
 */
function getWarningsFromResponse (htmlDocument) {
  /**
   * Warnings and errors are contained in separate OL elements on the page.
   * Fortunately, each list has a unique identifier.
   */
  const warningsElem = htmlDocument.querySelector('#warnings')

  // Same handling as for the errors list: a missing list means "nothing to
  // report", not a crash.
  if (warningsElem == null) {
    debug('when parsing the returned HTML for warning messages, the parent container could not be found')
    return []
  }

  return warningsElem.childNodes
    // Keep the element nodes only (node type 1). For the list of node type
    // values, see https://github.com/taoqf/node-html-parser/blob/8f4cedfb0ac1b58da4f72af2f8bb01123c119df4/src/nodes/type.ts
    .filter((child) => child.nodeType === 1)
    // The message is read from the third child node of each item's first
    // child node.
    .map((child) => child.childNodes[0].childNodes[2].rawText)
}
/**
 * Extracts the validation errors reported by W3C from the returned HTML
 * document.
 *
 * @param  {Object} htmlDocument  The HTML returned from W3C, represented as a
 *                                basic DOM.
 *
 * @return {Array}  List of strings (each of which describes a single error,
 *                  in the form "<location>: <message>"). The list is empty
 *                  when the errors container is not part of the document.
 */
function getErrorsFromResponse (htmlDocument) {
  // Only element nodes (node type 1) flagged with the "msg_err" class
  // describe an error.
  const isErrorEntry = (node) => {
    if (node.nodeType !== 1) {
      return false
    }

    return node.classNames.indexOf('msg_err') !== -1
  }

  // The location is read from the fourth child node (keeping only the text
  // ahead of the first comma) and the message from the sixth child node.
  const describeError = (node) => {
    const location = node.childNodes[3].text
    const message = node.childNodes[5].text

    return `${location.substring(0, location.indexOf(','))}: ${message}`
  }

  /**
   * Warnings and errors live in separate lists on the page, and each list has
   * a unique identifier.
   */
  const container = htmlDocument.querySelector('#error_loop')

  if (container === null || container === undefined) {
    debug('when parsing the returned HTML for error messages, the parent container could not be found')
    return []
  }

  return container.childNodes.filter(isErrorEntry).map(describeError)
}
/**
 * The exported function (entry point for this module). It checks the input,
 * submits it to W3C, and summarizes the returned HTML report.
 *
 * @param  {String} input  The XML to validate. It must reference a publicly-
 *                         available DTD.
 *
 * @return {Promise}  Resolves to an object with the keys `doctype`, `errors`,
 *                    `isValid` and `warnings`. Rejects when the input is not
 *                    acceptable or when W3C replies with a status code other
 *                    than 200.
 */
async function exported (input) {
  validateInput(input)

  const response = await submitRequestToW3C(createPayloadForW3C(input))
  const elapsedSeconds = response.timings.phases.total / 1000
  const statusCode = response.statusCode

  debug(' - recevied response in %d sec', elapsedSeconds)
  debug(' - status: %d', statusCode)
  debug(' - headers: %o', response.headers)

  /**
   * Anything other than a 200 indicates a problem with the underlying HTTP
   * transmission, so there is nothing to parse.
   *
   * NOTE: this is unrelated to whether or not the submitted XML is valid.
   */
  if (statusCode !== 200) {
    throw new Error(`The W3C server replied with a ${statusCode} status code.`)
  }

  /**
   * The returned HTML, represented as a basic DOM, which makes it easier to
   * locate the data of interest.
   * @type {Object}
   */
  const htmlDOM = htmlParser.parse(response.body)

  // "Valid XML" for this library means "no reported errors".
  const errors = getErrorsFromResponse(htmlDOM)
  const doctype = getDoctypeFromResponse(htmlDOM)
  const warnings = getWarningsFromResponse(htmlDOM)

  return {
    doctype,
    errors,
    isValid: errors.length === 0,
    warnings
  }
}
// The validator function itself is the module's export (CommonJS).
module.exports = exported