Skip to content

Commit

Permalink
[dfns] Add HTML prose definition when possible
Browse files Browse the repository at this point in the history
Implements the logic discussed in https://github.com/w3c/respec/issues/4522

For each term defined in the specification being processed, the code now looks
for some element flagged with a `data-defines="#term-id"` attribute. If such
an element exists, a `prose` property gets added to the definition in the
`dfns` extract with the HTML contents of that element.

The code applies some cleanup to the HTML markup it attaches to the `prose`
property:
- All asides that authoring tools may add here and there get dropped
- Any element that is not a simple block or inline content element gets dropped
- All attributes are dropped

The cleanup logic may need refinement over time once we gain experience with
actual definitions. Open questions include:

- Should we be stricter, e.g., only allowing `<p>`, `<br>`, and very common
inline elements?
- Should we keep `href` attributes (with an absolute URL) for `<a>` elements?
- Should we keep `title` attributes for `<abbr>` elements?
- Should we keep `class` attributes for `<pre>` elements to help with syntax
highlighting?
- Should we keep tables? Images?

There is no good mechanism in Reffy to report potential issues encountered
during extraction for the time being. In the meantime, warnings get logged when
the code bumps into elements that seem surprising in the context of a term
definition.
  • Loading branch information
tidoust committed Dec 12, 2023
1 parent 923754a commit 68ccd42
Show file tree
Hide file tree
Showing 3 changed files with 128 additions and 1 deletion.
4 changes: 4 additions & 0 deletions schemas/browserlib/extract-dfns.json
Original file line number Diff line number Diff line change
Expand Up @@ -63,6 +63,10 @@
},
"definedIn": {
"type": "string"
},
"prose": {
"type": "string",
"minLength": 1
}
}
}
Expand Down
54 changes: 53 additions & 1 deletion src/browserlib/extract-dfns.mjs
Original file line number Diff line number Diff line change
Expand Up @@ -122,6 +122,47 @@ function isNotAlreadyExported(dfn, idx, list) {
return first === dfn;
}

// Produce a simplified HTML serialization of the element that holds the prose
// definition of a term. The given element is left untouched: all the pruning
// happens on a deep clone. The returned string is the clone's inner HTML,
// trimmed, once asides, unexpected elements and attributes have been dropped.
function getHtmlProseDefinition(proseEl) {
  const copy = proseEl.cloneNode(true);

  // Remove the first match of the selector in the copy, over and over, until
  // nothing matches anymore. The optional callback runs right before each
  // removal. Searching again after each removal means that descendants of a
  // removed element are never visited on their own.
  const pruneMatches = (selector, beforeRemoval) => {
    let match = copy.querySelector(selector);
    while (match) {
      if (beforeRemoval) {
        beforeRemoval(match);
      }
      match.remove();
      match = copy.querySelector(selector);
    }
  };

  // Asides that authoring tools add here and there
  const asides = [
    'aside', '.mdn-anno', '.wpt-tests-block', '.annotation',
    '[id^=dfn-panel-]'
  ];
  pruneMatches(asides.join(','));

  // Simple grouping content elements that are fine to keep...
  const groupingContent = [
    'blockquote', 'dd', 'div', 'dl', 'dt', 'figcaption', 'figure', 'hr', 'li',
    'ol', 'p', 'pre', 'ul'
  ];
  // ... along with text-level semantics elements
  const textLevel = [
    'a', 'abbr', 'b', 'bdi', 'bdo', 'br', 'cite', 'code', 'data', 'dfn', 'em',
    'i', 'kbd', 'mark', 'q', 'rp', 'rt', 'ruby', 's', 'samp', 'small', 'span',
    'strong', 'sub', 'sup', 'time', 'u', 'var', 'wbr'
  ];
  const allowed = [...groupingContent, ...textLevel].join(',');

  // Anything else means the content is more complex than anticipated. The
  // definition may be worth a look to assess whether the extraction logic
  // should become smarter. There is no better reporting mechanism for now,
  // so a warning gets logged before the element is dropped.
  pruneMatches(`:not(${allowed})`, unexpected => {
    console.warn('[reffy]', `Unexpected element "${unexpected.nodeName}" found in textual definition of "${copy.getAttribute('data-defines')}"`);
  });

  // Strip every attribute from the elements that remain
  for (const node of copy.querySelectorAll('*')) {
    for (const name of node.getAttributeNames()) {
      node.removeAttribute(name);
    }
  }

  return copy.innerHTML.trim();
}

function definitionMapper(el, idToHeading, usesDfnDataModel) {
let definedIn = 'prose';
const enclosingEl = el.closest('dt,pre,table,h1,h2,h3,h4,h5,h6,.note,.example') || el;
Expand Down Expand Up @@ -157,7 +198,7 @@ function definitionMapper(el, idToHeading, usesDfnDataModel) {
url.hash = '#' + encodeURIComponent(el.getAttribute('id'));
const href = url.toString();

return {
const dfn = {
// ID is the id attribute
// (ID may not be unique in a multi-page spec)
id: el.getAttribute('id'),
Expand Down Expand Up @@ -211,6 +252,17 @@ function definitionMapper(el, idToHeading, usesDfnDataModel) {
// indicates that definition appears in the main body of the specification)
definedIn
};

// Extract a prose definition in HTML for the term, if available
const proseEl = document.querySelector(`[data-defines="#${dfn.id}"]`);
if (proseEl) {
const prose = getHtmlProseDefinition(proseEl);
if (prose) {
dfn.prose = prose;
}
}

return dfn;
}

export default function (spec, idToHeading = {}) {
Expand Down
71 changes: 71 additions & 0 deletions tests/extract-dfns.js
Original file line number Diff line number Diff line change
Expand Up @@ -603,6 +603,77 @@ When initialize(<var>newItem</var>) is called, the following steps are run:</p>`
}],
spec: "CSS2"
},

{
title: "extracts the prose that defines a term",
html: `<p data-defines='#foo'>
<dfn id='foo' data-dfn-type='dfn'>Foo</dfn> enters a bar.
</p>`,
changesToBaseDfn: [{
prose: "<dfn>Foo</dfn> enters a bar."
}]
},

{
title: "keeps basic structure for the prose that defines a term",
html: `<div data-defines='#foo'>
<p><dfn id='foo' data-dfn-type='dfn'>Foo</dfn> <i>enters</i> a <b>bar</b>.
<br>The bar has <strong>2 baz</strong> on tap:</p>
<ul>
<li>Baz<sub>1</sub></li>
<li>Baz<sup>2</sup></li>
</ul>
<pre>Foo bar baz</pre>
</div>`,
changesToBaseDfn: [{
prose: `<p><dfn>Foo</dfn> <i>enters</i> a <b>bar</b>.
<br>The bar has <strong>2 baz</strong> on tap:</p>
<ul>
<li>Baz<sub>1</sub></li>
<li>Baz<sup>2</sup></li>
</ul>
<pre>Foo bar baz</pre>`
}]
},

{
title: "extracts prose that defines a term without attributes",
html: `<p data-defines='#foo'>
<dfn id='foo' data-dfn-type='dfn'>Foo</dfn> <i class="verb">enters</i> a <a href="#bar">bar</a>.
</p>`,
changesToBaseDfn: [{
prose: "<dfn>Foo</dfn> <i>enters</i> a <a>bar</a>."
}]
},

{
title: "suppresses asides from the prose that defines a term",
html: `<div data-defines='#foo'>
<dfn id='foo' data-dfn-type='dfn'>Foo</dfn> enters a bar.
<aside><p>I'm an aside</p></aside>
<p class='mdn-anno'>So am I</p>
<span class='wpt-tests-block'>Lots of tests</span>
<span class='annotation'>And annotations</span>
<div id='dfn-panel-foo'>A list of references</div>
</div>`,
changesToBaseDfn: [{
prose: "<dfn>Foo</dfn> enters a bar."
}]
},

{
title: "suppresses more complex structure from the prose that defines a term",
html: `<div data-defines='#foo'>
<dfn id='foo' data-dfn-type='dfn'>Foo</dfn> <i class="verb">enters</i> a <a href="#bar">bar</a>.
<section>
<h4>An inner section</h4>
</section>
<img src="bar.png" alt="A bar">
</div>`,
changesToBaseDfn: [{
prose: "<dfn>Foo</dfn> <i>enters</i> a <a>bar</a>."
}]
}
];

describe("Test definition extraction", function () {
Expand Down

0 comments on commit 68ccd42

Please sign in to comment.