From 7f73b3b836ae75bb40c3ce1bff46c1ac913a2cae Mon Sep 17 00:00:00 2001 From: Demi Obenour Date: Mon, 24 Jul 2017 23:49:29 -0400 Subject: [PATCH] Non-recursive tree deletion This adds an iterator over trees, along with a non-recursive deallocation algorithm. --- src/gumbo.h | 12 +++++ src/parser.c | 149 ++++++++++++++++++++++++++++++++++----------------- 2 files changed, 111 insertions(+), 50 deletions(-) diff --git a/src/gumbo.h b/src/gumbo.h index f45a583e..d0a4e121 100644 --- a/src/gumbo.h +++ b/src/gumbo.h @@ -539,6 +539,18 @@ struct GumboInternalNode { } v; }; +/** + * The type for a tree traversal callback. + */ +typedef size_t (*gumbo_tree_iter_callback)(void* userdata, GumboNode* node); + +/** + * Call `cb` for each child of `node` and for `node` itself, passing it + * the node and `userdata` as arguments. + */ +size_t gumbo_tree_traverse( + GumboNode* node, void* userdata, gumbo_tree_iter_callback cb); + /** * The type for an allocator function. Takes the 'userdata' member of the * GumboParser struct as its first argument. 
Semantics should be the same as diff --git a/src/parser.c b/src/parser.c index dc692b3e..09642242 100644 --- a/src/parser.c +++ b/src/parser.c @@ -32,7 +32,7 @@ #include "util.h" #include "vector.h" -#define AVOID_UNUSED_VARIABLE_WARNING(i) (void)(i) +#define AVOID_UNUSED_VARIABLE_WARNING(i) (void) (i) #define GUMBO_STRING(literal) \ { literal, sizeof(literal) - 1 } @@ -124,12 +124,10 @@ static const GumboStringPiece kQuirksModePublicIdPrefixes[] = { GUMBO_STRING("-//O'Reilly and Associates//DTD HTML 2.0//"), GUMBO_STRING("-//O'Reilly and Associates//DTD HTML Extended 1.0//"), GUMBO_STRING("-//O'Reilly and Associates//DTD HTML Extended Relaxed 1.0//"), - GUMBO_STRING( - "-//SoftQuad Software//DTD HoTMetaL PRO 6.0::19990601::)" - "extensions to HTML 4.0//"), - GUMBO_STRING( - "-//SoftQuad//DTD HoTMetaL PRO 4.0::19971010::" - "extensions to HTML 4.0//"), + GUMBO_STRING("-//SoftQuad Software//DTD HoTMetaL PRO 6.0::19990601::)" + "extensions to HTML 4.0//"), + GUMBO_STRING("-//SoftQuad//DTD HoTMetaL PRO 4.0::19971010::" + "extensions to HTML 4.0//"), GUMBO_STRING("-//Spyglass//DTD HTML 2.0 Extended//"), GUMBO_STRING("-//SQ//DTD HTML 2.0 HoTMetaL + extensions//"), GUMBO_STRING("-//Sun Microsystems Corp.//DTD HotJava HTML//"), @@ -220,7 +218,8 @@ static const ReplacementEntry kSvgAttributeReplacements[] = { REPLACEMENT_ENTRY("preservealpha", "preserveAlpha"), REPLACEMENT_ENTRY("preserveaspectratio", "preserveAspectRatio"), REPLACEMENT_ENTRY("primitiveunits", "primitiveUnits"), - REPLACEMENT_ENTRY("refx", "refX"), REPLACEMENT_ENTRY("refy", "refY"), + REPLACEMENT_ENTRY("refx", "refX"), + REPLACEMENT_ENTRY("refy", "refY"), REPLACEMENT_ENTRY("repeatcount", "repeatCount"), REPLACEMENT_ENTRY("repeatdur", "repeatDur"), REPLACEMENT_ENTRY("requiredextensions", "requiredExtensions"), @@ -573,9 +572,9 @@ static GumboInsertionMode get_appropriate_insertion_mode( assert(node->type == GUMBO_NODE_ELEMENT || node->type == GUMBO_NODE_TEMPLATE); if (node->v.element.tag_namespace != 
GUMBO_NAMESPACE_HTML) - return is_last ? - GUMBO_INSERTION_MODE_IN_BODY : GUMBO_INSERTION_MODE_INITIAL; - + return is_last ? GUMBO_INSERTION_MODE_IN_BODY + : GUMBO_INSERTION_MODE_INITIAL; + switch (node->v.element.tag) { case GUMBO_TAG_SELECT: { if (is_last) { @@ -972,7 +971,7 @@ static void append_comment_node( // http://www.whatwg.org/specs/web-apps/current-work/complete/tokenization.html#clear-the-stack-back-to-a-table-row-context static void clear_stack_to_table_row_context(GumboParser* parser) { while (!node_tag_in_set(get_current_node(parser), - (gumbo_tagset){TAG(HTML), TAG(TR), TAG(TEMPLATE)})) { + (gumbo_tagset){TAG(HTML), TAG(TR), TAG(TEMPLATE)})) { pop_current_node(parser); } } @@ -980,16 +979,16 @@ static void clear_stack_to_table_row_context(GumboParser* parser) { // http://www.whatwg.org/specs/web-apps/current-work/complete/tokenization.html#clear-the-stack-back-to-a-table-context static void clear_stack_to_table_context(GumboParser* parser) { while (!node_tag_in_set(get_current_node(parser), - (gumbo_tagset){TAG(HTML), TAG(TABLE), TAG(TEMPLATE)})) { + (gumbo_tagset){TAG(HTML), TAG(TABLE), TAG(TEMPLATE)})) { pop_current_node(parser); } } // http://www.whatwg.org/specs/web-apps/current-work/complete/tokenization.html#clear-the-stack-back-to-a-table-body-context void clear_stack_to_table_body_context(GumboParser* parser) { - while (!node_tag_in_set(get_current_node(parser), - (gumbo_tagset){TAG(HTML), TAG(TBODY), TAG(TFOOT), TAG(THEAD), - TAG(TEMPLATE)})) { + while (!node_tag_in_set( + get_current_node(parser), (gumbo_tagset){TAG(HTML), TAG(TBODY), + TAG(TFOOT), TAG(THEAD), TAG(TEMPLATE)})) { pop_current_node(parser); } } @@ -1486,12 +1485,12 @@ static void generate_implied_end_tags(GumboParser* parser, GumboTag exception) { // This is the "generate all implied end tags thoroughly" clause of the spec. 
// https://html.spec.whatwg.org/multipage/syntax.html#closing-elements-that-have-implied-end-tags static void generate_all_implied_end_tags_thoroughly(GumboParser* parser) { - for ( - ; node_tag_in_set(get_current_node(parser), - (gumbo_tagset){TAG(CAPTION), TAG(COLGROUP), TAG(DD), TAG(DT), TAG(LI), - TAG(OPTION), TAG(OPTGROUP), TAG(P), TAG(RP), TAG(RT), TAG(RTC), - TAG(TBODY), TAG(TD), TAG(TFOOT), TAG(TH), TAG(HEAD), TAG(TR)}); - pop_current_node(parser)) + for (; node_tag_in_set(get_current_node(parser), + (gumbo_tagset){TAG(CAPTION), TAG(COLGROUP), TAG(DD), TAG(DT), + TAG(LI), TAG(OPTION), TAG(OPTGROUP), TAG(P), TAG(RP), TAG(RT), + TAG(RTC), TAG(TBODY), TAG(TD), TAG(TFOOT), TAG(TH), TAG(HEAD), + TAG(TR)}); + pop_current_node(parser)) ; } @@ -1958,7 +1957,8 @@ static bool adoption_agency_algorithm( if (last_node == furthest_block) { bookmark = formatting_index + 1; gumbo_debug("Bookmark moved to %d.\n", bookmark); - assert((unsigned int) bookmark <= state->_active_formatting_elements.length); + assert((unsigned int) bookmark <= + state->_active_formatting_elements.length); } // Step 13.9. 
last_node->parse_flags |= GUMBO_INSERTION_ADOPTION_AGENCY_MOVED; @@ -2018,7 +2018,8 @@ static bool adoption_agency_algorithm( gumbo_vector_remove_at( parser, formatting_node_index, &state->_active_formatting_elements); assert(bookmark >= 0); - assert((unsigned int) bookmark <= state->_active_formatting_elements.length); + assert( + (unsigned int) bookmark <= state->_active_formatting_elements.length); gumbo_vector_insert_at(parser, new_formatting_node, bookmark, &state->_active_formatting_elements); @@ -2349,13 +2350,55 @@ static bool handle_after_head(GumboParser* parser, GumboToken* token) { } } -static void destroy_node(GumboParser* parser, GumboNode* node) { +size_t gumbo_tree_traverse( + GumboNode* node, void* userdata, gumbo_tree_iter_callback cb) { + GumboNode* current_node = node; + size_t offset = 0, retcode = 0; +tailcall: +#define RECURSE \ + do { \ + offset = current_node->index_within_parent + 1; \ + GumboNode* next_node = current_node->parent; \ + if ((retcode = cb(userdata, current_node))) return retcode; \ + if (current_node == node) return 0; \ + current_node = next_node; \ + goto tailcall; \ + } while (0) + switch (current_node->type) { + case GUMBO_NODE_DOCUMENT: + case GUMBO_NODE_TEMPLATE: + case GUMBO_NODE_ELEMENT: { + GumboVector* children = GUMBO_NODE_DOCUMENT == current_node->type + ? 
&current_node->v.document.children + : &current_node->v.element.children; + if (offset >= children->length) { + assert(offset == children->length); + RECURSE; + } else { + current_node = children->data[offset]; + offset = 0; + goto tailcall; + } + } + case GUMBO_NODE_TEXT: + case GUMBO_NODE_CDATA: + case GUMBO_NODE_COMMENT: + case GUMBO_NODE_WHITESPACE: { + assert(0 == offset); + RECURSE; + } + default: + assert(!"Invalid GumboNodeType!"); + abort(); + } +#undef RECURSE +} + +static size_t destroy_one_node(void* parser_, GumboNode* node) { + GumboParser* parser = (GumboParser*) parser_; switch (node->type) { case GUMBO_NODE_DOCUMENT: { GumboDocument* doc = &node->v.document; - for (unsigned int i = 0; i < doc->children.length; ++i) { - destroy_node(parser, doc->children.data[i]); - } gumbo_parser_deallocate(parser, (void*) doc->children.data); gumbo_parser_deallocate(parser, (void*) doc->name); gumbo_parser_deallocate(parser, (void*) doc->public_identifier); @@ -2367,9 +2410,6 @@ static void destroy_node(GumboParser* parser, GumboNode* node) { gumbo_destroy_attribute(parser, node->v.element.attributes.data[i]); } gumbo_parser_deallocate(parser, node->v.element.attributes.data); - for (unsigned int i = 0; i < node->v.element.children.length; ++i) { - destroy_node(parser, node->v.element.children.data[i]); - } gumbo_parser_deallocate(parser, node->v.element.children.data); break; case GUMBO_NODE_TEXT: @@ -2380,6 +2420,11 @@ static void destroy_node(GumboParser* parser, GumboNode* node) { break; } gumbo_parser_deallocate(parser, node); + return 0; +} + +static void destroy_node(GumboParser* parser, GumboNode* node) { + gumbo_tree_traverse(node, parser, &destroy_one_node); } // http://www.whatwg.org/specs/web-apps/current-work/complete/tokenization.html#parsing-main-inbody @@ -2524,8 +2569,9 @@ static bool handle_in_body(GumboParser* parser, GumboToken* token) { bool result = maybe_implicitly_close_p_tag(parser, token); insert_element_from_token(parser, token); return result; - }
else if (tag_in(token, kStartTag, (gumbo_tagset){TAG(H1), TAG(H2), TAG(H3), - TAG(H4), TAG(H5), TAG(H6)})) { + } else if (tag_in(token, kStartTag, + (gumbo_tagset){ + TAG(H1), TAG(H2), TAG(H3), TAG(H4), TAG(H5), TAG(H6)})) { bool result = maybe_implicitly_close_p_tag(parser, token); if (node_tag_in_set( get_current_node(parser), (gumbo_tagset){TAG(H1), TAG(H2), TAG(H3), @@ -2669,11 +2715,12 @@ static bool handle_in_body(GumboParser* parser, GumboToken* token) { } return implicitly_close_tags( parser, token, GUMBO_NAMESPACE_HTML, token_tag); - } else if (tag_in(token, kEndTag, (gumbo_tagset){TAG(H1), TAG(H2), TAG(H3), - TAG(H4), TAG(H5), TAG(H6)})) { - if (!has_an_element_in_scope_with_tagname( - parser, 6, (GumboTag[]){GUMBO_TAG_H1, GUMBO_TAG_H2, GUMBO_TAG_H3, - GUMBO_TAG_H4, GUMBO_TAG_H5, GUMBO_TAG_H6})) { + } else if (tag_in(token, kEndTag, + (gumbo_tagset){ + TAG(H1), TAG(H2), TAG(H3), TAG(H4), TAG(H5), TAG(H6)})) { + if (!has_an_element_in_scope_with_tagname(parser, 6, + (GumboTag[]){GUMBO_TAG_H1, GUMBO_TAG_H2, GUMBO_TAG_H3, GUMBO_TAG_H4, + GUMBO_TAG_H5, GUMBO_TAG_H6})) { // No heading open; ignore the token entirely. 
parser_add_parse_error(parser, token); ignore_token(parser); @@ -2692,8 +2739,8 @@ static bool handle_in_body(GumboParser* parser, GumboToken* token) { do { current_node = pop_current_node(parser); } while (!node_tag_in_set( - current_node, (gumbo_tagset){TAG(H1), TAG(H2), TAG(H3), - TAG(H4), TAG(H5), TAG(H6)})); + current_node, (gumbo_tagset){TAG(H1), TAG(H2), TAG(H3), TAG(H4), + TAG(H5), TAG(H6)})); return success; } } else if (tag_is(token, kStartTag, GUMBO_TAG_A)) { @@ -3433,13 +3480,15 @@ static bool handle_in_cell(GumboParser* parser, GumboToken* token) { } parser->_parser_state->_reprocess_current_token = true; return close_current_cell(parser, token); - } else if (tag_in(token, kEndTag, (gumbo_tagset){TAG(BODY), TAG(CAPTION), - TAG(COL), TAG(COLGROUP), TAG(HTML)})) { + } else if (tag_in(token, kEndTag, + (gumbo_tagset){TAG(BODY), TAG(CAPTION), TAG(COL), + TAG(COLGROUP), TAG(HTML)})) { parser_add_parse_error(parser, token); ignore_token(parser); return false; - } else if (tag_in(token, kEndTag, (gumbo_tagset){TAG(TABLE), TAG(TBODY), - TAG(TFOOT), TAG(THEAD), TAG(TR)})) { + } else if (tag_in(token, kEndTag, + (gumbo_tagset){TAG(TABLE), TAG(TBODY), TAG(TFOOT), TAG(THEAD), + TAG(TR)})) { if (!has_an_element_in_table_scope(parser, token->v.end_tag)) { parser_add_parse_error(parser, token); ignore_token(parser); @@ -3591,9 +3640,9 @@ static bool handle_in_template(GumboParser* parser, GumboToken* token) { TAG(STYLE), TAG(TEMPLATE), TAG(TITLE)}) || tag_is(token, kEndTag, GUMBO_TAG_TEMPLATE)) { return handle_in_head(parser, token); - } else if (tag_in( - token, kStartTag, (gumbo_tagset){TAG(CAPTION), TAG(COLGROUP), - TAG(TBODY), TAG(TFOOT), TAG(THEAD)})) { + } else if (tag_in(token, kStartTag, + (gumbo_tagset){TAG(CAPTION), TAG(COLGROUP), TAG(TBODY), + TAG(TFOOT), TAG(THEAD)})) { pop_template_insertion_mode(parser); push_template_insertion_mode(parser, GUMBO_INSERTION_MODE_IN_TABLE); set_insertion_mode(parser, GUMBO_INSERTION_MODE_IN_TABLE); @@ -3875,9 +3924,9 @@ 
static bool handle_in_foreign_content(GumboParser* parser, GumboToken* token) { do { pop_current_node(parser); } while (!(is_mathml_integration_point(get_current_node(parser)) || - is_html_integration_point(get_current_node(parser)) || - get_current_node(parser)->v.element.tag_namespace == - GUMBO_NAMESPACE_HTML)); + is_html_integration_point(get_current_node(parser)) || + get_current_node(parser)->v.element.tag_namespace == + GUMBO_NAMESPACE_HTML)); parser->_parser_state->_reprocess_current_token = true; return false; }