diff --git a/wagtail_wordpress_import/block_builder.py b/wagtail_wordpress_import/block_builder.py index 4f59e3e8..c180db1c 100644 --- a/wagtail_wordpress_import/block_builder.py +++ b/wagtail_wordpress_import/block_builder.py @@ -1,278 +1,106 @@ -import requests from bs4 import BeautifulSoup -from django.core.files import File -from django.core.files.temp import NamedTemporaryFile -from wagtail.images.models import Image as ImportedImage +from django.conf import settings +from django.utils.module_loading import import_string -TAGS_TO_BLOCKS = [ - "table", - "iframe", - "form", - "h1", - "h2", - "h3", - "h4", - "h5", - "h6", - "img", - "blockquote", -] +from wagtail_wordpress_import.block_builder_defaults import ( + conf_fallback_block, + conf_html_tags_to_blocks, +) -IRELLEVANT_PARENTS = ["p", "div", "span"] -# this is not what i'd expect to see but some images return text/html CDN maybe? -VALID_IMAGE_CONTENT_TYPES = [ - "image/gif", - "image/jpeg", - "image/png", - "image/webp", - "text/html", -] - -IMAGE_SRC_DOMAIN = "https://www.budgetsaresexy.com" # note no trailing / +def conf_promote_child_tags(): + return getattr( + settings, + "WAGTAIL_WORDPRESS_IMPORTER_PROMOTE_CHILD_TAGS", + { + "TAGS_TO_PROMOTE": ["iframe", "form", "blockquote"], + "PARENTS_TO_REMOVE": ["p", "div", "span"], + }, + ) class BlockBuilder: def __init__(self, value, node, logger): - self.soup = BeautifulSoup(value, "lxml", exclude_encodings=True) - self.blocks = [] + self.soup = BeautifulSoup(value, "lxml") + self.blocks = [] # for each page this holds the sequence of StreamBlocks self.logged_items = {"processed": 0, "imported": 0, "skipped": 0, "items": []} self.node = node self.logger = logger - self.set_up() - def set_up(self): + def promote_child_tags(self): """ - iframes, forms can get put inside a p tag, pull them out - extend this to add further tags + Some HTML tags that can be at the top level, e.g. the parent is the + body when using BS4 are getting placed inside or existed inside other HTML tags. + We pull out these HTML tags and move them to the top level. + returns: None + but modifies the page soup """ - for iframe in self.soup.find_all("iframe"): - parent = iframe.previous_element - if parent.name in IRELLEVANT_PARENTS: - parent.replaceWith(iframe) + config_promote_child_tags = conf_promote_child_tags() + promotee_tags = config_promote_child_tags["TAGS_TO_PROMOTE"] + removee_tags = config_promote_child_tags["PARENTS_TO_REMOVE"] - for form in self.soup.find_all("form"): - parent = form.previous_element - if parent.name in IRELLEVANT_PARENTS: - parent.replaceWith(form) + for promotee in promotee_tags: + promotees = self.soup.findAll(promotee) + for promotee in promotees: + if promotee.parent.name in removee_tags: + promotee.parent.replace_with(promotee) - for blockquote in self.soup.find_all("blockquote"): - parent = blockquote.previous_element - if parent.name in IRELLEVANT_PARENTS: - parent.replaceWith(blockquote) + def get_builder_function(self, element): + """ + params + element: an HTML tag + returns: + a function to parse the block from configuration + """ + function = [ + import_string(builder[1]["FUNCTION"]) + for builder in conf_html_tags_to_blocks() + if element.name == builder[0] + ] + if function: + return function[0] def build(self): + """ + params: + None + returns: + a list of block dicts + + The value to be processed her should have only top level HTML tags. + The HTML is parsed to a sequence of StreamField blocks. + If a HTML tag does have child blocks we should parse then inside the + build_block_* method + """ soup = self.soup.find("body").findChildren(recursive=False) - block_value = str("") + cached_fallback_value = ( + "" # keep appending fall back content here, by default is Rich Text block + ) + cached_fallback_function = import_string( + conf_fallback_block() + ) # Rich Text block counter = 0 - - for tag in soup: + for element in soup: # each single top level tag counter += 1 - """ - the process here loops though each soup tag to discover - the block type to use - """ - - # RICHTEXT - if tag.name not in TAGS_TO_BLOCKS: - block_value += str(self.image_linker(str(tag))) - - # TABLE - if tag.name == "table": - if len(block_value) > 0: - self.blocks.append({"type": "rich_text", "value": block_value}) - block_value = str("") - self.blocks.append({"type": "raw_html", "value": str(tag)}) - - # IFRAME/EMBED - if tag.name == "iframe": - if len(block_value) > 0: - self.blocks.append({"type": "rich_text", "value": block_value}) - block_value = str("") - self.blocks.append( - { - "type": "raw_html", - "value": '
{}
'.format( - str(tag) - ), - } + # the builder function for the element tag from config + builder_function = self.get_builder_function(element) + + if builder_function: # build a block + if cached_fallback_value: + cached_fallback_value = cached_fallback_function( + cached_fallback_value, + self.blocks, + ) # before building a block write fall back cache to a block + self.blocks.append(builder_function(element)) # write the new block + else: + if element.text.strip(): # exclude a tag that is empty + cached_fallback_value += str(element) + + if cached_fallback_value and counter == len( + soup + ): # the last tag so just build whats left in the fall back cache + cached_fallback_value = cached_fallback_function( + cached_fallback_value, self.blocks ) - # FORM - if tag.name == "form": - if len(block_value) > 0: - self.blocks.append({"type": "rich_text", "value": block_value}) - block_value = str("") - self.blocks.append({"type": "raw_html", "value": str(tag)}) - - # HEADING - if ( - tag.name == "h1" - or tag.name == "h2" - or tag.name == "h3" - or tag.name == "h4" - or tag.name == "h5" - or tag.name == "h6" - ): - if len(block_value) > 0: - self.blocks.append({"type": "rich_text", "value": block_value}) - block_value = str("") - self.blocks.append( - { - "type": "heading", - "value": {"importance": tag.name, "text": str(tag.text)}, - } - ) - - # IMAGE - if tag.name == "img": - if len(block_value) > 0: - self.blocks.append({"type": "rich_text", "value": block_value}) - block_value = str("") - self.blocks.append({"type": "raw_html", "value": str(tag)}) - - # BLOCKQUOTE - if tag.name == "blockquote": - if len(block_value) > 0: - self.blocks.append({"type": "rich_text", "value": block_value}) - block_value = str("") - cite = "" - if tag.attrs and tag.attrs.get("cite"): - cite = str(tag.attrs["cite"]) - self.blocks.append( - { - "type": "block_quote", - "value": {"quote": str(tag.text), "attribution": cite}, - } - ) - - if counter == len(soup) and len(block_value) > 0: - # when we reach the end and something is in the - # block_value just output and clear - self.blocks.append({"type": "rich_text", "value": block_value}) - block_value = str("") - - # print(self.logged_items) return self.blocks - - def image_linker(self, tag): - soup = BeautifulSoup(tag, "html.parser", exclude_encodings=True) - images = soup.find_all("img") - - for image in images: - image_saved = self.get_image(image) - if image_saved: - alignment = self.get_alignment_class(image) - img_alt = image.attrs["alt"] if "alt" in image.attrs else None - tag = ''.format( - image_saved.id, img_alt, alignment - ) - - return tag - - def get_alignment_class(self, image): - alignment = "fullwidth" - - if "class" in image.attrs: - if "align-left" in image.attrs["class"]: - alignment = "left" - elif "align-right" in image.attrs["class"]: - alignment = "right" - - return alignment - - def get_image(self, image): - - if image.get("src"): - name = image.get("src").split("/")[-1] # need the last part - temp = NamedTemporaryFile(delete=True) - image_src = check_image_src(image.get("src")).strip("/") - else: - self.logged_items["items"].append( - { - "id": self.node.get("wp:post_id"), - "title": self.node.get("title"), - "link": self.node.get("link"), - "reason": "no src provided", - } - ) - self.logger.images.append( - { - "id": self.node.get("wp:post_id"), - "title": self.node.get("title"), - "link": self.node.get("link"), - "reason": "no src provided", - } - ) - return - - try: - image_exists = ImportedImage.objects.get(title=name) - return image_exists - - except ImportedImage.DoesNotExist: - - try: - response = requests.get(image_src, timeout=10) - status_code = response.status_code - content_type = response.headers.get("Content-Type") - - if ( - content_type - and content_type.lower() not in VALID_IMAGE_CONTENT_TYPES - ): - self.logged_items["items"].append( - { - "id": self.node.get("wp:post_id"), - "title": self.node.get("title"), - "link": self.node.get("link"), - "reason": "invalid image types match or no content type", - } - ) - self.logger.images.append( - { - "id": self.node.get("wp:post_id"), - "title": self.node.get("title"), - "link": self.node.get("link"), - "reason": "invalid image types match or no content type", - } - ) - return - - if status_code == 200: - temp.name = name - temp.write(response.content) - temp.flush() - new_image = ImportedImage(file=File(file=temp), title=name) - new_image.save() - return new_image - - except requests.exceptions.ConnectionError: - self.logged_items["items"].append( - { - "id": self.node.get("wp:post_id"), - "title": self.node.get("title"), - "link": self.node.get("link"), - "reason": "connection error", - } - ) - self.logger.images.append( - { - "id": self.node.get("wp:post_id"), - "title": self.node.get("title"), - "link": self.node.get("link"), - "reason": "connection error", - } - ) - - -def check_image_src(src): - # some images have relative src values - if not src.startswith("http"): - print( - "WARNING: relative file {}. Image may be broken, trying with domain name prepended. ".format( - src - ) - ) - return IMAGE_SRC_DOMAIN + "/" + src - return src diff --git a/wagtail_wordpress_import/block_builder_defaults.py b/wagtail_wordpress_import/block_builder_defaults.py new file mode 100644 index 00000000..2fbbabbb --- /dev/null +++ b/wagtail_wordpress_import/block_builder_defaults.py @@ -0,0 +1,290 @@ +import re + +import requests +from bs4 import BeautifulSoup +from django.conf import settings +from django.core.files import File +from django.core.files.temp import NamedTemporaryFile +from wagtail.images.models import Image as ImportedImage + +"""StreamField blocks""" + + +def build_block_quote_block(tag): + block_dict = { + "type": "block_quote", + "value": {"quote": tag.text.strip(), "attribution": tag.cite}, + } + return block_dict + + +def build_form_block(tag): + block_dict = {"type": "raw_html", "value": str(tag)} + return block_dict + + +def build_heading_block(tag): + block_dict = { + "type": "heading", + "value": {"importance": tag.name, "text": tag.text}, + } + return block_dict + + +def build_iframe_block(tag): + block_dict = { + "type": "raw_html", + "value": '
{}
'.format( + str(tag) + ), + } + return block_dict + + +def build_image_block(tag): + def get_image_id(src): + return 1 + + block_dict = {"type": "image", "value": get_image_id(tag.src)} + return block_dict + + +def build_table_block(tag): + block_dict = {"type": "raw_html", "value": str(tag)} + return block_dict + + +def conf_html_tags_to_blocks(): + return getattr( + settings, + "WAGTAIL_WORDPRESS_IMPORTER_CONVERT_HTML_TAGS_TO_BLOCKS", + [ + ( + "h1", + { + "FUNCTION": "wagtail_wordpress_import.block_builder_defaults.build_heading_block", + }, + ), + ( + "h2", + { + "FUNCTION": "wagtail_wordpress_import.block_builder_defaults.build_heading_block", + }, + ), + ( + "h3", + { + "FUNCTION": "wagtail_wordpress_import.block_builder_defaults.build_heading_block", + }, + ), + ( + "h4", + { + "FUNCTION": "wagtail_wordpress_import.block_builder_defaults.build_heading_block", + }, + ), + ( + "h5", + { + "FUNCTION": "wagtail_wordpress_import.block_builder_defaults.build_heading_block", + }, + ), + ( + "h6", + { + "FUNCTION": "wagtail_wordpress_import.block_builder_defaults.build_heading_block", + }, + ), + ( + "table", + { + "FUNCTION": "wagtail_wordpress_import.block_builder_defaults.build_table_block", + }, + ), + ( + "iframe", + { + "FUNCTION": "wagtail_wordpress_import.block_builder_defaults.build_iframe_block", + }, + ), + ( + "form", + { + "FUNCTION": "wagtail_wordpress_import.block_builder_defaults.build_form_block", + }, + ), + ( + "img", + { + "FUNCTION": "wagtail_wordpress_import.block_builder_defaults.build_image_block", + }, + ), + ( + "blockquote", + { + "FUNCTION": "wagtail_wordpress_import.block_builder_defaults.build_block_quote_block", + }, + ), + ], + ) + + +"""Fall back StreamField block""" + + +def conf_fallback_block(): + return getattr( + settings, + "WAGTAIL_WORDPRESS_IMPORTER_FALLBACK_BLOCK", + "wagtail_wordpress_import.block_builder_defaults.build_none_block_content", + ) + + +def build_none_block_content(cache, blocks): + """ + image_linker is called to link up and retrive the remote image + """ + blocks.append({"type": "rich_text", "value": image_linker(cache)}) + cache = "" + return cache + + +"""Rich Text Functions""" + + +def conf_valid_image_content_types(): + return getattr( + settings, + "WAGTAIL_WORDPRESS_IMPORTER_VALID_IMAGE_CONTENT_TYPES", + [ + "image/gif", + "image/jpeg", + "image/png", + "image/webp", + "text/html", + ], + ) + + +def conf_domain_prefix(): + + if hasattr(settings, "WAGTAIL_WORDPRESS_IMPORTER_BASE_URL"): + return getattr(settings, "WAGTAIL_WORDPRESS_IMPORTER_BASE_URL") + + elif not hasattr(settings, "WAGTAIL_WORDPRESS_IMPORTER_BASE_URL") and hasattr( + settings, "BASE_URL" + ): + return getattr(settings, "BASE_URL") + + +def image_linker(html): + """ + params + ====== + html: html from a single rich_text block + + returns + ======= + string: the html with img tags modified + + BS4 performs a find and replace on all img tags found in the HTML. + If the image can be retrived from the remote site and saved into a Wagtail ImageModel + the soup is modified. + """ + soup = BeautifulSoup(html, "html.parser") + images = soup.find_all("img") + for image in images: + if image.attrs and image.attrs["src"]: + image_src = get_abolute_src(image.attrs["src"], conf_domain_prefix()) + saved_image = get_or_save_image(image_src) + if saved_image: + image_embed = soup.new_tag("embed") + image_embed.attrs["embedtype"] = "image" + image_embed.attrs["id"] = saved_image.id + image_embed.attrs["alt"] = get_image_alt(image) + image_embed.attrs["format"] = get_alignment_class(image) + image.replace_with(image_embed) + else: + print(f"IMAGE HAS NO SRC: {image}") + + return str(soup) + + +def get_image_alt(img_tag): + return img_tag.attrs["alt"] if "alt" in img_tag.attrs else None + + +def get_image_file_name(src): + return src.split("/")[-1] if src else None # need the last part + + +def image_exists(name): + try: + return ImportedImage.objects.get(title=name) + except ImportedImage.DoesNotExist: + pass + + +def conf_get_requests_settings(): + return getattr( + settings, + "WAGTAIL_WORDPRESS_IMPORTER_REQUESTS_SETTINGS", + { + "headers": {"User-Agent": "WagtailWordpressImporter"}, + "timeout": 1, + "stream": False, + }, + ) + + +def get_or_save_image(src): + image_file_name = get_image_file_name(src) + existing_image = image_exists(image_file_name) + if not existing_image: + response, valid, type = fetch_url(src) + if valid and (type in conf_valid_image_content_types()): + temp_image = NamedTemporaryFile(delete=True) + temp_image.name = image_file_name + temp_image.write(response.content) + temp_image.flush() + retrieved_image = ImportedImage( + file=File(file=temp_image), title=image_file_name + ) + retrieved_image.save() + temp_image.close() + return retrieved_image + else: + print(f"RECEIVED INVALID RESPONSE: {src}") + return existing_image + + +def fetch_url(src, r=None, status=False, content_type=None): + """general purpose url fetcher with ability to pass in own config""" + try: + r = requests.get(src, **conf_get_requests_settings()) + status = r.status_code == 200 + content_type = ( + r.headers["content-type"].lower() if r.headers.get("content-type") else "" + ) + except requests.ConnectTimeout: + print(f"THERE WAS A PROBLEM WITH REQUESTS FETCHING: {src}") + return r, status, content_type + + +def get_abolute_src(src, domain_prefix=None): + src = re.sub("^\/+", "", src) + if not src.startswith("http") and domain_prefix: + return domain_prefix + "/" + src + return src + + +def get_alignment_class(image): + alignment = "fullwidth" + + if "class" in image.attrs: + if "align-left" in image.attrs["class"]: + alignment = "left" + elif "align-right" in image.attrs["class"]: + alignment = "right" + + return alignment diff --git a/wagtail_wordpress_import/importers/wordpress.py b/wagtail_wordpress_import/importers/wordpress.py index 320fc188..97b74977 100644 --- a/wagtail_wordpress_import/importers/wordpress.py +++ b/wagtail_wordpress_import/importers/wordpress.py @@ -52,8 +52,10 @@ def run(self, *args, **kwargs): exit() for event, node in xml_doc: - # each node represents a tag in the xml - # event is true for the start element + """ + Each node represents a tag in the xml. + `event` is true for a start element. + """ if event == pulldom.START_ELEMENT and node.tagName == "item": xml_doc.expandNode(node) item = node_to_dict(node) @@ -144,8 +146,10 @@ def analyze_html(self, html_analyzer, *, page_types, page_statuses): xml_doc = pulldom.parse(self.xml_file) for event, node in xml_doc: - # each node represents a tag in the xml - # event is true for the start element + """ + Each node represents a tag in the xml. + `event` is true for a start element. + """ if event == pulldom.START_ELEMENT and node.tagName == "item": xml_doc.expandNode(node) item = node_to_dict(node) @@ -217,10 +221,8 @@ def __init__(self, node, logger): def prefilter_content(self, content): """ - FILTERS ARE CUMULATIVE - cache the result of each filter which is run on the output from the previous filter + FILTERS ARE CUMULATIVE: Each filter receives the output from the previous filter. """ - cached_result = content for filter in default_prefilters(): @@ -236,11 +238,11 @@ def cleaned_title(self): def cleaned_slug(self): """ - Oddly some page have no slug and some have illegal characters! - If None make one from title. - Also pass any slug through slugify to be sure and if it's changed make a note + Clean up the slugs from the XML import file + Some pages have no slug and some have unexpected characters. + If a slug is not provided create one from page title. + If a slug is changed its recorded in the logger """ - if not self.node["wp:post_name"]: slug = slugify(self.cleaned_title()) self.slug_changed = "blank slug" # logging @@ -264,9 +266,9 @@ def cleaned_latest_revision_created_at(self): def clean_date(self, value): """ - We need a nice date to be able to save the page later. Some dates are not suitable - date strings in the xml. If thats the case return a specific date so it can be saved - and return the failure for logging + We need a proper date format. + Some dates are not suitable date strings in the xml, if so return a + specific date so it can be saved in Wagtail and record it in the logger. """ if value == "0000-00-00 00:00:00": @@ -289,7 +291,9 @@ def cleaned_link(self): return str(self.node["link"].strip()) def body_stream_field(self, content): - blocks_dict = BlockBuilder(content, self.node, self.logger).build() + builder = BlockBuilder(content, self.node, self.logger) + builder.promote_child_tags() + blocks_dict = builder.build() if debug_enabled(): self.debug_content["block_json"] = blocks_dict return json.dumps(blocks_dict) @@ -301,7 +305,7 @@ def get_yoast_description_value(self): This parses the wp:postmeta field to check if a _yoast_wpseo_metadesc is available. If not it returns a blank string or the default description field - in the XML ... . + from the XML import file ... """ meta_value = "" @@ -335,8 +339,7 @@ def cleaned_data(self): which imports to a standard Wagtail field. This came out of dealing with the Yoast search_description field which we have - included and can be configured by a developer to accept different values as - the wp:postmeta keys + included and can be configured to accept different values that are in wp:postmeta keys """ return { "title": self.cleaned_title(), diff --git a/wagtail_wordpress_import/management/commands/import_xml.py b/wagtail_wordpress_import/management/commands/import_xml.py index db896c95..76cc6207 100644 --- a/wagtail_wordpress_import/management/commands/import_xml.py +++ b/wagtail_wordpress_import/management/commands/import_xml.py @@ -3,6 +3,7 @@ from django.core.management.base import BaseCommand from wagtail_wordpress_import.importers.wordpress import WordpressImporter from wagtail_wordpress_import.logger import Logger +from wagtail_wordpress_import.block_builder_defaults import conf_domain_prefix LOG_DIR = "log" @@ -56,6 +57,13 @@ def add_arguments(self, parser): ) def handle(self, **options): + if not conf_domain_prefix(): + self.stdout.write( + self.style.ERROR( + "BASE_URL or WAGTAIL_WORDPRESS_IMPORTER_BASE_URL: needs to be added to your settings" + ) + ) + exit() xml_file_path = self.get_xml_file(f"{options['xml_file']}") logger = Logger(LOG_DIR) importer = WordpressImporter(xml_file_path) diff --git a/wagtail_wordpress_import/prefilters/transform_styles_defaults.py b/wagtail_wordpress_import/prefilters/transform_styles_defaults.py index 142b89e8..a075376c 100644 --- a/wagtail_wordpress_import/prefilters/transform_styles_defaults.py +++ b/wagtail_wordpress_import/prefilters/transform_styles_defaults.py @@ -43,40 +43,50 @@ def transform_style_center(soup, tag): """ apply a new css class to any existing classes """ - _class = tag.get("class", "") + " align-center" - tag.attrs["class"] = _class.strip() + if tag.attrs.get("class"): + tag.attrs["class"].append("align-center") + else: + tag.attrs["class"] = "align-center" def transform_style_left(soup, tag): """ apply a new css class to any existing classes """ - _class = tag.get("class", "") + " align-left" - tag.attrs["class"] = _class.strip() + if tag.attrs.get("class"): + tag.attrs["class"].append("align-left") + else: + tag.attrs["class"] = "align-left" def transform_style_right(soup, tag): """ apply a new css class to any existing classes """ - _class = tag.get("class", "") + " align-right" - tag.attrs["class"] = _class.strip() + if tag.attrs.get("class"): + tag.attrs["class"].append("align-right") + else: + tag.attrs["class"] = "align-right" def transform_float_left(soup, tag): """ apply a new css class to any existing classes """ - _class = tag.get("class", "") + " float-left" - tag.attrs["class"] = _class.strip() + if tag.attrs.get("class"): + tag.attrs["class"].append("float-left") + else: + tag.attrs["class"] = "float-left" def transform_float_right(soup, tag): """ apply a new css class to any existing classes """ - _class = tag.get("class", "") + " float-right" - tag.attrs["class"] = _class.strip() + if tag.attrs.get("class"): + tag.attrs["class"].append("float-right") + else: + tag.attrs["class"] = "float-right" def conf_styles_mapping(): diff --git a/wagtail_wordpress_import/templates/wagtail_wordpress_import/quote_block.html b/wagtail_wordpress_import/templates/wagtail_wordpress_import/quote_block.html index 33bea5ac..9540a522 100644 --- a/wagtail_wordpress_import/templates/wagtail_wordpress_import/quote_block.html +++ b/wagtail_wordpress_import/templates/wagtail_wordpress_import/quote_block.html @@ -1,4 +1,3 @@ -
-

{{ value.quote }}

- {% if value.attribution %}{{ value.attribution }}{% endif %} +
+ {{ value.quote }}
diff --git a/wagtail_wordpress_import/test/fixtures/raw_html.txt b/wagtail_wordpress_import/test/fixtures/raw_html.txt index fe36583c..fdb17726 100644 --- a/wagtail_wordpress_import/test/fixtures/raw_html.txt +++ b/wagtail_wordpress_import/test/fixtures/raw_html.txt @@ -1,9 +1,18 @@ + +a silly src url Lorem ipsum (xcounterx) dolor sit amet Lorem ipsum dolor sit (xcounterx) amet!

Lorem ipsum dolor sit amet?

+

Absolute image url. + + + +

+ + -

Lorem ipsum dolor sit amet, consectetur adipisici elit, sed eiusmod tempor incidunt ut labore et dolore magna aliqua.?

- - - - - - - - - - - - - - - - - - - - - - - +

Lorem ipsum dolor sit amet, consectetur adipisici elit, sed eiusmod tempor incidunt ut labore et dolore + magna aliqua.?

+
 
ItemAmount
TOTAL:$1,127.67
Lorem 1Lorem 1/1
Lorem 2Lorem 2/1
+ + + + + + + + + + + + + + + + + + + + + + +
 
ItemAmount
TOTAL:$1,127.67
Lorem 1Lorem 1/1
Lorem 2Lorem 2/1
- -
Lorem ipsum dolor sit amet, consectetur adipisici elit, sed eiusmod tempor incidunt ut labore et dolore magna aliqua. -Nihil hic munitissimus habendi senatus locus, nihil horum?.
+ +
+ +
+

+

+ +
+

+ + +

+ +

+ +
Lorem ipsum dolor sit amet, consectetur adipisici + elit, sed eiusmod tempor incidunt ut labore et dolore magna aliqua.
+

+

Lorem ipsum dolor sit amet, consectetur + adipisici + elit, sed eiusmod tempor incidunt ut labore et dolore magna aliqua.
+

diff --git a/wagtail_wordpress_import/test/tests/test_block_builder.py b/wagtail_wordpress_import/test/tests/test_block_builder.py index 7609e74c..04aa9063 100644 --- a/wagtail_wordpress_import/test/tests/test_block_builder.py +++ b/wagtail_wordpress_import/test/tests/test_block_builder.py @@ -1,11 +1,275 @@ -from django.test import TestCase -from wagtail_wordpress_import.block_builder import check_image_src +import os -class TestBlockBuilder(TestCase): +from bs4 import BeautifulSoup +from django.conf import settings +from django.test import TestCase, override_settings, modify_settings +from wagtail_wordpress_import.block_builder import BlockBuilder +from wagtail_wordpress_import.block_builder_defaults import ( + build_block_quote_block, + build_form_block, + build_heading_block, + build_iframe_block, + build_image_block, + build_table_block, + conf_domain_prefix, + get_abolute_src, + get_alignment_class, + get_image_alt, + get_image_file_name, +) - def test_check_image_src(self): - src1 = "https://www.budgetsaresexy.com/folder/myimage.gif" - src2 = "folder/myimage.gif" +BASE_PATH = os.path.dirname(os.path.dirname(__file__)) +FIXTURES_PATH = BASE_PATH + "/fixtures" - self.assertEqual(check_image_src(src1), "https://www.budgetsaresexy.com/folder/myimage.gif") - self.assertEqual(check_image_src(src2), "https://www.budgetsaresexy.com/folder/myimage.gif") + +def get_soup(html, parser): + soup = BeautifulSoup(html, parser) + return soup + + +class TestBlockBuilderRemoveParents(TestCase): + def setUp(self): + raw_html_file = open(f"{FIXTURES_PATH}/raw_html.txt", "r") + self.builder = BlockBuilder(raw_html_file, None, None) + self.builder.promote_child_tags() + self.output_remove_parent_tags = self.builder.soup + self.expected_parent_name = "body" + + def test_remove_parent_tags_iframe(self): + output = self.output_remove_parent_tags + + iframe = output.find("iframe", {"data-testing": "hasnoparent"}) + self.assertTrue(iframe.parent.name == self.expected_parent_name) + + iframe = output.find("iframe", {"data-testing": "hasparent"}) + self.assertTrue(iframe.parent.name == self.expected_parent_name) + + def test_remove_parent_tags_form(self): + output = self.output_remove_parent_tags + + form = output.find("form", attrs={"data-testing": "hasnoparent"}) + self.assertTrue(form.parent.name == self.expected_parent_name) + + form = output.find("form", attrs={"data-testing": "hasparent"}) + self.assertTrue(form.parent.name == self.expected_parent_name) + + def test_remove_parent_tags_blockquote(self): + output = self.output_remove_parent_tags + + blockquote = output.find("blockquote", attrs={"data-testing": "hasnoparent"}) + self.assertTrue(blockquote.parent.name == self.expected_parent_name) + + blockquote = output.find("blockquote", attrs={"data-testing": "hasparent"}) + self.assertTrue(blockquote.parent.name == self.expected_parent_name) + + +class TestBlockBuilderBlockDefaults(TestCase): + def test_build_block_quote_block(self): + input = """
+ Lorem ipsum dolor sit amet. +
""" + + output = build_block_quote_block(get_soup(input, "html.parser")) + self.assertEqual(output["type"], "block_quote") + self.assertIsInstance(output["value"], dict) + self.assertTrue(output["value"]["quote"].startswith("Lorem")) + + def test_build_form_block(self): + input = """
""" + + output = build_form_block(get_soup(input, "html.parser")) + self.assertEqual(output["type"], "raw_html") + self.assertTrue(output["value"].startswith("A heading 1""" + soup = get_soup(input, "html.parser").find("h1") + output = build_heading_block(soup) + self.assertEqual(output["type"], "heading") + self.assertEqual(output["value"]["importance"], "h1") + + def test_build_iframe_block(self): + input = """""" + + output = build_iframe_block(get_soup(input, "html.parser")) + self.assertEqual(output["type"], "raw_html") + self.assertTrue(output["value"].startswith("""" + soup = get_soup(input, "html.parser") + output = build_image_block(soup) + self.assertEqual(output["type"], "image") + self.assertEqual(output["value"], 1) + + def test_build_table_block(self): + input = """ + + + + + + + + + + + + + + + + + + + + + + +
 
ItemAmount
TOTAL:$1,127.67
Lorem 1Lorem 1/1
Lorem 2Lorem 2/1
""" + + output = build_table_block(get_soup(input, "html.parser")) + self.assertEqual(output["type"], "raw_html") + self.assertTrue(output["value"].startswith("Absolute image url. + + + +

+ In the fixture file is the only one that will be converted. + The other img tags will become image blocks + """ + raw_html_file = open(f"{FIXTURES_PATH}/raw_html.txt", "r") + self.builder = BlockBuilder(raw_html_file, None, None) + self.builder.promote_child_tags() + self.blocks = self.builder.build() + + blocks = [ + block["type"] + for block in self.blocks + if block["type"] == "rich_text" and 'embedtype="image"' in block["value"] + ] + + # self.assertEqual(len(blocks), 1) how to test images + + def test_get_image_alt(self): + input = get_soup( + 'image alt', "html.parser" + ).find("img") + self.assertEqual(get_image_alt(input), "image alt") + + def test_get_image_file_name(self): + self.assertEqual(get_image_file_name("fakeimage.jpg"), "fakeimage.jpg") + self.assertEqual(get_image_file_name("folder/fakeimage.jpg"), "fakeimage.jpg") + self.assertEqual( + get_image_file_name( + "http://www.example.com/folder1/folder2//fakeimage.jpg" + ), + "fakeimage.jpg", + ) + + def test_get_abolute_src(self): + self.assertEqual( + get_abolute_src("fakeimage.jpg", "http://www.example.com"), + "http://www.example.com/fakeimage.jpg", + ) + self.assertEqual( + get_abolute_src("folder/fakeimage.jpg", "http://www.example.com"), + "http://www.example.com/folder/fakeimage.jpg", + ) + self.assertEqual( + get_abolute_src("folder/fakeimage.jpg"), + "folder/fakeimage.jpg", + ) # the test settings has no BASE_URL setting so try having no domain prefix + + def test_get_abolute_src_slashes_at_start(self): + self.assertEqual( + get_abolute_src("//folder/fakeimage.jpg", "http://www.example.com"), + "http://www.example.com/folder/fakeimage.jpg", + ) + + def test_get_alignment_class(self): + input = get_soup( + 'image alt', + "html.parser", + ).find("img") + self.assertEqual(get_alignment_class(input), "left") + input = get_soup( + 'image alt', + "html.parser", + ).find("img") + self.assertEqual(get_alignment_class(input), "right") + input = get_soup( + 'image alt', + "html.parser", + ).find("img") + self.assertEqual(get_alignment_class(input), "fullwidth") + + def test_with_real_image(self): + # but we need to test with mocked images if we can. + raw_html_file = """ +

Lorem

+ """ + self.builder = BlockBuilder(raw_html_file, None, None) + self.builder.promote_child_tags() + self.blocks = self.builder.build() + self.assertTrue("... field - If thats blank then search_description is set as a blank value + If the search description is blank or not available then we import the + ... field. + If the field is empty then search_description is set as a blank value. """ fixtures = [ @@ -199,7 +199,7 @@ def test_page_field_values_with_yoast_plugin_enabled(self): class WordpressImporterTestsYoastEnabledMissingTag(TestCase): """ This tests when the expected config for Yoast is different from the - package default it defaults to use the + package default which defaults to use the ... field. """ @@ -253,11 +253,11 @@ def test_page_field_values_with_yoast_plugin_enabled(self): ) class WordpressImporterTestsYoastEnabledChangedKey(TestCase): """ - This tests when the developer changes the config for Yoast in that the key for the - search description is not the same as the package default. - If the search description is blank or not available then we use the - ... field - If thats blank then search_description is set as a blank value + This tests a different configuration for Yoast. + The key for the search description is not the same as the package default. + If the search description is blank or not available then we import the + ... field. + If the field is empty then search_description is set as a blank value. """ fixtures = [ diff --git a/wagtail_wordpress_import/test/tests/test_wordpress_item.py b/wagtail_wordpress_import/test/tests/test_wordpress_item.py index 2bd6dfc7..e7c538d2 100644 --- a/wagtail_wordpress_import/test/tests/test_wordpress_item.py +++ b/wagtail_wordpress_import/test/tests/test_wordpress_item.py @@ -10,6 +10,10 @@ # @override_settings(WAGTAIL_WORDPRESS_IMPORT_YOAST_PLUGIN_ENABLED=False) +@override_settings( + BASE_URL="http://localhost:8000" +) # testing requires a live domain for requests to use, this is something I need to change before package release +# mocking of somesort, using localhost:8000 for now class WordpressItemTests(TestCase): def setUp(self): self.logger = Logger("fakedir")