Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

#42 import documents #57

Merged
merged 3 commits into from
Nov 2, 2021
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion wagtail_wordpress_import/block_builder.py
Original file line number Diff line number Diff line change
Expand Up @@ -74,7 +74,7 @@ def build(self):
"""
soup = self.soup.find("body").findChildren(recursive=False)
cached_fallback_value = (
"" # keep appending fall back content here, by default is Rich Text block
"" # append fall back content here, by default it's a Rich Text block
)
cached_fallback_function = import_string(
conf_fallback_block()
Expand Down
121 changes: 110 additions & 11 deletions wagtail_wordpress_import/block_builder_defaults.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,7 @@
from django.core.files import File
from django.core.files.temp import NamedTemporaryFile
from wagtail.images.models import Image as ImportedImage
from wagtail.documents.models import Document as ImportedDocument

"""StreamField blocks"""

Expand Down Expand Up @@ -140,13 +141,16 @@ def conf_fallback_block():
)


def build_none_block_content(cache, blocks):
def build_none_block_content(html, blocks):
"""
image_linker is called to link up and retrieve the remote images
document_linker is called to link up and retrieve the remote documents
"""
blocks.append({"type": "rich_text", "value": image_linker(cache)})
cache = ""
return cache
html = image_linker(html)
html = document_linker(html)
blocks.append({"type": "rich_text", "value": html})
html = ""
return html


"""Rich Text Functions"""
Expand All @@ -166,6 +170,30 @@ def conf_valid_image_content_types():
)


def conf_valid_document_file_types():
    """
    Return the file extensions that are accepted for imported documents.

    The value is looked up on the Django settings object and falls back to
    the list defined here.

    NOTE(review): the settings key passed to getattr() is an empty string,
    so presumably the fallback list is always what callers receive -
    confirm the intended settings key.
    """
    fallback_file_types = ["pdf", "ppt", "docx"]
    return getattr(settings, "", fallback_file_types)


def conf_valid_document_content_types():
    """
    Return the response content types that are accepted for imported documents.

    The value is looked up on the Django settings object and falls back to
    the list defined here.

    NOTE(review): the settings key passed to getattr() is an empty string,
    so presumably the fallback list is always what callers receive -
    confirm the intended settings key.
    """
    fallback_content_types = [
        "application/pdf",
        "application/vnd.openxmlformats-officedocument.presentationml.presentation",
        "application/vnd.openxmlformats-officedocument.wordprocessingml.document",
    ]
    return getattr(settings, "", fallback_content_types)


def conf_domain_prefix():

if hasattr(settings, "WAGTAIL_WORDPRESS_IMPORTER_BASE_URL"):
Expand All @@ -186,14 +214,14 @@ def image_linker(html):
string: the html with img tags modified

BS4 performs a find and replace on all img tags found in the HTML.
If the image can be retrived from the remote site and saved into a Wagtail ImageModel
If the image can be retrieved from the remote site and saved into a Wagtail ImageModel
the soup is modified.
"""
soup = BeautifulSoup(html, "html.parser")
images = soup.find_all("img")
for image in images:
if image.attrs and image.attrs["src"]:
image_src = get_abolute_src(image.attrs["src"], conf_domain_prefix())
if image.attrs and image.attrs.get("src"):
image_src = get_absolute_src(image.attrs["src"], conf_domain_prefix())
saved_image = get_or_save_image(image_src)
if saved_image:
image_embed = soup.new_tag("embed")
Expand All @@ -216,20 +244,31 @@ def get_image_file_name(src):
return src.split("/")[-1] if src else None # need the last part


def get_document_file_name(src):
    """
    Return the last path segment of a document url.

    An empty or missing src gives None.
    """
    if not src:
        return None
    # everything after the final "/" (the whole string when there is none)
    _, _, file_name = src.rpartition("/")
    return file_name


def image_exists(name):
    """
    Look up an already imported image by its title.

    params
    ======
        name: the title the image was saved with

    returns
    =======
        The matching image, or None when no image has that title.
    """
    # filter().first() instead of get(): get() raises MultipleObjectsReturned
    # when more than one image shares the title, which would abort the caller.
    return ImportedImage.objects.filter(title=name).first()


def document_exists(name):
    """
    Look up an already imported document by its title.

    params
    ======
        name: the title the document was saved with

    returns
    =======
        The matching document, or None when no document has that title.
    """
    # filter().first() instead of get(): get() raises MultipleObjectsReturned
    # when more than one document shares the title, which would abort the caller.
    return ImportedDocument.objects.filter(title=name).first()


def conf_get_requests_settings():
return getattr(
settings,
"WAGTAIL_WORDPRESS_IMPORTER_REQUESTS_SETTINGS",
{
"headers": {"User-Agent": "WagtailWordpressImporter"},
"timeout": 1,
"timeout": 5,
"stream": False,
},
)
Expand All @@ -252,7 +291,7 @@ def get_or_save_image(src):
temp_image.close()
return retrieved_image
else:
print(f"RECEIVED INVALID RESPONSE: {src}")
print(f"RECEIVED INVALID IMAGE RESPONSE: {src}")
return existing_image


Expand All @@ -265,11 +304,13 @@ def fetch_url(src, r=None, status=False, content_type=None):
r.headers["content-type"].lower() if r.headers.get("content-type") else ""
)
except requests.ConnectTimeout:
print(f"THERE WAS A PROBLEM WITH REQUESTS FETCHING: {src}")
print(f"CONNECTION TIMEOUT: {src}")
except requests.ConnectionError:
print(f"CONNECTION ERROR: {src}")
return r, status, content_type


def get_abolute_src(src, domain_prefix=None):
def get_absolute_src(src, domain_prefix=None):
src = src.lstrip("/")
if not src.startswith("http") and domain_prefix:
return domain_prefix + "/" + src
Expand All @@ -286,3 +327,61 @@ def get_alignment_class(image):
alignment = "right"

return alignment


def document_linker(html):
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Comment: These functions, image_linker, and document_linker, both take and return a string, but parse it with BeautifulSoup. I wonder whether my suggestion the other day that a function should both take and return a soup object was misguided, because it leads to inconsistency within the app. Or whether these should change for the same reason (better performance if we do less parsing and exporting).

Ref. #48 (comment)

Copy link
Collaborator Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Are you OK with me adding a ticket for this? The first time we get some soup is with the lxml parser, and that's the soup we have just before it's passed to this function, so we pass a string of the elements.

Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

The first time we get some soup is with the lxml parser.

Are you saying that the soups are different because one uses lxml and another uses html.parser? If so, then that's a strong argument for passing around string HTML objects instead of soup instances.

I favour readability and consistency at this stage, especially as we aren't yet aware of any performance concern.

Copy link
Collaborator Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

#67 is where we should look at this in more detail.

"""
params
======
html: html from a single rich_text block

returns
=======
string: the html with anchor links modified

BS4 performs a find and replace on all anchor tags found in the HTML.
If the linked document can be retrieved from the remote site and saved into a Wagtail Document model
the soup is modified.
"""
soup = BeautifulSoup(html, "html.parser")
anchors = soup.find_all("a")
for anchor in anchors:
if anchor.attrs and anchor.attrs.get("href"):
anchor_href = get_absolute_src(anchor.attrs["href"], conf_domain_prefix())
anchor_inner_content = anchor.text
saved_document = get_or_save_document(anchor_href)
if saved_document:
document_embed = soup.new_tag("a")
document_embed.attrs["linktype"] = "document"
document_embed.attrs["id"] = saved_document.id
document_embed.string = anchor_inner_content
# image_embed.attrs["alt"] = get_image_alt(image)
# image_embed.attrs["format"] = get_alignment_class(image)
anchor.replace_with(document_embed)
else:
print(f"DOCUMENT HAS NO HREF: {anchor}")

return str(soup)


def get_or_save_document(href):
    """
    Return the Wagtail document for a linked file, fetching it when needed.

    params
    ======
        href: url of the linked file

    returns
    =======
        The existing document when one with the same file name was already
        saved, otherwise the newly saved document. None when the file
        extension is not an accepted document type or the fetched response
        is not valid.
    """
    # compare extensions case-insensitively so "REPORT.PDF" is accepted too
    file_type = href.split(".")[-1].lower()
    valid_file_types = [
        valid_type.lower() for valid_type in conf_valid_document_file_types()
    ]
    if file_type not in valid_file_types:
        return None

    document_file_name = get_document_file_name(href)
    existing_document = document_exists(document_file_name)
    if existing_document:
        return existing_document

    response, valid, content_type = fetch_url(href)
    if not (valid and content_type in conf_valid_document_content_types()):
        print(f"RECEIVED INVALID DOCUMENT RESPONSE: {href}")
        return None

    # the with block closes the temp file even when saving raises
    with NamedTemporaryFile(delete=True) as temp_document:
        temp_document.name = document_file_name
        temp_document.write(response.content)
        temp_document.flush()
        retrieved_document = ImportedDocument(
            file=File(file=temp_document), title=document_file_name
        )
        retrieved_document.save()
    return retrieved_document
2 changes: 1 addition & 1 deletion wagtail_wordpress_import/test/fixtures/raw_html.txt
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,7 @@
<span style="font-weight: bold;font-style:italic;">Lorem ipsum (xcounterx) dolor sit amet</span>

<a href="#ideas"><strong>Lorem ipsum dolor sit (xcounterx) amet!</strong></a>

<a href="https://www.budgetsaresexy.com/files/personal-finance-culminating-assignment.pdf">Read this</a>
<h2><strong>Lorem ipsum dolor sit amet?</strong></h2>

<p>Absolute image url.
Expand Down
56 changes: 29 additions & 27 deletions wagtail_wordpress_import/test/tests/test_block_builder.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,7 +12,7 @@
build_image_block,
build_table_block,
conf_domain_prefix,
get_abolute_src,
get_absolute_src,
get_alignment_class,
get_image_alt,
get_image_file_name,
Expand Down Expand Up @@ -95,6 +95,7 @@ def test_build_iframe_block(self):
self.assertEqual(output["type"], "raw_html")
self.assertTrue(output["value"].startswith("<div"))

# work in progress
def test_build_image_block(self):
input = """<img src="http://www.example.com/image.jpg" />"""
soup = get_soup(input, "html.parser")
Expand Down Expand Up @@ -219,55 +220,56 @@ def test_get_image_file_name(self):
self.assertEqual(get_image_file_name("fakeimage.jpg"), "fakeimage.jpg")
self.assertEqual(get_image_file_name("folder/fakeimage.jpg"), "fakeimage.jpg")
self.assertEqual(
get_image_file_name(
"http://www.example.com/folder1/folder2//fakeimage.jpg"
),
get_image_file_name("http://www.example.com/folder1/folder2/fakeimage.jpg"),
"fakeimage.jpg",
)

def test_get_abolute_src(self):
def test_get_absolute_src(self):
self.assertEqual(
get_abolute_src("fakeimage.jpg", "http://www.example.com"),
get_absolute_src("fakeimage.jpg", "http://www.example.com"),
"http://www.example.com/fakeimage.jpg",
)
self.assertEqual(
get_abolute_src("folder/fakeimage.jpg", "http://www.example.com"),
get_absolute_src("folder/fakeimage.jpg", "http://www.example.com"),
"http://www.example.com/folder/fakeimage.jpg",
)

def test_get_absolute_src_without_base_url(self):
self.assertEqual(
get_abolute_src("folder/fakeimage.jpg"),
get_absolute_src("folder/fakeimage.jpg"),
"folder/fakeimage.jpg",
) # the test settings has no BASE_URL setting so try having no domain prefix
)

def test_get_abolute_src_slashes_at_start(self):
self.assertEqual(
get_abolute_src("//folder/fakeimage.jpg", "http://www.example.com"),
get_absolute_src("//folder/fakeimage.jpg", "http://www.example.com"),
"http://www.example.com/folder/fakeimage.jpg",
)

def test_get_alignment_class(self):
input = get_soup(
def test_get_alignment_class_align_left(self):
soup = get_soup(
'<img src="fakeimage.jpg" alt="image alt" class="align-left" />',
"html.parser",
).find("img")
self.assertEqual(get_alignment_class(input), "left")
input = get_soup(
self.assertEqual(get_alignment_class(soup), "left")

def test_get_alignment_class_align_right(self):
soup = get_soup(
'<img src="fakeimage.jpg" alt="image alt" class="align-right" />',
"html.parser",
).find("img")
self.assertEqual(get_alignment_class(input), "right")
input = get_soup(
self.assertEqual(get_alignment_class(soup), "right")

def test_get_alignment_class_not_present(self):
soup = get_soup(
'<img src="fakeimage.jpg" alt="image alt" />',
"html.parser",
).find("img")
self.assertEqual(get_alignment_class(input), "fullwidth")

def test_with_real_image(self):
# but we need to test with mocked images if we can.
raw_html_file = """
<p>Lorem <img src="https://dummyimage.com/600x400/000/fff" alt=""></p>
"""
self.builder = BlockBuilder(raw_html_file, None, None)
self.builder.promote_child_tags()
self.blocks = self.builder.build()
self.assertTrue("<embed" in self.blocks[0]["value"])
self.assertEqual(get_alignment_class(soup), "fullwidth")

"""
TODO: Add some more tests
I need to include tests here for images and documents.
I'm not sure how this could be done at the moment.
Also applies to: test_images_linked_rich_text() above
"""
2 changes: 1 addition & 1 deletion wagtail_wordpress_import/test/tests/test_wordpress_item.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
import os
import json
from django.test import TestCase
from django.test import TestCase, override_settings
from datetime import datetime
from wagtail_wordpress_import.importers.wordpress import WordpressItem
from wagtail_wordpress_import.logger import Logger
Expand Down