NOTE(review): this patch was recovered from a whitespace-collapsed copy in
which all text in angle brackets had been dropped. Template arguments on added
lines were restored from how the values are used; `#include` targets and the
`std::optional` argument in html_service.hpp could not be recovered and are
reproduced as found - restore them before applying. Indentation and blank
lines were re-derived from the hunk line counts. The hunk headers of
pdf2htmlex_wrapper.cpp/.hpp were adjusted for the added comment lines; the
post-image blob ids on their `index` lines are unchanged and therefore stale.

diff --git a/conanfile.py b/conanfile.py
index 44a9667d..2950f050 100644
--- a/conanfile.py
+++ b/conanfile.py
@@ -44,7 +44,7 @@ def requirements(self):
         self.requires("uchardet/0.0.8")
         self.requires("utfcpp/4.0.4")
         if self.options.get_safe("with_pdf2htmlEX", False):
-            self.requires("pdf2htmlex/0.18.8.rc1-git-6f85c88-odr")
+            self.requires("pdf2htmlex/0.18.8.rc1-odr-pr1")
         if self.options.get_safe("with_wvWare", False):
             self.requires("wvware/1.2.9-odr")
 
diff --git a/src/odr/html_service.hpp b/src/odr/html_service.hpp
index e1388b90..57f2da3c 100644
--- a/src/odr/html_service.hpp
+++ b/src/odr/html_service.hpp
@@ -22,9 +22,11 @@ class HtmlFragment;
 class HtmlResource;
 
 enum class HtmlResourceType {
+  html_fragment,
   css,
   js,
   image,
+  font,
 };
 
 using HtmlResourceLocation = std::optional;
diff --git a/src/odr/internal/html/pdf2htmlex_wrapper.cpp b/src/odr/internal/html/pdf2htmlex_wrapper.cpp
index baa4a836..da2b8bc6 100644
--- a/src/odr/internal/html/pdf2htmlex_wrapper.cpp
+++ b/src/odr/internal/html/pdf2htmlex_wrapper.cpp
@@ -4,7 +4,11 @@
 #include
 #include
 
+#include
+#include
+#include
 #include
+#include
 
 #include
 #include
@@ -12,13 +16,12 @@
 #include
 #include
 
-namespace odr::internal {
+namespace odr::internal::html {
 
-Html html::translate_poppler_pdf_file(const PopplerPdfFile &pdf_file,
-                                      const std::string &output_path,
-                                      const HtmlConfig &config) {
-  PDFDoc &pdf_doc = pdf_file.pdf_doc();
+namespace {
 
+pdf2htmlEX::Param create_params(PDFDoc &pdf_doc, const HtmlConfig &config,
+                                const std::string &output_path) {
   pdf2htmlEX::Param param;
 
   // pages
@@ -30,7 +33,7 @@ Html html::translate_poppler_pdf_file(const PopplerPdfFile &pdf_file,
   param.fit_width = 0;
   param.fit_height = 0;
   param.use_cropbox = 1;
-  param.desired_dpi = 144.0;
+  param.desired_dpi = 144;
 
   // output
   param.embed_css = 1;
@@ -40,9 +43,9 @@ Html html::translate_poppler_pdf_file(const PopplerPdfFile &pdf_file,
   param.embed_outline = 1;
   param.split_pages = 0;
   param.dest_dir = output_path;
-  param.css_filename = "";
-  param.page_filename = "";
-  param.outline_filename = "";
+  param.css_filename = "style.css";
+  param.page_filename = "page%i.html";
+  param.outline_filename = "outline.html";
   param.process_nontext = 1;
   param.process_outline = 1;
   param.process_annotation = 0;
@@ -50,6 +53,7 @@ Html html::translate_poppler_pdf_file(const PopplerPdfFile &pdf_file,
   param.printing = 1;
   param.fallback = 0;
   param.tmp_file_size_limit = -1;
+  param.delay_background = 0;
 
   // font
   param.embed_external_font = 0; // TODO 1
@@ -86,7 +90,7 @@ Html html::translate_poppler_pdf_file(const PopplerPdfFile &pdf_file,
 
   // misc
   param.clean_tmp = 1;
-  param.tmp_dir = "/tmp";
+  param.tmp_dir = output_path;
   param.data_dir = config.pdf2htmlex_data_path;
   param.poppler_data_dir = config.poppler_data_path;
   param.debug = 0;
@@ -97,6 +101,175 @@ Html html::translate_poppler_pdf_file(const PopplerPdfFile &pdf_file,
   param.input_filename = "";
   param.output_filename = "document.html";
 
+  return param;
+}
+
+} // namespace
+
+// HTML resource for the background image of a single PDF page. The image is
+// rendered on demand: `write_resource` asks pdf2htmlEX to render the page if
+// the image file does not exist yet and then streams that file.
+class BackgroundImageResource : public HtmlResource {
+public:
+  // Name of the background image for `page_number`, e.g. `bg1f.png` for page
+  // 31. The page number is written in hexadecimal only; NOTE(review): this is
+  // meant to match the `bg%x.<format>` names pdf2htmlEX uses - confirm.
+  static std::string file_name(std::size_t page_number,
+                               const std::string &format) {
+    std::stringstream stream;
+    stream << "bg";
+    stream << std::hex << page_number;
+    stream << "." << format;
+    return stream.str();
+  }
+
+  BackgroundImageResource(
+      PopplerPdfFile pdf_file, std::string output_path,
+      std::shared_ptr<pdf2htmlEX::HTMLRenderer> html_renderer,
+      std::shared_ptr<std::mutex> html_renderer_mutex, int page_number,
+      const std::string &format)
+      : HtmlResource(HtmlResourceType::image, file_name(page_number, format),
+                     output_path + "/" + file_name(page_number, format),
+                     odr::File(), false, false),
+        m_pdf_file{std::move(pdf_file)}, m_output_path{std::move(output_path)},
+        m_html_renderer{std::move(html_renderer)},
+        m_html_renderer_mutex{std::move(html_renderer_mutex)},
+        m_page_number{page_number} {}
+
+  void write_resource(std::ostream &os) const override {
+    PDFDoc &pdf_doc = m_pdf_file.pdf_doc();
+
+    std::lock_guard lock(m_mutex);
+
+    if (!std::filesystem::exists(path())) {
+      std::lock_guard renderer_lock(*m_html_renderer_mutex);
+
+      // NOTE(review): `globalParams` was reset by the factory at this point;
+      // confirm `renderPage` does not depend on it
+      m_html_renderer->renderPage(&pdf_doc, m_page_number);
+    }
+
+    {
+      // images are binary data; avoid newline translation on some platforms
+      std::ifstream in(path(), std::ios::binary);
+      util::stream::pipe(in, os);
+    }
+  }
+
+private:
+  PopplerPdfFile m_pdf_file;
+  std::string m_output_path;
+  std::shared_ptr<pdf2htmlEX::HTMLRenderer> m_html_renderer;
+  std::shared_ptr<std::mutex> m_html_renderer_mutex;
+  int m_page_number;
+  mutable std::mutex m_mutex;
+};
+
+// HTML service backed by a pdf2htmlEX conversion. The document is read from
+// `document.html` in the output path; one background image resource is
+// created per page.
+class HtmlServiceImpl : public HtmlService {
+public:
+  HtmlServiceImpl(PopplerPdfFile pdf_file, std::string output_path,
+                  std::shared_ptr<pdf2htmlEX::HTMLRenderer> html_renderer,
+                  std::shared_ptr<std::mutex> html_renderer_mutex,
+                  std::shared_ptr<pdf2htmlEX::Param> html_renderer_param,
+                  HtmlConfig config, HtmlResourceLocator resource_locator)
+      : HtmlService(std::move(config), std::move(resource_locator), {}),
+        m_pdf_file{std::move(pdf_file)}, m_output_path{std::move(output_path)},
+        m_html_renderer{std::move(html_renderer)},
+        m_html_renderer_mutex{std::move(html_renderer_mutex)},
+        m_html_renderer_param{std::move(html_renderer_param)} {
+    for (int i = 1; i <= m_pdf_file.pdf_doc().getNumPages(); ++i) {
+      auto resource = std::make_shared<BackgroundImageResource>(
+          m_pdf_file, m_output_path, m_html_renderer, m_html_renderer_mutex, i,
+          m_html_renderer_param->bg_format);
+      std::string file_name = BackgroundImageResource::file_name(
+          i, m_html_renderer_param->bg_format);
+      m_resources.emplace_back(std::move(resource), std::move(file_name));
+    }
+  }
+
+  HtmlResources write_document(HtmlWriter &out) const override {
+    // NOTE(review): always empty; `m_resources` is not returned - confirm
+    HtmlResources resources;
+
+    {
+      std::ifstream in(m_output_path + "/document.html");
+      util::stream::pipe(in, out.out());
+    }
+
+    return resources;
+  }
+
+private:
+  PopplerPdfFile m_pdf_file;
+  std::string m_output_path;
+  std::shared_ptr<pdf2htmlEX::HTMLRenderer> m_html_renderer;
+  std::shared_ptr<std::mutex> m_html_renderer_mutex;
+  std::shared_ptr<pdf2htmlEX::Param> m_html_renderer_param;
+
+  HtmlResources m_resources;
+};
+
+} // namespace odr::internal::html
+
+namespace odr::internal {
+
+// Runs pdf2htmlEX on `pdf_file` with `output_path` as destination and returns
+// a service for the result. Uses the parameters from `create_params` with
+// `embed_image` off and `delay_background` on. Throws
+// `DocumentCopyProtectedException` if copying is not ok and `no_drm` is 0.
+odr::HtmlService
+html::create_poppler_pdf_service(const PopplerPdfFile &pdf_file,
+                                 const std::string &output_path,
+                                 const HtmlConfig &config) {
+  PDFDoc &pdf_doc = pdf_file.pdf_doc();
+
+  auto html_renderer_param = std::make_shared<pdf2htmlEX::Param>(
+      create_params(pdf_doc, config, output_path));
+  html_renderer_param->embed_image = 0;
+  html_renderer_param->delay_background = 1;
+
+  if (!pdf_doc.okToCopy()) {
+    if (html_renderer_param->no_drm == 0) {
+      throw DocumentCopyProtectedException("");
+    }
+  }
+
+  globalParams = std::make_unique<GlobalParams>(
+      !html_renderer_param->poppler_data_dir.empty()
+          ? html_renderer_param->poppler_data_dir.c_str()
+          : nullptr);
+
+  // TODO not sure what the `progPath` is used for. it cannot be `nullptr`
+  // TODO potentially just a cache dir?
+  auto html_renderer = std::make_shared<pdf2htmlEX::HTMLRenderer>(
+      config.fontforge_data_path.c_str(), *html_renderer_param);
+  html_renderer->process(&pdf_doc);
+
+  globalParams.reset();
+
+  HtmlResourceLocator resource_locator =
+      local_resource_locator(output_path, config);
+
+  // renderer is not thread safe
+  // TODO check if this can be achieved in pdf2htmlEX
+  auto html_renderer_mutex = std::make_shared<std::mutex>();
+
+  return odr::HtmlService(std::make_shared<HtmlServiceImpl>(
+      pdf_file, output_path, std::move(html_renderer),
+      std::move(html_renderer_mutex), std::move(html_renderer_param), config,
+      resource_locator));
+}
+
+Html html::translate_poppler_pdf_file(const PopplerPdfFile &pdf_file,
+                                      const std::string &output_path,
+                                      const HtmlConfig &config) {
+  PDFDoc &pdf_doc = pdf_file.pdf_doc();
+
+  pdf2htmlEX::Param param = create_params(pdf_doc, config, output_path);
+
   if (!pdf_doc.okToCopy()) {
     if (param.no_drm == 0) {
       throw DocumentCopyProtectedException("");
diff --git a/src/odr/internal/html/pdf2htmlex_wrapper.hpp b/src/odr/internal/html/pdf2htmlex_wrapper.hpp
index 1d67e50f..30bd6c4a 100644
--- a/src/odr/internal/html/pdf2htmlex_wrapper.hpp
+++ b/src/odr/internal/html/pdf2htmlex_wrapper.hpp
@@ -15,7 +15,11 @@ class PopplerPdfFile;
 
 namespace odr::internal::html {
 
-HtmlService translate_document(const PopplerPdfFile &pdf_file);
+// Runs pdf2htmlEX on `pdf_file` with `output_path` as destination and returns
+// an HTML service for the result.
+odr::HtmlService create_poppler_pdf_service(const PopplerPdfFile &pdf_file,
+                                            const std::string &output_path,
+                                            const HtmlConfig &config);
 
 Html translate_poppler_pdf_file(const PopplerPdfFile &pdf_file,
                                 const std::string &output_path,
diff --git a/test/data/reference-output/odr-private b/test/data/reference-output/odr-private
index 118b6afa..145aa102 160000
--- a/test/data/reference-output/odr-private
+++ b/test/data/reference-output/odr-private
@@ -1 +1 @@
-Subproject commit 118b6afae107a2326f5eb70e3536e209751eb079
+Subproject commit 145aa102d6d6a87973f384381944cf2bb88f191f
diff --git a/test/data/reference-output/odr-public b/test/data/reference-output/odr-public
index 76d0a13e..82434a11 160000
--- a/test/data/reference-output/odr-public
+++ b/test/data/reference-output/odr-public
@@ -1 +1 @@
-Subproject commit 76d0a13e5d69081fc41cf2cfc296fb1bd85156f8
+Subproject commit 82434a1139a9597333af90f7a18d15fc04d02cf5
diff --git a/test/docker/compare_output_server.sh b/test/docker/compare_output_server.sh
index 57b2f2b0..c39f7942 100755
--- a/test/docker/compare_output_server.sh
+++ b/test/docker/compare_output_server.sh
@@ -16,5 +16,6 @@ fi
 docker run -ti \
   -v $(pwd):/repo \
   -p 8000:8000 \
+  --platform linux/amd64 \
   ghcr.io/opendocument-app/odr_core_test \
   python3 /repo/test/scripts/compare_output_server.py /repo/$REF /repo/$OBS --compare --driver $DRIVER --port 8000