feat(slb-495): ai content import #395

Open · wants to merge 4 commits into base: release
24 changes: 17 additions & 7 deletions .lagoon/Dockerfile
@@ -15,10 +15,10 @@ RUN npm install -g [email protected] && pnpm config set store-dir /tmp/cache/pnpm
COPY pnpm-lock.yaml .npmrc /app/
# COPY patches /app/patches
RUN --mount=type=cache,target=/tmp/cache pnpm fetch && \
# There is a bug in pnpm: `pnpm fetch` creates _some_ node_modules folders
# with _some_ packages. This can lead to an incomplete package installation.
# So we remove them now.
find . -name 'node_modules' -type d -prune -exec rm -rf '{}' +

# Install composer dependencies.
# They may contain directive definitions required by prep scripts.
@@ -43,9 +43,9 @@ ENV VITE_DECAP_BRANCH="$LAGOON_GIT_BRANCH"
# Copy the all package sources, install and prepare them.
COPY . /app
RUN --mount=type=cache,target=/tmp/cache pnpm i && \
pnpm turbo:prep && \
# Remove all node_modules to reduce the size of the image.
find . -name 'node_modules' -type d -prune -exec rm -rf '{}' +

# Deploy apps.
RUN --mount=type=cache,target=/tmp/cache pnpm deploy --filter "@custom/cms" /tmp/.deploy/cms --prod
@@ -124,3 +124,13 @@ ARG LAGOON_GIT_BRANCH
ENV VITE_DECAP_BRANCH="$LAGOON_GIT_BRANCH"

CMD pnpm publisher

# ====================================================================================================
# CONVERTER IMAGE
# ====================================================================================================

FROM uselagoon/node-18 as convertmd

RUN npm install -g [email protected]
COPY --from=builder /tmp/.deploy/converter /app
CMD pnpm start
107 changes: 107 additions & 0 deletions apps/converter/README.md
@@ -0,0 +1,107 @@
# Silverback Converter

The converter is a Node.js application designed to convert documents from
various formats (DocX, PDF, and HTML) into Markdown.

This tool is particularly useful for developers and content creators who need to
transform documents into a format suitable for further processing, analysis, or
integration with other systems.

## Features

- **DocX to Markdown**: Convert Word documents (`.docx`) to Markdown.
- **PDF to Markdown**: Convert PDF files to Markdown.
- **HTML to Markdown**: Extract main content from web pages and convert it to
Markdown.
- **Jina AI Integration**: Fetch and convert content using the Jina AI API.
(ATTENTION: EXPERIMENTAL, DO NOT USE THIS)

## Setup and Installation

### Prerequisites

- Node.js (version 18 or higher)
- npm (Node Package Manager)

### Installation

1. **Install dependencies**:

```bash
npm i
```

2. **Set up environment variables** (optional):
- Create a `.env` file in the root directory.
- Add your Jina AI API key if you plan to use the Jina AI integration:
```env
JINA_AI_API_KEY=your_jina_ai_api_key
```

### Running the Application

To start the application, run the following command:

```bash
npm start
```

The server will start on `http://localhost:3000`.

## Usage

### Endpoints

- **Convert DocX to Markdown**:

```
GET /convert?path=/path/to/your/document.docx
```

- **Convert PDF to Markdown**:

```
GET /pdf-convert?path=/path/to/your/document.pdf
```

- **Convert HTML to Markdown**:

```
GET /html-convert?path=https://example.com
```

- **Fetch and Convert Content with Jina AI**:

```
GET /jina-convert?path=https://example.com
```

### Example

To convert a Word document to Markdown, make a GET request to:

```
http://localhost:3000/convert?path=/path/to/your/document.docx
```

The response will include the converted Markdown content, the output directory,
and any warnings generated during the conversion process.
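
A minimal sketch of calling the converter from Node 18+ (which ships a global
`fetch`). The endpoint URL is taken from the list above; the response fields
are an assumption based on what `htmlToMarkdown.js` returns (`markdownPath`,
`warnings`, `outputDir`) and may differ per endpoint.

```js
// Hypothetical client call against a locally running converter instance.
const endpoint =
  'http://localhost:3000/html-convert?path=' +
  encodeURIComponent('https://example.com');

const response = await fetch(endpoint);
const result = await response.json();

// Assumed fields, mirroring the return value of htmlToMarkdown():
console.log(result.markdownPath); // e.g. /tmp/converted/<hash>/content.md
console.log(result.outputDir); // folder holding content.md and images/
console.log(result.warnings); // validation warnings, if any
```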

## Configuration

- **Output Directory**: By default, converted files are saved in a directory
named after the input file's hash (see the sketch after this list). You can
customize the output directory by modifying the `outputDir` variable in the
respective conversion scripts.
- **Image Handling**: Images extracted from documents are saved in an `images`
subdirectory within the output directory.
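
The folder-name derivation itself is not part of this change set (it lives in
`generateFolderName()` in `utils/utils.js`), so the following is only a
hypothetical illustration of hash-based naming, not the actual helper.

```js
import crypto from 'crypto';

// Hypothetical sketch: derive a stable, filesystem-safe folder name by
// hashing the input path or URL. The real generateFolderName() may differ.
function folderNameFor(input) {
  return crypto.createHash('sha256').update(input).digest('hex').slice(0, 12);
}
```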

## Dependencies

The application relies on several npm packages, including:

- `mammoth` for DocX conversion
- `@opendocsg/pdf2md` for PDF conversion
- `@extractus/article-extractor` for HTML content extraction
- `turndown` for HTML to Markdown conversion
- `express` for the server

For a complete list of dependencies, refer to the `package.json` file.
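
As an illustration of how these pieces fit together, here is a minimal sketch
of the DocX path using the standard `mammoth` and `turndown` APIs with default
options. It is not the converter's actual implementation (the DocX handler is
not part of this diff), just a grounded example of the two libraries.

```js
import mammoth from 'mammoth';
import TurndownService from 'turndown';

// Minimal sketch: .docx -> HTML (mammoth) -> Markdown (turndown).
async function docxToMarkdown(docxPath) {
  // mammoth converts the document body to HTML and reports any messages.
  const { value: html, messages } = await mammoth.convertToHtml({
    path: docxPath,
  });

  // turndown converts that HTML into Markdown.
  const markdown = new TurndownService().turndown(html);

  return { markdown, warnings: messages };
}
```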
216 changes: 216 additions & 0 deletions apps/converter/htmlToMarkdown.js
@@ -0,0 +1,216 @@
import { extract } from '@extractus/article-extractor';
import crypto from 'crypto';
import fs from 'fs-extra';
import imageType from 'image-type';
import { JSDOM } from 'jsdom';
import { applyFixes } from 'markdownlint';
import { lint as lintSync } from 'markdownlint/sync';
import fetch from 'node-fetch';
import path from 'path';
import { fileURLToPath } from 'url';

import {
convertToMarkdown,
generateFolderName,
validateAndFixMarkdown,
} from './utils/utils.js';

/**
* Extracts images from markdown content while preserving their positions
* @param {string} markdown - Original markdown content
* @returns {{cleanMarkdown: string, extractedImages: Array<{alt: string, url: string, position: number, placeholder: string}>}}
*/
function extractImagesWithPositions(markdown) {
const imageRegex = /!\[(.*?)\]\((.*?)\)/g;
const extractedImages = [];
let match;
let cleanMarkdown = markdown;
let index = 0;

while ((match = imageRegex.exec(markdown)) !== null) {
const placeholder = `__IMAGE_PLACEHOLDER_${index}__`;
extractedImages.push({
alt: match[1] || '',
url: match[2],
position: match.index,
placeholder,
});
index++;
}

// Replace images with placeholders
extractedImages.forEach((image) => {
cleanMarkdown = cleanMarkdown.replace(
`![${image.alt}](${image.url})`,
image.placeholder,
);
});

return {
cleanMarkdown,
extractedImages,
};
}

/**
* Reinserts images just above their original link positions
* @param {string} markdown - Markdown content with placeholders
* @param {Array<{alt: string, url: string, placeholder: string}>} images - Extracted images
* @returns {string} - Markdown with images reinserted
*/
function reinsertImages(markdown, images) {
let result = markdown;

// Sort images by their position in reverse order to maintain correct positions
const sortedImages = [...images].sort((a, b) => b.position - a.position);

for (const image of sortedImages) {
const imageMarkdown = `![${image.alt}](${image.url})\n\n`;
const placeholderPosition = result.indexOf(image.placeholder);

if (placeholderPosition !== -1) {
// Find the start of the line containing the placeholder
let lineStart = result.lastIndexOf('\n', placeholderPosition);
lineStart = lineStart === -1 ? 0 : lineStart + 1;

// Insert the image above the line containing the placeholder
result =
result.slice(0, lineStart) + imageMarkdown + result.slice(lineStart);

// Remove the placeholder
result = result.replace(image.placeholder, '');
}
}

// Clean up any double blank lines created during the process
result = result.replace(/\n{3,}/g, '\n\n');

return result.trim();
}

// @todo Fix this to work locally and live
const isLagoon = !!process.env.LAGOON;
const __dirname = isLagoon
? '/app/web/sites/default/files/converted'
: '/tmp/converted';

async function extractMainContentFromUrl(url) {
try {
const mainContent = await extract(url);
return mainContent ? mainContent.content : '';
} catch (err) {
console.error(err);
}
return '';
}

async function getImageExtension(buffer) {
const type = await imageType(buffer);
return type ? `.${type.ext}` : '.png';
}

async function downloadImage(url) {
try {
const response = await fetch(url);
if (!response.ok)
throw new Error(`Failed to fetch image: ${response.statusText}`);
return Buffer.from(await response.arrayBuffer());
} catch (error) {
console.warn(
`Warning: Failed to download image from ${url}:`,
error.message,
);
return null;
}
}

function isValidUrl(string) {
try {
new URL(string);
return true;
} catch (_) {
return false;
}
}

export async function htmlToMarkdown(url) {
if (!isValidUrl(url)) {
throw new Error('Invalid URL provided: ' + url);
}

const html = await extractMainContentFromUrl(url);
// Generate folder name based on HTML content
const folderName = generateFolderName(url);
const outputDir = path.join(__dirname, folderName);
const imagesDir = path.join(outputDir, 'images');

await fs.ensureDir(outputDir);
await fs.ensureDir(imagesDir);

// Parse HTML using JSDOM
const dom = new JSDOM(html);
const document = dom.window.document;

// Process images before conversion
const images = document.querySelectorAll('img');
const imageMap = new Map();

for (const img of images) {
const srcAttribute = img.getAttribute('src');
if (!srcAttribute) continue;

// Resolve relative URLs to absolute URLs
const absoluteUrl = new URL(srcAttribute, url).href;

const imageBuffer = await downloadImage(absoluteUrl);
if (!imageBuffer) continue;

const extension = await getImageExtension(imageBuffer);
const filename = `image-${crypto.randomBytes(4).toString('hex')}${extension}`;
const imagePath = path.join(imagesDir, filename);

await fs.writeFile(imagePath, imageBuffer);
imageMap.set(srcAttribute, path.join('images', filename));
img.setAttribute('src', path.join('images', filename));
}

// Convert to Markdown
let markdown = convertToMarkdown(document.body);

// Clean up the markdown
markdown = markdown
.replace(/\n\s*\n\s*\n/g, '\n\n')
.replace(/!\[\]\(/g, '![image](')
.trim();

// Lint the generated markdown and auto-apply any fixes markdownlint can make.
const results = lintSync({ strings: { content: markdown } });
const fixed = applyFixes(markdown, results.content);
const { markdown: fixedMarkdown, warnings } = validateAndFixMarkdown(fixed);

// Pull images out as placeholders, then reinsert each one on its own line
// just above the position where it was referenced.
const { cleanMarkdown, extractedImages } =
extractImagesWithPositions(fixedMarkdown);
const correctedMarkdown = reinsertImages(cleanMarkdown, extractedImages);

const fixEmptyMarkdownLinks = (markdown) => {
// Regular expression to match markdown links with empty URL but with title
// Captures: []("title")
const emptyLinkRegex = /\[\]\(([^)]+)\s+"([^"]+)"\)/g;

// Replace empty links with their title text as link text
return markdown.replace(emptyLinkRegex, (match, url, title) => {
return `[${title}](${url} "${title}")`;
});
};

const fixedLinksMarkdown = fixEmptyMarkdownLinks(correctedMarkdown);

// Save markdown file
const mdPath = path.join(outputDir, 'content.md');
await fs.writeFile(mdPath, fixedLinksMarkdown);

return {
markdownPath: mdPath,
warnings: warnings,
outputDir,
};
}