Skip to content

Commit

Permalink
Add spacy-layout [ci skip]
Browse files Browse the repository at this point in the history
  • Loading branch information
ines committed Nov 19, 2024
1 parent 3ecec13 commit 3e30b5b
Show file tree
Hide file tree
Showing 2 changed files with 44 additions and 2 deletions.
42 changes: 42 additions & 0 deletions website/meta/universe.json
Original file line number Diff line number Diff line change
Expand Up @@ -1394,6 +1394,48 @@
"website": "https://ines.io"
}
},
{
"id": "spacy-layout",
"slogan": "Process PDFs, Word documents and more with spaCy",
"github": "explosion/spacy-layout",
"description": "This plugin integrates with [Docling](https://ds4sd.github.io/docling/) to bring structured processing of PDFs, Word documents and other input formats to your spaCy pipeline. It outputs clean, structured data in a text-based format and creates spaCy's familiar `Doc` objects that let you access labelled text spans like sections, headings, or footnotes.\n\nThis workflow makes it easy to apply powerful NLP techniques to your documents, including linguistic analysis, named entity recognition, text classification and more. It's also great for implementing chunking for RAG pipelines.",
"pip": "spacy-layout",
"category": [
"pipeline"
],
"code_example": [
"import spacy",
"from spacy_layout import spaCyLayout",
"",
"nlp = spacy.blank(\"en\")",
"layout = spaCyLayout(nlp)",
"",
"# Process a document and create a spaCy Doc object",
"doc = layout(\"./starcraft.pdf\")",
"",
"# The text-based contents of the document",
"print(doc.text)",
"# Document layout including pages and page sizes",
"print(doc._.layout)",
"",
"# Layout spans for different sections",
"for span in doc.spans[\"layout\"]:",
"    # Document section and token and character offsets into the text",
"    print(span.text, span.start, span.end, span.start_char, span.end_char)",
"    # Section type, e.g. \"text\", \"title\", \"section_header\" etc.",
"    print(span.label_)",
"    # Layout features of the section, including bounding box",
"    print(span._.layout)",
"    # Closest heading to the span (accuracy depends on document structure)",
"    print(span._.heading)"
],
"author": "Ines Montani",
"author_links": {
"twitter": "_inesmontani",
"github": "ines",
"website": "https://ines.io"
}
},
{
"id": "spacyopentapioca",
"title": "spaCyOpenTapioca",
Expand Down
4 changes: 2 additions & 2 deletions website/src/templates/index.js
Original file line number Diff line number Diff line change
Expand Up @@ -58,8 +58,8 @@ const AlertSpace = ({ nightly, legacy }) => {
}

const navAlert = (
<Link to="https://explosion.ai/blog/sp-global-commodities" noLinkLayout>
💥 <strong>New:</strong> Case study with S&P Global
<Link to="https://github.com/explosion/spacy-layout" noLinkLayout>
💥 <strong>New:</strong> spaCy for PDFs and Word docs
</Link>
)

Expand Down

0 comments on commit 3e30b5b

Please sign in to comment.