chore!: rename all mentions of ingest to index (#130)
Swiftide is not an ingestion pipeline (loading data), but an indexing
pipeline (prepping for search).

There is now a temporary, deprecated re-export to match the previous api.
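The compatibility shim described above can be pictured roughly as a deprecated module alias; the module contents and deprecation note below are illustrative assumptions, not the actual Swiftide source:

```rust
// Hypothetical sketch of the deprecated re-export pattern described above.
// Module contents and the note text are illustrative assumptions,
// not the actual Swiftide code.
mod indexing {
    pub struct Pipeline;

    impl Pipeline {
        pub fn name() -> &'static str {
            "indexing::Pipeline"
        }
    }
}

// Keep the old module name alive as a thin alias so existing
// `ingestion::` paths still compile, with a deprecation warning.
#[deprecated(note = "`ingestion` has been renamed to `indexing`")]
mod ingestion {
    pub use super::indexing::*;
}

fn main() {
    // Old and new paths resolve to the same type.
    #[allow(deprecated)]
    let via_old = ingestion::Pipeline::name();
    assert_eq!(via_old, indexing::Pipeline::name());
}
```

With a shim like this, downstream code can migrate at its own pace while new code uses `indexing::` directly.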
timonv authored Jul 7, 2024
1 parent 9334934 commit 84dd65d
Showing 45 changed files with 479 additions and 465 deletions.
8 changes: 8 additions & 0 deletions .markdownlint.yaml
@@ -0,0 +1,8 @@
# configuration for https://github.com/DavidAnson/markdownlint

first-line-heading: false
no-inline-html: false
line-length: false

# to support repeated headers in the changelog
no-duplicate-heading: false
56 changes: 28 additions & 28 deletions CHANGELOG.md
@@ -11,9 +11,9 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0

- AWS bedrock support ([#92](https://github.com/bosun-ai/swiftide/pull/92))
- (readme): Add diagram to the readme ([#107](https://github.com/bosun-ai/swiftide/pull/107))
- - (ingestion_pipeline): Implement filter ([#109](https://github.com/bosun-ai/swiftide/pull/109))
- - (ingestion_pipeline): Splitting and merging streams
- - (ingestion_pipeline): Build a pipeline from a stream
+ - (indexing_pipeline): Implement filter ([#109](https://github.com/bosun-ai/swiftide/pull/109))
+ - (indexing_pipeline): Splitting and merging streams
+ - (indexing_pipeline): Build a pipeline from a stream
- (openai): Add tests for builder

### Changed
@@ -23,7 +23,7 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0

### Fixed

- - Fix oversight in ingestion pipeline tests
+ - Fix oversight in indexing pipeline tests
- (deps): Update rust crate text-splitter to 0.14.0 ([#105](https://github.com/bosun-ai/swiftide/pull/105))
- Replace unwrap with expect and add comment on panic
- (transformers): Fix too small chunks being retained and api
@@ -40,9 +40,9 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
- (ci): Add support for merge queues
- (ci): Add concurrency configuration
- (readme): Add diagram to the readme (#107)
- - (ingestion_pipeline): Implement filter (#109)
- - (ingestion_pipeline): Splitting and merging streams
- - (ingestion_pipeline): Build a pipeline from a stream
+ - (indexing_pipeline): Implement filter (#109)
+ - (indexing_pipeline): Splitting and merging streams
+ - (indexing_pipeline): Build a pipeline from a stream
- (openai): Add tests for builder

### Changed
@@ -55,7 +55,7 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0

### Fixed

- - Fix oversight in ingestion pipeline tests
+ - Fix oversight in indexing pipeline tests
- (ci): Fix release-plz changelog parsing
- (ci): Fix benchmarks in ci
- (deps): Update rust crate spider to v1.98.3 (#100)
@@ -92,14 +92,14 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0

### Added

- - (ingestion_stream): Implement into for Result<Vec<IngestionNode>>
+ - (indexing_stream): Implement into for Result<Vec<Node>>

### Changed

- Cleanup changelog
- Create CONTRIBUTING.md
- Readme updates
- - (ingestion_pipeline): Log_all combines other log helpers
+ - (indexing_pipeline): Log_all combines other log helpers
- Release

### Fixed
@@ -123,22 +123,22 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0

- (ci): Single changelog for all (future) crates in root (#57)
- (integrations): Support fastembed (#60)
- - (ingestion_pipeline): Optional error filtering and logging (#75)
- - (ingestion_pipeline): Implement throttling a pipeline (#77)
+ - (indexing_pipeline): Optional error filtering and logging (#75)
+ - (indexing_pipeline): Implement throttling a pipeline (#77)
- (integrations): Implement Persist for Redis (#80)
- - (ingestion_node): Add constructor with defaults
+ - (indexing_node): Add constructor with defaults
- (traits): Add automock for simpleprompt
- (transformers): Add transformers for title, summary and keywords
- (examples): Example for markdown with all metadata
- - (ingestion_node): Improved human readable Debug
- - (ingestion_stream): Improved stream developer experience (#81)
+ - (indexing_node): Improved human readable Debug
+ - (indexing_stream): Improved stream developer experience (#81)
- (loaders): File loader performance improvements
- (benchmarks): Add benchmark for the file loader
- (benchmarks): Add benchmark for simple local pipeline
- (loaders): Add scraping using `spider`
- (integrations,transformers): Add transformer for converting html to markdown
- (persist): In memory storage for testing, experimentation and debugging
- - (examples,scraping): Add example scraping and ingesting a url
+ - (examples,scraping): Add example scraping and indexing a url

### Changed

@@ -154,7 +154,7 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0

### Fixed

- - (ingestion_pipeline): Concurrency does not work when spawned (#76)
+ - (indexing_pipeline): Concurrency does not work when spawned (#76)

## [swiftide-v0.3.3] - 2024-06-16

@@ -193,10 +193,10 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0

### Added

- - (ingestion_pipeline): Support chained storage backends (#46)
- - (ingestion_pipeline): Concurrency improvements (#48)
+ - (indexing_pipeline): Support chained storage backends (#46)
+ - (indexing_pipeline): Concurrency improvements (#48)
- Configurable concurrency for transformers and chunkers (#47)
- - (ingestion_pipeline): Early return if any error encountered (#49)
+ - (indexing_pipeline): Early return if any error encountered (#49)

### Changed

@@ -223,19 +223,19 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
### Changed

- Release v0.1.0 (#8)
- - (swiftide): Documented file swiftide/src/ingestion/ingestion_pipeline.rs (#14)
- - (swiftide): Documented file swiftide/src/ingestion/ingestion_stream.rs (#16)
- - (swiftide): Documented file swiftide/src/ingestion/ingestion_node.rs (#15)
+ - (swiftide): Documented file swiftide/src/indexing/indexing_pipeline.rs (#14)
+ - (swiftide): Documented file swiftide/src/indexing/indexing_stream.rs (#16)
+ - (swiftide): Documented file swiftide/src/indexing/indexing_node.rs (#15)
- (swiftide): Documented file swiftide/src/integrations/openai/mod.rs (#21)
- (swiftide): Documented file swiftide/src/integrations/treesitter/splitter.rs (#30)
- (swiftide): Documented file swiftide/src/integrations/redis/node_cache.rs (#29)
- (swiftide): Documented file swiftide/src/integrations/qdrant/persist.rs (#24)
- (swiftide): Documented file swiftide/src/integrations/redis/mod.rs (#23)
- (swiftide): Documented file swiftide/src/integrations/qdrant/mod.rs (#22)
- - (swiftide): Documented file swiftide/src/integrations/qdrant/ingestion_node.rs (#20)
- - (swiftide): Documented file swiftide/src/ingestion/mod.rs (#28)
+ - (swiftide): Documented file swiftide/src/integrations/qdrant/indexing_node.rs (#20)
+ - (swiftide): Documented file swiftide/src/indexing/mod.rs (#28)
- (swiftide): Documented file swiftide/src/integrations/treesitter/supported_languages.rs (#26)
- - (swiftide): Documented file swiftide/tests/ingestion_pipeline.rs (#41)
+ - (swiftide): Documented file swiftide/tests/indexing_pipeline.rs (#41)
- (swiftide): Documented file swiftide/src/loaders/mod.rs (#40)
- (swiftide): Documented file swiftide/src/transformers/chunk_code.rs (#39)
- (swiftide): Documented file swiftide/src/transformers/metadata_qa_text.rs (#36)
@@ -255,15 +255,15 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0

### Added

- - Replace databuoy with new ingestion pipeline (#322)
+ - Replace databuoy with new indexing pipeline (#322)
- (fluyt/code_ops): Add languages to chunker and range for chunk size (#334)
- Add debug info to qdrant setup
- (fluyt): Add verbose log on checking if index exists
- (fluyt): Significant tracing improvements (#368)
- Add rust-toolchain on stable
- Fix build and add feature flags for all integrations
- (ci): Set up basic test and release actions (#1)
- - (ingestion_pipeline): Default concurrency is the number of cpus (#6)
+ - (indexing_pipeline): Default concurrency is the number of cpus (#6)
- (doc): Setup basic readme (#5)

### Changed
26 changes: 12 additions & 14 deletions README.md
@@ -45,10 +45,10 @@
<img src="https://github.com/bosun-ai/swiftide/blob/master/images/logo.png" alt="Logo" width="250" height="250">
</a>

<h3 align="center">Swiftide</h3>
<h3 align="center">Swiftide</h3>

<p align="center">
Blazing fast data pipelines for Retrieval Augmented Generation written in Rust
Blazing fast data pipelines for Retrieval Augmented Generation written in Rust
<br />
<a href="https://swiftide.rs"><strong>Explore the docs »</strong></a>
<br />
@@ -68,15 +68,15 @@ Blazing fast data pipelines for Retrieval Augmented Generation written in Rust

<!-- [![Product Name Screen Shot][product-screenshot]](https://example.com) -->

- **Swiftide** is a straightforward, easy-to-use, easy-to-extend asynchronous data ingestion and processing library. It is designed to be used in a RAG (Retrieval Augmented Generation) system. It is built to be fast and efficient, with a focus on parallel processing and asynchronous operations.
+ **Swiftide** is a straightforward, easy-to-use, easy-to-extend asynchronous data indexing and processing library. It is designed to be used in a RAG (Retrieval Augmented Generation) system. It is built to be fast and efficient, with a focus on parallel processing and asynchronous operations.

<div align="center">
<a href="https://github.com/bosun-ai/swiftide">
<img src="https://github.com/bosun-ai/swiftide/blob/master/images/rag-dark.svg" alt="RAG" width="100%" >
</a>
</div>

- While working with other Python-based tooling, frustrations arose around performance, stability, and ease of use. Thus, Swiftide was born. Ingestion performance went from multiple tens of minutes to a few seconds.
+ While working with other Python-based tooling, frustrations arose around performance, stability, and ease of use. Thus, Swiftide was born. Indexing performance went from multiple tens of minutes to a few seconds.

Part of the [bosun.ai](https://bosun.ai) project. An upcoming platform for autonomous code improvement.

@@ -87,7 +87,7 @@ We <3 feedback: project ideas, suggestions, and complaints are very welcome. Fee
## Example

```rust
- IngestionPipeline::from_loader(FileLoader::new(".").with_extensions(&["rs"]))
+ indexing::Pipeline::from_loader(FileLoader::new(".").with_extensions(&["rs"]))
.filter_cached(Redis::try_from_url(
redis_url,
"swiftide-examples",
@@ -113,7 +113,7 @@

## Features

- - Extremely fast streaming ingestion pipeline with async, parallel processing
+ - Extremely fast streaming indexing pipeline with async, parallel processing
- Integrations with OpenAI, Redis, Qdrant, FastEmbed, and Treesitter
- A variety of loaders, transformers, and embedders and other common, generic tools
- Bring your own transformers by extending straightforward traits
@@ -124,7 +124,7 @@ IngestionPipeline::from_loader(FileLoader::new(".").with_extensions(&["rs"]))

## Vision

- Our goal is to create an extremely fast, extendable platform for data ingestion and querying to further the development of automated LLM applications, with an easy-to-use and easy-to-extend api.
+ Our goal is to create an extremely fast, extendable platform for data indexing and querying to further the development of automated LLM applications, with an easy-to-use and easy-to-extend api.

<p align="right">(<a href="#readme-top">back to top</a>)</p>

@@ -144,9 +144,11 @@ Other integrations will need to be installed accordingly.

1. Set up a new Rust project
2. Add swiftide

```sh
cargo add swiftide
```

3. Enable the features of integrations you would like to have or use 'all' in your `Cargo.toml`
4. Write a pipeline (see our examples and documentation)
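For step 3, the feature selection might look like this in `Cargo.toml`; the wildcard version is a placeholder (check crates.io for the current release), and `all` is the catch-all mentioned above:

```toml
[dependencies]
# "all" enables every integration; list individual features to slim the build.
swiftide = { version = "*", features = ["all"] }
```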

@@ -158,13 +160,13 @@ Other integrations will need to be installed accordingly.

Before building your stream, you need to enable and configure any integrations required. See /examples.

- A stream starts with a Loader that emits IngestionNodes. For instance, with the Fileloader each file is a Node.
+ A stream starts with a Loader that emits Nodes. For instance, with the Fileloader each file is a Node.

You can then slice and dice, augment, and filter nodes. Each different kind of step in the pipeline requires different traits. This enables extension.

- IngestionNodes have a path, chunk and metadata. Currently metadata is copied over when chunking and _always_ embedded when using the OpenAIEmbed transformer.
+ Nodes have a path, chunk and metadata. Currently metadata is copied over when chunking and _always_ embedded when using the OpenAIEmbed transformer.
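As a rough illustration of that shape — the field names mirror the sentence above, but the concrete types are assumptions and differ from Swiftide's real `Node`:

```rust
use std::collections::HashMap;

// Illustrative only: Swiftide's actual Node has more fields and richer types.
#[derive(Debug, Clone, Default)]
struct Node {
    path: String,
    chunk: String,
    metadata: HashMap<String, String>,
}

fn main() {
    let mut node = Node {
        path: "README.md".into(),
        chunk: "# Swiftide".into(),
        ..Default::default()
    };
    // Metadata attached before chunking travels with every derived chunk.
    node.metadata.insert("language".into(), "markdown".into());
    assert_eq!(node.metadata.get("language").map(String::as_str), Some("markdown"));
}
```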

- - **from_loader** `(impl Loader)` starting point of the stream, creates and emits IngestionNodes
+ - **from_loader** `(impl Loader)` starting point of the stream, creates and emits Nodes
- **filter_cached** `(impl NodeCache)` filters cached nodes
- **then** `(impl Transformer)` transforms the node and puts it on the stream
- **then_in_batch** `(impl BatchTransformer)` transforms multiple nodes and puts them on the stream
@@ -225,12 +227,8 @@ Distributed under the MIT License. See `LICENSE` for more information.

[contributors-shield]: https://img.shields.io/github/contributors/bosun-ai/swiftide.svg?style=flat-square
[contributors-url]: https://github.com/bosun-ai/swiftide/graphs/contributors
[forks-shield]: https://img.shields.io/github/forks/bosun-ai/swiftide.svg?style=flat-square
[forks-url]: https://github.com/bosun-ai/swiftide/network/members
[stars-shield]: https://img.shields.io/github/stars/bosun-ai/swiftide.svg?style=flat-square
[stars-url]: https://github.com/bosun-ai/swiftide/stargazers
[issues-shield]: https://img.shields.io/github/issues/bosun-ai/swiftide.svg?style=flat-square
[issues-url]: https://github.com/bosun-ai/swiftide/issues
[license-shield]: https://img.shields.io/github/license/bosun-ai/swiftide.svg?style=flat-square
[license-url]: https://github.com/bosun-ai/swiftide/blob/master/LICENSE.txt
[linkedin-shield]: https://img.shields.io/badge/-LinkedIn-black.svg?style=flat-square&logo=linkedin&colorB=555
2 changes: 1 addition & 1 deletion benchmarks/Cargo.toml
@@ -19,6 +19,6 @@ path = "fileloader.rs"
harness = false

[[bench]]
- name = "ingest-readme-local"
+ name = "index-readme-local"
path = "local_pipeline.rs"
harness = false
2 changes: 1 addition & 1 deletion benchmarks/fileloader.rs
@@ -1,7 +1,7 @@
use anyhow::Result;
use criterion::{black_box, criterion_group, criterion_main, Criterion};
use futures_util::stream::TryStreamExt;
- use swiftide::{ingestion::StreamExt, traits::Loader};
+ use swiftide::{indexing::StreamExt, traits::Loader};

async fn run_fileloader(num_files: usize) -> Result<usize> {
let mut total_nodes = 0;
4 changes: 2 additions & 2 deletions benchmarks/local_pipeline.rs
@@ -1,15 +1,15 @@
use anyhow::Result;
use criterion::{criterion_group, criterion_main, Criterion};
use swiftide::{
-     ingestion::IngestionPipeline,
+     indexing::Pipeline,
integrations::fastembed::FastEmbed,
loaders::FileLoader,
persist::MemoryStorage,
transformers::{ChunkMarkdown, Embed},
};

async fn run_pipeline() -> Result<()> {
-     IngestionPipeline::from_loader(FileLoader::new("README.md").with_extensions(&["md"]))
+     Pipeline::from_loader(FileLoader::new("README.md").with_extensions(&["md"]))
.then_chunk(ChunkMarkdown::from_chunk_range(20..256))
.then_in_batch(10, Embed::new(FastEmbed::builder().batch_size(10).build()?))
.then_store_with(MemoryStorage::default())
16 changes: 8 additions & 8 deletions examples/Cargo.toml
@@ -16,24 +16,24 @@ serde_json = "1.0"
spider = "1.98"

[[example]]
- name = "ingest-codebase"
- path = "ingest_codebase.rs"
+ name = "index-codebase"
+ path = "index_codebase.rs"

[[example]]
name = "fastembed"
path = "fastembed.rs"

[[example]]
- name = "ingest-redis"
- path = "ingest_into_redis.rs"
+ name = "index-redis"
+ path = "index_into_redis.rs"

[[example]]
- name = "ingest-markdown-metadata"
- path = "ingest_markdown_lots_of_metadata.rs"
+ name = "index-markdown-metadata"
+ path = "index_markdown_lots_of_metadata.rs"

[[example]]
- name = "scraping-ingest"
- path = "scraping_ingest_to_markdown.rs"
+ name = "scraping-index"
+ path = "scraping_index_to_markdown.rs"

[[example]]
name = "aws-bedrock"
6 changes: 2 additions & 4 deletions examples/aws_bedrock.rs
@@ -12,9 +12,7 @@
//! [examples]: https://github.com/bosun-ai/swiftide/blob/master/examples
//! [AWS Bedrock documentation]: https://docs.aws.amazon.com/bedrock/
- use swiftide::{
-     ingestion, integrations, loaders::FileLoader, persist::MemoryStorage, transformers,
- };
+ use swiftide::{indexing, integrations, loaders::FileLoader, persist::MemoryStorage, transformers};

#[tokio::main]
async fn main() -> Result<(), Box<dyn std::error::Error>> {
@@ -27,7 +25,7 @@ async fn main() -> Result<(), Box<dyn std::error::Error>> {

let memory_storage = MemoryStorage::default();

-     ingestion::IngestionPipeline::from_loader(FileLoader::new("./README.md"))
+     indexing::Pipeline::from_loader(FileLoader::new("./README.md"))
.log_nodes()
.then_chunk(transformers::ChunkMarkdown::from_chunk_range(100..512))
.then(transformers::MetadataSummary::new(aws_bedrock.clone()))
8 changes: 4 additions & 4 deletions examples/fastembed.rs
@@ -1,6 +1,6 @@
- //! # [Swiftide] Ingesting the Swiftide itself example
+ //! # [Swiftide] Indexing the Swiftide itself example
//!
- //! This example demonstrates how to ingest the Swiftide codebase itself using FastEmbed.
+ //! This example demonstrates how to index the Swiftide codebase itself using FastEmbed.
//!
//! The pipeline will:
//! - Load all `.rs` files from the current directory
@@ -11,7 +11,7 @@
//! [examples]: https://github.com/bosun-ai/swiftide/blob/master/examples
use swiftide::{
-     ingestion,
+     indexing,
integrations::{fastembed::FastEmbed, qdrant::Qdrant},
loaders::FileLoader,
transformers::Embed,
@@ -26,7 +26,7 @@ async fn main() -> Result<(), Box<dyn std::error::Error>> {
.unwrap_or("http://localhost:6334")
.to_owned();

-     ingestion::IngestionPipeline::from_loader(FileLoader::new(".").with_extensions(&["rs"]))
+     indexing::Pipeline::from_loader(FileLoader::new(".").with_extensions(&["rs"]))
.then_in_batch(10, Embed::new(FastEmbed::builder().batch_size(10).build()?))
.then_store_with(
Qdrant::try_from_url(qdrant_url)?