-
Notifications
You must be signed in to change notification settings - Fork 5
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
- Loading branch information
1 parent
64a56c0
commit 9793347
Showing
3 changed files
with
201 additions
and
7 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
61 changes: 61 additions & 0 deletions
61
packages/ragbits-document-search/src/ragbits/document_search/documents/source_resolver.py
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,61 @@ | ||
from collections.abc import Sequence | ||
from typing import ClassVar | ||
|
||
from ragbits.document_search.documents.sources import Source | ||
|
||
|
||
class SourceResolver:
    """Registry for source URI protocols and their handlers.

    This class provides a mechanism to register and resolve different source
    protocols (like 'file://', 'gcs://', etc.) to their corresponding Source
    implementations.

    The registry is stored on the class itself, so every registration is
    shared by all users of ``SourceResolver``.

    Example:
        >>> SourceResolver.register_protocol("gcs", GCSSource)
        >>> sources = SourceResolver.resolve("gcs://my-bucket/path/to/files/*")
    """

    # Maps a protocol name (the part of the URI before "://") to its handler class.
    _protocol_handlers: ClassVar[dict[str, type[Source]]] = {}

    @classmethod
    def register_protocol(cls, protocol: str, source_class: type[Source]) -> None:
        """Register a source class for a specific protocol.

        Registering a protocol that is already present replaces the previous
        handler.

        Args:
            protocol: The protocol identifier (e.g., 'file', 'gcs', 's3')
            source_class: The Source subclass that handles this protocol
        """
        cls._protocol_handlers[protocol] = source_class

    @classmethod
    def resolve(cls, uri: str) -> Sequence[Source]:
        """Resolve a URI into a sequence of Source objects.

        The URI format should be: protocol://path

        For example:
            - file:///path/to/files/*
            - gcs://bucket/prefix/*
            - s3://bucket/prefix/*

        Only the first "://" separates the protocol from the path; any later
        occurrence is passed to the handler as part of the path.

        Args:
            uri: The URI to resolve

        Returns:
            The result of calling ``from_uri(path)`` on the handler class
            registered for the URI's protocol.

        Raises:
            ValueError: If the URI format is invalid or the protocol is not supported
        """
        # partition() reports a missing separator as an empty middle element, so
        # the error below is raised directly instead of being chained onto an
        # unrelated tuple-unpacking ValueError.
        protocol, separator, path = uri.partition("://")
        if not separator:
            raise ValueError(f"Invalid URI format: {uri}. Expected format: protocol://path")

        handler_class = cls._protocol_handlers.get(protocol)
        if handler_class is None:
            supported = ", ".join(sorted(cls._protocol_handlers))
            raise ValueError(
                f"Unsupported protocol: {protocol}. "
                f"Supported protocols are: {supported}"
            )

        return handler_class.from_uri(path)
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters