Skip to content

Chunker

Chunking

Bases: Task

Task for chunking documents using different strategies.

This task acts as a wrapper around specific chunker implementations, allowing for flexible configuration based on the provided chunker object or interval.

Source code in sieves/tasks/preprocessing/chunking/core.py
 19
 20
 21
 22
 23
 24
 25
 26
 27
 28
 29
 30
 31
 32
 33
 34
 35
 36
 37
 38
 39
 40
 41
 42
 43
 44
 45
 46
 47
 48
 49
 50
 51
 52
 53
 54
 55
 56
 57
 58
 59
 60
 61
 62
 63
 64
 65
 66
 67
 68
 69
 70
 71
 72
 73
 74
 75
 76
 77
 78
 79
 80
 81
 82
 83
 84
 85
 86
 87
 88
 89
 90
 91
 92
 93
 94
 95
 96
 97
 98
 99
100
101
102
103
class Chunking(Task):
    """Task for chunking documents using different strategies.

    This task acts as a wrapper around specific chunker implementations,
    allowing for flexible configuration based on the provided chunker object or interval.
    """

    def __init__(
        self,
        chunker: _ChunkerArgType,
        task_id: str | None = None,
        show_progress: bool = True,
        include_meta: bool = False,
    ):
        """Initialize the Chunker task.

        :param chunker: The chunker instance (chonkie.BaseChunker) or the interval (int) for NaiveChunker.
        :param task_id: Task ID.
        :param show_progress: Whether to show progress bar for processed documents.
        :param include_meta: Whether to include meta information generated by the task.
        """
        super().__init__(task_id=task_id, show_progress=show_progress, include_meta=include_meta)
        self._chunker_arg = chunker
        self._task = self._init_chunker_task()

    def _init_chunker_task(self) -> _ChunkerType:
        """Initialize the specific chunker task based on the type of _chunker_arg.

        :return: Initialized chunker task instance.
        :raises TypeError: If the type of _chunker_arg is not supported.
        """
        chunker_task: _ChunkerType

        match self._chunker_arg:
            case chunker if isinstance(chunker, chonkie.BaseChunker):
                chunker_task = chunking.chonkie_.Chonkie(
                    chunker=chunker,
                    task_id=self.id,
                    show_progress=self._show_progress,
                    include_meta=self._include_meta,
                )
            case interval if isinstance(interval, int):
                chunker_task = chunking.naive.NaiveChunker(
                    interval=interval,
                    task_id=self.id,
                    show_progress=self._show_progress,
                    include_meta=self._include_meta,
                )
            case _:
                raise TypeError(
                    f"Unsupported type for 'chunker' argument: {type(self._chunker_arg)}. "
                    f"Expected chonkie.BaseChunker or int."
                )

        return chunker_task

    def __call__(self, docs: Iterable[Doc]) -> Iterable[Doc]:
        """Process documents by chunking their text.

        :param docs: Documents to process.
        :return: Processed documents with chunks added.
        """
        docs_iters = itertools.tee(docs, 2)
        assert all(doc.text for doc in docs_iters[0]), ValueError("Documents have to have a value for .text.")
        yield from self._task(docs_iters[1])

    @property
    def _state(self) -> dict[str, Any]:
        """Return attributes to serialize.
        :return: Dict of attributes to serialize.
        """
        return {
            **super()._state,
            "chunker": self._chunker_arg,
        }

    @classmethod
    def deserialize(cls, config: Config, **kwargs: dict[str, Any]) -> Chunking:
        """
        Generate Chunker instance from config.
        :param config: Config to generate instance from.
        :param kwargs: Values to inject into loaded config.
        :return: Deserialized Chunker instance.
        """
        return cls(**config.to_init_dict(cls, **kwargs))

id property

Returns task ID. Used by pipeline for results and dependency management.

Returns:

Type Description
str

Task ID.

__call__(docs)

Process documents by chunking their text.

Parameters:

Name Type Description Default
docs Iterable[Doc]

Documents to process.

required

Returns:

Type Description
Iterable[Doc]

Processed documents with chunks added.

Source code in sieves/tasks/preprocessing/chunking/core.py
75
76
77
78
79
80
81
82
83
def __call__(self, docs: Iterable[Doc]) -> Iterable[Doc]:
    """Process documents by chunking their text.

    This method is a generator, so validation runs when iteration starts. All documents are
    inspected before the first result is yielded.

    :param docs: Documents to process.
    :return: Processed documents with chunks added.
    :raises ValueError: If any document has no (or an empty) value for .text.
    """
    # Duplicate the stream: one copy is consumed by the validation pass, the other is handed to the
    # wrapped task. tee() buffers the items the second copy has not read yet.
    docs_to_validate, docs_to_chunk = itertools.tee(docs, 2)
    # Raise explicitly instead of using `assert`: assertions are removed under `python -O` and would
    # surface as AssertionError rather than ValueError.
    if not all(doc.text for doc in docs_to_validate):
        raise ValueError("Documents have to have a value for .text.")
    yield from self._task(docs_to_chunk)

__init__(chunker, task_id=None, show_progress=True, include_meta=False)

Initialize the Chunking task.

Parameters:

Name Type Description Default
chunker _ChunkerArgType

The chunker instance (chonkie.BaseChunker) or the interval (int) for NaiveChunker.

required
task_id str | None

Task ID.

None
show_progress bool

Whether to show progress bar for processed documents.

True
include_meta bool

Whether to include meta information generated by the task.

False
Source code in sieves/tasks/preprocessing/chunking/core.py
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
def __init__(
    self,
    chunker: _ChunkerArgType,
    task_id: str | None = None,
    show_progress: bool = True,
    include_meta: bool = False,
):
    """Set up the Chunking task and build the concrete chunker task it delegates to.

    :param chunker: Either a chunker instance (chonkie.BaseChunker) or the interval (int) used for NaiveChunker.
    :param task_id: Task ID.
    :param show_progress: Whether a progress bar is shown for processed documents.
    :param include_meta: Whether meta information generated by the task is included.
    """
    base_settings = {
        "task_id": task_id,
        "show_progress": show_progress,
        "include_meta": include_meta,
    }
    super().__init__(**base_settings)

    # The raw argument has to be stored first: _init_chunker_task() reads it to choose the implementation.
    self._chunker_arg = chunker
    wrapped_task = self._init_chunker_task()
    self._task = wrapped_task

deserialize(config, **kwargs) classmethod

Generate a Chunking instance from config.

Parameters:

Name Type Description Default
config Config

Config to generate instance from.

required
kwargs dict[str, Any]

Values to inject into loaded config.

{}

Returns:

Type Description
Chunking

Deserialized Chunking instance.

Source code in sieves/tasks/preprocessing/chunking/core.py
 95
 96
 97
 98
 99
100
101
102
103
@classmethod
def deserialize(cls, config: Config, **kwargs: dict[str, Any]) -> Chunking:
    """Build a Chunking instance from a config.

    :param config: Config to generate instance from.
    :param kwargs: Values to inject into loaded config.
    :return: Deserialized Chunking instance.
    """
    # Resolve the constructor arguments first, then instantiate with them.
    init_arguments = config.to_init_dict(cls, **kwargs)
    return cls(**init_arguments)

serialize()

Serializes task.

Returns:

Type Description
Config

Config instance.

Source code in sieves/tasks/core.py
50
51
52
53
54
def serialize(self) -> Config:
    """Serialize this task into a config.

    Every entry of ``_state`` is wrapped in an ``Attribute`` before the config is created.

    :return: Config instance.
    """
    wrapped_state = {}
    for name, value in self._state.items():
        wrapped_state[name] = Attribute(value=value)
    return Config.create(self.__class__, wrapped_state)