Skip to content

Chunking

Allows chunking of documents into segments.

Chonkie

Bases: Task

Chunker wrapping the chonkie library.

Source code in sieves/tasks/preprocessing/chunkers.py
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
class Chonkie(Task):
    """Chunker wrapping the chonkie library.

    Delegates the actual text segmentation to a pre-configured chonkie chunker
    and writes the resulting chunk texts back onto each document.
    """

    def __init__(
        self,
        chunker: chonkie.BaseChunker,
        task_id: str | None = None,
        show_progress: bool = True,
        include_meta: bool = False,
    ):
        """Initialize chunker.
        :param chunker: Configured chonkie chunker instance that performs the text splitting.
        :param task_id: Task ID.
        :param show_progress: Whether to show progress bar for processed documents.
        :param include_meta: Whether to include meta information generated by the task.
        """
        super().__init__(task_id=task_id, show_progress=show_progress, include_meta=include_meta)
        self._chunker = chunker

    def __call__(self, docs: Iterable[Doc]) -> Iterable[Doc]:
        """Split documents into chunks.
        :param docs: Documents to split. Each doc must have a non-None .text.
        :return: Split documents, with .chunks set to the chunk texts.
        """
        docs = list(docs)

        pbar: tqdm | None = tqdm(total=len(docs)) if self._show_progress else None
        try:
            chunks = self._chunker.chunk_batch([doc.text for doc in docs])
            assert len(chunks) == len(docs)

            for doc, doc_chunks in zip(docs, chunks):
                if self._include_meta:
                    # Store the raw chunk objects under this task's ID.
                    # Fixed: original wrote {self.id: {doc_chunks}}, wrapping the
                    # chunk list in a set literal — a TypeError, as lists are unhashable.
                    doc.meta |= {self.id: doc_chunks}
                doc.chunks = [chunk.text for chunk in doc_chunks]
                if pbar:
                    pbar.update(1)

            return docs
        finally:
            # Always close the progress bar, even if chunking raised.
            if pbar:
                pbar.close()

    @property
    def _state(self) -> dict[str, Any]:
        # Serializable state: parent state plus the wrapped chunker instance.
        return {
            **super()._state,
            "chunker": self._chunker,
        }

id property

Returns task ID. Used by pipeline for results and dependency management.

Returns:

Type Description
str

Task ID.

__call__(docs)

Split documents into chunks.

Parameters:

Name Type Description Default
docs Iterable[Doc]

Documents to split.

required

Returns:

Type Description
Iterable[Doc]

Split documents.

Source code in sieves/tasks/preprocessing/chunkers.py
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
def __call__(self, docs: Iterable[Doc]) -> Iterable[Doc]:
    """Split documents into chunks.
    :param docs: Documents to split. Each doc must have a non-None .text.
    :return: Split documents, with .chunks set to the chunk texts.
    """
    docs = list(docs)

    pbar: tqdm | None = tqdm(total=len(docs)) if self._show_progress else None
    try:
        chunks = self._chunker.chunk_batch([doc.text for doc in docs])
        assert len(chunks) == len(docs)

        for doc, doc_chunks in zip(docs, chunks):
            if self._include_meta:
                # Fixed: original wrote {self.id: {doc_chunks}}, wrapping the chunk
                # list in a set literal — a TypeError, as lists are unhashable.
                doc.meta |= {self.id: doc_chunks}
            doc.chunks = [chunk.text for chunk in doc_chunks]
            if pbar:
                pbar.update(1)

        return docs
    finally:
        # Always close the progress bar, even if chunking raised.
        if pbar:
            pbar.close()

__init__(chunker, task_id=None, show_progress=True, include_meta=False)

Initialize chunker.

Parameters:

Name Type Description Default
chunker BaseChunker

Configured chonkie chunker instance that performs the text splitting.

required
task_id str | None

Task ID.

None
show_progress bool

Whether to show progress bar for processed documents.

True
include_meta bool

Whether to include meta information generated by the task.

False
Source code in sieves/tasks/preprocessing/chunkers.py
16
17
18
19
20
21
22
23
24
25
26
27
28
29
def __init__(
    self,
    chunker: chonkie.BaseChunker,
    task_id: str | None = None,
    show_progress: bool = True,
    include_meta: bool = False,
):
    """Initialize chunker.
    :param chunker: Configured chonkie chunker instance that performs the text splitting.
    :param task_id: Task ID.
    :param show_progress: Whether to show progress bar for processed documents.
    :param include_meta: Whether to include meta information generated by the task.
    """
    super().__init__(task_id=task_id, show_progress=show_progress, include_meta=include_meta)
    self._chunker = chunker

deserialize(config, **kwargs) classmethod

Generate Task instance from config.

Parameters:

Name Type Description Default
config Config

Config to generate instance from.

required
kwargs dict[str, Any]

Values to inject into loaded config.

{}

Returns:

Type Description
Task

Deserialized Task instance.

Source code in sieves/tasks/core.py
56
57
58
59
60
61
62
63
64
@classmethod
def deserialize(cls, config: Config, **kwargs: dict[str, Any]) -> Task:
    """Generate Task instance from config.
    :param config: Config to generate instance from.
    :param kwargs: Values to inject into loaded config.
    :return: Deserialized Task instance.
    """
    # Resolve constructor arguments from the config (injected values win),
    # then build the task instance from them.
    init_args = config.to_init_dict(cls, **kwargs)
    return cls(**init_args)

serialize()

Serializes task.

Returns:

Type Description
Config

Config instance.

Source code in sieves/tasks/core.py
50
51
52
53
54
def serialize(self) -> Config:
    """Serializes task.
    :return: Config instance.
    """
    # Wrap each state entry in an Attribute before handing it to Config.
    attributes = {name: Attribute(value=val) for name, val in self._state.items()}
    return Config.create(self.__class__, attributes)

NaiveChunker

Bases: Task

Chunks by sentence counts. Only for test purposes.

Source code in sieves/tasks/preprocessing/chunkers.py
 63
 64
 65
 66
 67
 68
 69
 70
 71
 72
 73
 74
 75
 76
 77
 78
 79
 80
 81
 82
 83
 84
 85
 86
 87
 88
 89
 90
 91
 92
 93
 94
 95
 96
 97
 98
 99
100
101
102
103
104
105
106
class NaiveChunker(Task):
    """Chunks by sentence counts. Only for test purposes.

    Splits document text on terminal punctuation (., !, ?) and groups the
    resulting sentences into fixed-size chunks.
    """

    def __init__(
        self,
        interval: int,
        task_id: str | None = None,
        show_progress: bool = True,
        include_meta: bool = False,
    ):
        """Initialize chunker.
        :param interval: Number of sentences per chunk.
        :param task_id: Task ID.
        :param show_progress: Whether to show progress bar for processed documents.
        :param include_meta: Whether to include meta information generated by the task.
        """
        super().__init__(task_id=task_id, show_progress=show_progress, include_meta=include_meta)
        self._interval = interval

    def __call__(self, docs: Iterable[Doc]) -> Iterable[Doc]:
        """Split documents into chunks.
        :param docs: Documents to split. Each doc must have a non-None .text.
        :return: Split documents, with .chunks set to the chunk texts.
        """
        docs = list(docs)

        pbar: tqdm | None = tqdm(total=len(docs)) if self._show_progress else None
        try:
            for doc in docs:
                assert doc.text
                # Naive sentence split on terminal punctuation; discard empty fragments.
                sentences = [sent for sent in re.split("[?!.]", doc.text) if len(sent.strip())]
                doc.chunks = [
                    ".".join(sentences[i : i + self._interval])
                    for i in range(0, len(sentences), self._interval)
                ]
                if self._include_meta:
                    # Fixed: include_meta was accepted but silently ignored here;
                    # record chunk info under this task's ID, consistent with Chonkie.
                    doc.meta |= {self.id: doc.chunks}
                if pbar:
                    pbar.update(1)

            return docs
        finally:
            # Always close the progress bar, even on error (consistent with Chonkie).
            if pbar:
                pbar.close()

    @property
    def _state(self) -> dict[str, Any]:
        # Serializable state: parent state plus the sentence interval.
        return {
            **super()._state,
            "interval": self._interval,
        }

id property

Returns task ID. Used by pipeline for results and dependency management.

Returns:

Type Description
str

Task ID.

__call__(docs)

Split documents into chunks.

Parameters:

Name Type Description Default
docs Iterable[Doc]

Documents to split.

required

Returns:

Type Description
Iterable[Doc]

Split documents.

Source code in sieves/tasks/preprocessing/chunkers.py
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
def __call__(self, docs: Iterable[Doc]) -> Iterable[Doc]:
    """Split documents into chunks.
    :param docs: Documents to split. Each doc must have a non-None .text.
    :return: Split documents, with .chunks set to the chunk texts.
    """
    docs = list(docs)

    pbar: tqdm | None = tqdm(total=len(docs)) if self._show_progress else None
    try:
        for doc in docs:
            assert doc.text
            # Naive sentence split on terminal punctuation; discard empty fragments.
            sentences = [sent for sent in re.split("[?!.]", doc.text) if len(sent.strip())]
            doc.chunks = [
                ".".join(sentences[i : i + self._interval])
                for i in range(0, len(sentences), self._interval)
            ]
            if self._include_meta:
                # Fixed: include_meta was accepted but silently ignored here;
                # record chunk info under this task's ID, consistent with Chonkie.
                doc.meta |= {self.id: doc.chunks}
            if pbar:
                pbar.update(1)

        return docs
    finally:
        # Always close the progress bar, even on error (consistent with Chonkie).
        if pbar:
            pbar.close()

__init__(interval, task_id=None, show_progress=True, include_meta=False)

Initialize chunker.

Parameters:

Name Type Description Default
interval int

Number of sentences per chunk (splitting is sentence-based, not token-based).

required
task_id str | None

Task ID.

None
show_progress bool

Whether to show progress bar for processed documents.

True
include_meta bool

Whether to include meta information generated by the task.

False
Source code in sieves/tasks/preprocessing/chunkers.py
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
def __init__(
    self,
    interval: int,
    task_id: str | None = None,
    show_progress: bool = True,
    include_meta: bool = False,
):
    """Initialize chunker.
    :param interval: Chunk size interval. NOTE(review): despite the original "token count"
        wording, NaiveChunker.__call__ splits on sentence boundaries, so this counts
        sentences per chunk — confirm against __call__.
    :param task_id: Task ID.
    :param show_progress: Whether to show progress bar for processed documents.
    :param include_meta: Whether to include meta information generated by the task.
    """
    super().__init__(task_id=task_id, show_progress=show_progress, include_meta=include_meta)
    self._interval = interval

deserialize(config, **kwargs) classmethod

Generate Task instance from config.

Parameters:

Name Type Description Default
config Config

Config to generate instance from.

required
kwargs dict[str, Any]

Values to inject into loaded config.

{}

Returns:

Type Description
Task

Deserialized Task instance.

Source code in sieves/tasks/core.py
56
57
58
59
60
61
62
63
64
@classmethod
def deserialize(cls, config: Config, **kwargs: dict[str, Any]) -> Task:
    """Generate Task instance from config.
    :param config: Config to generate instance from.
    :param kwargs: Values to inject into loaded config.
    :return: Deserialized Task instance.
    """
    # Resolve constructor arguments from the config (injected values win),
    # then build the task instance from them.
    init_args = config.to_init_dict(cls, **kwargs)
    return cls(**init_args)

serialize()

Serializes task.

Returns:

Type Description
Config

Config instance.

Source code in sieves/tasks/core.py
50
51
52
53
54
def serialize(self) -> Config:
    """Serializes task.
    :return: Config instance.
    """
    # Wrap each state entry in an Attribute before handing it to Config.
    attributes = {name: Attribute(value=val) for name, val in self._state.items()}
    return Config.create(self.__class__, attributes)