Classification

Bases: PredictiveTask[_TaskPromptSignature, _TaskResult, _TaskBridge]

Source code in sieves/tasks/predictive/classification/core.py, lines 49-178:
class Classification(PredictiveTask[_TaskPromptSignature, _TaskResult, _TaskBridge]):
    def __init__(
        self,
        labels: list[str],
        engine: Engine[EnginePromptSignature, EngineResult, EngineModel, EngineInferenceMode],
        task_id: str | None = None,
        show_progress: bool = True,
        include_meta: bool = True,
        prompt_template: str | None = None,
        prompt_signature_desc: str | None = None,
        fewshot_examples: Iterable[FewshotExample] = (),
    ) -> None:
        """
        Initializes new classification task.
        :param labels: Labels to predict.
        :param engine: Engine to use for inference.
        :param task_id: Task ID.
        :param show_progress: Whether to show progress bar for processed documents.
        :param include_meta: Whether to include meta information generated by the task.
        :param prompt_template: Custom prompt template. If None, the task's default template is used.
        :param prompt_signature_desc: Custom prompt signature description. If None, the default is used.
        :param fewshot_examples: Few-shot examples.
        """
        self._labels = labels
        super().__init__(
            engine=engine,
            task_id=task_id,
            show_progress=show_progress,
            include_meta=include_meta,
            overwrite=False,
            prompt_template=prompt_template,
            prompt_signature_desc=prompt_signature_desc,
            fewshot_examples=fewshot_examples,
        )
        self._fewshot_examples: Iterable[FewshotExample]

    def _init_bridge(self, engine_type: EngineType) -> _TaskBridge:
        """Initialize bridge.
        :return: Engine task.
        :raises ValueError: If engine type is not supported.
        """
        if engine_type == EngineType.glix:
            # GliXBridge needs different arguments than other bridges, hence we instantiate it differently.
            return GliXBridge(
                task_id=self._task_id,
                prompt_template=self._custom_prompt_template,
                prompt_signature_desc=self._custom_prompt_signature_desc,
                prompt_signature=self._labels,
                inference_mode=glix_.InferenceMode.classification,
                label_whitelist=tuple(self._labels),
            )

        bridge_types: dict[EngineType, type[_TaskBridge]] = {
            EngineType.dspy: DSPyClassification,
            EngineType.instructor: InstructorClassification,
            EngineType.huggingface: HuggingFaceClassification,
            EngineType.outlines: OutlinesClassification,
            EngineType.ollama: OllamaClassification,
            EngineType.langchain: LangChainClassification,
        }

        try:
            bridge_type = bridge_types[engine_type]
            assert not issubclass(bridge_type, GliXBridge)

            return bridge_type(
                task_id=self._task_id,
                prompt_template=self._custom_prompt_template,
                prompt_signature_desc=self._custom_prompt_signature_desc,
                labels=self._labels,
            )
        except KeyError as err:
            raise KeyError(f"Engine type {engine_type} is not supported by {self.__class__.__name__}.") from err

    @property
    def supports(self) -> set[EngineType]:
        return {
            EngineType.dspy,
            EngineType.instructor,
            EngineType.glix,
            EngineType.huggingface,
            EngineType.langchain,
            EngineType.ollama,
            EngineType.outlines,
        }

    def _validate_fewshot_examples(self) -> None:
        for fs_example in self._fewshot_examples or []:
            if any([label not in self._labels for label in fs_example.confidence_per_label]) or not all(
                [label in fs_example.confidence_per_label for label in self._labels]
            ):
                raise ValueError(
                    f"Label mismatch: {self._task_id} has labels {self._labels}. Few-shot examples have "
                    f"labels {fs_example.confidence_per_label.keys()}."
                )

    @property
    def _state(self) -> dict[str, Any]:
        return {
            **super()._state,
            "labels": self._labels,
        }

    def to_dataset(self, docs: Iterable[Doc]) -> datasets.Dataset:
        """Converts docs with results for this task into a Hugging Face dataset.
        :param docs: Docs to convert. Each doc must have a result for this task.
        :return: Dataset with a "text" column and a multi-label "label" column holding one score per label.
        """
        # Define metadata.
        features = datasets.Features(
            {"text": datasets.Value("string"), "label": datasets.Sequence(datasets.Value("float32"))}
        )
        info = datasets.DatasetInfo(
            description=f"Multi-label classification dataset with labels {self._labels}. Generated with sieves "
            f"v{Config.get_version()}.",
            features=features,
        )

        # Fetch data used for generating dataset.
        labels = self._labels
        try:
            data = [(doc.text, doc.results[self._task_id]) for doc in docs]
        except KeyError as err:
            raise KeyError(f"Not all documents have results for this task with ID {self._task_id}") from err

        def generate_data() -> Iterable[dict[str, Any]]:
            """Yields results as dicts.
            :return: Results as dicts.
            """
            for text, result in data:
                scores = {label_score[0]: label_score[1] for label_score in result}
                yield {"text": text, "label": [scores[label] for label in labels]}

        # Create dataset.
        return datasets.Dataset.from_generator(generate_data, features=features, info=info)
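
A minimal usage sketch. The engine construction and the exact import paths are assumptions, not taken from this page; `engine` is assumed to be a pre-built sieves Engine for one of the supported backends.

# Sketch only: `engine` is assumed to exist; import paths are assumptions.
from sieves import Doc
from sieves.tasks import Classification

task = Classification(labels=["science", "politics", "sports"], engine=engine)

docs = list(task([Doc(text="The team won the championship after a dramatic final.")]))
print(docs[0].results[task.id])   # per-label scores produced by this task

dataset = task.to_dataset(docs)   # Hugging Face dataset with "text" and "label" columns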

id property

Returns task ID. Used by pipeline for results and dependency management.

Returns:

str: Task ID.

prompt_signature_description property

Returns prompt signature description.

Returns:

str | None: Prompt signature description.

prompt_template property

Returns prompt template.

Returns:

str | None: Prompt template.

__call__(docs)

Execute the task on a set of documents.

Note: the mypy ignore directives are needed because, in practice, TaskX can be a superset of the X types of multiple engines, but there is no way to model that in Python's current typing system. E.g.: TaskInferenceMode could be outlines_.InferenceMode | dspy_.InferenceMode, depending on the class of the dynamically provided engine instance. TypeVars don't support unions, however, and neither do generics at a higher level of abstraction. We hence ignore these mypy errors, as the involved types should nonetheless be consistent.

Parameters:

- docs (Iterable[Doc], required): Documents to process.

Returns:

Iterable[Doc]: Processed documents.

Source code in sieves/tasks/predictive/core.py, lines 101-150:
def __call__(self, docs: Iterable[Doc]) -> Iterable[Doc]:
    """Execute the task on a set of documents.

    Note: the mypy ignore directives are needed because, in practice, TaskX can be a superset of the X types of
    multiple engines, but there is no way to model that in Python's current typing system. E.g.: TaskInferenceMode
    could be outlines_.InferenceMode | dspy_.InferenceMode, depending on the class of the dynamically provided
    engine instance. TypeVars don't support unions, however, and neither do generics at a higher level of
    abstraction. We hence ignore these mypy errors, as the involved types should nonetheless be consistent.

    :param docs: Documents to process.
    :return Iterable[Doc]: Processed documents.
    """
    docs = list(docs)

    # 1. Compile expected prompt signatures.
    signature = self._bridge.prompt_signature

    # 2. Build executable.
    assert isinstance(self._bridge.inference_mode, enum.Enum)
    executable = self._engine.build_executable(
        inference_mode=self._bridge.inference_mode,
        prompt_template=self.prompt_template,
        prompt_signature=signature,
        fewshot_examples=self._fewshot_examples,
    )

    # 3. Extract values from docs to inject/render those into prompt templates.
    docs_values = self._bridge.extract(docs)

    # 4. Map extracted docs values onto chunks.
    docs_chunks_offsets: list[tuple[int, int]] = []
    docs_chunks_values: list[dict[str, Any]] = []
    for doc, doc_values in zip(docs, docs_values):
        assert doc.text
        doc_chunks_values = [doc_values | {"text": chunk} for chunk in (doc.chunks or [doc.text])]
        docs_chunks_offsets.append((len(docs_chunks_values), len(docs_chunks_values) + len(doc_chunks_values)))
        docs_chunks_values.extend(doc_chunks_values)

    # 5. Execute prompts per chunk.
    results = list(executable(docs_chunks_values))
    assert len(results) == len(docs_chunks_values)

    # 6. Consolidate chunk results.
    results = list(self._bridge.consolidate(results, docs_chunks_offsets))
    assert len(results) == len(docs)

    # 7. Integrate results into docs.
    docs = self._bridge.integrate(results, docs)

    return docs
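
To make steps 4 and 6 concrete, here is a small standalone sketch (not library code) of how the chunk offsets are built and later used to regroup chunk-level results per document:

# Standalone illustration of the offset bookkeeping used in steps 4 and 6.
docs_chunks = [["doc A, chunk 1", "doc A, chunk 2"], ["doc B, chunk 1"]]

offsets: list[tuple[int, int]] = []
flat_chunks: list[str] = []
for chunks in docs_chunks:
    offsets.append((len(flat_chunks), len(flat_chunks) + len(chunks)))
    flat_chunks.extend(chunks)

# offsets == [(0, 2), (2, 3)]: results[start:end] yields the chunk results belonging
# to each document, which consolidate() then merges into one result per document.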

__init__(labels, engine, task_id=None, show_progress=True, include_meta=True, prompt_template=None, prompt_signature_desc=None, fewshot_examples=())

Initializes new classification task.

Parameters:

- labels (list[str], required): Labels to predict.
- engine (Engine[EnginePromptSignature, EngineResult, EngineModel, EngineInferenceMode], required): Engine to use for inference.
- task_id (str | None, default None): Task ID.
- show_progress (bool, default True): Whether to show progress bar for processed documents.
- include_meta (bool, default True): Whether to include meta information generated by the task.
- prompt_template (str | None, default None): Custom prompt template. If None, the task's default template is used.
- prompt_signature_desc (str | None, default None): Custom prompt signature description. If None, the default is used.
- fewshot_examples (Iterable[FewshotExample], default ()): Few-shot examples.
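
A configuration sketch using some of the optional parameters; `engine` is assumed to be a pre-built Engine instance constructed elsewhere.

# Sketch only: `engine` is assumed to exist.
task = Classification(
    labels=["positive", "negative", "neutral"],
    engine=engine,
    task_id="sentiment",
    show_progress=False,
    prompt_signature_desc="Assign one confidence score per sentiment label.",
)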
Source code in sieves/tasks/predictive/classification/core.py, lines 50-82:
def __init__(
    self,
    labels: list[str],
    engine: Engine[EnginePromptSignature, EngineResult, EngineModel, EngineInferenceMode],
    task_id: str | None = None,
    show_progress: bool = True,
    include_meta: bool = True,
    prompt_template: str | None = None,
    prompt_signature_desc: str | None = None,
    fewshot_examples: Iterable[FewshotExample] = (),
) -> None:
    """
    Initializes new classification task.
    :param labels: Labels to predict.
    :param engine: Engine to use for inference.
    :param task_id: Task ID.
    :param show_progress: Whether to show progress bar for processed documents.
    :param include_meta: Whether to include meta information generated by the task.
    :param prompt_template: Custom prompt template. If None, the task's default template is used.
    :param prompt_signature_desc: Custom prompt signature description. If None, the default is used.
    :param fewshot_examples: Few-shot examples.
    """
    self._labels = labels
    super().__init__(
        engine=engine,
        task_id=task_id,
        show_progress=show_progress,
        include_meta=include_meta,
        overwrite=False,
        prompt_template=prompt_template,
        prompt_signature_desc=prompt_signature_desc,
        fewshot_examples=fewshot_examples,
    )
    self._fewshot_examples: Iterable[FewshotExample]

_init_bridge(engine_type)

Initialize bridge.

Returns:

_TaskBridge: Bridge instance for the given engine type.

Raises:

KeyError: If engine type is not supported.

Source code in sieves/tasks/predictive/classification/core.py, lines 84-120:
def _init_bridge(self, engine_type: EngineType) -> _TaskBridge:
    """Initialize bridge.
    :return: Engine task.
    :raises ValueError: If engine type is not supported.
    """
    if engine_type == EngineType.glix:
        # GliXBridge needs different arguments than other bridges, hence we instantiate it differently.
        return GliXBridge(
            task_id=self._task_id,
            prompt_template=self._custom_prompt_template,
            prompt_signature_desc=self._custom_prompt_signature_desc,
            prompt_signature=self._labels,
            inference_mode=glix_.InferenceMode.classification,
            label_whitelist=tuple(self._labels),
        )

    bridge_types: dict[EngineType, type[_TaskBridge]] = {
        EngineType.dspy: DSPyClassification,
        EngineType.instructor: InstructorClassification,
        EngineType.huggingface: HuggingFaceClassification,
        EngineType.outlines: OutlinesClassification,
        EngineType.ollama: OllamaClassification,
        EngineType.langchain: LangChainClassification,
    }

    try:
        bridge_type = bridge_types[engine_type]
        assert not issubclass(bridge_type, GliXBridge)

        return bridge_type(
            task_id=self._task_id,
            prompt_template=self._custom_prompt_template,
            prompt_signature_desc=self._custom_prompt_signature_desc,
            labels=self._labels,
        )
    except KeyError as err:
        raise KeyError(f"Engine type {engine_type} is not supported by {self.__class__.__name__}.") from err

deserialize(config, **kwargs) classmethod

Generate PredictiveTask instance from config.

Parameters:

- config (Config, required): Config to generate instance from.
- kwargs (dict[str, Any], default {}): Values to inject into loaded config.

Returns:

PredictiveTask[TaskPromptSignature, TaskResult, TaskBridge]: Deserialized PredictiveTask instance.

Source code in sieves/tasks/predictive/core.py, lines 162-181:
@classmethod
def deserialize(
    cls, config: Config, **kwargs: dict[str, Any]
) -> PredictiveTask[TaskPromptSignature, TaskResult, TaskBridge]:
    """Generate PredictiveTask instance from config.
    :param config: Config to generate instance from.
    :param kwargs: Values to inject into loaded config.
    :return PredictiveTask[_TaskPromptSignature, _TaskResult, _TaskBridge]: Deserialized PredictiveTask instance.
    """
    # Validate engine config.
    assert hasattr(config, "engine")
    assert isinstance(config.engine.value, Config)
    engine_config = config.engine.value
    engine_cls = engine_config.config_cls
    assert issubclass(engine_cls, Serializable)
    assert issubclass(engine_cls, Engine)

    # Deserialize and inject engine.
    engine_param: dict[str, Any] = {"engine": engine_cls.deserialize(engine_config, **kwargs["engine"])}
    return cls(**config.to_init_dict(cls, **(kwargs | engine_param)))

serialize()

Serializes task.

Returns:

Config: Config instance.

Source code in sieves/tasks/core.py, lines 50-54:
def serialize(self) -> Config:
    """Serializes task.
    :return: Config instance.
    """
    return Config.create(self.__class__, {k: Attribute(value=v) for k, v in self._state.items()})
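
A round-trip sketch combining serialize() and deserialize(). The values injected via kwargs depend on what the concrete engine cannot serialize; the {"model": model} payload is an assumption, not part of the documented API.

# Sketch only: persist a task's config and restore it later.
config = task.serialize()

# Non-serializable engine values are re-injected via the "engine" kwarg;
# the {"model": model} key is an assumption about the concrete engine.
restored = Classification.deserialize(config, engine={"model": model})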

ClassificationBridge

Bases: Bridge[_BridgePromptSignature, _BridgeResult, EngineInferenceMode], ABC

Source code in sieves/tasks/predictive/classification/bridges.py, lines 18-33:
class ClassificationBridge(Bridge[_BridgePromptSignature, _BridgeResult, EngineInferenceMode], abc.ABC):
    def __init__(self, task_id: str, prompt_template: str | None, prompt_signature_desc: str | None, labels: list[str]):
        """
        Initializes ClassificationBridge.
        :param task_id: Task ID.
        :param prompt_template: Custom prompt template.
        :param prompt_signature_desc: Custom prompt signature description.
        :param labels: Labels to classify.
        """
        super().__init__(
            task_id=task_id,
            prompt_template=prompt_template,
            prompt_signature_desc=prompt_signature_desc,
            overwrite=False,
        )
        self._labels = labels

_prompt_signature_description abstractmethod property

Returns default prompt signature description.

Returns:

str | None: Default prompt signature description.

_prompt_template abstractmethod property

Returns default prompt template.

Returns:

str | None: Default prompt template.

inference_mode abstractmethod property

Returns inference mode.

Returns:

EngineInferenceMode: Inference mode.

prompt_signature abstractmethod property

Creates output signature (e.g.: Signature in DSPy, Pydantic objects in outlines, JSON schema in jsonformers). This is engine-specific.

Returns:

type[TaskPromptSignature] | TaskPromptSignature: Output signature object. This can be an instance (e.g. a regex string) or a class (e.g. a Pydantic class).

prompt_signature_description property

Returns prompt signature description. This is used by some engines to aid the language model in generating structured output.

Returns:

str | None: Prompt signature description. None if not used by engine.

prompt_template property

Returns prompt template. Note: different engines have different expectations as to how a prompt should look. E.g. outlines supports the Jinja2 templating format for inserting values and few-shot examples, whereas DSPy handles these elsewhere in its workflow and hence expects the prompt not to include them. Mind engine-specific expectations when creating a prompt template.

Returns:

str | None: Prompt template as string. None if not used by engine.
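
As an illustration of these engine-specific expectations, a hypothetical Jinja2-style template for an engine that renders values itself. The {{ text }} variable mirrors the value produced by extract(); the label listing is purely illustrative and not a bridge's actual default.

# Hypothetical custom template for a Jinja2-rendering engine.
prompt_template = """
Classify the following text into one of these labels: positive, negative, neutral.

Text: {{ text }}
"""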

__init__(task_id, prompt_template, prompt_signature_desc, labels)

Initializes ClassificationBridge.

Parameters:

- task_id (str, required): Task ID.
- prompt_template (str | None, required): Custom prompt template.
- prompt_signature_desc (str | None, required): Custom prompt signature description.
- labels (list[str], required): Labels to classify.
Source code in sieves/tasks/predictive/classification/bridges.py, lines 19-33:
def __init__(self, task_id: str, prompt_template: str | None, prompt_signature_desc: str | None, labels: list[str]):
    """
    Initializes ClassificationBridge.
    :param task_id: Task ID.
    :param prompt_template: Custom prompt template.
    :param prompt_signature_desc: Custom prompt signature description.
    :param labels: Labels to classify.
    """
    super().__init__(
        task_id=task_id,
        prompt_template=prompt_template,
        prompt_signature_desc=prompt_signature_desc,
        overwrite=False,
    )
    self._labels = labels

consolidate(results, docs_offsets) abstractmethod

Consolidates results for document chunks into document results.

Parameters:

- results (Iterable[TaskResult], required): Results per document chunk.
- docs_offsets (list[tuple[int, int]], required): Chunk offsets per document. Chunks per document can be obtained with results[docs_chunk_offsets[i][0]:docs_chunk_offsets[i][1]].

Returns:

Iterable[TaskResult]: Results per document.
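
A hedged sketch of what a concrete consolidate() might do for classification: average per-label scores over a document's chunks. The [(label, score), ...] result shape matches what to_dataset() consumes above; the averaging itself is an assumption, and the actual bridges may aggregate differently.

# Illustration only: average chunk-level label scores per document.
from collections import defaultdict

def consolidate(results, docs_offsets):
    results = list(results)
    for start, end in docs_offsets:
        chunk_results = results[start:end]
        sums: dict[str, float] = defaultdict(float)
        for chunk_scores in chunk_results:          # e.g. [("science", 0.9), ("sports", 0.1)]
            for label, score in chunk_scores:
                sums[label] += score
        yield [(label, total / len(chunk_results)) for label, total in sums.items()]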

Source code in sieves/tasks/predictive/bridges.py, lines 95-102:
@abc.abstractmethod
def consolidate(self, results: Iterable[TaskResult], docs_offsets: list[tuple[int, int]]) -> Iterable[TaskResult]:
    """Consolidates results for document chunks into document results.
    :param results: Results per document chunk.
    :param docs_offsets: Chunk offsets per document. Chunks per document can be obtained with
        results[docs_chunk_offsets[i][0]:docs_chunk_offsets[i][1]].
    :return Iterable[_TaskResult]: Results per document.
    """

extract(docs)

Extract all values from doc instances that are to be injected into the prompts.

Parameters:

- docs (Iterable[Doc], required): Docs to extract values from.

Returns:

Iterable[dict[str, Any]]: All values from doc instances that are to be injected into the prompts.

Source code in sieves/tasks/predictive/bridges.py, lines 80-85:
def extract(self, docs: Iterable[Doc]) -> Iterable[dict[str, Any]]:
    """Extract all values from doc instances that are to be injected into the prompts.
    :param docs: Docs to extract values from.
    :return Iterable[dict[str, Any]]: All values from doc instances that are to be injected into the prompts
    """
    return ({"text": doc.text if doc.text else None} for doc in docs)

integrate(results, docs) abstractmethod

Integrate results into Doc instances.

Parameters:

- results (Iterable[TaskResult], required): Results from prompt executable.
- docs (Iterable[Doc], required): Doc instances to update.

Returns:

Iterable[Doc]: Updated doc instances.

Source code in sieves/tasks/predictive/bridges.py, lines 87-93:
@abc.abstractmethod
def integrate(self, results: Iterable[TaskResult], docs: Iterable[Doc]) -> Iterable[Doc]:
    """Integrate results into Doc instances.
    :param results: Results from prompt executable.
    :param docs: Doc instances to update.
    :return Iterable[Doc]: Updated doc instances.
    """