Skip to content

Bridge

Bases: Generic[TaskPromptSignature, TaskResult, EngineInferenceMode], ABC

Bridge base class.

Source code in sieves/tasks/predictive/bridges.py
 22
 23
 24
 25
 26
 27
 28
 29
 30
 31
 32
 33
 34
 35
 36
 37
 38
 39
 40
 41
 42
 43
 44
 45
 46
 47
 48
 49
 50
 51
 52
 53
 54
 55
 56
 57
 58
 59
 60
 61
 62
 63
 64
 65
 66
 67
 68
 69
 70
 71
 72
 73
 74
 75
 76
 77
 78
 79
 80
 81
 82
 83
 84
 85
 86
 87
 88
 89
 90
 91
 92
 93
 94
 95
 96
 97
 98
 99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
class Bridge(Generic[TaskPromptSignature, TaskResult, EngineInferenceMode], abc.ABC):
    """Bridge base class.

    Defines the contract for connecting a predictive task with an engine: prompt assembly, output signature,
    value extraction from docs and integration/consolidation of results.
    """

    def __init__(
        self, task_id: str, prompt_instructions: str | None, overwrite: bool, generation_settings: GenerationSettings
    ):
        """Initialize new bridge.

        :param task_id: Task ID.
        :param prompt_instructions: Custom prompt instructions. If None, default instructions are used.
        :param overwrite: Whether to overwrite text with produced text. Considered only by bridges for tasks producing
            fluent text - like translation, summarization, PII masking, etc.
        :param generation_settings: Generation settings including inference_mode.
        """
        self._task_id = task_id
        self._custom_prompt_instructions = prompt_instructions
        self._overwrite = overwrite
        self._generation_settings = generation_settings

    @property
    @abc.abstractmethod
    def _default_prompt_instructions(self) -> str:
        """Return default prompt instructions.

        Instructions are injected at the beginning of each prompt.

        :return: Default prompt instructions.
        """

    @property
    def _prompt_instructions(self) -> str:
        """Return prompt instructions.

        :return: If `_custom_prompt_instructions` is set, this is used. Otherwise, `_default_prompt_instructions` is
            used.
        """
        return self._custom_prompt_instructions or self._default_prompt_instructions

    @property
    @abc.abstractmethod
    def _prompt_example_template(self) -> str | None:
        """Return default prompt template for example injection.

        Examples are injected between instructions and conclusions.

        :return: Default prompt example template.
        """

    @property
    @abc.abstractmethod
    def _prompt_conclusion(self) -> str | None:
        """Return prompt conclusion.

        Prompt conclusions are injected at the end of each prompt.

        :return: Default prompt conclusion.
        """

    @property
    def prompt_template(self) -> str:
        """Return prompt template.

        Chains `_prompt_instructions`, `_prompt_example_template` and `_prompt_conclusion`.

        Note: different engines have different expectations as to how a prompt should look. E.g. outlines supports the
        Jinja 2 templating format for insertion of values and few-shot examples, whereas DSPy integrates these things in
        a different value in the workflow and hence expects the prompt not to include these things. Mind engine-specific
        expectations when creating a prompt template.
        :return str: Prompt template as string.
        """
        # `_prompt_instructions` already falls back to the default instructions when no custom ones are set,
        # so no additional `or` fallback is needed here.
        return f"""
        {self._prompt_instructions}
        {self._prompt_example_template or ""}
        {self._prompt_conclusion or ""}
        """

    @property
    @abc.abstractmethod
    def prompt_signature(self) -> type[TaskPromptSignature] | TaskPromptSignature:
        """Create output signature.

        E.g.: `Signature` in DSPy, Pydantic objects in outlines, JSON schema in jsonformers.
        This is engine-specific.

        :return type[TaskPromptSignature] | TaskPromptSignature: Output signature object. This can be an instance
            (e.g. a regex string) or a class (e.g. a Pydantic class).
        """

    @property
    @abc.abstractmethod
    def inference_mode(self) -> EngineInferenceMode:
        """Return inference mode.

        :return EngineInferenceMode: Inference mode.
        """

    def extract(self, docs: Iterable[Doc]) -> Iterable[dict[str, Any]]:
        """Extract all values from doc instances that are to be injected into the prompts.

        Empty doc texts are normalized to None.

        :param docs: Docs to extract values from.
        :return Iterable[dict[str, Any]]: All values from doc instances that are to be injected into the prompts.
        """
        return ({"text": doc.text or None} for doc in docs)

    @abc.abstractmethod
    def integrate(self, results: Iterable[TaskResult], docs: Iterable[Doc]) -> Iterable[Doc]:
        """Integrate results into Doc instances.

        :param results: Results from prompt executable.
        :param docs: Doc instances to update.
        :return Iterable[Doc]: Updated doc instances.
        """

    @abc.abstractmethod
    def consolidate(self, results: Iterable[TaskResult], docs_offsets: list[tuple[int, int]]) -> Iterable[TaskResult]:
        """Consolidate results for document chunks into document results.

        :param results: Results per document chunk.
        :param docs_offsets: Chunk offsets per document. Chunks per document can be obtained with
            `results[docs_offsets[i][0]:docs_offsets[i][1]]`.
        :return Iterable[TaskResult]: Results per document.
        """

inference_mode abstractmethod property

Return inference mode.

Returns:

Type Description
EngineInferenceMode

Inference mode.

prompt_signature abstractmethod property

Create output signature.

E.g.: Signature in DSPy, Pydantic objects in outlines, JSON schema in jsonformers. This is engine-specific.

Returns:

Type Description
type[TaskPromptSignature] | TaskPromptSignature

Output signature object. This can be an instance (e.g. a regex string) or a class (e.g. a Pydantic class).

prompt_template property

Return prompt template.

Chains _prompt_instructions, _prompt_example_template and _prompt_conclusion.

Note: different engines have different expectations as to how a prompt should look. E.g. outlines supports the Jinja 2 templating format for insertion of values and few-shot examples, whereas DSPy integrates these things in a different value in the workflow and hence expects the prompt not to include these things. Mind engine-specific expectations when creating a prompt template.

Returns:

Type Description
str

Prompt template as string (engines that do not use prompt templates ignore it).

__init__(task_id, prompt_instructions, overwrite, generation_settings)

Initialize new bridge.

Parameters:

Name Type Description Default
task_id str

Task ID.

required
prompt_instructions str | None

Custom prompt instructions. If None, default instructions are used.

required
overwrite bool

Whether to overwrite text with produced text. Considered only by bridges for tasks producing fluent text - like translation, summarization, PII masking, etc.

required
generation_settings GenerationSettings

Generation settings including inference_mode.

required
Source code in sieves/tasks/predictive/bridges.py
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
def __init__(
    self, task_id: str, prompt_instructions: str | None, overwrite: bool, generation_settings: GenerationSettings
):
    """Set up the bridge's shared state.

    :param task_id: Task ID.
    :param prompt_instructions: Custom prompt instructions. If None, default instructions are used.
    :param overwrite: Whether to overwrite text with produced text. Considered only by bridges for tasks producing
        fluent text - like translation, summarization, PII masking, etc.
    :param generation_settings: Generation settings including inference_mode.
    """
    # Plain attribute storage - the assignments are independent of one another.
    self._generation_settings = generation_settings
    self._overwrite = overwrite
    self._custom_prompt_instructions = prompt_instructions
    self._task_id = task_id

consolidate(results, docs_offsets) abstractmethod

Consolidate results for document chunks into document results.

Parameters:

Name Type Description Default
results Iterable[TaskResult]

Results per document chunk.

required
docs_offsets list[tuple[int, int]]

Chunk offsets per document. Chunks per document can be obtained with results[docs_offsets[i][0]:docs_offsets[i][1]].

required

Returns:

Type Description
Iterable[TaskResult]

Results per document.

Source code in sieves/tasks/predictive/bridges.py
135
136
137
138
139
140
141
142
143
@abc.abstractmethod
def consolidate(self, results: Iterable[TaskResult], docs_offsets: list[tuple[int, int]]) -> Iterable[TaskResult]:
    """Merge chunk-level results back into one result per document.

    The chunks belonging to document `i` can be retrieved via
    `results[docs_offsets[i][0]:docs_offsets[i][1]]`.

    :param results: Results per document chunk.
    :param docs_offsets: Chunk offsets per document.
    :return Iterable[TaskResult]: Results per document.
    """

extract(docs)

Extract all values from doc instances that are to be injected into the prompts.

Parameters:

Name Type Description Default
docs Iterable[Doc]

Docs to extract values from.

required

Returns:

Type Description
Iterable[dict[str, Any]]

All values from doc instances that are to be injected into the prompts

Source code in sieves/tasks/predictive/bridges.py
118
119
120
121
122
123
124
def extract(self, docs: Iterable[Doc]) -> Iterable[dict[str, Any]]:
    """Pull the prompt-relevant values out of each doc.

    :param docs: Docs to extract values from.
    :return Iterable[dict[str, Any]]: One dict per doc with the values to inject into the prompts.
    """
    for doc in docs:
        # Empty strings are normalized to None, mirroring missing text.
        yield {"text": doc.text or None}

integrate(results, docs) abstractmethod

Integrate results into Doc instances.

Parameters:

Name Type Description Default
results Iterable[TaskResult]

Results from prompt executable.

required
docs Iterable[Doc]

Doc instances to update.

required

Returns:

Type Description
Iterable[Doc]

Updated doc instances.

Source code in sieves/tasks/predictive/bridges.py
126
127
128
129
130
131
132
133
@abc.abstractmethod
def integrate(self, results: Iterable[TaskResult], docs: Iterable[Doc]) -> Iterable[Doc]:
    """Write task results back onto the given doc instances.

    :param results: Results from prompt executable.
    :param docs: Doc instances to update.
    :return Iterable[Doc]: Updated doc instances.
    """

Bases: Bridge[Schema, Result, InferenceMode]

Bridge for GLiNER2 models.

Source code in sieves/tasks/predictive/bridges.py
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
class GliNERBridge(Bridge[gliner2.inference.engine.Schema, gliner_.Result, gliner_.InferenceMode]):
    """Bridge for GLiNER2 models."""

    def __init__(
        self,
        task_id: str,
        prompt_instructions: str | None,
        prompt_signature: gliner2.inference.engine.Schema | gliner2.inference.engine.StructureBuilder,
        generation_settings: GenerationSettings,
        inference_mode: gliner_.InferenceMode,
    ):
        """Initialize GLiNER2 bridge.

        Important: currently only GLiNER2 schemas/structures with one key each are supported. We do NOT support
        composite requests like `create_schema().entities().classification(). ...`.

        :param task_id: Task ID.
        :param prompt_instructions: Custom prompt instructions. If None, default instructions are used.
        :param prompt_signature: GLiNER2 schema (list of field definitions).
        :param generation_settings: Generation settings including inference_mode.
        """
        super().__init__(
            task_id=task_id,
            prompt_instructions=prompt_instructions,
            overwrite=False,
            generation_settings=generation_settings,
        )
        self._prompt_signature = prompt_signature
        # If prompt signature is a structure, we create a Pydantic representation of it for easier downstream result
        # processing - e.g. when creating a HF dataset.
        self._prompt_signature_pydantic = (
            self.schema_to_pydantic()
            if isinstance(prompt_signature, gliner2.inference.engine.StructureBuilder)
            else None
        )

        self._inference_mode = inference_mode

    @override
    @property
    def _default_prompt_instructions(self) -> str:
        # GLiNER2 doesn't support custom instructions.
        return ""

    @override
    @property
    def _prompt_example_template(self) -> str | None:
        return None

    @override
    @property
    def _prompt_conclusion(self) -> str | None:
        return None

    @override
    @property
    def prompt_signature(self) -> gliner2.inference.engine.Schema | gliner2.inference.engine.StructureBuilder:
        return self._prompt_signature

    @property
    def prompt_signature_pydantic(self) -> type[pydantic.BaseModel] | None:
        """Return Pydantic model representation of GLiNER2 schema.

        Returns:
            Pydantic model representation of GLiNER2 schema.
        """
        return self._prompt_signature_pydantic

    @override
    @property
    def inference_mode(self) -> gliner_.InferenceMode:
        return self._generation_settings.inference_mode or self._inference_mode

    def schema_to_pydantic(self) -> type[pydantic.BaseModel]:
        """Convert a Gliner2 Schema object to Pydantic models.

        If the schema is a structure with more than one entry, a wrapper class `Schema` is created.

        Returns:
            Pydantic model representation of GLiNER2 schema.
        """
        if isinstance(self._prompt_signature, gliner2.inference.engine.StructureBuilder):
            field_metadata = self._prompt_signature.schema._field_metadata
        else:
            assert isinstance(self._prompt_signature, gliner2.inference.engine.Schema)
            field_metadata = self._prompt_signature._field_metadata

        # Group fields by class name.
        classes: dict[str, dict[str, Any]] = {}
        for key, meta in field_metadata.items():
            class_name, field_name = key.split(".")
            if class_name not in classes:
                classes[class_name] = {}
            classes[class_name][field_name] = meta

        # Create models for each class
        models: dict[str, type[pydantic.BaseModel]] = {}
        for class_name, fields in classes.items():
            field_definitions = {}
            for field_name, meta in fields.items():
                dtype = meta["dtype"]
                choices = meta["choices"]

                # Determine the field type.
                inner_field_type = Literal[*choices] if choices else str  # type: ignore[invalid-type-form]
                field_type = list[inner_field_type] if dtype == "list" else inner_field_type
                field_definitions[field_name] = (field_type, ...)

            model = pydantic.create_model(class_name, **field_definitions)
            models[class_name] = model

        # Create wrapper "Schema" model with lowercase attribute names if more than one structure is present.
        if len(models) > 1:
            raise TypeError(
                "Composite GliNER2 schemas are not supported. Use a single structure/entitity/classification per Sieves"
                " task."
            )

        return models[list(models.keys())[0]]

    @override
    def integrate(self, results: Iterable[gliner_.Result], docs: Iterable[Doc]) -> Iterable[Doc]:
        for doc, result in zip(docs, results):
            match self._inference_mode:
                case gliner_.InferenceMode.classification:
                    assert hasattr(self._prompt_signature.schema, "__getitem__")
                    is_multilabel = self._prompt_signature.schema["classifications"][0]["multi_label"]

                    if is_multilabel:
                        doc.results[self._task_id] = []
                        for res in sorted(result, key=lambda x: x["score"], reverse=True):
                            assert isinstance(res, dict)
                            doc.results[self._task_id].append((res["label"], res["score"]))

                    else:
                        doc.results[self._task_id] = (result[0]["label"], result[0]["score"])

                case gliner_.InferenceMode.entities:
                    doc.results[self._task_id] = result

                case gliner_.InferenceMode.structure:
                    assert len(result) == 1
                    entity_type_name = list(result.keys())[0]
                    assert issubclass(self._prompt_signature_pydantic, pydantic.BaseModel)
                    doc.results[self._task_id] = [
                        self._prompt_signature_pydantic.model_validate(
                            {key: value["text"] for key, value in entity.items()}
                        )
                        for entity in result[entity_type_name]
                    ]

        return docs

    @override
    def consolidate(
        self, results: Iterable[gliner_.Result], docs_offsets: list[tuple[int, int]]
    ) -> Iterable[gliner_.Result]:
        results = list(results)

        # Determine label scores for chunks per document.
        for doc_offset in docs_offsets:
            scores: dict[str, float] = defaultdict(lambda: 0)
            entities: dict[str, list[str] | dict[str, str | list[str]]] = {}

            for res in results[doc_offset[0] : doc_offset[1]]:
                match self._inference_mode:
                    case gliner_.InferenceMode.classification:
                        keys = list(res.keys())
                        assert len(keys) == 1, "Composite GliNER2 schemas are not supported."
                        extracted_res = res[keys[0]]

                        # In case of single-label: pad to list so that we can process in a unified way.
                        if isinstance(extracted_res, dict):
                            extracted_res = [extracted_res]

                        for entry in extracted_res:
                            # GliNER might use two different structures here, depending on the version.
                            if "label" in entry:
                                scores[entry["label"]] += entry["confidence"]
                            else:
                                keys = list(entry.keys())
                                assert len(keys) == 1, "Composite GliNER2 schemas are not supported."
                                for label, confidence in entry[keys[0]]:
                                    scores[label] += confidence

                    case gliner_.InferenceMode.entities:
                        for entity_type in res["entities"]:
                            if len(res["entities"][entity_type]):
                                if entity_type not in entities:
                                    entities[entity_type] = []
                                relevant_entities: list[str] = entities[entity_type]
                                relevant_entities.extend(res["entities"][entity_type])

                    case gliner_.InferenceMode.structure:
                        for entity_type in res:
                            if entity_type not in entities:
                                entities[entity_type] = []
                            relevant_entities: list[str] = entities[entity_type]
                            relevant_entities.extend(res[entity_type])

            match self._inference_mode:
                case gliner_.InferenceMode.classification:
                    # Ensure that all labels have been assigned - GLiNER2 is somtimes negligent about this.
                    assert hasattr(self._prompt_signature.schema, "__getitem__")
                    for label in self._prompt_signature.schema["classifications"][0]["labels"]:
                        if label not in scores:
                            scores[label] = 0.0

                    # Average score, sort in descending order.
                    sorted_scores: list[dict[str, str | float]] = sorted(
                        (
                            {"label": attr, "score": score / (doc_offset[1] - doc_offset[0])}
                            for attr, score in scores.items()
                        ),
                        key=lambda x: x["score"],
                        reverse=True,
                    )

                    yield sorted_scores

                case gliner_.InferenceMode.entities | gliner_.InferenceMode.structure:
                    yield entities

prompt_signature_pydantic property

Return Pydantic model representation of GLiNER2 schema.

Returns: Pydantic model representation of GLiNER2 schema.

prompt_template property

Return prompt template.

Chains _prompt_instructions, _prompt_example_template and _prompt_conclusion.

Note: different engines have different expectations as to how a prompt should look. E.g. outlines supports the Jinja 2 templating format for insertion of values and few-shot examples, whereas DSPy integrates these things in a different value in the workflow and hence expects the prompt not to include these things. Mind engine-specific expectations when creating a prompt template.

Returns:

Type Description
str

Prompt template as string (engines that do not use prompt templates ignore it).

__init__(task_id, prompt_instructions, prompt_signature, generation_settings, inference_mode)

Initialize GLiNER2 bridge.

Important: currently only GLiNER2 schemas/structures with one key each are supported. We do NOT support composite requests like create_schema().entities().classification(). ....

Parameters:

Name Type Description Default
task_id str

Task ID.

required
prompt_instructions str | None

Custom prompt instructions. If None, default instructions are used.

required
prompt_signature Schema | StructureBuilder

GLiNER2 schema (list of field definitions).

required
generation_settings GenerationSettings

Generation settings including inference_mode.

required
Source code in sieves/tasks/predictive/bridges.py
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
def __init__(
    self,
    task_id: str,
    prompt_instructions: str | None,
    prompt_signature: gliner2.inference.engine.Schema | gliner2.inference.engine.StructureBuilder,
    generation_settings: GenerationSettings,
    inference_mode: gliner_.InferenceMode,
):
    """Set up a bridge around a GLiNER2 model.

    Important: currently only GLiNER2 schemas/structures with one key each are supported. Composite requests such
    as `create_schema().entities().classification(). ...` are NOT supported.

    :param task_id: Task ID.
    :param prompt_instructions: Custom prompt instructions. If None, default instructions are used.
    :param prompt_signature: GLiNER2 schema (list of field definitions).
    :param generation_settings: Generation settings including inference_mode.
    :param inference_mode: GLiNER2 inference mode.
    """
    super().__init__(
        task_id=task_id,
        prompt_instructions=prompt_instructions,
        overwrite=False,
        generation_settings=generation_settings,
    )
    self._prompt_signature = prompt_signature
    self._inference_mode = inference_mode

    # For structures, build a Pydantic counterpart of the schema so downstream consumers (e.g. HF dataset
    # creation) can work with validated objects.
    if isinstance(prompt_signature, gliner2.inference.engine.StructureBuilder):
        self._prompt_signature_pydantic = self.schema_to_pydantic()
    else:
        self._prompt_signature_pydantic = None

extract(docs)

Extract all values from doc instances that are to be injected into the prompts.

Parameters:

Name Type Description Default
docs Iterable[Doc]

Docs to extract values from.

required

Returns:

Type Description
Iterable[dict[str, Any]]

All values from doc instances that are to be injected into the prompts

Source code in sieves/tasks/predictive/bridges.py
118
119
120
121
122
123
124
def extract(self, docs: Iterable[Doc]) -> Iterable[dict[str, Any]]:
    """Pull the prompt-relevant values out of each doc.

    :param docs: Docs to extract values from.
    :return Iterable[dict[str, Any]]: One dict per doc with the values to inject into the prompts.
    """
    for doc in docs:
        # Empty strings are normalized to None, mirroring missing text.
        yield {"text": doc.text or None}

schema_to_pydantic()

Convert a Gliner2 Schema object to Pydantic models.

Only schemas containing a single structure are supported; schemas with more than one structure raise a TypeError.

Returns: Pydantic model representation of GLiNER2 schema.

Source code in sieves/tasks/predictive/bridges.py
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
def schema_to_pydantic(self) -> type[pydantic.BaseModel]:
    """Convert a Gliner2 Schema object to Pydantic models.

    If the schema is a structure with more than one entry, a wrapper class `Schema` is created.

    Returns:
        Pydantic model representation of GLiNER2 schema.
    """
    if isinstance(self._prompt_signature, gliner2.inference.engine.StructureBuilder):
        field_metadata = self._prompt_signature.schema._field_metadata
    else:
        assert isinstance(self._prompt_signature, gliner2.inference.engine.Schema)
        field_metadata = self._prompt_signature._field_metadata

    # Group fields by class name.
    classes: dict[str, dict[str, Any]] = {}
    for key, meta in field_metadata.items():
        class_name, field_name = key.split(".")
        if class_name not in classes:
            classes[class_name] = {}
        classes[class_name][field_name] = meta

    # Create models for each class
    models: dict[str, type[pydantic.BaseModel]] = {}
    for class_name, fields in classes.items():
        field_definitions = {}
        for field_name, meta in fields.items():
            dtype = meta["dtype"]
            choices = meta["choices"]

            # Determine the field type.
            inner_field_type = Literal[*choices] if choices else str  # type: ignore[invalid-type-form]
            field_type = list[inner_field_type] if dtype == "list" else inner_field_type
            field_definitions[field_name] = (field_type, ...)

        model = pydantic.create_model(class_name, **field_definitions)
        models[class_name] = model

    # Create wrapper "Schema" model with lowercase attribute names if more than one structure is present.
    if len(models) > 1:
        raise TypeError(
            "Composite GliNER2 schemas are not supported. Use a single structure/entitity/classification per Sieves"
            " task."
        )

    return models[list(models.keys())[0]]