Information Extraction

`InformationExtraction`

Bases: PredictiveTask[_TaskPromptSignature, _TaskResult, _TaskBridge]

Source code in sieves/tasks/predictive/information_extraction/core.py

class InformationExtraction(PredictiveTask[_TaskPromptSignature, _TaskResult, _TaskBridge]):
    """Predictive task extracting entities of a user-provided Pydantic type."""

    def __init__(
        self,
        entity_type: type[pydantic.BaseModel],
        engine: Engine,
        task_id: str | None = None,
        show_progress: bool = True,
        include_meta: bool = True,
        prompt_template: str | None = None,
        prompt_signature_desc: str | None = None,
        fewshot_examples: Iterable[FewshotExample] = (),
    ) -> None:
        """
        Initializes new InformationExtraction task.
        :param entity_type: Object type to extract.
        :param engine: Engine to run the task with.
        :param task_id: Task ID.
        :param show_progress: Whether to show progress bar for processed documents.
        :param include_meta: Whether to include meta information generated by the task.
        :param prompt_template: Custom prompt template. If None, task's default template is being used.
        :param prompt_signature_desc: Custom prompt signature description. If None, default will be used.
        :param fewshot_examples: Few-shot examples.
        """
        # Assigned before the parent initializer runs, as _init_bridge() reads it.
        # NOTE(review): presumably the parent initializer triggers _init_bridge() - confirm.
        self._entity_type = entity_type

        super().__init__(
            engine=engine,
            task_id=task_id,
            show_progress=show_progress,
            include_meta=include_meta,
            overwrite=False,
            prompt_template=prompt_template,
            prompt_signature_desc=prompt_signature_desc,
            fewshot_examples=fewshot_examples,
        )

        # The warning mentions self._task_id, which this class never assigns itself. It is therefore emitted only
        # after the parent initializer has run, so that the attribute is available.
        if not self._entity_type.model_config.get("frozen", False):
            warnings.warn(
                f"Entity type provided to task {self._task_id} isn't frozen, which means that entities can't "
                f"be deduplicated. Modify entity_type to be frozen=True.",
                stacklevel=2,
            )

    def _init_bridge(self, engine_type: EngineType) -> _TaskBridge:
        """Initialize bridge.
        :param engine_type: Type of engine to initialize bridge for.
        :return _TaskBridge: Engine task bridge.
        :raises KeyError: If engine type is not supported.
        """
        bridge_types: dict[EngineType, type[_TaskBridge]] = {
            EngineType.dspy: DSPyInformationExtraction,
            EngineType.instructor: InstructorInformationExtraction,
            EngineType.langchain: LangChainInformationExtraction,
            EngineType.outlines: OutlinesInformationExtraction,
            EngineType.ollama: OllamaInformationExtraction,
            EngineType.vllm: VLLMInformationExtraction,
        }

        # Only the lookup is guarded: a KeyError raised inside a bridge constructor must not be reported as an
        # unsupported engine type.
        try:
            bridge_type = bridge_types[engine_type]
        except KeyError as err:
            raise KeyError(f"Engine type {engine_type} is not supported by {self.__class__.__name__}.") from err

        return bridge_type(
            task_id=self._task_id,
            prompt_template=self._custom_prompt_template,
            prompt_signature_desc=self._custom_prompt_signature_desc,
            entity_type=self._entity_type,
        )

    @property
    def supports(self) -> set[EngineType]:
        """Returns the engine types this task supports.
        :return set[EngineType]: Supported engine types.
        """
        return {
            EngineType.dspy,
            EngineType.instructor,
            EngineType.langchain,
            EngineType.ollama,
            EngineType.outlines,
            EngineType.vllm,
        }

    @property
    def _state(self) -> dict[str, Any]:
        """Returns the parent's state extended by the entity type.
        :return dict[str, Any]: State of this task.
        """
        return {
            **super()._state,
            "entity_type": self._entity_type,
        }

    def to_hf_dataset(self, docs: Iterable[Doc], threshold: float = 0.5) -> datasets.Dataset:
        """Converts the results of this task into a Hugging Face dataset with columns "text" and "entities".
        :param docs: Documents whose results for this task are to be exported.
        :param threshold: Not used by this implementation.
        :return datasets.Dataset: Dataset with one row per document.
        :raises KeyError: If a document has no results stored under this task's ID.
        """
        # Define metadata.
        features = datasets.Features(
            {
                "text": datasets.Value("string"),
                "entities": datasets.Sequence(PydanticToHFDatasets.model_cls_to_features(self._entity_type)),
            }
        )
        info = datasets.DatasetInfo(
            # self._entity_type is a class already, so its own __name__ is the entity type's name
            # (__class__.__name__ would yield the name of its metaclass).
            description=f"Information extraction dataset for entity type {self._entity_type.__name__}. "
            f"Generated with sieves v{Config.get_version()}.",
            features=features,
        )

        # Fetch data used for generating dataset. Built eagerly so that missing results surface here and not while
        # the dataset is being generated.
        try:
            data = [
                (doc.text, [PydanticToHFDatasets.model_to_dict(res) for res in doc.results[self._task_id]])
                for doc in docs
            ]
        except KeyError as err:
            raise KeyError(f"Not all documents have results for this task with ID {self._task_id}") from err

        def generate_data() -> Iterable[dict[str, Any]]:
            """Yields results as dicts.
            :return: Results as dicts.
            """
            for text, entities in data:
                yield {"text": text, "entities": entities}

        # Create dataset.
        return datasets.Dataset.from_generator(generate_data, features=features, info=info)

    def distill(
        self,
        base_model_id: str,
        distillation_framework: DistillationFramework,
        hf_dataset: datasets.Dataset,
        init_kwargs: dict[str, Any],
        train_kwargs: dict[str, Any],
        output_path: Path | str,
        train_frac: float,
        val_frac: float,
        seed: int | None = None,
    ) -> None:
        """Distillation is not implemented for this task.
        :raises NotImplementedError: Always.
        """
        raise NotImplementedError

`id` `property`

Returns task ID. Used by pipeline for results and dependency management.

Returns:

Type	Description
`str`	Task ID.

`prompt_signature_description` `property`

Returns prompt signature description.

Returns:

Type	Description
`str \| None`	Prompt signature description.

`prompt_template` `property`

Returns prompt template.

Returns:

Type	Description
`str \| None`	Prompt template.

`__call__(docs)`

Execute the task on a set of documents.

Parameters:

Name	Type	Description	Default
`docs`	`Iterable[Doc]`	Documents to process.	required

Returns:

Type	Description
`Iterable[Doc]`	Processed documents.

Source code in sieves/tasks/predictive/core.py

def __call__(self, docs: Iterable[Doc]) -> Iterable[Doc]:
    """Run the task on the given documents and yield the docs returned by the bridge's integration step.

    :param docs: Documents to process.
    :return Iterable[Doc]: Processed documents.
    """

    # Typing caveat: in practice TaskX can be a superset of the X types of several engines, which Python's typing
    # system cannot express at the moment. E.g. TaskInferenceMode could be outlines_.InferenceMode |
    # dspy_.InferenceMode, depending on the class of the engine instance provided at runtime. TypeVars don't support
    # unions, and neither do generics on a higher level of abstraction. mypy errors caused by this are therefore
    # ignored - the involved types should be consistent regardless.

    doc_list = list(docs)
    bridge = self._bridge

    # Step 1: fetch the prompt signature expected by the bridge.
    signature = bridge.prompt_signature

    # Step 2: let the engine assemble the executable.
    executable = self._engine.build_executable(
        inference_mode=bridge.inference_mode,
        prompt_template=self.prompt_template,
        prompt_signature=signature,
        fewshot_examples=self._fewshot_examples,
    )

    # Steps 3 + 4: extract the values to render into the prompt templates and spread them over the chunks of each
    # doc. For every doc the [start, end) range of its chunks within the flat list is recorded.
    chunk_values: list[dict[str, Any]] = []
    chunk_offsets: list[tuple[int, int]] = []
    for doc, doc_values in zip(doc_list, bridge.extract(doc_list)):
        assert doc.text
        start = len(chunk_values)
        # Docs without chunks are handled as one chunk holding the entire text.
        chunk_values.extend(doc_values | {"text": chunk} for chunk in (doc.chunks or [doc.text]))
        chunk_offsets.append((start, len(chunk_values)))

    # Step 5: run the prompts, one per chunk.
    chunk_results = list(executable(tuple(chunk_values)))
    assert len(chunk_results) == len(chunk_values)

    # Step 6: merge chunk results into one result per doc.
    doc_results = list(bridge.consolidate(chunk_results, chunk_offsets))
    assert len(doc_results) == len(doc_list)

    # Step 7: write results into the docs and hand them out.
    yield from bridge.integrate(doc_results, doc_list)

`__init__(entity_type, engine, task_id=None, show_progress=True, include_meta=True, prompt_template=None, prompt_signature_desc=None, fewshot_examples=())`

Initializes new PredictiveTask.

Parameters:

Name	Type	Description	Default
`entity_type`	`type[BaseModel]`	Object type to extract.	required
`engine`	`Engine`	Engine to run the task with.	required
`task_id`	`str \| None`	Task ID.	`None`
`show_progress`	`bool`	Whether to show progress bar for processed documents.	`True`
`include_meta`	`bool`	Whether to include meta information generated by the task.	`True`
`prompt_template`	`str \| None`	Custom prompt template. If None, the task's default template is used.	`None`
`prompt_signature_desc`	`str \| None`	Custom prompt signature description. If None, default will be used.	`None`
`fewshot_examples`	`Iterable[FewshotExample]`	Few-shot examples.	`()`

Source code in sieves/tasks/predictive/information_extraction/core.py

def __init__(
    self,
    entity_type: type[pydantic.BaseModel],
    engine: Engine,
    task_id: str | None = None,
    show_progress: bool = True,
    include_meta: bool = True,
    prompt_template: str | None = None,
    prompt_signature_desc: str | None = None,
    fewshot_examples: Iterable[FewshotExample] = (),
) -> None:
    """
    Initializes new InformationExtraction task.
    :param entity_type: Object type to extract.
    :param engine: Engine to run the task with.
    :param task_id: Task ID.
    :param show_progress: Whether to show progress bar for processed documents.
    :param include_meta: Whether to include meta information generated by the task.
    :param prompt_template: Custom prompt template. If None, task's default template is being used.
    :param prompt_signature_desc: Custom prompt signature description. If None, default will be used.
    :param fewshot_examples: Few-shot examples.
    """
    # Assigned before the parent initializer runs, keeping the original ordering.
    # NOTE(review): presumably the parent initializer needs the entity type for setting up the bridge - confirm.
    self._entity_type = entity_type

    super().__init__(
        engine=engine,
        task_id=task_id,
        show_progress=show_progress,
        include_meta=include_meta,
        overwrite=False,
        prompt_template=prompt_template,
        prompt_signature_desc=prompt_signature_desc,
        fewshot_examples=fewshot_examples,
    )

    # The warning mentions self._task_id, which is not assigned in this initializer. It is therefore emitted only
    # after the parent initializer has run, so that the attribute is available.
    if not self._entity_type.model_config.get("frozen", False):
        warnings.warn(
            f"Entity type provided to task {self._task_id} isn't frozen, which means that entities can't "
            f"be deduplicated. Modify entity_type to be frozen=True.",
            stacklevel=2,
        )

`deserialize(config, **kwargs)` `classmethod`

Generate PredictiveTask instance from config.

Parameters:

Name	Type	Description	Default
`config`	`Config`	Config to generate instance from.	required
`kwargs`	`dict[str, Any]`	Values to inject into loaded config.	`{}`

Returns:

Type	Description
`PredictiveTask[TaskPromptSignature, TaskResult, TaskBridge]`	Deserialized PredictiveTask instance.

Source code in sieves/tasks/predictive/core.py

@classmethod
def deserialize(
    cls, config: Config, **kwargs: dict[str, Any]
) -> PredictiveTask[TaskPromptSignature, TaskResult, TaskBridge]:
    """Generate PredictiveTask instance from config.
    :param config: Config to generate instance from.
    :param kwargs: Values to inject into loaded config. The optional "engine" entry holds the values handed to the
        engine's own deserialize(); if it is missing, no values are injected into the engine config.
    :return PredictiveTask[TaskPromptSignature, TaskResult, TaskBridge]: Deserialized PredictiveTask instance.
    """
    # Validate engine config.
    assert hasattr(config, "engine")
    assert isinstance(config.engine.value, Config)
    engine_config = config.engine.value
    engine_cls = engine_config.config_cls
    assert issubclass(engine_cls, Serializable)
    assert issubclass(engine_cls, Engine)

    # Deserialize and inject engine. A missing "engine" entry is treated as "nothing to inject" instead of failing
    # with a bare KeyError.
    engine_kwargs = kwargs.get("engine", {})
    engine_param: dict[str, Any] = {"engine": engine_cls.deserialize(engine_config, **engine_kwargs)}
    # The deserialized engine replaces the raw "engine" entry of kwargs.
    return cls(**config.to_init_dict(cls, **(kwargs | engine_param)))

`serialize()`

Serializes task.

Returns:

Type	Description
`Config`	Config instance.

Source code in sieves/tasks/core.py

def serialize(self) -> Config:
    """Serializes task.
    :return: Config instance built from this task's class and state.
    """
    # Every entry of the task state is wrapped in an Attribute.
    attributes = {name: Attribute(value=value) for name, value in self._state.items()}
    return Config.create(self.__class__, attributes)

`InformationExtractionBridge`

Bases: Bridge[_BridgePromptSignature, _BridgeResult, EngineInferenceMode], ABC

Source code in sieves/tasks/predictive/information_extraction/bridges.py

class InformationExtractionBridge(
    Bridge[_BridgePromptSignature, _BridgeResult, EngineInferenceMode],
    abc.ABC,
):
    """Abstract bridge for information extraction. Keeps track of the entity type to extract."""

    def __init__(
        self,
        task_id: str,
        prompt_template: str | None,
        prompt_signature_desc: str | None,
        entity_type: type[pydantic.BaseModel],
    ) -> None:
        """
        Initializes InformationExtractionBridge.
        :param task_id: Task ID.
        :param prompt_template: Custom prompt template.
        :param prompt_signature_desc: Custom prompt signature description.
        :param entity_type: Type to extract.
        """
        # overwrite is fixed to False for this bridge; its exact semantics are defined by the parent Bridge class.
        super().__init__(
            task_id=task_id,
            prompt_template=prompt_template,
            prompt_signature_desc=prompt_signature_desc,
            overwrite=False,
        )
        # Stored on the instance after the parent initializer has run.
        self._entity_type = entity_type

`inference_mode` `abstractmethod` `property`

Returns inference mode.

Returns:

Type	Description
`EngineInferenceMode`	Inference mode.

`prompt_signature` `abstractmethod` `property`

Creates output signature (e.g.: Signature in DSPy, Pydantic objects in outlines, JSON schema in jsonformers). This is engine-specific.

Returns:

Type	Description
`type[TaskPromptSignature] \| TaskPromptSignature`	Output signature object. This can be an instance (e.g. a regex string) or a class (e.g. a Pydantic class).

`prompt_signature_description` `property`

Returns prompt signature description. This is used by some engines to aid the language model in generating structured output.

Returns:

Type	Description
`str \| None`	Prompt signature description. None if not used by engine.

`prompt_template` `property`

Returns prompt template. Note: different engines have different expectations as to what a prompt should look like. E.g. outlines supports the Jinja 2 templating format for inserting values and few-shot examples, whereas DSPy integrates these at a different stage of the workflow and hence expects the prompt not to include them. Mind engine-specific expectations when creating a prompt template.

Returns:

Type	Description
`str \| None`	Prompt template as string. None if not used by engine.

`__init__(task_id, prompt_template, prompt_signature_desc, entity_type)`

Initializes InformationExtractionBridge.

Parameters:

Name	Type	Description	Default
`task_id`	`str`	Task ID.	required
`prompt_template`	`str \| None`	Custom prompt template.	required
`prompt_signature_desc`	`str \| None`	Custom prompt signature description.	required
`entity_type`	`type[BaseModel]`	Type to extract.	required

Source code in sieves/tasks/predictive/information_extraction/bridges.py

def __init__(
    self,
    task_id: str,
    prompt_template: str | None,
    prompt_signature_desc: str | None,
    entity_type: type[pydantic.BaseModel],
) -> None:
    """
    Initializes InformationExtractionBridge.
    :param task_id: Task ID.
    :param prompt_template: Custom prompt template.
    :param prompt_signature_desc: Custom prompt signature description.
    :param entity_type: Type to extract.
    """
    # overwrite is fixed to False for this bridge; its exact semantics are defined by the parent class.
    super().__init__(
        task_id=task_id,
        prompt_template=prompt_template,
        prompt_signature_desc=prompt_signature_desc,
        overwrite=False,
    )
    # Stored on the instance after the parent initializer has run.
    self._entity_type = entity_type

`consolidate(results, docs_offsets)` `abstractmethod`

Consolidates results for document chunks into document results.

Parameters:

Name	Type	Description	Default
`results`	`Iterable[TaskResult]`	Results per document chunk.	required
`docs_offsets`	`list[tuple[int, int]]`	Chunk offsets per document. Chunks per document can be obtained with results[docs_offsets[i][0]:docs_offsets[i][1]].	required

Returns:

Type	Description
`Iterable[TaskResult]`	Results per document.

Source code in sieves/tasks/predictive/bridges.py

@abc.abstractmethod
def consolidate(self, results: Iterable[TaskResult], docs_offsets: list[tuple[int, int]]) -> Iterable[TaskResult]:
    """Consolidates results for document chunks into document results.
    :param results: Results per document chunk.
    :param docs_offsets: Chunk offsets per document, as (start, end) pairs with exclusive end. The chunk results of
        document i can be obtained with results[docs_offsets[i][0]:docs_offsets[i][1]].
    :return Iterable[TaskResult]: Results per document.
    """

`extract(docs)`

Extract all values from doc instances that are to be injected into the prompts.

Parameters:

Name	Type	Description	Default
`docs`	`Iterable[Doc]`	Docs to extract values from.	required

Returns:

Type	Description
`Iterable[dict[str, Any]]`	All values from doc instances that are to be injected into the prompts

Source code in sieves/tasks/predictive/bridges.py

def extract(self, docs: Iterable[Doc]) -> Iterable[dict[str, Any]]:
    """Collect, per doc, the values that are to be injected into the prompts.
    :param docs: Docs to extract values from.
    :return Iterable[dict[str, Any]]: One dict per doc holding the doc's text under "text".
    """
    # A falsy text (e.g. empty string or None) is passed on as None.
    return ({"text": doc.text or None} for doc in docs)

`integrate(results, docs)` `abstractmethod`

Integrate results into Doc instances.

Parameters:

Name	Type	Description	Default
`results`	`Iterable[TaskResult]`	Results from prompt executable.	required
`docs`	`Iterable[Doc]`	Doc instances to update.	required

Returns:

Type	Description
`Iterable[Doc]`	Updated doc instances.

Source code in sieves/tasks/predictive/bridges.py

@abc.abstractmethod
def integrate(self, results: Iterable[TaskResult], docs: Iterable[Doc]) -> Iterable[Doc]:
    """Integrate results into Doc instances.
    :param results: Results to integrate. PredictiveTask.__call__ passes the consolidated results here, one per
        doc.
    :param docs: Doc instances to update.
    :return Iterable[Doc]: Updated doc instances.
    """

Information Extraction

InformationExtraction

id property

prompt_signature_description property

prompt_template property

__call__(docs)

__init__(entity_type, engine, task_id=None, show_progress=True, include_meta=True, prompt_template=None, prompt_signature_desc=None, fewshot_examples=())

deserialize(config, **kwargs) classmethod

serialize()

InformationExtractionBridge

inference_mode abstractmethod property

prompt_signature abstractmethod property

prompt_signature_description property

prompt_template property

__init__(task_id, prompt_template, prompt_signature_desc, entity_type)

consolidate(results, docs_offsets) abstractmethod

extract(docs)

integrate(results, docs) abstractmethod

`InformationExtraction`

`id` `property`

`prompt_signature_description` `property`

`prompt_template` `property`

`__call__(docs)`

`__init__(entity_type, engine, task_id=None, show_progress=True, include_meta=True, prompt_template=None, prompt_signature_desc=None, fewshot_examples=())`

`deserialize(config, **kwargs)` `classmethod`

`serialize()`

`InformationExtractionBridge`

`inference_mode` `abstractmethod` `property`

`prompt_signature` `abstractmethod` `property`

`prompt_signature_description` `property`

`prompt_template` `property`

`__init__(task_id, prompt_template, prompt_signature_desc, entity_type)`

`consolidate(results, docs_offsets)` `abstractmethod`

`extract(docs)`

`integrate(results, docs)` `abstractmethod`