Named Entity Recognition

`NER`

Bases: PredictiveTask[_TaskPromptSignature, _TaskResult, _TaskBridge]

Source code in sieves/tasks/predictive/ner/core.py

class NER(PredictiveTask[_TaskPromptSignature, _TaskResult, _TaskBridge]):
    """Named entity recognition task that delegates to an engine-specific bridge."""

    def __init__(
        self,
        entities: list[str],
        engine: Engine,
        task_id: str | None = None,
        show_progress: bool = True,
        include_meta: bool = True,
        prompt_template: str | None = None,
        prompt_signature_desc: str | None = None,
        fewshot_examples: Iterable[TaskFewshotExample] = (),
    ) -> None:
        """Initializes new NER task.
        :param entities: Entity types to extract. If empty, PERSON, LOCATION and ORGANIZATION are used.
        :param engine: Engine passed on to the PredictiveTask base class.
        :param task_id: Task ID.
        :param show_progress: Whether to show progress bar for processed documents.
        :param include_meta: Whether to include meta information generated by the task.
        :param prompt_template: Custom prompt template. If None, task's default template is being used.
        :param prompt_signature_desc: Custom prompt signature description. If None, default will be used.
        :param fewshot_examples: Few-shot examples.
        """
        # Must be set before super().__init__() is called: _init_bridge() and _validate_fewshot_examples() read it.
        # NOTE(review): presumably the base constructor invokes those hooks - confirm against PredictiveTask.
        self._entities = entities or ["PERSON", "LOCATION", "ORGANIZATION"]
        super().__init__(
            engine=engine,
            task_id=task_id,
            show_progress=show_progress,
            include_meta=include_meta,
            overwrite=False,
            prompt_template=prompt_template,
            prompt_signature_desc=prompt_signature_desc,
            fewshot_examples=fewshot_examples,
        )
        # Bare annotation: narrows the attribute type for type checkers, assigns nothing.
        self._fewshot_examples: Iterable[TaskFewshotExample]

    def _init_bridge(self, engine_type: EngineType) -> _TaskBridge:
        """Initialize bridge.
        :param engine_type: Type of the engine to create a bridge for.
        :return: Engine task.
        :raises KeyError: If engine type is not supported.
        """
        bridge_types = {
            EngineType.langchain: LangChainNER,
            EngineType.ollama: OllamaNER,
            EngineType.outlines: OutlinesNER,
            EngineType.dspy: DSPyNER,
            EngineType.instructor: InstructorNER,
            EngineType.glix: GliXNER,
            EngineType.vllm: VLLMNER,
        }
        # Only the lookup is guarded, so a KeyError raised while constructing the bridge is not misreported as an
        # unsupported engine type.
        try:
            bridge_class = bridge_types[engine_type]
        except KeyError as err:
            raise KeyError(f"Engine type {engine_type} is not supported by {self.__class__.__name__}.") from err

        result = bridge_class(
            task_id=self._task_id,
            prompt_template=self._custom_prompt_template,
            prompt_signature_desc=self._custom_prompt_signature_desc,
            entities=self._entities,
        )
        return result  # type: ignore[return-value]

    @property
    def supports(self) -> set[EngineType]:
        """Returns the engine types this task has a bridge for.
        :return: Supported engine types.
        """
        return {
            EngineType.langchain,
            EngineType.ollama,
            EngineType.dspy,
            EngineType.outlines,
            EngineType.instructor,
            EngineType.glix,
            EngineType.vllm,
        }

    def _validate_fewshot_examples(self) -> None:
        """Checks that every entity in the few-shot examples uses a configured entity type.
        :raises ValueError: If a few-shot example contains an entity type not in the configured entity types.
        """
        for fs_example in self._fewshot_examples or []:
            for entity in fs_example.entities:
                if entity.entity_type not in self._entities:
                    raise ValueError(f"Entity {entity.entity_type} not in {self._entities}.")

    @property
    def _state(self) -> dict[str, Any]:
        """Returns the parent's state extended with the configured entity types.
        :return: State as dict.
        """
        return {
            **super()._state,
            "entities": self._entities,
        }

    def to_hf_dataset(self, docs: Iterable[Doc], threshold: float = 0.5) -> datasets.Dataset:
        """Converts the task results stored in the given docs into a Hugging Face dataset.
        Each row holds the document text and its entities (text, start, end, entity_type).
        :param docs: Documents holding results for this task.
        :param threshold: Not used by this implementation.
        :return: Dataset with one row per document.
        :raises KeyError: If a document has no results for this task's ID.
        """
        # Define metadata and features for the dataset
        features = datasets.Features(
            {
                "text": datasets.Value("string"),
                "entities": datasets.Sequence(
                    datasets.Features(
                        {
                            "text": datasets.Value("string"),
                            "start": datasets.Value("int32"),
                            "end": datasets.Value("int32"),
                            "entity_type": datasets.Value("string"),
                        }
                    )
                ),
            }
        )

        info = datasets.DatasetInfo(
            description=f"Named Entity Recognition dataset with entity types {self._entities}. Generated with sieves "
            f"v{Config.get_version()}.",
            features=features,
        )

        # Fetch data used for generating dataset
        try:
            data: list[tuple[str, list[dict[str, Any]]]] = []
            for doc in docs:
                if self._task_id not in doc.results:
                    raise KeyError(f"Document does not have results for task ID {self._task_id}")

                # Get the entities from the document results
                result = doc.results[self._task_id].entities
                entities: list[dict[str, Any]] = []

                # Every entity must expose the four attributes copied into the dataset row
                for entity in result:
                    assert hasattr(entity, "text")
                    assert hasattr(entity, "start")
                    assert hasattr(entity, "end")
                    assert hasattr(entity, "entity_type")

                    entities.append(
                        {
                            "text": entity.text,
                            "start": entity.start,
                            "end": entity.end,
                            "entity_type": entity.entity_type,
                        }
                    )

                data.append((doc.text or "", entities))

        except KeyError as err:
            raise KeyError(f"Not all documents have results for this task with ID {self._task_id}") from err

        def generate_data() -> Iterable[dict[str, Any]]:
            """Yields results as dicts.
            :return: Results as dicts.
            """
            for text, entities in data:
                yield {"text": text, "entities": entities}

        # Create dataset
        return datasets.Dataset.from_generator(generate_data, features=features, info=info)

    def distill(
        self,
        base_model_id: str,
        distillation_framework: DistillationFramework,
        hf_dataset: datasets.Dataset,
        init_kwargs: dict[str, Any],
        train_kwargs: dict[str, Any],
        output_path: Path | str,
        train_frac: float,
        val_frac: float,
        seed: int | None = None,
    ) -> None:
        """Distillation is not implemented for this task.
        :raises NotImplementedError: Always.
        """
        raise NotImplementedError

`id` `property`

Returns task ID. Used by pipeline for results and dependency management.

Returns:

Type	Description
`str`	Task ID.

`prompt_signature_description` `property`

Returns prompt signature description.

Returns:

Type	Description
`str \| None`	Prompt signature description.

`prompt_template` `property`

Returns prompt template.

Returns:

Type	Description
`str \| None`	Prompt template.

`__call__(docs)`

Execute the task on a set of documents.

Parameters:

Name	Type	Description	Default
`docs`	`Iterable[Doc]`	Documents to process.	required

Returns:

Type	Description
`Iterable[Doc]`	Processed documents.

Source code in sieves/tasks/predictive/core.py

def __call__(self, docs: Iterable[Doc]) -> Iterable[Doc]:
    """Run the task over a collection of documents.

    :param docs: Documents to process.
    :return Iterable[Doc]: Processed documents.
    """

    # Typing note: a task-level type can be the union of several engines' types (e.g. the inference mode may be
    # outlines_.InferenceMode | dspy_.InferenceMode, depending on the engine supplied at runtime). Python's typing
    # system cannot express this through TypeVars or higher-level generics, so the types involved are trusted to be
    # consistent rather than verified by mypy.

    docs = list(docs)

    # Engine-specific output signature, provided by the bridge.
    prompt_signature = self._bridge.prompt_signature

    # Engine callable that runs prompts for this signature.
    executable = self._engine.build_executable(
        inference_mode=self._bridge.inference_mode,
        prompt_template=self.prompt_template,
        prompt_signature=prompt_signature,
        fewshot_examples=self._fewshot_examples,
    )

    # Per-document values that get rendered into the prompt.
    per_doc_values = self._bridge.extract(docs)

    # Flatten documents into chunks, remembering which slice of the flat list belongs to which document.
    chunk_spans: list[tuple[int, int]] = []
    flat_values: list[dict[str, Any]] = []
    for doc, values in zip(docs, per_doc_values):
        assert doc.text
        span_start = len(flat_values)
        # Documents without chunks are treated as a single chunk holding the full text.
        flat_values.extend(values | {"text": piece} for piece in (doc.chunks or [doc.text]))
        chunk_spans.append((span_start, len(flat_values)))

    # One result per chunk is expected back from the engine.
    chunk_results = list(executable(tuple(flat_values)))
    assert len(chunk_results) == len(flat_values)

    # Merge chunk-level results back into one result per document.
    doc_results = list(self._bridge.consolidate(chunk_results, chunk_spans))
    assert len(doc_results) == len(docs)

    # Attach results to their documents and hand the documents back.
    yield from self._bridge.integrate(doc_results, docs)

`__init__(entities, engine, task_id=None, show_progress=True, include_meta=True, prompt_template=None, prompt_signature_desc=None, fewshot_examples=())`

Initializes new PredictiveTask.

Parameters:

Name	Type	Description	Default
`task_id`	`str \| None`	Task ID.	`None`
`show_progress`	`bool`	Whether to show progress bar for processed documents.	`True`
`include_meta`	`bool`	Whether to include meta information generated by the task.	`True`
`prompt_template`	`str \| None`	Custom prompt template. If None, task's default template is being used.	`None`
`prompt_signature_desc`	`str \| None`	Custom prompt signature description. If None, default will be used.	`None`
`fewshot_examples`	`Iterable[TaskFewshotExample]`	Few-shot examples.	`()`

Source code in sieves/tasks/predictive/ner/core.py

def __init__(
    self,
    entities: list[str],
    engine: Engine,
    task_id: str | None = None,
    show_progress: bool = True,
    include_meta: bool = True,
    prompt_template: str | None = None,
    prompt_signature_desc: str | None = None,
    fewshot_examples: Iterable[TaskFewshotExample] = (),
) -> None:
    """Initializes new NER task.
    :param entities: Entity types to extract. If empty, PERSON, LOCATION and ORGANIZATION are used.
    :param engine: Engine passed on to the PredictiveTask base class.
    :param task_id: Task ID.
    :param show_progress: Whether to show progress bar for processed documents.
    :param include_meta: Whether to include meta information generated by the task.
    :param prompt_template: Custom prompt template. If None, task's default template is being used.
    :param prompt_signature_desc: Custom prompt signature description. If None, default will be used.
    :param fewshot_examples: Few-shot examples.
    """
    # Stored before the base constructor runs, which receives everything else.
    self._entities = entities or ["PERSON", "LOCATION", "ORGANIZATION"]
    super().__init__(
        engine=engine,
        task_id=task_id,
        show_progress=show_progress,
        include_meta=include_meta,
        overwrite=False,
        prompt_template=prompt_template,
        prompt_signature_desc=prompt_signature_desc,
        fewshot_examples=fewshot_examples,
    )
    # Bare annotation: narrows the attribute type for type checkers, assigns nothing.
    self._fewshot_examples: Iterable[TaskFewshotExample]

`deserialize(config, **kwargs)` `classmethod`

Generate PredictiveTask instance from config.

Parameters:

Name	Type	Description	Default
`config`	`Config`	Config to generate instance from.	required
`kwargs`	`dict[str, Any]`	Values to inject into loaded config.	`{}`

Returns:

Type	Description
`PredictiveTask[TaskPromptSignature, TaskResult, TaskBridge]`	Deserialized PredictiveTask instance.

Source code in sieves/tasks/predictive/core.py

@classmethod
def deserialize(
    cls, config: Config, **kwargs: dict[str, Any]
) -> PredictiveTask[TaskPromptSignature, TaskResult, TaskBridge]:
    """Generate PredictiveTask instance from config.
    :param config: Config to generate instance from.
    :param kwargs: Values to inject into loaded config. The optional "engine" entry holds the keyword arguments
        forwarded to the engine's own deserialize().
    :return PredictiveTask[TaskPromptSignature, TaskResult, TaskBridge]: Deserialized PredictiveTask instance.
    """
    # Validate engine config.
    assert hasattr(config, "engine")
    assert isinstance(config.engine.value, Config)
    engine_config = config.engine.value
    engine_cls = engine_config.config_cls
    assert issubclass(engine_cls, Serializable)
    assert issubclass(engine_cls, Engine)

    # Deserialize and inject engine. A missing "engine" entry means "no extra engine arguments" instead of
    # failing with a bare KeyError.
    engine_kwargs = kwargs.get("engine", {})
    engine_param: dict[str, Any] = {"engine": engine_cls.deserialize(engine_config, **engine_kwargs)}
    # The deserialized engine instance replaces the raw "engine" kwargs before building the init dict.
    return cls(**config.to_init_dict(cls, **(kwargs | engine_param)))

`serialize()`

Serializes task.

Returns:

Type	Description
`Config`	Config instance.

Source code in sieves/tasks/core.py

def serialize(self) -> Config:
    """Serializes task.
    Every entry of the task's state is wrapped in an Attribute and handed to Config.create together with the
    task's class.
    :return: Config instance.
    """
    attributes = {}
    for name, value in self._state.items():
        attributes[name] = Attribute(value=value)
    return Config.create(self.__class__, attributes)

`GliXNER`

Bases: NERBridge[list[str], Result, InferenceMode]

Source code in sieves/tasks/predictive/ner/bridges.py

class GliXNER(NERBridge[list[str], glix_.Result, glix_.InferenceMode]):
    """NER bridge working with glix_ results and inference modes."""

    def __init__(
        self,
        entities: list[str],
        task_id: str,
        prompt_template: str | None,
        prompt_signature_desc: str | None,
    ):
        """
        Initializes GliXNER bridge.
        :param entities: List of entity types to extract.
        :param task_id: Task ID.
        :param prompt_template: Custom prompt template.
        :param prompt_signature_desc: Custom prompt signature description.
        """
        super().__init__(
            task_id=task_id,
            entities=entities,
            prompt_signature_desc=prompt_signature_desc,
            prompt_template=prompt_template,
        )

    @property
    def _prompt_template(self) -> str | None:
        # This bridge defines no default prompt template.
        return None

    @property
    def _prompt_signature_description(self) -> str | None:
        # This bridge defines no default prompt signature description.
        return None

    @property
    def prompt_signature(self) -> list[str]:
        # The signature is simply the list of entity types.
        return self._entities

    @property
    def inference_mode(self) -> glix_.InferenceMode:
        return glix_.InferenceMode.ner

    def consolidate(
        self, results: Iterable[glix_.Result], docs_offsets: list[tuple[int, int]]
    ) -> Iterable[glix_.Result]:
        """Groups chunk results per document, tagging every entity with the index of its chunk.
        Positions are left untouched here; integrate() shifts them using the chunk index.
        :param results: Results per document chunk.
        :param docs_offsets: Start/end indices into results for each document.
        :return: One flat list of entity dicts per document.
        """
        chunk_results = list(results)

        for first, last in docs_offsets:
            merged: list[dict[str, Any]] = []

            for chunk_idx, chunk_result in enumerate(chunk_results[first:last]):
                for entity in chunk_result:
                    # Anything that is not a dict is ignored.
                    if not isinstance(entity, dict):
                        continue
                    # Work on a copy so the engine's own result is not modified.
                    tagged = entity.copy()
                    tagged["chunk_idx"] = chunk_idx
                    merged.append(tagged)

            yield merged

    def integrate(self, results: Iterable[glix_.Result], docs: Iterable[Doc]) -> Iterable[Doc]:
        """Stores consolidated entities in each document's results under this task's ID.
        Entity positions are shifted by the start offset of the chunk they were found in.
        :param results: Consolidated results, one per document.
        :param docs: Documents to store results in.
        :return: Documents with results attached.
        """
        all_docs = list(docs)
        all_results = list(results)

        for doc, doc_result in zip(all_docs, all_results):
            text = "" if doc.text is None else doc.text

            # Start offset of every chunk, computed as if chunks were joined by exactly one character.
            # NOTE(review): that single-character separator is an assumption - confirm against the chunker in use.
            chunk_starts: list[int] = []
            if hasattr(doc, "chunks") and doc.chunks:
                cursor = 0
                for chunk in doc.chunks:
                    chunk_starts.append(cursor)
                    cursor += len(chunk) + 1

            found: list[Entity] = []
            for raw in doc_result or ():
                if not isinstance(raw, dict):
                    continue

                try:
                    entity_text = str(raw.get("text", ""))
                    entity_start = int(raw.get("start", 0))
                    entity_end = int(raw.get("end", 0))
                    entity_type = str(raw.get("label", ""))

                    # Chunk index was attached in consolidate().
                    chunk_idx = int(raw.get("chunk_idx", 0))

                    # Shift only if an offset is known for this chunk; otherwise keep positions as reported.
                    shift = 0
                    if chunk_starts and chunk_idx < len(chunk_starts):
                        shift = chunk_starts[chunk_idx]

                    found.append(
                        Entity(
                            text=entity_text,
                            start=entity_start + shift,
                            end=entity_end + shift,
                            entity_type=entity_type,
                        )
                    )
                except (ValueError, TypeError) as e:
                    # Malformed entities are reported and skipped; the remaining ones are still processed.
                    print(f"Error processing entity: {e}")
                    continue

            doc.results[self._task_id] = Entities(text=text, entities=found)

        return all_docs

`prompt_signature_description` `property`

Returns prompt signature description. This is used by some engines to aid the language model in generating structured output.

Returns:

Type	Description
`str \| None`	Prompt signature description. None if not used by engine.

`prompt_template` `property`

Returns prompt template. Note: different engines have different expectations as to what a prompt should look like. E.g. outlines supports the Jinja 2 templating format for insertion of values and few-shot examples, whereas DSPy integrates these things in a different place in the workflow and hence expects the prompt not to include them. Mind engine-specific expectations when creating a prompt template.

Returns:

Type	Description
`str \| None`	Prompt template as string. None if not used by engine.

`__init__(entities, task_id, prompt_template, prompt_signature_desc)`

Initializes GliXNER bridge.

Parameters:

Name	Type	Description	Default
`entities`	`list[str]`	List of entity types to extract.	required
`task_id`	`str`	Task ID.	required
`prompt_template`	`str \| None`	Custom prompt template.	required
`prompt_signature_desc`	`str \| None`	Custom prompt signature description.	required

Source code in sieves/tasks/predictive/ner/bridges.py

def __init__(
    self,
    entities: list[str],
    task_id: str,
    prompt_template: str | None,
    prompt_signature_desc: str | None,
):
    """
    Initializes GliXNER bridge. All arguments are forwarded unchanged to the parent constructor.
    :param entities: List of entity types to extract.
    :param task_id: Task ID.
    :param prompt_template: Custom prompt template.
    :param prompt_signature_desc: Custom prompt signature description.
    """
    super().__init__(
        task_id=task_id,
        entities=entities,
        prompt_signature_desc=prompt_signature_desc,
        prompt_template=prompt_template,
    )

`extract(docs)`

Extract all values from doc instances that are to be injected into the prompts. Overriding the default implementation to include the entity types in the extracted values.

Parameters:

Name	Type	Description	Default
`docs`	`Iterable[Doc]`	Docs to extract values from.	required

Returns:

Type	Description
`Iterable[dict[str, Any]]`	All values from doc instances that are to be injected into the prompts

Source code in sieves/tasks/predictive/ner/bridges.py

def extract(self, docs: Iterable[Doc]) -> Iterable[dict[str, Any]]:
    """Build the values injected into the prompt for each doc.
    Besides the doc's text (None if the text is empty or missing), the configured entity types are included.
    :param docs: Docs to extract values from.
    :return Iterable[dict[str, Any]]: One dict with "text" and "entity_types" per doc, produced lazily.
    """
    return (
        {
            "text": doc.text or None,
            "entity_types": self._entities,
        }
        for doc in docs
    )

`NERBridge`

Bases: Bridge[_BridgePromptSignature, _BridgeResult, EngineInferenceMode], ABC

Source code in sieves/tasks/predictive/ner/bridges.py

class NERBridge(Bridge[_BridgePromptSignature, _BridgeResult, EngineInferenceMode], abc.ABC):
    """Base bridge for NER: stores the entity types and maps model output onto positions in the document text."""

    def __init__(
        self,
        entities: list[str],
        task_id: str,
        prompt_template: str | None,
        prompt_signature_desc: str | None,
    ):
        """
        Initializes NERBridge.
        :param entities: List of entity types to extract.
        :param task_id: Task ID.
        :param prompt_template: Custom prompt template.
        :param prompt_signature_desc: Custom prompt signature description.
        """
        super().__init__(
            task_id=task_id,
            prompt_template=prompt_template,
            prompt_signature_desc=prompt_signature_desc,
            overwrite=False,
        )
        self._entities = entities

    def extract(self, docs: Iterable[Doc]) -> Iterable[dict[str, Any]]:
        """Extract all values from doc instances that are to be injected into the prompts.
        Overriding the default implementation to include the entity types in the extracted values.
        :param docs: Docs to extract values from.
        :return Iterable[dict[str, Any]]: All values from doc instances that are to be injected into the prompts
        """
        return ({"text": doc.text if doc.text else None, "entity_types": self._entities} for doc in docs)

    @staticmethod
    def _find_entity_positions(
        doc_text: str,
        result: _BridgeResult,
    ) -> list[Entity]:
        """
        Find all positions of an entity in a document.
        Each entity reported by the model comes with a context snippet. Every occurrence of that context is located
        in the document (case-insensitively), and the entity is placed at its first occurrence within the context.
        Entities whose text does not occur in their context are dropped.
        :param doc_text: The text of the document.
        :param result: The result of the model.
        :return: The list of entities with start/end indices, sorted by start index.
        """
        # NOTE(review): offsets are computed on the lowercased text and applied to the original text. This assumes
        # lowercasing does not change string length - confirm for non-ASCII input.
        doc_text_lower = doc_text.lower()
        new_entities: list[Entity] = []

        # Track entities by position to avoid duplicates
        entities_by_position: dict[tuple[int, int], Entity] = {}
        # (entity text, context) pairs already handled. Keying on the pair rather than on the context alone keeps
        # different entities that share one context (e.g. two names in the same sentence) from being dropped.
        seen_mentions: set[tuple[str, str]] = set()

        entities_list = getattr(result, "entities", [])
        for entity_with_context in entities_list:
            # Skip if there is no entity
            if not entity_with_context:
                continue

            # Get the entity and context texts from the model
            entity_text = getattr(entity_with_context, "text", "")
            context = getattr(entity_with_context, "context", "")
            entity_type = getattr(entity_with_context, "entity_type", "")

            if not entity_text:
                continue

            entity_text_lower = entity_text.lower()
            context_lower = context.lower() if context else ""

            # Skip exact repetitions of an already processed mention
            mention_key = (entity_text_lower, context_lower)
            if mention_key in seen_mentions:
                continue
            seen_mentions.add(mention_key)

            # The entity's offset inside its context is the same for every occurrence of the context, so it is
            # computed once. Without a hit (this includes an empty context) no position can be derived.
            entity_start_in_context = context_lower.find(entity_text_lower)
            if entity_start_in_context < 0:
                continue

            # For each occurrence of the context in the document (usually just one), place the entity
            for match in re.finditer(re.escape(context_lower), doc_text_lower):
                start = match.start() + entity_start_in_context
                end = start + len(entity_text)

                # Only add if this exact position hasn't been filled yet
                position_key = (start, end)
                if position_key in entities_by_position:
                    continue

                # Take the text from the document itself so the original casing is preserved
                new_entity = Entity(
                    text=doc_text[start:end],
                    start=start,
                    end=end,
                    entity_type=entity_type,
                )
                entities_by_position[position_key] = new_entity
                new_entities.append(new_entity)

        return sorted(new_entities, key=lambda x: x.start)

    def integrate(self, results: Iterable[_BridgeResult], docs: Iterable[Doc]) -> Iterable[Doc]:
        """Stores positioned entities in each document's results under this task's ID.
        :param results: Results per document.
        :param docs: Documents to store results in.
        :return: Documents with results attached.
        """
        docs_list = list(docs)
        results_list = list(results)

        for doc, result in zip(docs_list, results_list):
            # Get the original text from the document
            doc_text = doc.text or ""
            if hasattr(result, "entities"):
                # Process entities from result if available
                entities_with_position = self._find_entity_positions(doc_text, result)
                # Create a new result with the updated entities
                new_result = Entities(text=doc_text, entities=entities_with_position)
                doc.results[self._task_id] = new_result
            else:
                # Default empty result
                doc.results[self._task_id] = Entities(text=doc_text, entities=[])

        return docs_list

`inference_mode` `abstractmethod` `property`

Returns inference mode.

Returns:

Type	Description
`EngineInferenceMode`	Inference mode.

`prompt_signature` `abstractmethod` `property`

Creates output signature (e.g.: Signature in DSPy, Pydantic objects in outlines, JSON schema in jsonformers). This is engine-specific.

Returns:

Type	Description
`type[TaskPromptSignature] \| TaskPromptSignature`	Output signature object. This can be an instance (e.g. a regex string) or a class (e.g. a Pydantic class).

`prompt_signature_description` `property`

Returns prompt signature description. This is used by some engines to aid the language model in generating structured output.

Returns:

Type	Description
`str \| None`	Prompt signature description. None if not used by engine.

`prompt_template` `property`

Returns prompt template. Note: different engines have different expectations as to what a prompt should look like. E.g. outlines supports the Jinja 2 templating format for insertion of values and few-shot examples, whereas DSPy integrates these things in a different place in the workflow and hence expects the prompt not to include them. Mind engine-specific expectations when creating a prompt template.

Returns:

Type	Description
`str \| None`	Prompt template as string. None if not used by engine.

`__init__(entities, task_id, prompt_template, prompt_signature_desc)`

Initializes NERBridge.

Parameters:

Name	Type	Description	Default
`task_id`	`str`	Task ID.	required
`prompt_template`	`str \| None`	Custom prompt template.	required
`prompt_signature_desc`	`str \| None`	Custom prompt signature description.	required

Source code in sieves/tasks/predictive/ner/bridges.py

def __init__(
    self,
    entities: list[str],
    task_id: str,
    prompt_template: str | None,
    prompt_signature_desc: str | None,
):
    """
    Initializes NERBridge.
    :param entities: List of entity types to extract.
    :param task_id: Task ID.
    :param prompt_template: Custom prompt template.
    :param prompt_signature_desc: Custom prompt signature description.
    """
    super().__init__(
        task_id=task_id,
        prompt_template=prompt_template,
        prompt_signature_desc=prompt_signature_desc,
        overwrite=False,
    )
    # Entity types are kept on the bridge itself; they are not passed to the parent constructor.
    self._entities = entities

`consolidate(results, docs_offsets)` `abstractmethod`

Consolidates results for document chunks into document results.

Parameters:

Name	Type	Description	Default
`results`	`Iterable[TaskResult]`	Results per document chunk.	required
`docs_offsets`	`list[tuple[int, int]]`	Chunk offsets per document. Chunks per document can be obtained with results[docs_offsets[i][0]:docs_offsets[i][1]].	required

Returns:

Type	Description
`Iterable[TaskResult]`	Results per document.

Source code in sieves/tasks/predictive/bridges.py

@abc.abstractmethod
def consolidate(self, results: Iterable[TaskResult], docs_offsets: list[tuple[int, int]]) -> Iterable[TaskResult]:
    """Consolidates results for document chunks into document results.
    :param results: Results per document chunk.
    :param docs_offsets: Chunk offsets per document. Chunks per document can be obtained with
        results[docs_offsets[i][0]:docs_offsets[i][1]].
    :return Iterable[TaskResult]: Results per document.
    """

`extract(docs)`

Extract all values from doc instances that are to be injected into the prompts. Overriding the default implementation to include the entity types in the extracted values.

Parameters:

Name	Type	Description	Default
`docs`	`Iterable[Doc]`	Docs to extract values from.	required

Returns:

Type	Description
`Iterable[dict[str, Any]]`	All values from doc instances that are to be injected into the prompts

Source code in sieves/tasks/predictive/ner/bridges.py

def extract(self, docs: Iterable[Doc]) -> Iterable[dict[str, Any]]:
    """Build the values injected into the prompt for each doc.
    Besides the doc's text (None if the text is empty or missing), the configured entity types are included.
    :param docs: Docs to extract values from.
    :return Iterable[dict[str, Any]]: One dict with "text" and "entity_types" per doc, produced lazily.
    """
    return (
        {
            "text": doc.text or None,
            "entity_types": self._entities,
        }
        for doc in docs
    )

Named Entity Recognition

NER

id property

prompt_signature_description property

prompt_template property

__call__(docs)

__init__(entities, engine, task_id=None, show_progress=True, include_meta=True, prompt_template=None, prompt_signature_desc=None, fewshot_examples=())

deserialize(config, **kwargs) classmethod

serialize()

GliXNER

prompt_signature_description property

prompt_template property

__init__(entities, task_id, prompt_template, prompt_signature_desc)

extract(docs)

NERBridge

inference_mode abstractmethod property

prompt_signature abstractmethod property

prompt_signature_description property

prompt_template property

__init__(entities, task_id, prompt_template, prompt_signature_desc)

consolidate(results, docs_offsets) abstractmethod

extract(docs)

`NER`

`id` `property`

`prompt_signature_description` `property`

`prompt_template` `property`

`__call__(docs)`

`__init__(entities, engine, task_id=None, show_progress=True, include_meta=True, prompt_template=None, prompt_signature_desc=None, fewshot_examples=())`

`deserialize(config, **kwargs)` `classmethod`

`serialize()`

`GliXNER`

`prompt_signature_description` `property`

`prompt_template` `property`

`__init__(entities, task_id, prompt_template, prompt_signature_desc)`

`extract(docs)`

`NERBridge`

`inference_mode` `abstractmethod` `property`

`prompt_signature` `abstractmethod` `property`

`prompt_signature_description` `property`

`prompt_template` `property`

`__init__(entities, task_id, prompt_template, prompt_signature_desc)`

`consolidate(results, docs_offsets)` `abstractmethod`

`extract(docs)`