Question Answering

The QuestionAnswering task answers questions based on the content of the documents.

Usage

# Import the task collection that exposes QuestionAnswering.
from sieves import tasks

# NOTE: `model` is not defined in this snippet; create it beforehand and pass it in as the task's model.
task = tasks.QuestionAnswering(
    # Every question in this list is answered for each processed document.
    questions=["What is the main topic?", "Who are the key figures?"],
    model=model,
)

Results

The QuestionAnswering task returns a unified Result object containing a list of qa_pairs. Each pair couples the input question with its predicted answer and a confidence score.

Confidence scores are self-reported by LLMs and may be None if the model fails to provide them.

# NOTE(review): pydantic can surface the class docstring and the field `description` in the generated
# schema, so both are presumably runtime text rather than free-form documentation — confirm before rewording.
class Result(pydantic.BaseModel):
    """Result of a question-answering task. Contains a list of question-answer pairs.

    Attributes:
        qa_pairs: List of question-answer pairs.
    """

    # Required field (no default is given): the question-answer pairs produced for one document.
    qa_pairs: list[QuestionAnswer] = pydantic.Field(
        description="A list of question-answer pairs corresponding to the provided questions."
    )

Evaluation

Performance of the Question Answering task is assessed using a "judge" model.

Metric: LLM Score (reported under the key `LLM Score` in the evaluation report). A model-based similarity score (0.0 to 1.0) provided by a DSPy judge, averaged across all question-answer pairs.
Requirement: Each document must have ground-truth answers stored in doc.gold[task_id].
Judge: You must provide a dspy.LM instance to the evaluate() method.

# `dspy_judge` is the dspy.LM instance used as judge; `docs` are the documents to evaluate.
report = task.evaluate(docs, judge=dspy_judge)
# The judge-based score is available in the report's metrics under the "LLM Score" key.
print(f"QA Score: {report.metrics['LLM Score']}")

Ground Truth Formats

Ground truth has to be specified in doc.gold[task_id] using Result instances.

Question Answering predictive task.

`QuestionAnswering`

Bases: PredictiveTask[TaskPromptSignature, TaskResult, _TaskBridge]

Answer questions about a text using structured model wrappers.

Source code in sieves/tasks/predictive/question_answering/core.py

class QuestionAnswering(PredictiveTask[TaskPromptSignature, TaskResult, _TaskBridge]):
    """Answer questions about a text using structured model wrappers."""

    def __init__(
        self,
        questions: list[str],
        model: TaskModel,
        task_id: str | None = None,
        include_meta: bool = True,
        batch_size: int = -1,
        prompt_instructions: str | None = None,
        fewshot_examples: Sequence[FewshotExample] = (),
        model_settings: ModelSettings = ModelSettings(),
        condition: Callable[[Doc], bool] | None = None,
    ) -> None:
        """
        Initialize QuestionAnswering task.

        :param questions: Questions to answer.
        :param model: Model to use.
        :param task_id: Task ID.
        :param include_meta: Whether to include meta information generated by the task.
        :param batch_size: Batch size to use for inference. Use -1 to process all documents at once.
        :param prompt_instructions: Custom prompt instructions. If None, default instructions are used.
        :param fewshot_examples: Few-shot examples.
        :param model_settings: Settings for structured generation.
        :param condition: Optional callable that determines whether to process each document.
        """
        # Stored before the base initializer runs. NOTE(review): presumably the base __init__ builds the bridge
        # via _init_bridge(), which reads self._questions — confirm before reordering.
        self._questions = questions
        super().__init__(
            model=model,
            task_id=task_id,
            include_meta=include_meta,
            batch_size=batch_size,
            overwrite=False,
            prompt_instructions=prompt_instructions,
            fewshot_examples=fewshot_examples,
            model_settings=model_settings,
            condition=condition,
        )

    @property
    @override
    def fewshot_example_type(self) -> type[FewshotExample]:
        """Return few-shot example type.

        :return: Few-shot example type.
        """
        return FewshotExample

    @override
    def _compute_metrics(self, truths: list[Any], preds: list[Any], judge: dspy.LM | None = None) -> dict[str, float]:
        """Compute corpus-level metrics.

        Checks that every non-None ground truth and prediction is a ``TaskResult`` and then delegates to the
        parent implementation.

        :param truths: List of ground truths.
        :param preds: List of predictions.
        :param judge: Optional DSPy LM instance to use as judge for generative tasks.
        :return: Dictionary of metrics.
        """
        for gold in truths:
            if gold is not None:
                assert isinstance(gold, TaskResult)
        for pred in preds:
            if pred is not None:
                assert isinstance(pred, TaskResult)

        return super()._compute_metrics(truths, preds, judge=judge)

    def _validate_fewshot_examples(self) -> None:
        """Validate that questions, answers and scores have the same length.

        Scores are only checked for examples that provide them.

        :raises ValueError: If lengths don't match.
        """
        for i, example in enumerate(self._fewshot_examples):
            if len(example.questions) != len(example.answers):
                raise ValueError(
                    f"Length mismatch in few-shot example {i}: {len(example.questions)} questions "
                    f"vs {len(example.answers)} answers."
                )
            if example.scores is not None and len(example.scores) != len(example.answers):
                raise ValueError(
                    f"Length mismatch in few-shot example {i}: {len(example.answers)} answers "
                    f"vs {len(example.scores)} scores."
                )

    @override
    def _init_bridge(self, model_type: ModelType) -> _TaskBridge:
        """Create the bridge matching the given model type.

        :param model_type: Model type to create a bridge for.
        :return: Bridge instance configured with this task's questions, settings and few-shot examples.
        :raises KeyError: If the model type is not supported by this task.
        """
        bridge_types: dict[ModelType, type[_TaskBridge]] = {
            ModelType.dspy: DSPyQuestionAnswering,
            ModelType.outlines: PydanticQA,
            ModelType.langchain: PydanticQA,
        }

        # Guard only the lookup: a KeyError raised while constructing the bridge must not be reported as an
        # unsupported model type.
        try:
            bridge_type = bridge_types[model_type]
        except KeyError as err:
            raise KeyError(f"Model type {model_type} is not supported by {self.__class__.__name__}.") from err

        return bridge_type(
            task_id=self._task_id,
            prompt_instructions=self._custom_prompt_instructions,
            questions=self._questions,
            model_settings=self._model_settings,
            prompt_signature=self.prompt_signature,
            model_type=model_type,
            fewshot_examples=self._fewshot_examples,
        )

    @staticmethod
    @override
    def supports() -> set[ModelType]:
        """Return the model types supported by this task.

        :return: Supported model types.
        """
        return {
            ModelType.dspy,
            ModelType.langchain,
            ModelType.outlines,
        }

    @property
    @override
    def prompt_signature(self) -> type[pydantic.BaseModel]:
        """Return the prompt signature shared by all bridges of this task.

        :return: The ``TaskResult`` class.
        """
        return TaskResult

    @property
    @override
    def _state(self) -> dict[str, Any]:
        """Return the task state, extended by the configured questions.

        :return: Parent state plus a ``questions`` entry.
        """
        return {
            **super()._state,
            "questions": self._questions,
        }

    @override
    def to_hf_dataset(self, docs: Iterable[Doc], threshold: float | None = None) -> datasets.Dataset:
        """Convert the results of this task into a Hugging Face dataset.

        Each row holds the document text, the answers and the scores. Answers without a score get 1.0.

        :param docs: Documents whose results should be exported.
        :param threshold: Not used by this implementation.
        :return: Dataset with ``text``, ``answers`` and ``scores`` columns.
        :raises KeyError: If a document has no result entry for this task.
        """
        # Define metadata.
        features = datasets.Features(
            {
                "text": datasets.Value("string"),
                "answers": datasets.Sequence(datasets.Value("string")),
                "scores": datasets.Sequence(datasets.Value("float32")),
            }
        )
        info = datasets.DatasetInfo(
            description=f"Question-answering dataset with questions {self._questions}. Generated with sieves "
            f"v{Config.get_version()}.",
            features=features,
        )

        # Fetch data used for generating dataset.
        data: list[tuple[str, list[str], list[float]]] = []
        for doc in docs:
            # Guard only the lookup so that unrelated KeyErrors are not mislabeled as missing results.
            try:
                result = doc.results[self._task_id]
            except KeyError as err:
                raise KeyError(f"Not all documents have results for this task with ID {self._task_id}") from err

            # NOTE(review): a `None` result is not handled here and would fail on `.qa_pairs` — confirm
            # whether such docs can reach this method.
            answers = [qa.answer for qa in result.qa_pairs]
            scores = [qa.score if qa.score is not None else 1.0 for qa in result.qa_pairs]
            data.append((doc.text, answers, scores))

        def generate_data() -> Iterable[dict[str, Any]]:
            """Yield results as dicts.

            :return: Results as dicts.
            """
            for text, answers, scores in data:
                yield {"text": text, "answers": answers, "scores": scores}

        # Create dataset.
        return datasets.Dataset.from_generator(generate_data, features=features, info=info)

    @override
    def distill(
        self,
        base_model_id: str,
        framework: DistillationFramework,
        data: datasets.Dataset | Sequence[Doc],
        output_path: Path | str,
        val_frac: float,
        init_kwargs: dict[str, Any] | None = None,
        train_kwargs: dict[str, Any] | None = None,
        seed: int | None = None,
    ) -> None:
        """Distillation is not implemented for this task.

        :raises NotImplementedError: Always.
        """
        raise NotImplementedError

`fewshot_example_type` `property`

Return few-shot example type.

Returns:

Type	Description
`type[FewshotExample]`	Few-shot example type.

`fewshot_examples` `property`

Return few-shot examples.

Returns:

Type	Description
`Sequence[FewshotExample]`	Few-shot examples.

`id` `property`

Return task ID.

Used by pipeline for results and dependency management.

Returns:

Type	Description
`str`	Task ID.

`metric` `property`

Return metric name.

Returns:

Type	Description
`str`	Metric name.

`prompt_signature_description` `property`

Return prompt signature description.

Returns:

Type	Description
`str \| None`	Prompt signature description.

`prompt_template` `property`

Return prompt template.

Returns:

Type	Description
`str`	Prompt template.

`__add__(other)`

Chain this task with another task or pipeline using the + operator.

This returns a new Pipeline that executes this task first, followed by the task(s) in other. The original task(s)/pipeline are not mutated.

Cache semantics: if other is a Pipeline, the resulting pipeline adopts other's use_cache setting (because the left-hand side is a single task); if other is a Task, the resulting pipeline defaults to use_cache=True.

Parameters:

Name	Type	Description	Default
`other`	`Task \| Pipeline`	A `Task` or `Pipeline` to execute after this task.	required

Returns:

Type	Description
`Pipeline`	A new `Pipeline` representing the chained execution.

Raises:

Type	Description
`TypeError`	If `other` is not a `Task` or `Pipeline`.

Source code in sieves/tasks/core.py

def __add__(self, other: Task | Pipeline) -> Pipeline:
    """Build a new ``Pipeline`` that runs this task and then ``other`` (``+`` operator).

    Neither this task nor ``other`` is modified; a fresh ``Pipeline`` is returned.

    Cache semantics:
    - ``other`` is a ``Pipeline``: its ``use_cache`` setting is carried over to the new pipeline.
    - ``other`` is a ``Task``: ``use_cache`` is not passed, so the ``Pipeline`` default applies.

    :param other: A ``Task`` or ``Pipeline`` to execute after this task.
    :return: A new ``Pipeline`` representing the chained execution.
    :raises TypeError: If ``other`` is not a ``Task`` or ``Pipeline``.
    """
    # Imported here rather than at module level to avoid a circular import.
    from sieves.pipeline import Pipeline

    if isinstance(other, Pipeline):
        chained = Pipeline(tasks=[self, *other.tasks], use_cache=other.use_cache)
    elif isinstance(other, Task):
        chained = Pipeline(tasks=[self, other])
    else:
        raise TypeError(f"Cannot chain Task with {type(other).__name__}")

    return chained

`__call__(docs)`

Execute task with conditional logic.

Checks the condition for each document without materializing all docs upfront. Passes all documents that pass the condition to _call() for proper batching. Documents that fail the condition have results[task_id] set to None.

Parameters:

Name	Type	Description	Default
`docs`	`Iterable[Doc]`	Docs to process.	required

Returns:

Type	Description
`Iterable[Doc]`	Processed docs (in original order).

Source code in sieves/tasks/core.py

def __call__(self, docs: Iterable[Doc]) -> Iterable[Doc]:
    """Run the task on all docs that satisfy the task's condition.

    Docs are consumed batch by batch. Within a batch, the docs passing the condition are handed to
    ``_call()`` together; docs failing it get ``results[task_id] = None`` and are yielded unchanged.

    :param docs: Docs to process.
    :return: Processed docs (in original order).
    """
    doc_stream = docs if isinstance(docs, Iterator) else iter(docs)

    # A non-positive batch size means "no limit".
    chunk_size = sys.maxsize if self._batch_size <= 0 else self._batch_size

    while True:
        batch = list(itertools.islice(doc_stream, chunk_size))
        if not batch:
            break

        # Positions (within the batch) of the docs that are eligible for processing.
        eligible: set[int] = {
            pos for pos, doc in enumerate(batch) if self._condition is None or self._condition(doc)
        }

        # Hand all eligible docs to the task implementation at once.
        outcome = self._call(doc for pos, doc in enumerate(batch) if pos in eligible)
        outcome_iter = outcome if isinstance(outcome, Iterator) else iter(outcome)

        # Walk the batch in its original order, interleaving processed and skipped docs.
        for pos, doc in enumerate(batch):
            if pos not in eligible:
                # Skipped doc: record an explicit `None` result and pass the doc through.
                doc.results[self.id] = None
                yield doc
                continue

            yield next(outcome_iter)

`__init__(questions, model, task_id=None, include_meta=True, batch_size=-1, prompt_instructions=None, fewshot_examples=(), model_settings=ModelSettings(), condition=None)`

Initialize QuestionAnswering task.

Parameters:

Name	Type	Description	Default
`questions`	`list[str]`	Questions to answer.	required
`model`	`TaskModel`	Model to use.	required
`task_id`	`str \| None`	Task ID.	`None`
`include_meta`	`bool`	Whether to include meta information generated by the task.	`True`
`batch_size`	`int`	Batch size to use for inference. Use -1 to process all documents at once.	`-1`
`prompt_instructions`	`str \| None`	Custom prompt instructions. If None, default instructions are used.	`None`
`fewshot_examples`	`Sequence[FewshotExample]`	Few-shot examples.	`()`
`model_settings`	`ModelSettings`	Settings for structured generation.	`ModelSettings()`
`condition`	`Callable[[Doc], bool] \| None`	Optional callable that determines whether to process each document.	`None`

Source code in sieves/tasks/predictive/question_answering/core.py

def __init__(
    self,
    questions: list[str],
    model: TaskModel,
    task_id: str | None = None,
    include_meta: bool = True,
    batch_size: int = -1,
    prompt_instructions: str | None = None,
    fewshot_examples: Sequence[FewshotExample] = (),
    model_settings: ModelSettings = ModelSettings(),
    condition: Callable[[Doc], bool] | None = None,
) -> None:
    """
    Initialize QuestionAnswering task.

    Stores the questions on the instance and forwards all remaining arguments to the parent initializer.

    :param questions: Questions to answer.
    :param model: Model to use.
    :param task_id: Task ID.
    :param include_meta: Whether to include meta information generated by the task.
    :param batch_size: Batch size to use for inference. Use -1 to process all documents at once.
    :param prompt_instructions: Custom prompt instructions. If None, default instructions are used.
    :param fewshot_examples: Few-shot examples.
    :param model_settings: Settings for structured generation.
    :param condition: Optional callable that determines whether to process each document.
    """
    # Stored before the base initializer runs. NOTE(review): presumably the base __init__ builds the bridge,
    # which needs the questions — confirm before reordering these two statements.
    self._questions = questions
    super().__init__(
        model=model,
        task_id=task_id,
        include_meta=include_meta,
        batch_size=batch_size,
        # Fixed value; this constructor does not expose `overwrite` to callers.
        overwrite=False,
        prompt_instructions=prompt_instructions,
        fewshot_examples=fewshot_examples,
        model_settings=model_settings,
        condition=condition,
    )

`deserialize(config, **kwargs)` `classmethod`

Generate PredictiveTask instance from config.

Parameters:

Name	Type	Description	Default
`config`	`Config`	Config to generate instance from.	required
`kwargs`	`dict[str, Any]`	Values to inject into loaded config.	`{}`

Returns:

Type	Description
`PredictiveTask[TaskPromptSignature, TaskResult, TaskBridge]`	Deserialized PredictiveTask instance.

Source code in sieves/tasks/predictive/core.py

@classmethod
def deserialize(
    cls, config: Config, **kwargs: dict[str, Any]
) -> PredictiveTask[TaskPromptSignature, TaskResult, TaskBridge]:
    """Build an instance of this task class from a config.

    :param config: Config to generate instance from.
    :param kwargs: Values to inject into loaded config.
    :return: Deserialized PredictiveTask instance.
    """
    init_kwargs = config.to_init_dict(cls, **kwargs)

    # Turn the loaded model settings into a validated ModelSettings object before constructing the task.
    raw_settings = init_kwargs["model_settings"]
    init_kwargs["model_settings"] = ModelSettings.model_validate(raw_settings)

    return cls(**init_kwargs)

`evaluate(docs, judge=None, failure_threshold=0.5)`

Evaluate task performance using DSPy-based evaluation.

Parameters:

Name	Type	Description	Default
`docs`	`Iterable[Doc]`	Documents to evaluate.	required
`judge`	`LM \| None`	Optional DSPy LM instance to use as judge for generative tasks.	`None`
`failure_threshold`	`float`	Decision threshold for whether to mark predictions as failures.	`0.5`

Returns:

Type	Description
`TaskEvaluationReport`	Evaluation report.

Source code in sieves/tasks/predictive/core.py

def evaluate(
    self, docs: Iterable[Doc], judge: dspy.LM | None = None, failure_threshold: float = 0.5
) -> TaskEvaluationReport:
    """Evaluate task performance using DSPy-based evaluation.

    Docs without a result entry for this task are skipped. Every other doc contributes to the corpus-level
    metrics and is additionally checked on its own to decide whether it counts as a failure.

    :param docs: Documents to evaluate.
    :param judge: Optional DSPy LM instance to use as judge for generative tasks.
    :param failure_threshold: Decision threshold for whether to mark predictions as failures.
    :return: Evaluation report.
    """
    golds: list[Any] = []
    predictions: list[Any] = []
    failed_docs: list[Doc] = []

    for doc in docs:
        if self.id not in doc.results:
            continue

        prediction = doc.results[self.id]
        gold = doc.gold.get(self.id, None)

        # Collected for the corpus-level metrics computed after the loop.
        golds.append(gold)
        predictions.append(prediction)

        if gold is None or prediction is None:
            # No proper evaluation is possible here: the doc is a failure unless both sides are missing.
            both_missing = gold is None and prediction is None
            if not both_missing:
                failed_docs.append(doc)
            continue

        # Convert gold and prediction to their DSPy representations and score this single doc.
        example = dspy.Example(**self._task_result_to_dspy_dict(gold))
        dspy_prediction = dspy.Prediction(**self._task_result_to_dspy_dict(prediction))
        doc_score = self._evaluate_dspy_example(example, dspy_prediction, trace=None, model=judge)

        if doc_score < failure_threshold:
            failed_docs.append(doc)

    # Corpus-level metrics are computed over all collected pairs at once.
    return TaskEvaluationReport(
        metrics=self._compute_metrics(golds, predictions, judge=judge),
        task_id=self.id,
        failures=failed_docs,
    )

`optimize(optimizer, verbose=True)`

Optimize task prompt and few-shot examples with the available optimization config.

Updates task to use best prompt and few-shot examples found by the optimizer.

Parameters:

Name	Type	Description	Default
`optimizer`	`Optimizer`	Optimizer to run.	required
`verbose`	`bool`	Whether to show optimizer output. Set to `False` to suppress it: DSPy produces a good amount of logs, so this can be useful to not pollute your terminal. When suppressed, only errors are logged and warnings are ignored.	`True`

Returns:

Type	Description
`tuple[str, Sequence[FewshotExample]]`	Best found prompt and few-shot examples.

Source code in sieves/tasks/predictive/core.py

def optimize(self, optimizer: optimization.Optimizer, verbose: bool = True) -> tuple[str, Sequence[FewshotExample]]:
    """Optimize task prompt and few-shot examples with the available optimization config.

    Updates task to use best prompt and few-shot examples found by the optimizer, and re-creates the bridge
    so that the new prompt and examples take effect.

    :param optimizer: Optimizer to run.
    :param verbose: Whether to show optimizer output. If False, the ``dspy`` and ``optuna`` loggers are
        temporarily set to ERROR and Python warnings are ignored while the optimizer runs; the previous
        logger levels are restored afterwards.
    :return: Best found prompt and few-shot examples.
    """
    # NOTE: this check is an `assert`, so it is skipped when Python runs with -O.
    assert len(self._fewshot_examples) > 1, "At least two few-shot examples need to be provided to optimize."

    # Run optimizer to get best prompt and few-shot examples.
    signature = self._get_task_signature()
    dspy_examples = [ex.to_dspy() for ex in self._fewshot_examples]

    def _pred_eval(truth: dspy.Example, pred: dspy.Prediction, trace: Any | None = None) -> float:
        """Wrap optimization evaluation, inject model.

        Forwards to ``_evaluate_dspy_example`` with the optimizer's model.

        :param truth: Ground truth.
        :param pred: Predicted value.
        :param trace: Optional trace information.
        :return: Metric value between 0.0 and 1.0.
        :raises KeyError: If target fields are missing from truth or prediction.
        :raises ValueError: If similarity score cannot be parsed from LLM response.
        """
        return self._evaluate_dspy_example(truth, pred, trace, model=optimizer.model)

    if verbose:
        best_prompt, best_examples = optimizer(signature, dspy_examples, _pred_eval, verbose=verbose)
    else:
        # Temporarily suppress DSPy and Optuna logs below ERROR, as well as Python warnings.
        dspy_logger = logging.getLogger("dspy")
        optuna_logger = logging.getLogger("optuna")
        original_dspy_level = dspy_logger.level
        original_optuna_level = optuna_logger.level

        try:
            dspy_logger.setLevel(logging.ERROR)
            optuna_logger.setLevel(logging.ERROR)
            with warnings.catch_warnings():
                warnings.simplefilter("ignore")
                best_prompt, best_examples = optimizer(signature, dspy_examples, _pred_eval, verbose=verbose)
        finally:
            # Restore the previous logger levels even if the optimizer raised.
            dspy_logger.setLevel(original_dspy_level)
            optuna_logger.setLevel(original_optuna_level)

    # Update few-shot examples and prompt instructions.
    # The new examples are rebuilt with the class of the first existing few-shot example.
    fewshot_example_cls = self._fewshot_examples[0].__class__
    self._fewshot_examples = [fewshot_example_cls.from_dspy(ex) for ex in best_examples]
    self._validate_fewshot_examples()
    self._custom_prompt_instructions = best_prompt

    # Reinitialize bridge to use new prompt and few-shot examples.
    self._bridge = self._init_bridge(ModelType.get_model_type(self._model_wrapper))

    return best_prompt, self._fewshot_examples

`serialize()`

Serialize task.

Returns:

Type	Description
`Config`	Config instance.

Source code in sieves/tasks/core.py

def serialize(self) -> Config:
    """Serialize this task into a config.

    Every entry of the task state is wrapped in an ``Attribute`` before the config is created.

    :return: Config instance.
    """
    attributes = {name: Attribute(value=value) for name, value in self._state.items()}
    return Config.create(self.__class__, attributes)

Bridges for question answering task.

`DSPyQuestionAnswering`

Bases: QuestionAnsweringBridge[PromptSignature, Result, InferenceMode]

DSPy bridge for question answering.

Source code in sieves/tasks/predictive/question_answering/bridges.py

class DSPyQuestionAnswering(QuestionAnsweringBridge[dspy_.PromptSignature, dspy_.Result, dspy_.InferenceMode]):
    """DSPy bridge for question answering."""

    @override
    def _validate(self) -> None:
        """Check that this bridge is used with the DSPy model type."""
        assert self._model_type == ModelType.dspy

    @property
    @override
    def _default_prompt_instructions(self) -> str:
        """Return the default prompt instructions (empty for DSPy).

        :return: Empty string.
        """
        return ""

    @property
    @override
    def inference_mode(self) -> dspy_.InferenceMode:
        """Return the inference mode, falling back to ``predict`` if the model settings do not set one.

        :return: Inference mode.
        """
        return self._model_settings.inference_mode or dspy_.InferenceMode.predict

    @property
    @override
    def _chunk_extractor(self) -> Callable[[Any], Iterable[tuple[str, str, float | None]]]:
        """Return a callable that turns a result into ``(question, answer, score)`` tuples.

        :return: Extractor reading ``qa_pairs`` from a result.
        """
        return lambda res: ((qa.question, qa.answer, qa.score) for qa in res.qa_pairs)

    @override
    def integrate(self, results: Sequence[dspy_.Result], docs: list[Doc]) -> list[Doc]:
        """Store each result on its document under this bridge's task ID.

        :param results: Results, paired with ``docs`` by position.
        :param docs: Documents to attach results to.
        :return: The same documents, with results set.
        """
        for doc, result in zip(docs, results):
            # Exactly one completion is expected per result.
            assert len(result.completions.qa_pairs) == 1
            doc.results[self._task_id] = Result(qa_pairs=result.qa_pairs)
        return docs

    @override
    def consolidate(
        self, results: Sequence[dspy_.Result], docs_offsets: list[tuple[int, int]]
    ) -> Sequence[dspy_.Result]:
        """Consolidate results via the consolidation strategy and wrap them as DSPy predictions.

        :param results: Results to consolidate.
        :param docs_offsets: Offsets passed through to the consolidation strategy.
        :return: One prediction per consolidated entry.
        """
        consolidated_results_clean = self._consolidation_strategy.consolidate(results, docs_offsets)

        # Wrap back into dspy.Prediction.
        return [
            dspy.Prediction.from_completions(
                {
                    "qa_pairs": [[QuestionAnswer(question=q, answer=a, score=s) for q, a, s in qa_list]],
                },
                signature=self.prompt_signature,
            )
            for qa_list in consolidated_results_clean
        ]

`model_settings` `property`

Return model settings.

Returns:

Type	Description
`ModelSettings`	Model settings.

`model_type` `property`

Return model type.

Returns:

Type	Description
`ModelType`	Model type.

`prompt_signature` `property`

Create output signature.

E.g.: Signature in DSPy, Pydantic objects in outlines, JSON schema in jsonformers. This is model type-specific.

Returns:

Type	Description
`type[TaskPromptSignature] \| TaskPromptSignature`	Output signature object. This can be an instance (e.g. a regex string) or a class (e.g. a Pydantic class).

`prompt_template` `property`

Return prompt template.

Chains _prompt_instructions, _prompt_example_xml and _prompt_conclusion.

Note: different models have different expectations as to what a prompt should look like. E.g. outlines supports the Jinja 2 templating format for insertion of values and few-shot examples, whereas DSPy integrates these things at a different point in the workflow and hence expects the prompt not to include them. Mind model-specific expectations when creating a prompt template.

Returns:

Type	Description
`str`	Prompt template as string. None if not used by model wrapper.

`__init__(task_id, prompt_instructions, questions, model_settings, prompt_signature, model_type, fewshot_examples=())`

Initialize question answering bridge.

Parameters:

Name	Type	Description	Default
`task_id`	`str`	Task ID.	required
`prompt_instructions`	`str \| None`	Custom prompt instructions. If None, default instructions are used.	required
`questions`	`list[str]`	Questions to answer.	required
`model_settings`	`ModelSettings`	Settings for structured generation.	required
`prompt_signature`	`type[BaseModel]`	Unified Pydantic prompt signature.	required
`model_type`	`ModelType`	Model type.	required
`fewshot_examples`	`Sequence[BaseModel]`	Few-shot examples.	`()`

Source code in sieves/tasks/predictive/question_answering/bridges.py

def __init__(
    self,
    task_id: str,
    prompt_instructions: str | None,
    questions: list[str],
    model_settings: ModelSettings,
    prompt_signature: type[pydantic.BaseModel],
    model_type: ModelType,
    fewshot_examples: Sequence[pydantic.BaseModel] = (),
):
    """Initialize question answering bridge.

    Forwards the common arguments to the parent initializer, then stores the questions and creates the
    consolidation strategy.

    :param task_id: Task ID.
    :param prompt_instructions: Custom prompt instructions. If None, default instructions are used.
    :param questions: Questions to answer.
    :param model_settings: Settings for structured generation.
    :param prompt_signature: Unified Pydantic prompt signature.
    :param model_type: Model type.
    :param fewshot_examples: Few-shot examples.
    """
    super().__init__(
        task_id=task_id,
        prompt_instructions=prompt_instructions,
        # Fixed value; this constructor does not expose `overwrite` to callers.
        overwrite=False,
        model_settings=model_settings,
        prompt_signature=prompt_signature,
        model_type=model_type,
        fewshot_examples=fewshot_examples,
    )
    self._questions = questions
    # The strategy receives the questions and this bridge's `_chunk_extractor`.
    self._consolidation_strategy = QAConsolidation(questions=self._questions, extractor=self._chunk_extractor)

`extract(docs)`

Extract all values from doc instances that are to be injected into the prompts.

Parameters:

Name	Type	Description	Default
`docs`	`Sequence[Doc]`	Docs to extract values from.	required

Returns:

Type	Description
`Sequence[dict[str, Any]]`	All values from doc instances that are to be injected into the prompts as a sequence.

Source code in sieves/tasks/predictive/bridges.py

def extract(self, docs: Sequence[Doc]) -> Sequence[dict[str, Any]]:
    """Collect the per-document values that get injected into the prompts.

    Each document contributes one dict with a single ``text`` key; a falsy text is replaced by ``None``.

    :param docs: Docs to extract values from.
    :return: All values from doc instances that are to be injected into the prompts as a sequence.
    """
    prompt_values: list[dict[str, Any]] = []
    for doc in docs:
        prompt_values.append({"text": doc.text or None})
    return prompt_values

`PydanticQA`

Bases: QuestionAnsweringBridge[BaseModel, BaseModel, ModelWrapperInferenceMode], ABC

Base class for Pydantic-based question answering bridges.

Source code in sieves/tasks/predictive/question_answering/bridges.py

class PydanticQA(QuestionAnsweringBridge[pydantic.BaseModel, pydantic.BaseModel, ModelWrapperInferenceMode], abc.ABC):
    """Base class for Pydantic-based question answering bridges."""

    @override
    def _validate(self) -> None:
        """Check that this bridge is used with the langchain or outlines model type."""
        assert self._model_type in {ModelType.langchain, ModelType.outlines}

    @property
    @override
    def _default_prompt_instructions(self) -> str:
        """Return the default prompt instructions.

        :return: Default instructions.
        """
        return (
            "Use the given text to answer the following questions. Ensure you answer each question exactly once. "
            "Prefix each question with the number of the corresponding question. Also provide a confidence score "
            "between 0.0 and 1.0 for each answer."
        )

    @property
    @override
    def _prompt_conclusion(self) -> str | None:
        """Return the prompt conclusion, a template snippet with a ``text`` placeholder.

        :return: Prompt conclusion.
        """
        return "========\n<text>{{ text }}</text>"

    @property
    @override
    def _chunk_extractor(self) -> Callable[[Any], Iterable[tuple[str, str, float | None]]]:
        """Return a callable that turns a result into ``(question, answer, score)`` tuples.

        :return: Extractor reading ``qa_pairs`` from a result.
        """
        return lambda res: ((qa.question, qa.answer, qa.score) for qa in res.qa_pairs)

    @override
    def integrate(self, results: Sequence[pydantic.BaseModel], docs: list[Doc]) -> list[Doc]:
        """Store each result on its document under this bridge's task ID.

        :param results: Results, paired with ``docs`` by position.
        :param docs: Documents to attach results to.
        :return: The same documents, with results set.
        """
        for doc, result in zip(docs, results):
            assert hasattr(result, "qa_pairs")
            doc.results[self._task_id] = Result(qa_pairs=result.qa_pairs)
        return docs

    @override
    def consolidate(
        self, results: Sequence[pydantic.BaseModel], docs_offsets: list[tuple[int, int]]
    ) -> Sequence[pydantic.BaseModel]:
        """Consolidate results via the consolidation strategy and wrap them in the prompt signature.

        :param results: Results to consolidate.
        :param docs_offsets: Offsets passed through to the consolidation strategy.
        :return: One prompt-signature instance per consolidated entry.
        """
        assert issubclass(self.prompt_signature, pydantic.BaseModel)

        consolidated_results_clean = self._consolidation_strategy.consolidate(results, docs_offsets)

        return [
            self.prompt_signature(qa_pairs=[QuestionAnswer(question=q, answer=a, score=s) for q, a, s in qa_list])
            for qa_list in consolidated_results_clean
        ]

    @property
    @override
    def model_type(self) -> ModelType:
        """Return model type.

        :return: Model type.
        """
        return self._model_type

    @property
    @override
    def inference_mode(self) -> outlines_.InferenceMode | langchain_.InferenceMode:
        """Return the inference mode for the configured model type.

        Falls back to ``json`` for outlines and ``structured`` for langchain if the model settings do not
        set an inference mode.

        :return: Inference mode.
        :raises ValueError: If the model type is neither outlines nor langchain.
        """
        if self._model_type == ModelType.outlines:
            return self._model_settings.inference_mode or outlines_.InferenceMode.json
        elif self._model_type == ModelType.langchain:
            return self._model_settings.inference_mode or langchain_.InferenceMode.structured

        raise ValueError(f"Unsupported model type: {self._model_type}")

`model_settings` `property`

Return model settings.

Returns:

Type	Description
`ModelSettings`	Model settings.

`prompt_signature` `property`

Create output signature.

E.g.: Signature in DSPy, Pydantic objects in outlines, JSON schema in jsonformers. This is model type-specific.

Returns:

Type	Description
`type[TaskPromptSignature] \| TaskPromptSignature`	Output signature object. This can be an instance (e.g. a regex string) or a class (e.g. a Pydantic class).

`prompt_template` `property`

Return prompt template.

Chains _prompt_instructions, _prompt_example_xml and _prompt_conclusion.

Note: different models have different expectations as to what a prompt should look like. E.g. outlines supports the Jinja 2 templating format for insertion of values and few-shot examples, whereas DSPy integrates these things at a different point in the workflow and hence expects the prompt not to include them. Mind model-specific expectations when creating a prompt template.

Returns:

Type	Description
`str`	Prompt template as string. None if not used by model wrapper.

`__init__(task_id, prompt_instructions, questions, model_settings, prompt_signature, model_type, fewshot_examples=())`

Initialize question answering bridge.

Parameters:

Name	Type	Description	Default
`task_id`	`str`	Task ID.	required
`prompt_instructions`	`str \| None`	Custom prompt instructions. If None, default instructions are used.	required
`questions`	`list[str]`	Questions to answer.	required
`model_settings`	`ModelSettings`	Settings for structured generation.	required
`prompt_signature`	`type[BaseModel]`	Unified Pydantic prompt signature.	required
`model_type`	`ModelType`	Model type.	required
`fewshot_examples`	`Sequence[BaseModel]`	Few-shot examples.	`()`

Source code in sieves/tasks/predictive/question_answering/bridges.py

def __init__(
    self,
    task_id: str,
    prompt_instructions: str | None,
    questions: list[str],
    model_settings: ModelSettings,
    prompt_signature: type[pydantic.BaseModel],
    model_type: ModelType,
    fewshot_examples: Sequence[pydantic.BaseModel] = (),
):
    """Set up the state shared by question answering bridges.

    Forwards the generic settings to the parent initializer (with `overwrite` fixed to False), stores the
    questions and builds the `QAConsolidation` strategy from them and this bridge's chunk extractor.

    :param task_id: Task ID.
    :param prompt_instructions: Custom prompt instructions. Default instructions are used if None.
    :param questions: Questions to answer.
    :param model_settings: Settings for structured generation.
    :param prompt_signature: Unified Pydantic prompt signature.
    :param model_type: Model type.
    :param fewshot_examples: Few-shot examples.
    """
    super().__init__(
        task_id=task_id,
        prompt_instructions=prompt_instructions,
        overwrite=False,
        model_settings=model_settings,
        prompt_signature=prompt_signature,
        model_type=model_type,
        fewshot_examples=fewshot_examples,
    )

    self._questions = questions
    # Resolve the extractor property once, after the parent initializer has run.
    chunk_extractor = self._chunk_extractor
    self._consolidation_strategy = QAConsolidation(questions=self._questions, extractor=chunk_extractor)

`extract(docs)`

Extract all values from doc instances that are to be injected into the prompts.

Parameters:

Name	Type	Description	Default
`docs`	`Sequence[Doc]`	Docs to extract values from.	required

Returns:

Type	Description
`Sequence[dict[str, Any]]`	All values from doc instances that are to be injected into the prompts as a sequence.

Source code in sieves/tasks/predictive/bridges.py

def extract(self, docs: Sequence[Doc]) -> Sequence[dict[str, Any]]:
    """Collect the values to be injected into the prompts, one dict per doc.

    :param docs: Docs to extract values from.
    :return: One dict per doc, in input order, holding the doc's text under "text". A falsy text (e.g. empty
        or None) is passed on as None.
    """
    return [{"text": document.text or None} for document in docs]

`QuestionAnsweringBridge`

Bases: Bridge[_BridgePromptSignature, _BridgeResult, ModelWrapperInferenceMode], ABC

Abstract base class for question answering bridges.

Source code in sieves/tasks/predictive/question_answering/bridges.py

class QuestionAnsweringBridge(Bridge[_BridgePromptSignature, _BridgeResult, ModelWrapperInferenceMode], abc.ABC):
    """Common abstract base for question answering bridges.

    Stores the questions and builds a `QAConsolidation` strategy from them; subclasses supply the chunk extractor.
    """

    def __init__(
        self,
        task_id: str,
        prompt_instructions: str | None,
        questions: list[str],
        model_settings: ModelSettings,
        prompt_signature: type[pydantic.BaseModel],
        model_type: ModelType,
        fewshot_examples: Sequence[pydantic.BaseModel] = (),
    ):
        """Set up the state shared by question answering bridges.

        Forwards the generic settings to the parent initializer (with `overwrite` fixed to False), stores the
        questions and builds the `QAConsolidation` strategy from them and this bridge's chunk extractor.

        :param task_id: Task ID.
        :param prompt_instructions: Custom prompt instructions. Default instructions are used if None.
        :param questions: Questions to answer.
        :param model_settings: Settings for structured generation.
        :param prompt_signature: Unified Pydantic prompt signature.
        :param model_type: Model type.
        :param fewshot_examples: Few-shot examples.
        """
        super().__init__(
            task_id=task_id,
            prompt_instructions=prompt_instructions,
            overwrite=False,
            model_settings=model_settings,
            prompt_signature=prompt_signature,
            model_type=model_type,
            fewshot_examples=fewshot_examples,
        )

        self._questions = questions
        # Resolve the extractor property once, after the parent initializer has run.
        chunk_extractor = self._chunk_extractor
        self._consolidation_strategy = QAConsolidation(questions=self._questions, extractor=chunk_extractor)

    @property
    @abc.abstractmethod
    def _chunk_extractor(self) -> Callable[[Any], Iterable[tuple[str, str, float | None]]]:
        """Provide the callable that turns a raw chunk result into (question, answer, score) tuples.

        Abstract: every concrete bridge has to supply its own extractor.

        :return: Extractor callable.
        """

`inference_mode` `abstractmethod` `property`

Return inference mode.

Returns:

Type	Description
`ModelWrapperInferenceMode`	Inference mode.

`model_settings` `property`

Return model settings.

Returns:

Type	Description
`ModelSettings`	Model settings.

`model_type` `property`

Return model type.

Returns:

Type	Description
`ModelType`	Model type.

`prompt_signature` `property`

Create output signature.

E.g.: Signature in DSPy, Pydantic objects in outlines, JSON schema in jsonformers. This is model type-specific.

Returns:

Type	Description
`type[TaskPromptSignature] \| TaskPromptSignature`	Output signature object. This can be an instance (e.g. a regex string) or a class (e.g. a Pydantic class).

`prompt_template` `property`

Return prompt template.

Chains _prompt_instructions, _prompt_example_xml and _prompt_conclusion.

Note: different models have different expectations as to what a prompt should look like. E.g. outlines supports the Jinja 2 templating format for insertion of values and few-shot examples, whereas DSPy integrates these things at a different point in the workflow and hence expects the prompt not to include them. Mind model-specific expectations when creating a prompt template.

Returns:

Type	Description
`str`	Prompt template as string. None if not used by model wrapper.

`__init__(task_id, prompt_instructions, questions, model_settings, prompt_signature, model_type, fewshot_examples=())`

Initialize question answering bridge.

Parameters:

Name	Type	Description	Default
`task_id`	`str`	Task ID.	required
`prompt_instructions`	`str \| None`	Custom prompt instructions. If None, default instructions are used.	required
`questions`	`list[str]`	Questions to answer.	required
`model_settings`	`ModelSettings`	Settings for structured generation.	required
`prompt_signature`	`type[BaseModel]`	Unified Pydantic prompt signature.	required
`model_type`	`ModelType`	Model type.	required
`fewshot_examples`	`Sequence[BaseModel]`	Few-shot examples.	`()`

Source code in sieves/tasks/predictive/question_answering/bridges.py

def __init__(
    self,
    task_id: str,
    prompt_instructions: str | None,
    questions: list[str],
    model_settings: ModelSettings,
    prompt_signature: type[pydantic.BaseModel],
    model_type: ModelType,
    fewshot_examples: Sequence[pydantic.BaseModel] = (),
):
    """Set up the state shared by question answering bridges.

    Forwards the generic settings to the parent initializer (with `overwrite` fixed to False), stores the
    questions and builds the `QAConsolidation` strategy from them and this bridge's chunk extractor.

    :param task_id: Task ID.
    :param prompt_instructions: Custom prompt instructions. Default instructions are used if None.
    :param questions: Questions to answer.
    :param model_settings: Settings for structured generation.
    :param prompt_signature: Unified Pydantic prompt signature.
    :param model_type: Model type.
    :param fewshot_examples: Few-shot examples.
    """
    super().__init__(
        task_id=task_id,
        prompt_instructions=prompt_instructions,
        overwrite=False,
        model_settings=model_settings,
        prompt_signature=prompt_signature,
        model_type=model_type,
        fewshot_examples=fewshot_examples,
    )

    self._questions = questions
    # Resolve the extractor property once, after the parent initializer has run.
    chunk_extractor = self._chunk_extractor
    self._consolidation_strategy = QAConsolidation(questions=self._questions, extractor=chunk_extractor)

`consolidate(results, docs_offsets)` `abstractmethod`

Consolidate results for document chunks into document results.

Parameters:

Name	Type	Description	Default
`results`	`Sequence[TaskResult]`	Results per document chunk.	required
`docs_offsets`	`list[tuple[int, int]]`	Chunk offsets per document. Chunks per document can be obtained with `results[docs_offsets[i][0]:docs_offsets[i][1]]`.	required

Returns:

Type	Description
`Sequence[TaskResult]`	Results per document as a sequence.

Source code in sieves/tasks/predictive/bridges.py

@abc.abstractmethod
def consolidate(self, results: Sequence[TaskResult], docs_offsets: list[tuple[int, int]]) -> Sequence[TaskResult]:
    """Consolidate results for document chunks into document results.

    Abstract: this declaration has no body, so concrete bridges must provide the implementation.

    :param results: Results per document chunk.
    :param docs_offsets: Chunk offsets per document. Chunks per document can be obtained with
        `results[docs_offsets[i][0]:docs_offsets[i][1]]`.
    :return: Results per document as a sequence.
    """

`extract(docs)`

Extract all values from doc instances that are to be injected into the prompts.

Parameters:

Name	Type	Description	Default
`docs`	`Sequence[Doc]`	Docs to extract values from.	required

Returns:

Type	Description
`Sequence[dict[str, Any]]`	All values from doc instances that are to be injected into the prompts as a sequence.

Source code in sieves/tasks/predictive/bridges.py

def extract(self, docs: Sequence[Doc]) -> Sequence[dict[str, Any]]:
    """Collect the values to be injected into the prompts, one dict per doc.

    :param docs: Docs to extract values from.
    :return: One dict per doc, in input order, holding the doc's text under "text". A falsy text (e.g. empty
        or None) is passed on as None.
    """
    return [{"text": document.text or None} for document in docs]

`integrate(results, docs)` `abstractmethod`

Integrate results into Doc instances.

Parameters:

Name	Type	Description	Default
`results`	`Sequence[TaskResult]`	Results from prompt executable.	required
`docs`	`list[Doc]`	Doc instances to update.	required

Returns:

Type	Description
`list[Doc]`	Updated doc instances as a list.

Source code in sieves/tasks/predictive/bridges.py

@abc.abstractmethod
def integrate(self, results: Sequence[TaskResult], docs: list[Doc]) -> list[Doc]:
    """Integrate results into Doc instances.

    Abstract: this declaration has no body, so concrete bridges must provide the implementation.
    NOTE(review): presumably `results[i]` belongs to `docs[i]` — confirm against the concrete bridges.

    :param results: Results from prompt executable.
    :param docs: Doc instances to update.
    :return: Updated doc instances as a list.
    """

Schemas for question answering task.

`FewshotExample`

Bases: FewshotExample

Few-shot example with questions and answers for a context.

Attributes: text: Input text. questions: Questions asked. answers: Expected answers. scores: Confidence scores for answers.

Source code in sieves/tasks/predictive/schemas/question_answering.py

class FewshotExample(BaseFewshotExample):
    """Few-shot example with questions and answers for a context.

    Attributes:
        text: Input text.
        questions: Questions asked.
        answers: Expected answers.
        scores: Confidence scores for answers.
    """

    text: str
    questions: list[str]
    answers: list[str]
    scores: list[float] | None = None

    @property
    def input_fields(self) -> tuple[str, ...]:
        """Name the fields that serve as inputs of the example.

        :return: Input field names.
        """
        return "text", "questions"

    @property
    def target_fields(self) -> tuple[str, ...]:
        """Name the fields that serve as expected outputs of the example.

        :return: Target field names.
        """
        return "answers", "scores"

    def to_dspy(self) -> dspy.Example:
        """Build a `dspy.Example` carrying one `QuestionAnswer` per question.

        Missing (or empty) `scores` are replaced by one None per answer. Pairing relies on `zip`, so it stops at
        the shortest of questions, answers and scores.

        :returns: Example as `dspy.Example`, with the input fields marked as inputs.
        """
        confidences = self.scores if self.scores else [None] * len(self.answers)
        pairs = [
            QuestionAnswer(question=question, answer=answer, score=confidence)
            for question, answer, confidence in zip(self.questions, self.answers, confidences)
        ]

        example = dspy.Example(text=self.text, questions=self.questions, qa_pairs=pairs)
        return example.with_inputs(*self.input_fields)

`input_fields` `property`

Return input fields.

Returns:

Type	Description
`tuple[str, ...]`	Input fields.

`target_fields` `property`

Return target fields.

Returns:

Type	Description
`tuple[str, ...]`	Target fields.

`from_dspy(example)` `classmethod`

Convert from dspy.Example.

Parameters:

Name	Type	Description	Default
`example`	`Example`	Example as `dspy.Example`.	required

Returns:

Type	Description
`Self`	Example as `FewshotExample`.

Source code in sieves/tasks/predictive/schemas/core.py

@classmethod
def from_dspy(cls, example: dspy.Example) -> Self:
    """Convert from `dspy.Example`.

    The example is unpacked into keyword arguments for the class constructor, so its keys are interpreted as
    constructor arguments of `cls`.

    :param example: Example as `dspy.Example`.
    :returns: Example as `FewshotExample`.
    """
    # NOTE(review): relies on `dspy.Example` supporting `**` unpacking (mapping protocol) — confirm.
    # NOTE(review): the question answering `to_dspy` emits a `qa_pairs` key rather than `answers`/`scores`;
    # verify whether round-tripping through this method is expected for that subclass.
    return cls(**example)

`to_dspy()`

Convert to dspy.Example with qa_pairs.

Returns:

Type	Description
`Example`	Example as `dspy.Example`.

Source code in sieves/tasks/predictive/schemas/question_answering.py

def to_dspy(self) -> dspy.Example:
    """Build a `dspy.Example` carrying one `QuestionAnswer` per question.

    Missing (or empty) `scores` are replaced by one None per answer. Pairing relies on `zip`, so it stops at
    the shortest of questions, answers and scores.

    :returns: Example as `dspy.Example`, with the input fields marked as inputs.
    """
    confidences = self.scores if self.scores else [None] * len(self.answers)
    pairs = [
        QuestionAnswer(question=question, answer=answer, score=confidence)
        for question, answer, confidence in zip(self.questions, self.answers, confidences)
    ]

    example = dspy.Example(text=self.text, questions=self.questions, qa_pairs=pairs)
    return example.with_inputs(*self.input_fields)

`QuestionAnswer`

Bases: BaseModel

A pair consisting of a question, its generated answer, and a confidence score.

Attributes: question: Question asked. answer: Answer to the question. score: Confidence score.

Source code in sieves/tasks/predictive/schemas/question_answering.py

class QuestionAnswer(pydantic.BaseModel):
    """A pair consisting of a question, its generated answer, and a confidence score.

    Attributes:
        question: Question asked.
        answer: Answer to the question.
        score: Confidence score.
    """

    # NOTE(review): pydantic typically surfaces the class docstring and the `description` strings below in the
    # generated JSON schema; if that schema is shown to the model, rewording them alters the prompt — confirm
    # before editing any of this text.

    # Required question text; the description asks for an exact match with one of the provided questions.
    question: str = pydantic.Field(
        description="The question being asked. This should exactly match one of the provided questions."
    )
    # Required answer text.
    answer: str = pydantic.Field(description="The generated answer to the question based on the document.")
    # Optional confidence, None by default. The 0-1 range is only stated in the description; no numeric
    # constraint is declared on the field here.
    score: float | None = pydantic.Field(
        default=None, description="Provide a confidence score for the generated answer, between 0 and 1."
    )

`Result`

Bases: BaseModel

Result of a question-answering task. Contains a list of question-answer pairs.

Attributes: qa_pairs: List of question-answer pairs.

Source code in sieves/tasks/predictive/schemas/question_answering.py

class Result(pydantic.BaseModel):
    """Result of a question-answering task. Contains a list of question-answer pairs.

    Attributes:
        qa_pairs: List of question-answer pairs.
    """

    # NOTE(review): pydantic typically surfaces the class docstring and the `description` below in the generated
    # JSON schema; if that schema is shown to the model, rewording them alters the prompt — confirm before editing.

    # Required list of `QuestionAnswer` items; no default is declared for it.
    qa_pairs: list[QuestionAnswer] = pydantic.Field(
        description="A list of question-answer pairs corresponding to the provided questions."
    )

Question Answering

Usage

Results

Evaluation

Ground Truth Formats

QuestionAnswering

fewshot_example_type property

fewshot_examples property

id property

metric property

prompt_signature_description property

prompt_template property

__add__(other)

__call__(docs)

__init__(questions, model, task_id=None, include_meta=True, batch_size=-1, prompt_instructions=None, fewshot_examples=(), model_settings=ModelSettings(), condition=None)

deserialize(config, **kwargs) classmethod

evaluate(docs, judge=None, failure_threshold=0.5)

optimize(optimizer, verbose=True)

serialize()

DSPyQuestionAnswering

model_settings property

model_type property

prompt_signature property

prompt_template property

__init__(task_id, prompt_instructions, questions, model_settings, prompt_signature, model_type, fewshot_examples=())

extract(docs)

PydanticQA

model_settings property

prompt_signature property

prompt_template property

__init__(task_id, prompt_instructions, questions, model_settings, prompt_signature, model_type, fewshot_examples=())

extract(docs)

QuestionAnsweringBridge

inference_mode abstractmethod property

model_settings property

model_type property

prompt_signature property

prompt_template property

__init__(task_id, prompt_instructions, questions, model_settings, prompt_signature, model_type, fewshot_examples=())

consolidate(results, docs_offsets) abstractmethod

extract(docs)

integrate(results, docs) abstractmethod

FewshotExample

input_fields property

target_fields property

from_dspy(example) classmethod

to_dspy()

QuestionAnswer

Result

`QuestionAnswering`

`fewshot_example_type` `property`

`fewshot_examples` `property`

`id` `property`

`metric` `property`

`prompt_signature_description` `property`

`prompt_template` `property`

`__add__(other)`

`__call__(docs)`

`__init__(questions, model, task_id=None, include_meta=True, batch_size=-1, prompt_instructions=None, fewshot_examples=(), model_settings=ModelSettings(), condition=None)`

`deserialize(config, **kwargs)` `classmethod`

`evaluate(docs, judge=None, failure_threshold=0.5)`

`optimize(optimizer, verbose=True)`

`serialize()`

`DSPyQuestionAnswering`

`model_settings` `property`

`model_type` `property`

`prompt_signature` `property`

`prompt_template` `property`

`__init__(task_id, prompt_instructions, questions, model_settings, prompt_signature, model_type, fewshot_examples=())`

`extract(docs)`

`PydanticQA`

`model_settings` `property`

`prompt_signature` `property`

`prompt_template` `property`

`__init__(task_id, prompt_instructions, questions, model_settings, prompt_signature, model_type, fewshot_examples=())`

`extract(docs)`

`QuestionAnsweringBridge`

`inference_mode` `abstractmethod` `property`

`model_settings` `property`

`model_type` `property`

`prompt_signature` `property`

`prompt_template` `property`

`__init__(task_id, prompt_instructions, questions, model_settings, prompt_signature, model_type, fewshot_examples=())`

`consolidate(results, docs_offsets)` `abstractmethod`

`extract(docs)`

`integrate(results, docs)` `abstractmethod`

`FewshotExample`

`input_fields` `property`

`target_fields` `property`

`from_dspy(example)` `classmethod`

`to_dspy()`

`QuestionAnswer`

`Result`