PII Masking

Allows masking of PII (Personally Identifiable Information) in text documents.
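
A minimal usage sketch. The Doc import path and constructor are assumptions based on this page, and engine stands for an already constructed sieves engine instance for one of the supported backends (dspy, instructor, langchain, ollama or outlines):

from sieves import Doc  # assumption: Doc may live elsewhere in the package
from sieves.tasks.predictive.pii_masking.core import PIIMasking

# `engine` is assumed to be an existing sieves Engine instance; its construction
# is backend-specific and not shown here.
task = PIIMasking(engine=engine, task_id="pii_masking")

docs = [Doc(text="Contact Jane Doe at jane@example.com or +1 555 0100.")]
docs = list(task(docs))

result = docs[0].results["pii_masking"]
print(result["masked_text"])   # text with each PII span replaced by "[MASKED]"
print(result["pii_entities"])  # list of detected PII entities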

FewshotExample

Bases: BaseModel

Example for PII masking few-shot prompting.

Source code in sieves/tasks/predictive/pii_masking/core.py
class FewshotExample(pydantic.BaseModel):
    """Example for PII masking few-shot prompting."""

    text: str
    reasoning: str
    masked_text: str
    pii_entities: list[PIIEntity]

PIIEntity

Bases: BaseModel

PII entity.

Source code in sieves/tasks/predictive/pii_masking/core.py
class PIIEntity(pydantic.BaseModel, frozen=True):
    """PII entity."""

    entity_type: str
    text: str
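
For reference, a few-shot example can be built directly from the two models above and later passed to the task via its fewshot_examples parameter (the import path mirrors the source file referenced above):

from sieves.tasks.predictive.pii_masking.core import FewshotExample, PIIEntity

example = FewshotExample(
    text="Contact Jane Doe at jane@example.com.",
    reasoning="The text contains a person name and an email address.",
    masked_text="Contact [MASKED] at [MASKED].",
    pii_entities=[
        PIIEntity(entity_type="NAME", text="Jane Doe"),
        PIIEntity(entity_type="EMAIL", text="jane@example.com"),
    ],
)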

PIIMasking

Bases: PredictiveTask[_TaskPromptSignature, _TaskResult, _TaskBridge]

Task for masking PII (Personally Identifiable Information) in text documents.

Source code in sieves/tasks/predictive/pii_masking/core.py
class PIIMasking(PredictiveTask[_TaskPromptSignature, _TaskResult, _TaskBridge]):
    """Task for masking PII (Personally Identifiable Information) in text documents."""

    def __init__(
        self,
        engine: Engine[EnginePromptSignature, EngineResult, EngineModel, EngineInferenceMode],
        pii_types: list[str] | None = None,
        mask_placeholder: str = "[MASKED]",
        task_id: str | None = None,
        show_progress: bool = True,
        include_meta: bool = True,
        overwrite: bool = False,
        prompt_template: str | None = None,
        prompt_signature_desc: str | None = None,
        fewshot_examples: Iterable[FewshotExample] = (),
    ) -> None:
        """
        Initialize PIIMasking task.

        :param engine: Engine to use for PII detection and masking.
        :param pii_types: Types of PII to mask. If None, all common PII types will be masked.
                         E.g., ["NAME", "EMAIL", "PHONE", "ADDRESS", "SSN", "CREDIT_CARD", "DATE_OF_BIRTH"]
        :param mask_placeholder: String to replace PII with.
        :param task_id: Task ID.
        :param show_progress: Whether to show progress bar for processed documents.
        :param include_meta: Whether to include meta information generated by the task.
        :param overwrite: Whether to overwrite original document text with masked text.
        :param prompt_template: Custom prompt template. If None, task's default template is used.
        :param prompt_signature_desc: Custom prompt signature description. If None, default will be used.
        :param fewshot_examples: Few-shot examples.
        """
        self._pii_types = pii_types
        self._mask_placeholder = mask_placeholder

        super().__init__(
            engine=engine,
            task_id=task_id,
            show_progress=show_progress,
            include_meta=include_meta,
            overwrite=overwrite,
            prompt_template=prompt_template,
            prompt_signature_desc=prompt_signature_desc,
            fewshot_examples=fewshot_examples,
        )

    def _init_bridge(self, engine_type: EngineType) -> _TaskBridge:
        """Initialize bridge.
        :param engine_type: Type of engine to initialize bridge for.
        :return PIIBridge: Engine task bridge.
        :raises KeyError: If engine type is not supported.
        """
        bridge_types: dict[EngineType, type[_TaskBridge]] = {
            EngineType.dspy: DSPyPIIMasking,
            EngineType.instructor: InstructorPIIMasking,
            EngineType.langchain: LangChainPIIMasking,
            EngineType.outlines: OutlinesPIIMasking,
            EngineType.ollama: OllamaPIIMasking,
        }

        try:
            return bridge_types[engine_type](
                task_id=self._task_id,
                prompt_template=self._custom_prompt_template,
                prompt_signature_desc=self._custom_prompt_signature_desc,
                mask_placeholder=self._mask_placeholder,
                pii_types=self._pii_types,
                overwrite=self._overwrite,
            )
        except KeyError as err:
            raise KeyError(f"Engine type {engine_type} is not supported by {self.__class__.__name__}.") from err

    @property
    def supports(self) -> set[EngineType]:
        """
        :return set[EngineType]: Supported engine types.
        """
        return {EngineType.dspy, EngineType.instructor, EngineType.langchain, EngineType.ollama, EngineType.outlines}

    @property
    def _state(self) -> dict[str, Any]:
        """
        :return dict[str, Any]: Task state.
        """
        return {
            **super()._state,
            "pii_types": self._pii_types,
            "mask_placeholder": self._mask_placeholder,
        }

    def to_dataset(self, docs: Iterable[Doc]) -> Any:
        """Converts docs to Hugging Face dataset.
        :param docs: Documents to convert.
        :return datasets.Dataset: Converted dataset.
        """
        # Define metadata.
        features = datasets.Features({"text": datasets.Value("string"), "masked_text": datasets.Value("string")})
        info = datasets.DatasetInfo(
            description=f"PII masking dataset. Generated with sieves v{Config.get_version()}.",
            features=features,
        )

        # Fetch data used for generating dataset.
        try:
            data = [(doc.text, doc.results[self._task_id]["masked_text"]) for doc in docs]
        except KeyError as err:
            raise KeyError(f"Not all documents have results for this task with ID {self._task_id}") from err

        def generate_data() -> Iterable[dict[str, Any]]:
            """Yields results as dicts.
            :return: Results as dicts.
            """
            for text, masked_text in data:
                yield {"text": text, "masked_text": masked_text}

        # Create dataset.
        return datasets.Dataset.from_generator(generate_data, features=features, info=info)

_state property

Returns:

dict[str, Any]: Task state.

id property

Returns task ID. Used by pipeline for results and dependency management.

Returns:

str: Task ID.

prompt_signature_description property

Returns prompt signature description.

Returns:

str | None: Prompt signature description.

prompt_template property

Returns prompt template.

Returns:

str | None: Prompt template.

supports property

Returns:

set[EngineType]: Supported engine types.

__call__(docs)

Execute the task on a set of documents.

Note: the mypy ignore directives are needed because, in practice, TaskX can be a superset of the X types of multiple engines, and there is no way to model that in Python's current typing system. E.g., TaskInferenceMode could be outlines_.InferenceMode | dspy_.InferenceMode, depending on the class of the dynamically provided engine instance. However, TypeVars don't support unions, and neither do generics at a higher level of abstraction. We hence ignore these mypy errors, as the involved types should nonetheless be consistent.

Parameters:

docs (Iterable[Doc], required): Documents to process.

Returns:

Iterable[Doc]: Processed documents.

Source code in sieves/tasks/predictive/core.py
def __call__(self, docs: Iterable[Doc]) -> Iterable[Doc]:
    """Execute the task on a set of documents.

    Note: the mypy ignore directives are because in practice, TaskX can be a superset of the X types of multiple
    engines, but there is no way in Python's current typing system to model that. E.g.: TaskInferenceMode could be
    outlines_.InferenceMode | dspy_.InferenceMode, depending on the class of the dynamically provided engine
    instance. TypeVars don't support unions however, neither do generics on a higher level of abstraction.
    We hence ignore these mypy errors, as the involved types should nonetheless be consistent.

    :param docs: Documents to process.
    :return Iterable[Doc]: Processed documents.
    """
    docs = list(docs)

    # 1. Compile expected prompt signatures.
    signature = self._bridge.prompt_signature

    # 2. Build executable.
    assert isinstance(self._bridge.inference_mode, enum.Enum)
    executable = self._engine.build_executable(
        inference_mode=self._bridge.inference_mode,
        prompt_template=self.prompt_template,
        prompt_signature=signature,
        fewshot_examples=self._fewshot_examples,
    )

    # 3. Extract values from docs to inject/render those into prompt templates.
    docs_values = self._bridge.extract(docs)

    # 4. Map extracted docs values onto chunks.
    docs_chunks_offsets: list[tuple[int, int]] = []
    docs_chunks_values: list[dict[str, Any]] = []
    for doc, doc_values in zip(docs, docs_values):
        assert doc.text
        doc_chunks_values = [doc_values | {"text": chunk} for chunk in (doc.chunks or [doc.text])]
        docs_chunks_offsets.append((len(docs_chunks_values), len(docs_chunks_values) + len(doc_chunks_values)))
        docs_chunks_values.extend(doc_chunks_values)

    # 5. Execute prompts per chunk.
    results = list(executable(docs_chunks_values))
    assert len(results) == len(docs_chunks_values)

    # 6. Consolidate chunk results.
    results = list(self._bridge.consolidate(results, docs_chunks_offsets))
    assert len(results) == len(docs)

    # 7. Integrate results into docs.
    docs = self._bridge.integrate(results, docs)

    return docs
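
To illustrate step 4 above, a self-contained toy sketch of how per-document chunks are flattened into a single prompt list while (start, end) offsets are recorded, so that consolidate() can later regroup the flat results per document:

# Doc A has two chunks, doc B has one.
docs_chunks = [["chunk A1", "chunk A2"], ["chunk B1"]]

docs_chunks_offsets: list[tuple[int, int]] = []
docs_chunks_values: list[dict[str, str]] = []
for chunks in docs_chunks:
    chunk_values = [{"text": chunk} for chunk in chunks]
    docs_chunks_offsets.append((len(docs_chunks_values), len(docs_chunks_values) + len(chunk_values)))
    docs_chunks_values.extend(chunk_values)

print(docs_chunks_offsets)  # [(0, 2), (2, 3)]
# consolidate() regroups the flat chunk results with results[start:end] per document.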

__init__(engine, pii_types=None, mask_placeholder='[MASKED]', task_id=None, show_progress=True, include_meta=True, overwrite=False, prompt_template=None, prompt_signature_desc=None, fewshot_examples=())

Initialize PIIMasking task.

Parameters:

engine (Engine[EnginePromptSignature, EngineResult, EngineModel, EngineInferenceMode], required): Engine to use for PII detection and masking.
pii_types (list[str] | None, default None): Types of PII to mask. If None, all common PII types will be masked. E.g., ["NAME", "EMAIL", "PHONE", "ADDRESS", "SSN", "CREDIT_CARD", "DATE_OF_BIRTH"].
mask_placeholder (str, default '[MASKED]'): String to replace PII with.
task_id (str | None, default None): Task ID.
show_progress (bool, default True): Whether to show progress bar for processed documents.
include_meta (bool, default True): Whether to include meta information generated by the task.
overwrite (bool, default False): Whether to overwrite original document text with masked text.
prompt_template (str | None, default None): Custom prompt template. If None, task's default template is used.
prompt_signature_desc (str | None, default None): Custom prompt signature description. If None, default will be used.
fewshot_examples (Iterable[FewshotExample], default ()): Few-shot examples.
Source code in sieves/tasks/predictive/pii_masking/core.py
def __init__(
    self,
    engine: Engine[EnginePromptSignature, EngineResult, EngineModel, EngineInferenceMode],
    pii_types: list[str] | None = None,
    mask_placeholder: str = "[MASKED]",
    task_id: str | None = None,
    show_progress: bool = True,
    include_meta: bool = True,
    overwrite: bool = False,
    prompt_template: str | None = None,
    prompt_signature_desc: str | None = None,
    fewshot_examples: Iterable[FewshotExample] = (),
) -> None:
    """
    Initialize PIIMasking task.

    :param engine: Engine to use for PII detection and masking.
    :param pii_types: Types of PII to mask. If None, all common PII types will be masked.
                     E.g., ["NAME", "EMAIL", "PHONE", "ADDRESS", "SSN", "CREDIT_CARD", "DATE_OF_BIRTH"]
    :param mask_placeholder: String to replace PII with.
    :param task_id: Task ID.
    :param show_progress: Whether to show progress bar for processed documents.
    :param include_meta: Whether to include meta information generated by the task.
    :param overwrite: Whether to overwrite original document text with masked text.
    :param prompt_template: Custom prompt template. If None, task's default template is used.
    :param prompt_signature_desc: Custom prompt signature description. If None, default will be used.
    :param fewshot_examples: Few-shot examples.
    """
    self._pii_types = pii_types
    self._mask_placeholder = mask_placeholder

    super().__init__(
        engine=engine,
        task_id=task_id,
        show_progress=show_progress,
        include_meta=include_meta,
        overwrite=overwrite,
        prompt_template=prompt_template,
        prompt_signature_desc=prompt_signature_desc,
        fewshot_examples=fewshot_examples,
    )
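
For instance, a more tightly configured task might restrict masking to specific PII types, change the placeholder and overwrite the document text in place. The values below are illustrative; engine and example are assumed to exist as in the earlier sketches on this page:

task = PIIMasking(
    engine=engine,                  # an existing sieves engine instance (assumption)
    pii_types=["NAME", "EMAIL"],    # restrict masking to these entity types
    mask_placeholder="[REDACTED]",  # replaces the default "[MASKED]"
    overwrite=True,                 # doc.text is replaced with the masked text
    fewshot_examples=[example],     # e.g. the FewshotExample built further up this page
)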

_init_bridge(engine_type)

Initialize bridge.

Parameters:

engine_type (EngineType, required): Type of engine to initialize bridge for.

Returns:

_TaskBridge: Engine task bridge.

Raises:

KeyError: If engine type is not supported.

Source code in sieves/tasks/predictive/pii_masking/core.py
def _init_bridge(self, engine_type: EngineType) -> _TaskBridge:
    """Initialize bridge.
    :param engine_type: Type of engine to initialize bridge for.
    :return PIIBridge: Engine task bridge.
    :raises KeyError: If engine type is not supported.
    """
    bridge_types: dict[EngineType, type[_TaskBridge]] = {
        EngineType.dspy: DSPyPIIMasking,
        EngineType.instructor: InstructorPIIMasking,
        EngineType.langchain: LangChainPIIMasking,
        EngineType.outlines: OutlinesPIIMasking,
        EngineType.ollama: OllamaPIIMasking,
    }

    try:
        return bridge_types[engine_type](
            task_id=self._task_id,
            prompt_template=self._custom_prompt_template,
            prompt_signature_desc=self._custom_prompt_signature_desc,
            mask_placeholder=self._mask_placeholder,
            pii_types=self._pii_types,
            overwrite=self._overwrite,
        )
    except KeyError as err:
        raise KeyError(f"Engine type {engine_type} is not supported by {self.__class__.__name__}.") from err

_validate_fewshot_examples()

Validates fewshot examples.

Source code in sieves/tasks/predictive/core.py
def _validate_fewshot_examples(self) -> None:
    """Validates fewshot examples.
    :raises: ValueError if fewshot examples don't pass validation.
    """
    pass

deserialize(config, **kwargs) classmethod

Generate PredictiveTask instance from config.

Parameters:

config (Config, required): Config to generate instance from.
kwargs (dict[str, Any], default {}): Values to inject into loaded config.

Returns:

PredictiveTask[TaskPromptSignature, TaskResult, TaskBridge]: Deserialized PredictiveTask instance.

Source code in sieves/tasks/predictive/core.py
@classmethod
def deserialize(
    cls, config: Config, **kwargs: dict[str, Any]
) -> PredictiveTask[TaskPromptSignature, TaskResult, TaskBridge]:
    """Generate PredictiveTask instance from config.
    :param config: Config to generate instance from.
    :param kwargs: Values to inject into loaded config.
    :return PredictiveTask[_TaskPromptSignature, _TaskResult, _TaskBridge]: Deserialized PredictiveTask instance.
    """
    # Validate engine config.
    assert hasattr(config, "engine")
    assert isinstance(config.engine.value, Config)
    engine_config = config.engine.value
    engine_cls = engine_config.config_cls
    assert issubclass(engine_cls, Serializable)
    assert issubclass(engine_cls, Engine)

    # Deserialize and inject engine.
    engine_param: dict[str, Any] = {"engine": engine_cls.deserialize(engine_config, **kwargs["engine"])}
    return cls(**config.to_init_dict(cls, **(kwargs | engine_param)))

serialize()

Serializes task.

Returns:

Config: Config instance.

Source code in sieves/tasks/core.py
def serialize(self) -> Config:
    """Serializes task.
    :return: Config instance.
    """
    return Config.create(self.__class__, {k: Attribute(value=v) for k, v in self._state.items()})
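
A hedged round-trip sketch: serialize() produces a Config that can be passed back into deserialize(). Which values have to be re-injected via the engine keyword argument depends on the engine backend; the "model" key below is an assumption, not a documented contract:

config = task.serialize()

# Rebuild the task from its config. The dict passed as "engine" is forwarded to the
# engine's own deserialize() and is backend-specific (assumption: the engine needs
# its non-serializable model object re-injected here).
restored = PIIMasking.deserialize(config, engine={"model": model})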

to_dataset(docs)

Converts docs to Hugging Face dataset.

Parameters:

docs (Iterable[Doc], required): Documents to convert.

Returns:

Any: Converted dataset.

Source code in sieves/tasks/predictive/pii_masking/core.py
def to_dataset(self, docs: Iterable[Doc]) -> Any:
    """Converts docs to Hugging Face dataset.
    :param docs: Documents to convert.
    :return datasets.Dataset: Converted dataset.
    """
    # Define metadata.
    features = datasets.Features({"text": datasets.Value("string"), "masked_text": datasets.Value("string")})
    info = datasets.DatasetInfo(
        description=f"PII masking dataset. Generated with sieves v{Config.get_version()}.",
        features=features,
    )

    # Fetch data used for generating dataset.
    try:
        data = [(doc.text, doc.results[self._task_id]["masked_text"]) for doc in docs]
    except KeyError as err:
        raise KeyError(f"Not all documents have results for this task with ID {self._task_id}") from err

    def generate_data() -> Iterable[dict[str, Any]]:
        """Yields results as dicts.
        :return: Results as dicts.
        """
        for text, masked_text in data:
            yield {"text": text, "masked_text": masked_text}

    # Create dataset.
    return datasets.Dataset.from_generator(generate_data, features=features, info=info)
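
Continuing the usage sketch from the top of this page, processed docs can be exported to a Hugging Face dataset with the two string columns defined above:

dataset = task.to_dataset(docs)

print(dataset.features)           # "text" and "masked_text" string columns
print(dataset[0]["masked_text"])  # masked version of the first document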

Bridges for PII masking task.

DSPyPIIMasking

Bases: PIIBridge[PromptSignature, Result, InferenceMode]

DSPy bridge for PII masking.

Source code in sieves/tasks/predictive/pii_masking/bridges.py
class DSPyPIIMasking(PIIBridge[dspy_.PromptSignature, dspy_.Result, dspy_.InferenceMode]):
    """DSPy bridge for PII masking."""

    @property
    def _prompt_template(self) -> str | None:
        return None

    @property
    def _prompt_signature_description(self) -> str | None:
        default_pii_types_desc = "all types of personally identifiable information"
        pii_types_desc = ", ".join(self._pii_types) if self._pii_types else default_pii_types_desc
        return (
            f"Identify and mask {pii_types_desc} in the given text. Replace each PII instance with "
            f"'{self._mask_placeholder}'."
        )

    @cached_property
    def prompt_signature(self) -> type[dspy_.PromptSignature]:
        """Define prompt signature for DSPy."""
        PIIEntity = self._pii_entity_cls

        class PIIMasking(dspy.Signature):  # type: ignore[misc]
            text: str = dspy.InputField(description="Text to mask PII from.")
            reasoning: str = dspy.OutputField(description="Reasoning about what PII was found and masked.")
            masked_text: str = dspy.OutputField(description="Text with all PII masked.")
            pii_entities: list[PIIEntity] = dspy.OutputField(description="List of PII entities that were masked.")  # type: ignore[valid-type]

        PIIMasking.__doc__ = jinja2.Template(self.prompt_signature_description).render()
        return PIIMasking

    @property
    def inference_mode(self) -> dspy_.InferenceMode:
        """Return inference mode for DSPy engine."""
        return dspy_.InferenceMode.chain_of_thought

    def integrate(self, results: Iterable[dspy_.Result], docs: Iterable[Doc]) -> Iterable[Doc]:
        """Integrate results into docs."""
        for doc, result in zip(docs, results):
            # Store masked text and PII entities in results
            doc.results[self._task_id] = {
                "masked_text": result.masked_text,
                "pii_entities": result.pii_entities,
            }

            if self._overwrite:
                doc.text = result.masked_text

        return docs

    def consolidate(
        self, results: Iterable[dspy_.Result], docs_offsets: list[tuple[int, int]]
    ) -> Iterable[dspy_.Result]:
        """Consolidate results from multiple chunks."""
        results = list(results)
        PIIEntity = self._pii_entity_cls

        # Merge results for each document
        for doc_offset in docs_offsets:
            doc_results = results[doc_offset[0] : doc_offset[1]]
            seen_entities: set[PIIEntity] = set()  # type: ignore[valid-type]
            entities: list[PIIEntity] = []  # type: ignore[valid-type]
            masked_texts: list[str] = []
            reasonings: list[str] = []

            for res in doc_results:
                reasonings.append(res.reasoning)
                masked_texts.append(res.masked_text)
                for entity in res.pii_entities:
                    if entity not in seen_entities:
                        entities.append(entity)
                        seen_entities.add(entity)

            yield dspy.Prediction.from_completions(
                {"masked_text": [" ".join(masked_texts)], "pii_entities": [entities], "reasoning": [str(reasonings)]},
                signature=self.prompt_signature,
            )

inference_mode property

Return inference mode for DSPy engine.

prompt_signature cached property

Define prompt signature for DSPy.

prompt_signature_description property

Returns prompt signature description. This is used by some engines to aid the language model in generating structured output.

Returns:

str | None: Prompt signature description. None if not used by engine.

prompt_template property

Returns prompt template. Note: different engines have different expectations as to what a prompt should look like. E.g. outlines supports the Jinja2 templating format for inserting values and few-shot examples, whereas DSPy handles these at a different point in its workflow and hence expects the prompt not to include them. Mind engine-specific expectations when creating a prompt template.

Returns:

str | None: Prompt template as string. None if not used by engine.

__init__(task_id, prompt_template, prompt_signature_desc, overwrite, mask_placeholder, pii_types)

Initialize PIIBridge.

Parameters:

task_id (str, required): Task ID.
prompt_template (str | None, required): Custom prompt template.
prompt_signature_desc (str | None, required): Custom prompt signature description.
overwrite (bool, required): Whether to overwrite text with masked text.
mask_placeholder (str, required): String to replace PII with.
pii_types (list[str] | None, required): Types of PII to mask. If None, all common PII types will be masked.
Source code in sieves/tasks/predictive/pii_masking/bridges.py
def __init__(
    self,
    task_id: str,
    prompt_template: str | None,
    prompt_signature_desc: str | None,
    overwrite: bool,
    mask_placeholder: str,
    pii_types: list[str] | None,
):
    """
    Initialize PIIBridge.

    :param task_id: Task ID.
    :param prompt_template: Custom prompt template.
    :param prompt_signature_desc: Custom prompt signature description.
    :param overwrite: Whether to overwrite text with masked text.
    :param mask_placeholder: String to replace PII with.
    :param pii_types: Types of PII to mask. If None, all common PII types will be masked.
    """
    super().__init__(
        task_id=task_id,
        prompt_template=prompt_template,
        prompt_signature_desc=prompt_signature_desc,
        overwrite=overwrite,
    )
    self._mask_placeholder = mask_placeholder
    self._pii_types = pii_types
    self._pii_entity_cls = self._create_pii_entity_cls()

_create_pii_entity_cls()

Creates PII entity class.

Source code in sieves/tasks/predictive/pii_masking/bridges.py
def _create_pii_entity_cls(self) -> type[pydantic.BaseModel]:
    """Creates PII entity class."""
    pii_types = self._pii_types
    PIIType = Literal[*pii_types] if pii_types else str

    class PIIEntity(pydantic.BaseModel, frozen=True):
        """PII entity."""

        entity_type: PIIType  # type: ignore[valid-type]
        text: str

    return PIIEntity
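
The effect of the Literal-based entity type can be shown with a static stand-in for the dynamically created class: with pii_types given, pydantic rejects any entity_type outside that set, while pii_types=None leaves entity_type as a plain str.

from typing import Literal

import pydantic


class PIIEntity(pydantic.BaseModel, frozen=True):
    """Static stand-in for the dynamically built class with pii_types=["NAME", "EMAIL"]."""

    entity_type: Literal["NAME", "EMAIL"]
    text: str


PIIEntity(entity_type="NAME", text="Jane Doe")  # validates

try:
    PIIEntity(entity_type="PHONE", text="+1 555 0100")  # rejected: not an allowed literal
except pydantic.ValidationError as err:
    print(err)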

consolidate(results, docs_offsets)

Consolidate results from multiple chunks.

Source code in sieves/tasks/predictive/pii_masking/bridges.py
def consolidate(
    self, results: Iterable[dspy_.Result], docs_offsets: list[tuple[int, int]]
) -> Iterable[dspy_.Result]:
    """Consolidate results from multiple chunks."""
    results = list(results)
    PIIEntity = self._pii_entity_cls

    # Merge results for each document
    for doc_offset in docs_offsets:
        doc_results = results[doc_offset[0] : doc_offset[1]]
        seen_entities: set[PIIEntity] = set()  # type: ignore[valid-type]
        entities: list[PIIEntity] = []  # type: ignore[valid-type]
        masked_texts: list[str] = []
        reasonings: list[str] = []

        for res in doc_results:
            reasonings.append(res.reasoning)
            masked_texts.append(res.masked_text)
            for entity in res.pii_entities:
                if entity not in seen_entities:
                    entities.append(entity)
                    seen_entities.add(entity)

        yield dspy.Prediction.from_completions(
            {"masked_text": [" ".join(masked_texts)], "pii_entities": [entities], "reasoning": [str(reasonings)]},
            signature=self.prompt_signature,
        )

extract(docs)

Extract all values from doc instances that are to be injected into the prompts.

Parameters:

docs (Iterable[Doc], required): Docs to extract values from.

Returns:

Iterable[dict[str, Any]]: All values from doc instances that are to be injected into the prompts.

Source code in sieves/tasks/predictive/bridges.py
def extract(self, docs: Iterable[Doc]) -> Iterable[dict[str, Any]]:
    """Extract all values from doc instances that are to be injected into the prompts.
    :param docs: Docs to extract values from.
    :return Iterable[dict[str, Any]]: All values from doc instances that are to be injected into the prompts
    """
    return ({"text": doc.text if doc.text else None} for doc in docs)

integrate(results, docs)

Integrate results into docs.

Source code in sieves/tasks/predictive/pii_masking/bridges.py
def integrate(self, results: Iterable[dspy_.Result], docs: Iterable[Doc]) -> Iterable[Doc]:
    """Integrate results into docs."""
    for doc, result in zip(docs, results):
        # Store masked text and PII entities in results
        doc.results[self._task_id] = {
            "masked_text": result.masked_text,
            "pii_entities": result.pii_entities,
        }

        if self._overwrite:
            doc.text = result.masked_text

    return docs

PIIBridge

Bases: Bridge[_BridgePromptSignature, _BridgeResult, EngineInferenceMode], ABC

Abstract base class for PII masking bridges.

Source code in sieves/tasks/predictive/pii_masking/bridges.py
class PIIBridge(Bridge[_BridgePromptSignature, _BridgeResult, EngineInferenceMode], abc.ABC):
    """Abstract base class for PII masking bridges."""

    def __init__(
        self,
        task_id: str,
        prompt_template: str | None,
        prompt_signature_desc: str | None,
        overwrite: bool,
        mask_placeholder: str,
        pii_types: list[str] | None,
    ):
        """
        Initialize PIIBridge.

        :param task_id: Task ID.
        :param prompt_template: Custom prompt template.
        :param prompt_signature_desc: Custom prompt signature description.
        :param overwrite: Whether to overwrite text with masked text.
        :param mask_placeholder: String to replace PII with.
        :param pii_types: Types of PII to mask. If None, all common PII types will be masked.
        """
        super().__init__(
            task_id=task_id,
            prompt_template=prompt_template,
            prompt_signature_desc=prompt_signature_desc,
            overwrite=overwrite,
        )
        self._mask_placeholder = mask_placeholder
        self._pii_types = pii_types
        self._pii_entity_cls = self._create_pii_entity_cls()

    def _create_pii_entity_cls(self) -> type[pydantic.BaseModel]:
        """Creates PII entity class."""
        pii_types = self._pii_types
        PIIType = Literal[*pii_types] if pii_types else str

        class PIIEntity(pydantic.BaseModel, frozen=True):
            """PII entity."""

            entity_type: PIIType  # type: ignore[valid-type]
            text: str

        return PIIEntity

_prompt_signature_description abstractmethod property

Returns default prompt signature description.

Returns:

str | None: Default prompt signature description.

_prompt_template abstractmethod property

Returns default prompt template.

Returns:

str | None: Default prompt template.

inference_mode abstractmethod property

Returns inference mode.

Returns:

EngineInferenceMode: Inference mode.

prompt_signature abstractmethod property

Creates output signature (e.g.: Signature in DSPy, Pydantic objects in outlines, JSON schema in jsonformers). This is engine-specific.

Returns:

type[TaskPromptSignature] | TaskPromptSignature: Output signature object. This can be an instance (e.g. a regex string) or a class (e.g. a Pydantic class).

prompt_signature_description property

Returns prompt signature description. This is used by some engines to aid the language model in generating structured output.

Returns:

str | None: Prompt signature description. None if not used by engine.

prompt_template property

Returns prompt template. Note: different engines have different expectations as to what a prompt should look like. E.g. outlines supports the Jinja2 templating format for inserting values and few-shot examples, whereas DSPy handles these at a different point in its workflow and hence expects the prompt not to include them. Mind engine-specific expectations when creating a prompt template.

Returns:

str | None: Prompt template as string. None if not used by engine.

__init__(task_id, prompt_template, prompt_signature_desc, overwrite, mask_placeholder, pii_types)

Initialize PIIBridge.

Parameters:

task_id (str, required): Task ID.
prompt_template (str | None, required): Custom prompt template.
prompt_signature_desc (str | None, required): Custom prompt signature description.
overwrite (bool, required): Whether to overwrite text with masked text.
mask_placeholder (str, required): String to replace PII with.
pii_types (list[str] | None, required): Types of PII to mask. If None, all common PII types will be masked.
Source code in sieves/tasks/predictive/pii_masking/bridges.py
def __init__(
    self,
    task_id: str,
    prompt_template: str | None,
    prompt_signature_desc: str | None,
    overwrite: bool,
    mask_placeholder: str,
    pii_types: list[str] | None,
):
    """
    Initialize PIIBridge.

    :param task_id: Task ID.
    :param prompt_template: Custom prompt template.
    :param prompt_signature_desc: Custom prompt signature description.
    :param overwrite: Whether to overwrite text with masked text.
    :param mask_placeholder: String to replace PII with.
    :param pii_types: Types of PII to mask. If None, all common PII types will be masked.
    """
    super().__init__(
        task_id=task_id,
        prompt_template=prompt_template,
        prompt_signature_desc=prompt_signature_desc,
        overwrite=overwrite,
    )
    self._mask_placeholder = mask_placeholder
    self._pii_types = pii_types
    self._pii_entity_cls = self._create_pii_entity_cls()

_create_pii_entity_cls()

Creates PII entity class.

Source code in sieves/tasks/predictive/pii_masking/bridges.py
def _create_pii_entity_cls(self) -> type[pydantic.BaseModel]:
    """Creates PII entity class."""
    pii_types = self._pii_types
    PIIType = Literal[*pii_types] if pii_types else str

    class PIIEntity(pydantic.BaseModel, frozen=True):
        """PII entity."""

        entity_type: PIIType  # type: ignore[valid-type]
        text: str

    return PIIEntity

consolidate(results, docs_offsets) abstractmethod

Consolidates results for document chunks into document results.

Parameters:

results (Iterable[TaskResult], required): Results per document chunk.
docs_offsets (list[tuple[int, int]], required): Chunk offsets per document. Chunks per document can be obtained with results[docs_chunk_offsets[i][0]:docs_chunk_offsets[i][1]].

Returns:

Iterable[TaskResult]: Results per document.

Source code in sieves/tasks/predictive/bridges.py
@abc.abstractmethod
def consolidate(self, results: Iterable[TaskResult], docs_offsets: list[tuple[int, int]]) -> Iterable[TaskResult]:
    """Consolidates results for document chunks into document results.
    :param results: Results per document chunk.
    :param docs_offsets: Chunk offsets per document. Chunks per document can be obtained with
        results[docs_chunk_offsets[i][0]:docs_chunk_offsets[i][1]].
    :return Iterable[_TaskResult]: Results per document.
    """

extract(docs)

Extract all values from doc instances that are to be injected into the prompts.

Parameters:

docs (Iterable[Doc], required): Docs to extract values from.

Returns:

Iterable[dict[str, Any]]: All values from doc instances that are to be injected into the prompts.

Source code in sieves/tasks/predictive/bridges.py
def extract(self, docs: Iterable[Doc]) -> Iterable[dict[str, Any]]:
    """Extract all values from doc instances that are to be injected into the prompts.
    :param docs: Docs to extract values from.
    :return Iterable[dict[str, Any]]: All values from doc instances that are to be injected into the prompts
    """
    return ({"text": doc.text if doc.text else None} for doc in docs)

integrate(results, docs) abstractmethod

Integrate results into Doc instances.

Parameters:

results (Iterable[TaskResult], required): Results from prompt executable.
docs (Iterable[Doc], required): Doc instances to update.

Returns:

Iterable[Doc]: Updated doc instances.

Source code in sieves/tasks/predictive/bridges.py
@abc.abstractmethod
def integrate(self, results: Iterable[TaskResult], docs: Iterable[Doc]) -> Iterable[Doc]:
    """Integrate results into Doc instances.
    :param results: Results from prompt executable.
    :param docs: Doc instances to update.
    :return Iterable[Doc]: Updated doc instances.
    """

PydanticBasedPIIMasking

Bases: PIIBridge[BaseModel, BaseModel, EngineInferenceMode], ABC

Base class for Pydantic-based PII masking bridges.

Source code in sieves/tasks/predictive/pii_masking/bridges.py
class PydanticBasedPIIMasking(PIIBridge[pydantic.BaseModel, pydantic.BaseModel, EngineInferenceMode], abc.ABC):
    """Base class for Pydantic-based PII masking bridges."""

    @property
    def _prompt_template(self) -> str | None:
        return """
        Identify and mask Personally Identifiable Information (PII) in the given text.
        {% if pii_types|length > 0 -%}
            Focus on these specific PII types: {{ pii_types|join(', ') }}.
        {% else -%}
            Mask all common types of PII such as names, addresses, phone numbers, emails, SSNs, credit 
            card numbers, etc.
        {% endif -%}
        Replace each instance of PII with "{{ mask_placeholder }}".

        {% if examples|length > 0 -%}
            Examples:
            ----------
            {%- for example in examples %}
                Text: "{{ example.text }}":
                Output:
                    Reasoning: "{{ example.reasoning }}"
                    Masked Text: "{{ example.masked_text }}"
                    PII Entities found: {{ example.pii_entities }}
            {% endfor -%}
            ----------
        {% endif -%}

        ========
        Text: {{ text }}
        Output: 
        """

    @property
    def _prompt_signature_description(self) -> str | None:
        return None

    @cached_property
    def prompt_signature(self) -> type[pydantic.BaseModel]:
        """Define prompt signature for Pydantic-based engines."""
        PIIEntity = self._pii_entity_cls

        class PIIMasking(pydantic.BaseModel, frozen=True):
            reasoning: str
            masked_text: str
            pii_entities: list[PIIEntity]  # type: ignore[valid-type]

        if self.prompt_signature_description:
            PIIMasking.__doc__ = jinja2.Template(self.prompt_signature_description).render()

        return PIIMasking

    def integrate(self, results: Iterable[pydantic.BaseModel], docs: Iterable[Doc]) -> Iterable[Doc]:
        """Integrate results into docs."""
        for doc, result in zip(docs, results):
            assert hasattr(result, "masked_text")
            assert hasattr(result, "pii_entities")
            # Store masked text and PII entities in results
            doc.results[self._task_id] = {"masked_text": result.masked_text, "pii_entities": result.pii_entities}

            if self._overwrite:
                doc.text = result.masked_text

        return docs

    def consolidate(
        self, results: Iterable[pydantic.BaseModel], docs_offsets: list[tuple[int, int]]
    ) -> Iterable[pydantic.BaseModel]:
        """Consolidate results from multiple chunks."""
        results = list(results)
        PIIEntity = self._pii_entity_cls

        # Merge results for each document
        for doc_offset in docs_offsets:
            doc_results = results[doc_offset[0] : doc_offset[1]]
            seen_entities: set[PIIEntity] = set()  # type: ignore[valid-type]
            entities: list[PIIEntity] = []  # type: ignore[valid-type]
            masked_texts: list[str] = []
            reasonings: list[str] = []

            for res in doc_results:
                if res is None:
                    continue  # type: ignore[unreachable]

                assert hasattr(res, "reasoning")
                assert hasattr(res, "masked_text")
                assert hasattr(res, "pii_entities")

                reasonings.append(res.reasoning)
                masked_texts.append(res.masked_text)
                for entity in res.pii_entities:
                    if entity not in seen_entities:
                        entities.append(entity)
                        seen_entities.add(entity)

            yield self.prompt_signature(
                reasoning=str(reasonings), masked_text=" ".join(masked_texts), pii_entities=entities
            )
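
The default template above is a Jinja2 template rendered with pii_types, mask_placeholder, examples and text. A simplified, runnable stand-in shows how those values are substituted (what exactly the engine passes at render time is an assumption kept minimal here):

import jinja2

# Simplified version of the default template above, for illustration only.
template = jinja2.Template(
    "Identify and mask PII in the given text.\n"
    '{% if pii_types|length > 0 %}Focus on these PII types: {{ pii_types|join(", ") }}.{% endif %}\n'
    'Replace each instance of PII with "{{ mask_placeholder }}".\n'
    "Text: {{ text }}"
)

print(
    template.render(
        pii_types=["NAME", "EMAIL"],
        mask_placeholder="[MASKED]",
        text="Contact Jane Doe at jane@example.com.",
    )
)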

inference_mode abstractmethod property

Returns inference mode.

Returns:

EngineInferenceMode: Inference mode.

prompt_signature cached property

Define prompt signature for Pydantic-based engines.

prompt_signature_description property

Returns prompt signature description. This is used by some engines to aid the language model in generating structured output.

Returns:

str | None: Prompt signature description. None if not used by engine.

prompt_template property

Returns prompt template. Note: different engines have different expectations as to what a prompt should look like. E.g. outlines supports the Jinja2 templating format for inserting values and few-shot examples, whereas DSPy handles these at a different point in its workflow and hence expects the prompt not to include them. Mind engine-specific expectations when creating a prompt template.

Returns:

str | None: Prompt template as string. None if not used by engine.

__init__(task_id, prompt_template, prompt_signature_desc, overwrite, mask_placeholder, pii_types)

Initialize PIIBridge.

Parameters:

task_id (str, required): Task ID.
prompt_template (str | None, required): Custom prompt template.
prompt_signature_desc (str | None, required): Custom prompt signature description.
overwrite (bool, required): Whether to overwrite text with masked text.
mask_placeholder (str, required): String to replace PII with.
pii_types (list[str] | None, required): Types of PII to mask. If None, all common PII types will be masked.
Source code in sieves/tasks/predictive/pii_masking/bridges.py
def __init__(
    self,
    task_id: str,
    prompt_template: str | None,
    prompt_signature_desc: str | None,
    overwrite: bool,
    mask_placeholder: str,
    pii_types: list[str] | None,
):
    """
    Initialize PIIBridge.

    :param task_id: Task ID.
    :param prompt_template: Custom prompt template.
    :param prompt_signature_desc: Custom prompt signature description.
    :param overwrite: Whether to overwrite text with masked text.
    :param mask_placeholder: String to replace PII with.
    :param pii_types: Types of PII to mask. If None, all common PII types will be masked.
    """
    super().__init__(
        task_id=task_id,
        prompt_template=prompt_template,
        prompt_signature_desc=prompt_signature_desc,
        overwrite=overwrite,
    )
    self._mask_placeholder = mask_placeholder
    self._pii_types = pii_types
    self._pii_entity_cls = self._create_pii_entity_cls()

_create_pii_entity_cls()

Creates PII entity class.

Source code in sieves/tasks/predictive/pii_masking/bridges.py
def _create_pii_entity_cls(self) -> type[pydantic.BaseModel]:
    """Creates PII entity class."""
    pii_types = self._pii_types
    PIIType = Literal[*pii_types] if pii_types else str

    class PIIEntity(pydantic.BaseModel, frozen=True):
        """PII entity."""

        entity_type: PIIType  # type: ignore[valid-type]
        text: str

    return PIIEntity

consolidate(results, docs_offsets)

Consolidate results from multiple chunks.

Source code in sieves/tasks/predictive/pii_masking/bridges.py
def consolidate(
    self, results: Iterable[pydantic.BaseModel], docs_offsets: list[tuple[int, int]]
) -> Iterable[pydantic.BaseModel]:
    """Consolidate results from multiple chunks."""
    results = list(results)
    PIIEntity = self._pii_entity_cls

    # Merge results for each document
    for doc_offset in docs_offsets:
        doc_results = results[doc_offset[0] : doc_offset[1]]
        seen_entities: set[PIIEntity] = set()  # type: ignore[valid-type]
        entities: list[PIIEntity] = []  # type: ignore[valid-type]
        masked_texts: list[str] = []
        reasonings: list[str] = []

        for res in doc_results:
            if res is None:
                continue  # type: ignore[unreachable]

            assert hasattr(res, "reasoning")
            assert hasattr(res, "masked_text")
            assert hasattr(res, "pii_entities")

            reasonings.append(res.reasoning)
            masked_texts.append(res.masked_text)
            for entity in res.pii_entities:
                if entity not in seen_entities:
                    entities.append(entity)
                    seen_entities.add(entity)

        yield self.prompt_signature(
            reasoning=str(reasonings), masked_text=" ".join(masked_texts), pii_entities=entities
        )

extract(docs)

Extract all values from doc instances that are to be injected into the prompts.

Parameters:

docs (Iterable[Doc], required): Docs to extract values from.

Returns:

Iterable[dict[str, Any]]: All values from doc instances that are to be injected into the prompts.

Source code in sieves/tasks/predictive/bridges.py
def extract(self, docs: Iterable[Doc]) -> Iterable[dict[str, Any]]:
    """Extract all values from doc instances that are to be injected into the prompts.
    :param docs: Docs to extract values from.
    :return Iterable[dict[str, Any]]: All values from doc instances that are to be injected into the prompts
    """
    return ({"text": doc.text if doc.text else None} for doc in docs)

integrate(results, docs)

Integrate results into docs.

Source code in sieves/tasks/predictive/pii_masking/bridges.py
def integrate(self, results: Iterable[pydantic.BaseModel], docs: Iterable[Doc]) -> Iterable[Doc]:
    """Integrate results into docs."""
    for doc, result in zip(docs, results):
        assert hasattr(result, "masked_text")
        assert hasattr(result, "pii_entities")
        # Store masked text and PII entities in results
        doc.results[self._task_id] = {"masked_text": result.masked_text, "pii_entities": result.pii_entities}

        if self._overwrite:
            doc.text = result.masked_text

    return docs