Skip to content

Docling

Wrapper around Docling for converting complex files into markdown, HTML, or JSON.

Docling

Bases: Task

Parser wrapping the docling library to convert files into documents.

Source code in sieves/tasks/preprocessing/ocr/docling_.py
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
class Docling(Task):
    """Parser wrapping the docling library to convert files into documents."""

    def __init__(
        self,
        converter: docling.document_converter.DocumentConverter | None = None,
        export_format: str = "markdown",
        task_id: str | None = None,
        show_progress: bool = True,
        include_meta: bool = False,
    ):
        """Initialize the docling parser.
        :param converter: Docling parser instance. A default DocumentConverter is created if None.
        :param export_format: Format of the text written to .text: "markdown", "html" or "json".
        :param task_id: Task ID.
        :param show_progress: Whether to show progress bar for processed documents
        :param include_meta: Whether to include meta information generated by the task.
        :raises ValueError: If export_format is not one of the supported formats.
        """
        super().__init__(task_id=task_id, show_progress=show_progress, include_meta=include_meta)

        # Fail early: __call__ only handles these formats and would otherwise silently leave .text untouched.
        supported_formats = ("markdown", "html", "json")
        if export_format not in supported_formats:
            raise ValueError(
                f"Unsupported export format '{export_format}'. Supported formats: {', '.join(supported_formats)}."
            )

        # Compare against None explicitly so that a user-supplied converter is never replaced based on truthiness.
        self._converter = converter if converter is not None else docling.document_converter.DocumentConverter()
        self._export_format = export_format

    def __call__(self, docs: Iterable[Doc]) -> Iterable[Doc]:
        """Parse resources using docling.

        :param docs: Resources to process.
        :return: Parsed documents
        :raises ValueError: If a document has no value for .uri.
        """
        docs = list(docs)

        # Validate docs. Raise explicitly instead of asserting: assert statements are removed under `python -O`.
        for doc in docs:
            if not doc.uri:
                raise ValueError("Documents have to have a value for .uri.")
        if any(doc.text for doc in docs):
            warnings.warn(f"Task {self._task_id} is about to overwrite existing .text values.")

        # Wrap conversion in TQDM if progress should be shown.
        conversion_results = self._converter.convert_all([doc.uri for doc in docs])
        if self._show_progress:
            conversion_results = tqdm(conversion_results, total=len(docs))

        parsed_resources: list[docling.datamodel.document.ConversionResult] = list(conversion_results)
        # Every input document must have exactly one conversion result, otherwise the pairing below is wrong.
        assert len(parsed_resources) == len(docs)

        for doc, parsed_resource in zip(docs, parsed_resources):
            # A failing export is logged and skipped so that the remaining documents are still processed.
            try:
                if self._include_meta:
                    doc.meta |= {self.id: parsed_resource}
                if self._export_format == "markdown":
                    doc.text = parsed_resource.document.export_to_markdown()
                elif self._export_format == "html":
                    doc.text = parsed_resource.document.export_to_html()
                elif self._export_format == "json":
                    # NOTE(review): export_to_dict() presumably yields a dict rather than a string, unlike the
                    # other two exports - confirm that consumers of .text expect this.
                    doc.text = parsed_resource.document.export_to_dict()
            except Exception as e:
                logger.error(f"Failed to parse file {doc.uri}: {str(e)}")

        return docs

    @property
    def _state(self) -> dict[str, Any]:
        """Return the parent state extended by this task's converter and export format."""
        return {
            **super()._state,
            "converter": self._converter,
            "export_format": self._export_format,
        }

id property

Returns task ID. Used by pipeline for results and dependency management.

Returns:

Type Description
str

Task ID.

__call__(docs)

Parse resources using docling.

Parameters:

Name Type Description Default
docs Iterable[Doc]

Resources to process.

required

Returns:

Type Description
Iterable[Doc]

Parsed documents

Source code in sieves/tasks/preprocessing/ocr/docling_.py
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
def __call__(self, docs: Iterable[Doc]) -> Iterable[Doc]:
    """Parse resources using docling.

    :param docs: Resources to process.
    :return: Parsed documents
    :raises ValueError: If a document has no value for .uri.
    """
    docs = list(docs)

    # Validate docs. Raise explicitly instead of asserting: assert statements are removed under `python -O`.
    for doc in docs:
        if not doc.uri:
            raise ValueError("Documents have to have a value for .uri.")
    if any(doc.text for doc in docs):
        warnings.warn(f"Task {self._task_id} is about to overwrite existing .text values.")

    # Wrap conversion in TQDM if progress should be shown.
    conversion_results = self._converter.convert_all([doc.uri for doc in docs])
    if self._show_progress:
        conversion_results = tqdm(conversion_results, total=len(docs))

    parsed_resources: list[docling.datamodel.document.ConversionResult] = list(conversion_results)
    # Every input document must have exactly one conversion result, otherwise the pairing below is wrong.
    assert len(parsed_resources) == len(docs)

    for doc, parsed_resource in zip(docs, parsed_resources):
        # A failing export is logged and skipped so that the remaining documents are still processed.
        try:
            if self._include_meta:
                doc.meta |= {self.id: parsed_resource}
            if self._export_format == "markdown":
                doc.text = parsed_resource.document.export_to_markdown()
            elif self._export_format == "html":
                doc.text = parsed_resource.document.export_to_html()
            elif self._export_format == "json":
                # NOTE(review): export_to_dict() presumably yields a dict rather than a string, unlike the
                # other two exports - confirm that consumers of .text expect this.
                doc.text = parsed_resource.document.export_to_dict()
        except Exception as e:
            logger.error(f"Failed to parse file {doc.uri}: {str(e)}")

    return docs

__init__(converter=None, export_format='markdown', task_id=None, show_progress=True, include_meta=False)

Initialize the docling parser.

Parameters:

Name Type Description Default
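converter DocumentConverter

Docling parser instance. A default DocumentConverter is created if none is given.

None
export_format str

Format of the text written to each document: "markdown", "html" or "json".

'markdown'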
task_id str | None

Task ID.

None
show_progress bool

Whether to show a progress bar for processed documents.

True
include_meta bool

Whether to include meta information generated by the task.

False
Source code in sieves/tasks/preprocessing/ocr/docling_.py
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
def __init__(
    self,
    converter: docling.document_converter.DocumentConverter | None = None,
    export_format: str = "markdown",
    task_id: str | None = None,
    show_progress: bool = True,
    include_meta: bool = False,
):
    """Initialize the docling parser.
    :param converter: Docling parser instance. A default DocumentConverter is created if None.
    :param export_format: Format of the text written to .text: "markdown", "html" or "json".
    :param task_id: Task ID.
    :param show_progress: Whether to show progress bar for processed documents
    :param include_meta: Whether to include meta information generated by the task.
    :raises ValueError: If export_format is not one of the supported formats.
    """
    super().__init__(task_id=task_id, show_progress=show_progress, include_meta=include_meta)

    # Fail early: only these formats are handled when documents are processed; anything else would
    # silently leave .text untouched.
    supported_formats = ("markdown", "html", "json")
    if export_format not in supported_formats:
        raise ValueError(
            f"Unsupported export format '{export_format}'. Supported formats: {', '.join(supported_formats)}."
        )

    # Compare against None explicitly so that a user-supplied converter is never replaced based on truthiness.
    self._converter = converter if converter is not None else docling.document_converter.DocumentConverter()
    self._export_format = export_format

deserialize(config, **kwargs) classmethod

Generate Task instance from config.

Parameters:

Name Type Description Default
config Config

Config to generate instance from.

required
kwargs dict[str, Any]

Values to inject into loaded config.

{}

Returns:

Type Description
Task

Deserialized Task instance.

Source code in sieves/tasks/core.py
56
57
58
59
60
61
62
63
64
@classmethod
def deserialize(cls, config: Config, **kwargs: dict[str, Any]) -> Task:
    """Build an instance of this task class from a config.
    :param config: Config to generate instance from.
    :param kwargs: Values to inject into loaded config.
    :return: Deserialized Task instance.
    """
    # Turn the config (plus the injected values) into constructor arguments, then instantiate.
    init_kwargs = config.to_init_dict(cls, **kwargs)
    return cls(**init_kwargs)

serialize()

Serializes task.

Returns:

Type Description
Config

Config instance.

Source code in sieves/tasks/core.py
50
51
52
53
54
def serialize(self) -> Config:
    """Serialize this task into a config.
    :return: Config instance.
    """
    # Wrap every entry of the task state in an Attribute, keyed by its state name.
    attributes = {}
    for name, value in self._state.items():
        attributes[name] = Attribute(value=value)
    return Config.create(self.__class__, attributes)