Marker

`OCR`

Bases: Task

Base class for OCR tasks that extract text from documents. This unified interface allows different OCR converters to be used interchangeably.

Source code in sieves/tasks/preprocessing/ocr/core.py

class OCR(Task):
    """
    Base class for OCR tasks that extract text from documents.
    This unified interface allows different OCR converters to be used interchangeably.
    """

    def __init__(
        self,
        converter: _ConverterType = docling.document_converter.DocumentConverter(),
        export_format: str = "markdown",
        task_id: str | None = None,
        show_progress: bool = True,
        include_meta: bool = False,
        **kwargs: Any,
    ):
        """
        Initialize the OCR task.
        :param converter: The OCR converter to use.
        :param task_id: Task ID.
        :param show_progress: Whether to show progress bar for processed documents.
        :param include_meta: Whether to include meta information generated by the task.
        :param kwargs: Additional arguments for specific OCR implementations.
        """
        super().__init__(task_id=task_id, show_progress=show_progress, include_meta=include_meta)
        self._export_format = export_format
        self._converter = converter
        self._kwargs = kwargs
        self._task = self._init_ocr_task()

    def _init_ocr_task(self) -> Task:
        """
        Initialize the bridge for the specific OCR implementation.
        :return: OCR bridge implementation.
        """
        converter_type = type(self._converter)
        ocr_task: Task
        match converter_type:
            case converter if issubclass(
                converter, (marker.converters.pdf.PdfConverter | marker.converters.table.TableConverter)
            ):
                ocr_task = marker_.Marker(
                    converter=self._converter,
                    export_format=self._export_format,
                    task_id=self.id,
                    show_progress=self._show_progress,
                    include_meta=self._include_meta,
                    **self._kwargs,
                )
            case docling.document_converter.DocumentConverter:
                ocr_task = docling_.Docling(
                    converter=self._converter,
                    export_format=self._export_format,
                    task_id=self.id,
                    show_progress=self._show_progress,
                    include_meta=self._include_meta,
                )
            case _:
                raise ValueError(
                    f"converter type {self._converter} is not supported. Please check the documentation "
                    f"and ensure you're providing a supported converter type."
                )
        assert isinstance(ocr_task, Task)
        return ocr_task

    def __call__(self, docs: Iterable[Doc]) -> Iterable[Doc]:
        """
        Process documents with OCR to extract text.
        :param docs: Documents to process.
        :return: Processed documents with extracted text.
        """
        docs = list(docs)
        assert all(doc.uri for doc in docs), ValueError("Documents have to have a value for .uri.")
        result = self._task(docs)

        yield from result

    @property
    def _state(self) -> dict[str, Any]:
        """
        Returns attributes to serialize.
        :return: Dict of attributes to serialize.
        """
        return {
            **super()._state,
            "converter": self._converter,
            "export_format": self._export_format,
            **self._kwargs,
        }

    @classmethod
    def deserialize(cls, config: Config, **kwargs: dict[str, Any]) -> OCR:
        """
        Generate OCR instance from config.
        :param config: Config to generate instance from.
        :param kwargs: Values to inject into loaded config.
        :return: Deserialized OCR instance.
        """
        return cls(**config.to_init_dict(cls, **kwargs))

`id` `property`

Returns task ID. Used by pipeline for results and dependency management.

Returns:

Type	Description
`str`	Task ID.

`__call__(docs)`

Process documents with OCR to extract text.

Parameters:

Name	Type	Description	Default
`docs`	`Iterable[Doc]`	Documents to process.	required

Returns:

Type	Description
`Iterable[Doc]`	Processed documents with extracted text.

Source code in sieves/tasks/preprocessing/ocr/core.py

def __call__(self, docs: Iterable[Doc]) -> Iterable[Doc]:
    """
    Process documents with OCR to extract text.
    As this is a generator, validation and processing only start once the result is iterated.
    :param docs: Documents to process. Every document needs a value for `.uri`.
    :return: Processed documents with extracted text.
    :raises ValueError: If at least one document has no value for `.uri`.
    """
    # Materialize first: the documents are traversed twice (validation, then OCR).
    docs = list(docs)
    # Raise explicitly instead of asserting, so the check is not stripped when running with `python -O`.
    if not all(doc.uri for doc in docs):
        raise ValueError("Documents have to have a value for .uri.")

    yield from self._task(docs)

`__init__(converter=docling.document_converter.DocumentConverter(), export_format='markdown', task_id=None, show_progress=True, include_meta=False, **kwargs)`

Initialize the OCR task.

Parameters:

Name	Type	Description	Default
`converter`	`_ConverterType`	The OCR converter to use.	`DocumentConverter()`
`export_format`	`str`	Export format handed to the converter-specific task.	`'markdown'`
`task_id`	`str \| None`	Task ID.	`None`
`show_progress`	`bool`	Whether to show progress bar for processed documents.	`True`
`include_meta`	`bool`	Whether to include meta information generated by the task.	`False`
`kwargs`	`Any`	Additional arguments for specific OCR implementations.	`{}`

Source code in sieves/tasks/preprocessing/ocr/core.py

def __init__(
    self,
    converter: _ConverterType = docling.document_converter.DocumentConverter(),
    export_format: str = "markdown",
    task_id: str | None = None,
    show_progress: bool = True,
    include_meta: bool = False,
    **kwargs: Any,
):
    """
    Set up the OCR task and the converter-specific task it delegates to.
    :param converter: The OCR converter to use. Note that the default instance is created once, when this
        method is defined, and is therefore shared by every task relying on the default.
    :param export_format: Export format stored on the task for the converter-specific implementation.
    :param task_id: Task ID.
    :param show_progress: Whether to show progress bar for processed documents.
    :param include_meta: Whether to include meta information generated by the task.
    :param kwargs: Additional arguments for specific OCR implementations.
    """
    super().__init__(
        task_id=task_id,
        show_progress=show_progress,
        include_meta=include_meta,
    )

    # Keep the raw configuration around; the delegate task is built from these attributes.
    self._converter = converter
    self._kwargs = kwargs
    self._export_format = export_format

    # Has to happen after the assignments above.
    self._task = self._init_ocr_task()

`deserialize(config, **kwargs)` `classmethod`

Generate OCR instance from config.

Parameters:

Name	Type	Description	Default
`config`	`Config`	Config to generate instance from.	required
`kwargs`	`dict[str, Any]`	Values to inject into loaded config.	`{}`

Returns:

Type	Description
`OCR`	Deserialized OCR instance.

Source code in sieves/tasks/preprocessing/ocr/core.py

@classmethod
def deserialize(cls, config: Config, **kwargs: dict[str, Any]) -> OCR:
    """
    Build an OCR instance from a config.
    :param config: Config providing the arguments for initialization.
    :param kwargs: Values to inject into loaded config.
    :return: Instance of this class, initialized with the arguments derived from the config.
    """
    init_args = config.to_init_dict(cls, **kwargs)
    return cls(**init_args)

`serialize()`

Serializes task.

Returns:

Type	Description
`Config`	Config instance.

Source code in sieves/tasks/core.py

def serialize(self) -> Config:
    """
    Serialize this task into a config.
    :return: Config instance created for this task's class, with each state entry wrapped in an Attribute.
    """
    attributes = {name: Attribute(value=value) for name, value in self._state.items()}
    return Config.create(self.__class__, attributes)

Marker

OCR

id property

__call__(docs)

__init__(converter=docling.document_converter.DocumentConverter(), export_format='markdown', task_id=None, show_progress=True, include_meta=False, **kwargs)

deserialize(config, **kwargs) classmethod

serialize()

`OCR`

`id` `property`

`__call__(docs)`

`__init__(converter=docling.document_converter.DocumentConverter(), export_format='markdown', task_id=None, show_progress=True, include_meta=False, **kwargs)`

`deserialize(config, **kwargs)` `classmethod`

`serialize()`