Compare commits

...

6 Commits

Author SHA1 Message Date
Trenton H
e24a2d8214 fix: add RasterisedDocumentParser to new-style parser shim checks
The new RasterisedDocumentParser uses __enter__/__exit__ for resource
management instead of cleanup(). Update all existing new-style shims to
include it in the isinstance checks:

- documents/consumer.py: _parser_cleanup(), parser_is_new_style
- documents/tasks.py: parser_is_new_style, finally cleanup branch
  (also adds RemoteDocumentParser which was missing from the latter)
- documents/management/commands/document_thumbnails.py: adds new-style
  handling from scratch (enter/exit + 2-arg get_thumbnail signature)

Fix stale import paths in three test files that were still importing
from paperless_tesseract.parsers instead of paperless.parsers.tesseract.

Fix two registry tests that used application/pdf as a proxy for "no
handler" — now that RasterisedDocumentParser is registered, PDF always
has a handler, so switch to a truly unsupported MIME type.

Signal infrastructure and shims remain intact; this is plumbing only.

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
2026-03-19 14:54:34 -07:00
Trenton H
8e3dfcb4ee fix(types): fully annotate paperless/parsers/tesseract.py
Fixes all mypy and pyrefly errors in the new parser file:

- Add missing type annotations to is_image, has_alpha, get_dpi,
  calculate_a4_dpi, construct_ocrmypdf_parameters, post_process_text
- Narrow Path-only (no str) for image helper args; convert to str when
  building list[str] args for run_subprocess
- Annotate ocrmypdf_args as dict[str, Any] so operator expressions on
  its values type-check and ocrmypdf.ocr(**args) resolves cleanly
- Declare text: str | None = None at top of extract_text to unify
  all assignments to the same type across both branches
- Import Any from typing

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
2026-03-19 14:19:22 -07:00
Trenton H
1b45e4d029 tests: rewrite test_tesseract_parser to pytest style with typed fixtures
- Converts all tests from Django TestCase to pytest-style classes
- Adds tesseract_samples_dir, null_app_config, tesseract_parser, and
  make_tesseract_parser fixtures in conftest.py; all DB-free except
  TestOcrmypdfParameters which uses @pytest.mark.django_db
- Defines MakeTesseractParser type alias in conftest.py for autocomplete
- Fixes FBT001 (boolean positional args) by making bool params
  keyword-only with * separator in parametrize test signatures
- Adds type annotations to all fixture parameters for IDE support
- Uses pytest.param(..., id="...") throughout; pytest-mock for patching

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
2026-03-19 13:51:34 -07:00
Trenton H
6b279e9368 Update tesseract signals.py to import from new parser location
RasterisedDocumentParser moved to paperless.parsers.tesseract; update
the lazy import in signals.get_parser so the signal-based consumer
declaration continues to work during the registry transition. Pop
logging_group and progress_callback kwargs for constructor compatibility.

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
2026-03-19 13:04:53 -07:00
Trenton H
97bc53ccdc Refactor RasterisedDocumentParser to ParserProtocol interface
- Add RasterisedDocumentParser to registry.register_defaults()
- Update parser class: remove DocumentParser inheritance, add Protocol
  class attrs/classmethods/properties, context-manager lifecycle
- Add read_file_handle_unicode_errors() to shared parsers/utils.py
- Replace inline unicode-error-handling with shared utility call

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
2026-03-19 13:02:43 -07:00
Trenton H
80fa4f6f12 Move tesseract parser, tests, and samples to paperless.parsers
Relocates files in preparation for the Phase 3 Protocol-based parser
refactor, preserving full git history via rename.

- src/paperless_tesseract/parsers.py -> src/paperless/parsers/tesseract.py
- src/paperless_tesseract/tests/test_parser.py -> src/paperless/tests/parsers/test_tesseract_parser.py
- src/paperless_tesseract/tests/test_parser_custom_settings.py -> src/paperless/tests/parsers/test_tesseract_custom_settings.py
- src/paperless_tesseract/tests/samples/* -> src/paperless/tests/samples/tesseract/
- Moves RUF001 suppression from broad per-file pyproject.toml ignore to inline noqa comments on the two affected lines

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
2026-03-19 12:50:58 -07:00
38 changed files with 1823 additions and 1023 deletions

View File

@@ -248,9 +248,7 @@ lint.per-file-ignores."docker/wait-for-redis.py" = [
lint.per-file-ignores."src/documents/models.py" = [
"SIM115",
]
lint.per-file-ignores."src/paperless_tesseract/tests/test_parser.py" = [
"RUF001",
]
lint.isort.force-single-line = true
[tool.codespell]

View File

@@ -54,6 +54,7 @@ from documents.utils import run_subprocess
from paperless.parsers import ParserContext
from paperless.parsers.mail import MailDocumentParser
from paperless.parsers.remote import RemoteDocumentParser
from paperless.parsers.tesseract import RasterisedDocumentParser
from paperless.parsers.text import TextDocumentParser
from paperless.parsers.tika import TikaDocumentParser
@@ -74,6 +75,7 @@ def _parser_cleanup(parser: DocumentParser) -> None:
parser,
(
MailDocumentParser,
RasterisedDocumentParser,
RemoteDocumentParser,
TextDocumentParser,
TikaDocumentParser,
@@ -463,6 +465,7 @@ class ConsumerPlugin(
document_parser,
(
MailDocumentParser,
RasterisedDocumentParser,
RemoteDocumentParser,
TextDocumentParser,
TikaDocumentParser,

View File

@@ -4,6 +4,11 @@ import shutil
from documents.management.commands.base import PaperlessCommand
from documents.models import Document
from documents.parsers import get_parser_class_for_mime_type
from paperless.parsers.mail import MailDocumentParser
from paperless.parsers.remote import RemoteDocumentParser
from paperless.parsers.tesseract import RasterisedDocumentParser
from paperless.parsers.text import TextDocumentParser
from paperless.parsers.tika import TikaDocumentParser
logger = logging.getLogger("paperless.management.thumbnails")
@@ -22,16 +27,38 @@ def _process_document(doc_id: int) -> None:
parser = parser_class(logging_group=None)
parser_is_new_style = isinstance(
parser,
(
MailDocumentParser,
RasterisedDocumentParser,
RemoteDocumentParser,
TextDocumentParser,
TikaDocumentParser,
),
)
# TODO(stumpylog): Remove branch in the future when all parsers use new protocol
if parser_is_new_style:
parser.__enter__()
try:
thumb = parser.get_thumbnail(
document.source_path,
document.mime_type,
document.get_public_filename(),
)
# TODO(stumpylog): Remove branch in the future when all parsers use new protocol
if parser_is_new_style:
thumb = parser.get_thumbnail(document.source_path, document.mime_type)
else:
thumb = parser.get_thumbnail(
document.source_path,
document.mime_type,
document.get_public_filename(),
)
shutil.move(thumb, document.thumbnail_path)
finally:
# TODO(stumpylog): Cleanup once all parsers are handled
parser.cleanup()
if parser_is_new_style:
parser.__exit__(None, None, None)
else:
parser.cleanup()
class Command(PaperlessCommand):

View File

@@ -68,6 +68,7 @@ from paperless.config import AIConfig
from paperless.parsers import ParserContext
from paperless.parsers.mail import MailDocumentParser
from paperless.parsers.remote import RemoteDocumentParser
from paperless.parsers.tesseract import RasterisedDocumentParser
from paperless.parsers.text import TextDocumentParser
from paperless.parsers.tika import TikaDocumentParser
from paperless_ai.indexing import llm_index_add_or_update_document
@@ -326,6 +327,7 @@ def update_document_content_maybe_archive_file(document_id) -> None:
parser,
(
MailDocumentParser,
RasterisedDocumentParser,
RemoteDocumentParser,
TextDocumentParser,
TikaDocumentParser,
@@ -440,7 +442,13 @@ def update_document_content_maybe_archive_file(document_id) -> None:
# TODO(stumpylog): Remove branch in the future when all parsers use new protocol
if isinstance(
parser,
(MailDocumentParser, TextDocumentParser, TikaDocumentParser),
(
MailDocumentParser,
RasterisedDocumentParser,
RemoteDocumentParser,
TextDocumentParser,
TikaDocumentParser,
),
):
parser.__exit__(None, None, None)
else:

View File

@@ -9,9 +9,9 @@ from documents.parsers import get_default_file_extension
from documents.parsers import get_parser_class_for_mime_type
from documents.parsers import get_supported_file_extensions
from documents.parsers import is_file_ext_supported
from paperless.parsers.tesseract import RasterisedDocumentParser
from paperless.parsers.text import TextDocumentParser
from paperless.parsers.tika import TikaDocumentParser
from paperless_tesseract.parsers import RasterisedDocumentParser
class TestParserDiscovery(TestCase):

View File

@@ -195,6 +195,7 @@ class ParserRegistry:
"""
from paperless.parsers.mail import MailDocumentParser
from paperless.parsers.remote import RemoteDocumentParser
from paperless.parsers.tesseract import RasterisedDocumentParser
from paperless.parsers.text import TextDocumentParser
from paperless.parsers.tika import TikaDocumentParser
@@ -202,6 +203,7 @@ class ParserRegistry:
self.register_builtin(RemoteDocumentParser)
self.register_builtin(TikaDocumentParser)
self.register_builtin(MailDocumentParser)
self.register_builtin(RasterisedDocumentParser)
# ------------------------------------------------------------------
# Discovery

View File

@@ -1,13 +1,18 @@
from __future__ import annotations
import logging
import os
import re
import shutil
import tempfile
from pathlib import Path
from typing import TYPE_CHECKING
from typing import Any
from typing import Self
from django.conf import settings
from PIL import Image
from documents.parsers import DocumentParser
from documents.parsers import ParseError
from documents.parsers import make_thumbnail_from_pdf
from documents.utils import maybe_override_pixel_limit
@@ -16,6 +21,28 @@ from paperless.config import OcrConfig
from paperless.models import ArchiveFileChoices
from paperless.models import CleanChoices
from paperless.models import ModeChoices
from paperless.parsers.utils import read_file_handle_unicode_errors
from paperless.version import __full_version_str__
if TYPE_CHECKING:
import datetime
from types import TracebackType
from paperless.parsers import MetadataEntry
from paperless.parsers import ParserContext
logger = logging.getLogger("paperless.parsing.tesseract")
_SUPPORTED_MIME_TYPES: dict[str, str] = {
"application/pdf": ".pdf",
"image/jpeg": ".jpg",
"image/png": ".png",
"image/tiff": ".tif",
"image/gif": ".gif",
"image/bmp": ".bmp",
"image/webp": ".webp",
"image/heic": ".heic",
}
class NoTextFoundException(Exception):
@@ -26,81 +53,125 @@ class RtlLanguageException(Exception):
pass
class RasterisedDocumentParser(DocumentParser):
class RasterisedDocumentParser:
"""
This parser uses Tesseract to try and get some text out of a rasterised
image, whether it's a PDF, or other graphical format (JPEG, TIFF, etc.)
"""
logging_name = "paperless.parsing.tesseract"
name: str = "Paperless-ngx Tesseract OCR Parser"
version: str = __full_version_str__
author: str = "Paperless-ngx Contributors"
url: str = "https://github.com/paperless-ngx/paperless-ngx"
def get_settings(self) -> OcrConfig:
"""
This parser uses the OCR configuration settings to parse documents
"""
return OcrConfig()
# ------------------------------------------------------------------
# Class methods
# ------------------------------------------------------------------
def get_page_count(self, document_path, mime_type):
page_count = None
if mime_type == "application/pdf":
try:
import pikepdf
@classmethod
def supported_mime_types(cls) -> dict[str, str]:
return _SUPPORTED_MIME_TYPES
with pikepdf.Pdf.open(document_path) as pdf:
page_count = len(pdf.pages)
except Exception as e:
self.log.warning(
f"Unable to determine PDF page count {document_path}: {e}",
)
return page_count
@classmethod
def score(
cls,
mime_type: str,
filename: str,
path: Path | None = None,
) -> int | None:
if mime_type in _SUPPORTED_MIME_TYPES:
return 10
return None
def extract_metadata(self, document_path, mime_type):
result = []
if mime_type == "application/pdf":
import pikepdf
# ------------------------------------------------------------------
# Properties
# ------------------------------------------------------------------
namespace_pattern = re.compile(r"\{(.*)\}(.*)")
@property
def can_produce_archive(self) -> bool:
return True
pdf = pikepdf.open(document_path)
meta = pdf.open_metadata()
for key, value in meta.items():
if isinstance(value, list):
value = " ".join([str(e) for e in value])
value = str(value)
try:
m = namespace_pattern.match(key)
if m is None: # pragma: no cover
continue
namespace = m.group(1)
key_value = m.group(2)
try:
namespace.encode("utf-8")
key_value.encode("utf-8")
except UnicodeEncodeError as e: # pragma: no cover
self.log.debug(f"Skipping metadata key {key}: {e}")
continue
result.append(
{
"namespace": namespace,
"prefix": meta.REVERSE_NS[namespace],
"key": key_value,
"value": value,
},
)
except Exception as e:
self.log.warning(
f"Error while reading metadata {key}: {value}. Error: {e}",
)
return result
@property
def requires_pdf_rendition(self) -> bool:
return False
def get_thumbnail(self, document_path, mime_type, file_name=None):
# ------------------------------------------------------------------
# Lifecycle
# ------------------------------------------------------------------
def __init__(self, logging_group: object = None) -> None:
settings.SCRATCH_DIR.mkdir(parents=True, exist_ok=True)
self.tempdir = Path(
tempfile.mkdtemp(prefix="paperless-", dir=settings.SCRATCH_DIR),
)
self.settings = OcrConfig()
self.archive_path: Path | None = None
self.text: str | None = None
self.date: datetime.datetime | None = None
self.log = logger
def __enter__(self) -> Self:
return self
def __exit__(
self,
exc_type: type[BaseException] | None,
exc_val: BaseException | None,
exc_tb: TracebackType | None,
) -> None:
logger.debug("Cleaning up temporary directory %s", self.tempdir)
shutil.rmtree(self.tempdir, ignore_errors=True)
# ------------------------------------------------------------------
# Core parsing interface
# ------------------------------------------------------------------
def configure(self, context: ParserContext) -> None:
pass
# ------------------------------------------------------------------
# Result accessors
# ------------------------------------------------------------------
def get_text(self) -> str | None:
return self.text
def get_date(self) -> datetime.datetime | None:
return self.date
def get_archive_path(self) -> Path | None:
return self.archive_path
# ------------------------------------------------------------------
# Thumbnail, page count, and metadata
# ------------------------------------------------------------------
def get_thumbnail(self, document_path: Path, mime_type: str) -> Path:
return make_thumbnail_from_pdf(
self.archive_path or document_path,
self.archive_path or Path(document_path),
self.tempdir,
self.logging_group,
)
def is_image(self, mime_type) -> bool:
def get_page_count(self, document_path: Path, mime_type: str) -> int | None:
if mime_type == "application/pdf":
from paperless.parsers.utils import get_page_count_for_pdf
return get_page_count_for_pdf(Path(document_path), log=self.log)
return None
def extract_metadata(
self,
document_path: Path,
mime_type: str,
) -> list[MetadataEntry]:
if mime_type != "application/pdf":
return []
from paperless.parsers.utils import extract_pdf_metadata
return extract_pdf_metadata(Path(document_path), log=self.log)
def is_image(self, mime_type: str) -> bool:
return mime_type in [
"image/png",
"image/jpeg",
@@ -111,25 +182,25 @@ class RasterisedDocumentParser(DocumentParser):
"image/heic",
]
def has_alpha(self, image) -> bool:
def has_alpha(self, image: Path) -> bool:
with Image.open(image) as im:
return im.mode in ("RGBA", "LA")
def remove_alpha(self, image_path: str) -> Path:
def remove_alpha(self, image_path: Path) -> Path:
no_alpha_image = Path(self.tempdir) / "image-no-alpha"
run_subprocess(
[
settings.CONVERT_BINARY,
"-alpha",
"off",
image_path,
no_alpha_image,
str(image_path),
str(no_alpha_image),
],
logger=self.log,
)
return no_alpha_image
def get_dpi(self, image) -> int | None:
def get_dpi(self, image: Path) -> int | None:
try:
with Image.open(image) as im:
x, _ = im.info["dpi"]
@@ -138,7 +209,7 @@ class RasterisedDocumentParser(DocumentParser):
self.log.warning(f"Error while getting DPI from image {image}: {e}")
return None
def calculate_a4_dpi(self, image) -> int | None:
def calculate_a4_dpi(self, image: Path) -> int | None:
try:
with Image.open(image) as im:
width, _ = im.size
@@ -156,6 +227,7 @@ class RasterisedDocumentParser(DocumentParser):
sidecar_file: Path | None,
pdf_file: Path,
) -> str | None:
text: str | None = None
# When re-doing OCR, the sidecar contains ONLY the new text, not
# the whole text, so do not utilize it in that case
if (
@@ -163,7 +235,7 @@ class RasterisedDocumentParser(DocumentParser):
and sidecar_file.is_file()
and self.settings.mode != "redo"
):
text = self.read_file_handle_unicode_errors(sidecar_file)
text = read_file_handle_unicode_errors(sidecar_file)
if "[OCR skipped on page" not in text:
# This happens when there's already text in the input file.
@@ -191,12 +263,12 @@ class RasterisedDocumentParser(DocumentParser):
"-layout",
"-enc",
"UTF-8",
pdf_file,
str(pdf_file),
tmp.name,
],
logger=self.log,
)
text = self.read_file_handle_unicode_errors(Path(tmp.name))
text = read_file_handle_unicode_errors(Path(tmp.name))
return post_process_text(text)
@@ -211,16 +283,14 @@ class RasterisedDocumentParser(DocumentParser):
def construct_ocrmypdf_parameters(
self,
input_file,
mime_type,
output_file,
sidecar_file,
input_file: Path,
mime_type: str,
output_file: Path,
sidecar_file: Path,
*,
safe_fallback=False,
):
if TYPE_CHECKING:
assert isinstance(self.settings, OcrConfig)
ocrmypdf_args = {
safe_fallback: bool = False,
) -> dict[str, Any]:
ocrmypdf_args: dict[str, Any] = {
"input_file_or_options": input_file,
"output_file": output_file,
# need to use threads, since this will be run in daemonized
@@ -330,7 +400,13 @@ class RasterisedDocumentParser(DocumentParser):
return ocrmypdf_args
def parse(self, document_path: Path, mime_type, file_name=None) -> None:
def parse(
self,
document_path: Path,
mime_type: str,
*,
produce_archive: bool = True,
) -> None:
# This forces tesseract to use one core per page.
os.environ["OMP_THREAD_LIMIT"] = "1"
VALID_TEXT_LENGTH = 50
@@ -458,7 +534,7 @@ class RasterisedDocumentParser(DocumentParser):
self.text = ""
def post_process_text(text):
def post_process_text(text: str | None) -> str | None:
if not text:
return None

View File

@@ -20,6 +20,34 @@ if TYPE_CHECKING:
logger = logging.getLogger("paperless.parsers.utils")
def read_file_handle_unicode_errors(
filepath: Path,
log: logging.Logger | None = None,
) -> str:
"""Read a file as UTF-8 text, replacing invalid bytes rather than raising.
Parameters
----------
filepath:
Absolute path to the file to read.
log:
Logger to use for warnings. Falls back to the module-level logger
when omitted.
Returns
-------
str
File content as a string, with any invalid UTF-8 sequences replaced
by the Unicode replacement character.
"""
_log = log or logger
try:
return filepath.read_text(encoding="utf-8")
except UnicodeDecodeError as e:
_log.warning("Unicode error during text reading, continuing: %s", e)
return filepath.read_bytes().decode("utf-8", errors="replace")
def get_page_count_for_pdf(
document_path: Path,
log: logging.Logger | None = None,

View File

@@ -6,20 +6,29 @@ so it is easy to see which files belong to which test module.
from __future__ import annotations
from contextlib import contextmanager
from typing import TYPE_CHECKING
import pytest
from django.test import override_settings
from paperless.parsers.mail import MailDocumentParser
from paperless.parsers.remote import RemoteDocumentParser
from paperless.parsers.tesseract import RasterisedDocumentParser
from paperless.parsers.text import TextDocumentParser
from paperless.parsers.tika import TikaDocumentParser
if TYPE_CHECKING:
from collections.abc import Callable
from collections.abc import Generator
from pathlib import Path
from unittest.mock import MagicMock
from pytest_django.fixtures import SettingsWrapper
from pytest_mock import MockerFixture
#: Type for the ``make_tesseract_parser`` fixture factory.
MakeTesseractParser = Callable[..., Generator[RasterisedDocumentParser, None, None]]
# ------------------------------------------------------------------
@@ -411,3 +420,381 @@ def nginx_base_url() -> Generator[str, None, None]:
The base URL for the nginx HTTP server we expect to be alive
"""
yield "http://localhost:8080"
# ------------------------------------------------------------------
# Tesseract parser sample files
# ------------------------------------------------------------------
@pytest.fixture(scope="session")
def tesseract_samples_dir(samples_dir: Path) -> Path:
"""Absolute path to the tesseract parser sample files directory.
Returns
-------
Path
``<samples_dir>/tesseract/``
"""
return samples_dir / "tesseract"
@pytest.fixture(scope="session")
def document_webp_file(tesseract_samples_dir: Path) -> Path:
"""Path to a WebP document sample file.
Returns
-------
Path
Absolute path to ``tesseract/document.webp``.
"""
return tesseract_samples_dir / "document.webp"
@pytest.fixture(scope="session")
def encrypted_pdf_file(tesseract_samples_dir: Path) -> Path:
"""Path to an encrypted PDF sample file.
Returns
-------
Path
Absolute path to ``tesseract/encrypted.pdf``.
"""
return tesseract_samples_dir / "encrypted.pdf"
@pytest.fixture(scope="session")
def multi_page_digital_pdf_file(tesseract_samples_dir: Path) -> Path:
"""Path to a multi-page digital PDF sample file.
Returns
-------
Path
Absolute path to ``tesseract/multi-page-digital.pdf``.
"""
return tesseract_samples_dir / "multi-page-digital.pdf"
@pytest.fixture(scope="session")
def multi_page_images_alpha_rgb_tiff_file(tesseract_samples_dir: Path) -> Path:
"""Path to a multi-page TIFF with alpha channel in RGB.
Returns
-------
Path
Absolute path to ``tesseract/multi-page-images-alpha-rgb.tiff``.
"""
return tesseract_samples_dir / "multi-page-images-alpha-rgb.tiff"
@pytest.fixture(scope="session")
def multi_page_images_alpha_tiff_file(tesseract_samples_dir: Path) -> Path:
"""Path to a multi-page TIFF with alpha channel.
Returns
-------
Path
Absolute path to ``tesseract/multi-page-images-alpha.tiff``.
"""
return tesseract_samples_dir / "multi-page-images-alpha.tiff"
@pytest.fixture(scope="session")
def multi_page_images_pdf_file(tesseract_samples_dir: Path) -> Path:
"""Path to a multi-page PDF with images.
Returns
-------
Path
Absolute path to ``tesseract/multi-page-images.pdf``.
"""
return tesseract_samples_dir / "multi-page-images.pdf"
@pytest.fixture(scope="session")
def multi_page_images_tiff_file(tesseract_samples_dir: Path) -> Path:
"""Path to a multi-page TIFF sample file.
Returns
-------
Path
Absolute path to ``tesseract/multi-page-images.tiff``.
"""
return tesseract_samples_dir / "multi-page-images.tiff"
@pytest.fixture(scope="session")
def multi_page_mixed_pdf_file(tesseract_samples_dir: Path) -> Path:
"""Path to a multi-page mixed PDF sample file.
Returns
-------
Path
Absolute path to ``tesseract/multi-page-mixed.pdf``.
"""
return tesseract_samples_dir / "multi-page-mixed.pdf"
@pytest.fixture(scope="session")
def no_text_alpha_png_file(tesseract_samples_dir: Path) -> Path:
"""Path to a PNG with alpha channel and no text.
Returns
-------
Path
Absolute path to ``tesseract/no-text-alpha.png``.
"""
return tesseract_samples_dir / "no-text-alpha.png"
@pytest.fixture(scope="session")
def rotated_pdf_file(tesseract_samples_dir: Path) -> Path:
"""Path to a rotated PDF sample file.
Returns
-------
Path
Absolute path to ``tesseract/rotated.pdf``.
"""
return tesseract_samples_dir / "rotated.pdf"
@pytest.fixture(scope="session")
def rtl_test_pdf_file(tesseract_samples_dir: Path) -> Path:
"""Path to an RTL test PDF sample file.
Returns
-------
Path
Absolute path to ``tesseract/rtl-test.pdf``.
"""
return tesseract_samples_dir / "rtl-test.pdf"
@pytest.fixture(scope="session")
def signed_pdf_file(tesseract_samples_dir: Path) -> Path:
"""Path to a signed PDF sample file.
Returns
-------
Path
Absolute path to ``tesseract/signed.pdf``.
"""
return tesseract_samples_dir / "signed.pdf"
@pytest.fixture(scope="session")
def simple_alpha_png_file(tesseract_samples_dir: Path) -> Path:
"""Path to a simple PNG with alpha channel.
Returns
-------
Path
Absolute path to ``tesseract/simple-alpha.png``.
"""
return tesseract_samples_dir / "simple-alpha.png"
@pytest.fixture(scope="session")
def simple_digital_pdf_file(tesseract_samples_dir: Path) -> Path:
"""Path to a simple digital PDF sample file.
Returns
-------
Path
Absolute path to ``tesseract/simple-digital.pdf``.
"""
return tesseract_samples_dir / "simple-digital.pdf"
@pytest.fixture(scope="session")
def simple_no_dpi_png_file(tesseract_samples_dir: Path) -> Path:
"""Path to a simple PNG without DPI information.
Returns
-------
Path
Absolute path to ``tesseract/simple-no-dpi.png``.
"""
return tesseract_samples_dir / "simple-no-dpi.png"
@pytest.fixture(scope="session")
def simple_bmp_file(tesseract_samples_dir: Path) -> Path:
"""Path to a simple BMP sample file.
Returns
-------
Path
Absolute path to ``tesseract/simple.bmp``.
"""
return tesseract_samples_dir / "simple.bmp"
@pytest.fixture(scope="session")
def simple_gif_file(tesseract_samples_dir: Path) -> Path:
"""Path to a simple GIF sample file.
Returns
-------
Path
Absolute path to ``tesseract/simple.gif``.
"""
return tesseract_samples_dir / "simple.gif"
@pytest.fixture(scope="session")
def simple_heic_file(tesseract_samples_dir: Path) -> Path:
"""Path to a simple HEIC sample file.
Returns
-------
Path
Absolute path to ``tesseract/simple.heic``.
"""
return tesseract_samples_dir / "simple.heic"
@pytest.fixture(scope="session")
def simple_jpg_file(tesseract_samples_dir: Path) -> Path:
"""Path to a simple JPG sample file.
Returns
-------
Path
Absolute path to ``tesseract/simple.jpg``.
"""
return tesseract_samples_dir / "simple.jpg"
@pytest.fixture(scope="session")
def simple_png_file(tesseract_samples_dir: Path) -> Path:
"""Path to a simple PNG sample file.
Returns
-------
Path
Absolute path to ``tesseract/simple.png``.
"""
return tesseract_samples_dir / "simple.png"
@pytest.fixture(scope="session")
def simple_tif_file(tesseract_samples_dir: Path) -> Path:
"""Path to a simple TIF sample file.
Returns
-------
Path
Absolute path to ``tesseract/simple.tif``.
"""
return tesseract_samples_dir / "simple.tif"
@pytest.fixture(scope="session")
def single_page_mixed_pdf_file(tesseract_samples_dir: Path) -> Path:
"""Path to a single-page mixed PDF sample file.
Returns
-------
Path
Absolute path to ``tesseract/single-page-mixed.pdf``.
"""
return tesseract_samples_dir / "single-page-mixed.pdf"
@pytest.fixture(scope="session")
def with_form_pdf_file(tesseract_samples_dir: Path) -> Path:
"""Path to a PDF with form sample file.
Returns
-------
Path
Absolute path to ``tesseract/with-form.pdf``.
"""
return tesseract_samples_dir / "with-form.pdf"
# ------------------------------------------------------------------
# Tesseract parser instance and settings helpers
# ------------------------------------------------------------------
@pytest.fixture()
def null_app_config(mocker: MockerFixture) -> MagicMock:
"""Return a MagicMock with all OcrConfig fields set to None.
This allows the parser to fall back to Django settings instead of
hitting the database.
Returns
-------
MagicMock
Mock config with all fields as None
"""
return mocker.MagicMock(
output_type=None,
pages=None,
language=None,
mode=None,
skip_archive_file=None,
image_dpi=None,
unpaper_clean=None,
deskew=None,
rotate_pages=None,
rotate_pages_threshold=None,
max_image_pixels=None,
color_conversion_strategy=None,
user_args=None,
)
@pytest.fixture()
def tesseract_parser(
mocker: MockerFixture,
null_app_config: MagicMock,
) -> Generator[RasterisedDocumentParser, None, None]:
"""Yield a RasterisedDocumentParser and clean up its temporary directory afterwards.
Patches the config system to avoid database access.
Yields
------
RasterisedDocumentParser
A ready-to-use parser instance.
"""
mocker.patch(
"paperless.config.BaseConfig._get_config_instance",
return_value=null_app_config,
)
with RasterisedDocumentParser() as parser:
yield parser
@pytest.fixture()
def make_tesseract_parser(
mocker: MockerFixture,
null_app_config: MagicMock,
) -> MakeTesseractParser:
"""Return a factory for creating RasterisedDocumentParser with Django settings overrides.
This fixture is useful for tests that need to create parsers with different
settings configurations.
Returns
-------
Callable[..., contextmanager[RasterisedDocumentParser]]
A context manager factory that accepts Django settings overrides
"""
mocker.patch(
"paperless.config.BaseConfig._get_config_instance",
return_value=null_app_config,
)
@contextmanager
def _make_parser(**django_settings_overrides):
with override_settings(**django_settings_overrides):
with RasterisedDocumentParser() as parser:
yield parser
return _make_parser

View File

@@ -481,12 +481,17 @@ class TestRemoteParserRegistry:
assert parser_cls is RemoteDocumentParser
@pytest.mark.usefixtures("no_engine_settings")
def test_get_parser_returns_none_for_pdf_when_not_configured(self) -> None:
"""With no tesseract parser registered yet, PDF has no handler if remote is off."""
def test_get_parser_returns_none_for_unsupported_type_when_not_configured(
self,
) -> None:
"""With remote off and a truly unsupported MIME type, registry returns None."""
from paperless.parsers.registry import ParserRegistry
registry = ParserRegistry()
registry.register_defaults()
parser_cls = registry.get_parser_for_file("application/pdf", "doc.pdf")
parser_cls = registry.get_parser_for_file(
"application/x-unknown-format",
"doc.xyz",
)
assert parser_cls is None

View File

@@ -10,7 +10,7 @@ from paperless.models import CleanChoices
from paperless.models import ColorConvertChoices
from paperless.models import ModeChoices
from paperless.models import OutputTypeChoices
from paperless_tesseract.parsers import RasterisedDocumentParser
from paperless.parsers.tesseract import RasterisedDocumentParser
class TestParserSettingsFromDb(DirectoriesMixin, FileSystemAssertsMixin, TestCase):

File diff suppressed because it is too large Load Diff

View File

@@ -256,6 +256,9 @@ class TestTextParserRegistry:
from paperless.parsers.registry import get_parser_registry
registry = get_parser_registry()
parser_cls = registry.get_parser_for_file("application/pdf", "doc.pdf")
parser_cls = registry.get_parser_for_file(
"application/x-unknown-format",
"doc.xyz",
)
assert parser_cls is None

View File

Before

Width:  |  Height:  |  Size: 5.7 KiB

After

Width:  |  Height:  |  Size: 5.7 KiB

View File

Before

Width:  |  Height:  |  Size: 32 KiB

After

Width:  |  Height:  |  Size: 32 KiB

View File

Before

Width:  |  Height:  |  Size: 8.2 KiB

After

Width:  |  Height:  |  Size: 8.2 KiB

View File

Before

Width:  |  Height:  |  Size: 6.8 KiB

After

Width:  |  Height:  |  Size: 6.8 KiB

View File

Before

Width:  |  Height:  |  Size: 1.7 MiB

After

Width:  |  Height:  |  Size: 1.7 MiB

View File

Before

Width:  |  Height:  |  Size: 18 KiB

After

Width:  |  Height:  |  Size: 18 KiB

View File

Before

Width:  |  Height:  |  Size: 19 KiB

After

Width:  |  Height:  |  Size: 19 KiB

View File

Before

Width:  |  Height:  |  Size: 7.2 KiB

After

Width:  |  Height:  |  Size: 7.2 KiB

View File

@@ -1,10 +1,23 @@
def get_parser(*args, **kwargs):
from paperless_tesseract.parsers import RasterisedDocumentParser
from __future__ import annotations
from typing import Any
def get_parser(*args: Any, **kwargs: Any) -> Any:
from paperless.parsers.tesseract import RasterisedDocumentParser
# RasterisedDocumentParser accepts logging_group for constructor compatibility but
# does not store or use it (no legacy DocumentParser base class).
# progress_callback is also not used. Both may arrive as a positional arg
# (consumer) or a keyword arg (views); *args absorbs the positional form,
# kwargs.pop handles the keyword form. Phase 4 will replace this signal
# path with the new ParserRegistry so the shim can be removed at that point.
kwargs.pop("logging_group", None)
kwargs.pop("progress_callback", None)
return RasterisedDocumentParser(*args, **kwargs)
def tesseract_consumer_declaration(sender, **kwargs):
def tesseract_consumer_declaration(sender: Any, **kwargs: Any) -> dict[str, Any]:
return {
"parser": get_parser,
"weight": 0,

View File

@@ -1,924 +0,0 @@
import shutil
import tempfile
import unicodedata
import uuid
from pathlib import Path
from unittest import mock
from django.test import TestCase
from django.test import override_settings
from ocrmypdf import SubprocessOutputError
from documents.parsers import ParseError
from documents.parsers import run_convert
from documents.tests.utils import DirectoriesMixin
from documents.tests.utils import FileSystemAssertsMixin
from paperless_tesseract.parsers import RasterisedDocumentParser
from paperless_tesseract.parsers import post_process_text
class TestParser(DirectoriesMixin, FileSystemAssertsMixin, TestCase):
SAMPLE_FILES = Path(__file__).resolve().parent / "samples"
def assertContainsStrings(self, content, strings) -> None:
# Asserts that all strings appear in content, in the given order.
indices = []
for s in strings:
if s in content:
indices.append(content.index(s))
else:
self.fail(f"'{s}' is not in '{content}'")
self.assertListEqual(indices, sorted(indices))
def test_post_process_text(self) -> None:
text_cases = [
("simple string", "simple string"),
("simple newline\n testing string", "simple newline\ntesting string"),
(
"utf-8 строка с пробелами в конце ",
"utf-8 строка с пробелами в конце",
),
]
for source, result in text_cases:
actual_result = post_process_text(source)
self.assertEqual(
result,
actual_result,
f"strip_exceess_whitespace({source}) != '{result}', but '{actual_result}'",
)
def test_get_text_from_pdf(self) -> None:
parser = RasterisedDocumentParser(uuid.uuid4())
text = parser.extract_text(
None,
self.SAMPLE_FILES / "simple-digital.pdf",
)
self.assertContainsStrings(text.strip(), ["This is a test document."])
def test_get_page_count(self) -> None:
"""
GIVEN:
- PDF file with a single page
- PDF file with multiple pages
WHEN:
- The number of pages is requested
THEN:
- The method returns 1 as the expected number of pages
- The method returns the correct number of pages (6)
"""
parser = RasterisedDocumentParser(uuid.uuid4())
page_count = parser.get_page_count(
str(self.SAMPLE_FILES / "simple-digital.pdf"),
"application/pdf",
)
self.assertEqual(page_count, 1)
page_count = parser.get_page_count(
str(self.SAMPLE_FILES / "multi-page-mixed.pdf"),
"application/pdf",
)
self.assertEqual(page_count, 6)
def test_get_page_count_password_protected(self) -> None:
"""
GIVEN:
- Password protected PDF file
WHEN:
- The number of pages is requested
THEN:
- The method returns None
"""
parser = RasterisedDocumentParser(uuid.uuid4())
with self.assertLogs("paperless.parsing.tesseract", level="WARNING") as cm:
page_count = parser.get_page_count(
str(self.SAMPLE_FILES / "password-protected.pdf"),
"application/pdf",
)
self.assertEqual(page_count, None)
self.assertIn("Unable to determine PDF page count", cm.output[0])
def test_thumbnail(self) -> None:
parser = RasterisedDocumentParser(uuid.uuid4())
thumb = parser.get_thumbnail(
str(self.SAMPLE_FILES / "simple-digital.pdf"),
"application/pdf",
)
self.assertIsFile(thumb)
@mock.patch("documents.parsers.run_convert")
def test_thumbnail_fallback(self, m) -> None:
def call_convert(input_file, output_file, **kwargs) -> None:
if ".pdf" in str(input_file):
raise ParseError("Does not compute.")
else:
run_convert(input_file=input_file, output_file=output_file, **kwargs)
m.side_effect = call_convert
parser = RasterisedDocumentParser(uuid.uuid4())
thumb = parser.get_thumbnail(
str(self.SAMPLE_FILES / "simple-digital.pdf"),
"application/pdf",
)
self.assertIsFile(thumb)
def test_thumbnail_encrypted(self) -> None:
parser = RasterisedDocumentParser(uuid.uuid4())
thumb = parser.get_thumbnail(
str(self.SAMPLE_FILES / "encrypted.pdf"),
"application/pdf",
)
self.assertIsFile(thumb)
def test_get_dpi(self) -> None:
parser = RasterisedDocumentParser(None)
dpi = parser.get_dpi(str(self.SAMPLE_FILES / "simple-no-dpi.png"))
self.assertEqual(dpi, None)
dpi = parser.get_dpi(str(self.SAMPLE_FILES / "simple.png"))
self.assertEqual(dpi, 72)
def test_simple_digital(self) -> None:
parser = RasterisedDocumentParser(None)
parser.parse(
str(self.SAMPLE_FILES / "simple-digital.pdf"),
"application/pdf",
)
self.assertIsFile(parser.archive_path)
self.assertContainsStrings(parser.get_text(), ["This is a test document."])
def test_with_form(self) -> None:
parser = RasterisedDocumentParser(None)
parser.parse(
str(self.SAMPLE_FILES / "with-form.pdf"),
"application/pdf",
)
self.assertIsFile(parser.archive_path)
self.assertContainsStrings(
parser.get_text(),
["Please enter your name in here:", "This is a PDF document with a form."],
)
@override_settings(OCR_MODE="redo")
def test_with_form_error(self) -> None:
parser = RasterisedDocumentParser(None)
parser.parse(
str(self.SAMPLE_FILES / "with-form.pdf"),
"application/pdf",
)
self.assertIsNone(parser.archive_path)
self.assertContainsStrings(
parser.get_text(),
["Please enter your name in here:", "This is a PDF document with a form."],
)
@override_settings(OCR_MODE="skip")
def test_signed(self) -> None:
parser = RasterisedDocumentParser(None)
parser.parse(str(self.SAMPLE_FILES / "signed.pdf"), "application/pdf")
self.assertIsNone(parser.archive_path)
self.assertContainsStrings(
parser.get_text(),
[
"This is a digitally signed PDF, created with Acrobat Pro for the Paperless project to enable",
"automated testing of signed/encrypted PDFs",
],
)
@override_settings(OCR_MODE="skip")
def test_encrypted(self) -> None:
parser = RasterisedDocumentParser(None)
parser.parse(
str(self.SAMPLE_FILES / "encrypted.pdf"),
"application/pdf",
)
self.assertIsNone(parser.archive_path)
self.assertEqual(parser.get_text(), "")
@override_settings(OCR_MODE="redo")
def test_with_form_error_notext(self) -> None:
parser = RasterisedDocumentParser(None)
parser.parse(
str(self.SAMPLE_FILES / "with-form.pdf"),
"application/pdf",
)
self.assertContainsStrings(
parser.get_text(),
["Please enter your name in here:", "This is a PDF document with a form."],
)
@override_settings(OCR_MODE="force")
def test_with_form_force(self) -> None:
parser = RasterisedDocumentParser(None)
parser.parse(
str(self.SAMPLE_FILES / "with-form.pdf"),
"application/pdf",
)
self.assertContainsStrings(
parser.get_text(),
["Please enter your name in here:", "This is a PDF document with a form."],
)
def test_image_simple(self) -> None:
parser = RasterisedDocumentParser(None)
parser.parse(str(self.SAMPLE_FILES / "simple.png"), "image/png")
self.assertIsFile(parser.archive_path)
self.assertContainsStrings(parser.get_text(), ["This is a test document."])
def test_image_simple_alpha(self) -> None:
parser = RasterisedDocumentParser(None)
with tempfile.TemporaryDirectory() as tempdir:
# Copy sample file to temp directory, as the parsing changes the file
# and this makes it modified to Git
sample_file = self.SAMPLE_FILES / "simple-alpha.png"
dest_file = Path(tempdir) / "simple-alpha.png"
shutil.copy(sample_file, dest_file)
parser.parse(str(dest_file), "image/png")
self.assertIsFile(parser.archive_path)
self.assertContainsStrings(parser.get_text(), ["This is a test document."])
def test_image_calc_a4_dpi(self) -> None:
parser = RasterisedDocumentParser(None)
dpi = parser.calculate_a4_dpi(
str(self.SAMPLE_FILES / "simple-no-dpi.png"),
)
self.assertEqual(dpi, 62)
@mock.patch("paperless_tesseract.parsers.RasterisedDocumentParser.calculate_a4_dpi")
def test_image_dpi_fail(self, m) -> None:
m.return_value = None
parser = RasterisedDocumentParser(None)
def f() -> None:
parser.parse(
str(self.SAMPLE_FILES / "simple-no-dpi.png"),
"image/png",
)
self.assertRaises(ParseError, f)
@override_settings(OCR_IMAGE_DPI=72, MAX_IMAGE_PIXELS=0)
def test_image_no_dpi_default(self) -> None:
parser = RasterisedDocumentParser(None)
parser.parse(str(self.SAMPLE_FILES / "simple-no-dpi.png"), "image/png")
self.assertIsFile(parser.archive_path)
self.assertContainsStrings(
parser.get_text().lower(),
["this is a test document."],
)
def test_multi_page(self) -> None:
parser = RasterisedDocumentParser(None)
parser.parse(
str(self.SAMPLE_FILES / "multi-page-digital.pdf"),
"application/pdf",
)
self.assertIsFile(parser.archive_path)
self.assertContainsStrings(
parser.get_text().lower(),
["page 1", "page 2", "page 3"],
)
@override_settings(OCR_PAGES=2, OCR_MODE="skip")
def test_multi_page_pages_skip(self) -> None:
parser = RasterisedDocumentParser(None)
parser.parse(
str(self.SAMPLE_FILES / "multi-page-digital.pdf"),
"application/pdf",
)
self.assertIsFile(parser.archive_path)
self.assertContainsStrings(
parser.get_text().lower(),
["page 1", "page 2", "page 3"],
)
@override_settings(OCR_PAGES=2, OCR_MODE="redo")
def test_multi_page_pages_redo(self) -> None:
parser = RasterisedDocumentParser(None)
parser.parse(
str(self.SAMPLE_FILES / "multi-page-digital.pdf"),
"application/pdf",
)
self.assertIsFile(parser.archive_path)
self.assertContainsStrings(
parser.get_text().lower(),
["page 1", "page 2", "page 3"],
)
@override_settings(OCR_PAGES=2, OCR_MODE="force")
def test_multi_page_pages_force(self) -> None:
parser = RasterisedDocumentParser(None)
parser.parse(
str(self.SAMPLE_FILES / "multi-page-digital.pdf"),
"application/pdf",
)
self.assertIsFile(parser.archive_path)
self.assertContainsStrings(
parser.get_text().lower(),
["page 1", "page 2", "page 3"],
)
@override_settings(OCR_MODE="skip")
def test_multi_page_analog_pages_skip(self) -> None:
parser = RasterisedDocumentParser(None)
parser.parse(
str(self.SAMPLE_FILES / "multi-page-images.pdf"),
"application/pdf",
)
self.assertIsFile(parser.archive_path)
self.assertContainsStrings(
parser.get_text().lower(),
["page 1", "page 2", "page 3"],
)
@override_settings(OCR_PAGES=2, OCR_MODE="redo")
def test_multi_page_analog_pages_redo(self) -> None:
"""
GIVEN:
- File with text contained in images but no text layer
- OCR of only pages 1 and 2 requested
- OCR mode set to redo
WHEN:
- Document is parsed
THEN:
- Text of page 1 and 2 extracted
- An archive file is created
"""
parser = RasterisedDocumentParser(None)
parser.parse(
str(self.SAMPLE_FILES / "multi-page-images.pdf"),
"application/pdf",
)
self.assertIsFile(parser.archive_path)
self.assertContainsStrings(parser.get_text().lower(), ["page 1", "page 2"])
self.assertNotIn("page 3", parser.get_text().lower())
@override_settings(OCR_PAGES=1, OCR_MODE="force")
def test_multi_page_analog_pages_force(self) -> None:
"""
GIVEN:
- File with text contained in images but no text layer
- OCR of only page 1 requested
- OCR mode set to force
WHEN:
- Document is parsed
THEN:
- Only text of page 1 is extracted
- An archive file is created
"""
parser = RasterisedDocumentParser(None)
parser.parse(
str(self.SAMPLE_FILES / "multi-page-images.pdf"),
"application/pdf",
)
self.assertIsFile(parser.archive_path)
self.assertContainsStrings(parser.get_text().lower(), ["page 1"])
self.assertNotIn("page 2", parser.get_text().lower())
self.assertNotIn("page 3", parser.get_text().lower())
@override_settings(OCR_MODE="skip_noarchive")
def test_skip_noarchive_withtext(self) -> None:
"""
GIVEN:
- File with existing text layer
- OCR mode set to skip_noarchive
WHEN:
- Document is parsed
THEN:
- Text from images is extracted
- No archive file is created
"""
parser = RasterisedDocumentParser(None)
parser.parse(
str(self.SAMPLE_FILES / "multi-page-digital.pdf"),
"application/pdf",
)
self.assertIsNone(parser.archive_path)
self.assertContainsStrings(
parser.get_text().lower(),
["page 1", "page 2", "page 3"],
)
@override_settings(OCR_MODE="skip_noarchive")
def test_skip_noarchive_notext(self) -> None:
"""
GIVEN:
- File with text contained in images but no text layer
- OCR mode set to skip_noarchive
WHEN:
- Document is parsed
THEN:
- Text from images is extracted
- An archive file is created with the OCRd text
"""
parser = RasterisedDocumentParser(None)
parser.parse(
str(self.SAMPLE_FILES / "multi-page-images.pdf"),
"application/pdf",
)
self.assertContainsStrings(
parser.get_text().lower(),
["page 1", "page 2", "page 3"],
)
self.assertIsNotNone(parser.archive_path)
@override_settings(OCR_SKIP_ARCHIVE_FILE="never")
def test_skip_archive_never_withtext(self) -> None:
"""
GIVEN:
- File with existing text layer
- OCR_SKIP_ARCHIVE_FILE set to never
WHEN:
- Document is parsed
THEN:
- Text from text layer is extracted
- Archive file is created
"""
parser = RasterisedDocumentParser(None)
parser.parse(
str(self.SAMPLE_FILES / "multi-page-digital.pdf"),
"application/pdf",
)
self.assertIsNotNone(parser.archive_path)
self.assertContainsStrings(
parser.get_text().lower(),
["page 1", "page 2", "page 3"],
)
@override_settings(OCR_SKIP_ARCHIVE_FILE="never")
def test_skip_archive_never_withimages(self) -> None:
"""
GIVEN:
- File with text contained in images but no text layer
- OCR_SKIP_ARCHIVE_FILE set to never
WHEN:
- Document is parsed
THEN:
- Text from images is extracted
- Archive file is created
"""
parser = RasterisedDocumentParser(None)
parser.parse(
str(self.SAMPLE_FILES / "multi-page-images.pdf"),
"application/pdf",
)
self.assertIsNotNone(parser.archive_path)
self.assertContainsStrings(
parser.get_text().lower(),
["page 1", "page 2", "page 3"],
)
@override_settings(OCR_SKIP_ARCHIVE_FILE="with_text")
def test_skip_archive_withtext_withtext(self) -> None:
"""
GIVEN:
- File with existing text layer
- OCR_SKIP_ARCHIVE_FILE set to with_text
WHEN:
- Document is parsed
THEN:
- Text from text layer is extracted
- No archive file is created
"""
parser = RasterisedDocumentParser(None)
parser.parse(
str(self.SAMPLE_FILES / "multi-page-digital.pdf"),
"application/pdf",
)
self.assertIsNone(parser.archive_path)
self.assertContainsStrings(
parser.get_text().lower(),
["page 1", "page 2", "page 3"],
)
@override_settings(OCR_SKIP_ARCHIVE_FILE="with_text")
def test_skip_archive_withtext_withimages(self) -> None:
"""
GIVEN:
- File with text contained in images but no text layer
- OCR_SKIP_ARCHIVE_FILE set to with_text
WHEN:
- Document is parsed
THEN:
- Text from images is extracted
- Archive file is created
"""
parser = RasterisedDocumentParser(None)
parser.parse(
str(self.SAMPLE_FILES / "multi-page-images.pdf"),
"application/pdf",
)
self.assertIsNotNone(parser.archive_path)
self.assertContainsStrings(
parser.get_text().lower(),
["page 1", "page 2", "page 3"],
)
@override_settings(OCR_SKIP_ARCHIVE_FILE="always")
def test_skip_archive_always_withtext(self) -> None:
"""
GIVEN:
- File with existing text layer
- OCR_SKIP_ARCHIVE_FILE set to always
WHEN:
- Document is parsed
THEN:
- Text from text layer is extracted
- No archive file is created
"""
parser = RasterisedDocumentParser(None)
parser.parse(
str(self.SAMPLE_FILES / "multi-page-digital.pdf"),
"application/pdf",
)
self.assertIsNone(parser.archive_path)
self.assertContainsStrings(
parser.get_text().lower(),
["page 1", "page 2", "page 3"],
)
@override_settings(OCR_SKIP_ARCHIVE_FILE="always")
def test_skip_archive_always_withimages(self) -> None:
"""
GIVEN:
- File with text contained in images but no text layer
- OCR_SKIP_ARCHIVE_FILE set to always
WHEN:
- Document is parsed
THEN:
- Text from images is extracted
- No archive file is created
"""
parser = RasterisedDocumentParser(None)
parser.parse(
str(self.SAMPLE_FILES / "multi-page-images.pdf"),
"application/pdf",
)
self.assertIsNone(parser.archive_path)
self.assertContainsStrings(
parser.get_text().lower(),
["page 1", "page 2", "page 3"],
)
@override_settings(OCR_MODE="skip")
def test_multi_page_mixed(self) -> None:
"""
GIVEN:
- File with some text contained in images and some in text layer
- OCR mode set to skip
WHEN:
- Document is parsed
THEN:
- Text from images is extracted
- An archive file is created with the OCRd text and the original text
"""
parser = RasterisedDocumentParser(None)
parser.parse(
str(self.SAMPLE_FILES / "multi-page-mixed.pdf"),
"application/pdf",
)
self.assertIsNotNone(parser.archive_path)
self.assertIsFile(parser.archive_path)
self.assertContainsStrings(
parser.get_text().lower(),
["page 1", "page 2", "page 3", "page 4", "page 5", "page 6"],
)
with (parser.tempdir / "sidecar.txt").open() as f:
sidecar = f.read()
self.assertIn("[OCR skipped on page(s) 4-6]", sidecar)
@override_settings(OCR_MODE="redo")
def test_single_page_mixed(self) -> None:
"""
GIVEN:
- File with some text contained in images and some in text layer
- Text and images are mixed on the same page
- OCR mode set to redo
WHEN:
- Document is parsed
THEN:
- Text from images is extracted
- Full content of the file is parsed (not just the image text)
- An archive file is created with the OCRd text and the original text
"""
parser = RasterisedDocumentParser(None)
parser.parse(
str(self.SAMPLE_FILES / "single-page-mixed.pdf"),
"application/pdf",
)
self.assertIsNotNone(parser.archive_path)
self.assertIsFile(parser.archive_path)
self.assertContainsStrings(
parser.get_text().lower(),
[
"this is some normal text, present on page 1 of the document.",
"this is some text, but in an image, also on page 1.",
"this is further text on page 1.",
],
)
with (parser.tempdir / "sidecar.txt").open() as f:
sidecar = f.read().lower()
self.assertIn("this is some text, but in an image, also on page 1.", sidecar)
self.assertNotIn(
"this is some normal text, present on page 1 of the document.",
sidecar,
)
@override_settings(OCR_MODE="skip_noarchive")
def test_multi_page_mixed_no_archive(self) -> None:
"""
GIVEN:
- File with some text contained in images and some in text layer
- OCR mode set to skip_noarchive
WHEN:
- Document is parsed
THEN:
- Text from images is extracted
- No archive file is created as original file contains text
"""
parser = RasterisedDocumentParser(None)
parser.parse(
str(self.SAMPLE_FILES / "multi-page-mixed.pdf"),
"application/pdf",
)
self.assertIsNone(parser.archive_path)
self.assertContainsStrings(
parser.get_text().lower(),
["page 4", "page 5", "page 6"],
)
@override_settings(OCR_MODE="skip", OCR_ROTATE_PAGES=True)
def test_rotate(self) -> None:
parser = RasterisedDocumentParser(None)
parser.parse(str(self.SAMPLE_FILES / "rotated.pdf"), "application/pdf")
self.assertContainsStrings(
parser.get_text(),
[
"This is the text that appears on the first page. Its a lot of text.",
"Even if the pages are rotated, OCRmyPDF still gets the job done.",
"This is a really weird file with lots of nonsense text.",
"If you read this, its your own fault. Also check your screen orientation.",
],
)
def test_multi_page_tiff(self) -> None:
"""
GIVEN:
- Multi-page TIFF image
WHEN:
- Image is parsed
THEN:
- Text from all pages extracted
"""
parser = RasterisedDocumentParser(None)
parser.parse(
str(self.SAMPLE_FILES / "multi-page-images.tiff"),
"image/tiff",
)
self.assertIsFile(parser.archive_path)
self.assertContainsStrings(
parser.get_text().lower(),
["page 1", "page 2", "page 3"],
)
def test_multi_page_tiff_alpha(self) -> None:
"""
GIVEN:
- Multi-page TIFF image
- Image include an alpha channel
WHEN:
- Image is parsed
THEN:
- Text from all pages extracted
"""
parser = RasterisedDocumentParser(None)
sample_file = self.SAMPLE_FILES / "multi-page-images-alpha.tiff"
with tempfile.NamedTemporaryFile() as tmp_file:
shutil.copy(sample_file, tmp_file.name)
parser.parse(
tmp_file.name,
"image/tiff",
)
self.assertIsFile(parser.archive_path)
self.assertContainsStrings(
parser.get_text().lower(),
["page 1", "page 2", "page 3"],
)
def test_multi_page_tiff_alpha_srgb(self) -> None:
"""
GIVEN:
- Multi-page TIFF image
- Image include an alpha channel
- Image is srgb colorspace
WHEN:
- Image is parsed
THEN:
- Text from all pages extracted
"""
parser = RasterisedDocumentParser(None)
sample_file = str(
self.SAMPLE_FILES / "multi-page-images-alpha-rgb.tiff",
)
with tempfile.NamedTemporaryFile() as tmp_file:
shutil.copy(sample_file, tmp_file.name)
parser.parse(
tmp_file.name,
"image/tiff",
)
self.assertIsFile(parser.archive_path)
self.assertContainsStrings(
parser.get_text().lower(),
["page 1", "page 2", "page 3"],
)
def test_ocrmypdf_parameters(self) -> None:
parser = RasterisedDocumentParser(None)
params = parser.construct_ocrmypdf_parameters(
input_file="input.pdf",
output_file="output.pdf",
sidecar_file="sidecar.txt",
mime_type="application/pdf",
safe_fallback=False,
)
self.assertEqual(params["input_file_or_options"], "input.pdf")
self.assertEqual(params["output_file"], "output.pdf")
self.assertEqual(params["sidecar"], "sidecar.txt")
with override_settings(OCR_CLEAN="none"):
parser = RasterisedDocumentParser(None)
params = parser.construct_ocrmypdf_parameters("", "", "", "")
self.assertNotIn("clean", params)
self.assertNotIn("clean_final", params)
with override_settings(OCR_CLEAN="clean"):
parser = RasterisedDocumentParser(None)
params = parser.construct_ocrmypdf_parameters("", "", "", "")
self.assertTrue(params["clean"])
self.assertNotIn("clean_final", params)
with override_settings(OCR_CLEAN="clean-final", OCR_MODE="skip"):
parser = RasterisedDocumentParser(None)
params = parser.construct_ocrmypdf_parameters("", "", "", "")
self.assertTrue(params["clean_final"])
self.assertNotIn("clean", params)
with override_settings(OCR_CLEAN="clean-final", OCR_MODE="redo"):
parser = RasterisedDocumentParser(None)
params = parser.construct_ocrmypdf_parameters("", "", "", "")
self.assertTrue(params["clean"])
self.assertNotIn("clean_final", params)
with override_settings(OCR_DESKEW=True, OCR_MODE="skip"):
parser = RasterisedDocumentParser(None)
params = parser.construct_ocrmypdf_parameters("", "", "", "")
self.assertTrue(params["deskew"])
with override_settings(OCR_DESKEW=True, OCR_MODE="redo"):
parser = RasterisedDocumentParser(None)
params = parser.construct_ocrmypdf_parameters("", "", "", "")
self.assertNotIn("deskew", params)
with override_settings(OCR_DESKEW=False, OCR_MODE="skip"):
parser = RasterisedDocumentParser(None)
params = parser.construct_ocrmypdf_parameters("", "", "", "")
self.assertNotIn("deskew", params)
with override_settings(OCR_MAX_IMAGE_PIXELS=1_000_001.0):
parser = RasterisedDocumentParser(None)
params = parser.construct_ocrmypdf_parameters("", "", "", "")
self.assertIn("max_image_mpixels", params)
self.assertAlmostEqual(params["max_image_mpixels"], 1, places=4)
with override_settings(OCR_MAX_IMAGE_PIXELS=-1_000_001.0):
parser = RasterisedDocumentParser(None)
params = parser.construct_ocrmypdf_parameters("", "", "", "")
self.assertNotIn("max_image_mpixels", params)
def test_rtl_language_detection(self) -> None:
"""
GIVEN:
- File with text in an RTL language
WHEN:
- Document is parsed
THEN:
- Text from the document is extracted
"""
parser = RasterisedDocumentParser(None)
parser.parse(
str(self.SAMPLE_FILES / "rtl-test.pdf"),
"application/pdf",
)
# OCR output for RTL text varies across platforms/versions due to
# bidi controls and presentation forms; normalize before assertion.
normalized_text = "".join(
char
for char in unicodedata.normalize("NFKC", parser.get_text())
if unicodedata.category(char) != "Cf" and not char.isspace()
)
self.assertIn("ةرازو", normalized_text)
self.assertTrue(
any(token in normalized_text for token in ("ةیلخادلا", "الاخليد")),
)
@mock.patch("ocrmypdf.ocr")
def test_gs_rendering_error(self, m) -> None:
m.side_effect = SubprocessOutputError("Ghostscript PDF/A rendering failed")
parser = RasterisedDocumentParser(None)
self.assertRaises(
ParseError,
parser.parse,
str(self.SAMPLE_FILES / "simple-digital.pdf"),
"application/pdf",
)
class TestParserFileTypes(DirectoriesMixin, FileSystemAssertsMixin, TestCase):
SAMPLE_FILES = Path(__file__).parent / "samples"
def test_bmp(self) -> None:
parser = RasterisedDocumentParser(None)
parser.parse(str(self.SAMPLE_FILES / "simple.bmp"), "image/bmp")
self.assertIsFile(parser.archive_path)
self.assertIn("this is a test document", parser.get_text().lower())
def test_jpg(self) -> None:
parser = RasterisedDocumentParser(None)
parser.parse(str(self.SAMPLE_FILES / "simple.jpg"), "image/jpeg")
self.assertIsFile(parser.archive_path)
self.assertIn("this is a test document", parser.get_text().lower())
def test_heic(self) -> None:
parser = RasterisedDocumentParser(None)
parser.parse(str(self.SAMPLE_FILES / "simple.heic"), "image/heic")
self.assertIsFile(parser.archive_path)
self.assertIn("pizza", parser.get_text().lower())
@override_settings(OCR_IMAGE_DPI=200)
def test_gif(self) -> None:
parser = RasterisedDocumentParser(None)
parser.parse(str(self.SAMPLE_FILES / "simple.gif"), "image/gif")
self.assertIsFile(parser.archive_path)
self.assertIn("this is a test document", parser.get_text().lower())
def test_tiff(self) -> None:
parser = RasterisedDocumentParser(None)
parser.parse(str(self.SAMPLE_FILES / "simple.tif"), "image/tiff")
self.assertIsFile(parser.archive_path)
self.assertIn("this is a test document", parser.get_text().lower())
@override_settings(OCR_IMAGE_DPI=72)
def test_webp(self) -> None:
parser = RasterisedDocumentParser(None)
parser.parse(
str(self.SAMPLE_FILES / "document.webp"),
"image/webp",
)
self.assertIsFile(parser.archive_path)
# Older tesseracts consistently mangle the space between "a webp",
# tesseract 5.3.0 seems to do a better job, so we're accepting both
self.assertRegex(
parser.get_text().lower(),
r"this is a ?webp document, created 11/14/2022.",
)