From 8115332cc93e3b0eaf49c8febdf60a0f4061de44 Mon Sep 17 00:00:00 2001 From: Trenton H <797416+stumpylog@users.noreply.github.com> Date: Fri, 3 Apr 2026 09:05:21 -0700 Subject: [PATCH] Tests and fix a bug with the img2pdf functionality --- src/paperless/parsers/tesseract.py | 6 +- .../parsers/test_convert_image_to_pdfa.py | 141 ++++++++++++++++++ 2 files changed, 144 insertions(+), 3 deletions(-) create mode 100644 src/paperless/tests/parsers/test_convert_image_to_pdfa.py diff --git a/src/paperless/parsers/tesseract.py b/src/paperless/parsers/tesseract.py index ccdcc3ecb..e1f33c3af 100644 --- a/src/paperless/parsers/tesseract.py +++ b/src/paperless/parsers/tesseract.py @@ -393,13 +393,13 @@ class RasterisedDocumentParser: plain_pdf_path = Path(self.tempdir) / "image_plain.pdf" try: - layout_fun = None + convert_kwargs: dict = {} if self.settings.image_dpi is not None: - layout_fun = img2pdf.get_fixed_dpi_layout_fun( + convert_kwargs["layout_fun"] = img2pdf.get_fixed_dpi_layout_fun( (self.settings.image_dpi, self.settings.image_dpi), ) plain_pdf_path.write_bytes( - img2pdf.convert(str(document_path), layout_fun=layout_fun), + img2pdf.convert(str(document_path), **convert_kwargs), ) except Exception as e: raise ParseError( diff --git a/src/paperless/tests/parsers/test_convert_image_to_pdfa.py b/src/paperless/tests/parsers/test_convert_image_to_pdfa.py new file mode 100644 index 000000000..615900a25 --- /dev/null +++ b/src/paperless/tests/parsers/test_convert_image_to_pdfa.py @@ -0,0 +1,141 @@ +""" +Tests for RasterisedDocumentParser._convert_image_to_pdfa. + +The method converts an image to a PDF/A-2b file using img2pdf (wrapping) +then pikepdf (PDF/A metadata stamping), with a fallback to plain PDF when +pikepdf stamping fails. No Tesseract or Ghostscript is invoked. + +These are unit/integration tests: img2pdf and pikepdf run for real; only +error-path branches mock the respective library call. +""" + +from __future__ import annotations + +from pathlib import Path +from typing import TYPE_CHECKING + +import img2pdf +import magic +import pikepdf +import pytest + +from documents.parsers import ParseError + +if TYPE_CHECKING: + from pytest_mock import MockerFixture + + from paperless.parsers.tesseract import RasterisedDocumentParser + + +class TestConvertImageToPdfa: + """_convert_image_to_pdfa: output shape, error paths, DPI handling.""" + + def test_valid_png_produces_pdf_bytes( + self, + tesseract_parser: RasterisedDocumentParser, + simple_png_file: Path, + ) -> None: + """ + GIVEN: a valid PNG with DPI metadata + WHEN: _convert_image_to_pdfa is called + THEN: the returned file is non-empty and begins with the PDF magic bytes + """ + result = tesseract_parser._convert_image_to_pdfa(simple_png_file) + + assert result.exists() + assert magic.from_file(str(result), mime=True) == "application/pdf" + + def test_output_path_is_archive_pdf_in_tempdir( + self, + tesseract_parser: RasterisedDocumentParser, + simple_png_file: Path, + ) -> None: + """ + GIVEN: any valid image + WHEN: _convert_image_to_pdfa is called + THEN: the returned path is exactly /archive.pdf + """ + result = tesseract_parser._convert_image_to_pdfa(simple_png_file) + + assert result == Path(tesseract_parser.tempdir) / "archive.pdf" + + def test_img2pdf_failure_raises_parse_error( + self, + mocker: MockerFixture, + tesseract_parser: RasterisedDocumentParser, + simple_png_file: Path, + ) -> None: + """ + GIVEN: img2pdf.convert raises an exception + WHEN: _convert_image_to_pdfa is called + THEN: a ParseError is raised that mentions "img2pdf conversion failed" + """ + mocker.patch.object(img2pdf, "convert", side_effect=Exception("boom")) + + with pytest.raises(ParseError, match="img2pdf conversion failed"): + tesseract_parser._convert_image_to_pdfa(simple_png_file) + + def test_pikepdf_stamping_failure_falls_back_to_plain_pdf( + self, + mocker: MockerFixture, + tesseract_parser: RasterisedDocumentParser, + simple_png_file: Path, + ) -> None: + """ + GIVEN: pikepdf.open raises during PDF/A metadata stamping + WHEN: _convert_image_to_pdfa is called + THEN: no exception is raised and the returned file is still a valid PDF + (plain PDF bytes are used as fallback) + """ + mocker.patch.object(pikepdf, "open", side_effect=Exception("pikepdf boom")) + + result = tesseract_parser._convert_image_to_pdfa(simple_png_file) + + assert result.exists() + assert magic.from_file(str(result), mime=True) == "application/pdf" + + def test_image_dpi_setting_applies_fixed_dpi_layout( + self, + mocker: MockerFixture, + tesseract_parser: RasterisedDocumentParser, + simple_no_dpi_png_file: Path, + ) -> None: + """ + GIVEN: parser.settings.image_dpi = 150 + WHEN: _convert_image_to_pdfa is called with a no-DPI PNG + THEN: img2pdf.get_fixed_dpi_layout_fun is called with (150, 150) + and the output is still a valid PDF + """ + spy = mocker.patch.object( + img2pdf, + "get_fixed_dpi_layout_fun", + wraps=img2pdf.get_fixed_dpi_layout_fun, + ) + tesseract_parser.settings.image_dpi = 150 + + result = tesseract_parser._convert_image_to_pdfa(simple_no_dpi_png_file) + + spy.assert_called_once_with((150, 150)) + assert magic.from_file(str(result), mime=True) == "application/pdf" + + def test_no_image_dpi_setting_skips_fixed_dpi_layout( + self, + mocker: MockerFixture, + tesseract_parser: RasterisedDocumentParser, + simple_png_file: Path, + ) -> None: + """ + GIVEN: parser.settings.image_dpi is None (default) + WHEN: _convert_image_to_pdfa is called + THEN: img2pdf.get_fixed_dpi_layout_fun is never called + """ + spy = mocker.patch.object( + img2pdf, + "get_fixed_dpi_layout_fun", + wraps=img2pdf.get_fixed_dpi_layout_fun, + ) + tesseract_parser.settings.image_dpi = None + + tesseract_parser._convert_image_to_pdfa(simple_png_file) + + spy.assert_not_called()