From 0a9c67e9b1ed746dfba9253fc01363319966eda4 Mon Sep 17 00:00:00 2001 From: Trenton H <797416+stumpylog@users.noreply.github.com> Date: Thu, 12 Mar 2026 15:06:20 -0700 Subject: [PATCH] Chore: move Tika parser and tests to paperless/ MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Move TikaDocumentParser and its tests to the canonical parser package location, matching the pattern established for TextDocumentParser: - src/paperless_tika/parsers.py → src/paperless/parsers/tika.py - src/paperless_tika/tests/test_tika_parser.py → src/paperless/tests/parsers/test_tika_parser.py - src/paperless_tika/tests/samples/ → src/paperless/tests/samples/tika/ Merge tika fixtures (tika_parser, sample_odt_file, sample_docx_file, sample_doc_file, sample_broken_odt) into the shared parsers conftest. Remove the now-empty src/paperless_tika/tests/conftest.py. Content is unchanged — this commit is rename-only so git history is preserved on the moved files. Co-Authored-By: Claude Sonnet 4.6 --- .../parsers.py => paperless/parsers/tika.py} | 0 src/paperless/tests/parsers/conftest.py | 84 ++++++++++++++++++ .../tests/parsers}/test_tika_parser.py | 2 +- .../tests/samples/tika}/multi-part-broken.odt | Bin .../tests/samples/tika}/sample.doc | Bin .../tests/samples/tika}/sample.docx | Bin .../tests/samples/tika}/sample.odt | Bin src/paperless_tika/tests/conftest.py | 41 --------- 8 files changed, 85 insertions(+), 42 deletions(-) rename src/{paperless_tika/parsers.py => paperless/parsers/tika.py} (100%) rename src/{paperless_tika/tests => paperless/tests/parsers}/test_tika_parser.py (100%) rename src/{paperless_tika/tests/samples => paperless/tests/samples/tika}/multi-part-broken.odt (100%) rename src/{paperless_tika/tests/samples => paperless/tests/samples/tika}/sample.doc (100%) rename src/{paperless_tika/tests/samples => paperless/tests/samples/tika}/sample.docx (100%) rename src/{paperless_tika/tests/samples => 
paperless/tests/samples/tika}/sample.odt (100%) delete mode 100644 src/paperless_tika/tests/conftest.py diff --git a/src/paperless_tika/parsers.py b/src/paperless/parsers/tika.py similarity index 100% rename from src/paperless_tika/parsers.py rename to src/paperless/parsers/tika.py diff --git a/src/paperless/tests/parsers/conftest.py b/src/paperless/tests/parsers/conftest.py index 2d5deb684..936058733 100644 --- a/src/paperless/tests/parsers/conftest.py +++ b/src/paperless/tests/parsers/conftest.py @@ -11,6 +11,7 @@ from typing import TYPE_CHECKING import pytest from paperless.parsers.text import TextDocumentParser +from paperless.parsers.tika import TikaDocumentParser if TYPE_CHECKING: from collections.abc import Generator @@ -74,3 +75,86 @@ def text_parser() -> Generator[TextDocumentParser, None, None]: """ with TextDocumentParser() as parser: yield parser + + +# ------------------------------------------------------------------ +# Tika parser sample files +# ------------------------------------------------------------------ + + +@pytest.fixture(scope="session") +def tika_samples_dir(samples_dir: Path) -> Path: + """Absolute path to the Tika parser sample files directory. + + Returns + ------- + Path + ``<samples_dir>/tika/`` + """ + return samples_dir / "tika" + + +@pytest.fixture(scope="session") +def sample_odt_file(tika_samples_dir: Path) -> Path: + """Path to a sample ODT file. + + Returns + ------- + Path + Absolute path to ``tika/sample.odt``. + """ + return tika_samples_dir / "sample.odt" + + +@pytest.fixture(scope="session") +def sample_docx_file(tika_samples_dir: Path) -> Path: + """Path to a sample DOCX file. + + Returns + ------- + Path + Absolute path to ``tika/sample.docx``. + """ + return tika_samples_dir / "sample.docx" + + +@pytest.fixture(scope="session") +def sample_doc_file(tika_samples_dir: Path) -> Path: + """Path to a sample DOC file. + + Returns + ------- + Path + Absolute path to ``tika/sample.doc``. 
+ """ + return tika_samples_dir / "sample.doc" + + +@pytest.fixture(scope="session") +def sample_broken_odt(tika_samples_dir: Path) -> Path: + """Path to a broken ODT file that triggers the multi-part fallback. + + Returns + ------- + Path + Absolute path to ``tika/multi-part-broken.odt``. + """ + return tika_samples_dir / "multi-part-broken.odt" + + +# ------------------------------------------------------------------ +# Tika parser instance +# ------------------------------------------------------------------ + + +@pytest.fixture() +def tika_parser() -> Generator[TikaDocumentParser, None, None]: + """Yield a TikaDocumentParser and clean up its temporary directory afterwards. + + Yields + ------ + TikaDocumentParser + A ready-to-use parser instance. + """ + with TikaDocumentParser() as parser: + yield parser diff --git a/src/paperless_tika/tests/test_tika_parser.py b/src/paperless/tests/parsers/test_tika_parser.py similarity index 100% rename from src/paperless_tika/tests/test_tika_parser.py rename to src/paperless/tests/parsers/test_tika_parser.py index e1c71c131..2cf39da59 100644 --- a/src/paperless_tika/tests/test_tika_parser.py +++ b/src/paperless/tests/parsers/test_tika_parser.py @@ -5,11 +5,11 @@ from pathlib import Path import pytest from httpx import codes +from paperless_tika.parsers import TikaDocumentParser from pytest_django.fixtures import SettingsWrapper from pytest_httpx import HTTPXMock from documents.parsers import ParseError -from paperless_tika.parsers import TikaDocumentParser @pytest.mark.django_db() diff --git a/src/paperless_tika/tests/samples/multi-part-broken.odt b/src/paperless/tests/samples/tika/multi-part-broken.odt similarity index 100% rename from src/paperless_tika/tests/samples/multi-part-broken.odt rename to src/paperless/tests/samples/tika/multi-part-broken.odt diff --git a/src/paperless_tika/tests/samples/sample.doc b/src/paperless/tests/samples/tika/sample.doc similarity index 100% rename from 
src/paperless_tika/tests/samples/sample.doc rename to src/paperless/tests/samples/tika/sample.doc diff --git a/src/paperless_tika/tests/samples/sample.docx b/src/paperless/tests/samples/tika/sample.docx similarity index 100% rename from src/paperless_tika/tests/samples/sample.docx rename to src/paperless/tests/samples/tika/sample.docx diff --git a/src/paperless_tika/tests/samples/sample.odt b/src/paperless/tests/samples/tika/sample.odt similarity index 100% rename from src/paperless_tika/tests/samples/sample.odt rename to src/paperless/tests/samples/tika/sample.odt diff --git a/src/paperless_tika/tests/conftest.py b/src/paperless_tika/tests/conftest.py deleted file mode 100644 index 5a54dae95..000000000 --- a/src/paperless_tika/tests/conftest.py +++ /dev/null @@ -1,41 +0,0 @@ -from collections.abc import Generator -from pathlib import Path - -import pytest - -from paperless_tika.parsers import TikaDocumentParser - - -@pytest.fixture() -def tika_parser() -> Generator[TikaDocumentParser, None, None]: - try: - parser = TikaDocumentParser(logging_group=None) - yield parser - finally: - # TODO(stumpylog): Cleanup once all parsers are handled - parser.cleanup() - - -@pytest.fixture(scope="session") -def sample_dir() -> Path: - return (Path(__file__).parent / Path("samples")).resolve() - - -@pytest.fixture(scope="session") -def sample_odt_file(sample_dir: Path) -> Path: - return sample_dir / "sample.odt" - - -@pytest.fixture(scope="session") -def sample_docx_file(sample_dir: Path) -> Path: - return sample_dir / "sample.docx" - - -@pytest.fixture(scope="session") -def sample_doc_file(sample_dir: Path) -> Path: - return sample_dir / "sample.doc" - - -@pytest.fixture(scope="session") -def sample_broken_odt(sample_dir: Path) -> Path: - return sample_dir / "multi-part-broken.odt"