diff --git a/src/documents/tests/test_parsers.py b/src/documents/tests/test_parsers.py index 5383975d1..7be6ad20d 100644 --- a/src/documents/tests/test_parsers.py +++ b/src/documents/tests/test_parsers.py @@ -10,8 +10,8 @@ from documents.parsers import get_parser_class_for_mime_type from documents.parsers import get_supported_file_extensions from documents.parsers import is_file_ext_supported from paperless.parsers.text import TextDocumentParser +from paperless.parsers.tika import TikaDocumentParser from paperless_tesseract.parsers import RasterisedDocumentParser -from paperless_tika.parsers import TikaDocumentParser class TestParserDiscovery(TestCase): diff --git a/src/paperless_tika/tests/test_live_tika.py b/src/paperless/tests/parsers/test_live_tika.py similarity index 77% rename from src/paperless_tika/tests/test_live_tika.py rename to src/paperless/tests/parsers/test_live_tika.py index 8275708b4..87cdd88a5 100644 --- a/src/paperless_tika/tests/test_live_tika.py +++ b/src/paperless/tests/parsers/test_live_tika.py @@ -4,7 +4,7 @@ from pathlib import Path import pytest from documents.tests.utils import util_call_with_backoff -from paperless_tika.parsers import TikaDocumentParser +from paperless.parsers.tika import TikaDocumentParser @pytest.mark.skipif( @@ -42,14 +42,15 @@ class TestTikaParserAgainstServer: ) assert ( - tika_parser.text + tika_parser.get_text() == "This is an ODT test document, created September 14, 2022" ) - assert tika_parser.archive_path is not None - assert b"PDF-" in tika_parser.archive_path.read_bytes()[:10] + archive = tika_parser.get_archive_path() + assert archive is not None + assert b"PDF-" in archive.read_bytes()[:10] # TODO: Unsure what can set the Creation-Date field in a document, enable when possible - # self.assertEqual(tika_parser.date, datetime.datetime(2022, 9, 14)) + # self.assertEqual(tika_parser.get_date(), datetime.datetime(2022, 9, 14)) def test_basic_parse_docx( self, @@ -74,14 +75,15 @@ class TestTikaParserAgainstServer: ) assert ( - tika_parser.text + tika_parser.get_text() == "This is an DOCX test document, also made September 14, 2022" ) - assert tika_parser.archive_path is not None - with Path(tika_parser.archive_path).open("rb") as f: + archive = tika_parser.get_archive_path() + assert archive is not None + with archive.open("rb") as f: assert b"PDF-" in f.read()[:10] - # self.assertEqual(tika_parser.date, datetime.datetime(2022, 9, 14)) + # self.assertEqual(tika_parser.get_date(), datetime.datetime(2022, 9, 14)) def test_basic_parse_doc( self, @@ -102,13 +104,12 @@ class TestTikaParserAgainstServer: [sample_doc_file, "application/msword"], ) - assert tika_parser.text is not None - assert ( - "This is a test document, saved in the older .doc format" - in tika_parser.text - ) - assert tika_parser.archive_path is not None - with Path(tika_parser.archive_path).open("rb") as f: + text = tika_parser.get_text() + assert text is not None + assert "This is a test document, saved in the older .doc format" in text + archive = tika_parser.get_archive_path() + assert archive is not None + with archive.open("rb") as f: assert b"PDF-" in f.read()[:10] def test_tika_fails_multi_part( @@ -133,6 +134,7 @@ class TestTikaParserAgainstServer: [sample_broken_odt, "application/vnd.oasis.opendocument.text"], ) - assert tika_parser.archive_path is not None - with Path(tika_parser.archive_path).open("rb") as f: + archive = tika_parser.get_archive_path() + assert archive is not None + with archive.open("rb") as f: assert b"PDF-" in f.read()[:10]