Compare commits
6 Commits
feature-ma
...
feature-te
| Author | SHA1 | Date | |
|---|---|---|---|
|
|
e24a2d8214 | ||
|
|
8e3dfcb4ee | ||
|
|
1b45e4d029 | ||
|
|
6b279e9368 | ||
|
|
97bc53ccdc | ||
|
|
80fa4f6f12 |
@@ -248,9 +248,7 @@ lint.per-file-ignores."docker/wait-for-redis.py" = [
|
|||||||
lint.per-file-ignores."src/documents/models.py" = [
|
lint.per-file-ignores."src/documents/models.py" = [
|
||||||
"SIM115",
|
"SIM115",
|
||||||
]
|
]
|
||||||
lint.per-file-ignores."src/paperless_tesseract/tests/test_parser.py" = [
|
|
||||||
"RUF001",
|
|
||||||
]
|
|
||||||
lint.isort.force-single-line = true
|
lint.isort.force-single-line = true
|
||||||
|
|
||||||
[tool.codespell]
|
[tool.codespell]
|
||||||
|
|||||||
@@ -54,6 +54,7 @@ from documents.utils import run_subprocess
|
|||||||
from paperless.parsers import ParserContext
|
from paperless.parsers import ParserContext
|
||||||
from paperless.parsers.mail import MailDocumentParser
|
from paperless.parsers.mail import MailDocumentParser
|
||||||
from paperless.parsers.remote import RemoteDocumentParser
|
from paperless.parsers.remote import RemoteDocumentParser
|
||||||
|
from paperless.parsers.tesseract import RasterisedDocumentParser
|
||||||
from paperless.parsers.text import TextDocumentParser
|
from paperless.parsers.text import TextDocumentParser
|
||||||
from paperless.parsers.tika import TikaDocumentParser
|
from paperless.parsers.tika import TikaDocumentParser
|
||||||
|
|
||||||
@@ -74,6 +75,7 @@ def _parser_cleanup(parser: DocumentParser) -> None:
|
|||||||
parser,
|
parser,
|
||||||
(
|
(
|
||||||
MailDocumentParser,
|
MailDocumentParser,
|
||||||
|
RasterisedDocumentParser,
|
||||||
RemoteDocumentParser,
|
RemoteDocumentParser,
|
||||||
TextDocumentParser,
|
TextDocumentParser,
|
||||||
TikaDocumentParser,
|
TikaDocumentParser,
|
||||||
@@ -463,6 +465,7 @@ class ConsumerPlugin(
|
|||||||
document_parser,
|
document_parser,
|
||||||
(
|
(
|
||||||
MailDocumentParser,
|
MailDocumentParser,
|
||||||
|
RasterisedDocumentParser,
|
||||||
RemoteDocumentParser,
|
RemoteDocumentParser,
|
||||||
TextDocumentParser,
|
TextDocumentParser,
|
||||||
TikaDocumentParser,
|
TikaDocumentParser,
|
||||||
|
|||||||
@@ -4,6 +4,11 @@ import shutil
|
|||||||
from documents.management.commands.base import PaperlessCommand
|
from documents.management.commands.base import PaperlessCommand
|
||||||
from documents.models import Document
|
from documents.models import Document
|
||||||
from documents.parsers import get_parser_class_for_mime_type
|
from documents.parsers import get_parser_class_for_mime_type
|
||||||
|
from paperless.parsers.mail import MailDocumentParser
|
||||||
|
from paperless.parsers.remote import RemoteDocumentParser
|
||||||
|
from paperless.parsers.tesseract import RasterisedDocumentParser
|
||||||
|
from paperless.parsers.text import TextDocumentParser
|
||||||
|
from paperless.parsers.tika import TikaDocumentParser
|
||||||
|
|
||||||
logger = logging.getLogger("paperless.management.thumbnails")
|
logger = logging.getLogger("paperless.management.thumbnails")
|
||||||
|
|
||||||
@@ -22,16 +27,38 @@ def _process_document(doc_id: int) -> None:
|
|||||||
|
|
||||||
parser = parser_class(logging_group=None)
|
parser = parser_class(logging_group=None)
|
||||||
|
|
||||||
|
parser_is_new_style = isinstance(
|
||||||
|
parser,
|
||||||
|
(
|
||||||
|
MailDocumentParser,
|
||||||
|
RasterisedDocumentParser,
|
||||||
|
RemoteDocumentParser,
|
||||||
|
TextDocumentParser,
|
||||||
|
TikaDocumentParser,
|
||||||
|
),
|
||||||
|
)
|
||||||
|
|
||||||
|
# TODO(stumpylog): Remove branch in the future when all parsers use new protocol
|
||||||
|
if parser_is_new_style:
|
||||||
|
parser.__enter__()
|
||||||
|
|
||||||
try:
|
try:
|
||||||
thumb = parser.get_thumbnail(
|
# TODO(stumpylog): Remove branch in the future when all parsers use new protocol
|
||||||
document.source_path,
|
if parser_is_new_style:
|
||||||
document.mime_type,
|
thumb = parser.get_thumbnail(document.source_path, document.mime_type)
|
||||||
document.get_public_filename(),
|
else:
|
||||||
)
|
thumb = parser.get_thumbnail(
|
||||||
|
document.source_path,
|
||||||
|
document.mime_type,
|
||||||
|
document.get_public_filename(),
|
||||||
|
)
|
||||||
shutil.move(thumb, document.thumbnail_path)
|
shutil.move(thumb, document.thumbnail_path)
|
||||||
finally:
|
finally:
|
||||||
# TODO(stumpylog): Cleanup once all parsers are handled
|
# TODO(stumpylog): Cleanup once all parsers are handled
|
||||||
parser.cleanup()
|
if parser_is_new_style:
|
||||||
|
parser.__exit__(None, None, None)
|
||||||
|
else:
|
||||||
|
parser.cleanup()
|
||||||
|
|
||||||
|
|
||||||
class Command(PaperlessCommand):
|
class Command(PaperlessCommand):
|
||||||
|
|||||||
@@ -68,6 +68,7 @@ from paperless.config import AIConfig
|
|||||||
from paperless.parsers import ParserContext
|
from paperless.parsers import ParserContext
|
||||||
from paperless.parsers.mail import MailDocumentParser
|
from paperless.parsers.mail import MailDocumentParser
|
||||||
from paperless.parsers.remote import RemoteDocumentParser
|
from paperless.parsers.remote import RemoteDocumentParser
|
||||||
|
from paperless.parsers.tesseract import RasterisedDocumentParser
|
||||||
from paperless.parsers.text import TextDocumentParser
|
from paperless.parsers.text import TextDocumentParser
|
||||||
from paperless.parsers.tika import TikaDocumentParser
|
from paperless.parsers.tika import TikaDocumentParser
|
||||||
from paperless_ai.indexing import llm_index_add_or_update_document
|
from paperless_ai.indexing import llm_index_add_or_update_document
|
||||||
@@ -326,6 +327,7 @@ def update_document_content_maybe_archive_file(document_id) -> None:
|
|||||||
parser,
|
parser,
|
||||||
(
|
(
|
||||||
MailDocumentParser,
|
MailDocumentParser,
|
||||||
|
RasterisedDocumentParser,
|
||||||
RemoteDocumentParser,
|
RemoteDocumentParser,
|
||||||
TextDocumentParser,
|
TextDocumentParser,
|
||||||
TikaDocumentParser,
|
TikaDocumentParser,
|
||||||
@@ -440,7 +442,13 @@ def update_document_content_maybe_archive_file(document_id) -> None:
|
|||||||
# TODO(stumpylog): Remove branch in the future when all parsers use new protocol
|
# TODO(stumpylog): Remove branch in the future when all parsers use new protocol
|
||||||
if isinstance(
|
if isinstance(
|
||||||
parser,
|
parser,
|
||||||
(MailDocumentParser, TextDocumentParser, TikaDocumentParser),
|
(
|
||||||
|
MailDocumentParser,
|
||||||
|
RasterisedDocumentParser,
|
||||||
|
RemoteDocumentParser,
|
||||||
|
TextDocumentParser,
|
||||||
|
TikaDocumentParser,
|
||||||
|
),
|
||||||
):
|
):
|
||||||
parser.__exit__(None, None, None)
|
parser.__exit__(None, None, None)
|
||||||
else:
|
else:
|
||||||
|
|||||||
@@ -9,9 +9,9 @@ from documents.parsers import get_default_file_extension
|
|||||||
from documents.parsers import get_parser_class_for_mime_type
|
from documents.parsers import get_parser_class_for_mime_type
|
||||||
from documents.parsers import get_supported_file_extensions
|
from documents.parsers import get_supported_file_extensions
|
||||||
from documents.parsers import is_file_ext_supported
|
from documents.parsers import is_file_ext_supported
|
||||||
|
from paperless.parsers.tesseract import RasterisedDocumentParser
|
||||||
from paperless.parsers.text import TextDocumentParser
|
from paperless.parsers.text import TextDocumentParser
|
||||||
from paperless.parsers.tika import TikaDocumentParser
|
from paperless.parsers.tika import TikaDocumentParser
|
||||||
from paperless_tesseract.parsers import RasterisedDocumentParser
|
|
||||||
|
|
||||||
|
|
||||||
class TestParserDiscovery(TestCase):
|
class TestParserDiscovery(TestCase):
|
||||||
|
|||||||
@@ -195,6 +195,7 @@ class ParserRegistry:
|
|||||||
"""
|
"""
|
||||||
from paperless.parsers.mail import MailDocumentParser
|
from paperless.parsers.mail import MailDocumentParser
|
||||||
from paperless.parsers.remote import RemoteDocumentParser
|
from paperless.parsers.remote import RemoteDocumentParser
|
||||||
|
from paperless.parsers.tesseract import RasterisedDocumentParser
|
||||||
from paperless.parsers.text import TextDocumentParser
|
from paperless.parsers.text import TextDocumentParser
|
||||||
from paperless.parsers.tika import TikaDocumentParser
|
from paperless.parsers.tika import TikaDocumentParser
|
||||||
|
|
||||||
@@ -202,6 +203,7 @@ class ParserRegistry:
|
|||||||
self.register_builtin(RemoteDocumentParser)
|
self.register_builtin(RemoteDocumentParser)
|
||||||
self.register_builtin(TikaDocumentParser)
|
self.register_builtin(TikaDocumentParser)
|
||||||
self.register_builtin(MailDocumentParser)
|
self.register_builtin(MailDocumentParser)
|
||||||
|
self.register_builtin(RasterisedDocumentParser)
|
||||||
|
|
||||||
# ------------------------------------------------------------------
|
# ------------------------------------------------------------------
|
||||||
# Discovery
|
# Discovery
|
||||||
|
|||||||
@@ -1,13 +1,18 @@
|
|||||||
|
from __future__ import annotations
|
||||||
|
|
||||||
|
import logging
|
||||||
import os
|
import os
|
||||||
import re
|
import re
|
||||||
|
import shutil
|
||||||
import tempfile
|
import tempfile
|
||||||
from pathlib import Path
|
from pathlib import Path
|
||||||
from typing import TYPE_CHECKING
|
from typing import TYPE_CHECKING
|
||||||
|
from typing import Any
|
||||||
|
from typing import Self
|
||||||
|
|
||||||
from django.conf import settings
|
from django.conf import settings
|
||||||
from PIL import Image
|
from PIL import Image
|
||||||
|
|
||||||
from documents.parsers import DocumentParser
|
|
||||||
from documents.parsers import ParseError
|
from documents.parsers import ParseError
|
||||||
from documents.parsers import make_thumbnail_from_pdf
|
from documents.parsers import make_thumbnail_from_pdf
|
||||||
from documents.utils import maybe_override_pixel_limit
|
from documents.utils import maybe_override_pixel_limit
|
||||||
@@ -16,6 +21,28 @@ from paperless.config import OcrConfig
|
|||||||
from paperless.models import ArchiveFileChoices
|
from paperless.models import ArchiveFileChoices
|
||||||
from paperless.models import CleanChoices
|
from paperless.models import CleanChoices
|
||||||
from paperless.models import ModeChoices
|
from paperless.models import ModeChoices
|
||||||
|
from paperless.parsers.utils import read_file_handle_unicode_errors
|
||||||
|
from paperless.version import __full_version_str__
|
||||||
|
|
||||||
|
if TYPE_CHECKING:
|
||||||
|
import datetime
|
||||||
|
from types import TracebackType
|
||||||
|
|
||||||
|
from paperless.parsers import MetadataEntry
|
||||||
|
from paperless.parsers import ParserContext
|
||||||
|
|
||||||
|
logger = logging.getLogger("paperless.parsing.tesseract")
|
||||||
|
|
||||||
|
_SUPPORTED_MIME_TYPES: dict[str, str] = {
|
||||||
|
"application/pdf": ".pdf",
|
||||||
|
"image/jpeg": ".jpg",
|
||||||
|
"image/png": ".png",
|
||||||
|
"image/tiff": ".tif",
|
||||||
|
"image/gif": ".gif",
|
||||||
|
"image/bmp": ".bmp",
|
||||||
|
"image/webp": ".webp",
|
||||||
|
"image/heic": ".heic",
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
class NoTextFoundException(Exception):
|
class NoTextFoundException(Exception):
|
||||||
@@ -26,81 +53,125 @@ class RtlLanguageException(Exception):
|
|||||||
pass
|
pass
|
||||||
|
|
||||||
|
|
||||||
class RasterisedDocumentParser(DocumentParser):
|
class RasterisedDocumentParser:
|
||||||
"""
|
"""
|
||||||
This parser uses Tesseract to try and get some text out of a rasterised
|
This parser uses Tesseract to try and get some text out of a rasterised
|
||||||
image, whether it's a PDF, or other graphical format (JPEG, TIFF, etc.)
|
image, whether it's a PDF, or other graphical format (JPEG, TIFF, etc.)
|
||||||
"""
|
"""
|
||||||
|
|
||||||
logging_name = "paperless.parsing.tesseract"
|
name: str = "Paperless-ngx Tesseract OCR Parser"
|
||||||
|
version: str = __full_version_str__
|
||||||
|
author: str = "Paperless-ngx Contributors"
|
||||||
|
url: str = "https://github.com/paperless-ngx/paperless-ngx"
|
||||||
|
|
||||||
def get_settings(self) -> OcrConfig:
|
# ------------------------------------------------------------------
|
||||||
"""
|
# Class methods
|
||||||
This parser uses the OCR configuration settings to parse documents
|
# ------------------------------------------------------------------
|
||||||
"""
|
|
||||||
return OcrConfig()
|
|
||||||
|
|
||||||
def get_page_count(self, document_path, mime_type):
|
@classmethod
|
||||||
page_count = None
|
def supported_mime_types(cls) -> dict[str, str]:
|
||||||
if mime_type == "application/pdf":
|
return _SUPPORTED_MIME_TYPES
|
||||||
try:
|
|
||||||
import pikepdf
|
|
||||||
|
|
||||||
with pikepdf.Pdf.open(document_path) as pdf:
|
@classmethod
|
||||||
page_count = len(pdf.pages)
|
def score(
|
||||||
except Exception as e:
|
cls,
|
||||||
self.log.warning(
|
mime_type: str,
|
||||||
f"Unable to determine PDF page count {document_path}: {e}",
|
filename: str,
|
||||||
)
|
path: Path | None = None,
|
||||||
return page_count
|
) -> int | None:
|
||||||
|
if mime_type in _SUPPORTED_MIME_TYPES:
|
||||||
|
return 10
|
||||||
|
return None
|
||||||
|
|
||||||
def extract_metadata(self, document_path, mime_type):
|
# ------------------------------------------------------------------
|
||||||
result = []
|
# Properties
|
||||||
if mime_type == "application/pdf":
|
# ------------------------------------------------------------------
|
||||||
import pikepdf
|
|
||||||
|
|
||||||
namespace_pattern = re.compile(r"\{(.*)\}(.*)")
|
@property
|
||||||
|
def can_produce_archive(self) -> bool:
|
||||||
|
return True
|
||||||
|
|
||||||
pdf = pikepdf.open(document_path)
|
@property
|
||||||
meta = pdf.open_metadata()
|
def requires_pdf_rendition(self) -> bool:
|
||||||
for key, value in meta.items():
|
return False
|
||||||
if isinstance(value, list):
|
|
||||||
value = " ".join([str(e) for e in value])
|
|
||||||
value = str(value)
|
|
||||||
try:
|
|
||||||
m = namespace_pattern.match(key)
|
|
||||||
if m is None: # pragma: no cover
|
|
||||||
continue
|
|
||||||
namespace = m.group(1)
|
|
||||||
key_value = m.group(2)
|
|
||||||
try:
|
|
||||||
namespace.encode("utf-8")
|
|
||||||
key_value.encode("utf-8")
|
|
||||||
except UnicodeEncodeError as e: # pragma: no cover
|
|
||||||
self.log.debug(f"Skipping metadata key {key}: {e}")
|
|
||||||
continue
|
|
||||||
result.append(
|
|
||||||
{
|
|
||||||
"namespace": namespace,
|
|
||||||
"prefix": meta.REVERSE_NS[namespace],
|
|
||||||
"key": key_value,
|
|
||||||
"value": value,
|
|
||||||
},
|
|
||||||
)
|
|
||||||
except Exception as e:
|
|
||||||
self.log.warning(
|
|
||||||
f"Error while reading metadata {key}: {value}. Error: {e}",
|
|
||||||
)
|
|
||||||
return result
|
|
||||||
|
|
||||||
def get_thumbnail(self, document_path, mime_type, file_name=None):
|
# ------------------------------------------------------------------
|
||||||
|
# Lifecycle
|
||||||
|
# ------------------------------------------------------------------
|
||||||
|
|
||||||
|
def __init__(self, logging_group: object = None) -> None:
|
||||||
|
settings.SCRATCH_DIR.mkdir(parents=True, exist_ok=True)
|
||||||
|
self.tempdir = Path(
|
||||||
|
tempfile.mkdtemp(prefix="paperless-", dir=settings.SCRATCH_DIR),
|
||||||
|
)
|
||||||
|
self.settings = OcrConfig()
|
||||||
|
self.archive_path: Path | None = None
|
||||||
|
self.text: str | None = None
|
||||||
|
self.date: datetime.datetime | None = None
|
||||||
|
self.log = logger
|
||||||
|
|
||||||
|
def __enter__(self) -> Self:
|
||||||
|
return self
|
||||||
|
|
||||||
|
def __exit__(
|
||||||
|
self,
|
||||||
|
exc_type: type[BaseException] | None,
|
||||||
|
exc_val: BaseException | None,
|
||||||
|
exc_tb: TracebackType | None,
|
||||||
|
) -> None:
|
||||||
|
logger.debug("Cleaning up temporary directory %s", self.tempdir)
|
||||||
|
shutil.rmtree(self.tempdir, ignore_errors=True)
|
||||||
|
|
||||||
|
# ------------------------------------------------------------------
|
||||||
|
# Core parsing interface
|
||||||
|
# ------------------------------------------------------------------
|
||||||
|
|
||||||
|
def configure(self, context: ParserContext) -> None:
|
||||||
|
pass
|
||||||
|
|
||||||
|
# ------------------------------------------------------------------
|
||||||
|
# Result accessors
|
||||||
|
# ------------------------------------------------------------------
|
||||||
|
|
||||||
|
def get_text(self) -> str | None:
|
||||||
|
return self.text
|
||||||
|
|
||||||
|
def get_date(self) -> datetime.datetime | None:
|
||||||
|
return self.date
|
||||||
|
|
||||||
|
def get_archive_path(self) -> Path | None:
|
||||||
|
return self.archive_path
|
||||||
|
|
||||||
|
# ------------------------------------------------------------------
|
||||||
|
# Thumbnail, page count, and metadata
|
||||||
|
# ------------------------------------------------------------------
|
||||||
|
|
||||||
|
def get_thumbnail(self, document_path: Path, mime_type: str) -> Path:
|
||||||
return make_thumbnail_from_pdf(
|
return make_thumbnail_from_pdf(
|
||||||
self.archive_path or document_path,
|
self.archive_path or Path(document_path),
|
||||||
self.tempdir,
|
self.tempdir,
|
||||||
self.logging_group,
|
|
||||||
)
|
)
|
||||||
|
|
||||||
def is_image(self, mime_type) -> bool:
|
def get_page_count(self, document_path: Path, mime_type: str) -> int | None:
|
||||||
|
if mime_type == "application/pdf":
|
||||||
|
from paperless.parsers.utils import get_page_count_for_pdf
|
||||||
|
|
||||||
|
return get_page_count_for_pdf(Path(document_path), log=self.log)
|
||||||
|
return None
|
||||||
|
|
||||||
|
def extract_metadata(
|
||||||
|
self,
|
||||||
|
document_path: Path,
|
||||||
|
mime_type: str,
|
||||||
|
) -> list[MetadataEntry]:
|
||||||
|
if mime_type != "application/pdf":
|
||||||
|
return []
|
||||||
|
|
||||||
|
from paperless.parsers.utils import extract_pdf_metadata
|
||||||
|
|
||||||
|
return extract_pdf_metadata(Path(document_path), log=self.log)
|
||||||
|
|
||||||
|
def is_image(self, mime_type: str) -> bool:
|
||||||
return mime_type in [
|
return mime_type in [
|
||||||
"image/png",
|
"image/png",
|
||||||
"image/jpeg",
|
"image/jpeg",
|
||||||
@@ -111,25 +182,25 @@ class RasterisedDocumentParser(DocumentParser):
|
|||||||
"image/heic",
|
"image/heic",
|
||||||
]
|
]
|
||||||
|
|
||||||
def has_alpha(self, image) -> bool:
|
def has_alpha(self, image: Path) -> bool:
|
||||||
with Image.open(image) as im:
|
with Image.open(image) as im:
|
||||||
return im.mode in ("RGBA", "LA")
|
return im.mode in ("RGBA", "LA")
|
||||||
|
|
||||||
def remove_alpha(self, image_path: str) -> Path:
|
def remove_alpha(self, image_path: Path) -> Path:
|
||||||
no_alpha_image = Path(self.tempdir) / "image-no-alpha"
|
no_alpha_image = Path(self.tempdir) / "image-no-alpha"
|
||||||
run_subprocess(
|
run_subprocess(
|
||||||
[
|
[
|
||||||
settings.CONVERT_BINARY,
|
settings.CONVERT_BINARY,
|
||||||
"-alpha",
|
"-alpha",
|
||||||
"off",
|
"off",
|
||||||
image_path,
|
str(image_path),
|
||||||
no_alpha_image,
|
str(no_alpha_image),
|
||||||
],
|
],
|
||||||
logger=self.log,
|
logger=self.log,
|
||||||
)
|
)
|
||||||
return no_alpha_image
|
return no_alpha_image
|
||||||
|
|
||||||
def get_dpi(self, image) -> int | None:
|
def get_dpi(self, image: Path) -> int | None:
|
||||||
try:
|
try:
|
||||||
with Image.open(image) as im:
|
with Image.open(image) as im:
|
||||||
x, _ = im.info["dpi"]
|
x, _ = im.info["dpi"]
|
||||||
@@ -138,7 +209,7 @@ class RasterisedDocumentParser(DocumentParser):
|
|||||||
self.log.warning(f"Error while getting DPI from image {image}: {e}")
|
self.log.warning(f"Error while getting DPI from image {image}: {e}")
|
||||||
return None
|
return None
|
||||||
|
|
||||||
def calculate_a4_dpi(self, image) -> int | None:
|
def calculate_a4_dpi(self, image: Path) -> int | None:
|
||||||
try:
|
try:
|
||||||
with Image.open(image) as im:
|
with Image.open(image) as im:
|
||||||
width, _ = im.size
|
width, _ = im.size
|
||||||
@@ -156,6 +227,7 @@ class RasterisedDocumentParser(DocumentParser):
|
|||||||
sidecar_file: Path | None,
|
sidecar_file: Path | None,
|
||||||
pdf_file: Path,
|
pdf_file: Path,
|
||||||
) -> str | None:
|
) -> str | None:
|
||||||
|
text: str | None = None
|
||||||
# When re-doing OCR, the sidecar contains ONLY the new text, not
|
# When re-doing OCR, the sidecar contains ONLY the new text, not
|
||||||
# the whole text, so do not utilize it in that case
|
# the whole text, so do not utilize it in that case
|
||||||
if (
|
if (
|
||||||
@@ -163,7 +235,7 @@ class RasterisedDocumentParser(DocumentParser):
|
|||||||
and sidecar_file.is_file()
|
and sidecar_file.is_file()
|
||||||
and self.settings.mode != "redo"
|
and self.settings.mode != "redo"
|
||||||
):
|
):
|
||||||
text = self.read_file_handle_unicode_errors(sidecar_file)
|
text = read_file_handle_unicode_errors(sidecar_file)
|
||||||
|
|
||||||
if "[OCR skipped on page" not in text:
|
if "[OCR skipped on page" not in text:
|
||||||
# This happens when there's already text in the input file.
|
# This happens when there's already text in the input file.
|
||||||
@@ -191,12 +263,12 @@ class RasterisedDocumentParser(DocumentParser):
|
|||||||
"-layout",
|
"-layout",
|
||||||
"-enc",
|
"-enc",
|
||||||
"UTF-8",
|
"UTF-8",
|
||||||
pdf_file,
|
str(pdf_file),
|
||||||
tmp.name,
|
tmp.name,
|
||||||
],
|
],
|
||||||
logger=self.log,
|
logger=self.log,
|
||||||
)
|
)
|
||||||
text = self.read_file_handle_unicode_errors(Path(tmp.name))
|
text = read_file_handle_unicode_errors(Path(tmp.name))
|
||||||
|
|
||||||
return post_process_text(text)
|
return post_process_text(text)
|
||||||
|
|
||||||
@@ -211,16 +283,14 @@ class RasterisedDocumentParser(DocumentParser):
|
|||||||
|
|
||||||
def construct_ocrmypdf_parameters(
|
def construct_ocrmypdf_parameters(
|
||||||
self,
|
self,
|
||||||
input_file,
|
input_file: Path,
|
||||||
mime_type,
|
mime_type: str,
|
||||||
output_file,
|
output_file: Path,
|
||||||
sidecar_file,
|
sidecar_file: Path,
|
||||||
*,
|
*,
|
||||||
safe_fallback=False,
|
safe_fallback: bool = False,
|
||||||
):
|
) -> dict[str, Any]:
|
||||||
if TYPE_CHECKING:
|
ocrmypdf_args: dict[str, Any] = {
|
||||||
assert isinstance(self.settings, OcrConfig)
|
|
||||||
ocrmypdf_args = {
|
|
||||||
"input_file_or_options": input_file,
|
"input_file_or_options": input_file,
|
||||||
"output_file": output_file,
|
"output_file": output_file,
|
||||||
# need to use threads, since this will be run in daemonized
|
# need to use threads, since this will be run in daemonized
|
||||||
@@ -330,7 +400,13 @@ class RasterisedDocumentParser(DocumentParser):
|
|||||||
|
|
||||||
return ocrmypdf_args
|
return ocrmypdf_args
|
||||||
|
|
||||||
def parse(self, document_path: Path, mime_type, file_name=None) -> None:
|
def parse(
|
||||||
|
self,
|
||||||
|
document_path: Path,
|
||||||
|
mime_type: str,
|
||||||
|
*,
|
||||||
|
produce_archive: bool = True,
|
||||||
|
) -> None:
|
||||||
# This forces tesseract to use one core per page.
|
# This forces tesseract to use one core per page.
|
||||||
os.environ["OMP_THREAD_LIMIT"] = "1"
|
os.environ["OMP_THREAD_LIMIT"] = "1"
|
||||||
VALID_TEXT_LENGTH = 50
|
VALID_TEXT_LENGTH = 50
|
||||||
@@ -458,7 +534,7 @@ class RasterisedDocumentParser(DocumentParser):
|
|||||||
self.text = ""
|
self.text = ""
|
||||||
|
|
||||||
|
|
||||||
def post_process_text(text):
|
def post_process_text(text: str | None) -> str | None:
|
||||||
if not text:
|
if not text:
|
||||||
return None
|
return None
|
||||||
|
|
||||||
@@ -20,6 +20,34 @@ if TYPE_CHECKING:
|
|||||||
logger = logging.getLogger("paperless.parsers.utils")
|
logger = logging.getLogger("paperless.parsers.utils")
|
||||||
|
|
||||||
|
|
||||||
|
def read_file_handle_unicode_errors(
|
||||||
|
filepath: Path,
|
||||||
|
log: logging.Logger | None = None,
|
||||||
|
) -> str:
|
||||||
|
"""Read a file as UTF-8 text, replacing invalid bytes rather than raising.
|
||||||
|
|
||||||
|
Parameters
|
||||||
|
----------
|
||||||
|
filepath:
|
||||||
|
Absolute path to the file to read.
|
||||||
|
log:
|
||||||
|
Logger to use for warnings. Falls back to the module-level logger
|
||||||
|
when omitted.
|
||||||
|
|
||||||
|
Returns
|
||||||
|
-------
|
||||||
|
str
|
||||||
|
File content as a string, with any invalid UTF-8 sequences replaced
|
||||||
|
by the Unicode replacement character.
|
||||||
|
"""
|
||||||
|
_log = log or logger
|
||||||
|
try:
|
||||||
|
return filepath.read_text(encoding="utf-8")
|
||||||
|
except UnicodeDecodeError as e:
|
||||||
|
_log.warning("Unicode error during text reading, continuing: %s", e)
|
||||||
|
return filepath.read_bytes().decode("utf-8", errors="replace")
|
||||||
|
|
||||||
|
|
||||||
def get_page_count_for_pdf(
|
def get_page_count_for_pdf(
|
||||||
document_path: Path,
|
document_path: Path,
|
||||||
log: logging.Logger | None = None,
|
log: logging.Logger | None = None,
|
||||||
|
|||||||
@@ -6,20 +6,29 @@ so it is easy to see which files belong to which test module.
|
|||||||
|
|
||||||
from __future__ import annotations
|
from __future__ import annotations
|
||||||
|
|
||||||
|
from contextlib import contextmanager
|
||||||
from typing import TYPE_CHECKING
|
from typing import TYPE_CHECKING
|
||||||
|
|
||||||
import pytest
|
import pytest
|
||||||
|
from django.test import override_settings
|
||||||
|
|
||||||
from paperless.parsers.mail import MailDocumentParser
|
from paperless.parsers.mail import MailDocumentParser
|
||||||
from paperless.parsers.remote import RemoteDocumentParser
|
from paperless.parsers.remote import RemoteDocumentParser
|
||||||
|
from paperless.parsers.tesseract import RasterisedDocumentParser
|
||||||
from paperless.parsers.text import TextDocumentParser
|
from paperless.parsers.text import TextDocumentParser
|
||||||
from paperless.parsers.tika import TikaDocumentParser
|
from paperless.parsers.tika import TikaDocumentParser
|
||||||
|
|
||||||
if TYPE_CHECKING:
|
if TYPE_CHECKING:
|
||||||
|
from collections.abc import Callable
|
||||||
from collections.abc import Generator
|
from collections.abc import Generator
|
||||||
from pathlib import Path
|
from pathlib import Path
|
||||||
|
from unittest.mock import MagicMock
|
||||||
|
|
||||||
from pytest_django.fixtures import SettingsWrapper
|
from pytest_django.fixtures import SettingsWrapper
|
||||||
|
from pytest_mock import MockerFixture
|
||||||
|
|
||||||
|
#: Type for the ``make_tesseract_parser`` fixture factory.
|
||||||
|
MakeTesseractParser = Callable[..., Generator[RasterisedDocumentParser, None, None]]
|
||||||
|
|
||||||
|
|
||||||
# ------------------------------------------------------------------
|
# ------------------------------------------------------------------
|
||||||
@@ -411,3 +420,381 @@ def nginx_base_url() -> Generator[str, None, None]:
|
|||||||
The base URL for the nginx HTTP server we expect to be alive
|
The base URL for the nginx HTTP server we expect to be alive
|
||||||
"""
|
"""
|
||||||
yield "http://localhost:8080"
|
yield "http://localhost:8080"
|
||||||
|
|
||||||
|
|
||||||
|
# ------------------------------------------------------------------
|
||||||
|
# Tesseract parser sample files
|
||||||
|
# ------------------------------------------------------------------
|
||||||
|
|
||||||
|
|
||||||
|
@pytest.fixture(scope="session")
|
||||||
|
def tesseract_samples_dir(samples_dir: Path) -> Path:
|
||||||
|
"""Absolute path to the tesseract parser sample files directory.
|
||||||
|
|
||||||
|
Returns
|
||||||
|
-------
|
||||||
|
Path
|
||||||
|
``<samples_dir>/tesseract/``
|
||||||
|
"""
|
||||||
|
return samples_dir / "tesseract"
|
||||||
|
|
||||||
|
|
||||||
|
@pytest.fixture(scope="session")
|
||||||
|
def document_webp_file(tesseract_samples_dir: Path) -> Path:
|
||||||
|
"""Path to a WebP document sample file.
|
||||||
|
|
||||||
|
Returns
|
||||||
|
-------
|
||||||
|
Path
|
||||||
|
Absolute path to ``tesseract/document.webp``.
|
||||||
|
"""
|
||||||
|
return tesseract_samples_dir / "document.webp"
|
||||||
|
|
||||||
|
|
||||||
|
@pytest.fixture(scope="session")
|
||||||
|
def encrypted_pdf_file(tesseract_samples_dir: Path) -> Path:
|
||||||
|
"""Path to an encrypted PDF sample file.
|
||||||
|
|
||||||
|
Returns
|
||||||
|
-------
|
||||||
|
Path
|
||||||
|
Absolute path to ``tesseract/encrypted.pdf``.
|
||||||
|
"""
|
||||||
|
return tesseract_samples_dir / "encrypted.pdf"
|
||||||
|
|
||||||
|
|
||||||
|
@pytest.fixture(scope="session")
|
||||||
|
def multi_page_digital_pdf_file(tesseract_samples_dir: Path) -> Path:
|
||||||
|
"""Path to a multi-page digital PDF sample file.
|
||||||
|
|
||||||
|
Returns
|
||||||
|
-------
|
||||||
|
Path
|
||||||
|
Absolute path to ``tesseract/multi-page-digital.pdf``.
|
||||||
|
"""
|
||||||
|
return tesseract_samples_dir / "multi-page-digital.pdf"
|
||||||
|
|
||||||
|
|
||||||
|
@pytest.fixture(scope="session")
|
||||||
|
def multi_page_images_alpha_rgb_tiff_file(tesseract_samples_dir: Path) -> Path:
|
||||||
|
"""Path to a multi-page TIFF with alpha channel in RGB.
|
||||||
|
|
||||||
|
Returns
|
||||||
|
-------
|
||||||
|
Path
|
||||||
|
Absolute path to ``tesseract/multi-page-images-alpha-rgb.tiff``.
|
||||||
|
"""
|
||||||
|
return tesseract_samples_dir / "multi-page-images-alpha-rgb.tiff"
|
||||||
|
|
||||||
|
|
||||||
|
@pytest.fixture(scope="session")
|
||||||
|
def multi_page_images_alpha_tiff_file(tesseract_samples_dir: Path) -> Path:
|
||||||
|
"""Path to a multi-page TIFF with alpha channel.
|
||||||
|
|
||||||
|
Returns
|
||||||
|
-------
|
||||||
|
Path
|
||||||
|
Absolute path to ``tesseract/multi-page-images-alpha.tiff``.
|
||||||
|
"""
|
||||||
|
return tesseract_samples_dir / "multi-page-images-alpha.tiff"
|
||||||
|
|
||||||
|
|
||||||
|
@pytest.fixture(scope="session")
|
||||||
|
def multi_page_images_pdf_file(tesseract_samples_dir: Path) -> Path:
|
||||||
|
"""Path to a multi-page PDF with images.
|
||||||
|
|
||||||
|
Returns
|
||||||
|
-------
|
||||||
|
Path
|
||||||
|
Absolute path to ``tesseract/multi-page-images.pdf``.
|
||||||
|
"""
|
||||||
|
return tesseract_samples_dir / "multi-page-images.pdf"
|
||||||
|
|
||||||
|
|
||||||
|
@pytest.fixture(scope="session")
|
||||||
|
def multi_page_images_tiff_file(tesseract_samples_dir: Path) -> Path:
|
||||||
|
"""Path to a multi-page TIFF sample file.
|
||||||
|
|
||||||
|
Returns
|
||||||
|
-------
|
||||||
|
Path
|
||||||
|
Absolute path to ``tesseract/multi-page-images.tiff``.
|
||||||
|
"""
|
||||||
|
return tesseract_samples_dir / "multi-page-images.tiff"
|
||||||
|
|
||||||
|
|
||||||
|
@pytest.fixture(scope="session")
|
||||||
|
def multi_page_mixed_pdf_file(tesseract_samples_dir: Path) -> Path:
|
||||||
|
"""Path to a multi-page mixed PDF sample file.
|
||||||
|
|
||||||
|
Returns
|
||||||
|
-------
|
||||||
|
Path
|
||||||
|
Absolute path to ``tesseract/multi-page-mixed.pdf``.
|
||||||
|
"""
|
||||||
|
return tesseract_samples_dir / "multi-page-mixed.pdf"
|
||||||
|
|
||||||
|
|
||||||
|
@pytest.fixture(scope="session")
|
||||||
|
def no_text_alpha_png_file(tesseract_samples_dir: Path) -> Path:
|
||||||
|
"""Path to a PNG with alpha channel and no text.
|
||||||
|
|
||||||
|
Returns
|
||||||
|
-------
|
||||||
|
Path
|
||||||
|
Absolute path to ``tesseract/no-text-alpha.png``.
|
||||||
|
"""
|
||||||
|
return tesseract_samples_dir / "no-text-alpha.png"
|
||||||
|
|
||||||
|
|
||||||
|
@pytest.fixture(scope="session")
|
||||||
|
def rotated_pdf_file(tesseract_samples_dir: Path) -> Path:
|
||||||
|
"""Path to a rotated PDF sample file.
|
||||||
|
|
||||||
|
Returns
|
||||||
|
-------
|
||||||
|
Path
|
||||||
|
Absolute path to ``tesseract/rotated.pdf``.
|
||||||
|
"""
|
||||||
|
return tesseract_samples_dir / "rotated.pdf"
|
||||||
|
|
||||||
|
|
||||||
|
@pytest.fixture(scope="session")
|
||||||
|
def rtl_test_pdf_file(tesseract_samples_dir: Path) -> Path:
    """Locate the RTL test PDF sample.

    Parameters
    ----------
    tesseract_samples_dir : Path
        Directory that holds the tesseract sample files.

    Returns
    -------
    Path
        ``tesseract_samples_dir`` joined with ``rtl-test.pdf``.
    """
    return tesseract_samples_dir.joinpath("rtl-test.pdf")
|
||||||
|
|
||||||
|
|
||||||
|
@pytest.fixture(scope="session")
|
||||||
|
def signed_pdf_file(tesseract_samples_dir: Path) -> Path:
    """Locate the signed PDF sample.

    Parameters
    ----------
    tesseract_samples_dir : Path
        Directory that holds the tesseract sample files.

    Returns
    -------
    Path
        ``tesseract_samples_dir`` joined with ``signed.pdf``.
    """
    return tesseract_samples_dir.joinpath("signed.pdf")
|
||||||
|
|
||||||
|
|
||||||
|
@pytest.fixture(scope="session")
|
||||||
|
def simple_alpha_png_file(tesseract_samples_dir: Path) -> Path:
    """Locate the simple PNG sample that has an alpha channel.

    Parameters
    ----------
    tesseract_samples_dir : Path
        Directory that holds the tesseract sample files.

    Returns
    -------
    Path
        ``tesseract_samples_dir`` joined with ``simple-alpha.png``.
    """
    return tesseract_samples_dir.joinpath("simple-alpha.png")
|
||||||
|
|
||||||
|
|
||||||
|
@pytest.fixture(scope="session")
|
||||||
|
def simple_digital_pdf_file(tesseract_samples_dir: Path) -> Path:
    """Locate the simple digital PDF sample.

    Parameters
    ----------
    tesseract_samples_dir : Path
        Directory that holds the tesseract sample files.

    Returns
    -------
    Path
        ``tesseract_samples_dir`` joined with ``simple-digital.pdf``.
    """
    return tesseract_samples_dir.joinpath("simple-digital.pdf")
|
||||||
|
|
||||||
|
|
||||||
|
@pytest.fixture(scope="session")
|
||||||
|
def simple_no_dpi_png_file(tesseract_samples_dir: Path) -> Path:
    """Locate the simple PNG sample that carries no DPI information.

    Parameters
    ----------
    tesseract_samples_dir : Path
        Directory that holds the tesseract sample files.

    Returns
    -------
    Path
        ``tesseract_samples_dir`` joined with ``simple-no-dpi.png``.
    """
    return tesseract_samples_dir.joinpath("simple-no-dpi.png")
|
||||||
|
|
||||||
|
|
||||||
|
@pytest.fixture(scope="session")
|
||||||
|
def simple_bmp_file(tesseract_samples_dir: Path) -> Path:
    """Locate the simple BMP sample.

    Parameters
    ----------
    tesseract_samples_dir : Path
        Directory that holds the tesseract sample files.

    Returns
    -------
    Path
        ``tesseract_samples_dir`` joined with ``simple.bmp``.
    """
    return tesseract_samples_dir.joinpath("simple.bmp")
|
||||||
|
|
||||||
|
|
||||||
|
@pytest.fixture(scope="session")
|
||||||
|
def simple_gif_file(tesseract_samples_dir: Path) -> Path:
    """Locate the simple GIF sample.

    Parameters
    ----------
    tesseract_samples_dir : Path
        Directory that holds the tesseract sample files.

    Returns
    -------
    Path
        ``tesseract_samples_dir`` joined with ``simple.gif``.
    """
    return tesseract_samples_dir.joinpath("simple.gif")
|
||||||
|
|
||||||
|
|
||||||
|
@pytest.fixture(scope="session")
|
||||||
|
def simple_heic_file(tesseract_samples_dir: Path) -> Path:
    """Locate the simple HEIC sample.

    Parameters
    ----------
    tesseract_samples_dir : Path
        Directory that holds the tesseract sample files.

    Returns
    -------
    Path
        ``tesseract_samples_dir`` joined with ``simple.heic``.
    """
    return tesseract_samples_dir.joinpath("simple.heic")
|
||||||
|
|
||||||
|
|
||||||
|
@pytest.fixture(scope="session")
|
||||||
|
def simple_jpg_file(tesseract_samples_dir: Path) -> Path:
    """Locate the simple JPG sample.

    Parameters
    ----------
    tesseract_samples_dir : Path
        Directory that holds the tesseract sample files.

    Returns
    -------
    Path
        ``tesseract_samples_dir`` joined with ``simple.jpg``.
    """
    return tesseract_samples_dir.joinpath("simple.jpg")
|
||||||
|
|
||||||
|
|
||||||
|
@pytest.fixture(scope="session")
|
||||||
|
def simple_png_file(tesseract_samples_dir: Path) -> Path:
    """Locate the simple PNG sample.

    Parameters
    ----------
    tesseract_samples_dir : Path
        Directory that holds the tesseract sample files.

    Returns
    -------
    Path
        ``tesseract_samples_dir`` joined with ``simple.png``.
    """
    return tesseract_samples_dir.joinpath("simple.png")
|
||||||
|
|
||||||
|
|
||||||
|
@pytest.fixture(scope="session")
|
||||||
|
def simple_tif_file(tesseract_samples_dir: Path) -> Path:
    """Locate the simple TIF sample.

    Parameters
    ----------
    tesseract_samples_dir : Path
        Directory that holds the tesseract sample files.

    Returns
    -------
    Path
        ``tesseract_samples_dir`` joined with ``simple.tif``.
    """
    return tesseract_samples_dir.joinpath("simple.tif")
|
||||||
|
|
||||||
|
|
||||||
|
@pytest.fixture(scope="session")
|
||||||
|
def single_page_mixed_pdf_file(tesseract_samples_dir: Path) -> Path:
    """Locate the single-page mixed PDF sample.

    Parameters
    ----------
    tesseract_samples_dir : Path
        Directory that holds the tesseract sample files.

    Returns
    -------
    Path
        ``tesseract_samples_dir`` joined with ``single-page-mixed.pdf``.
    """
    return tesseract_samples_dir.joinpath("single-page-mixed.pdf")
|
||||||
|
|
||||||
|
|
||||||
|
@pytest.fixture(scope="session")
|
||||||
|
def with_form_pdf_file(tesseract_samples_dir: Path) -> Path:
    """Locate the PDF sample that contains a form.

    Parameters
    ----------
    tesseract_samples_dir : Path
        Directory that holds the tesseract sample files.

    Returns
    -------
    Path
        ``tesseract_samples_dir`` joined with ``with-form.pdf``.
    """
    return tesseract_samples_dir.joinpath("with-form.pdf")
|
||||||
|
|
||||||
|
|
||||||
|
# ------------------------------------------------------------------
|
||||||
|
# Tesseract parser instance and settings helpers
|
||||||
|
# ------------------------------------------------------------------
|
||||||
|
|
||||||
|
|
||||||
|
@pytest.fixture()
|
||||||
|
def null_app_config(mocker: MockerFixture) -> MagicMock:
    """Build a mock application config whose OCR attributes are all ``None``.

    The intent is that the parser, seeing no value on the config object,
    falls back to Django settings rather than querying the database.

    Parameters
    ----------
    mocker : MockerFixture
        Provides the ``MagicMock`` factory used to build the config.

    Returns
    -------
    MagicMock
        Mock config with each attribute listed below set to ``None``.
    """
    null_fields = (
        "output_type",
        "pages",
        "language",
        "mode",
        "skip_archive_file",
        "image_dpi",
        "unpaper_clean",
        "deskew",
        "rotate_pages",
        "rotate_pages_threshold",
        "max_image_pixels",
        "color_conversion_strategy",
        "user_args",
    )
    # dict.fromkeys maps every name to None, matching explicit ``name=None`` kwargs.
    return mocker.MagicMock(**dict.fromkeys(null_fields))
|
||||||
|
|
||||||
|
|
||||||
|
@pytest.fixture()
|
||||||
|
def tesseract_parser(
    mocker: MockerFixture,
    null_app_config: MagicMock,
) -> Generator[RasterisedDocumentParser, None, None]:
    """Provide a RasterisedDocumentParser for the duration of one test.

    ``BaseConfig._get_config_instance`` is patched to return
    ``null_app_config`` so that building the parser does not touch the
    database. The parser is used as a context manager, so its ``__exit__``
    runs once the test has finished with it.

    Yields
    ------
    RasterisedDocumentParser
        A parser instance, valid until the fixture is torn down.
    """
    config_target = "paperless.config.BaseConfig._get_config_instance"
    mocker.patch(config_target, return_value=null_app_config)

    with RasterisedDocumentParser() as active_parser:
        yield active_parser
|
||||||
|
|
||||||
|
|
||||||
|
@pytest.fixture()
|
||||||
|
def make_tesseract_parser(
    mocker: MockerFixture,
    null_app_config: MagicMock,
) -> MakeTesseractParser:
    """Provide a factory that builds parsers under overridden Django settings.

    Intended for tests that need several parsers, each created with a
    different settings configuration. ``BaseConfig._get_config_instance`` is
    patched to return ``null_app_config`` before the factory is handed out.

    Returns
    -------
    Callable[..., contextmanager[RasterisedDocumentParser]]
        Context-manager factory; keyword arguments are forwarded to
        ``override_settings`` and stay active while the parser is open.
    """
    config_target = "paperless.config.BaseConfig._get_config_instance"
    mocker.patch(config_target, return_value=null_app_config)

    @contextmanager
    def _make_parser(**django_settings_overrides):
        # Settings are entered first, so the parser is constructed with the
        # overrides already applied and is closed before they are restored.
        with (
            override_settings(**django_settings_overrides),
            RasterisedDocumentParser() as active_parser,
        ):
            yield active_parser

    return _make_parser
|
||||||
|
|||||||
@@ -481,12 +481,17 @@ class TestRemoteParserRegistry:
|
|||||||
assert parser_cls is RemoteDocumentParser
|
assert parser_cls is RemoteDocumentParser
|
||||||
|
|
||||||
@pytest.mark.usefixtures("no_engine_settings")
|
@pytest.mark.usefixtures("no_engine_settings")
|
||||||
def test_get_parser_returns_none_for_pdf_when_not_configured(self) -> None:
|
def test_get_parser_returns_none_for_unsupported_type_when_not_configured(
|
||||||
"""With no tesseract parser registered yet, PDF has no handler if remote is off."""
|
self,
|
||||||
|
) -> None:
|
||||||
|
"""With remote off and a truly unsupported MIME type, registry returns None."""
|
||||||
from paperless.parsers.registry import ParserRegistry
|
from paperless.parsers.registry import ParserRegistry
|
||||||
|
|
||||||
registry = ParserRegistry()
|
registry = ParserRegistry()
|
||||||
registry.register_defaults()
|
registry.register_defaults()
|
||||||
parser_cls = registry.get_parser_for_file("application/pdf", "doc.pdf")
|
parser_cls = registry.get_parser_for_file(
|
||||||
|
"application/x-unknown-format",
|
||||||
|
"doc.xyz",
|
||||||
|
)
|
||||||
|
|
||||||
assert parser_cls is None
|
assert parser_cls is None
|
||||||
|
|||||||
@@ -10,7 +10,7 @@ from paperless.models import CleanChoices
|
|||||||
from paperless.models import ColorConvertChoices
|
from paperless.models import ColorConvertChoices
|
||||||
from paperless.models import ModeChoices
|
from paperless.models import ModeChoices
|
||||||
from paperless.models import OutputTypeChoices
|
from paperless.models import OutputTypeChoices
|
||||||
from paperless_tesseract.parsers import RasterisedDocumentParser
|
from paperless.parsers.tesseract import RasterisedDocumentParser
|
||||||
|
|
||||||
|
|
||||||
class TestParserSettingsFromDb(DirectoriesMixin, FileSystemAssertsMixin, TestCase):
|
class TestParserSettingsFromDb(DirectoriesMixin, FileSystemAssertsMixin, TestCase):
|
||||||
1174
src/paperless/tests/parsers/test_tesseract_parser.py
Normal file
@@ -256,6 +256,9 @@ class TestTextParserRegistry:
|
|||||||
from paperless.parsers.registry import get_parser_registry
|
from paperless.parsers.registry import get_parser_registry
|
||||||
|
|
||||||
registry = get_parser_registry()
|
registry = get_parser_registry()
|
||||||
parser_cls = registry.get_parser_for_file("application/pdf", "doc.pdf")
|
parser_cls = registry.get_parser_for_file(
|
||||||
|
"application/x-unknown-format",
|
||||||
|
"doc.xyz",
|
||||||
|
)
|
||||||
|
|
||||||
assert parser_cls is None
|
assert parser_cls is None
|
||||||
|
|||||||
|
Before Width: | Height: | Size: 5.7 KiB After Width: | Height: | Size: 5.7 KiB |
|
Before Width: | Height: | Size: 32 KiB After Width: | Height: | Size: 32 KiB |
|
Before Width: | Height: | Size: 8.2 KiB After Width: | Height: | Size: 8.2 KiB |
|
Before Width: | Height: | Size: 6.8 KiB After Width: | Height: | Size: 6.8 KiB |
|
Before Width: | Height: | Size: 1.7 MiB After Width: | Height: | Size: 1.7 MiB |
|
Before Width: | Height: | Size: 18 KiB After Width: | Height: | Size: 18 KiB |
|
Before Width: | Height: | Size: 19 KiB After Width: | Height: | Size: 19 KiB |
|
Before Width: | Height: | Size: 7.2 KiB After Width: | Height: | Size: 7.2 KiB |
@@ -1,10 +1,23 @@
|
|||||||
def get_parser(*args, **kwargs):
|
from __future__ import annotations
|
||||||
from paperless_tesseract.parsers import RasterisedDocumentParser
|
|
||||||
|
|
||||||
|
from typing import Any
|
||||||
|
|
||||||
|
|
||||||
|
def get_parser(*args: Any, **kwargs: Any) -> Any:
|
||||||
|
from paperless.parsers.tesseract import RasterisedDocumentParser
|
||||||
|
|
||||||
|
# RasterisedDocumentParser accepts logging_group for constructor compatibility but
|
||||||
|
# does not store or use it (no legacy DocumentParser base class).
|
||||||
|
# progress_callback is also not used. Both may arrive as a positional arg
|
||||||
|
# (consumer) or a keyword arg (views); *args absorbs the positional form,
|
||||||
|
# kwargs.pop handles the keyword form. Phase 4 will replace this signal
|
||||||
|
# path with the new ParserRegistry so the shim can be removed at that point.
|
||||||
|
kwargs.pop("logging_group", None)
|
||||||
|
kwargs.pop("progress_callback", None)
|
||||||
return RasterisedDocumentParser(*args, **kwargs)
|
return RasterisedDocumentParser(*args, **kwargs)
|
||||||
|
|
||||||
|
|
||||||
def tesseract_consumer_declaration(sender, **kwargs):
|
def tesseract_consumer_declaration(sender: Any, **kwargs: Any) -> dict[str, Any]:
|
||||||
return {
|
return {
|
||||||
"parser": get_parser,
|
"parser": get_parser,
|
||||||
"weight": 0,
|
"weight": 0,
|
||||||
|
|||||||
@@ -1,924 +0,0 @@
|
|||||||
import shutil
|
|
||||||
import tempfile
|
|
||||||
import unicodedata
|
|
||||||
import uuid
|
|
||||||
from pathlib import Path
|
|
||||||
from unittest import mock
|
|
||||||
|
|
||||||
from django.test import TestCase
|
|
||||||
from django.test import override_settings
|
|
||||||
from ocrmypdf import SubprocessOutputError
|
|
||||||
|
|
||||||
from documents.parsers import ParseError
|
|
||||||
from documents.parsers import run_convert
|
|
||||||
from documents.tests.utils import DirectoriesMixin
|
|
||||||
from documents.tests.utils import FileSystemAssertsMixin
|
|
||||||
from paperless_tesseract.parsers import RasterisedDocumentParser
|
|
||||||
from paperless_tesseract.parsers import post_process_text
|
|
||||||
|
|
||||||
|
|
||||||
class TestParser(DirectoriesMixin, FileSystemAssertsMixin, TestCase):
|
|
||||||
SAMPLE_FILES = Path(__file__).resolve().parent / "samples"
|
|
||||||
|
|
||||||
def assertContainsStrings(self, content, strings) -> None:
    """Assert that all ``strings`` appear in ``content``, in the given order.

    Each string is searched for starting at the position where the previous
    one matched, so a later occurrence counts as being "in order" even when
    the same text also appears earlier in ``content``.

    Parameters
    ----------
    content : str
        Text to search.
    strings : iterable of str
        Substrings expected to occur in this order.

    Raises
    ------
    AssertionError
        Via ``self.fail`` when a string is missing or occurs only before
        the previous match.
    """
    search_from = 0
    for s in strings:
        found_at = content.find(s, search_from)
        if found_at == -1:
            if s in content:
                self.fail(f"'{s}' appears out of order in '{content}'")
            else:
                self.fail(f"'{s}' is not in '{content}'")
        # Resume from the start of this match: overlapping and equal
        # positions are still accepted, as they were by the sorted-index check.
        search_from = found_at
|
|
||||||
|
|
||||||
def test_post_process_text(self) -> None:
    """
    GIVEN:
        - Input strings paired with their expected post-processed form
    WHEN:
        - Each input is passed to post_process_text
    THEN:
        - The result equals the expected string
    """
    text_cases = [
        ("simple string", "simple string"),
        ("simple newline\n testing string", "simple newline\ntesting string"),
        (
            "utf-8 строка с пробелами в конце ",
            "utf-8 строка с пробелами в конце",
        ),
    ]

    for source, result in text_cases:
        actual_result = post_process_text(source)
        self.assertEqual(
            result,
            actual_result,
            # Name the function that is actually under test in the message.
            f"post_process_text({source}) != '{result}', but '{actual_result}'",
        )
|
|
||||||
|
|
||||||
def test_get_text_from_pdf(self) -> None:
    """
    GIVEN:
        - Sample PDF ``simple-digital.pdf``
    WHEN:
        - Text is extracted from the PDF via extract_text
    THEN:
        - The stripped text contains "This is a test document."
    """
    ocr_parser = RasterisedDocumentParser(uuid.uuid4())
    pdf_path = self.SAMPLE_FILES / "simple-digital.pdf"

    extracted = ocr_parser.extract_text(None, pdf_path)

    self.assertContainsStrings(extracted.strip(), ["This is a test document."])
|
|
||||||
|
|
||||||
def test_get_page_count(self) -> None:
    """
    GIVEN:
        - PDF file with a single page
        - PDF file with multiple pages
    WHEN:
        - The page count of each file is requested
    THEN:
        - 1 is returned for the single-page file
        - 6 is returned for the multi-page file
    """
    ocr_parser = RasterisedDocumentParser(uuid.uuid4())

    expectations = (
        ("simple-digital.pdf", 1),
        ("multi-page-mixed.pdf", 6),
    )
    for file_name, expected_pages in expectations:
        counted = ocr_parser.get_page_count(
            str(self.SAMPLE_FILES / file_name),
            "application/pdf",
        )
        self.assertEqual(counted, expected_pages)
|
|
||||||
|
|
||||||
def test_get_page_count_password_protected(self) -> None:
    """
    GIVEN:
        - Password protected PDF file
    WHEN:
        - The number of pages is requested
    THEN:
        - The method returns None
        - A warning about the undeterminable page count is logged
    """
    parser = RasterisedDocumentParser(uuid.uuid4())
    with self.assertLogs("paperless.parsing.tesseract", level="WARNING") as cm:
        page_count = parser.get_page_count(
            str(self.SAMPLE_FILES / "password-protected.pdf"),
            "application/pdf",
        )
        # assertIsNone checks identity and gives a clearer failure message
        # than comparing against None with assertEqual.
        self.assertIsNone(page_count)
        self.assertIn("Unable to determine PDF page count", cm.output[0])
|
|
||||||
|
|
||||||
def test_thumbnail(self) -> None:
    """
    GIVEN:
        - Sample PDF ``simple-digital.pdf``
    WHEN:
        - A thumbnail is requested
    THEN:
        - The returned thumbnail path is an existing file
    """
    ocr_parser = RasterisedDocumentParser(uuid.uuid4())
    pdf_path = str(self.SAMPLE_FILES / "simple-digital.pdf")

    thumbnail_path = ocr_parser.get_thumbnail(pdf_path, "application/pdf")

    self.assertIsFile(thumbnail_path)
|
|
||||||
|
|
||||||
@mock.patch("documents.parsers.run_convert")
|
|
||||||
def test_thumbnail_fallback(self, m) -> None:
    """
    GIVEN:
        - ``documents.parsers.run_convert`` patched so that any input whose
          path contains ".pdf" raises ParseError
        - All other inputs are forwarded to the real run_convert
    WHEN:
        - A thumbnail is requested for ``simple-digital.pdf``
    THEN:
        - The returned thumbnail path is still an existing file
    """

    def convert_unless_pdf(input_file, output_file, **kwargs) -> None:
        if ".pdf" not in str(input_file):
            run_convert(input_file=input_file, output_file=output_file, **kwargs)
            return
        raise ParseError("Does not compute.")

    m.side_effect = convert_unless_pdf

    ocr_parser = RasterisedDocumentParser(uuid.uuid4())
    pdf_path = str(self.SAMPLE_FILES / "simple-digital.pdf")
    thumbnail_path = ocr_parser.get_thumbnail(pdf_path, "application/pdf")

    self.assertIsFile(thumbnail_path)
|
|
||||||
|
|
||||||
def test_thumbnail_encrypted(self) -> None:
    """
    GIVEN:
        - Sample PDF ``encrypted.pdf``
    WHEN:
        - A thumbnail is requested
    THEN:
        - The returned thumbnail path is an existing file
    """
    ocr_parser = RasterisedDocumentParser(uuid.uuid4())
    pdf_path = str(self.SAMPLE_FILES / "encrypted.pdf")

    thumbnail_path = ocr_parser.get_thumbnail(pdf_path, "application/pdf")

    self.assertIsFile(thumbnail_path)
|
|
||||||
|
|
||||||
def test_get_dpi(self) -> None:
    """
    GIVEN:
        - PNG sample ``simple-no-dpi.png``
        - PNG sample ``simple.png``
    WHEN:
        - The DPI of each image is requested
    THEN:
        - None is returned for ``simple-no-dpi.png``
        - 72 is returned for ``simple.png``
    """
    parser = RasterisedDocumentParser(None)

    dpi = parser.get_dpi(str(self.SAMPLE_FILES / "simple-no-dpi.png"))
    # Identity check against None instead of an equality comparison.
    self.assertIsNone(dpi)

    dpi = parser.get_dpi(str(self.SAMPLE_FILES / "simple.png"))
    self.assertEqual(dpi, 72)
|
|
||||||
|
|
||||||
def test_simple_digital(self) -> None:
    """
    GIVEN:
        - Sample PDF ``simple-digital.pdf``
    WHEN:
        - The document is parsed
    THEN:
        - An archive file exists
        - The text contains "This is a test document."
    """
    ocr_parser = RasterisedDocumentParser(None)
    pdf_path = str(self.SAMPLE_FILES / "simple-digital.pdf")

    ocr_parser.parse(pdf_path, "application/pdf")

    self.assertIsFile(ocr_parser.archive_path)
    extracted = ocr_parser.get_text()
    self.assertContainsStrings(extracted, ["This is a test document."])
|
|
||||||
|
|
||||||
def test_with_form(self) -> None:
    """
    GIVEN:
        - Sample PDF ``with-form.pdf``
    WHEN:
        - The document is parsed
    THEN:
        - An archive file exists
        - The text contains the two form-related sentences, in order
    """
    ocr_parser = RasterisedDocumentParser(None)
    pdf_path = str(self.SAMPLE_FILES / "with-form.pdf")

    ocr_parser.parse(pdf_path, "application/pdf")

    self.assertIsFile(ocr_parser.archive_path)
    expected_sentences = [
        "Please enter your name in here:",
        "This is a PDF document with a form.",
    ]
    self.assertContainsStrings(ocr_parser.get_text(), expected_sentences)
|
|
||||||
|
|
||||||
@override_settings(OCR_MODE="redo")
|
|
||||||
def test_with_form_error(self) -> None:
    """
    GIVEN:
        - Sample PDF ``with-form.pdf``
        - OCR_MODE overridden to "redo"
    WHEN:
        - The document is parsed
    THEN:
        - No archive path is set
        - The text contains the two form-related sentences, in order
    """
    ocr_parser = RasterisedDocumentParser(None)
    pdf_path = str(self.SAMPLE_FILES / "with-form.pdf")

    ocr_parser.parse(pdf_path, "application/pdf")

    self.assertIsNone(ocr_parser.archive_path)
    expected_sentences = [
        "Please enter your name in here:",
        "This is a PDF document with a form.",
    ]
    self.assertContainsStrings(ocr_parser.get_text(), expected_sentences)
|
|
||||||
|
|
||||||
@override_settings(OCR_MODE="skip")
|
|
||||||
def test_signed(self) -> None:
    """
    GIVEN:
        - Sample PDF ``signed.pdf``
        - OCR_MODE overridden to "skip"
    WHEN:
        - The document is parsed
    THEN:
        - No archive path is set
        - The text contains the two expected sentences, in order
    """
    ocr_parser = RasterisedDocumentParser(None)
    pdf_path = str(self.SAMPLE_FILES / "signed.pdf")

    ocr_parser.parse(pdf_path, "application/pdf")

    self.assertIsNone(ocr_parser.archive_path)
    expected_sentences = [
        "This is a digitally signed PDF, created with Acrobat Pro for the Paperless project to enable",
        "automated testing of signed/encrypted PDFs",
    ]
    self.assertContainsStrings(ocr_parser.get_text(), expected_sentences)
|
|
||||||
|
|
||||||
@override_settings(OCR_MODE="skip")
|
|
||||||
def test_encrypted(self) -> None:
    """
    GIVEN:
        - Sample PDF ``encrypted.pdf``
        - OCR_MODE overridden to "skip"
    WHEN:
        - The document is parsed
    THEN:
        - No archive path is set
        - The extracted text is the empty string
    """
    ocr_parser = RasterisedDocumentParser(None)
    pdf_path = str(self.SAMPLE_FILES / "encrypted.pdf")

    ocr_parser.parse(pdf_path, "application/pdf")

    self.assertIsNone(ocr_parser.archive_path)
    self.assertEqual(ocr_parser.get_text(), "")
|
|
||||||
|
|
||||||
@override_settings(OCR_MODE="redo")
|
|
||||||
def test_with_form_error_notext(self) -> None:
    """
    GIVEN:
        - Sample PDF ``with-form.pdf``
        - OCR_MODE overridden to "redo"
    WHEN:
        - The document is parsed
    THEN:
        - The text contains the two form-related sentences, in order
    """
    ocr_parser = RasterisedDocumentParser(None)
    pdf_path = str(self.SAMPLE_FILES / "with-form.pdf")
    ocr_parser.parse(pdf_path, "application/pdf")

    expected_sentences = [
        "Please enter your name in here:",
        "This is a PDF document with a form.",
    ]
    self.assertContainsStrings(ocr_parser.get_text(), expected_sentences)
|
|
||||||
|
|
||||||
@override_settings(OCR_MODE="force")
|
|
||||||
def test_with_form_force(self) -> None:
    """
    GIVEN:
        - Sample PDF ``with-form.pdf``
        - OCR_MODE overridden to "force"
    WHEN:
        - The document is parsed
    THEN:
        - The text contains the two form-related sentences, in order
    """
    ocr_parser = RasterisedDocumentParser(None)
    pdf_path = str(self.SAMPLE_FILES / "with-form.pdf")

    ocr_parser.parse(pdf_path, "application/pdf")

    expected_sentences = [
        "Please enter your name in here:",
        "This is a PDF document with a form.",
    ]
    self.assertContainsStrings(ocr_parser.get_text(), expected_sentences)
|
|
||||||
|
|
||||||
def test_image_simple(self) -> None:
    """
    GIVEN:
        - PNG sample ``simple.png``
    WHEN:
        - The image is parsed
    THEN:
        - An archive file exists
        - The text contains "This is a test document."
    """
    ocr_parser = RasterisedDocumentParser(None)
    image_path = str(self.SAMPLE_FILES / "simple.png")

    ocr_parser.parse(image_path, "image/png")

    self.assertIsFile(ocr_parser.archive_path)
    extracted = ocr_parser.get_text()
    self.assertContainsStrings(extracted, ["This is a test document."])
|
|
||||||
|
|
||||||
def test_image_simple_alpha(self) -> None:
    """
    GIVEN:
        - PNG sample ``simple-alpha.png``, copied into a temporary directory
    WHEN:
        - The copy is parsed
    THEN:
        - An archive file exists
        - The text contains "This is a test document."
    """
    ocr_parser = RasterisedDocumentParser(None)

    with tempfile.TemporaryDirectory() as tempdir:
        # Parsing modifies the input file; working on a copy keeps the
        # checked-in sample from showing up as changed in Git.
        original = self.SAMPLE_FILES / "simple-alpha.png"
        working_copy = Path(tempdir) / "simple-alpha.png"
        shutil.copy(original, working_copy)

        ocr_parser.parse(str(working_copy), "image/png")

        self.assertIsFile(ocr_parser.archive_path)
        extracted = ocr_parser.get_text()
        self.assertContainsStrings(extracted, ["This is a test document."])
|
|
||||||
|
|
||||||
def test_image_calc_a4_dpi(self) -> None:
    """
    GIVEN:
        - PNG sample ``simple-no-dpi.png``
    WHEN:
        - calculate_a4_dpi is called for the image
    THEN:
        - 62 is returned
    """
    ocr_parser = RasterisedDocumentParser(None)
    image_path = str(self.SAMPLE_FILES / "simple-no-dpi.png")

    computed_dpi = ocr_parser.calculate_a4_dpi(image_path)

    self.assertEqual(computed_dpi, 62)
|
|
||||||
|
|
||||||
@mock.patch("paperless_tesseract.parsers.RasterisedDocumentParser.calculate_a4_dpi")
|
|
||||||
def test_image_dpi_fail(self, m) -> None:
    """
    GIVEN:
        - PNG sample ``simple-no-dpi.png``
        - calculate_a4_dpi patched to return None
    WHEN:
        - The image is parsed
    THEN:
        - ParseError is raised
    """
    m.return_value = None
    ocr_parser = RasterisedDocumentParser(None)

    with self.assertRaises(ParseError):
        ocr_parser.parse(
            str(self.SAMPLE_FILES / "simple-no-dpi.png"),
            "image/png",
        )
|
|
||||||
|
|
||||||
@override_settings(OCR_IMAGE_DPI=72, MAX_IMAGE_PIXELS=0)
|
|
||||||
def test_image_no_dpi_default(self) -> None:
    """
    GIVEN:
        - PNG sample ``simple-no-dpi.png``
        - OCR_IMAGE_DPI overridden to 72 and MAX_IMAGE_PIXELS to 0
    WHEN:
        - The image is parsed
    THEN:
        - An archive file exists
        - The lower-cased text contains "this is a test document."
    """
    ocr_parser = RasterisedDocumentParser(None)
    image_path = str(self.SAMPLE_FILES / "simple-no-dpi.png")

    ocr_parser.parse(image_path, "image/png")

    self.assertIsFile(ocr_parser.archive_path)
    lowered = ocr_parser.get_text().lower()
    self.assertContainsStrings(lowered, ["this is a test document."])
|
|
||||||
|
|
||||||
def test_multi_page(self) -> None:
    """
    GIVEN:
        - Sample PDF ``multi-page-digital.pdf``
    WHEN:
        - The document is parsed
    THEN:
        - An archive file exists
        - The lower-cased text contains "page 1", "page 2", "page 3" in order
    """
    ocr_parser = RasterisedDocumentParser(None)
    pdf_path = str(self.SAMPLE_FILES / "multi-page-digital.pdf")
    ocr_parser.parse(pdf_path, "application/pdf")

    self.assertIsFile(ocr_parser.archive_path)
    lowered = ocr_parser.get_text().lower()
    self.assertContainsStrings(lowered, ["page 1", "page 2", "page 3"])
|
|
||||||
|
|
||||||
@override_settings(OCR_PAGES=2, OCR_MODE="skip")
|
|
||||||
def test_multi_page_pages_skip(self) -> None:
    """
    GIVEN:
        - Sample PDF ``multi-page-digital.pdf``
        - OCR_PAGES overridden to 2 and OCR_MODE to "skip"
    WHEN:
        - The document is parsed
    THEN:
        - An archive file exists
        - The lower-cased text contains "page 1", "page 2", "page 3" in order
    """
    ocr_parser = RasterisedDocumentParser(None)
    pdf_path = str(self.SAMPLE_FILES / "multi-page-digital.pdf")
    ocr_parser.parse(pdf_path, "application/pdf")

    self.assertIsFile(ocr_parser.archive_path)
    lowered = ocr_parser.get_text().lower()
    self.assertContainsStrings(lowered, ["page 1", "page 2", "page 3"])
|
|
||||||
|
|
||||||
@override_settings(OCR_PAGES=2, OCR_MODE="redo")
|
|
||||||
def test_multi_page_pages_redo(self) -> None:
    """
    GIVEN:
        - Sample PDF ``multi-page-digital.pdf``
        - OCR_PAGES overridden to 2 and OCR_MODE to "redo"
    WHEN:
        - The document is parsed
    THEN:
        - An archive file exists
        - The lower-cased text contains "page 1", "page 2", "page 3" in order
    """
    ocr_parser = RasterisedDocumentParser(None)
    pdf_path = str(self.SAMPLE_FILES / "multi-page-digital.pdf")
    ocr_parser.parse(pdf_path, "application/pdf")

    self.assertIsFile(ocr_parser.archive_path)
    lowered = ocr_parser.get_text().lower()
    self.assertContainsStrings(lowered, ["page 1", "page 2", "page 3"])
|
|
||||||
|
|
||||||
@override_settings(OCR_PAGES=2, OCR_MODE="force")
|
|
||||||
def test_multi_page_pages_force(self) -> None:
    """
    GIVEN:
        - Sample PDF ``multi-page-digital.pdf``
        - OCR_PAGES overridden to 2 and OCR_MODE to "force"
    WHEN:
        - The document is parsed
    THEN:
        - An archive file exists
        - The lower-cased text contains "page 1", "page 2", "page 3" in order
    """
    ocr_parser = RasterisedDocumentParser(None)
    pdf_path = str(self.SAMPLE_FILES / "multi-page-digital.pdf")
    ocr_parser.parse(pdf_path, "application/pdf")

    self.assertIsFile(ocr_parser.archive_path)
    lowered = ocr_parser.get_text().lower()
    self.assertContainsStrings(lowered, ["page 1", "page 2", "page 3"])
|
|
||||||
|
|
||||||
@override_settings(OCR_MODE="skip")
|
|
||||||
def test_multi_page_analog_pages_skip(self) -> None:
    """
    GIVEN:
        - Sample PDF ``multi-page-images.pdf``
        - OCR_MODE overridden to "skip"
    WHEN:
        - The document is parsed
    THEN:
        - An archive file exists
        - The lower-cased text contains "page 1", "page 2", "page 3" in order
    """
    ocr_parser = RasterisedDocumentParser(None)
    pdf_path = str(self.SAMPLE_FILES / "multi-page-images.pdf")
    ocr_parser.parse(pdf_path, "application/pdf")

    self.assertIsFile(ocr_parser.archive_path)
    lowered = ocr_parser.get_text().lower()
    self.assertContainsStrings(lowered, ["page 1", "page 2", "page 3"])
|
|
||||||
|
|
||||||
@override_settings(OCR_PAGES=2, OCR_MODE="redo")
|
|
||||||
def test_multi_page_analog_pages_redo(self) -> None:
    """
    GIVEN:
        - A file whose text lives in images, with no text layer
        - OCR restricted to pages 1 and 2 (OCR_PAGES=2)
        - OCR_MODE overridden to "redo"
    WHEN:
        - The document is parsed
    THEN:
        - An archive file exists
        - Text of pages 1 and 2 is present
        - "page 3" does not appear in the text
    """
    ocr_parser = RasterisedDocumentParser(None)
    pdf_path = str(self.SAMPLE_FILES / "multi-page-images.pdf")
    ocr_parser.parse(pdf_path, "application/pdf")

    self.assertIsFile(ocr_parser.archive_path)
    self.assertContainsStrings(
        ocr_parser.get_text().lower(),
        ["page 1", "page 2"],
    )
    self.assertNotIn("page 3", ocr_parser.get_text().lower())
|
|
||||||
|
|
||||||
@override_settings(OCR_PAGES=1, OCR_MODE="force")
|
|
||||||
def test_multi_page_analog_pages_force(self) -> None:
    """
    GIVEN:
        - A file whose text lives in images, with no text layer
        - OCR restricted to page 1 (OCR_PAGES=1)
        - OCR_MODE overridden to "force"
    WHEN:
        - The document is parsed
    THEN:
        - An archive file exists
        - Text of page 1 is present
        - Neither "page 2" nor "page 3" appears in the text
    """
    ocr_parser = RasterisedDocumentParser(None)
    pdf_path = str(self.SAMPLE_FILES / "multi-page-images.pdf")
    ocr_parser.parse(pdf_path, "application/pdf")

    self.assertIsFile(ocr_parser.archive_path)
    self.assertContainsStrings(ocr_parser.get_text().lower(), ["page 1"])
    for absent_marker in ("page 2", "page 3"):
        self.assertNotIn(absent_marker, ocr_parser.get_text().lower())
|
|
||||||
|
|
||||||
@override_settings(OCR_MODE="skip_noarchive")
|
|
||||||
def test_skip_noarchive_withtext(self) -> None:
    """
    GIVEN:
        - A file that already has a text layer
        - OCR_MODE overridden to "skip_noarchive"
    WHEN:
        - The document is parsed
    THEN:
        - No archive path is set
        - The lower-cased text contains "page 1", "page 2", "page 3" in order
    """
    ocr_parser = RasterisedDocumentParser(None)
    pdf_path = str(self.SAMPLE_FILES / "multi-page-digital.pdf")
    ocr_parser.parse(pdf_path, "application/pdf")

    self.assertIsNone(ocr_parser.archive_path)
    lowered = ocr_parser.get_text().lower()
    self.assertContainsStrings(lowered, ["page 1", "page 2", "page 3"])
|
|
||||||
|
|
||||||
@override_settings(OCR_MODE="skip_noarchive")
|
|
||||||
def test_skip_noarchive_notext(self) -> None:
    """
    GIVEN:
        - A file whose text lives in images, with no text layer
        - OCR_MODE overridden to "skip_noarchive"
    WHEN:
        - The document is parsed
    THEN:
        - The lower-cased text contains "page 1", "page 2", "page 3" in order
        - An archive path is set
    """
    ocr_parser = RasterisedDocumentParser(None)
    pdf_path = str(self.SAMPLE_FILES / "multi-page-images.pdf")
    ocr_parser.parse(pdf_path, "application/pdf")

    lowered = ocr_parser.get_text().lower()
    self.assertContainsStrings(lowered, ["page 1", "page 2", "page 3"])

    self.assertIsNotNone(ocr_parser.archive_path)
|
|
||||||
|
|
||||||
@override_settings(OCR_SKIP_ARCHIVE_FILE="never")
|
|
||||||
def test_skip_archive_never_withtext(self) -> None:
    """
    GIVEN:
        - A file that already has a text layer
        - OCR_SKIP_ARCHIVE_FILE overridden to "never"
    WHEN:
        - The document is parsed
    THEN:
        - An archive path is set
        - The lower-cased text contains "page 1", "page 2", "page 3" in order
    """
    ocr_parser = RasterisedDocumentParser(None)
    pdf_path = str(self.SAMPLE_FILES / "multi-page-digital.pdf")
    ocr_parser.parse(pdf_path, "application/pdf")

    self.assertIsNotNone(ocr_parser.archive_path)
    lowered = ocr_parser.get_text().lower()
    self.assertContainsStrings(lowered, ["page 1", "page 2", "page 3"])
|
|
||||||
|
|
||||||
@override_settings(OCR_SKIP_ARCHIVE_FILE="never")
|
|
||||||
def test_skip_archive_never_withimages(self) -> None:
    """
    GIVEN:
        - A file whose text lives in images, with no text layer
        - OCR_SKIP_ARCHIVE_FILE overridden to "never"
    WHEN:
        - The document is parsed
    THEN:
        - An archive path is set
        - The lower-cased text contains "page 1", "page 2", "page 3" in order
    """
    ocr_parser = RasterisedDocumentParser(None)
    pdf_path = str(self.SAMPLE_FILES / "multi-page-images.pdf")
    ocr_parser.parse(pdf_path, "application/pdf")

    self.assertIsNotNone(ocr_parser.archive_path)
    lowered = ocr_parser.get_text().lower()
    self.assertContainsStrings(lowered, ["page 1", "page 2", "page 3"])
|
|
||||||
|
|
||||||
@override_settings(OCR_SKIP_ARCHIVE_FILE="with_text")
|
|
||||||
def test_skip_archive_withtext_withtext(self) -> None:
    """
    GIVEN:
        - A file that already has a text layer
        - OCR_SKIP_ARCHIVE_FILE overridden to "with_text"
    WHEN:
        - The document is parsed
    THEN:
        - No archive path is set
        - The lower-cased text contains "page 1", "page 2", "page 3" in order
    """
    ocr_parser = RasterisedDocumentParser(None)
    pdf_path = str(self.SAMPLE_FILES / "multi-page-digital.pdf")
    ocr_parser.parse(pdf_path, "application/pdf")

    self.assertIsNone(ocr_parser.archive_path)
    lowered = ocr_parser.get_text().lower()
    self.assertContainsStrings(lowered, ["page 1", "page 2", "page 3"])
|
|
||||||
|
|
||||||
@override_settings(OCR_SKIP_ARCHIVE_FILE="with_text")
def test_skip_archive_withtext_withimages(self) -> None:
    """
    GIVEN:
        - A PDF whose text lives only in images (no text layer)
        - OCR_SKIP_ARCHIVE_FILE is "with_text"
    WHEN:
        - The document is parsed
    THEN:
        - The text in the images is extracted
        - An archive file is produced
    """
    source_pdf = self.SAMPLE_FILES / "multi-page-images.pdf"
    expected_pages = ["page 1", "page 2", "page 3"]

    ocr_parser = RasterisedDocumentParser(None)
    ocr_parser.parse(str(source_pdf), "application/pdf")

    self.assertIsNotNone(ocr_parser.archive_path)
    extracted = ocr_parser.get_text().lower()
    self.assertContainsStrings(extracted, expected_pages)
|
|
||||||
|
|
||||||
@override_settings(OCR_SKIP_ARCHIVE_FILE="always")
def test_skip_archive_always_withtext(self) -> None:
    """
    GIVEN:
        - A PDF that already has a text layer
        - OCR_SKIP_ARCHIVE_FILE is "always"
    WHEN:
        - The document is parsed
    THEN:
        - The text layer content is extracted
        - No archive file is produced
    """
    source_pdf = self.SAMPLE_FILES / "multi-page-digital.pdf"
    expected_pages = ["page 1", "page 2", "page 3"]

    ocr_parser = RasterisedDocumentParser(None)
    ocr_parser.parse(str(source_pdf), "application/pdf")

    self.assertIsNone(ocr_parser.archive_path)
    extracted = ocr_parser.get_text().lower()
    self.assertContainsStrings(extracted, expected_pages)
|
|
||||||
|
|
||||||
@override_settings(OCR_SKIP_ARCHIVE_FILE="always")
def test_skip_archive_always_withimages(self) -> None:
    """
    GIVEN:
        - A PDF whose text lives only in images (no text layer)
        - OCR_SKIP_ARCHIVE_FILE is "always"
    WHEN:
        - The document is parsed
    THEN:
        - The text in the images is extracted
        - No archive file is produced
    """
    source_pdf = self.SAMPLE_FILES / "multi-page-images.pdf"
    expected_pages = ["page 1", "page 2", "page 3"]

    ocr_parser = RasterisedDocumentParser(None)
    ocr_parser.parse(str(source_pdf), "application/pdf")

    self.assertIsNone(ocr_parser.archive_path)
    extracted = ocr_parser.get_text().lower()
    self.assertContainsStrings(extracted, expected_pages)
|
|
||||||
|
|
||||||
@override_settings(OCR_MODE="skip")
def test_multi_page_mixed(self) -> None:
    """
    GIVEN:
        - A PDF where some pages carry text in images and others in a text layer
        - OCR mode is "skip"
    WHEN:
        - The document is parsed
    THEN:
        - The text in the images is extracted
        - An archive file exists containing both the OCRd and the original text
        - The sidecar file records that OCR was skipped on pages 4-6
    """
    source_pdf = self.SAMPLE_FILES / "multi-page-mixed.pdf"

    ocr_parser = RasterisedDocumentParser(None)
    ocr_parser.parse(str(source_pdf), "application/pdf")

    self.assertIsNotNone(ocr_parser.archive_path)
    self.assertIsFile(ocr_parser.archive_path)

    extracted = ocr_parser.get_text().lower()
    self.assertContainsStrings(
        extracted,
        ["page 1", "page 2", "page 3", "page 4", "page 5", "page 6"],
    )

    sidecar_text = (ocr_parser.tempdir / "sidecar.txt").read_text()
    self.assertIn("[OCR skipped on page(s) 4-6]", sidecar_text)
|
|
||||||
|
|
||||||
@override_settings(OCR_MODE="redo")
def test_single_page_mixed(self) -> None:
    """
    GIVEN:
        - A PDF with text both in images and in a text layer
        - Both kinds of text share the same page
        - OCR mode is "redo"
    WHEN:
        - The document is parsed
    THEN:
        - The text in the images is extracted
        - The complete page content is returned, not only the image text
        - An archive file exists containing both the OCRd and the original text
        - The sidecar file holds the image text but not the text-layer text
    """
    layer_sentence = "this is some normal text, present on page 1 of the document."
    image_sentence = "this is some text, but in an image, also on page 1."
    trailing_sentence = "this is further text on page 1."

    source_pdf = self.SAMPLE_FILES / "single-page-mixed.pdf"
    ocr_parser = RasterisedDocumentParser(None)
    ocr_parser.parse(str(source_pdf), "application/pdf")

    self.assertIsNotNone(ocr_parser.archive_path)
    self.assertIsFile(ocr_parser.archive_path)

    extracted = ocr_parser.get_text().lower()
    self.assertContainsStrings(
        extracted,
        [layer_sentence, image_sentence, trailing_sentence],
    )

    sidecar_text = (ocr_parser.tempdir / "sidecar.txt").read_text().lower()

    self.assertIn(image_sentence, sidecar_text)
    self.assertNotIn(layer_sentence, sidecar_text)
|
|
||||||
|
|
||||||
@override_settings(OCR_MODE="skip_noarchive")
def test_multi_page_mixed_no_archive(self) -> None:
    """
    GIVEN:
        - A PDF where some pages carry text in images and others in a text layer
        - OCR mode is "skip_noarchive"
    WHEN:
        - The document is parsed
    THEN:
        - The text of pages 4-6 is present in the result
        - No archive file is produced, since the original already contains text
    """
    source_pdf = self.SAMPLE_FILES / "multi-page-mixed.pdf"
    expected_pages = ["page 4", "page 5", "page 6"]

    ocr_parser = RasterisedDocumentParser(None)
    ocr_parser.parse(str(source_pdf), "application/pdf")

    self.assertIsNone(ocr_parser.archive_path)
    extracted = ocr_parser.get_text().lower()
    self.assertContainsStrings(extracted, expected_pages)
|
|
||||||
|
|
||||||
@override_settings(OCR_MODE="skip", OCR_ROTATE_PAGES=True)
def test_rotate(self) -> None:
    """
    GIVEN:
        - The "rotated.pdf" sample
        - OCR mode is "skip" and OCR_ROTATE_PAGES is enabled
    WHEN:
        - The document is parsed
    THEN:
        - Every expected sentence is found in the extracted text
    """
    expected_sentences = [
        "This is the text that appears on the first page. It’s a lot of text.",
        "Even if the pages are rotated, OCRmyPDF still gets the job done.",
        "This is a really weird file with lots of nonsense text.",
        "If you read this, it’s your own fault. Also check your screen orientation.",
    ]
    source_pdf = self.SAMPLE_FILES / "rotated.pdf"

    ocr_parser = RasterisedDocumentParser(None)
    ocr_parser.parse(str(source_pdf), "application/pdf")

    # Compared case-sensitively: the text is not lowercased here.
    self.assertContainsStrings(ocr_parser.get_text(), expected_sentences)
|
|
||||||
|
|
||||||
def test_multi_page_tiff(self) -> None:
    """
    GIVEN:
        - A multi-page TIFF image
    WHEN:
        - The image is parsed
    THEN:
        - An archive file exists
        - The text of every page is extracted
    """
    source_image = self.SAMPLE_FILES / "multi-page-images.tiff"
    expected_pages = ["page 1", "page 2", "page 3"]

    ocr_parser = RasterisedDocumentParser(None)
    ocr_parser.parse(str(source_image), "image/tiff")

    self.assertIsFile(ocr_parser.archive_path)
    extracted = ocr_parser.get_text().lower()
    self.assertContainsStrings(extracted, expected_pages)
|
|
||||||
|
|
||||||
def test_multi_page_tiff_alpha(self) -> None:
    """
    GIVEN:
        - A multi-page TIFF image
        - The image includes an alpha channel
    WHEN:
        - The image is parsed
    THEN:
        - An archive file exists
        - The text of every page is extracted
    """
    original = self.SAMPLE_FILES / "multi-page-images-alpha.tiff"
    expected_pages = ["page 1", "page 2", "page 3"]
    ocr_parser = RasterisedDocumentParser(None)

    # The parser is pointed at a temporary copy rather than the sample itself.
    with tempfile.NamedTemporaryFile() as scratch:
        scratch_path = scratch.name
        shutil.copy(original, scratch_path)
        ocr_parser.parse(scratch_path, "image/tiff")

        self.assertIsFile(ocr_parser.archive_path)
        extracted = ocr_parser.get_text().lower()
        self.assertContainsStrings(extracted, expected_pages)
|
|
||||||
|
|
||||||
def test_multi_page_tiff_alpha_srgb(self) -> None:
    """
    GIVEN:
        - A multi-page TIFF image
        - The image includes an alpha channel
        - The image uses the sRGB colorspace
    WHEN:
        - The image is parsed
    THEN:
        - An archive file exists
        - The text of every page is extracted
    """
    original = str(self.SAMPLE_FILES / "multi-page-images-alpha-rgb.tiff")
    expected_pages = ["page 1", "page 2", "page 3"]
    ocr_parser = RasterisedDocumentParser(None)

    # The parser is pointed at a temporary copy rather than the sample itself.
    with tempfile.NamedTemporaryFile() as scratch:
        scratch_path = scratch.name
        shutil.copy(original, scratch_path)
        ocr_parser.parse(scratch_path, "image/tiff")

        self.assertIsFile(ocr_parser.archive_path)
        extracted = ocr_parser.get_text().lower()
        self.assertContainsStrings(extracted, expected_pages)
|
|
||||||
|
|
||||||
def test_ocrmypdf_parameters(self) -> None:
    """
    GIVEN:
        - Different combinations of OCR-related settings
    WHEN:
        - The OCRmyPDF parameters are constructed
    THEN:
        - File arguments are passed through under the expected keys
        - clean / clean_final / deskew / max_image_mpixels appear or are
          absent according to the active settings
    """

    def build_with_blank_args():
        # A new parser per call, so it is constructed while the caller's
        # settings override is active.
        fresh_parser = RasterisedDocumentParser(None)
        return fresh_parser.construct_ocrmypdf_parameters("", "", "", "")

    explicit = RasterisedDocumentParser(None).construct_ocrmypdf_parameters(
        input_file="input.pdf",
        output_file="output.pdf",
        sidecar_file="sidecar.txt",
        mime_type="application/pdf",
        safe_fallback=False,
    )
    self.assertEqual(explicit["input_file_or_options"], "input.pdf")
    self.assertEqual(explicit["output_file"], "output.pdf")
    self.assertEqual(explicit["sidecar"], "sidecar.txt")

    with override_settings(OCR_CLEAN="none"):
        options = build_with_blank_args()
        self.assertNotIn("clean", options)
        self.assertNotIn("clean_final", options)

    with override_settings(OCR_CLEAN="clean"):
        options = build_with_blank_args()
        self.assertTrue(options["clean"])
        self.assertNotIn("clean_final", options)

    with override_settings(OCR_CLEAN="clean-final", OCR_MODE="skip"):
        options = build_with_blank_args()
        self.assertTrue(options["clean_final"])
        self.assertNotIn("clean", options)

    # With mode "redo", "clean-final" is expected to yield plain "clean".
    with override_settings(OCR_CLEAN="clean-final", OCR_MODE="redo"):
        options = build_with_blank_args()
        self.assertTrue(options["clean"])
        self.assertNotIn("clean_final", options)

    with override_settings(OCR_DESKEW=True, OCR_MODE="skip"):
        options = build_with_blank_args()
        self.assertTrue(options["deskew"])

    # With mode "redo", deskew is expected to be left out even when enabled.
    with override_settings(OCR_DESKEW=True, OCR_MODE="redo"):
        options = build_with_blank_args()
        self.assertNotIn("deskew", options)

    with override_settings(OCR_DESKEW=False, OCR_MODE="skip"):
        options = build_with_blank_args()
        self.assertNotIn("deskew", options)

    with override_settings(OCR_MAX_IMAGE_PIXELS=1_000_001.0):
        options = build_with_blank_args()
        self.assertIn("max_image_mpixels", options)
        self.assertAlmostEqual(options["max_image_mpixels"], 1, places=4)

    # A negative pixel limit is expected to produce no limit parameter.
    with override_settings(OCR_MAX_IMAGE_PIXELS=-1_000_001.0):
        options = build_with_blank_args()
        self.assertNotIn("max_image_mpixels", options)
|
|
||||||
|
|
||||||
def test_rtl_language_detection(self) -> None:
    """
    GIVEN:
        - A PDF containing text in an RTL language
    WHEN:
        - The document is parsed
    THEN:
        - The document text is extracted
    """
    source_pdf = self.SAMPLE_FILES / "rtl-test.pdf"
    ocr_parser = RasterisedDocumentParser(None)
    ocr_parser.parse(str(source_pdf), "application/pdf")

    # OCR output for RTL text differs between platforms/versions because of
    # bidi controls and presentation forms, so normalize it first: apply
    # NFKC, then drop format ("Cf") characters and all whitespace.
    kept_chars = []
    for symbol in unicodedata.normalize("NFKC", ocr_parser.get_text()):
        if unicodedata.category(symbol) == "Cf" or symbol.isspace():
            continue
        kept_chars.append(symbol)
    normalized_text = "".join(kept_chars)

    self.assertIn("ةرازو", normalized_text)

    # Either spelling of the second word is accepted.
    accepted_spellings = ("ةیلخادلا", "الاخليد")
    found = any(token in normalized_text for token in accepted_spellings)
    self.assertTrue(found)
|
|
||||||
|
|
||||||
@mock.patch("ocrmypdf.ocr")
def test_gs_rendering_error(self, m) -> None:
    """
    GIVEN:
        - ocrmypdf.ocr is patched to raise a SubprocessOutputError
    WHEN:
        - A PDF is parsed
    THEN:
        - The parser raises a ParseError
    """
    m.side_effect = SubprocessOutputError("Ghostscript PDF/A rendering failed")
    ocr_parser = RasterisedDocumentParser(None)
    source_pdf = str(self.SAMPLE_FILES / "simple-digital.pdf")

    with self.assertRaises(ParseError):
        ocr_parser.parse(source_pdf, "application/pdf")
|
|
||||||
|
|
||||||
|
|
||||||
class TestParserFileTypes(DirectoriesMixin, FileSystemAssertsMixin, TestCase):
    """Parses one sample file per image format and checks the extracted text."""

    SAMPLE_FILES = Path(__file__).parent / "samples"

    def _parse_sample(self, file_name: str, mime_type: str) -> str:
        """
        Parse a sample file, assert that an archive file exists, and return
        the extracted text in lowercase.
        """
        ocr_parser = RasterisedDocumentParser(None)
        ocr_parser.parse(str(self.SAMPLE_FILES / file_name), mime_type)
        self.assertIsFile(ocr_parser.archive_path)
        return ocr_parser.get_text().lower()

    def test_bmp(self) -> None:
        """A BMP sample yields an archive file and the expected text."""
        extracted = self._parse_sample("simple.bmp", "image/bmp")
        self.assertIn("this is a test document", extracted)

    def test_jpg(self) -> None:
        """A JPEG sample yields an archive file and the expected text."""
        extracted = self._parse_sample("simple.jpg", "image/jpeg")
        self.assertIn("this is a test document", extracted)

    def test_heic(self) -> None:
        """A HEIC sample yields an archive file and the expected text."""
        extracted = self._parse_sample("simple.heic", "image/heic")
        self.assertIn("pizza", extracted)

    @override_settings(OCR_IMAGE_DPI=200)
    def test_gif(self) -> None:
        """A GIF sample, parsed with OCR_IMAGE_DPI=200, yields the expected text."""
        extracted = self._parse_sample("simple.gif", "image/gif")
        self.assertIn("this is a test document", extracted)

    def test_tiff(self) -> None:
        """A TIFF sample yields an archive file and the expected text."""
        extracted = self._parse_sample("simple.tif", "image/tiff")
        self.assertIn("this is a test document", extracted)

    @override_settings(OCR_IMAGE_DPI=72)
    def test_webp(self) -> None:
        """A WebP sample, parsed with OCR_IMAGE_DPI=72, yields the expected text."""
        extracted = self._parse_sample("document.webp", "image/webp")
        # Older tesseract versions consistently mangle the space in "a webp";
        # tesseract 5.3.0 seems to handle it better, so both forms are accepted.
        self.assertRegex(
            extracted,
            r"this is a ?webp document, created 11/14/2022.",
        )
|
|
||||||