mirror of
https://github.com/paperless-ngx/paperless-ngx.git
synced 2026-07-03 10:44:18 +00:00
77 lines
2.4 KiB
Python
77 lines
2.4 KiB
Python
import os
|
|
from unittest import mock, skipIf
|
|
|
|
import pyocr
|
|
from django.test import TestCase
|
|
from pyocr.libtesseract.tesseract_raw import \
|
|
TesseractError as OtherTesseractError
|
|
|
|
from ..parsers import image_to_string, strip_excess_whitespace
|
|
|
|
|
|
class FakeTesseract:
    """
    Stand-in for an OCR tool as handed out by ``FakePyOcr``.

    It claims to support orientation detection, but every attempt to detect
    the orientation fails with ``OtherTesseractError``; text extraction
    always yields the same canned string.
    """

    @staticmethod
    def image_to_string(file_handle, lang):
        """Ignore both arguments and return a fixed piece of text."""
        return "This is test text"

    @staticmethod
    def detect_orientation(file_handle, lang):
        """Always fail, raising ``OtherTesseractError``."""
        raise OtherTesseractError("arbitrary status", "message")

    @staticmethod
    def can_detect_orientation():
        """Report that orientation detection is available."""
        return True
|
|
|
|
|
|
class FakePyOcr:
    """
    Replacement for the ``pyocr`` module object used when patching
    ``paperless_tesseract.parsers.pyocr`` in the tests below.
    """

    @staticmethod
    def get_available_tools():
        """Return a one-element list containing the ``FakeTesseract`` stub."""
        tools = [FakeTesseract]
        return tools
|
|
|
|
|
|
class TestOCR(TestCase):
    """Tests for ``strip_excess_whitespace`` and ``image_to_string``."""

    # (input, expected output) pairs fed to strip_excess_whitespace.
    text_cases = [
        ("simple string", "simple string"),
        (
            "simple newline\n testing string",
            "simple newline\ntesting string"
        ),
        (
            "utf-8 строка с пробелами в конце ",
            "utf-8 строка с пробелами в конце"
        )
    ]

    # "samples" directory located next to this test module.
    SAMPLE_FILES = os.path.join(os.path.dirname(__file__), "samples")

    # Evaluated once, at class-creation time, against the real pyocr module
    # (i.e. before any mock.patch below is active).
    TESSERACT_INSTALLED = bool(pyocr.get_available_tools())

    def test_strip_excess_whitespace(self):
        """
        Check every ``text_cases`` pair against ``strip_excess_whitespace``.

        Each pair runs in its own ``subTest`` so that one failing case does
        not prevent the remaining cases from being checked and reported.
        """
        for source, result in self.text_cases:
            with self.subTest(source=source):
                actual_result = strip_excess_whitespace(source)
                self.assertEqual(
                    result,
                    actual_result,
                    "strip_excess_whitespace({}) != '{}', but '{}'".format(
                        source,
                        result,
                        actual_result
                    )
                )

    @skipIf(not TESSERACT_INSTALLED, "Tesseract not installed. Skipping")
    @mock.patch("paperless_tesseract.parsers.pyocr", FakePyOcr)
    def test_image_to_string_with_text_free_page(self):
        """
        This test is sort of silly, since it's really just reproducing an odd
        exception thrown by pyocr when it encounters a page with no text.
        Actually running this test against an installation of Tesseract results
        in a segmentation fault rooted somewhere deep inside pyocr where I
        don't care to dig.  Regardless, if you run the consumer normally,
        text-free pages are now handled correctly so long as we work around
        this weird exception.

        Note that the return value is not inspected: the test passes as long
        as the call below does not raise.
        """
        # image_to_string receives a single [path, language] sequence.
        image_to_string([os.path.join(self.SAMPLE_FILES, "no-text.png"), "en"])
|