Compare commits

..

10 Commits

Author SHA1 Message Date
shamoon
e5afbccffc Lint 2026-03-17 11:48:53 -07:00
shamoon
b8faae72ab Update tests 2026-03-17 11:46:44 -07:00
shamoon
8cff99bef3 Update __init__.py 2026-03-17 11:43:22 -07:00
shamoon
b2bbc2c0ac Basic option selection 2026-03-17 11:42:17 -07:00
shamoon
03c71c604f Retry action, basic frontend, cleanup handler 2026-03-17 11:39:53 -07:00
shamoon
fe89ff760b Move it out of consumer 2026-03-17 11:35:52 -07:00
shamoon
83eabbdf63 Try this 2026-03-17 11:35:11 -07:00
shamoon
24da26959d Update consumer.py 2026-03-17 11:34:12 -07:00
shamoon
220267099a Fix tests 2026-03-17 11:34:11 -07:00
shamoon
0f1a529b51 Messing around 2026-03-17 11:33:01 -07:00
36 changed files with 492 additions and 651 deletions

View File

@@ -157,9 +157,6 @@ updates:
postgres:
patterns:
- "docker.io/library/postgres*"
greenmail:
patterns:
- "docker.io/greenmail*"
- package-ecosystem: "pre-commit" # See documentation for possible values
directory: "/" # Location of package manifests
schedule:

View File

@@ -50,7 +50,7 @@ repos:
- 'prettier-plugin-organize-imports@4.3.0'
# Python hooks
- repo: https://github.com/astral-sh/ruff-pre-commit
rev: v0.15.6
rev: v0.15.5
hooks:
- id: ruff-check
- id: ruff-format

View File

@@ -18,13 +18,13 @@ services:
- "--log-level=warn"
- "--log-format=text"
tika:
image: docker.io/apache/tika:3.2.3.0
image: docker.io/apache/tika:latest
hostname: tika
container_name: tika
network_mode: host
restart: unless-stopped
greenmail:
image: docker.io/greenmail/standalone:2.1.8
image: greenmail/standalone:2.1.8
hostname: greenmail
container_name: greenmail
environment:

View File

@@ -14,6 +14,7 @@
# Paths and folders
#PAPERLESS_CONSUMPTION_DIR=../consume
#PAPERLESS_CONSUMPTION_FAILED_DIR=../consume/failed
#PAPERLESS_DATA_DIR=../data
#PAPERLESS_EMPTY_TRASH_DIR=
#PAPERLESS_MEDIA_ROOT=../media

View File

@@ -112,6 +112,9 @@
</td>
<td scope="row">
<div class="btn-group" role="group">
@if (task.status === PaperlessTaskStatus.Failed) {
<ng-container *ngTemplateOutlet="retryDropdown; context: { task: task }"></ng-container>
}
<button class="btn btn-sm btn-outline-secondary" (click)="dismissTask(task); $event.stopPropagation();" *pngxIfPermissions="{ action: PermissionAction.Change, type: PermissionType.PaperlessTask }">
<i-bs name="check" class="me-1"></i-bs><ng-container i18n>Dismiss</ng-container>
</button>
@@ -184,3 +187,25 @@
</li>
</ul>
<div [ngbNavOutlet]="nav"></div>
<ng-template #retryDropdown let-task="task">
<div ngbDropdown>
<button class="btn btn-sm btn-outline-primary" (click)="$event.stopImmediatePropagation()" ngbDropdownToggle>
<i-bs name="arrow-repeat"></i-bs>&nbsp;<ng-container i18n>Retry</ng-container>
</button>
<div ngbDropdownMenu class="shadow retry-dropdown">
<div class="p-2">
<ul class="list-group list-group-flush">
<li class="list-group-item small" i18n>
<pngx-input-check [(ngModel)]="retryClean" i18n-title title="Attempt to clean pdf"></pngx-input-check>
</li>
</ul>
<div class="d-flex justify-content-end">
<button class="btn btn-sm btn-outline-primary" (click)="retryTask(task); $event.stopPropagation();">
<ng-container i18n>Proceed</ng-container>
</button>
</div>
</div>
</div>
</div>
</ng-template>

View File

@@ -37,3 +37,7 @@ pre {
.z-10 {
z-index: 10;
}
.retry-dropdown {
width: 300px;
}

View File

@@ -16,7 +16,7 @@ import {
NgbNavItem,
} from '@ng-bootstrap/ng-bootstrap'
import { allIcons, NgxBootstrapIconsModule } from 'ngx-bootstrap-icons'
import { throwError } from 'rxjs'
import { of, throwError } from 'rxjs'
import { routes } from 'src/app/app-routing.module'
import {
PaperlessTask,
@@ -32,6 +32,7 @@ import { TasksService } from 'src/app/services/tasks.service'
import { ToastService } from 'src/app/services/toast.service'
import { environment } from 'src/environments/environment'
import { ConfirmDialogComponent } from '../../common/confirm-dialog/confirm-dialog.component'
import { CheckComponent } from '../../common/input/check/check.component'
import { PageHeaderComponent } from '../../common/page-header/page-header.component'
import { TasksComponent, TaskTab } from './tasks.component'
@@ -138,6 +139,7 @@ describe('TasksComponent', () => {
PageHeaderComponent,
IfPermissionsDirective,
CustomDatePipe,
CheckComponent,
ConfirmDialogComponent,
],
providers: [
@@ -184,8 +186,10 @@ describe('TasksComponent', () => {
`Failed${currentTasksLength}`
)
expect(
fixture.debugElement.queryAll(By.css('table input[type="checkbox"]'))
).toHaveLength(currentTasksLength + 1)
fixture.debugElement.queryAll(
By.css('table td > .form-check input[type="checkbox"]')
)
).toHaveLength(currentTasksLength)
currentTasksLength = tasks.filter(
(t) => t.status === PaperlessTaskStatus.Complete
@@ -389,4 +393,20 @@ describe('TasksComponent', () => {
expect(component.filterText).toEqual('')
expect(component.filterTargetID).toEqual(0)
})
it('should retry a task, show toast on error or success', () => {
const retrySpy = jest.spyOn(tasksService, 'retryTask')
const toastInfoSpy = jest.spyOn(toastService, 'showInfo')
const toastErrorSpy = jest.spyOn(toastService, 'showError')
retrySpy.mockReturnValueOnce(of({ task_id: '123' }))
component.retryTask(tasks[0])
expect(retrySpy).toHaveBeenCalledWith(tasks[0], false)
expect(toastInfoSpy).toHaveBeenCalledWith('Retrying task...')
retrySpy.mockReturnValueOnce(throwError(() => new Error('test')))
component.retryTask(tasks[0])
expect(toastErrorSpy).toHaveBeenCalledWith(
'Failed to retry task',
new Error('test')
)
})
})

View File

@@ -20,12 +20,13 @@ import {
takeUntil,
timer,
} from 'rxjs'
import { PaperlessTask } from 'src/app/data/paperless-task'
import { PaperlessTask, PaperlessTaskStatus } from 'src/app/data/paperless-task'
import { IfPermissionsDirective } from 'src/app/directives/if-permissions.directive'
import { CustomDatePipe } from 'src/app/pipes/custom-date.pipe'
import { TasksService } from 'src/app/services/tasks.service'
import { ToastService } from 'src/app/services/toast.service'
import { ConfirmDialogComponent } from '../../common/confirm-dialog/confirm-dialog.component'
import { CheckComponent } from '../../common/input/check/check.component'
import { PageHeaderComponent } from '../../common/page-header/page-header.component'
import { LoadingComponentWithPermissions } from '../../loading-component/loading.component'
@@ -54,6 +55,7 @@ const FILTER_TARGETS = [
PageHeaderComponent,
IfPermissionsDirective,
CustomDatePipe,
CheckComponent,
SlicePipe,
FormsModule,
ReactiveFormsModule,
@@ -75,6 +77,7 @@ export class TasksComponent
private readonly router = inject(Router)
private readonly toastService = inject(ToastService)
public PaperlessTaskStatus = PaperlessTaskStatus
public activeTab: TaskTab
public selectedTasks: Set<number> = new Set()
public togggleAll: boolean = false
@@ -105,6 +108,8 @@ export class TasksComponent
: FILTER_TARGETS.slice(0, 1)
}
public retryClean: boolean = false
get dismissButtonText(): string {
return this.selectedTasks.size > 0
? $localize`Dismiss selected`
@@ -178,6 +183,17 @@ export class TasksComponent
this.router.navigate(['documents', task.related_document])
}
retryTask(task: PaperlessTask) {
this.tasksService.retryTask(task, this.retryClean).subscribe({
next: () => {
this.toastService.showInfo($localize`Retrying task...`)
},
error: (e) => {
this.toastService.showError($localize`Failed to retry task`, e)
},
})
}
expandTask(task: PaperlessTask) {
this.expandedTask = this.expandedTask == task.id ? undefined : task.id
}

View File

@@ -147,4 +147,33 @@ describe('TasksService', () => {
result: 'success',
})
})
it('should call retry task api endpoint', () => {
const task = {
id: 1,
type: PaperlessTaskType.File,
status: PaperlessTaskStatus.Failed,
acknowledged: false,
task_id: '1234',
task_file_name: 'file1.pdf',
date_created: new Date(),
}
tasksService.retryTask(task, true).subscribe()
const reloadSpy = jest.spyOn(tasksService, 'reload')
const req = httpTestingController.expectOne(
`${environment.apiBaseUrl}tasks/${task.id}/retry/`
)
expect(req.request.method).toEqual('POST')
expect(req.request.body).toEqual({
clean: true,
})
req.flush({ task_id: 12345 })
expect(reloadSpy).toHaveBeenCalled()
httpTestingController
.expectOne(
`${environment.apiBaseUrl}tasks/?task_name=consume_file&acknowledged=false`
)
.flush([])
})
})

View File

@@ -81,6 +81,20 @@ export class TasksService {
)
}
public retryTask(task: PaperlessTask, clean: boolean): Observable<any> {
return this.http
.post(`${this.baseUrl}tasks/${task.id}/retry/`, {
clean,
})
.pipe(
takeUntil(this.unsubscribeNotifer),
first(),
tap(() => {
this.reload()
})
)
}
public cancelPending(): void {
this.unsubscribeNotifer.next(true)
}

View File

@@ -52,7 +52,6 @@ from documents.utils import copy_basic_file_stats
from documents.utils import copy_file_with_basic_stats
from documents.utils import run_subprocess
from paperless.parsers.text import TextDocumentParser
from paperless.parsers.tika import TikaDocumentParser
from paperless_mail.parsers import MailDocumentParser
LOGGING_NAME: Final[str] = "paperless.consumer"
@@ -68,7 +67,7 @@ def _parser_cleanup(parser: DocumentParser) -> None:
TODO(stumpylog): Remove me in the future
"""
if isinstance(parser, (TextDocumentParser, TikaDocumentParser)):
if isinstance(parser, TextDocumentParser):
parser.__exit__(None, None, None)
else:
parser.cleanup()
@@ -175,6 +174,17 @@ class ConsumerPluginMixin:
):
self._send_progress(100, 100, ProgressStatusOptions.FAILED, message)
self.log.error(log_message or message, exc_info=exc_info)
# Move the file to the failed directory
if (
self.input_doc.original_file.exists()
and not Path(
settings.CONSUMPTION_FAILED_DIR / self.input_doc.original_file.name,
).exists()
):
copy_file_with_basic_stats(
self.input_doc.original_file,
settings.CONSUMPTION_FAILED_DIR / self.input_doc.original_file.name,
)
raise ConsumerError(f"{self.filename}: {log_message or message}") from exception
@@ -449,12 +459,6 @@ class ConsumerPlugin(
progress_callback=progress_callback,
)
# New-style parsers use __enter__/__exit__ for resource management.
# _parser_cleanup (below) handles __exit__; call __enter__ here.
# TODO(stumpylog): Remove me in the future
if isinstance(document_parser, (TextDocumentParser, TikaDocumentParser)):
document_parser.__enter__()
self.log.debug(f"Parser: {type(document_parser).__name__}")
# Parse the document. This may take some time.
@@ -483,7 +487,7 @@ class ConsumerPlugin(
self.filename,
self.input_doc.mailrule_id,
)
elif isinstance(document_parser, (TextDocumentParser, TikaDocumentParser)):
elif isinstance(document_parser, TextDocumentParser):
# TODO(stumpylog): Remove me in the future
document_parser.parse(self.working_copy, mime_type)
else:
@@ -496,7 +500,7 @@ class ConsumerPlugin(
ProgressStatusOptions.WORKING,
ConsumerStatusShortMessage.GENERATING_THUMBNAIL,
)
if isinstance(document_parser, (TextDocumentParser, TikaDocumentParser)):
if isinstance(document_parser, TextDocumentParser):
# TODO(stumpylog): Remove me in the future
thumbnail = document_parser.get_thumbnail(self.working_copy, mime_type)
else:

View File

@@ -2411,6 +2411,14 @@ class TasksViewSerializer(OwnedObjectSerializer):
return list(duplicates.values("id", "title", "deleted_at"))
class RetryTaskSerializer(serializers.Serializer):
    # Request payload for POST /tasks/<id>/retry/.
    # When true, the backend runs qpdf over the stashed failed file before
    # re-queueing it for consumption. write_only: never echoed in responses.
    clean = serializers.BooleanField(
        default=False,
        write_only=True,
        required=False,
    )
class RunTaskViewSerializer(serializers.Serializer[dict[str, Any]]):
task_name = serializers.ChoiceField(
choices=PaperlessTask.TaskName.choices,

View File

@@ -631,6 +631,19 @@ def update_filename_and_move_files(
)
@receiver(models.signals.post_save, sender=PaperlessTask)
def cleanup_failed_documents(sender, instance: PaperlessTask, **kwargs):
    """
    When a failed consume task is acknowledged (dismissed), delete the copy
    of the original file that was stashed in CONSUMPTION_FAILED_DIR so that
    dismissed failures do not accumulate on disk.

    Runs on every PaperlessTask post_save; anything that is not an
    acknowledged failure is ignored.
    """
    if instance.status != states.FAILURE or not instance.acknowledged:
        return
    if not instance.task_file_name:
        return
    # settings.CONSUMPTION_FAILED_DIR is already a Path, so joining with `/`
    # yields a Path directly — the original's extra Path(...) wrapper was
    # redundant.
    failed_file = settings.CONSUMPTION_FAILED_DIR / instance.task_file_name
    try:
        failed_file.unlink()
        logger.debug(f"Cleaned up failed file {instance.task_file_name}")
    except FileNotFoundError:
        # Best-effort cleanup: the file may already be gone (e.g. retried).
        logger.warning(f"Failed to clean up failed file {instance.task_file_name}")
@shared_task
def process_cf_select_update(custom_field: CustomField) -> None:
"""

View File

@@ -37,6 +37,7 @@ from documents.consumer import ConsumerPreflightPlugin
from documents.consumer import WorkflowTriggerPlugin
from documents.data_models import ConsumableDocument
from documents.data_models import DocumentMetadataOverrides
from documents.data_models import DocumentSource
from documents.double_sided import CollatePlugin
from documents.file_handling import create_source_path_directory
from documents.file_handling import generate_unique_filename
@@ -63,6 +64,8 @@ from documents.signals import document_updated
from documents.signals.handlers import cleanup_document_deletion
from documents.signals.handlers import run_workflows
from documents.signals.handlers import send_websocket_document_updated
from documents.utils import copy_file_with_basic_stats
from documents.utils import run_subprocess
from documents.workflows.utils import get_workflows_for_trigger
from paperless.config import AIConfig
from paperless_ai.indexing import llm_index_add_or_update_document
@@ -72,7 +75,6 @@ from paperless_ai.indexing import update_llm_index
_T = TypeVar("_T")
IterWrapper = Callable[[Iterable[_T]], Iterable[_T]]
if settings.AUDIT_LOG_ENABLED:
from auditlog.models import LogEntry
logger = logging.getLogger("paperless.tasks")
@@ -239,6 +241,48 @@ def consume_file(
return msg
@shared_task
def retry_failed_file(task_id: str, clean: bool = False, skip_ocr: bool = False):
    """
    Re-queue consumption of a previously failed file.

    Looks up the failed PaperlessTask, copies its stashed file from
    CONSUMPTION_FAILED_DIR into SCRATCH_DIR, optionally runs qpdf over the
    working copy to repair/clean it, then dispatches a new consume_file task.

    Args:
        task_id: task_id of the failed PaperlessTask to retry.
        clean: when True, run ``qpdf --replace-input`` on the working copy.
        skip_ocr: accepted for API compatibility but currently unused —
            TODO confirm intended behavior or remove.

    Returns:
        The id of the dispatched consume_file task.

    Raises:
        PaperlessTask.DoesNotExist: if no failed task matches task_id.
        FileNotFoundError: if the stashed file is missing.
        Exception: if qpdf cleaning fails.
    """
    # objects.get() raises DoesNotExist when nothing matches, so no
    # truthiness check on the result is needed (the original `if task:`
    # guard was dead code that hid this contract).
    task = PaperlessTask.objects.get(task_id=task_id, status=states.FAILURE)

    failed_file = settings.CONSUMPTION_FAILED_DIR / task.task_file_name
    if not failed_file.exists():
        logger.error(f"File {failed_file} not found")
        raise FileNotFoundError(f"File {failed_file} not found")

    # Work on a scratch copy so the stashed original survives a failed retry.
    working_copy = settings.SCRATCH_DIR / failed_file.name
    copy_file_with_basic_stats(failed_file, working_copy)

    if clean:
        try:
            result = run_subprocess(
                [
                    "qpdf",
                    "--replace-input",
                    "--warning-exit-0",
                    working_copy,
                ],
                logger=logger,
            )
            if result.returncode != 0:
                raise Exception(
                    f"qpdf failed with exit code {result.returncode}, error: {result.stderr}",
                )
            logger.debug("PDF cleaned successfully")
        except Exception as e:
            logger.error(f"Error while cleaning PDF: {e}")
            # Bare raise preserves the original traceback (raise e would
            # rewrite it from here).
            raise

    new_task = consume_file.delay(
        ConsumableDocument(
            source=DocumentSource.ConsumeFolder,
            original_file=working_copy,
        ),
    )
    return new_task.id
@shared_task
def sanity_check(*, scheduled=True, raise_on_error=True):
messages = sanity_checker.check_sanity(scheduled=scheduled)

Binary file not shown.

View File

@@ -10,8 +10,8 @@ from documents.parsers import get_parser_class_for_mime_type
from documents.parsers import get_supported_file_extensions
from documents.parsers import is_file_ext_supported
from paperless.parsers.text import TextDocumentParser
from paperless.parsers.tika import TikaDocumentParser
from paperless_tesseract.parsers import RasterisedDocumentParser
from paperless_tika.parsers import TikaDocumentParser
class TestParserDiscovery(TestCase):

View File

@@ -1,4 +1,5 @@
import shutil
import uuid
from datetime import timedelta
from pathlib import Path
from unittest import mock
@@ -21,6 +22,7 @@ from documents.sanity_checker import SanityCheckMessages
from documents.tests.test_classifier import dummy_preprocess
from documents.tests.utils import DirectoriesMixin
from documents.tests.utils import FileSystemAssertsMixin
from documents.tests.utils import SampleDirMixin
class TestIndexReindex(DirectoriesMixin, TestCase):
@@ -232,6 +234,44 @@ class TestEmptyTrashTask(DirectoriesMixin, FileSystemAssertsMixin, TestCase):
self.assertEqual(Document.global_objects.count(), 0)
class TestRetryConsumeTask(
    DirectoriesMixin,
    SampleDirMixin,
    FileSystemAssertsMixin,
    TestCase,
):
    # Exercises retry_failed_file and the post_save cleanup of
    # CONSUMPTION_FAILED_DIR when a failed task is acknowledged.

    def do_failed_task(self, test_file: Path) -> PaperlessTask:
        """
        Stash test_file in CONSUMPTION_FAILED_DIR and create a matching
        FAILURE-state PaperlessTask, mirroring what the consumer does when
        a consume attempt fails. Returns the created task.
        """
        failed_file = settings.CONSUMPTION_FAILED_DIR / test_file.name
        shutil.copy(test_file, failed_file)
        task = PaperlessTask.objects.create(
            type=PaperlessTask.TaskType.AUTO,
            task_id=str(uuid.uuid4()),
            task_file_name=failed_file.name,
            task_name=PaperlessTask.TaskName.CONSUME_FILE,
            status=states.FAILURE,
            date_created=timezone.now(),
            date_done=timezone.now(),
        )
        # Sanity: the stashed copy must exist before the test proper runs.
        self.assertIsFile(settings.CONSUMPTION_FAILED_DIR / task.task_file_name)
        return task

    @mock.patch("documents.tasks.consume_file.delay")
    @mock.patch("documents.tasks.run_subprocess")
    def test_retry_consume_clean(self, m_subprocess, m_consume_file) -> None:
        # clean=True should run qpdf (mocked here, returncode 0) and then
        # dispatch a new consume_file task for the working copy.
        task = self.do_failed_task(self.SAMPLE_DIR / "corrupted.pdf")
        m_subprocess.return_value.returncode = 0
        task_id = tasks.retry_failed_file(task_id=task.task_id, clean=True)
        self.assertIsNotNone(task_id)
        m_consume_file.assert_called_once()

    def test_cleanup(self) -> None:
        # Acknowledging (dismissing) a failed task triggers the post_save
        # handler that deletes its stashed file from the failed dir.
        task = self.do_failed_task(self.SAMPLE_DIR / "corrupted.pdf")
        task.acknowledged = True
        task.save()
        self.assertIsNotFile(settings.CONSUMPTION_FAILED_DIR / task.task_file_name)
class TestUpdateContent(DirectoriesMixin, TestCase):
def test_update_content_maybe_archive_file(self) -> None:
"""

View File

@@ -37,6 +37,7 @@ def setup_directories():
dirs.scratch_dir = Path(tempfile.mkdtemp()).resolve()
dirs.media_dir = Path(tempfile.mkdtemp()).resolve()
dirs.consumption_dir = Path(tempfile.mkdtemp()).resolve()
dirs.consumption_failed_dir = Path(tempfile.mkdtemp("failed")).resolve()
dirs.static_dir = Path(tempfile.mkdtemp()).resolve()
dirs.index_dir = dirs.data_dir / "index"
dirs.originals_dir = dirs.media_dir / "documents" / "originals"
@@ -58,6 +59,7 @@ def setup_directories():
THUMBNAIL_DIR=dirs.thumbnail_dir,
ARCHIVE_DIR=dirs.archive_dir,
CONSUMPTION_DIR=dirs.consumption_dir,
CONSUMPTION_FAILED_DIR=dirs.consumption_failed_dir,
LOGGING_DIR=dirs.logging_dir,
INDEX_DIR=dirs.index_dir,
STATIC_ROOT=dirs.static_dir,
@@ -74,6 +76,7 @@ def remove_dirs(dirs) -> None:
shutil.rmtree(dirs.data_dir, ignore_errors=True)
shutil.rmtree(dirs.scratch_dir, ignore_errors=True)
shutil.rmtree(dirs.consumption_dir, ignore_errors=True)
shutil.rmtree(dirs.consumption_failed_dir, ignore_errors=True)
shutil.rmtree(dirs.static_dir, ignore_errors=True)
dirs.settings_override.disable()

View File

@@ -7,7 +7,6 @@ import tempfile
import zipfile
from collections import defaultdict
from collections import deque
from contextlib import nullcontext
from datetime import datetime
from pathlib import Path
from time import mktime
@@ -190,6 +189,7 @@ from documents.serialisers import NotesSerializer
from documents.serialisers import PostDocumentSerializer
from documents.serialisers import RemovePasswordDocumentsSerializer
from documents.serialisers import ReprocessDocumentsSerializer
from documents.serialisers import RetryTaskSerializer
from documents.serialisers import RotateDocumentsSerializer
from documents.serialisers import RunTaskViewSerializer
from documents.serialisers import SavedViewSerializer
@@ -212,6 +212,7 @@ from documents.tasks import consume_file
from documents.tasks import empty_trash
from documents.tasks import index_optimize
from documents.tasks import llmindex_index
from documents.tasks import retry_failed_file
from documents.tasks import sanity_check
from documents.tasks import train_classifier
from documents.tasks import update_document_parent_tags
@@ -226,7 +227,6 @@ from paperless.celery import app as celery_app
from paperless.config import AIConfig
from paperless.config import GeneralConfig
from paperless.models import ApplicationConfiguration
from paperless.parsers import ParserProtocol
from paperless.serialisers import GroupSerializer
from paperless.serialisers import UserSerializer
from paperless.views import StandardPagination
@@ -1086,11 +1086,9 @@ class DocumentViewSet(
parser_class = get_parser_class_for_mime_type(mime_type)
if parser_class:
parser = parser_class(progress_callback=None, logging_group=None)
cm = parser if isinstance(parser, ParserProtocol) else nullcontext(parser)
try:
with cm:
return parser.extract_metadata(file, mime_type)
return parser.extract_metadata(file, mime_type)
except Exception: # pragma: no cover
logger.exception(f"Issue getting metadata for {file}")
# TODO: cover GPG errors, remove later.
@@ -3471,6 +3469,25 @@ class TasksViewSet(ReadOnlyModelViewSet):
queryset = PaperlessTask.objects.filter(task_id=task_id)
return queryset
    @action(methods=["post"], detail=True)
    def retry(self, request, pk=None):
        """
        Re-queue consumption of the failed file behind this task.

        Validates an optional ``clean`` flag (RetryTaskSerializer) and calls
        retry_failed_file. Responds with the id of the newly dispatched
        consume task, or 400 when the stashed original file is missing or
        the retry fails for any other reason.
        """
        task = self.get_object()
        serializer = RetryTaskSerializer(data=request.data)
        serializer.is_valid(raise_exception=True)
        clean = serializer.validated_data.get("clean")
        try:
            # NOTE(review): retry_failed_file is invoked synchronously (no
            # .delay) so the copy/clean work runs inside the request —
            # presumably acceptable since the heavy consumption itself is
            # dispatched async within; confirm this is intentional.
            new_task_id = retry_failed_file(task.task_id, clean)
            return Response({"task_id": new_task_id})
        except FileNotFoundError:
            return HttpResponseBadRequest("Original file not found")
        except Exception as e:
            logger.warning(f"An error occurred retrying task: {e!s}")
            return HttpResponseBadRequest(
                "Error retrying task, check logs for more detail.",
            )
@action(
methods=["post"],
detail=False,

View File

@@ -68,6 +68,10 @@ def paths_check(app_configs, **kwargs) -> list[Error]:
+ path_check("PAPERLESS_EMPTY_TRASH_DIR", settings.EMPTY_TRASH_DIR)
+ path_check("PAPERLESS_MEDIA_ROOT", settings.MEDIA_ROOT)
+ path_check("PAPERLESS_CONSUMPTION_DIR", settings.CONSUMPTION_DIR)
+ path_check(
"PAPERLESS_CONSUMPTION_FAILED_DIR",
settings.CONSUMPTION_FAILED_DIR,
)
)

View File

@@ -194,10 +194,8 @@ class ParserRegistry:
at runtime regardless of registration order.
"""
from paperless.parsers.text import TextDocumentParser
from paperless.parsers.tika import TikaDocumentParser
self.register_builtin(TextDocumentParser)
self.register_builtin(TikaDocumentParser)
# ------------------------------------------------------------------
# Discovery

View File

@@ -1,440 +0,0 @@
"""
Built-in Tika document parser.
Handles Office documents (DOCX, ODT, XLS, XLSX, PPT, PPTX, RTF, etc.) by
sending them to an Apache Tika server for text extraction and a Gotenberg
server for PDF conversion. Because the source formats cannot be rendered by
a browser natively, the parser always produces a PDF rendition for display.
"""
from __future__ import annotations
import logging
import shutil
import tempfile
from contextlib import ExitStack
from pathlib import Path
from typing import TYPE_CHECKING
from typing import Self
import httpx
from django.conf import settings
from django.utils import timezone
from gotenberg_client import GotenbergClient
from gotenberg_client.options import PdfAFormat
from tika_client import TikaClient
from documents.parsers import ParseError
from documents.parsers import make_thumbnail_from_pdf
from paperless.config import OutputTypeConfig
from paperless.models import OutputTypeChoices
from paperless.version import __full_version_str__
if TYPE_CHECKING:
import datetime
from types import TracebackType
from paperless.parsers import MetadataEntry
logger = logging.getLogger("paperless.parsing.tika")
_SUPPORTED_MIME_TYPES: dict[str, str] = {
"application/msword": ".doc",
"application/vnd.openxmlformats-officedocument.wordprocessingml.document": ".docx",
"application/vnd.ms-excel": ".xls",
"application/vnd.openxmlformats-officedocument.spreadsheetml.sheet": ".xlsx",
"application/vnd.ms-powerpoint": ".ppt",
"application/vnd.openxmlformats-officedocument.presentationml.presentation": ".pptx",
"application/vnd.openxmlformats-officedocument.presentationml.slideshow": ".ppsx",
"application/vnd.oasis.opendocument.presentation": ".odp",
"application/vnd.oasis.opendocument.spreadsheet": ".ods",
"application/vnd.oasis.opendocument.text": ".odt",
"application/vnd.oasis.opendocument.graphics": ".odg",
"text/rtf": ".rtf",
}
class TikaDocumentParser:
"""Parse Office documents via Apache Tika and Gotenberg for Paperless-ngx.
Text extraction is handled by the Tika server. PDF conversion for display
is handled by Gotenberg (LibreOffice route). Because the source formats
cannot be rendered by a browser natively, ``requires_pdf_rendition`` is
True and the PDF is always produced regardless of the ``produce_archive``
flag passed to ``parse``.
Both ``TikaClient`` and ``GotenbergClient`` are opened once in
``__enter__`` via an ``ExitStack`` and shared across ``parse``,
``extract_metadata``, and ``_convert_to_pdf`` calls, then closed via
``ExitStack.close()`` in ``__exit__``. The parser must always be used
as a context manager.
Class attributes
----------------
name : str
Human-readable parser name.
version : str
Semantic version string, kept in sync with Paperless-ngx releases.
author : str
Maintainer name.
url : str
Issue tracker / source URL.
"""
name: str = "Paperless-ngx Tika Parser"
version: str = __full_version_str__
author: str = "Paperless-ngx Contributors"
url: str = "https://github.com/paperless-ngx/paperless-ngx"
# ------------------------------------------------------------------
# Class methods
# ------------------------------------------------------------------
@classmethod
def supported_mime_types(cls) -> dict[str, str]:
"""Return the MIME types this parser handles.
Returns
-------
dict[str, str]
Mapping of MIME type to preferred file extension.
"""
return _SUPPORTED_MIME_TYPES
@classmethod
def score(
cls,
mime_type: str,
filename: str,
path: Path | None = None,
) -> int | None:
"""Return the priority score for handling this file.
Returns ``None`` when Tika integration is disabled so the registry
skips this parser entirely.
Parameters
----------
mime_type:
Detected MIME type of the file.
filename:
Original filename including extension.
path:
Optional filesystem path. Not inspected by this parser.
Returns
-------
int | None
10 if TIKA_ENABLED and the MIME type is supported, otherwise None.
"""
if not settings.TIKA_ENABLED:
return None
if mime_type in _SUPPORTED_MIME_TYPES:
return 10
return None
# ------------------------------------------------------------------
# Properties
# ------------------------------------------------------------------
@property
def can_produce_archive(self) -> bool:
"""Whether this parser can produce a searchable PDF archive copy.
Returns
-------
bool
Always False — Tika produces a display PDF, not an OCR archive.
"""
return False
@property
def requires_pdf_rendition(self) -> bool:
"""Whether the parser must produce a PDF for the frontend to display.
Returns
-------
bool
Always True — Office formats cannot be rendered natively in a
browser, so a PDF conversion is always required for display.
"""
return True
# ------------------------------------------------------------------
# Lifecycle
# ------------------------------------------------------------------
def __init__(self, logging_group: object = None) -> None:
settings.SCRATCH_DIR.mkdir(parents=True, exist_ok=True)
self._tempdir = Path(
tempfile.mkdtemp(prefix="paperless-", dir=settings.SCRATCH_DIR),
)
self._text: str | None = None
self._date: datetime.datetime | None = None
self._archive_path: Path | None = None
self._exit_stack = ExitStack()
self._tika_client: TikaClient | None = None
self._gotenberg_client: GotenbergClient | None = None
def __enter__(self) -> Self:
self._tika_client = self._exit_stack.enter_context(
TikaClient(
tika_url=settings.TIKA_ENDPOINT,
timeout=settings.CELERY_TASK_TIME_LIMIT,
),
)
self._gotenberg_client = self._exit_stack.enter_context(
GotenbergClient(
host=settings.TIKA_GOTENBERG_ENDPOINT,
timeout=settings.CELERY_TASK_TIME_LIMIT,
),
)
return self
def __exit__(
self,
exc_type: type[BaseException] | None,
exc_val: BaseException | None,
exc_tb: TracebackType | None,
) -> None:
self._exit_stack.close()
logger.debug("Cleaning up temporary directory %s", self._tempdir)
shutil.rmtree(self._tempdir, ignore_errors=True)
# ------------------------------------------------------------------
# Core parsing interface
# ------------------------------------------------------------------
def parse(
self,
document_path: Path,
mime_type: str,
*,
produce_archive: bool = True,
) -> None:
"""Send the document to Tika for text extraction and Gotenberg for PDF.
Because ``requires_pdf_rendition`` is True the PDF conversion is
always performed — the ``produce_archive`` flag is intentionally
ignored.
Parameters
----------
document_path:
Absolute path to the document file to parse.
mime_type:
Detected MIME type of the document.
produce_archive:
Accepted for protocol compatibility but ignored; the PDF rendition
is always produced since the source format cannot be displayed
natively in the browser.
Raises
------
documents.parsers.ParseError
If Tika or Gotenberg returns an error.
"""
if TYPE_CHECKING:
assert self._tika_client is not None
logger.info("Sending %s to Tika server", document_path)
try:
try:
parsed = self._tika_client.tika.as_text.from_file(
document_path,
mime_type,
)
except httpx.HTTPStatusError as err:
# Workaround https://issues.apache.org/jira/browse/TIKA-4110
# Tika fails with some files as multi-part form data
if err.response.status_code == httpx.codes.INTERNAL_SERVER_ERROR:
parsed = self._tika_client.tika.as_text.from_buffer(
document_path.read_bytes(),
mime_type,
)
else: # pragma: no cover
raise
except Exception as err:
raise ParseError(
f"Could not parse {document_path} with tika server at "
f"{settings.TIKA_ENDPOINT}: {err}",
) from err
self._text = parsed.content
if self._text is not None:
self._text = self._text.strip()
self._date = parsed.created
if self._date is not None and timezone.is_naive(self._date):
self._date = timezone.make_aware(self._date)
# Always convert — requires_pdf_rendition=True means the browser
# cannot display the source format natively.
self._archive_path = self._convert_to_pdf(document_path)
# ------------------------------------------------------------------
# Result accessors
# ------------------------------------------------------------------
def get_text(self) -> str | None:
"""Return the plain-text content extracted during parse.
Returns
-------
str | None
Extracted text, or None if parse has not been called yet.
"""
return self._text
def get_date(self) -> datetime.datetime | None:
"""Return the document date detected during parse.
Returns
-------
datetime.datetime | None
Creation date from Tika metadata, or None if not detected.
"""
return self._date
def get_archive_path(self) -> Path | None:
"""Return the path to the generated PDF rendition, or None.
Returns
-------
Path | None
Path to the PDF produced by Gotenberg, or None if parse has not
been called yet.
"""
return self._archive_path
# ------------------------------------------------------------------
# Thumbnail and metadata
# ------------------------------------------------------------------
def get_thumbnail(self, document_path: Path, mime_type: str) -> Path:
"""Generate a thumbnail from the PDF rendition of the document.
Converts the document to PDF first if not already done.
Parameters
----------
document_path:
Absolute path to the source document.
mime_type:
Detected MIME type of the document.
Returns
-------
Path
Path to the generated WebP thumbnail inside the temporary directory.
"""
if self._archive_path is None:
self._archive_path = self._convert_to_pdf(document_path)
return make_thumbnail_from_pdf(self._archive_path, self._tempdir)
def get_page_count(
self,
document_path: Path,
mime_type: str,
) -> int | None:
"""Return the number of pages in the document.
Returns
-------
int | None
Always None — page count is not available from Tika.
"""
return None
def extract_metadata(
self,
document_path: Path,
mime_type: str,
) -> list[MetadataEntry]:
"""Extract format-specific metadata via the Tika metadata endpoint.
Returns
-------
list[MetadataEntry]
All key/value pairs returned by Tika, or ``[]`` on error.
"""
if TYPE_CHECKING:
assert self._tika_client is not None
try:
parsed = self._tika_client.metadata.from_file(document_path, mime_type)
return [
{
"namespace": "",
"prefix": "",
"key": key,
"value": parsed.data[key],
}
for key in parsed.data
]
except Exception as e:
logger.warning(
"Error while fetching document metadata for %s: %s",
document_path,
e,
)
return []
# ------------------------------------------------------------------
# Private helpers
# ------------------------------------------------------------------
def _convert_to_pdf(self, document_path: Path) -> Path:
    """Render the document to PDF using Gotenberg's LibreOffice route.

    Parameters
    ----------
    document_path:
        Absolute path to the source document.

    Returns
    -------
    Path
        Path to the generated PDF inside the temporary directory.

    Raises
    ------
    documents.parsers.ParseError
        If Gotenberg returns an error.
    """
    if TYPE_CHECKING:
        assert self._gotenberg_client is not None
    target = self._tempdir / "convert.pdf"
    logger.info("Converting %s to PDF as %s", document_path, target)
    with self._gotenberg_client.libre_office.to_pdf() as route:
        # Pick the PDF/A flavour of the output. OutputTypeConfig reads the
        # database-stored ApplicationConfiguration first and falls back to
        # the PAPERLESS_OCR_OUTPUT_TYPE env var.
        configured = OutputTypeConfig().output_type
        if configured == OutputTypeChoices.PDF_A1:
            # Gotenberg cannot produce PDF/A-1a; downgrade to PDF/A-2b.
            logger.warning(
                "Gotenberg does not support PDF/A-1a, choosing PDF/A-2b instead",
            )
            route.pdf_format(PdfAFormat.A2b)
        elif configured in {OutputTypeChoices.PDF_A, OutputTypeChoices.PDF_A2}:
            route.pdf_format(PdfAFormat.A2b)
        elif configured == OutputTypeChoices.PDF_A3:
            route.pdf_format(PdfAFormat.A3b)
        route.convert(document_path)
        try:
            response = route.run()
            target.write_bytes(response.content)
            return target
        except Exception as err:
            raise ParseError(
                f"Error while converting document to PDF: {err}",
            ) from err

View File

@@ -98,6 +98,11 @@ CONSUMPTION_DIR = get_path_from_env(
BASE_DIR.parent / "consume",
)
CONSUMPTION_FAILED_DIR = get_path_from_env(
"PAPERLESS_CONSUMPTION_FAILED_DIR",
CONSUMPTION_DIR / "failed",
)
# This will be created if it doesn't exist
SCRATCH_DIR = get_path_from_env(
"PAPERLESS_SCRATCH_DIR",
@@ -782,6 +787,8 @@ CONSUMER_IGNORE_PATTERNS = list(
),
),
)
if CONSUMPTION_DIR in CONSUMPTION_FAILED_DIR.parents:
CONSUMER_IGNORE_PATTERNS.append(CONSUMPTION_FAILED_DIR.name)
# Directories to always ignore. These are matched by directory name, not full path
CONSUMER_IGNORE_DIRS = list(

View File

@@ -11,7 +11,6 @@ from typing import TYPE_CHECKING
import pytest
from paperless.parsers.text import TextDocumentParser
from paperless.parsers.tika import TikaDocumentParser
if TYPE_CHECKING:
from collections.abc import Generator
@@ -75,86 +74,3 @@ def text_parser() -> Generator[TextDocumentParser, None, None]:
"""
with TextDocumentParser() as parser:
yield parser
# ------------------------------------------------------------------
# Tika parser sample files
# ------------------------------------------------------------------
@pytest.fixture(scope="session")
def tika_samples_dir(samples_dir: Path) -> Path:
"""Absolute path to the Tika parser sample files directory.
Returns
-------
Path
``<samples_dir>/tika/``
"""
return samples_dir / "tika"
@pytest.fixture(scope="session")
def sample_odt_file(tika_samples_dir: Path) -> Path:
"""Path to a sample ODT file.
Returns
-------
Path
Absolute path to ``tika/sample.odt``.
"""
return tika_samples_dir / "sample.odt"
@pytest.fixture(scope="session")
def sample_docx_file(tika_samples_dir: Path) -> Path:
"""Path to a sample DOCX file.
Returns
-------
Path
Absolute path to ``tika/sample.docx``.
"""
return tika_samples_dir / "sample.docx"
@pytest.fixture(scope="session")
def sample_doc_file(tika_samples_dir: Path) -> Path:
"""Path to a sample DOC file.
Returns
-------
Path
Absolute path to ``tika/sample.doc``.
"""
return tika_samples_dir / "sample.doc"
@pytest.fixture(scope="session")
def sample_broken_odt(tika_samples_dir: Path) -> Path:
"""Path to a broken ODT file that triggers the multi-part fallback.
Returns
-------
Path
Absolute path to ``tika/multi-part-broken.odt``.
"""
return tika_samples_dir / "multi-part-broken.odt"
# ------------------------------------------------------------------
# Tika parser instance
# ------------------------------------------------------------------
@pytest.fixture()
def tika_parser() -> Generator[TikaDocumentParser, None, None]:
"""Yield a TikaDocumentParser and clean up its temporary directory afterwards.
Yields
------
TikaDocumentParser
A ready-to-use parser instance.
"""
with TikaDocumentParser() as parser:
yield parser

View File

@@ -24,6 +24,7 @@ class PaperlessTestDirs:
data_dir: Path
media_dir: Path
consumption_dir: Path
consumption_failed_dir: Path
# TODO: consolidate with documents/tests/conftest.py PaperlessDirs/paperless_dirs
@@ -33,18 +34,21 @@ def directories(tmp_path: Path, settings: SettingsWrapper) -> PaperlessTestDirs:
data_dir = tmp_path / "data"
media_dir = tmp_path / "media"
consumption_dir = tmp_path / "consumption"
consumption_failed_dir = tmp_path / "consumption_failed"
for d in (data_dir, media_dir, consumption_dir):
for d in (data_dir, media_dir, consumption_dir, consumption_failed_dir):
d.mkdir()
settings.DATA_DIR = data_dir
settings.MEDIA_ROOT = media_dir
settings.CONSUMPTION_DIR = consumption_dir
settings.CONSUMPTION_FAILED_DIR = consumption_failed_dir
return PaperlessTestDirs(
data_dir=data_dir,
media_dir=media_dir,
consumption_dir=consumption_dir,
consumption_failed_dir=consumption_failed_dir,
)
@@ -64,10 +68,11 @@ class TestChecks:
settings.MEDIA_ROOT = Path("uuh")
settings.DATA_DIR = Path("whatever")
settings.CONSUMPTION_DIR = Path("idontcare")
settings.CONSUMPTION_FAILED_DIR = Path("nope")
msgs = paths_check(None)
assert len(msgs) == 3, str(msgs)
assert len(msgs) == 4, str(msgs)
for msg in msgs:
assert msg.msg.endswith("is set but doesn't exist.")
@@ -75,6 +80,7 @@ class TestChecks:
directories.data_dir.chmod(0o000)
directories.media_dir.chmod(0o000)
directories.consumption_dir.chmod(0o000)
directories.consumption_failed_dir.chmod(0o000)
try:
msgs = paths_check(None)
@@ -82,8 +88,9 @@ class TestChecks:
directories.data_dir.chmod(0o777)
directories.media_dir.chmod(0o777)
directories.consumption_dir.chmod(0o777)
directories.consumption_failed_dir.chmod(0o777)
assert len(msgs) == 3
assert len(msgs) == 4
for msg in msgs:
assert msg.msg.endswith("is not writeable")

View File

@@ -1,12 +1,10 @@
def get_parser(*args, **kwargs):
from paperless.parsers.text import TextDocumentParser
# TextDocumentParser accepts logging_group for constructor compatibility but
# does not store or use it (no legacy DocumentParser base class).
# progress_callback is also not used. Both may arrive as a positional arg
# (consumer) or a keyword arg (views); *args absorbs the positional form,
# kwargs.pop handles the keyword form. Phase 4 will replace this signal
# path with the new ParserRegistry so the shim can be removed at that point.
# The new TextDocumentParser does not accept the legacy logging_group /
# progress_callback kwargs injected by the old signal-based consumer.
# These are dropped here; Phase 4 will replace this signal path with the
# new ParserRegistry so the shim can be removed at that point.
kwargs.pop("logging_group", None)
kwargs.pop("progress_callback", None)
return TextDocumentParser()

View File

@@ -0,0 +1,136 @@
from pathlib import Path
import httpx
from django.conf import settings
from django.utils import timezone
from gotenberg_client import GotenbergClient
from gotenberg_client.options import PdfAFormat
from tika_client import TikaClient
from documents.parsers import DocumentParser
from documents.parsers import ParseError
from documents.parsers import make_thumbnail_from_pdf
from paperless.config import OutputTypeConfig
from paperless.models import OutputTypeChoices
class TikaDocumentParser(DocumentParser):
    """
    This parser sends documents to a local tika server
    """

    # Logger channel name; presumably consumed by the DocumentParser base
    # class to build self.log — confirm against documents.parsers.
    logging_name = "paperless.parsing.tika"

    def get_thumbnail(self, document_path, mime_type, file_name=None):
        """Render a WebP thumbnail from the PDF rendition of the document.

        Lazily converts the document to PDF via Gotenberg if parse() has
        not run yet, then delegates to make_thumbnail_from_pdf.
        """
        if not self.archive_path:
            self.archive_path = self.convert_to_pdf(document_path, file_name)

        return make_thumbnail_from_pdf(
            self.archive_path,
            self.tempdir,
            self.logging_group,
        )

    def extract_metadata(self, document_path, mime_type):
        """Fetch format-specific metadata from the Tika metadata endpoint.

        Best-effort: any failure is logged as a warning and an empty list
        is returned instead of raising.
        """
        try:
            with TikaClient(
                tika_url=settings.TIKA_ENDPOINT,
                timeout=settings.CELERY_TASK_TIME_LIMIT,
            ) as client:
                parsed = client.metadata.from_file(document_path, mime_type)
                return [
                    {
                        "namespace": "",
                        "prefix": "",
                        "key": key,
                        "value": parsed.data[key],
                    }
                    for key in parsed.data
                ]
        except Exception as e:
            self.log.warning(
                f"Error while fetching document metadata for {document_path}: {e}",
            )
            return []

    def parse(self, document_path: Path, mime_type: str, file_name=None) -> None:
        """Extract text and creation date via Tika, then produce a PDF rendition.

        Populates self.text (stripped), self.date (made timezone-aware if
        naive) and self.archive_path as side effects.

        Raises ParseError when the Tika server cannot parse the document.
        """
        self.log.info(f"Sending {document_path} to Tika server")

        try:
            with TikaClient(
                tika_url=settings.TIKA_ENDPOINT,
                timeout=settings.CELERY_TASK_TIME_LIMIT,
            ) as client:
                try:
                    parsed = client.tika.as_text.from_file(document_path, mime_type)
                except httpx.HTTPStatusError as err:
                    # Workaround https://issues.apache.org/jira/browse/TIKA-4110
                    # Tika fails with some files as multi-part form data
                    if err.response.status_code == httpx.codes.INTERNAL_SERVER_ERROR:
                        parsed = client.tika.as_text.from_buffer(
                            document_path.read_bytes(),
                            mime_type,
                        )
                    else:  # pragma: no cover
                        raise
        except Exception as err:
            raise ParseError(
                f"Could not parse {document_path} with tika server at "
                f"{settings.TIKA_ENDPOINT}: {err}",
            ) from err

        self.text = parsed.content
        if self.text is not None:
            self.text = self.text.strip()

        # Naive datetimes from Tika are promoted to the current timezone.
        self.date = parsed.created
        if self.date is not None and timezone.is_naive(self.date):
            self.date = timezone.make_aware(self.date)

        self.archive_path = self.convert_to_pdf(document_path, file_name)

    def convert_to_pdf(self, document_path: Path, file_name):
        """Convert the document to PDF using Gotenberg's LibreOffice route.

        Returns the path to the generated PDF inside the temporary
        directory; raises ParseError when Gotenberg returns an error.
        The file_name argument is accepted but not used here.
        """
        pdf_path = Path(self.tempdir) / "convert.pdf"
        self.log.info(f"Converting {document_path} to PDF as {pdf_path}")

        with (
            GotenbergClient(
                host=settings.TIKA_GOTENBERG_ENDPOINT,
                timeout=settings.CELERY_TASK_TIME_LIMIT,
            ) as client,
            client.libre_office.to_pdf() as route,
        ):
            # Set the output format of the resulting PDF
            if settings.OCR_OUTPUT_TYPE in {
                OutputTypeChoices.PDF_A,
                OutputTypeChoices.PDF_A2,
            }:
                route.pdf_format(PdfAFormat.A2b)
            elif settings.OCR_OUTPUT_TYPE == OutputTypeChoices.PDF_A1:
                # Gotenberg has no PDF/A-1a support, so A-2b is substituted.
                self.log.warning(
                    "Gotenberg does not support PDF/A-1a, choosing PDF/A-2b instead",
                )
                route.pdf_format(PdfAFormat.A2b)
            elif settings.OCR_OUTPUT_TYPE == OutputTypeChoices.PDF_A3:
                route.pdf_format(PdfAFormat.A3b)

            route.convert(document_path)

            try:
                response = route.run()

                pdf_path.write_bytes(response.content)

                return pdf_path
            except Exception as err:
                raise ParseError(
                    f"Error while converting document to PDF: {err}",
                ) from err

    def get_settings(self) -> OutputTypeConfig:
        """
        This parser only uses the PDF output type configuration currently
        """
        return OutputTypeConfig()

View File

@@ -1,15 +1,7 @@
def get_parser(*args, **kwargs):
from paperless.parsers.tika import TikaDocumentParser
from paperless_tika.parsers import TikaDocumentParser
# TikaDocumentParser accepts logging_group for constructor compatibility but
# does not store or use it (no legacy DocumentParser base class).
# progress_callback is also not used. Both may arrive as a positional arg
# (consumer) or a keyword arg (views); *args absorbs the positional form,
# kwargs.pop handles the keyword form. Phase 4 will replace this signal
# path with the new ParserRegistry so the shim can be removed at that point.
kwargs.pop("logging_group", None)
kwargs.pop("progress_callback", None)
return TikaDocumentParser()
return TikaDocumentParser(*args, **kwargs)
def tika_consumer_declaration(sender, **kwargs):

View File

View File

@@ -0,0 +1,41 @@
from collections.abc import Generator
from pathlib import Path
import pytest
from paperless_tika.parsers import TikaDocumentParser
@pytest.fixture()
def tika_parser() -> Generator[TikaDocumentParser, None, None]:
    """Yield a TikaDocumentParser, always cleaning up its temp directory.

    The parser is constructed *before* the try block: in the original
    form, a failing constructor left ``parser`` unbound and the
    ``finally`` clause raised NameError, masking the real exception.
    """
    # TODO(stumpylog): Cleanup once all parsers are handled
    parser = TikaDocumentParser(logging_group=None)
    try:
        yield parser
    finally:
        parser.cleanup()
@pytest.fixture(scope="session")
def sample_dir() -> Path:
    """Absolute path to this test package's ``samples`` directory."""
    return (Path(__file__).parent / "samples").resolve()
@pytest.fixture(scope="session")
def sample_odt_file(sample_dir: Path) -> Path:
    """Path to the ``sample.odt`` test document."""
    return sample_dir.joinpath("sample.odt")
@pytest.fixture(scope="session")
def sample_docx_file(sample_dir: Path) -> Path:
    """Path to the ``sample.docx`` test document."""
    return sample_dir.joinpath("sample.docx")
@pytest.fixture(scope="session")
def sample_doc_file(sample_dir: Path) -> Path:
    """Path to the ``sample.doc`` test document (legacy Word format)."""
    return sample_dir.joinpath("sample.doc")
@pytest.fixture(scope="session")
def sample_broken_odt(sample_dir: Path) -> Path:
    """Path to ``multi-part-broken.odt``, which triggers the multi-part fallback."""
    return sample_dir.joinpath("multi-part-broken.odt")

View File

@@ -4,7 +4,7 @@ from pathlib import Path
import pytest
from documents.tests.utils import util_call_with_backoff
from paperless.parsers.tika import TikaDocumentParser
from paperless_tika.parsers import TikaDocumentParser
@pytest.mark.skipif(
@@ -42,15 +42,14 @@ class TestTikaParserAgainstServer:
)
assert (
tika_parser.get_text()
tika_parser.text
== "This is an ODT test document, created September 14, 2022"
)
archive = tika_parser.get_archive_path()
assert archive is not None
assert b"PDF-" in archive.read_bytes()[:10]
assert tika_parser.archive_path is not None
assert b"PDF-" in tika_parser.archive_path.read_bytes()[:10]
# TODO: Unsure what can set the Creation-Date field in a document, enable when possible
# self.assertEqual(tika_parser.get_date(), datetime.datetime(2022, 9, 14))
# self.assertEqual(tika_parser.date, datetime.datetime(2022, 9, 14))
def test_basic_parse_docx(
self,
@@ -75,15 +74,14 @@ class TestTikaParserAgainstServer:
)
assert (
tika_parser.get_text()
tika_parser.text
== "This is an DOCX test document, also made September 14, 2022"
)
archive = tika_parser.get_archive_path()
assert archive is not None
with archive.open("rb") as f:
assert tika_parser.archive_path is not None
with Path(tika_parser.archive_path).open("rb") as f:
assert b"PDF-" in f.read()[:10]
# self.assertEqual(tika_parser.get_date(), datetime.datetime(2022, 9, 14))
# self.assertEqual(tika_parser.date, datetime.datetime(2022, 9, 14))
def test_basic_parse_doc(
self,
@@ -104,12 +102,13 @@ class TestTikaParserAgainstServer:
[sample_doc_file, "application/msword"],
)
text = tika_parser.get_text()
assert text is not None
assert "This is a test document, saved in the older .doc format" in text
archive = tika_parser.get_archive_path()
assert archive is not None
with archive.open("rb") as f:
assert tika_parser.text is not None
assert (
"This is a test document, saved in the older .doc format"
in tika_parser.text
)
assert tika_parser.archive_path is not None
with Path(tika_parser.archive_path).open("rb") as f:
assert b"PDF-" in f.read()[:10]
def test_tika_fails_multi_part(
@@ -134,7 +133,6 @@ class TestTikaParserAgainstServer:
[sample_broken_odt, "application/vnd.oasis.opendocument.text"],
)
archive = tika_parser.get_archive_path()
assert archive is not None
with archive.open("rb") as f:
assert tika_parser.archive_path is not None
with Path(tika_parser.archive_path).open("rb") as f:
assert b"PDF-" in f.read()[:10]

View File

@@ -9,56 +9,7 @@ from pytest_django.fixtures import SettingsWrapper
from pytest_httpx import HTTPXMock
from documents.parsers import ParseError
from paperless.parsers import ParserProtocol
from paperless.parsers.tika import TikaDocumentParser
class TestTikaParserRegistryInterface:
"""Verify that TikaDocumentParser satisfies the ParserProtocol contract."""
def test_satisfies_parser_protocol(self) -> None:
assert isinstance(TikaDocumentParser(), ParserProtocol)
def test_supported_mime_types_is_classmethod(self) -> None:
mime_types = TikaDocumentParser.supported_mime_types()
assert isinstance(mime_types, dict)
assert len(mime_types) > 0
def test_score_returns_none_when_tika_disabled(
self,
settings: SettingsWrapper,
) -> None:
settings.TIKA_ENABLED = False
result = TikaDocumentParser.score(
"application/vnd.oasis.opendocument.text",
"sample.odt",
)
assert result is None
def test_score_returns_int_when_tika_enabled(
self,
settings: SettingsWrapper,
) -> None:
settings.TIKA_ENABLED = True
result = TikaDocumentParser.score(
"application/vnd.oasis.opendocument.text",
"sample.odt",
)
assert isinstance(result, int)
def test_score_returns_none_for_unsupported_mime(
self,
settings: SettingsWrapper,
) -> None:
settings.TIKA_ENABLED = True
result = TikaDocumentParser.score("application/pdf", "doc.pdf")
assert result is None
def test_can_produce_archive_is_false(self) -> None:
assert TikaDocumentParser().can_produce_archive is False
def test_requires_pdf_rendition_is_true(self) -> None:
assert TikaDocumentParser().requires_pdf_rendition is True
from paperless_tika.parsers import TikaDocumentParser
@pytest.mark.django_db()
@@ -85,12 +36,12 @@ class TestTikaParser:
tika_parser.parse(sample_odt_file, "application/vnd.oasis.opendocument.text")
assert tika_parser.get_text() == "the content"
assert tika_parser.get_archive_path() is not None
with Path(tika_parser.get_archive_path()).open("rb") as f:
assert tika_parser.text == "the content"
assert tika_parser.archive_path is not None
with Path(tika_parser.archive_path).open("rb") as f:
assert f.read() == b"PDF document"
assert tika_parser.get_date() == datetime.datetime(
assert tika_parser.date == datetime.datetime(
2020,
11,
21,
@@ -138,7 +89,7 @@ class TestTikaParser:
httpx_mock.add_response(status_code=HTTPStatus.INTERNAL_SERVER_ERROR)
with pytest.raises(ParseError):
tika_parser._convert_to_pdf(sample_odt_file)
tika_parser.convert_to_pdf(sample_odt_file, None)
@pytest.mark.parametrize(
("setting_value", "expected_form_value"),
@@ -155,6 +106,7 @@ class TestTikaParser:
expected_form_value: str,
httpx_mock: HTTPXMock,
settings: SettingsWrapper,
tika_parser: TikaDocumentParser,
sample_odt_file: Path,
) -> None:
"""
@@ -165,8 +117,6 @@ class TestTikaParser:
THEN:
- Request to Gotenberg contains the expected PDF/A format string
"""
# Parser must be created after the setting is changed so that
# OutputTypeConfig reads the correct value at __init__ time.
settings.OCR_OUTPUT_TYPE = setting_value
httpx_mock.add_response(
status_code=codes.OK,
@@ -174,8 +124,7 @@ class TestTikaParser:
method="POST",
)
with TikaDocumentParser() as parser:
parser._convert_to_pdf(sample_odt_file)
tika_parser.convert_to_pdf(sample_odt_file, None)
request = httpx_mock.get_request()