Compare commits

..

10 Commits

Author SHA1 Message Date
shamoon
e5afbccffc Lint 2026-03-17 11:48:53 -07:00
shamoon
b8faae72ab Update tests 2026-03-17 11:46:44 -07:00
shamoon
8cff99bef3 Update __init__.py 2026-03-17 11:43:22 -07:00
shamoon
b2bbc2c0ac Basic option selection 2026-03-17 11:42:17 -07:00
shamoon
03c71c604f Retry action, basic frontend, cleanup handler 2026-03-17 11:39:53 -07:00
shamoon
fe89ff760b Move it out of consumer 2026-03-17 11:35:52 -07:00
shamoon
83eabbdf63 Try this 2026-03-17 11:35:11 -07:00
shamoon
24da26959d Update consumer.py 2026-03-17 11:34:12 -07:00
shamoon
220267099a Fix tests 2026-03-17 11:34:11 -07:00
shamoon
0f1a529b51 Messing around 2026-03-17 11:33:01 -07:00
36 changed files with 492 additions and 651 deletions

View File

@@ -157,9 +157,6 @@ updates:
postgres:
patterns:
- "docker.io/library/postgres*"
greenmail:
patterns:
- "docker.io/greenmail*"
- package-ecosystem: "pre-commit" # See documentation for possible values
directory: "/" # Location of package manifests
schedule:

View File

@@ -50,7 +50,7 @@ repos:
- 'prettier-plugin-organize-imports@4.3.0'
# Python hooks
- repo: https://github.com/astral-sh/ruff-pre-commit
rev: v0.15.6
rev: v0.15.5
hooks:
- id: ruff-check
- id: ruff-format

View File

@@ -18,13 +18,13 @@ services:
- "--log-level=warn"
- "--log-format=text"
tika:
image: docker.io/apache/tika:3.2.3.0
image: docker.io/apache/tika:latest
hostname: tika
container_name: tika
network_mode: host
restart: unless-stopped
greenmail:
image: docker.io/greenmail/standalone:2.1.8
image: greenmail/standalone:2.1.8
hostname: greenmail
container_name: greenmail
environment:

View File

@@ -14,6 +14,7 @@
# Paths and folders
#PAPERLESS_CONSUMPTION_DIR=../consume
#PAPERLESS_CONSUMPTION_FAILED_DIR=../consume/failed
#PAPERLESS_DATA_DIR=../data
#PAPERLESS_EMPTY_TRASH_DIR=
#PAPERLESS_MEDIA_ROOT=../media

View File

@@ -112,6 +112,9 @@
</td>
<td scope="row">
<div class="btn-group" role="group">
@if (task.status === PaperlessTaskStatus.Failed) {
<ng-container *ngTemplateOutlet="retryDropdown; context: { task: task }"></ng-container>
}
<button class="btn btn-sm btn-outline-secondary" (click)="dismissTask(task); $event.stopPropagation();" *pngxIfPermissions="{ action: PermissionAction.Change, type: PermissionType.PaperlessTask }">
<i-bs name="check" class="me-1"></i-bs><ng-container i18n>Dismiss</ng-container>
</button>
@@ -184,3 +187,25 @@
</li>
</ul>
<div [ngbNavOutlet]="nav"></div>
<ng-template #retryDropdown let-task="task">
<div ngbDropdown>
<button class="btn btn-sm btn-outline-primary" (click)="$event.stopImmediatePropagation()" ngbDropdownToggle>
<i-bs name="arrow-repeat"></i-bs>&nbsp;<ng-container i18n>Retry</ng-container>
</button>
<div ngbDropdownMenu class="shadow retry-dropdown">
<div class="p-2">
<ul class="list-group list-group-flush">
<li class="list-group-item small" i18n>
<pngx-input-check [(ngModel)]="retryClean" i18n-title title="Attempt to clean pdf"></pngx-input-check>
</li>
</ul>
<div class="d-flex justify-content-end">
<button class="btn btn-sm btn-outline-primary" (click)="retryTask(task); $event.stopPropagation();">
<ng-container i18n>Proceed</ng-container>
</button>
</div>
</div>
</div>
</div>
</ng-template>

View File

@@ -37,3 +37,7 @@ pre {
.z-10 {
z-index: 10;
}
.retry-dropdown {
width: 300px;
}

View File

@@ -16,7 +16,7 @@ import {
NgbNavItem,
} from '@ng-bootstrap/ng-bootstrap'
import { allIcons, NgxBootstrapIconsModule } from 'ngx-bootstrap-icons'
import { throwError } from 'rxjs'
import { of, throwError } from 'rxjs'
import { routes } from 'src/app/app-routing.module'
import {
PaperlessTask,
@@ -32,6 +32,7 @@ import { TasksService } from 'src/app/services/tasks.service'
import { ToastService } from 'src/app/services/toast.service'
import { environment } from 'src/environments/environment'
import { ConfirmDialogComponent } from '../../common/confirm-dialog/confirm-dialog.component'
import { CheckComponent } from '../../common/input/check/check.component'
import { PageHeaderComponent } from '../../common/page-header/page-header.component'
import { TasksComponent, TaskTab } from './tasks.component'
@@ -138,6 +139,7 @@ describe('TasksComponent', () => {
PageHeaderComponent,
IfPermissionsDirective,
CustomDatePipe,
CheckComponent,
ConfirmDialogComponent,
],
providers: [
@@ -184,8 +186,10 @@ describe('TasksComponent', () => {
`Failed${currentTasksLength}`
)
expect(
fixture.debugElement.queryAll(By.css('table input[type="checkbox"]'))
).toHaveLength(currentTasksLength + 1)
fixture.debugElement.queryAll(
By.css('table td > .form-check input[type="checkbox"]')
)
).toHaveLength(currentTasksLength)
currentTasksLength = tasks.filter(
(t) => t.status === PaperlessTaskStatus.Complete
@@ -389,4 +393,20 @@ describe('TasksComponent', () => {
expect(component.filterText).toEqual('')
expect(component.filterTargetID).toEqual(0)
})
it('should retry a task, show toast on error or success', () => {
const retrySpy = jest.spyOn(tasksService, 'retryTask')
const toastInfoSpy = jest.spyOn(toastService, 'showInfo')
const toastErrorSpy = jest.spyOn(toastService, 'showError')
retrySpy.mockReturnValueOnce(of({ task_id: '123' }))
component.retryTask(tasks[0])
expect(retrySpy).toHaveBeenCalledWith(tasks[0], false)
expect(toastInfoSpy).toHaveBeenCalledWith('Retrying task...')
retrySpy.mockReturnValueOnce(throwError(() => new Error('test')))
component.retryTask(tasks[0])
expect(toastErrorSpy).toHaveBeenCalledWith(
'Failed to retry task',
new Error('test')
)
})
})

View File

@@ -20,12 +20,13 @@ import {
takeUntil,
timer,
} from 'rxjs'
import { PaperlessTask } from 'src/app/data/paperless-task'
import { PaperlessTask, PaperlessTaskStatus } from 'src/app/data/paperless-task'
import { IfPermissionsDirective } from 'src/app/directives/if-permissions.directive'
import { CustomDatePipe } from 'src/app/pipes/custom-date.pipe'
import { TasksService } from 'src/app/services/tasks.service'
import { ToastService } from 'src/app/services/toast.service'
import { ConfirmDialogComponent } from '../../common/confirm-dialog/confirm-dialog.component'
import { CheckComponent } from '../../common/input/check/check.component'
import { PageHeaderComponent } from '../../common/page-header/page-header.component'
import { LoadingComponentWithPermissions } from '../../loading-component/loading.component'
@@ -54,6 +55,7 @@ const FILTER_TARGETS = [
PageHeaderComponent,
IfPermissionsDirective,
CustomDatePipe,
CheckComponent,
SlicePipe,
FormsModule,
ReactiveFormsModule,
@@ -75,6 +77,7 @@ export class TasksComponent
private readonly router = inject(Router)
private readonly toastService = inject(ToastService)
public PaperlessTaskStatus = PaperlessTaskStatus
public activeTab: TaskTab
public selectedTasks: Set<number> = new Set()
public togggleAll: boolean = false
@@ -105,6 +108,8 @@ export class TasksComponent
: FILTER_TARGETS.slice(0, 1)
}
public retryClean: boolean = false
get dismissButtonText(): string {
return this.selectedTasks.size > 0
? $localize`Dismiss selected`
@@ -178,6 +183,17 @@ export class TasksComponent
this.router.navigate(['documents', task.related_document])
}
retryTask(task: PaperlessTask) {
this.tasksService.retryTask(task, this.retryClean).subscribe({
next: () => {
this.toastService.showInfo($localize`Retrying task...`)
},
error: (e) => {
this.toastService.showError($localize`Failed to retry task`, e)
},
})
}
expandTask(task: PaperlessTask) {
this.expandedTask = this.expandedTask == task.id ? undefined : task.id
}

View File

@@ -147,4 +147,33 @@ describe('TasksService', () => {
result: 'success',
})
})
it('should call retry task api endpoint', () => {
const task = {
id: 1,
type: PaperlessTaskType.File,
status: PaperlessTaskStatus.Failed,
acknowledged: false,
task_id: '1234',
task_file_name: 'file1.pdf',
date_created: new Date(),
}
tasksService.retryTask(task, true).subscribe()
const reloadSpy = jest.spyOn(tasksService, 'reload')
const req = httpTestingController.expectOne(
`${environment.apiBaseUrl}tasks/${task.id}/retry/`
)
expect(req.request.method).toEqual('POST')
expect(req.request.body).toEqual({
clean: true,
})
req.flush({ task_id: 12345 })
expect(reloadSpy).toHaveBeenCalled()
httpTestingController
.expectOne(
`${environment.apiBaseUrl}tasks/?task_name=consume_file&acknowledged=false`
)
.flush([])
})
})

View File

@@ -81,6 +81,20 @@ export class TasksService {
)
}
public retryTask(task: PaperlessTask, clean: boolean): Observable<any> {
return this.http
.post(`${this.baseUrl}tasks/${task.id}/retry/`, {
clean,
})
.pipe(
takeUntil(this.unsubscribeNotifer),
first(),
tap(() => {
this.reload()
})
)
}
public cancelPending(): void {
this.unsubscribeNotifer.next(true)
}

View File

@@ -52,7 +52,6 @@ from documents.utils import copy_basic_file_stats
from documents.utils import copy_file_with_basic_stats
from documents.utils import run_subprocess
from paperless.parsers.text import TextDocumentParser
from paperless.parsers.tika import TikaDocumentParser
from paperless_mail.parsers import MailDocumentParser
LOGGING_NAME: Final[str] = "paperless.consumer"
@@ -68,7 +67,7 @@ def _parser_cleanup(parser: DocumentParser) -> None:
TODO(stumpylog): Remove me in the future
"""
if isinstance(parser, (TextDocumentParser, TikaDocumentParser)):
if isinstance(parser, TextDocumentParser):
parser.__exit__(None, None, None)
else:
parser.cleanup()
@@ -175,6 +174,17 @@ class ConsumerPluginMixin:
):
self._send_progress(100, 100, ProgressStatusOptions.FAILED, message)
self.log.error(log_message or message, exc_info=exc_info)
# Move the file to the failed directory
if (
self.input_doc.original_file.exists()
and not Path(
settings.CONSUMPTION_FAILED_DIR / self.input_doc.original_file.name,
).exists()
):
copy_file_with_basic_stats(
self.input_doc.original_file,
settings.CONSUMPTION_FAILED_DIR / self.input_doc.original_file.name,
)
raise ConsumerError(f"{self.filename}: {log_message or message}") from exception
@@ -449,12 +459,6 @@ class ConsumerPlugin(
progress_callback=progress_callback,
)
# New-style parsers use __enter__/__exit__ for resource management.
# _parser_cleanup (below) handles __exit__; call __enter__ here.
# TODO(stumpylog): Remove me in the future
if isinstance(document_parser, (TextDocumentParser, TikaDocumentParser)):
document_parser.__enter__()
self.log.debug(f"Parser: {type(document_parser).__name__}")
# Parse the document. This may take some time.
@@ -483,7 +487,7 @@ class ConsumerPlugin(
self.filename,
self.input_doc.mailrule_id,
)
elif isinstance(document_parser, (TextDocumentParser, TikaDocumentParser)):
elif isinstance(document_parser, TextDocumentParser):
# TODO(stumpylog): Remove me in the future
document_parser.parse(self.working_copy, mime_type)
else:
@@ -496,7 +500,7 @@ class ConsumerPlugin(
ProgressStatusOptions.WORKING,
ConsumerStatusShortMessage.GENERATING_THUMBNAIL,
)
if isinstance(document_parser, (TextDocumentParser, TikaDocumentParser)):
if isinstance(document_parser, TextDocumentParser):
# TODO(stumpylog): Remove me in the future
thumbnail = document_parser.get_thumbnail(self.working_copy, mime_type)
else:

View File

@@ -2411,6 +2411,14 @@ class TasksViewSerializer(OwnedObjectSerializer):
return list(duplicates.values("id", "title", "deleted_at"))
class RetryTaskSerializer(serializers.Serializer):
    # Request payload for POST /tasks/<id>/retry/.
    # When true, the backend runs qpdf over the stashed failed file before
    # re-queueing it for consumption. write_only: never echoed in responses.
    clean = serializers.BooleanField(
        default=False,
        write_only=True,
        required=False,
    )
class RunTaskViewSerializer(serializers.Serializer[dict[str, Any]]):
task_name = serializers.ChoiceField(
choices=PaperlessTask.TaskName.choices,

View File

@@ -631,6 +631,19 @@ def update_filename_and_move_files(
)
@receiver(models.signals.post_save, sender=PaperlessTask)
def cleanup_failed_documents(sender, instance: PaperlessTask, **kwargs):
    """
    When a failed consume task is acknowledged (dismissed), delete the copy
    of the original file that was stashed in CONSUMPTION_FAILED_DIR so that
    dismissed failures do not accumulate on disk.

    Runs on every PaperlessTask post_save; anything that is not an
    acknowledged failure is ignored.
    """
    if instance.status != states.FAILURE or not instance.acknowledged:
        return
    if not instance.task_file_name:
        return
    # settings.CONSUMPTION_FAILED_DIR is already a Path, so joining with `/`
    # yields a Path directly — the original's extra Path(...) wrapper was
    # redundant.
    failed_file = settings.CONSUMPTION_FAILED_DIR / instance.task_file_name
    try:
        failed_file.unlink()
        logger.debug(f"Cleaned up failed file {instance.task_file_name}")
    except FileNotFoundError:
        # Best-effort cleanup: the file may already be gone (e.g. retried).
        logger.warning(f"Failed to clean up failed file {instance.task_file_name}")
@shared_task
def process_cf_select_update(custom_field: CustomField) -> None:
"""

View File

@@ -37,6 +37,7 @@ from documents.consumer import ConsumerPreflightPlugin
from documents.consumer import WorkflowTriggerPlugin
from documents.data_models import ConsumableDocument
from documents.data_models import DocumentMetadataOverrides
from documents.data_models import DocumentSource
from documents.double_sided import CollatePlugin
from documents.file_handling import create_source_path_directory
from documents.file_handling import generate_unique_filename
@@ -63,6 +64,8 @@ from documents.signals import document_updated
from documents.signals.handlers import cleanup_document_deletion
from documents.signals.handlers import run_workflows
from documents.signals.handlers import send_websocket_document_updated
from documents.utils import copy_file_with_basic_stats
from documents.utils import run_subprocess
from documents.workflows.utils import get_workflows_for_trigger
from paperless.config import AIConfig
from paperless_ai.indexing import llm_index_add_or_update_document
@@ -72,7 +75,6 @@ from paperless_ai.indexing import update_llm_index
_T = TypeVar("_T")
IterWrapper = Callable[[Iterable[_T]], Iterable[_T]]
if settings.AUDIT_LOG_ENABLED:
from auditlog.models import LogEntry
logger = logging.getLogger("paperless.tasks")
@@ -239,6 +241,48 @@ def consume_file(
return msg
@shared_task
def retry_failed_file(task_id: str, clean: bool = False, skip_ocr: bool = False):
    """
    Re-queue consumption of a previously failed file.

    Looks up the failed PaperlessTask, copies its stashed file from
    CONSUMPTION_FAILED_DIR into SCRATCH_DIR, optionally runs qpdf over the
    working copy to repair/clean it, then dispatches a new consume_file task.

    Args:
        task_id: task_id of the failed PaperlessTask to retry.
        clean: when True, run ``qpdf --replace-input`` on the working copy.
        skip_ocr: accepted for API compatibility but currently unused —
            TODO confirm intended behavior or remove.

    Returns:
        The id of the dispatched consume_file task.

    Raises:
        PaperlessTask.DoesNotExist: if no failed task matches task_id.
        FileNotFoundError: if the stashed file is missing.
        Exception: if qpdf cleaning fails.
    """
    # objects.get() raises DoesNotExist when nothing matches, so no
    # truthiness check on the result is needed (the original `if task:`
    # guard was dead code that hid this contract).
    task = PaperlessTask.objects.get(task_id=task_id, status=states.FAILURE)

    failed_file = settings.CONSUMPTION_FAILED_DIR / task.task_file_name
    if not failed_file.exists():
        logger.error(f"File {failed_file} not found")
        raise FileNotFoundError(f"File {failed_file} not found")

    # Work on a scratch copy so the stashed original survives a failed retry.
    working_copy = settings.SCRATCH_DIR / failed_file.name
    copy_file_with_basic_stats(failed_file, working_copy)

    if clean:
        try:
            result = run_subprocess(
                [
                    "qpdf",
                    "--replace-input",
                    "--warning-exit-0",
                    working_copy,
                ],
                logger=logger,
            )
            if result.returncode != 0:
                raise Exception(
                    f"qpdf failed with exit code {result.returncode}, error: {result.stderr}",
                )
            logger.debug("PDF cleaned successfully")
        except Exception as e:
            logger.error(f"Error while cleaning PDF: {e}")
            # Bare raise preserves the original traceback (raise e would
            # rewrite it from here).
            raise

    new_task = consume_file.delay(
        ConsumableDocument(
            source=DocumentSource.ConsumeFolder,
            original_file=working_copy,
        ),
    )
    return new_task.id
@shared_task
def sanity_check(*, scheduled=True, raise_on_error=True):
messages = sanity_checker.check_sanity(scheduled=scheduled)

Binary file not shown.

View File

@@ -10,8 +10,8 @@ from documents.parsers import get_parser_class_for_mime_type
from documents.parsers import get_supported_file_extensions
from documents.parsers import is_file_ext_supported
from paperless.parsers.text import TextDocumentParser
from paperless.parsers.tika import TikaDocumentParser
from paperless_tesseract.parsers import RasterisedDocumentParser
from paperless_tika.parsers import TikaDocumentParser
class TestParserDiscovery(TestCase):

View File

@@ -1,4 +1,5 @@
import shutil
import uuid
from datetime import timedelta
from pathlib import Path
from unittest import mock
@@ -21,6 +22,7 @@ from documents.sanity_checker import SanityCheckMessages
from documents.tests.test_classifier import dummy_preprocess
from documents.tests.utils import DirectoriesMixin
from documents.tests.utils import FileSystemAssertsMixin
from documents.tests.utils import SampleDirMixin
class TestIndexReindex(DirectoriesMixin, TestCase):
@@ -232,6 +234,44 @@ class TestEmptyTrashTask(DirectoriesMixin, FileSystemAssertsMixin, TestCase):
self.assertEqual(Document.global_objects.count(), 0)
class TestRetryConsumeTask(
    DirectoriesMixin,
    SampleDirMixin,
    FileSystemAssertsMixin,
    TestCase,
):
    # Exercises retry_failed_file and the post_save cleanup of
    # CONSUMPTION_FAILED_DIR when a failed task is acknowledged.

    def do_failed_task(self, test_file: Path) -> PaperlessTask:
        """
        Stash test_file in CONSUMPTION_FAILED_DIR and create a matching
        FAILURE-state PaperlessTask, mirroring what the consumer does when
        a consume attempt fails. Returns the created task.
        """
        failed_file = settings.CONSUMPTION_FAILED_DIR / test_file.name
        shutil.copy(test_file, failed_file)
        task = PaperlessTask.objects.create(
            type=PaperlessTask.TaskType.AUTO,
            task_id=str(uuid.uuid4()),
            task_file_name=failed_file.name,
            task_name=PaperlessTask.TaskName.CONSUME_FILE,
            status=states.FAILURE,
            date_created=timezone.now(),
            date_done=timezone.now(),
        )
        # Sanity: the stashed copy must exist before the test proper runs.
        self.assertIsFile(settings.CONSUMPTION_FAILED_DIR / task.task_file_name)
        return task

    @mock.patch("documents.tasks.consume_file.delay")
    @mock.patch("documents.tasks.run_subprocess")
    def test_retry_consume_clean(self, m_subprocess, m_consume_file) -> None:
        # clean=True should run qpdf (mocked here, returncode 0) and then
        # dispatch a new consume_file task for the working copy.
        task = self.do_failed_task(self.SAMPLE_DIR / "corrupted.pdf")
        m_subprocess.return_value.returncode = 0
        task_id = tasks.retry_failed_file(task_id=task.task_id, clean=True)
        self.assertIsNotNone(task_id)
        m_consume_file.assert_called_once()

    def test_cleanup(self) -> None:
        # Acknowledging (dismissing) a failed task triggers the post_save
        # handler that deletes its stashed file from the failed dir.
        task = self.do_failed_task(self.SAMPLE_DIR / "corrupted.pdf")
        task.acknowledged = True
        task.save()
        self.assertIsNotFile(settings.CONSUMPTION_FAILED_DIR / task.task_file_name)
class TestUpdateContent(DirectoriesMixin, TestCase):
def test_update_content_maybe_archive_file(self) -> None:
"""

View File

@@ -37,6 +37,7 @@ def setup_directories():
dirs.scratch_dir = Path(tempfile.mkdtemp()).resolve()
dirs.media_dir = Path(tempfile.mkdtemp()).resolve()
dirs.consumption_dir = Path(tempfile.mkdtemp()).resolve()
dirs.consumption_failed_dir = Path(tempfile.mkdtemp("failed")).resolve()
dirs.static_dir = Path(tempfile.mkdtemp()).resolve()
dirs.index_dir = dirs.data_dir / "index"
dirs.originals_dir = dirs.media_dir / "documents" / "originals"
@@ -58,6 +59,7 @@ def setup_directories():
THUMBNAIL_DIR=dirs.thumbnail_dir,
ARCHIVE_DIR=dirs.archive_dir,
CONSUMPTION_DIR=dirs.consumption_dir,
CONSUMPTION_FAILED_DIR=dirs.consumption_failed_dir,
LOGGING_DIR=dirs.logging_dir,
INDEX_DIR=dirs.index_dir,
STATIC_ROOT=dirs.static_dir,
@@ -74,6 +76,7 @@ def remove_dirs(dirs) -> None:
shutil.rmtree(dirs.data_dir, ignore_errors=True)
shutil.rmtree(dirs.scratch_dir, ignore_errors=True)
shutil.rmtree(dirs.consumption_dir, ignore_errors=True)
shutil.rmtree(dirs.consumption_failed_dir, ignore_errors=True)
shutil.rmtree(dirs.static_dir, ignore_errors=True)
dirs.settings_override.disable()

View File

@@ -7,7 +7,6 @@ import tempfile
import zipfile
from collections import defaultdict
from collections import deque
from contextlib import nullcontext
from datetime import datetime
from pathlib import Path
from time import mktime
@@ -190,6 +189,7 @@ from documents.serialisers import NotesSerializer
from documents.serialisers import PostDocumentSerializer
from documents.serialisers import RemovePasswordDocumentsSerializer
from documents.serialisers import ReprocessDocumentsSerializer
from documents.serialisers import RetryTaskSerializer
from documents.serialisers import RotateDocumentsSerializer
from documents.serialisers import RunTaskViewSerializer
from documents.serialisers import SavedViewSerializer
@@ -212,6 +212,7 @@ from documents.tasks import consume_file
from documents.tasks import empty_trash
from documents.tasks import index_optimize
from documents.tasks import llmindex_index
from documents.tasks import retry_failed_file
from documents.tasks import sanity_check
from documents.tasks import train_classifier
from documents.tasks import update_document_parent_tags
@@ -226,7 +227,6 @@ from paperless.celery import app as celery_app
from paperless.config import AIConfig
from paperless.config import GeneralConfig
from paperless.models import ApplicationConfiguration
from paperless.parsers import ParserProtocol
from paperless.serialisers import GroupSerializer
from paperless.serialisers import UserSerializer
from paperless.views import StandardPagination
@@ -1086,11 +1086,9 @@ class DocumentViewSet(
parser_class = get_parser_class_for_mime_type(mime_type)
if parser_class:
parser = parser_class(progress_callback=None, logging_group=None)
cm = parser if isinstance(parser, ParserProtocol) else nullcontext(parser)
try:
with cm:
return parser.extract_metadata(file, mime_type)
return parser.extract_metadata(file, mime_type)
except Exception: # pragma: no cover
logger.exception(f"Issue getting metadata for {file}")
# TODO: cover GPG errors, remove later.
@@ -3471,6 +3469,25 @@ class TasksViewSet(ReadOnlyModelViewSet):
queryset = PaperlessTask.objects.filter(task_id=task_id)
return queryset
    @action(methods=["post"], detail=True)
    def retry(self, request, pk=None):
        """
        Re-queue consumption of the failed file behind this task.

        Validates an optional ``clean`` flag (RetryTaskSerializer) and calls
        retry_failed_file. Responds with the id of the newly dispatched
        consume task, or 400 when the stashed original file is missing or
        the retry fails for any other reason.
        """
        task = self.get_object()
        serializer = RetryTaskSerializer(data=request.data)
        serializer.is_valid(raise_exception=True)
        clean = serializer.validated_data.get("clean")
        try:
            # NOTE(review): retry_failed_file is invoked synchronously (no
            # .delay) so the copy/clean work runs inside the request —
            # presumably acceptable since the heavy consumption itself is
            # dispatched async within; confirm this is intentional.
            new_task_id = retry_failed_file(task.task_id, clean)
            return Response({"task_id": new_task_id})
        except FileNotFoundError:
            return HttpResponseBadRequest("Original file not found")
        except Exception as e:
            logger.warning(f"An error occurred retrying task: {e!s}")
            return HttpResponseBadRequest(
                "Error retrying task, check logs for more detail.",
            )
@action(
methods=["post"],
detail=False,

View File

@@ -68,6 +68,10 @@ def paths_check(app_configs, **kwargs) -> list[Error]:
+ path_check("PAPERLESS_EMPTY_TRASH_DIR", settings.EMPTY_TRASH_DIR)
+ path_check("PAPERLESS_MEDIA_ROOT", settings.MEDIA_ROOT)
+ path_check("PAPERLESS_CONSUMPTION_DIR", settings.CONSUMPTION_DIR)
+ path_check(
"PAPERLESS_CONSUMPTION_FAILED_DIR",
settings.CONSUMPTION_FAILED_DIR,
)
)

View File

@@ -194,10 +194,8 @@ class ParserRegistry:
at runtime regardless of registration order.
"""
from paperless.parsers.text import TextDocumentParser
from paperless.parsers.tika import TikaDocumentParser
self.register_builtin(TextDocumentParser)
self.register_builtin(TikaDocumentParser)
# ------------------------------------------------------------------
# Discovery

View File

@@ -1,440 +0,0 @@
"""
Built-in Tika document parser.
Handles Office documents (DOCX, ODT, XLS, XLSX, PPT, PPTX, RTF, etc.) by
sending them to an Apache Tika server for text extraction and a Gotenberg
server for PDF conversion. Because the source formats cannot be rendered by
a browser natively, the parser always produces a PDF rendition for display.
"""
from __future__ import annotations
import logging
import shutil
import tempfile
from contextlib import ExitStack
from pathlib import Path
from typing import TYPE_CHECKING
from typing import Self
import httpx
from django.conf import settings
from django.utils import timezone
from gotenberg_client import GotenbergClient
from gotenberg_client.options import PdfAFormat
from tika_client import TikaClient
from documents.parsers import ParseError
from documents.parsers import make_thumbnail_from_pdf
from paperless.config import OutputTypeConfig
from paperless.models import OutputTypeChoices
from paperless.version import __full_version_str__
if TYPE_CHECKING:
import datetime
from types import TracebackType
from paperless.parsers import MetadataEntry
logger = logging.getLogger("paperless.parsing.tika")
_SUPPORTED_MIME_TYPES: dict[str, str] = {
"application/msword": ".doc",
"application/vnd.openxmlformats-officedocument.wordprocessingml.document": ".docx",
"application/vnd.ms-excel": ".xls",
"application/vnd.openxmlformats-officedocument.spreadsheetml.sheet": ".xlsx",
"application/vnd.ms-powerpoint": ".ppt",
"application/vnd.openxmlformats-officedocument.presentationml.presentation": ".pptx",
"application/vnd.openxmlformats-officedocument.presentationml.slideshow": ".ppsx",
"application/vnd.oasis.opendocument.presentation": ".odp",
"application/vnd.oasis.opendocument.spreadsheet": ".ods",
"application/vnd.oasis.opendocument.text": ".odt",
"application/vnd.oasis.opendocument.graphics": ".odg",
"text/rtf": ".rtf",
}
class TikaDocumentParser:
"""Parse Office documents via Apache Tika and Gotenberg for Paperless-ngx.
Text extraction is handled by the Tika server. PDF conversion for display
is handled by Gotenberg (LibreOffice route). Because the source formats
cannot be rendered by a browser natively, ``requires_pdf_rendition`` is
True and the PDF is always produced regardless of the ``produce_archive``
flag passed to ``parse``.
Both ``TikaClient`` and ``GotenbergClient`` are opened once in
``__enter__`` via an ``ExitStack`` and shared across ``parse``,
``extract_metadata``, and ``_convert_to_pdf`` calls, then closed via
``ExitStack.close()`` in ``__exit__``. The parser must always be used
as a context manager.
Class attributes
----------------
name : str
Human-readable parser name.
version : str
Semantic version string, kept in sync with Paperless-ngx releases.
author : str
Maintainer name.
url : str
Issue tracker / source URL.
"""
name: str = "Paperless-ngx Tika Parser"
version: str = __full_version_str__
author: str = "Paperless-ngx Contributors"
url: str = "https://github.com/paperless-ngx/paperless-ngx"
# ------------------------------------------------------------------
# Class methods
# ------------------------------------------------------------------
@classmethod
def supported_mime_types(cls) -> dict[str, str]:
"""Return the MIME types this parser handles.
Returns
-------
dict[str, str]
Mapping of MIME type to preferred file extension.
"""
return _SUPPORTED_MIME_TYPES
@classmethod
def score(
cls,
mime_type: str,
filename: str,
path: Path | None = None,
) -> int | None:
"""Return the priority score for handling this file.
Returns ``None`` when Tika integration is disabled so the registry
skips this parser entirely.
Parameters
----------
mime_type:
Detected MIME type of the file.
filename:
Original filename including extension.
path:
Optional filesystem path. Not inspected by this parser.
Returns
-------
int | None
10 if TIKA_ENABLED and the MIME type is supported, otherwise None.
"""
if not settings.TIKA_ENABLED:
return None
if mime_type in _SUPPORTED_MIME_TYPES:
return 10
return None
# ------------------------------------------------------------------
# Properties
# ------------------------------------------------------------------
@property
def can_produce_archive(self) -> bool:
"""Whether this parser can produce a searchable PDF archive copy.
Returns
-------
bool
Always False — Tika produces a display PDF, not an OCR archive.
"""
return False
@property
def requires_pdf_rendition(self) -> bool:
"""Whether the parser must produce a PDF for the frontend to display.
Returns
-------
bool
Always True — Office formats cannot be rendered natively in a
browser, so a PDF conversion is always required for display.
"""
return True
# ------------------------------------------------------------------
# Lifecycle
# ------------------------------------------------------------------
def __init__(self, logging_group: object = None) -> None:
settings.SCRATCH_DIR.mkdir(parents=True, exist_ok=True)
self._tempdir = Path(
tempfile.mkdtemp(prefix="paperless-", dir=settings.SCRATCH_DIR),
)
self._text: str | None = None
self._date: datetime.datetime | None = None
self._archive_path: Path | None = None
self._exit_stack = ExitStack()
self._tika_client: TikaClient | None = None
self._gotenberg_client: GotenbergClient | None = None
def __enter__(self) -> Self:
self._tika_client = self._exit_stack.enter_context(
TikaClient(
tika_url=settings.TIKA_ENDPOINT,
timeout=settings.CELERY_TASK_TIME_LIMIT,
),
)
self._gotenberg_client = self._exit_stack.enter_context(
GotenbergClient(
host=settings.TIKA_GOTENBERG_ENDPOINT,
timeout=settings.CELERY_TASK_TIME_LIMIT,
),
)
return self
def __exit__(
self,
exc_type: type[BaseException] | None,
exc_val: BaseException | None,
exc_tb: TracebackType | None,
) -> None:
self._exit_stack.close()
logger.debug("Cleaning up temporary directory %s", self._tempdir)
shutil.rmtree(self._tempdir, ignore_errors=True)
# ------------------------------------------------------------------
# Core parsing interface
# ------------------------------------------------------------------
def parse(
self,
document_path: Path,
mime_type: str,
*,
produce_archive: bool = True,
) -> None:
"""Send the document to Tika for text extraction and Gotenberg for PDF.
Because ``requires_pdf_rendition`` is True the PDF conversion is
always performed — the ``produce_archive`` flag is intentionally
ignored.
Parameters
----------
document_path:
Absolute path to the document file to parse.
mime_type:
Detected MIME type of the document.
produce_archive:
Accepted for protocol compatibility but ignored; the PDF rendition
is always produced since the source format cannot be displayed
natively in the browser.
Raises
------
documents.parsers.ParseError
If Tika or Gotenberg returns an error.
"""
if TYPE_CHECKING:
assert self._tika_client is not None
logger.info("Sending %s to Tika server", document_path)
try:
try:
parsed = self._tika_client.tika.as_text.from_file(
document_path,
mime_type,
)
except httpx.HTTPStatusError as err:
# Workaround https://issues.apache.org/jira/browse/TIKA-4110
# Tika fails with some files as multi-part form data
if err.response.status_code == httpx.codes.INTERNAL_SERVER_ERROR:
parsed = self._tika_client.tika.as_text.from_buffer(
document_path.read_bytes(),
mime_type,
)
else: # pragma: no cover
raise
except Exception as err:
raise ParseError(
f"Could not parse {document_path} with tika server at "
f"{settings.TIKA_ENDPOINT}: {err}",
) from err
self._text = parsed.content
if self._text is not None:
self._text = self._text.strip()
self._date = parsed.created
if self._date is not None and timezone.is_naive(self._date):
self._date = timezone.make_aware(self._date)
# Always convert — requires_pdf_rendition=True means the browser
# cannot display the source format natively.
self._archive_path = self._convert_to_pdf(document_path)
# ------------------------------------------------------------------
# Result accessors
# ------------------------------------------------------------------
def get_text(self) -> str | None:
"""Return the plain-text content extracted during parse.
Returns
-------
str | None
Extracted text, or None if parse has not been called yet.
"""
return self._text
def get_date(self) -> datetime.datetime | None:
"""Return the document date detected during parse.
Returns
-------
datetime.datetime | None
Creation date from Tika metadata, or None if not detected.
"""
return self._date
def get_archive_path(self) -> Path | None:
"""Return the path to the generated PDF rendition, or None.
Returns
-------
Path | None
Path to the PDF produced by Gotenberg, or None if parse has not
been called yet.
"""
return self._archive_path
# ------------------------------------------------------------------
# Thumbnail and metadata
# ------------------------------------------------------------------
def get_thumbnail(self, document_path: Path, mime_type: str) -> Path:
"""Generate a thumbnail from the PDF rendition of the document.
Converts the document to PDF first if not already done.
Parameters
----------
document_path:
Absolute path to the source document.
mime_type:
Detected MIME type of the document.
Returns
-------
Path
Path to the generated WebP thumbnail inside the temporary directory.
"""
if self._archive_path is None:
self._archive_path = self._convert_to_pdf(document_path)
return make_thumbnail_from_pdf(self._archive_path, self._tempdir)
def get_page_count(
self,
document_path: Path,
mime_type: str,
) -> int | None:
"""Return the number of pages in the document.
Returns
-------
int | None
Always None — page count is not available from Tika.
"""
return None
def extract_metadata(
self,
document_path: Path,
mime_type: str,
) -> list[MetadataEntry]:
"""Extract format-specific metadata via the Tika metadata endpoint.
Returns
-------
list[MetadataEntry]
All key/value pairs returned by Tika, or ``[]`` on error.
"""
if TYPE_CHECKING:
assert self._tika_client is not None
try:
parsed = self._tika_client.metadata.from_file(document_path, mime_type)
return [
{
"namespace": "",
"prefix": "",
"key": key,
"value": parsed.data[key],
}
for key in parsed.data
]
except Exception as e:
logger.warning(
"Error while fetching document metadata for %s: %s",
document_path,
e,
)
return []
# ------------------------------------------------------------------
# Private helpers
# ------------------------------------------------------------------
def _convert_to_pdf(self, document_path: Path) -> Path:
    """Render the document to PDF using Gotenberg's LibreOffice route.

    Parameters
    ----------
    document_path:
        Absolute path to the source document.

    Returns
    -------
    Path
        Path to the generated PDF inside the temporary directory.

    Raises
    ------
    documents.parsers.ParseError
        If Gotenberg returns an error.
    """
    if TYPE_CHECKING:
        assert self._gotenberg_client is not None
    target = self._tempdir / "convert.pdf"
    logger.info("Converting %s to PDF as %s", document_path, target)
    with self._gotenberg_client.libre_office.to_pdf() as route:
        # Pick the PDF/A flavour of the output. OutputTypeConfig reads the
        # database-stored ApplicationConfiguration first and falls back to
        # the PAPERLESS_OCR_OUTPUT_TYPE env var.
        configured = OutputTypeConfig().output_type
        if configured == OutputTypeChoices.PDF_A1:
            # Gotenberg cannot produce PDF/A-1a; downgrade to PDF/A-2b.
            logger.warning(
                "Gotenberg does not support PDF/A-1a, choosing PDF/A-2b instead",
            )
            route.pdf_format(PdfAFormat.A2b)
        elif configured in {OutputTypeChoices.PDF_A, OutputTypeChoices.PDF_A2}:
            route.pdf_format(PdfAFormat.A2b)
        elif configured == OutputTypeChoices.PDF_A3:
            route.pdf_format(PdfAFormat.A3b)
        route.convert(document_path)
        try:
            response = route.run()
            target.write_bytes(response.content)
            return target
        except Exception as err:
            raise ParseError(
                f"Error while converting document to PDF: {err}",
            ) from err

View File

@@ -98,6 +98,11 @@ CONSUMPTION_DIR = get_path_from_env(
BASE_DIR.parent / "consume",
)
CONSUMPTION_FAILED_DIR = get_path_from_env(
"PAPERLESS_CONSUMPTION_FAILED_DIR",
CONSUMPTION_DIR / "failed",
)
# This will be created if it doesn't exist
SCRATCH_DIR = get_path_from_env(
"PAPERLESS_SCRATCH_DIR",
@@ -782,6 +787,8 @@ CONSUMER_IGNORE_PATTERNS = list(
),
),
)
if CONSUMPTION_DIR in CONSUMPTION_FAILED_DIR.parents:
CONSUMER_IGNORE_PATTERNS.append(CONSUMPTION_FAILED_DIR.name)
# Directories to always ignore. These are matched by directory name, not full path
CONSUMER_IGNORE_DIRS = list(

View File

@@ -11,7 +11,6 @@ from typing import TYPE_CHECKING
import pytest
from paperless.parsers.text import TextDocumentParser
from paperless.parsers.tika import TikaDocumentParser
if TYPE_CHECKING:
from collections.abc import Generator
@@ -75,86 +74,3 @@ def text_parser() -> Generator[TextDocumentParser, None, None]:
"""
with TextDocumentParser() as parser:
yield parser
# ------------------------------------------------------------------
# Tika parser sample files
# ------------------------------------------------------------------
@pytest.fixture(scope="session")
def tika_samples_dir(samples_dir: Path) -> Path:
"""Absolute path to the Tika parser sample files directory.
Returns
-------
Path
``<samples_dir>/tika/``
"""
return samples_dir / "tika"
@pytest.fixture(scope="session")
def sample_odt_file(tika_samples_dir: Path) -> Path:
"""Path to a sample ODT file.
Returns
-------
Path
Absolute path to ``tika/sample.odt``.
"""
return tika_samples_dir / "sample.odt"
@pytest.fixture(scope="session")
def sample_docx_file(tika_samples_dir: Path) -> Path:
"""Path to a sample DOCX file.
Returns
-------
Path
Absolute path to ``tika/sample.docx``.
"""
return tika_samples_dir / "sample.docx"
@pytest.fixture(scope="session")
def sample_doc_file(tika_samples_dir: Path) -> Path:
"""Path to a sample DOC file.
Returns
-------
Path
Absolute path to ``tika/sample.doc``.
"""
return tika_samples_dir / "sample.doc"
@pytest.fixture(scope="session")
def sample_broken_odt(tika_samples_dir: Path) -> Path:
"""Path to a broken ODT file that triggers the multi-part fallback.
Returns
-------
Path
Absolute path to ``tika/multi-part-broken.odt``.
"""
return tika_samples_dir / "multi-part-broken.odt"
# ------------------------------------------------------------------
# Tika parser instance
# ------------------------------------------------------------------
@pytest.fixture()
def tika_parser() -> Generator[TikaDocumentParser, None, None]:
"""Yield a TikaDocumentParser and clean up its temporary directory afterwards.
Yields
------
TikaDocumentParser
A ready-to-use parser instance.
"""
with TikaDocumentParser() as parser:
yield parser

View File

@@ -24,6 +24,7 @@ class PaperlessTestDirs:
data_dir: Path
media_dir: Path
consumption_dir: Path
consumption_failed_dir: Path
# TODO: consolidate with documents/tests/conftest.py PaperlessDirs/paperless_dirs
@@ -33,18 +34,21 @@ def directories(tmp_path: Path, settings: SettingsWrapper) -> PaperlessTestDirs:
data_dir = tmp_path / "data"
media_dir = tmp_path / "media"
consumption_dir = tmp_path / "consumption"
consumption_failed_dir = tmp_path / "consumption_failed"
for d in (data_dir, media_dir, consumption_dir):
for d in (data_dir, media_dir, consumption_dir, consumption_failed_dir):
d.mkdir()
settings.DATA_DIR = data_dir
settings.MEDIA_ROOT = media_dir
settings.CONSUMPTION_DIR = consumption_dir
settings.CONSUMPTION_FAILED_DIR = consumption_failed_dir
return PaperlessTestDirs(
data_dir=data_dir,
media_dir=media_dir,
consumption_dir=consumption_dir,
consumption_failed_dir=consumption_failed_dir,
)
@@ -64,10 +68,11 @@ class TestChecks:
settings.MEDIA_ROOT = Path("uuh")
settings.DATA_DIR = Path("whatever")
settings.CONSUMPTION_DIR = Path("idontcare")
settings.CONSUMPTION_FAILED_DIR = Path("nope")
msgs = paths_check(None)
assert len(msgs) == 3, str(msgs)
assert len(msgs) == 4, str(msgs)
for msg in msgs:
assert msg.msg.endswith("is set but doesn't exist.")
@@ -75,6 +80,7 @@ class TestChecks:
directories.data_dir.chmod(0o000)
directories.media_dir.chmod(0o000)
directories.consumption_dir.chmod(0o000)
directories.consumption_failed_dir.chmod(0o000)
try:
msgs = paths_check(None)
@@ -82,8 +88,9 @@ class TestChecks:
directories.data_dir.chmod(0o777)
directories.media_dir.chmod(0o777)
directories.consumption_dir.chmod(0o777)
directories.consumption_failed_dir.chmod(0o777)
assert len(msgs) == 3
assert len(msgs) == 4
for msg in msgs:
assert msg.msg.endswith("is not writeable")

View File

@@ -1,12 +1,10 @@
def get_parser(*args, **kwargs):
from paperless.parsers.text import TextDocumentParser
# TextDocumentParser accepts logging_group for constructor compatibility but
# does not store or use it (no legacy DocumentParser base class).
# progress_callback is also not used. Both may arrive as a positional arg
# (consumer) or a keyword arg (views); *args absorbs the positional form,
# kwargs.pop handles the keyword form. Phase 4 will replace this signal
# path with the new ParserRegistry so the shim can be removed at that point.
# The new TextDocumentParser does not accept the legacy logging_group /
# progress_callback kwargs injected by the old signal-based consumer.
# These are dropped here; Phase 4 will replace this signal path with the
# new ParserRegistry so the shim can be removed at that point.
kwargs.pop("logging_group", None)
kwargs.pop("progress_callback", None)
return TextDocumentParser()

View File

@@ -0,0 +1,136 @@
from pathlib import Path
import httpx
from django.conf import settings
from django.utils import timezone
from gotenberg_client import GotenbergClient
from gotenberg_client.options import PdfAFormat
from tika_client import TikaClient
from documents.parsers import DocumentParser
from documents.parsers import ParseError
from documents.parsers import make_thumbnail_from_pdf
from paperless.config import OutputTypeConfig
from paperless.models import OutputTypeChoices
class TikaDocumentParser(DocumentParser):
    """
    This parser sends documents to a local tika server
    """

    # Logger channel name; presumably consumed by the DocumentParser base
    # class to build self.log — confirm against documents.parsers.
    logging_name = "paperless.parsing.tika"

    def get_thumbnail(self, document_path, mime_type, file_name=None):
        """Render a WebP thumbnail from the PDF rendition of the document.

        Lazily converts the document to PDF via Gotenberg if parse() has
        not run yet, then delegates to make_thumbnail_from_pdf.
        """
        if not self.archive_path:
            self.archive_path = self.convert_to_pdf(document_path, file_name)

        return make_thumbnail_from_pdf(
            self.archive_path,
            self.tempdir,
            self.logging_group,
        )

    def extract_metadata(self, document_path, mime_type):
        """Fetch format-specific metadata from the Tika metadata endpoint.

        Best-effort: any failure is logged as a warning and an empty list
        is returned instead of raising.
        """
        try:
            with TikaClient(
                tika_url=settings.TIKA_ENDPOINT,
                timeout=settings.CELERY_TASK_TIME_LIMIT,
            ) as client:
                parsed = client.metadata.from_file(document_path, mime_type)
                return [
                    {
                        "namespace": "",
                        "prefix": "",
                        "key": key,
                        "value": parsed.data[key],
                    }
                    for key in parsed.data
                ]
        except Exception as e:
            self.log.warning(
                f"Error while fetching document metadata for {document_path}: {e}",
            )
            return []

    def parse(self, document_path: Path, mime_type: str, file_name=None) -> None:
        """Extract text and creation date via Tika, then produce a PDF rendition.

        Populates self.text (stripped), self.date (made timezone-aware if
        naive) and self.archive_path as side effects.

        Raises ParseError when the Tika server cannot parse the document.
        """
        self.log.info(f"Sending {document_path} to Tika server")

        try:
            with TikaClient(
                tika_url=settings.TIKA_ENDPOINT,
                timeout=settings.CELERY_TASK_TIME_LIMIT,
            ) as client:
                try:
                    parsed = client.tika.as_text.from_file(document_path, mime_type)
                except httpx.HTTPStatusError as err:
                    # Workaround https://issues.apache.org/jira/browse/TIKA-4110
                    # Tika fails with some files as multi-part form data
                    if err.response.status_code == httpx.codes.INTERNAL_SERVER_ERROR:
                        parsed = client.tika.as_text.from_buffer(
                            document_path.read_bytes(),
                            mime_type,
                        )
                    else:  # pragma: no cover
                        raise
        except Exception as err:
            raise ParseError(
                f"Could not parse {document_path} with tika server at "
                f"{settings.TIKA_ENDPOINT}: {err}",
            ) from err

        self.text = parsed.content
        if self.text is not None:
            self.text = self.text.strip()

        # Naive datetimes from Tika are promoted to the current timezone.
        self.date = parsed.created
        if self.date is not None and timezone.is_naive(self.date):
            self.date = timezone.make_aware(self.date)

        self.archive_path = self.convert_to_pdf(document_path, file_name)

    def convert_to_pdf(self, document_path: Path, file_name):
        """Convert the document to PDF using Gotenberg's LibreOffice route.

        Returns the path to the generated PDF inside the temporary
        directory; raises ParseError when Gotenberg returns an error.
        The file_name argument is accepted but not used here.
        """
        pdf_path = Path(self.tempdir) / "convert.pdf"
        self.log.info(f"Converting {document_path} to PDF as {pdf_path}")

        with (
            GotenbergClient(
                host=settings.TIKA_GOTENBERG_ENDPOINT,
                timeout=settings.CELERY_TASK_TIME_LIMIT,
            ) as client,
            client.libre_office.to_pdf() as route,
        ):
            # Set the output format of the resulting PDF
            if settings.OCR_OUTPUT_TYPE in {
                OutputTypeChoices.PDF_A,
                OutputTypeChoices.PDF_A2,
            }:
                route.pdf_format(PdfAFormat.A2b)
            elif settings.OCR_OUTPUT_TYPE == OutputTypeChoices.PDF_A1:
                # Gotenberg has no PDF/A-1a support, so A-2b is substituted.
                self.log.warning(
                    "Gotenberg does not support PDF/A-1a, choosing PDF/A-2b instead",
                )
                route.pdf_format(PdfAFormat.A2b)
            elif settings.OCR_OUTPUT_TYPE == OutputTypeChoices.PDF_A3:
                route.pdf_format(PdfAFormat.A3b)

            route.convert(document_path)

            try:
                response = route.run()

                pdf_path.write_bytes(response.content)

                return pdf_path
            except Exception as err:
                raise ParseError(
                    f"Error while converting document to PDF: {err}",
                ) from err

    def get_settings(self) -> OutputTypeConfig:
        """
        This parser only uses the PDF output type configuration currently
        """
        return OutputTypeConfig()

View File

@@ -1,15 +1,7 @@
def get_parser(*args, **kwargs):
from paperless.parsers.tika import TikaDocumentParser
from paperless_tika.parsers import TikaDocumentParser
# TikaDocumentParser accepts logging_group for constructor compatibility but
# does not store or use it (no legacy DocumentParser base class).
# progress_callback is also not used. Both may arrive as a positional arg
# (consumer) or a keyword arg (views); *args absorbs the positional form,
# kwargs.pop handles the keyword form. Phase 4 will replace this signal
# path with the new ParserRegistry so the shim can be removed at that point.
kwargs.pop("logging_group", None)
kwargs.pop("progress_callback", None)
return TikaDocumentParser()
return TikaDocumentParser(*args, **kwargs)
def tika_consumer_declaration(sender, **kwargs):

View File

View File

@@ -0,0 +1,41 @@
from collections.abc import Generator
from pathlib import Path
import pytest
from paperless_tika.parsers import TikaDocumentParser
@pytest.fixture()
def tika_parser() -> Generator[TikaDocumentParser, None, None]:
    """Yield a TikaDocumentParser, always cleaning up its temp directory.

    The parser is constructed *before* the try block: in the original
    form, a failing constructor left ``parser`` unbound and the
    ``finally`` clause raised NameError, masking the real exception.
    """
    # TODO(stumpylog): Cleanup once all parsers are handled
    parser = TikaDocumentParser(logging_group=None)
    try:
        yield parser
    finally:
        parser.cleanup()
@pytest.fixture(scope="session")
def sample_dir() -> Path:
    """Absolute path to this test package's ``samples`` directory."""
    return (Path(__file__).parent / "samples").resolve()
@pytest.fixture(scope="session")
def sample_odt_file(sample_dir: Path) -> Path:
    """Path to the ``sample.odt`` test document."""
    return sample_dir.joinpath("sample.odt")
@pytest.fixture(scope="session")
def sample_docx_file(sample_dir: Path) -> Path:
    """Path to the ``sample.docx`` test document."""
    return sample_dir.joinpath("sample.docx")
@pytest.fixture(scope="session")
def sample_doc_file(sample_dir: Path) -> Path:
    """Path to the ``sample.doc`` test document (legacy Word format)."""
    return sample_dir.joinpath("sample.doc")
@pytest.fixture(scope="session")
def sample_broken_odt(sample_dir: Path) -> Path:
    """Path to ``multi-part-broken.odt``, which triggers the multi-part fallback."""
    return sample_dir.joinpath("multi-part-broken.odt")

View File

@@ -4,7 +4,7 @@ from pathlib import Path
import pytest
from documents.tests.utils import util_call_with_backoff
from paperless.parsers.tika import TikaDocumentParser
from paperless_tika.parsers import TikaDocumentParser
@pytest.mark.skipif(
@@ -42,15 +42,14 @@ class TestTikaParserAgainstServer:
)
assert (
tika_parser.get_text()
tika_parser.text
== "This is an ODT test document, created September 14, 2022"
)
archive = tika_parser.get_archive_path()
assert archive is not None
assert b"PDF-" in archive.read_bytes()[:10]
assert tika_parser.archive_path is not None
assert b"PDF-" in tika_parser.archive_path.read_bytes()[:10]
# TODO: Unsure what can set the Creation-Date field in a document, enable when possible
# self.assertEqual(tika_parser.get_date(), datetime.datetime(2022, 9, 14))
# self.assertEqual(tika_parser.date, datetime.datetime(2022, 9, 14))
def test_basic_parse_docx(
self,
@@ -75,15 +74,14 @@ class TestTikaParserAgainstServer:
)
assert (
tika_parser.get_text()
tika_parser.text
== "This is an DOCX test document, also made September 14, 2022"
)
archive = tika_parser.get_archive_path()
assert archive is not None
with archive.open("rb") as f:
assert tika_parser.archive_path is not None
with Path(tika_parser.archive_path).open("rb") as f:
assert b"PDF-" in f.read()[:10]
# self.assertEqual(tika_parser.get_date(), datetime.datetime(2022, 9, 14))
# self.assertEqual(tika_parser.date, datetime.datetime(2022, 9, 14))
def test_basic_parse_doc(
self,
@@ -104,12 +102,13 @@ class TestTikaParserAgainstServer:
[sample_doc_file, "application/msword"],
)
text = tika_parser.get_text()
assert text is not None
assert "This is a test document, saved in the older .doc format" in text
archive = tika_parser.get_archive_path()
assert archive is not None
with archive.open("rb") as f:
assert tika_parser.text is not None
assert (
"This is a test document, saved in the older .doc format"
in tika_parser.text
)
assert tika_parser.archive_path is not None
with Path(tika_parser.archive_path).open("rb") as f:
assert b"PDF-" in f.read()[:10]
def test_tika_fails_multi_part(
@@ -134,7 +133,6 @@ class TestTikaParserAgainstServer:
[sample_broken_odt, "application/vnd.oasis.opendocument.text"],
)
archive = tika_parser.get_archive_path()
assert archive is not None
with archive.open("rb") as f:
assert tika_parser.archive_path is not None
with Path(tika_parser.archive_path).open("rb") as f:
assert b"PDF-" in f.read()[:10]

View File

@@ -9,56 +9,7 @@ from pytest_django.fixtures import SettingsWrapper
from pytest_httpx import HTTPXMock
from documents.parsers import ParseError
from paperless.parsers import ParserProtocol
from paperless.parsers.tika import TikaDocumentParser
class TestTikaParserRegistryInterface:
"""Verify that TikaDocumentParser satisfies the ParserProtocol contract."""
def test_satisfies_parser_protocol(self) -> None:
assert isinstance(TikaDocumentParser(), ParserProtocol)
def test_supported_mime_types_is_classmethod(self) -> None:
mime_types = TikaDocumentParser.supported_mime_types()
assert isinstance(mime_types, dict)
assert len(mime_types) > 0
def test_score_returns_none_when_tika_disabled(
self,
settings: SettingsWrapper,
) -> None:
settings.TIKA_ENABLED = False
result = TikaDocumentParser.score(
"application/vnd.oasis.opendocument.text",
"sample.odt",
)
assert result is None
def test_score_returns_int_when_tika_enabled(
self,
settings: SettingsWrapper,
) -> None:
settings.TIKA_ENABLED = True
result = TikaDocumentParser.score(
"application/vnd.oasis.opendocument.text",
"sample.odt",
)
assert isinstance(result, int)
def test_score_returns_none_for_unsupported_mime(
self,
settings: SettingsWrapper,
) -> None:
settings.TIKA_ENABLED = True
result = TikaDocumentParser.score("application/pdf", "doc.pdf")
assert result is None
def test_can_produce_archive_is_false(self) -> None:
assert TikaDocumentParser().can_produce_archive is False
def test_requires_pdf_rendition_is_true(self) -> None:
assert TikaDocumentParser().requires_pdf_rendition is True
from paperless_tika.parsers import TikaDocumentParser
@pytest.mark.django_db()
@@ -85,12 +36,12 @@ class TestTikaParser:
tika_parser.parse(sample_odt_file, "application/vnd.oasis.opendocument.text")
assert tika_parser.get_text() == "the content"
assert tika_parser.get_archive_path() is not None
with Path(tika_parser.get_archive_path()).open("rb") as f:
assert tika_parser.text == "the content"
assert tika_parser.archive_path is not None
with Path(tika_parser.archive_path).open("rb") as f:
assert f.read() == b"PDF document"
assert tika_parser.get_date() == datetime.datetime(
assert tika_parser.date == datetime.datetime(
2020,
11,
21,
@@ -138,7 +89,7 @@ class TestTikaParser:
httpx_mock.add_response(status_code=HTTPStatus.INTERNAL_SERVER_ERROR)
with pytest.raises(ParseError):
tika_parser._convert_to_pdf(sample_odt_file)
tika_parser.convert_to_pdf(sample_odt_file, None)
@pytest.mark.parametrize(
("setting_value", "expected_form_value"),
@@ -155,6 +106,7 @@ class TestTikaParser:
expected_form_value: str,
httpx_mock: HTTPXMock,
settings: SettingsWrapper,
tika_parser: TikaDocumentParser,
sample_odt_file: Path,
) -> None:
"""
@@ -165,8 +117,6 @@ class TestTikaParser:
THEN:
- Request to Gotenberg contains the expected PDF/A format string
"""
# Parser must be created after the setting is changed so that
# OutputTypeConfig reads the correct value at __init__ time.
settings.OCR_OUTPUT_TYPE = setting_value
httpx_mock.add_response(
status_code=codes.OK,
@@ -174,8 +124,7 @@ class TestTikaParser:
method="POST",
)
with TikaDocumentParser() as parser:
parser._convert_to_pdf(sample_odt_file)
tika_parser.convert_to_pdf(sample_odt_file, None)
request = httpx_mock.get_request()