mirror of
https://github.com/paperless-ngx/paperless-ngx.git
synced 2026-03-17 22:45:58 +00:00
Compare commits
2 Commits
feature-re
...
dependabot
| Author | SHA1 | Date | |
|---|---|---|---|
|
|
df5f8db0d3 | ||
|
|
aea2927a02 |
3
.github/dependabot.yml
vendored
3
.github/dependabot.yml
vendored
@@ -157,6 +157,9 @@ updates:
|
||||
postgres:
|
||||
patterns:
|
||||
- "docker.io/library/postgres*"
|
||||
greenmail:
|
||||
patterns:
|
||||
- "docker.io/greenmail*"
|
||||
- package-ecosystem: "pre-commit" # See documentation for possible values
|
||||
directory: "/" # Location of package manifests
|
||||
schedule:
|
||||
|
||||
@@ -50,7 +50,7 @@ repos:
|
||||
- 'prettier-plugin-organize-imports@4.3.0'
|
||||
# Python hooks
|
||||
- repo: https://github.com/astral-sh/ruff-pre-commit
|
||||
rev: v0.15.5
|
||||
rev: v0.15.6
|
||||
hooks:
|
||||
- id: ruff-check
|
||||
- id: ruff-format
|
||||
|
||||
@@ -18,13 +18,13 @@ services:
|
||||
- "--log-level=warn"
|
||||
- "--log-format=text"
|
||||
tika:
|
||||
image: docker.io/apache/tika:latest
|
||||
image: docker.io/apache/tika:3.2.3.0
|
||||
hostname: tika
|
||||
container_name: tika
|
||||
network_mode: host
|
||||
restart: unless-stopped
|
||||
greenmail:
|
||||
image: greenmail/standalone:2.1.8
|
||||
image: docker.io/greenmail/standalone:2.1.8
|
||||
hostname: greenmail
|
||||
container_name: greenmail
|
||||
environment:
|
||||
|
||||
@@ -14,7 +14,6 @@
|
||||
# Paths and folders
|
||||
|
||||
#PAPERLESS_CONSUMPTION_DIR=../consume
|
||||
#PAPERLESS_CONSUMPTION_FAILED_DIR=../consume/failed
|
||||
#PAPERLESS_DATA_DIR=../data
|
||||
#PAPERLESS_EMPTY_TRASH_DIR=
|
||||
#PAPERLESS_MEDIA_ROOT=../media
|
||||
|
||||
@@ -112,9 +112,6 @@
|
||||
</td>
|
||||
<td scope="row">
|
||||
<div class="btn-group" role="group">
|
||||
@if (task.status === PaperlessTaskStatus.Failed) {
|
||||
<ng-container *ngTemplateOutlet="retryDropdown; context: { task: task }"></ng-container>
|
||||
}
|
||||
<button class="btn btn-sm btn-outline-secondary" (click)="dismissTask(task); $event.stopPropagation();" *pngxIfPermissions="{ action: PermissionAction.Change, type: PermissionType.PaperlessTask }">
|
||||
<i-bs name="check" class="me-1"></i-bs><ng-container i18n>Dismiss</ng-container>
|
||||
</button>
|
||||
@@ -187,25 +184,3 @@
|
||||
</li>
|
||||
</ul>
|
||||
<div [ngbNavOutlet]="nav"></div>
|
||||
|
||||
<ng-template #retryDropdown let-task="task">
|
||||
<div ngbDropdown>
|
||||
<button class="btn btn-sm btn-outline-primary" (click)="$event.stopImmediatePropagation()" ngbDropdownToggle>
|
||||
<i-bs name="arrow-repeat"></i-bs> <ng-container i18n>Retry</ng-container>
|
||||
</button>
|
||||
<div ngbDropdownMenu class="shadow retry-dropdown">
|
||||
<div class="p-2">
|
||||
<ul class="list-group list-group-flush">
|
||||
<li class="list-group-item small" i18n>
|
||||
<pngx-input-check [(ngModel)]="retryClean" i18n-title title="Attempt to clean pdf"></pngx-input-check>
|
||||
</li>
|
||||
</ul>
|
||||
<div class="d-flex justify-content-end">
|
||||
<button class="btn btn-sm btn-outline-primary" (click)="retryTask(task); $event.stopPropagation();">
|
||||
<ng-container i18n>Proceed</ng-container>
|
||||
</button>
|
||||
</div>
|
||||
</div>
|
||||
</div>
|
||||
</div>
|
||||
</ng-template>
|
||||
|
||||
@@ -37,7 +37,3 @@ pre {
|
||||
.z-10 {
|
||||
z-index: 10;
|
||||
}
|
||||
|
||||
.retry-dropdown {
|
||||
width: 300px;
|
||||
}
|
||||
|
||||
@@ -16,7 +16,7 @@ import {
|
||||
NgbNavItem,
|
||||
} from '@ng-bootstrap/ng-bootstrap'
|
||||
import { allIcons, NgxBootstrapIconsModule } from 'ngx-bootstrap-icons'
|
||||
import { of, throwError } from 'rxjs'
|
||||
import { throwError } from 'rxjs'
|
||||
import { routes } from 'src/app/app-routing.module'
|
||||
import {
|
||||
PaperlessTask,
|
||||
@@ -32,7 +32,6 @@ import { TasksService } from 'src/app/services/tasks.service'
|
||||
import { ToastService } from 'src/app/services/toast.service'
|
||||
import { environment } from 'src/environments/environment'
|
||||
import { ConfirmDialogComponent } from '../../common/confirm-dialog/confirm-dialog.component'
|
||||
import { CheckComponent } from '../../common/input/check/check.component'
|
||||
import { PageHeaderComponent } from '../../common/page-header/page-header.component'
|
||||
import { TasksComponent, TaskTab } from './tasks.component'
|
||||
|
||||
@@ -139,7 +138,6 @@ describe('TasksComponent', () => {
|
||||
PageHeaderComponent,
|
||||
IfPermissionsDirective,
|
||||
CustomDatePipe,
|
||||
CheckComponent,
|
||||
ConfirmDialogComponent,
|
||||
],
|
||||
providers: [
|
||||
@@ -186,10 +184,8 @@ describe('TasksComponent', () => {
|
||||
`Failed${currentTasksLength}`
|
||||
)
|
||||
expect(
|
||||
fixture.debugElement.queryAll(
|
||||
By.css('table td > .form-check input[type="checkbox"]')
|
||||
)
|
||||
).toHaveLength(currentTasksLength)
|
||||
fixture.debugElement.queryAll(By.css('table input[type="checkbox"]'))
|
||||
).toHaveLength(currentTasksLength + 1)
|
||||
|
||||
currentTasksLength = tasks.filter(
|
||||
(t) => t.status === PaperlessTaskStatus.Complete
|
||||
@@ -393,20 +389,4 @@ describe('TasksComponent', () => {
|
||||
expect(component.filterText).toEqual('')
|
||||
expect(component.filterTargetID).toEqual(0)
|
||||
})
|
||||
|
||||
it('should retry a task, show toast on error or success', () => {
|
||||
const retrySpy = jest.spyOn(tasksService, 'retryTask')
|
||||
const toastInfoSpy = jest.spyOn(toastService, 'showInfo')
|
||||
const toastErrorSpy = jest.spyOn(toastService, 'showError')
|
||||
retrySpy.mockReturnValueOnce(of({ task_id: '123' }))
|
||||
component.retryTask(tasks[0])
|
||||
expect(retrySpy).toHaveBeenCalledWith(tasks[0], false)
|
||||
expect(toastInfoSpy).toHaveBeenCalledWith('Retrying task...')
|
||||
retrySpy.mockReturnValueOnce(throwError(() => new Error('test')))
|
||||
component.retryTask(tasks[0])
|
||||
expect(toastErrorSpy).toHaveBeenCalledWith(
|
||||
'Failed to retry task',
|
||||
new Error('test')
|
||||
)
|
||||
})
|
||||
})
|
||||
|
||||
@@ -20,13 +20,12 @@ import {
|
||||
takeUntil,
|
||||
timer,
|
||||
} from 'rxjs'
|
||||
import { PaperlessTask, PaperlessTaskStatus } from 'src/app/data/paperless-task'
|
||||
import { PaperlessTask } from 'src/app/data/paperless-task'
|
||||
import { IfPermissionsDirective } from 'src/app/directives/if-permissions.directive'
|
||||
import { CustomDatePipe } from 'src/app/pipes/custom-date.pipe'
|
||||
import { TasksService } from 'src/app/services/tasks.service'
|
||||
import { ToastService } from 'src/app/services/toast.service'
|
||||
import { ConfirmDialogComponent } from '../../common/confirm-dialog/confirm-dialog.component'
|
||||
import { CheckComponent } from '../../common/input/check/check.component'
|
||||
import { PageHeaderComponent } from '../../common/page-header/page-header.component'
|
||||
import { LoadingComponentWithPermissions } from '../../loading-component/loading.component'
|
||||
|
||||
@@ -55,7 +54,6 @@ const FILTER_TARGETS = [
|
||||
PageHeaderComponent,
|
||||
IfPermissionsDirective,
|
||||
CustomDatePipe,
|
||||
CheckComponent,
|
||||
SlicePipe,
|
||||
FormsModule,
|
||||
ReactiveFormsModule,
|
||||
@@ -77,7 +75,6 @@ export class TasksComponent
|
||||
private readonly router = inject(Router)
|
||||
private readonly toastService = inject(ToastService)
|
||||
|
||||
public PaperlessTaskStatus = PaperlessTaskStatus
|
||||
public activeTab: TaskTab
|
||||
public selectedTasks: Set<number> = new Set()
|
||||
public togggleAll: boolean = false
|
||||
@@ -108,8 +105,6 @@ export class TasksComponent
|
||||
: FILTER_TARGETS.slice(0, 1)
|
||||
}
|
||||
|
||||
public retryClean: boolean = false
|
||||
|
||||
get dismissButtonText(): string {
|
||||
return this.selectedTasks.size > 0
|
||||
? $localize`Dismiss selected`
|
||||
@@ -183,17 +178,6 @@ export class TasksComponent
|
||||
this.router.navigate(['documents', task.related_document])
|
||||
}
|
||||
|
||||
retryTask(task: PaperlessTask) {
|
||||
this.tasksService.retryTask(task, this.retryClean).subscribe({
|
||||
next: () => {
|
||||
this.toastService.showInfo($localize`Retrying task...`)
|
||||
},
|
||||
error: (e) => {
|
||||
this.toastService.showError($localize`Failed to retry task`, e)
|
||||
},
|
||||
})
|
||||
}
|
||||
|
||||
expandTask(task: PaperlessTask) {
|
||||
this.expandedTask = this.expandedTask == task.id ? undefined : task.id
|
||||
}
|
||||
|
||||
@@ -147,33 +147,4 @@ describe('TasksService', () => {
|
||||
result: 'success',
|
||||
})
|
||||
})
|
||||
|
||||
it('should call retry task api endpoint', () => {
|
||||
const task = {
|
||||
id: 1,
|
||||
type: PaperlessTaskType.File,
|
||||
status: PaperlessTaskStatus.Failed,
|
||||
acknowledged: false,
|
||||
task_id: '1234',
|
||||
task_file_name: 'file1.pdf',
|
||||
date_created: new Date(),
|
||||
}
|
||||
|
||||
tasksService.retryTask(task, true).subscribe()
|
||||
const reloadSpy = jest.spyOn(tasksService, 'reload')
|
||||
const req = httpTestingController.expectOne(
|
||||
`${environment.apiBaseUrl}tasks/${task.id}/retry/`
|
||||
)
|
||||
expect(req.request.method).toEqual('POST')
|
||||
expect(req.request.body).toEqual({
|
||||
clean: true,
|
||||
})
|
||||
req.flush({ task_id: 12345 })
|
||||
expect(reloadSpy).toHaveBeenCalled()
|
||||
httpTestingController
|
||||
.expectOne(
|
||||
`${environment.apiBaseUrl}tasks/?task_name=consume_file&acknowledged=false`
|
||||
)
|
||||
.flush([])
|
||||
})
|
||||
})
|
||||
|
||||
@@ -81,20 +81,6 @@ export class TasksService {
|
||||
)
|
||||
}
|
||||
|
||||
public retryTask(task: PaperlessTask, clean: boolean): Observable<any> {
|
||||
return this.http
|
||||
.post(`${this.baseUrl}tasks/${task.id}/retry/`, {
|
||||
clean,
|
||||
})
|
||||
.pipe(
|
||||
takeUntil(this.unsubscribeNotifer),
|
||||
first(),
|
||||
tap(() => {
|
||||
this.reload()
|
||||
})
|
||||
)
|
||||
}
|
||||
|
||||
public cancelPending(): void {
|
||||
this.unsubscribeNotifer.next(true)
|
||||
}
|
||||
|
||||
@@ -52,6 +52,7 @@ from documents.utils import copy_basic_file_stats
|
||||
from documents.utils import copy_file_with_basic_stats
|
||||
from documents.utils import run_subprocess
|
||||
from paperless.parsers.text import TextDocumentParser
|
||||
from paperless.parsers.tika import TikaDocumentParser
|
||||
from paperless_mail.parsers import MailDocumentParser
|
||||
|
||||
LOGGING_NAME: Final[str] = "paperless.consumer"
|
||||
@@ -67,7 +68,7 @@ def _parser_cleanup(parser: DocumentParser) -> None:
|
||||
|
||||
TODO(stumpylog): Remove me in the future
|
||||
"""
|
||||
if isinstance(parser, TextDocumentParser):
|
||||
if isinstance(parser, (TextDocumentParser, TikaDocumentParser)):
|
||||
parser.__exit__(None, None, None)
|
||||
else:
|
||||
parser.cleanup()
|
||||
@@ -174,17 +175,6 @@ class ConsumerPluginMixin:
|
||||
):
|
||||
self._send_progress(100, 100, ProgressStatusOptions.FAILED, message)
|
||||
self.log.error(log_message or message, exc_info=exc_info)
|
||||
# Move the file to the failed directory
|
||||
if (
|
||||
self.input_doc.original_file.exists()
|
||||
and not Path(
|
||||
settings.CONSUMPTION_FAILED_DIR / self.input_doc.original_file.name,
|
||||
).exists()
|
||||
):
|
||||
copy_file_with_basic_stats(
|
||||
self.input_doc.original_file,
|
||||
settings.CONSUMPTION_FAILED_DIR / self.input_doc.original_file.name,
|
||||
)
|
||||
raise ConsumerError(f"{self.filename}: {log_message or message}") from exception
|
||||
|
||||
|
||||
@@ -459,6 +449,12 @@ class ConsumerPlugin(
|
||||
progress_callback=progress_callback,
|
||||
)
|
||||
|
||||
# New-style parsers use __enter__/__exit__ for resource management.
|
||||
# _parser_cleanup (below) handles __exit__; call __enter__ here.
|
||||
# TODO(stumpylog): Remove me in the future
|
||||
if isinstance(document_parser, (TextDocumentParser, TikaDocumentParser)):
|
||||
document_parser.__enter__()
|
||||
|
||||
self.log.debug(f"Parser: {type(document_parser).__name__}")
|
||||
|
||||
# Parse the document. This may take some time.
|
||||
@@ -487,7 +483,7 @@ class ConsumerPlugin(
|
||||
self.filename,
|
||||
self.input_doc.mailrule_id,
|
||||
)
|
||||
elif isinstance(document_parser, TextDocumentParser):
|
||||
elif isinstance(document_parser, (TextDocumentParser, TikaDocumentParser)):
|
||||
# TODO(stumpylog): Remove me in the future
|
||||
document_parser.parse(self.working_copy, mime_type)
|
||||
else:
|
||||
@@ -500,7 +496,7 @@ class ConsumerPlugin(
|
||||
ProgressStatusOptions.WORKING,
|
||||
ConsumerStatusShortMessage.GENERATING_THUMBNAIL,
|
||||
)
|
||||
if isinstance(document_parser, TextDocumentParser):
|
||||
if isinstance(document_parser, (TextDocumentParser, TikaDocumentParser)):
|
||||
# TODO(stumpylog): Remove me in the future
|
||||
thumbnail = document_parser.get_thumbnail(self.working_copy, mime_type)
|
||||
else:
|
||||
|
||||
@@ -2411,14 +2411,6 @@ class TasksViewSerializer(OwnedObjectSerializer):
|
||||
return list(duplicates.values("id", "title", "deleted_at"))
|
||||
|
||||
|
||||
class RetryTaskSerializer(serializers.Serializer):
|
||||
clean = serializers.BooleanField(
|
||||
default=False,
|
||||
write_only=True,
|
||||
required=False,
|
||||
)
|
||||
|
||||
|
||||
class RunTaskViewSerializer(serializers.Serializer[dict[str, Any]]):
|
||||
task_name = serializers.ChoiceField(
|
||||
choices=PaperlessTask.TaskName.choices,
|
||||
|
||||
@@ -631,19 +631,6 @@ def update_filename_and_move_files(
|
||||
)
|
||||
|
||||
|
||||
@receiver(models.signals.post_save, sender=PaperlessTask)
|
||||
def cleanup_failed_documents(sender, instance: PaperlessTask, **kwargs):
|
||||
if instance.status != states.FAILURE or not instance.acknowledged:
|
||||
return
|
||||
|
||||
if instance.task_file_name:
|
||||
try:
|
||||
Path(settings.CONSUMPTION_FAILED_DIR / instance.task_file_name).unlink()
|
||||
logger.debug(f"Cleaned up failed file {instance.task_file_name}")
|
||||
except FileNotFoundError:
|
||||
logger.warning(f"Failed to clean up failed file {instance.task_file_name}")
|
||||
|
||||
|
||||
@shared_task
|
||||
def process_cf_select_update(custom_field: CustomField) -> None:
|
||||
"""
|
||||
|
||||
@@ -37,7 +37,6 @@ from documents.consumer import ConsumerPreflightPlugin
|
||||
from documents.consumer import WorkflowTriggerPlugin
|
||||
from documents.data_models import ConsumableDocument
|
||||
from documents.data_models import DocumentMetadataOverrides
|
||||
from documents.data_models import DocumentSource
|
||||
from documents.double_sided import CollatePlugin
|
||||
from documents.file_handling import create_source_path_directory
|
||||
from documents.file_handling import generate_unique_filename
|
||||
@@ -64,8 +63,6 @@ from documents.signals import document_updated
|
||||
from documents.signals.handlers import cleanup_document_deletion
|
||||
from documents.signals.handlers import run_workflows
|
||||
from documents.signals.handlers import send_websocket_document_updated
|
||||
from documents.utils import copy_file_with_basic_stats
|
||||
from documents.utils import run_subprocess
|
||||
from documents.workflows.utils import get_workflows_for_trigger
|
||||
from paperless.config import AIConfig
|
||||
from paperless_ai.indexing import llm_index_add_or_update_document
|
||||
@@ -75,6 +72,7 @@ from paperless_ai.indexing import update_llm_index
|
||||
_T = TypeVar("_T")
|
||||
IterWrapper = Callable[[Iterable[_T]], Iterable[_T]]
|
||||
|
||||
|
||||
if settings.AUDIT_LOG_ENABLED:
|
||||
from auditlog.models import LogEntry
|
||||
logger = logging.getLogger("paperless.tasks")
|
||||
@@ -241,48 +239,6 @@ def consume_file(
|
||||
return msg
|
||||
|
||||
|
||||
@shared_task
|
||||
def retry_failed_file(task_id: str, clean: bool = False, skip_ocr: bool = False):
|
||||
task = PaperlessTask.objects.get(task_id=task_id, status=states.FAILURE)
|
||||
if task:
|
||||
failed_file = settings.CONSUMPTION_FAILED_DIR / task.task_file_name
|
||||
if not failed_file.exists():
|
||||
logger.error(f"File {failed_file} not found")
|
||||
raise FileNotFoundError(f"File {failed_file} not found")
|
||||
working_copy = settings.SCRATCH_DIR / failed_file.name
|
||||
copy_file_with_basic_stats(failed_file, working_copy)
|
||||
|
||||
if clean:
|
||||
try:
|
||||
result = run_subprocess(
|
||||
[
|
||||
"qpdf",
|
||||
"--replace-input",
|
||||
"--warning-exit-0",
|
||||
working_copy,
|
||||
],
|
||||
logger=logger,
|
||||
)
|
||||
if result.returncode != 0:
|
||||
raise Exception(
|
||||
f"qpdf failed with exit code {result.returncode}, error: {result.stderr}",
|
||||
)
|
||||
else:
|
||||
logger.debug("PDF cleaned successfully")
|
||||
except Exception as e:
|
||||
logger.error(f"Error while cleaning PDF: {e}")
|
||||
raise e
|
||||
|
||||
task = consume_file.delay(
|
||||
ConsumableDocument(
|
||||
source=DocumentSource.ConsumeFolder,
|
||||
original_file=working_copy,
|
||||
),
|
||||
)
|
||||
|
||||
return task.id
|
||||
|
||||
|
||||
@shared_task
|
||||
def sanity_check(*, scheduled=True, raise_on_error=True):
|
||||
messages = sanity_checker.check_sanity(scheduled=scheduled)
|
||||
|
||||
Binary file not shown.
@@ -10,8 +10,8 @@ from documents.parsers import get_parser_class_for_mime_type
|
||||
from documents.parsers import get_supported_file_extensions
|
||||
from documents.parsers import is_file_ext_supported
|
||||
from paperless.parsers.text import TextDocumentParser
|
||||
from paperless.parsers.tika import TikaDocumentParser
|
||||
from paperless_tesseract.parsers import RasterisedDocumentParser
|
||||
from paperless_tika.parsers import TikaDocumentParser
|
||||
|
||||
|
||||
class TestParserDiscovery(TestCase):
|
||||
|
||||
@@ -1,5 +1,4 @@
|
||||
import shutil
|
||||
import uuid
|
||||
from datetime import timedelta
|
||||
from pathlib import Path
|
||||
from unittest import mock
|
||||
@@ -22,7 +21,6 @@ from documents.sanity_checker import SanityCheckMessages
|
||||
from documents.tests.test_classifier import dummy_preprocess
|
||||
from documents.tests.utils import DirectoriesMixin
|
||||
from documents.tests.utils import FileSystemAssertsMixin
|
||||
from documents.tests.utils import SampleDirMixin
|
||||
|
||||
|
||||
class TestIndexReindex(DirectoriesMixin, TestCase):
|
||||
@@ -234,44 +232,6 @@ class TestEmptyTrashTask(DirectoriesMixin, FileSystemAssertsMixin, TestCase):
|
||||
self.assertEqual(Document.global_objects.count(), 0)
|
||||
|
||||
|
||||
class TestRetryConsumeTask(
|
||||
DirectoriesMixin,
|
||||
SampleDirMixin,
|
||||
FileSystemAssertsMixin,
|
||||
TestCase,
|
||||
):
|
||||
def do_failed_task(self, test_file: Path) -> PaperlessTask:
|
||||
failed_file = settings.CONSUMPTION_FAILED_DIR / test_file.name
|
||||
shutil.copy(test_file, failed_file)
|
||||
|
||||
task = PaperlessTask.objects.create(
|
||||
type=PaperlessTask.TaskType.AUTO,
|
||||
task_id=str(uuid.uuid4()),
|
||||
task_file_name=failed_file.name,
|
||||
task_name=PaperlessTask.TaskName.CONSUME_FILE,
|
||||
status=states.FAILURE,
|
||||
date_created=timezone.now(),
|
||||
date_done=timezone.now(),
|
||||
)
|
||||
self.assertIsFile(settings.CONSUMPTION_FAILED_DIR / task.task_file_name)
|
||||
return task
|
||||
|
||||
@mock.patch("documents.tasks.consume_file.delay")
|
||||
@mock.patch("documents.tasks.run_subprocess")
|
||||
def test_retry_consume_clean(self, m_subprocess, m_consume_file) -> None:
|
||||
task = self.do_failed_task(self.SAMPLE_DIR / "corrupted.pdf")
|
||||
m_subprocess.return_value.returncode = 0
|
||||
task_id = tasks.retry_failed_file(task_id=task.task_id, clean=True)
|
||||
self.assertIsNotNone(task_id)
|
||||
m_consume_file.assert_called_once()
|
||||
|
||||
def test_cleanup(self) -> None:
|
||||
task = self.do_failed_task(self.SAMPLE_DIR / "corrupted.pdf")
|
||||
task.acknowledged = True
|
||||
task.save()
|
||||
self.assertIsNotFile(settings.CONSUMPTION_FAILED_DIR / task.task_file_name)
|
||||
|
||||
|
||||
class TestUpdateContent(DirectoriesMixin, TestCase):
|
||||
def test_update_content_maybe_archive_file(self) -> None:
|
||||
"""
|
||||
|
||||
@@ -37,7 +37,6 @@ def setup_directories():
|
||||
dirs.scratch_dir = Path(tempfile.mkdtemp()).resolve()
|
||||
dirs.media_dir = Path(tempfile.mkdtemp()).resolve()
|
||||
dirs.consumption_dir = Path(tempfile.mkdtemp()).resolve()
|
||||
dirs.consumption_failed_dir = Path(tempfile.mkdtemp("failed")).resolve()
|
||||
dirs.static_dir = Path(tempfile.mkdtemp()).resolve()
|
||||
dirs.index_dir = dirs.data_dir / "index"
|
||||
dirs.originals_dir = dirs.media_dir / "documents" / "originals"
|
||||
@@ -59,7 +58,6 @@ def setup_directories():
|
||||
THUMBNAIL_DIR=dirs.thumbnail_dir,
|
||||
ARCHIVE_DIR=dirs.archive_dir,
|
||||
CONSUMPTION_DIR=dirs.consumption_dir,
|
||||
CONSUMPTION_FAILED_DIR=dirs.consumption_failed_dir,
|
||||
LOGGING_DIR=dirs.logging_dir,
|
||||
INDEX_DIR=dirs.index_dir,
|
||||
STATIC_ROOT=dirs.static_dir,
|
||||
@@ -76,7 +74,6 @@ def remove_dirs(dirs) -> None:
|
||||
shutil.rmtree(dirs.data_dir, ignore_errors=True)
|
||||
shutil.rmtree(dirs.scratch_dir, ignore_errors=True)
|
||||
shutil.rmtree(dirs.consumption_dir, ignore_errors=True)
|
||||
shutil.rmtree(dirs.consumption_failed_dir, ignore_errors=True)
|
||||
shutil.rmtree(dirs.static_dir, ignore_errors=True)
|
||||
dirs.settings_override.disable()
|
||||
|
||||
|
||||
@@ -7,6 +7,7 @@ import tempfile
|
||||
import zipfile
|
||||
from collections import defaultdict
|
||||
from collections import deque
|
||||
from contextlib import nullcontext
|
||||
from datetime import datetime
|
||||
from pathlib import Path
|
||||
from time import mktime
|
||||
@@ -189,7 +190,6 @@ from documents.serialisers import NotesSerializer
|
||||
from documents.serialisers import PostDocumentSerializer
|
||||
from documents.serialisers import RemovePasswordDocumentsSerializer
|
||||
from documents.serialisers import ReprocessDocumentsSerializer
|
||||
from documents.serialisers import RetryTaskSerializer
|
||||
from documents.serialisers import RotateDocumentsSerializer
|
||||
from documents.serialisers import RunTaskViewSerializer
|
||||
from documents.serialisers import SavedViewSerializer
|
||||
@@ -212,7 +212,6 @@ from documents.tasks import consume_file
|
||||
from documents.tasks import empty_trash
|
||||
from documents.tasks import index_optimize
|
||||
from documents.tasks import llmindex_index
|
||||
from documents.tasks import retry_failed_file
|
||||
from documents.tasks import sanity_check
|
||||
from documents.tasks import train_classifier
|
||||
from documents.tasks import update_document_parent_tags
|
||||
@@ -227,6 +226,7 @@ from paperless.celery import app as celery_app
|
||||
from paperless.config import AIConfig
|
||||
from paperless.config import GeneralConfig
|
||||
from paperless.models import ApplicationConfiguration
|
||||
from paperless.parsers import ParserProtocol
|
||||
from paperless.serialisers import GroupSerializer
|
||||
from paperless.serialisers import UserSerializer
|
||||
from paperless.views import StandardPagination
|
||||
@@ -1086,9 +1086,11 @@ class DocumentViewSet(
|
||||
parser_class = get_parser_class_for_mime_type(mime_type)
|
||||
if parser_class:
|
||||
parser = parser_class(progress_callback=None, logging_group=None)
|
||||
cm = parser if isinstance(parser, ParserProtocol) else nullcontext(parser)
|
||||
|
||||
try:
|
||||
return parser.extract_metadata(file, mime_type)
|
||||
with cm:
|
||||
return parser.extract_metadata(file, mime_type)
|
||||
except Exception: # pragma: no cover
|
||||
logger.exception(f"Issue getting metadata for {file}")
|
||||
# TODO: cover GPG errors, remove later.
|
||||
@@ -3469,25 +3471,6 @@ class TasksViewSet(ReadOnlyModelViewSet):
|
||||
queryset = PaperlessTask.objects.filter(task_id=task_id)
|
||||
return queryset
|
||||
|
||||
@action(methods=["post"], detail=True)
|
||||
def retry(self, request, pk=None):
|
||||
task = self.get_object()
|
||||
|
||||
serializer = RetryTaskSerializer(data=request.data)
|
||||
serializer.is_valid(raise_exception=True)
|
||||
clean = serializer.validated_data.get("clean")
|
||||
|
||||
try:
|
||||
new_task_id = retry_failed_file(task.task_id, clean)
|
||||
return Response({"task_id": new_task_id})
|
||||
except FileNotFoundError:
|
||||
return HttpResponseBadRequest("Original file not found")
|
||||
except Exception as e:
|
||||
logger.warning(f"An error occurred retrying task: {e!s}")
|
||||
return HttpResponseBadRequest(
|
||||
"Error retrying task, check logs for more detail.",
|
||||
)
|
||||
|
||||
@action(
|
||||
methods=["post"],
|
||||
detail=False,
|
||||
|
||||
@@ -68,10 +68,6 @@ def paths_check(app_configs, **kwargs) -> list[Error]:
|
||||
+ path_check("PAPERLESS_EMPTY_TRASH_DIR", settings.EMPTY_TRASH_DIR)
|
||||
+ path_check("PAPERLESS_MEDIA_ROOT", settings.MEDIA_ROOT)
|
||||
+ path_check("PAPERLESS_CONSUMPTION_DIR", settings.CONSUMPTION_DIR)
|
||||
+ path_check(
|
||||
"PAPERLESS_CONSUMPTION_FAILED_DIR",
|
||||
settings.CONSUMPTION_FAILED_DIR,
|
||||
)
|
||||
)
|
||||
|
||||
|
||||
|
||||
@@ -194,8 +194,10 @@ class ParserRegistry:
|
||||
at runtime regardless of registration order.
|
||||
"""
|
||||
from paperless.parsers.text import TextDocumentParser
|
||||
from paperless.parsers.tika import TikaDocumentParser
|
||||
|
||||
self.register_builtin(TextDocumentParser)
|
||||
self.register_builtin(TikaDocumentParser)
|
||||
|
||||
# ------------------------------------------------------------------
|
||||
# Discovery
|
||||
|
||||
440
src/paperless/parsers/tika.py
Normal file
440
src/paperless/parsers/tika.py
Normal file
@@ -0,0 +1,440 @@
|
||||
"""
|
||||
Built-in Tika document parser.
|
||||
|
||||
Handles Office documents (DOCX, ODT, XLS, XLSX, PPT, PPTX, RTF, etc.) by
|
||||
sending them to an Apache Tika server for text extraction and a Gotenberg
|
||||
server for PDF conversion. Because the source formats cannot be rendered by
|
||||
a browser natively, the parser always produces a PDF rendition for display.
|
||||
"""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
import logging
|
||||
import shutil
|
||||
import tempfile
|
||||
from contextlib import ExitStack
|
||||
from pathlib import Path
|
||||
from typing import TYPE_CHECKING
|
||||
from typing import Self
|
||||
|
||||
import httpx
|
||||
from django.conf import settings
|
||||
from django.utils import timezone
|
||||
from gotenberg_client import GotenbergClient
|
||||
from gotenberg_client.options import PdfAFormat
|
||||
from tika_client import TikaClient
|
||||
|
||||
from documents.parsers import ParseError
|
||||
from documents.parsers import make_thumbnail_from_pdf
|
||||
from paperless.config import OutputTypeConfig
|
||||
from paperless.models import OutputTypeChoices
|
||||
from paperless.version import __full_version_str__
|
||||
|
||||
if TYPE_CHECKING:
|
||||
import datetime
|
||||
from types import TracebackType
|
||||
|
||||
from paperless.parsers import MetadataEntry
|
||||
|
||||
logger = logging.getLogger("paperless.parsing.tika")
|
||||
|
||||
_SUPPORTED_MIME_TYPES: dict[str, str] = {
|
||||
"application/msword": ".doc",
|
||||
"application/vnd.openxmlformats-officedocument.wordprocessingml.document": ".docx",
|
||||
"application/vnd.ms-excel": ".xls",
|
||||
"application/vnd.openxmlformats-officedocument.spreadsheetml.sheet": ".xlsx",
|
||||
"application/vnd.ms-powerpoint": ".ppt",
|
||||
"application/vnd.openxmlformats-officedocument.presentationml.presentation": ".pptx",
|
||||
"application/vnd.openxmlformats-officedocument.presentationml.slideshow": ".ppsx",
|
||||
"application/vnd.oasis.opendocument.presentation": ".odp",
|
||||
"application/vnd.oasis.opendocument.spreadsheet": ".ods",
|
||||
"application/vnd.oasis.opendocument.text": ".odt",
|
||||
"application/vnd.oasis.opendocument.graphics": ".odg",
|
||||
"text/rtf": ".rtf",
|
||||
}
|
||||
|
||||
|
||||
class TikaDocumentParser:
|
||||
"""Parse Office documents via Apache Tika and Gotenberg for Paperless-ngx.
|
||||
|
||||
Text extraction is handled by the Tika server. PDF conversion for display
|
||||
is handled by Gotenberg (LibreOffice route). Because the source formats
|
||||
cannot be rendered by a browser natively, ``requires_pdf_rendition`` is
|
||||
True and the PDF is always produced regardless of the ``produce_archive``
|
||||
flag passed to ``parse``.
|
||||
|
||||
Both ``TikaClient`` and ``GotenbergClient`` are opened once in
|
||||
``__enter__`` via an ``ExitStack`` and shared across ``parse``,
|
||||
``extract_metadata``, and ``_convert_to_pdf`` calls, then closed via
|
||||
``ExitStack.close()`` in ``__exit__``. The parser must always be used
|
||||
as a context manager.
|
||||
|
||||
Class attributes
|
||||
----------------
|
||||
name : str
|
||||
Human-readable parser name.
|
||||
version : str
|
||||
Semantic version string, kept in sync with Paperless-ngx releases.
|
||||
author : str
|
||||
Maintainer name.
|
||||
url : str
|
||||
Issue tracker / source URL.
|
||||
"""
|
||||
|
||||
name: str = "Paperless-ngx Tika Parser"
|
||||
version: str = __full_version_str__
|
||||
author: str = "Paperless-ngx Contributors"
|
||||
url: str = "https://github.com/paperless-ngx/paperless-ngx"
|
||||
|
||||
# ------------------------------------------------------------------
|
||||
# Class methods
|
||||
# ------------------------------------------------------------------
|
||||
|
||||
@classmethod
|
||||
def supported_mime_types(cls) -> dict[str, str]:
|
||||
"""Return the MIME types this parser handles.
|
||||
|
||||
Returns
|
||||
-------
|
||||
dict[str, str]
|
||||
Mapping of MIME type to preferred file extension.
|
||||
"""
|
||||
return _SUPPORTED_MIME_TYPES
|
||||
|
||||
@classmethod
|
||||
def score(
|
||||
cls,
|
||||
mime_type: str,
|
||||
filename: str,
|
||||
path: Path | None = None,
|
||||
) -> int | None:
|
||||
"""Return the priority score for handling this file.
|
||||
|
||||
Returns ``None`` when Tika integration is disabled so the registry
|
||||
skips this parser entirely.
|
||||
|
||||
Parameters
|
||||
----------
|
||||
mime_type:
|
||||
Detected MIME type of the file.
|
||||
filename:
|
||||
Original filename including extension.
|
||||
path:
|
||||
Optional filesystem path. Not inspected by this parser.
|
||||
|
||||
Returns
|
||||
-------
|
||||
int | None
|
||||
10 if TIKA_ENABLED and the MIME type is supported, otherwise None.
|
||||
"""
|
||||
if not settings.TIKA_ENABLED:
|
||||
return None
|
||||
if mime_type in _SUPPORTED_MIME_TYPES:
|
||||
return 10
|
||||
return None
|
||||
|
||||
# ------------------------------------------------------------------
|
||||
# Properties
|
||||
# ------------------------------------------------------------------
|
||||
|
||||
@property
|
||||
def can_produce_archive(self) -> bool:
|
||||
"""Whether this parser can produce a searchable PDF archive copy.
|
||||
|
||||
Returns
|
||||
-------
|
||||
bool
|
||||
Always False — Tika produces a display PDF, not an OCR archive.
|
||||
"""
|
||||
return False
|
||||
|
||||
@property
|
||||
def requires_pdf_rendition(self) -> bool:
|
||||
"""Whether the parser must produce a PDF for the frontend to display.
|
||||
|
||||
Returns
|
||||
-------
|
||||
bool
|
||||
Always True — Office formats cannot be rendered natively in a
|
||||
browser, so a PDF conversion is always required for display.
|
||||
"""
|
||||
return True
|
||||
|
||||
# ------------------------------------------------------------------
|
||||
# Lifecycle
|
||||
# ------------------------------------------------------------------
|
||||
|
||||
def __init__(self, logging_group: object = None) -> None:
|
||||
settings.SCRATCH_DIR.mkdir(parents=True, exist_ok=True)
|
||||
self._tempdir = Path(
|
||||
tempfile.mkdtemp(prefix="paperless-", dir=settings.SCRATCH_DIR),
|
||||
)
|
||||
self._text: str | None = None
|
||||
self._date: datetime.datetime | None = None
|
||||
self._archive_path: Path | None = None
|
||||
self._exit_stack = ExitStack()
|
||||
self._tika_client: TikaClient | None = None
|
||||
self._gotenberg_client: GotenbergClient | None = None
|
||||
|
||||
def __enter__(self) -> Self:
|
||||
self._tika_client = self._exit_stack.enter_context(
|
||||
TikaClient(
|
||||
tika_url=settings.TIKA_ENDPOINT,
|
||||
timeout=settings.CELERY_TASK_TIME_LIMIT,
|
||||
),
|
||||
)
|
||||
self._gotenberg_client = self._exit_stack.enter_context(
|
||||
GotenbergClient(
|
||||
host=settings.TIKA_GOTENBERG_ENDPOINT,
|
||||
timeout=settings.CELERY_TASK_TIME_LIMIT,
|
||||
),
|
||||
)
|
||||
return self
|
||||
|
||||
def __exit__(
|
||||
self,
|
||||
exc_type: type[BaseException] | None,
|
||||
exc_val: BaseException | None,
|
||||
exc_tb: TracebackType | None,
|
||||
) -> None:
|
||||
self._exit_stack.close()
|
||||
logger.debug("Cleaning up temporary directory %s", self._tempdir)
|
||||
shutil.rmtree(self._tempdir, ignore_errors=True)
|
||||
|
||||
# ------------------------------------------------------------------
|
||||
# Core parsing interface
|
||||
# ------------------------------------------------------------------
|
||||
|
||||
def parse(
|
||||
self,
|
||||
document_path: Path,
|
||||
mime_type: str,
|
||||
*,
|
||||
produce_archive: bool = True,
|
||||
) -> None:
|
||||
"""Send the document to Tika for text extraction and Gotenberg for PDF.
|
||||
|
||||
Because ``requires_pdf_rendition`` is True the PDF conversion is
|
||||
always performed — the ``produce_archive`` flag is intentionally
|
||||
ignored.
|
||||
|
||||
Parameters
|
||||
----------
|
||||
document_path:
|
||||
Absolute path to the document file to parse.
|
||||
mime_type:
|
||||
Detected MIME type of the document.
|
||||
produce_archive:
|
||||
Accepted for protocol compatibility but ignored; the PDF rendition
|
||||
is always produced since the source format cannot be displayed
|
||||
natively in the browser.
|
||||
|
||||
Raises
|
||||
------
|
||||
documents.parsers.ParseError
|
||||
If Tika or Gotenberg returns an error.
|
||||
"""
|
||||
if TYPE_CHECKING:
|
||||
assert self._tika_client is not None
|
||||
|
||||
logger.info("Sending %s to Tika server", document_path)
|
||||
|
||||
try:
|
||||
try:
|
||||
parsed = self._tika_client.tika.as_text.from_file(
|
||||
document_path,
|
||||
mime_type,
|
||||
)
|
||||
except httpx.HTTPStatusError as err:
|
||||
# Workaround https://issues.apache.org/jira/browse/TIKA-4110
|
||||
# Tika fails with some files as multi-part form data
|
||||
if err.response.status_code == httpx.codes.INTERNAL_SERVER_ERROR:
|
||||
parsed = self._tika_client.tika.as_text.from_buffer(
|
||||
document_path.read_bytes(),
|
||||
mime_type,
|
||||
)
|
||||
else: # pragma: no cover
|
||||
raise
|
||||
except Exception as err:
|
||||
raise ParseError(
|
||||
f"Could not parse {document_path} with tika server at "
|
||||
f"{settings.TIKA_ENDPOINT}: {err}",
|
||||
) from err
|
||||
|
||||
self._text = parsed.content
|
||||
if self._text is not None:
|
||||
self._text = self._text.strip()
|
||||
|
||||
self._date = parsed.created
|
||||
if self._date is not None and timezone.is_naive(self._date):
|
||||
self._date = timezone.make_aware(self._date)
|
||||
|
||||
# Always convert — requires_pdf_rendition=True means the browser
|
||||
# cannot display the source format natively.
|
||||
self._archive_path = self._convert_to_pdf(document_path)
|
||||
|
||||
# ------------------------------------------------------------------
|
||||
# Result accessors
|
||||
# ------------------------------------------------------------------
|
||||
|
||||
def get_text(self) -> str | None:
|
||||
"""Return the plain-text content extracted during parse.
|
||||
|
||||
Returns
|
||||
-------
|
||||
str | None
|
||||
Extracted text, or None if parse has not been called yet.
|
||||
"""
|
||||
return self._text
|
||||
|
||||
def get_date(self) -> datetime.datetime | None:
|
||||
"""Return the document date detected during parse.
|
||||
|
||||
Returns
|
||||
-------
|
||||
datetime.datetime | None
|
||||
Creation date from Tika metadata, or None if not detected.
|
||||
"""
|
||||
return self._date
|
||||
|
||||
def get_archive_path(self) -> Path | None:
|
||||
"""Return the path to the generated PDF rendition, or None.
|
||||
|
||||
Returns
|
||||
-------
|
||||
Path | None
|
||||
Path to the PDF produced by Gotenberg, or None if parse has not
|
||||
been called yet.
|
||||
"""
|
||||
return self._archive_path
|
||||
|
||||
# ------------------------------------------------------------------
|
||||
# Thumbnail and metadata
|
||||
# ------------------------------------------------------------------
|
||||
|
||||
def get_thumbnail(self, document_path: Path, mime_type: str) -> Path:
|
||||
"""Generate a thumbnail from the PDF rendition of the document.
|
||||
|
||||
Converts the document to PDF first if not already done.
|
||||
|
||||
Parameters
|
||||
----------
|
||||
document_path:
|
||||
Absolute path to the source document.
|
||||
mime_type:
|
||||
Detected MIME type of the document.
|
||||
|
||||
Returns
|
||||
-------
|
||||
Path
|
||||
Path to the generated WebP thumbnail inside the temporary directory.
|
||||
"""
|
||||
if self._archive_path is None:
|
||||
self._archive_path = self._convert_to_pdf(document_path)
|
||||
return make_thumbnail_from_pdf(self._archive_path, self._tempdir)
|
||||
|
||||
def get_page_count(
|
||||
self,
|
||||
document_path: Path,
|
||||
mime_type: str,
|
||||
) -> int | None:
|
||||
"""Return the number of pages in the document.
|
||||
|
||||
Returns
|
||||
-------
|
||||
int | None
|
||||
Always None — page count is not available from Tika.
|
||||
"""
|
||||
return None
|
||||
|
||||
def extract_metadata(
|
||||
self,
|
||||
document_path: Path,
|
||||
mime_type: str,
|
||||
) -> list[MetadataEntry]:
|
||||
"""Extract format-specific metadata via the Tika metadata endpoint.
|
||||
|
||||
Returns
|
||||
-------
|
||||
list[MetadataEntry]
|
||||
All key/value pairs returned by Tika, or ``[]`` on error.
|
||||
"""
|
||||
if TYPE_CHECKING:
|
||||
assert self._tika_client is not None
|
||||
|
||||
try:
|
||||
parsed = self._tika_client.metadata.from_file(document_path, mime_type)
|
||||
return [
|
||||
{
|
||||
"namespace": "",
|
||||
"prefix": "",
|
||||
"key": key,
|
||||
"value": parsed.data[key],
|
||||
}
|
||||
for key in parsed.data
|
||||
]
|
||||
except Exception as e:
|
||||
logger.warning(
|
||||
"Error while fetching document metadata for %s: %s",
|
||||
document_path,
|
||||
e,
|
||||
)
|
||||
return []
|
||||
|
||||
# ------------------------------------------------------------------
|
||||
# Private helpers
|
||||
# ------------------------------------------------------------------
|
||||
|
||||
def _convert_to_pdf(self, document_path: Path) -> Path:
|
||||
"""Convert the document to PDF using Gotenberg's LibreOffice route.
|
||||
|
||||
Parameters
|
||||
----------
|
||||
document_path:
|
||||
Absolute path to the source document.
|
||||
|
||||
Returns
|
||||
-------
|
||||
Path
|
||||
Path to the generated PDF inside the temporary directory.
|
||||
|
||||
Raises
|
||||
------
|
||||
documents.parsers.ParseError
|
||||
If Gotenberg returns an error.
|
||||
"""
|
||||
if TYPE_CHECKING:
|
||||
assert self._gotenberg_client is not None
|
||||
|
||||
pdf_path = self._tempdir / "convert.pdf"
|
||||
|
||||
logger.info("Converting %s to PDF as %s", document_path, pdf_path)
|
||||
|
||||
with self._gotenberg_client.libre_office.to_pdf() as route:
|
||||
# Set the output format of the resulting PDF.
|
||||
# OutputTypeConfig reads the database-stored ApplicationConfiguration
|
||||
# first, then falls back to the PAPERLESS_OCR_OUTPUT_TYPE env var.
|
||||
output_type = OutputTypeConfig().output_type
|
||||
if output_type in {
|
||||
OutputTypeChoices.PDF_A,
|
||||
OutputTypeChoices.PDF_A2,
|
||||
}:
|
||||
route.pdf_format(PdfAFormat.A2b)
|
||||
elif output_type == OutputTypeChoices.PDF_A1:
|
||||
logger.warning(
|
||||
"Gotenberg does not support PDF/A-1a, choosing PDF/A-2b instead",
|
||||
)
|
||||
route.pdf_format(PdfAFormat.A2b)
|
||||
elif output_type == OutputTypeChoices.PDF_A3:
|
||||
route.pdf_format(PdfAFormat.A3b)
|
||||
|
||||
route.convert(document_path)
|
||||
|
||||
try:
|
||||
response = route.run()
|
||||
pdf_path.write_bytes(response.content)
|
||||
return pdf_path
|
||||
except Exception as err:
|
||||
raise ParseError(
|
||||
f"Error while converting document to PDF: {err}",
|
||||
) from err
|
||||
@@ -98,11 +98,6 @@ CONSUMPTION_DIR = get_path_from_env(
|
||||
BASE_DIR.parent / "consume",
|
||||
)
|
||||
|
||||
CONSUMPTION_FAILED_DIR = get_path_from_env(
|
||||
"PAPERLESS_CONSUMPTION_FAILED_DIR",
|
||||
CONSUMPTION_DIR / "failed",
|
||||
)
|
||||
|
||||
# This will be created if it doesn't exist
|
||||
SCRATCH_DIR = get_path_from_env(
|
||||
"PAPERLESS_SCRATCH_DIR",
|
||||
@@ -787,8 +782,6 @@ CONSUMER_IGNORE_PATTERNS = list(
|
||||
),
|
||||
),
|
||||
)
|
||||
if CONSUMPTION_DIR in CONSUMPTION_FAILED_DIR.parents:
|
||||
CONSUMER_IGNORE_PATTERNS.append(CONSUMPTION_FAILED_DIR.name)
|
||||
|
||||
# Directories to always ignore. These are matched by directory name, not full path
|
||||
CONSUMER_IGNORE_DIRS = list(
|
||||
|
||||
@@ -11,6 +11,7 @@ from typing import TYPE_CHECKING
|
||||
import pytest
|
||||
|
||||
from paperless.parsers.text import TextDocumentParser
|
||||
from paperless.parsers.tika import TikaDocumentParser
|
||||
|
||||
if TYPE_CHECKING:
|
||||
from collections.abc import Generator
|
||||
@@ -74,3 +75,86 @@ def text_parser() -> Generator[TextDocumentParser, None, None]:
|
||||
"""
|
||||
with TextDocumentParser() as parser:
|
||||
yield parser
|
||||
|
||||
|
||||
# ------------------------------------------------------------------
|
||||
# Tika parser sample files
|
||||
# ------------------------------------------------------------------
|
||||
|
||||
|
||||
@pytest.fixture(scope="session")
|
||||
def tika_samples_dir(samples_dir: Path) -> Path:
|
||||
"""Absolute path to the Tika parser sample files directory.
|
||||
|
||||
Returns
|
||||
-------
|
||||
Path
|
||||
``<samples_dir>/tika/``
|
||||
"""
|
||||
return samples_dir / "tika"
|
||||
|
||||
|
||||
@pytest.fixture(scope="session")
|
||||
def sample_odt_file(tika_samples_dir: Path) -> Path:
|
||||
"""Path to a sample ODT file.
|
||||
|
||||
Returns
|
||||
-------
|
||||
Path
|
||||
Absolute path to ``tika/sample.odt``.
|
||||
"""
|
||||
return tika_samples_dir / "sample.odt"
|
||||
|
||||
|
||||
@pytest.fixture(scope="session")
|
||||
def sample_docx_file(tika_samples_dir: Path) -> Path:
|
||||
"""Path to a sample DOCX file.
|
||||
|
||||
Returns
|
||||
-------
|
||||
Path
|
||||
Absolute path to ``tika/sample.docx``.
|
||||
"""
|
||||
return tika_samples_dir / "sample.docx"
|
||||
|
||||
|
||||
@pytest.fixture(scope="session")
|
||||
def sample_doc_file(tika_samples_dir: Path) -> Path:
|
||||
"""Path to a sample DOC file.
|
||||
|
||||
Returns
|
||||
-------
|
||||
Path
|
||||
Absolute path to ``tika/sample.doc``.
|
||||
"""
|
||||
return tika_samples_dir / "sample.doc"
|
||||
|
||||
|
||||
@pytest.fixture(scope="session")
|
||||
def sample_broken_odt(tika_samples_dir: Path) -> Path:
|
||||
"""Path to a broken ODT file that triggers the multi-part fallback.
|
||||
|
||||
Returns
|
||||
-------
|
||||
Path
|
||||
Absolute path to ``tika/multi-part-broken.odt``.
|
||||
"""
|
||||
return tika_samples_dir / "multi-part-broken.odt"
|
||||
|
||||
|
||||
# ------------------------------------------------------------------
|
||||
# Tika parser instance
|
||||
# ------------------------------------------------------------------
|
||||
|
||||
|
||||
@pytest.fixture()
|
||||
def tika_parser() -> Generator[TikaDocumentParser, None, None]:
|
||||
"""Yield a TikaDocumentParser and clean up its temporary directory afterwards.
|
||||
|
||||
Yields
|
||||
------
|
||||
TikaDocumentParser
|
||||
A ready-to-use parser instance.
|
||||
"""
|
||||
with TikaDocumentParser() as parser:
|
||||
yield parser
|
||||
|
||||
@@ -4,7 +4,7 @@ from pathlib import Path
|
||||
import pytest
|
||||
|
||||
from documents.tests.utils import util_call_with_backoff
|
||||
from paperless_tika.parsers import TikaDocumentParser
|
||||
from paperless.parsers.tika import TikaDocumentParser
|
||||
|
||||
|
||||
@pytest.mark.skipif(
|
||||
@@ -42,14 +42,15 @@ class TestTikaParserAgainstServer:
|
||||
)
|
||||
|
||||
assert (
|
||||
tika_parser.text
|
||||
tika_parser.get_text()
|
||||
== "This is an ODT test document, created September 14, 2022"
|
||||
)
|
||||
assert tika_parser.archive_path is not None
|
||||
assert b"PDF-" in tika_parser.archive_path.read_bytes()[:10]
|
||||
archive = tika_parser.get_archive_path()
|
||||
assert archive is not None
|
||||
assert b"PDF-" in archive.read_bytes()[:10]
|
||||
|
||||
# TODO: Unsure what can set the Creation-Date field in a document, enable when possible
|
||||
# self.assertEqual(tika_parser.date, datetime.datetime(2022, 9, 14))
|
||||
# self.assertEqual(tika_parser.get_date(), datetime.datetime(2022, 9, 14))
|
||||
|
||||
def test_basic_parse_docx(
|
||||
self,
|
||||
@@ -74,14 +75,15 @@ class TestTikaParserAgainstServer:
|
||||
)
|
||||
|
||||
assert (
|
||||
tika_parser.text
|
||||
tika_parser.get_text()
|
||||
== "This is an DOCX test document, also made September 14, 2022"
|
||||
)
|
||||
assert tika_parser.archive_path is not None
|
||||
with Path(tika_parser.archive_path).open("rb") as f:
|
||||
archive = tika_parser.get_archive_path()
|
||||
assert archive is not None
|
||||
with archive.open("rb") as f:
|
||||
assert b"PDF-" in f.read()[:10]
|
||||
|
||||
# self.assertEqual(tika_parser.date, datetime.datetime(2022, 9, 14))
|
||||
# self.assertEqual(tika_parser.get_date(), datetime.datetime(2022, 9, 14))
|
||||
|
||||
def test_basic_parse_doc(
|
||||
self,
|
||||
@@ -102,13 +104,12 @@ class TestTikaParserAgainstServer:
|
||||
[sample_doc_file, "application/msword"],
|
||||
)
|
||||
|
||||
assert tika_parser.text is not None
|
||||
assert (
|
||||
"This is a test document, saved in the older .doc format"
|
||||
in tika_parser.text
|
||||
)
|
||||
assert tika_parser.archive_path is not None
|
||||
with Path(tika_parser.archive_path).open("rb") as f:
|
||||
text = tika_parser.get_text()
|
||||
assert text is not None
|
||||
assert "This is a test document, saved in the older .doc format" in text
|
||||
archive = tika_parser.get_archive_path()
|
||||
assert archive is not None
|
||||
with archive.open("rb") as f:
|
||||
assert b"PDF-" in f.read()[:10]
|
||||
|
||||
def test_tika_fails_multi_part(
|
||||
@@ -133,6 +134,7 @@ class TestTikaParserAgainstServer:
|
||||
[sample_broken_odt, "application/vnd.oasis.opendocument.text"],
|
||||
)
|
||||
|
||||
assert tika_parser.archive_path is not None
|
||||
with Path(tika_parser.archive_path).open("rb") as f:
|
||||
archive = tika_parser.get_archive_path()
|
||||
assert archive is not None
|
||||
with archive.open("rb") as f:
|
||||
assert b"PDF-" in f.read()[:10]
|
||||
@@ -9,7 +9,56 @@ from pytest_django.fixtures import SettingsWrapper
|
||||
from pytest_httpx import HTTPXMock
|
||||
|
||||
from documents.parsers import ParseError
|
||||
from paperless_tika.parsers import TikaDocumentParser
|
||||
from paperless.parsers import ParserProtocol
|
||||
from paperless.parsers.tika import TikaDocumentParser
|
||||
|
||||
|
||||
class TestTikaParserRegistryInterface:
|
||||
"""Verify that TikaDocumentParser satisfies the ParserProtocol contract."""
|
||||
|
||||
def test_satisfies_parser_protocol(self) -> None:
|
||||
assert isinstance(TikaDocumentParser(), ParserProtocol)
|
||||
|
||||
def test_supported_mime_types_is_classmethod(self) -> None:
|
||||
mime_types = TikaDocumentParser.supported_mime_types()
|
||||
assert isinstance(mime_types, dict)
|
||||
assert len(mime_types) > 0
|
||||
|
||||
def test_score_returns_none_when_tika_disabled(
|
||||
self,
|
||||
settings: SettingsWrapper,
|
||||
) -> None:
|
||||
settings.TIKA_ENABLED = False
|
||||
result = TikaDocumentParser.score(
|
||||
"application/vnd.oasis.opendocument.text",
|
||||
"sample.odt",
|
||||
)
|
||||
assert result is None
|
||||
|
||||
def test_score_returns_int_when_tika_enabled(
|
||||
self,
|
||||
settings: SettingsWrapper,
|
||||
) -> None:
|
||||
settings.TIKA_ENABLED = True
|
||||
result = TikaDocumentParser.score(
|
||||
"application/vnd.oasis.opendocument.text",
|
||||
"sample.odt",
|
||||
)
|
||||
assert isinstance(result, int)
|
||||
|
||||
def test_score_returns_none_for_unsupported_mime(
|
||||
self,
|
||||
settings: SettingsWrapper,
|
||||
) -> None:
|
||||
settings.TIKA_ENABLED = True
|
||||
result = TikaDocumentParser.score("application/pdf", "doc.pdf")
|
||||
assert result is None
|
||||
|
||||
def test_can_produce_archive_is_false(self) -> None:
|
||||
assert TikaDocumentParser().can_produce_archive is False
|
||||
|
||||
def test_requires_pdf_rendition_is_true(self) -> None:
|
||||
assert TikaDocumentParser().requires_pdf_rendition is True
|
||||
|
||||
|
||||
@pytest.mark.django_db()
|
||||
@@ -36,12 +85,12 @@ class TestTikaParser:
|
||||
|
||||
tika_parser.parse(sample_odt_file, "application/vnd.oasis.opendocument.text")
|
||||
|
||||
assert tika_parser.text == "the content"
|
||||
assert tika_parser.archive_path is not None
|
||||
with Path(tika_parser.archive_path).open("rb") as f:
|
||||
assert tika_parser.get_text() == "the content"
|
||||
assert tika_parser.get_archive_path() is not None
|
||||
with Path(tika_parser.get_archive_path()).open("rb") as f:
|
||||
assert f.read() == b"PDF document"
|
||||
|
||||
assert tika_parser.date == datetime.datetime(
|
||||
assert tika_parser.get_date() == datetime.datetime(
|
||||
2020,
|
||||
11,
|
||||
21,
|
||||
@@ -89,7 +138,7 @@ class TestTikaParser:
|
||||
httpx_mock.add_response(status_code=HTTPStatus.INTERNAL_SERVER_ERROR)
|
||||
|
||||
with pytest.raises(ParseError):
|
||||
tika_parser.convert_to_pdf(sample_odt_file, None)
|
||||
tika_parser._convert_to_pdf(sample_odt_file)
|
||||
|
||||
@pytest.mark.parametrize(
|
||||
("setting_value", "expected_form_value"),
|
||||
@@ -106,7 +155,6 @@ class TestTikaParser:
|
||||
expected_form_value: str,
|
||||
httpx_mock: HTTPXMock,
|
||||
settings: SettingsWrapper,
|
||||
tika_parser: TikaDocumentParser,
|
||||
sample_odt_file: Path,
|
||||
) -> None:
|
||||
"""
|
||||
@@ -117,6 +165,8 @@ class TestTikaParser:
|
||||
THEN:
|
||||
- Request to Gotenberg contains the expected PDF/A format string
|
||||
"""
|
||||
# Parser must be created after the setting is changed so that
|
||||
# OutputTypeConfig reads the correct value at __init__ time.
|
||||
settings.OCR_OUTPUT_TYPE = setting_value
|
||||
httpx_mock.add_response(
|
||||
status_code=codes.OK,
|
||||
@@ -124,7 +174,8 @@ class TestTikaParser:
|
||||
method="POST",
|
||||
)
|
||||
|
||||
tika_parser.convert_to_pdf(sample_odt_file, None)
|
||||
with TikaDocumentParser() as parser:
|
||||
parser._convert_to_pdf(sample_odt_file)
|
||||
|
||||
request = httpx_mock.get_request()
|
||||
|
||||
@@ -24,7 +24,6 @@ class PaperlessTestDirs:
|
||||
data_dir: Path
|
||||
media_dir: Path
|
||||
consumption_dir: Path
|
||||
consumption_failed_dir: Path
|
||||
|
||||
|
||||
# TODO: consolidate with documents/tests/conftest.py PaperlessDirs/paperless_dirs
|
||||
@@ -34,21 +33,18 @@ def directories(tmp_path: Path, settings: SettingsWrapper) -> PaperlessTestDirs:
|
||||
data_dir = tmp_path / "data"
|
||||
media_dir = tmp_path / "media"
|
||||
consumption_dir = tmp_path / "consumption"
|
||||
consumption_failed_dir = tmp_path / "consumption_failed"
|
||||
|
||||
for d in (data_dir, media_dir, consumption_dir, consumption_failed_dir):
|
||||
for d in (data_dir, media_dir, consumption_dir):
|
||||
d.mkdir()
|
||||
|
||||
settings.DATA_DIR = data_dir
|
||||
settings.MEDIA_ROOT = media_dir
|
||||
settings.CONSUMPTION_DIR = consumption_dir
|
||||
settings.CONSUMPTION_FAILED_DIR = consumption_failed_dir
|
||||
|
||||
return PaperlessTestDirs(
|
||||
data_dir=data_dir,
|
||||
media_dir=media_dir,
|
||||
consumption_dir=consumption_dir,
|
||||
consumption_failed_dir=consumption_failed_dir,
|
||||
)
|
||||
|
||||
|
||||
@@ -68,11 +64,10 @@ class TestChecks:
|
||||
settings.MEDIA_ROOT = Path("uuh")
|
||||
settings.DATA_DIR = Path("whatever")
|
||||
settings.CONSUMPTION_DIR = Path("idontcare")
|
||||
settings.CONSUMPTION_FAILED_DIR = Path("nope")
|
||||
|
||||
msgs = paths_check(None)
|
||||
|
||||
assert len(msgs) == 4, str(msgs)
|
||||
assert len(msgs) == 3, str(msgs)
|
||||
for msg in msgs:
|
||||
assert msg.msg.endswith("is set but doesn't exist.")
|
||||
|
||||
@@ -80,7 +75,6 @@ class TestChecks:
|
||||
directories.data_dir.chmod(0o000)
|
||||
directories.media_dir.chmod(0o000)
|
||||
directories.consumption_dir.chmod(0o000)
|
||||
directories.consumption_failed_dir.chmod(0o000)
|
||||
|
||||
try:
|
||||
msgs = paths_check(None)
|
||||
@@ -88,9 +82,8 @@ class TestChecks:
|
||||
directories.data_dir.chmod(0o777)
|
||||
directories.media_dir.chmod(0o777)
|
||||
directories.consumption_dir.chmod(0o777)
|
||||
directories.consumption_failed_dir.chmod(0o777)
|
||||
|
||||
assert len(msgs) == 4
|
||||
assert len(msgs) == 3
|
||||
for msg in msgs:
|
||||
assert msg.msg.endswith("is not writeable")
|
||||
|
||||
|
||||
@@ -1,10 +1,12 @@
|
||||
def get_parser(*args, **kwargs):
|
||||
from paperless.parsers.text import TextDocumentParser
|
||||
|
||||
# The new TextDocumentParser does not accept the legacy logging_group /
|
||||
# progress_callback kwargs injected by the old signal-based consumer.
|
||||
# These are dropped here; Phase 4 will replace this signal path with the
|
||||
# new ParserRegistry so the shim can be removed at that point.
|
||||
# TextDocumentParser accepts logging_group for constructor compatibility but
|
||||
# does not store or use it (no legacy DocumentParser base class).
|
||||
# progress_callback is also not used. Both may arrive as a positional arg
|
||||
# (consumer) or a keyword arg (views); *args absorbs the positional form,
|
||||
# kwargs.pop handles the keyword form. Phase 4 will replace this signal
|
||||
# path with the new ParserRegistry so the shim can be removed at that point.
|
||||
kwargs.pop("logging_group", None)
|
||||
kwargs.pop("progress_callback", None)
|
||||
return TextDocumentParser()
|
||||
|
||||
@@ -1,136 +0,0 @@
|
||||
from pathlib import Path
|
||||
|
||||
import httpx
|
||||
from django.conf import settings
|
||||
from django.utils import timezone
|
||||
from gotenberg_client import GotenbergClient
|
||||
from gotenberg_client.options import PdfAFormat
|
||||
from tika_client import TikaClient
|
||||
|
||||
from documents.parsers import DocumentParser
|
||||
from documents.parsers import ParseError
|
||||
from documents.parsers import make_thumbnail_from_pdf
|
||||
from paperless.config import OutputTypeConfig
|
||||
from paperless.models import OutputTypeChoices
|
||||
|
||||
|
||||
class TikaDocumentParser(DocumentParser):
|
||||
"""
|
||||
This parser sends documents to a local tika server
|
||||
"""
|
||||
|
||||
logging_name = "paperless.parsing.tika"
|
||||
|
||||
def get_thumbnail(self, document_path, mime_type, file_name=None):
|
||||
if not self.archive_path:
|
||||
self.archive_path = self.convert_to_pdf(document_path, file_name)
|
||||
|
||||
return make_thumbnail_from_pdf(
|
||||
self.archive_path,
|
||||
self.tempdir,
|
||||
self.logging_group,
|
||||
)
|
||||
|
||||
def extract_metadata(self, document_path, mime_type):
|
||||
try:
|
||||
with TikaClient(
|
||||
tika_url=settings.TIKA_ENDPOINT,
|
||||
timeout=settings.CELERY_TASK_TIME_LIMIT,
|
||||
) as client:
|
||||
parsed = client.metadata.from_file(document_path, mime_type)
|
||||
return [
|
||||
{
|
||||
"namespace": "",
|
||||
"prefix": "",
|
||||
"key": key,
|
||||
"value": parsed.data[key],
|
||||
}
|
||||
for key in parsed.data
|
||||
]
|
||||
except Exception as e:
|
||||
self.log.warning(
|
||||
f"Error while fetching document metadata for {document_path}: {e}",
|
||||
)
|
||||
return []
|
||||
|
||||
def parse(self, document_path: Path, mime_type: str, file_name=None) -> None:
|
||||
self.log.info(f"Sending {document_path} to Tika server")
|
||||
|
||||
try:
|
||||
with TikaClient(
|
||||
tika_url=settings.TIKA_ENDPOINT,
|
||||
timeout=settings.CELERY_TASK_TIME_LIMIT,
|
||||
) as client:
|
||||
try:
|
||||
parsed = client.tika.as_text.from_file(document_path, mime_type)
|
||||
except httpx.HTTPStatusError as err:
|
||||
# Workaround https://issues.apache.org/jira/browse/TIKA-4110
|
||||
# Tika fails with some files as multi-part form data
|
||||
if err.response.status_code == httpx.codes.INTERNAL_SERVER_ERROR:
|
||||
parsed = client.tika.as_text.from_buffer(
|
||||
document_path.read_bytes(),
|
||||
mime_type,
|
||||
)
|
||||
else: # pragma: no cover
|
||||
raise
|
||||
except Exception as err:
|
||||
raise ParseError(
|
||||
f"Could not parse {document_path} with tika server at "
|
||||
f"{settings.TIKA_ENDPOINT}: {err}",
|
||||
) from err
|
||||
|
||||
self.text = parsed.content
|
||||
if self.text is not None:
|
||||
self.text = self.text.strip()
|
||||
|
||||
self.date = parsed.created
|
||||
if self.date is not None and timezone.is_naive(self.date):
|
||||
self.date = timezone.make_aware(self.date)
|
||||
|
||||
self.archive_path = self.convert_to_pdf(document_path, file_name)
|
||||
|
||||
def convert_to_pdf(self, document_path: Path, file_name):
|
||||
pdf_path = Path(self.tempdir) / "convert.pdf"
|
||||
|
||||
self.log.info(f"Converting {document_path} to PDF as {pdf_path}")
|
||||
|
||||
with (
|
||||
GotenbergClient(
|
||||
host=settings.TIKA_GOTENBERG_ENDPOINT,
|
||||
timeout=settings.CELERY_TASK_TIME_LIMIT,
|
||||
) as client,
|
||||
client.libre_office.to_pdf() as route,
|
||||
):
|
||||
# Set the output format of the resulting PDF
|
||||
if settings.OCR_OUTPUT_TYPE in {
|
||||
OutputTypeChoices.PDF_A,
|
||||
OutputTypeChoices.PDF_A2,
|
||||
}:
|
||||
route.pdf_format(PdfAFormat.A2b)
|
||||
elif settings.OCR_OUTPUT_TYPE == OutputTypeChoices.PDF_A1:
|
||||
self.log.warning(
|
||||
"Gotenberg does not support PDF/A-1a, choosing PDF/A-2b instead",
|
||||
)
|
||||
route.pdf_format(PdfAFormat.A2b)
|
||||
elif settings.OCR_OUTPUT_TYPE == OutputTypeChoices.PDF_A3:
|
||||
route.pdf_format(PdfAFormat.A3b)
|
||||
|
||||
route.convert(document_path)
|
||||
|
||||
try:
|
||||
response = route.run()
|
||||
|
||||
pdf_path.write_bytes(response.content)
|
||||
|
||||
return pdf_path
|
||||
|
||||
except Exception as err:
|
||||
raise ParseError(
|
||||
f"Error while converting document to PDF: {err}",
|
||||
) from err
|
||||
|
||||
def get_settings(self) -> OutputTypeConfig:
|
||||
"""
|
||||
This parser only uses the PDF output type configuration currently
|
||||
"""
|
||||
return OutputTypeConfig()
|
||||
@@ -1,7 +1,15 @@
|
||||
def get_parser(*args, **kwargs):
|
||||
from paperless_tika.parsers import TikaDocumentParser
|
||||
from paperless.parsers.tika import TikaDocumentParser
|
||||
|
||||
return TikaDocumentParser(*args, **kwargs)
|
||||
# TikaDocumentParser accepts logging_group for constructor compatibility but
|
||||
# does not store or use it (no legacy DocumentParser base class).
|
||||
# progress_callback is also not used. Both may arrive as a positional arg
|
||||
# (consumer) or a keyword arg (views); *args absorbs the positional form,
|
||||
# kwargs.pop handles the keyword form. Phase 4 will replace this signal
|
||||
# path with the new ParserRegistry so the shim can be removed at that point.
|
||||
kwargs.pop("logging_group", None)
|
||||
kwargs.pop("progress_callback", None)
|
||||
return TikaDocumentParser()
|
||||
|
||||
|
||||
def tika_consumer_declaration(sender, **kwargs):
|
||||
|
||||
@@ -1,41 +0,0 @@
|
||||
from collections.abc import Generator
|
||||
from pathlib import Path
|
||||
|
||||
import pytest
|
||||
|
||||
from paperless_tika.parsers import TikaDocumentParser
|
||||
|
||||
|
||||
@pytest.fixture()
|
||||
def tika_parser() -> Generator[TikaDocumentParser, None, None]:
|
||||
try:
|
||||
parser = TikaDocumentParser(logging_group=None)
|
||||
yield parser
|
||||
finally:
|
||||
# TODO(stumpylog): Cleanup once all parsers are handled
|
||||
parser.cleanup()
|
||||
|
||||
|
||||
@pytest.fixture(scope="session")
|
||||
def sample_dir() -> Path:
|
||||
return (Path(__file__).parent / Path("samples")).resolve()
|
||||
|
||||
|
||||
@pytest.fixture(scope="session")
|
||||
def sample_odt_file(sample_dir: Path) -> Path:
|
||||
return sample_dir / "sample.odt"
|
||||
|
||||
|
||||
@pytest.fixture(scope="session")
|
||||
def sample_docx_file(sample_dir: Path) -> Path:
|
||||
return sample_dir / "sample.docx"
|
||||
|
||||
|
||||
@pytest.fixture(scope="session")
|
||||
def sample_doc_file(sample_dir: Path) -> Path:
|
||||
return sample_dir / "sample.doc"
|
||||
|
||||
|
||||
@pytest.fixture(scope="session")
|
||||
def sample_broken_odt(sample_dir: Path) -> Path:
|
||||
return sample_dir / "multi-part-broken.odt"
|
||||
Reference in New Issue
Block a user