Compare commits
51 Commits
feature-re
...
feature-dr
| Author | SHA1 | Date | |
|---|---|---|---|
|
|
1e282e345a | ||
|
|
cda71b7a84 | ||
|
|
c540f59cc0 | ||
|
|
fb8e6c94a0 | ||
|
|
52b6cd2b8d | ||
|
|
e24a2d8214 | ||
|
|
8e3dfcb4ee | ||
|
|
1b45e4d029 | ||
|
|
6b279e9368 | ||
|
|
97bc53ccdc | ||
|
|
80fa4f6f12 | ||
|
|
c5b006e666 | ||
|
|
ad1654d89b | ||
|
|
466a402715 | ||
|
|
b2e3048083 | ||
|
|
fe1e35b9ac | ||
|
|
d01513a869 | ||
|
|
9e3c93f72d | ||
|
|
f7c12d550a | ||
|
|
68fc898042 | ||
|
|
16e73f611d | ||
|
|
b66cfb1867 | ||
|
|
49e1ebb620 | ||
|
|
8148f2ced2 | ||
|
|
a36b6ecbef | ||
|
|
2cbe6ae892 | ||
|
|
b0bb31654f | ||
|
|
07237bde6a | ||
|
|
b80702acb8 | ||
|
|
7428bbb8dc | ||
|
|
9a709abb7d | ||
|
|
3236bbd0c5 | ||
|
|
d107c8c531 | ||
|
|
8c671514ab | ||
|
|
f2c16a7d98 | ||
|
|
7c76e65950 | ||
|
|
d162c83eb7 | ||
|
|
d3ac75741f | ||
|
|
3abff21d1f | ||
|
|
0a08499fc7 | ||
|
|
330ee696a8 | ||
|
|
b98697ab8b | ||
|
|
7e94dd8208 | ||
|
|
79da72f69c | ||
|
|
261ae9d8ce | ||
|
|
0e2c191524 | ||
|
|
ab4656692d | ||
|
|
03e2c352c2 | ||
|
|
2d46ed9692 | ||
|
|
8d23d17ae8 | ||
|
|
aea2927a02 |
3
.github/dependabot.yml
vendored
@@ -157,6 +157,9 @@ updates:
|
||||
postgres:
|
||||
patterns:
|
||||
- "docker.io/library/postgres*"
|
||||
greenmail:
|
||||
patterns:
|
||||
- "docker.io/greenmail*"
|
||||
- package-ecosystem: "pre-commit" # See documentation for possible values
|
||||
directory: "/" # Location of package manifests
|
||||
schedule:
|
||||
|
||||
6
.github/workflows/ci-docker.yml
vendored
@@ -119,7 +119,7 @@ jobs:
|
||||
sudo rm -rf "$AGENT_TOOLSDIRECTORY"
|
||||
- name: Docker metadata
|
||||
id: docker-meta
|
||||
uses: docker/metadata-action@v5.10.0
|
||||
uses: docker/metadata-action@v6.0.0
|
||||
with:
|
||||
images: |
|
||||
${{ env.REGISTRY }}/${{ steps.repo.outputs.name }}
|
||||
@@ -130,7 +130,7 @@ jobs:
|
||||
type=semver,pattern={{major}}.{{minor}}
|
||||
- name: Build and push by digest
|
||||
id: build
|
||||
uses: docker/build-push-action@v6.19.2
|
||||
uses: docker/build-push-action@v7.0.0
|
||||
with:
|
||||
context: .
|
||||
file: ./Dockerfile
|
||||
@@ -201,7 +201,7 @@ jobs:
|
||||
password: ${{ secrets.QUAY_ROBOT_TOKEN }}
|
||||
- name: Docker metadata
|
||||
id: docker-meta
|
||||
uses: docker/metadata-action@v5.10.0
|
||||
uses: docker/metadata-action@v6.0.0
|
||||
with:
|
||||
images: |
|
||||
${{ env.REGISTRY }}/${{ needs.build-arch.outputs.repository }}
|
||||
|
||||
@@ -2437,17 +2437,3 @@ src/paperless_tesseract/tests/test_parser_custom_settings.py:0: error: Item "Non
|
||||
src/paperless_tesseract/tests/test_parser_custom_settings.py:0: error: Item "None" of "ApplicationConfiguration | None" has no attribute "unpaper_clean" [union-attr]
|
||||
src/paperless_tesseract/tests/test_parser_custom_settings.py:0: error: Item "None" of "ApplicationConfiguration | None" has no attribute "unpaper_clean" [union-attr]
|
||||
src/paperless_tesseract/tests/test_parser_custom_settings.py:0: error: Item "None" of "ApplicationConfiguration | None" has no attribute "user_args" [union-attr]
|
||||
src/paperless_text/parsers.py:0: error: Function is missing a type annotation for one or more arguments [no-untyped-def]
|
||||
src/paperless_text/parsers.py:0: error: Function is missing a type annotation for one or more arguments [no-untyped-def]
|
||||
src/paperless_text/parsers.py:0: error: Incompatible types in assignment (expression has type "str", variable has type "None") [assignment]
|
||||
src/paperless_text/signals.py:0: error: Function is missing a type annotation [no-untyped-def]
|
||||
src/paperless_text/signals.py:0: error: Function is missing a type annotation [no-untyped-def]
|
||||
src/paperless_tika/parsers.py:0: error: Argument 1 to "make_thumbnail_from_pdf" has incompatible type "None"; expected "Path" [arg-type]
|
||||
src/paperless_tika/parsers.py:0: error: Function is missing a return type annotation [no-untyped-def]
|
||||
src/paperless_tika/parsers.py:0: error: Function is missing a type annotation [no-untyped-def]
|
||||
src/paperless_tika/parsers.py:0: error: Function is missing a type annotation [no-untyped-def]
|
||||
src/paperless_tika/parsers.py:0: error: Function is missing a type annotation for one or more arguments [no-untyped-def]
|
||||
src/paperless_tika/parsers.py:0: error: Function is missing a type annotation for one or more arguments [no-untyped-def]
|
||||
src/paperless_tika/parsers.py:0: error: Incompatible types in assignment (expression has type "str | None", variable has type "None") [assignment]
|
||||
src/paperless_tika/signals.py:0: error: Function is missing a type annotation [no-untyped-def]
|
||||
src/paperless_tika/signals.py:0: error: Function is missing a type annotation [no-untyped-def]
|
||||
|
||||
@@ -50,7 +50,7 @@ repos:
|
||||
- 'prettier-plugin-organize-imports@4.3.0'
|
||||
# Python hooks
|
||||
- repo: https://github.com/astral-sh/ruff-pre-commit
|
||||
rev: v0.15.5
|
||||
rev: v0.15.6
|
||||
hooks:
|
||||
- id: ruff-check
|
||||
- id: ruff-format
|
||||
|
||||
@@ -18,13 +18,13 @@ services:
|
||||
- "--log-level=warn"
|
||||
- "--log-format=text"
|
||||
tika:
|
||||
image: docker.io/apache/tika:latest
|
||||
image: docker.io/apache/tika:3.2.3.0
|
||||
hostname: tika
|
||||
container_name: tika
|
||||
network_mode: host
|
||||
restart: unless-stopped
|
||||
greenmail:
|
||||
image: greenmail/standalone:2.1.8
|
||||
image: docker.io/greenmail/standalone:2.1.8
|
||||
hostname: greenmail
|
||||
container_name: greenmail
|
||||
environment:
|
||||
|
||||
@@ -14,7 +14,6 @@
|
||||
# Paths and folders
|
||||
|
||||
#PAPERLESS_CONSUMPTION_DIR=../consume
|
||||
#PAPERLESS_CONSUMPTION_FAILED_DIR=../consume/failed
|
||||
#PAPERLESS_DATA_DIR=../data
|
||||
#PAPERLESS_EMPTY_TRASH_DIR=
|
||||
#PAPERLESS_MEDIA_ROOT=../media
|
||||
|
||||
@@ -26,7 +26,7 @@ dependencies = [
|
||||
# WARNING: django does not use semver.
|
||||
# Only patch versions are guaranteed to not introduce breaking changes.
|
||||
"django~=5.2.10",
|
||||
"django-allauth[mfa,socialaccount]~=65.14.0",
|
||||
"django-allauth[mfa,socialaccount]~=65.15.0",
|
||||
"django-auditlog~=3.4.1",
|
||||
"django-cachalot~=2.9.0",
|
||||
"django-celery-results~=2.6.0",
|
||||
@@ -248,15 +248,13 @@ lint.per-file-ignores."docker/wait-for-redis.py" = [
|
||||
lint.per-file-ignores."src/documents/models.py" = [
|
||||
"SIM115",
|
||||
]
|
||||
lint.per-file-ignores."src/paperless_tesseract/tests/test_parser.py" = [
|
||||
"RUF001",
|
||||
]
|
||||
|
||||
lint.isort.force-single-line = true
|
||||
|
||||
[tool.codespell]
|
||||
write-changes = true
|
||||
ignore-words-list = "criterias,afterall,valeu,ureue,equest,ure,assertIn,Oktober,commitish"
|
||||
skip = "src-ui/src/locale/*,src-ui/pnpm-lock.yaml,src-ui/e2e/*,src/paperless_mail/tests/samples/*,src/documents/tests/samples/*,*.po,*.json"
|
||||
skip = "src-ui/src/locale/*,src-ui/pnpm-lock.yaml,src-ui/e2e/*,src/paperless_mail/tests/samples/*,src/paperless/tests/samples/mail/*,src/documents/tests/samples/*,*.po,*.json"
|
||||
|
||||
[tool.pytest]
|
||||
minversion = "9.0"
|
||||
@@ -271,10 +269,6 @@ testpaths = [
|
||||
"src/documents/tests/",
|
||||
"src/paperless/tests/",
|
||||
"src/paperless_mail/tests/",
|
||||
"src/paperless_tesseract/tests/",
|
||||
"src/paperless_tika/tests",
|
||||
"src/paperless_text/tests/",
|
||||
"src/paperless_remote/tests/",
|
||||
"src/paperless_ai/tests",
|
||||
]
|
||||
|
||||
|
||||
@@ -21,7 +21,7 @@
|
||||
"@angular/platform-browser-dynamic": "~21.2.4",
|
||||
"@angular/router": "~21.2.4",
|
||||
"@ng-bootstrap/ng-bootstrap": "^20.0.0",
|
||||
"@ng-select/ng-select": "^21.4.1",
|
||||
"@ng-select/ng-select": "^21.5.2",
|
||||
"@ngneat/dirty-check-forms": "^3.0.3",
|
||||
"@popperjs/core": "^2.11.8",
|
||||
"bootstrap": "^5.3.8",
|
||||
@@ -55,13 +55,13 @@
|
||||
"@codecov/webpack-plugin": "^1.9.1",
|
||||
"@playwright/test": "^1.58.2",
|
||||
"@types/jest": "^30.0.0",
|
||||
"@types/node": "^25.3.3",
|
||||
"@typescript-eslint/eslint-plugin": "^8.54.0",
|
||||
"@typescript-eslint/parser": "^8.54.0",
|
||||
"@typescript-eslint/utils": "^8.54.0",
|
||||
"eslint": "^10.0.2",
|
||||
"jest": "30.2.0",
|
||||
"jest-environment-jsdom": "^30.2.0",
|
||||
"@types/node": "^25.4.0",
|
||||
"@typescript-eslint/eslint-plugin": "^8.57.0",
|
||||
"@typescript-eslint/parser": "^8.57.0",
|
||||
"@typescript-eslint/utils": "^8.57.0",
|
||||
"eslint": "^10.0.3",
|
||||
"jest": "30.3.0",
|
||||
"jest-environment-jsdom": "^30.3.0",
|
||||
"jest-junit": "^16.0.0",
|
||||
"jest-preset-angular": "^16.1.1",
|
||||
"jest-websocket-mock": "^2.5.0",
|
||||
|
||||
1422
src-ui/pnpm-lock.yaml
generated
@@ -112,9 +112,6 @@
|
||||
</td>
|
||||
<td scope="row">
|
||||
<div class="btn-group" role="group">
|
||||
@if (task.status === PaperlessTaskStatus.Failed) {
|
||||
<ng-container *ngTemplateOutlet="retryDropdown; context: { task: task }"></ng-container>
|
||||
}
|
||||
<button class="btn btn-sm btn-outline-secondary" (click)="dismissTask(task); $event.stopPropagation();" *pngxIfPermissions="{ action: PermissionAction.Change, type: PermissionType.PaperlessTask }">
|
||||
<i-bs name="check" class="me-1"></i-bs><ng-container i18n>Dismiss</ng-container>
|
||||
</button>
|
||||
@@ -187,25 +184,3 @@
|
||||
</li>
|
||||
</ul>
|
||||
<div [ngbNavOutlet]="nav"></div>
|
||||
|
||||
<ng-template #retryDropdown let-task="task">
|
||||
<div ngbDropdown>
|
||||
<button class="btn btn-sm btn-outline-primary" (click)="$event.stopImmediatePropagation()" ngbDropdownToggle>
|
||||
<i-bs name="arrow-repeat"></i-bs> <ng-container i18n>Retry</ng-container>
|
||||
</button>
|
||||
<div ngbDropdownMenu class="shadow retry-dropdown">
|
||||
<div class="p-2">
|
||||
<ul class="list-group list-group-flush">
|
||||
<li class="list-group-item small" i18n>
|
||||
<pngx-input-check [(ngModel)]="retryClean" i18n-title title="Attempt to clean pdf"></pngx-input-check>
|
||||
</li>
|
||||
</ul>
|
||||
<div class="d-flex justify-content-end">
|
||||
<button class="btn btn-sm btn-outline-primary" (click)="retryTask(task); $event.stopPropagation();">
|
||||
<ng-container i18n>Proceed</ng-container>
|
||||
</button>
|
||||
</div>
|
||||
</div>
|
||||
</div>
|
||||
</div>
|
||||
</ng-template>
|
||||
|
||||
@@ -37,7 +37,3 @@ pre {
|
||||
.z-10 {
|
||||
z-index: 10;
|
||||
}
|
||||
|
||||
.retry-dropdown {
|
||||
width: 300px;
|
||||
}
|
||||
|
||||
@@ -16,7 +16,7 @@ import {
|
||||
NgbNavItem,
|
||||
} from '@ng-bootstrap/ng-bootstrap'
|
||||
import { allIcons, NgxBootstrapIconsModule } from 'ngx-bootstrap-icons'
|
||||
import { of, throwError } from 'rxjs'
|
||||
import { throwError } from 'rxjs'
|
||||
import { routes } from 'src/app/app-routing.module'
|
||||
import {
|
||||
PaperlessTask,
|
||||
@@ -32,7 +32,6 @@ import { TasksService } from 'src/app/services/tasks.service'
|
||||
import { ToastService } from 'src/app/services/toast.service'
|
||||
import { environment } from 'src/environments/environment'
|
||||
import { ConfirmDialogComponent } from '../../common/confirm-dialog/confirm-dialog.component'
|
||||
import { CheckComponent } from '../../common/input/check/check.component'
|
||||
import { PageHeaderComponent } from '../../common/page-header/page-header.component'
|
||||
import { TasksComponent, TaskTab } from './tasks.component'
|
||||
|
||||
@@ -139,7 +138,6 @@ describe('TasksComponent', () => {
|
||||
PageHeaderComponent,
|
||||
IfPermissionsDirective,
|
||||
CustomDatePipe,
|
||||
CheckComponent,
|
||||
ConfirmDialogComponent,
|
||||
],
|
||||
providers: [
|
||||
@@ -186,10 +184,8 @@ describe('TasksComponent', () => {
|
||||
`Failed${currentTasksLength}`
|
||||
)
|
||||
expect(
|
||||
fixture.debugElement.queryAll(
|
||||
By.css('table td > .form-check input[type="checkbox"]')
|
||||
)
|
||||
).toHaveLength(currentTasksLength)
|
||||
fixture.debugElement.queryAll(By.css('table input[type="checkbox"]'))
|
||||
).toHaveLength(currentTasksLength + 1)
|
||||
|
||||
currentTasksLength = tasks.filter(
|
||||
(t) => t.status === PaperlessTaskStatus.Complete
|
||||
@@ -393,20 +389,4 @@ describe('TasksComponent', () => {
|
||||
expect(component.filterText).toEqual('')
|
||||
expect(component.filterTargetID).toEqual(0)
|
||||
})
|
||||
|
||||
it('should retry a task, show toast on error or success', () => {
|
||||
const retrySpy = jest.spyOn(tasksService, 'retryTask')
|
||||
const toastInfoSpy = jest.spyOn(toastService, 'showInfo')
|
||||
const toastErrorSpy = jest.spyOn(toastService, 'showError')
|
||||
retrySpy.mockReturnValueOnce(of({ task_id: '123' }))
|
||||
component.retryTask(tasks[0])
|
||||
expect(retrySpy).toHaveBeenCalledWith(tasks[0], false)
|
||||
expect(toastInfoSpy).toHaveBeenCalledWith('Retrying task...')
|
||||
retrySpy.mockReturnValueOnce(throwError(() => new Error('test')))
|
||||
component.retryTask(tasks[0])
|
||||
expect(toastErrorSpy).toHaveBeenCalledWith(
|
||||
'Failed to retry task',
|
||||
new Error('test')
|
||||
)
|
||||
})
|
||||
})
|
||||
|
||||
@@ -20,13 +20,12 @@ import {
|
||||
takeUntil,
|
||||
timer,
|
||||
} from 'rxjs'
|
||||
import { PaperlessTask, PaperlessTaskStatus } from 'src/app/data/paperless-task'
|
||||
import { PaperlessTask } from 'src/app/data/paperless-task'
|
||||
import { IfPermissionsDirective } from 'src/app/directives/if-permissions.directive'
|
||||
import { CustomDatePipe } from 'src/app/pipes/custom-date.pipe'
|
||||
import { TasksService } from 'src/app/services/tasks.service'
|
||||
import { ToastService } from 'src/app/services/toast.service'
|
||||
import { ConfirmDialogComponent } from '../../common/confirm-dialog/confirm-dialog.component'
|
||||
import { CheckComponent } from '../../common/input/check/check.component'
|
||||
import { PageHeaderComponent } from '../../common/page-header/page-header.component'
|
||||
import { LoadingComponentWithPermissions } from '../../loading-component/loading.component'
|
||||
|
||||
@@ -55,7 +54,6 @@ const FILTER_TARGETS = [
|
||||
PageHeaderComponent,
|
||||
IfPermissionsDirective,
|
||||
CustomDatePipe,
|
||||
CheckComponent,
|
||||
SlicePipe,
|
||||
FormsModule,
|
||||
ReactiveFormsModule,
|
||||
@@ -77,7 +75,6 @@ export class TasksComponent
|
||||
private readonly router = inject(Router)
|
||||
private readonly toastService = inject(ToastService)
|
||||
|
||||
public PaperlessTaskStatus = PaperlessTaskStatus
|
||||
public activeTab: TaskTab
|
||||
public selectedTasks: Set<number> = new Set()
|
||||
public togggleAll: boolean = false
|
||||
@@ -108,8 +105,6 @@ export class TasksComponent
|
||||
: FILTER_TARGETS.slice(0, 1)
|
||||
}
|
||||
|
||||
public retryClean: boolean = false
|
||||
|
||||
get dismissButtonText(): string {
|
||||
return this.selectedTasks.size > 0
|
||||
? $localize`Dismiss selected`
|
||||
@@ -183,17 +178,6 @@ export class TasksComponent
|
||||
this.router.navigate(['documents', task.related_document])
|
||||
}
|
||||
|
||||
retryTask(task: PaperlessTask) {
|
||||
this.tasksService.retryTask(task, this.retryClean).subscribe({
|
||||
next: () => {
|
||||
this.toastService.showInfo($localize`Retrying task...`)
|
||||
},
|
||||
error: (e) => {
|
||||
this.toastService.showError($localize`Failed to retry task`, e)
|
||||
},
|
||||
})
|
||||
}
|
||||
|
||||
expandTask(task: PaperlessTask) {
|
||||
this.expandedTask = this.expandedTask == task.id ? undefined : task.id
|
||||
}
|
||||
|
||||
@@ -147,33 +147,4 @@ describe('TasksService', () => {
|
||||
result: 'success',
|
||||
})
|
||||
})
|
||||
|
||||
it('should call retry task api endpoint', () => {
|
||||
const task = {
|
||||
id: 1,
|
||||
type: PaperlessTaskType.File,
|
||||
status: PaperlessTaskStatus.Failed,
|
||||
acknowledged: false,
|
||||
task_id: '1234',
|
||||
task_file_name: 'file1.pdf',
|
||||
date_created: new Date(),
|
||||
}
|
||||
|
||||
tasksService.retryTask(task, true).subscribe()
|
||||
const reloadSpy = jest.spyOn(tasksService, 'reload')
|
||||
const req = httpTestingController.expectOne(
|
||||
`${environment.apiBaseUrl}tasks/${task.id}/retry/`
|
||||
)
|
||||
expect(req.request.method).toEqual('POST')
|
||||
expect(req.request.body).toEqual({
|
||||
clean: true,
|
||||
})
|
||||
req.flush({ task_id: 12345 })
|
||||
expect(reloadSpy).toHaveBeenCalled()
|
||||
httpTestingController
|
||||
.expectOne(
|
||||
`${environment.apiBaseUrl}tasks/?task_name=consume_file&acknowledged=false`
|
||||
)
|
||||
.flush([])
|
||||
})
|
||||
})
|
||||
|
||||
@@ -81,20 +81,6 @@ export class TasksService {
|
||||
)
|
||||
}
|
||||
|
||||
public retryTask(task: PaperlessTask, clean: boolean): Observable<any> {
|
||||
return this.http
|
||||
.post(`${this.baseUrl}tasks/${task.id}/retry/`, {
|
||||
clean,
|
||||
})
|
||||
.pipe(
|
||||
takeUntil(this.unsubscribeNotifer),
|
||||
first(),
|
||||
tap(() => {
|
||||
this.reload()
|
||||
})
|
||||
)
|
||||
}
|
||||
|
||||
public cancelPending(): void {
|
||||
this.unsubscribeNotifer.next(true)
|
||||
}
|
||||
|
||||
@@ -3,25 +3,20 @@ from django.core.checks import Error
|
||||
from django.core.checks import Warning
|
||||
from django.core.checks import register
|
||||
|
||||
from documents.signals import document_consumer_declaration
|
||||
from documents.templating.utils import convert_format_str_to_template_format
|
||||
from paperless.parsers.registry import get_parser_registry
|
||||
|
||||
|
||||
@register()
|
||||
def parser_check(app_configs, **kwargs):
|
||||
parsers = []
|
||||
for response in document_consumer_declaration.send(None):
|
||||
parsers.append(response[1])
|
||||
|
||||
if len(parsers) == 0:
|
||||
if not get_parser_registry().all_parsers():
|
||||
return [
|
||||
Error(
|
||||
"No parsers found. This is a bug. The consumer won't be "
|
||||
"able to consume any documents without parsers.",
|
||||
),
|
||||
]
|
||||
else:
|
||||
return []
|
||||
return []
|
||||
|
||||
|
||||
@register()
|
||||
|
||||
@@ -32,9 +32,7 @@ from documents.models import DocumentType
|
||||
from documents.models import StoragePath
|
||||
from documents.models import Tag
|
||||
from documents.models import WorkflowTrigger
|
||||
from documents.parsers import DocumentParser
|
||||
from documents.parsers import ParseError
|
||||
from documents.parsers import get_parser_class_for_mime_type
|
||||
from documents.permissions import set_permissions_for_object
|
||||
from documents.plugins.base import AlwaysRunPluginMixin
|
||||
from documents.plugins.base import ConsumeTaskPlugin
|
||||
@@ -51,28 +49,13 @@ from documents.templating.workflows import parse_w_workflow_placeholders
|
||||
from documents.utils import copy_basic_file_stats
|
||||
from documents.utils import copy_file_with_basic_stats
|
||||
from documents.utils import run_subprocess
|
||||
from paperless.parsers.text import TextDocumentParser
|
||||
from paperless_mail.parsers import MailDocumentParser
|
||||
from paperless.parsers import ParserContext
|
||||
from paperless.parsers import ParserProtocol
|
||||
from paperless.parsers.registry import get_parser_registry
|
||||
|
||||
LOGGING_NAME: Final[str] = "paperless.consumer"
|
||||
|
||||
|
||||
def _parser_cleanup(parser: DocumentParser) -> None:
|
||||
"""
|
||||
Call cleanup on a parser, handling the new-style context-manager parsers.
|
||||
|
||||
New-style parsers (e.g. TextDocumentParser) use __exit__ for teardown
|
||||
instead of a cleanup() method. This shim will be removed once all existing parsers
|
||||
have switched to the new style and this consumer is updated to use it
|
||||
|
||||
TODO(stumpylog): Remove me in the future
|
||||
"""
|
||||
if isinstance(parser, TextDocumentParser):
|
||||
parser.__exit__(None, None, None)
|
||||
else:
|
||||
parser.cleanup()
|
||||
|
||||
|
||||
class WorkflowTriggerPlugin(
|
||||
NoCleanupPluginMixin,
|
||||
NoSetupPluginMixin,
|
||||
@@ -174,17 +157,6 @@ class ConsumerPluginMixin:
|
||||
):
|
||||
self._send_progress(100, 100, ProgressStatusOptions.FAILED, message)
|
||||
self.log.error(log_message or message, exc_info=exc_info)
|
||||
# Move the file to the failed directory
|
||||
if (
|
||||
self.input_doc.original_file.exists()
|
||||
and not Path(
|
||||
settings.CONSUMPTION_FAILED_DIR / self.input_doc.original_file.name,
|
||||
).exists()
|
||||
):
|
||||
copy_file_with_basic_stats(
|
||||
self.input_doc.original_file,
|
||||
settings.CONSUMPTION_FAILED_DIR / self.input_doc.original_file.name,
|
||||
)
|
||||
raise ConsumerError(f"{self.filename}: {log_message or message}") from exception
|
||||
|
||||
|
||||
@@ -420,8 +392,12 @@ class ConsumerPlugin(
|
||||
self.log.error(f"Error attempting to clean PDF: {e}")
|
||||
|
||||
# Based on the mime type, get the parser for that type
|
||||
parser_class: type[DocumentParser] | None = get_parser_class_for_mime_type(
|
||||
mime_type,
|
||||
parser_class: type[ParserProtocol] | None = (
|
||||
get_parser_registry().get_parser_for_file(
|
||||
mime_type,
|
||||
self.filename,
|
||||
self.working_copy,
|
||||
)
|
||||
)
|
||||
if not parser_class:
|
||||
tempdir.cleanup()
|
||||
@@ -444,301 +420,275 @@ class ConsumerPlugin(
|
||||
tempdir.cleanup()
|
||||
raise
|
||||
|
||||
def progress_callback(
|
||||
current_progress,
|
||||
max_progress,
|
||||
) -> None: # pragma: no cover
|
||||
# recalculate progress to be within 20 and 80
|
||||
p = int((current_progress / max_progress) * 50 + 20)
|
||||
self._send_progress(p, 100, ProgressStatusOptions.WORKING)
|
||||
|
||||
# This doesn't parse the document yet, but gives us a parser.
|
||||
|
||||
document_parser: DocumentParser = parser_class(
|
||||
self.logging_group,
|
||||
progress_callback=progress_callback,
|
||||
)
|
||||
|
||||
self.log.debug(f"Parser: {type(document_parser).__name__}")
|
||||
|
||||
# Parse the document. This may take some time.
|
||||
|
||||
text = None
|
||||
date = None
|
||||
thumbnail = None
|
||||
archive_path = None
|
||||
page_count = None
|
||||
|
||||
try:
|
||||
self._send_progress(
|
||||
20,
|
||||
100,
|
||||
ProgressStatusOptions.WORKING,
|
||||
ConsumerStatusShortMessage.PARSING_DOCUMENT,
|
||||
with parser_class() as document_parser:
|
||||
document_parser.configure(
|
||||
ParserContext(mailrule_id=self.input_doc.mailrule_id),
|
||||
)
|
||||
self.log.debug(f"Parsing {self.filename}...")
|
||||
if (
|
||||
isinstance(document_parser, MailDocumentParser)
|
||||
and self.input_doc.mailrule_id
|
||||
):
|
||||
document_parser.parse(
|
||||
self.working_copy,
|
||||
mime_type,
|
||||
self.filename,
|
||||
self.input_doc.mailrule_id,
|
||||
)
|
||||
elif isinstance(document_parser, TextDocumentParser):
|
||||
# TODO(stumpylog): Remove me in the future
|
||||
document_parser.parse(self.working_copy, mime_type)
|
||||
else:
|
||||
document_parser.parse(self.working_copy, mime_type, self.filename)
|
||||
|
||||
self.log.debug(f"Generating thumbnail for {self.filename}...")
|
||||
self._send_progress(
|
||||
70,
|
||||
100,
|
||||
ProgressStatusOptions.WORKING,
|
||||
ConsumerStatusShortMessage.GENERATING_THUMBNAIL,
|
||||
)
|
||||
if isinstance(document_parser, TextDocumentParser):
|
||||
# TODO(stumpylog): Remove me in the future
|
||||
thumbnail = document_parser.get_thumbnail(self.working_copy, mime_type)
|
||||
else:
|
||||
thumbnail = document_parser.get_thumbnail(
|
||||
self.working_copy,
|
||||
mime_type,
|
||||
self.filename,
|
||||
)
|
||||
self.log.debug(f"Parser: {document_parser.name} v{document_parser.version}")
|
||||
|
||||
text = document_parser.get_text()
|
||||
date = document_parser.get_date()
|
||||
if date is None:
|
||||
# Parse the document. This may take some time.
|
||||
|
||||
text = None
|
||||
date = None
|
||||
thumbnail = None
|
||||
archive_path = None
|
||||
page_count = None
|
||||
|
||||
try:
|
||||
self._send_progress(
|
||||
90,
|
||||
20,
|
||||
100,
|
||||
ProgressStatusOptions.WORKING,
|
||||
ConsumerStatusShortMessage.PARSE_DATE,
|
||||
ConsumerStatusShortMessage.PARSING_DOCUMENT,
|
||||
)
|
||||
with get_date_parser() as date_parser:
|
||||
date = next(date_parser.parse(self.filename, text), None)
|
||||
archive_path = document_parser.get_archive_path()
|
||||
page_count = document_parser.get_page_count(self.working_copy, mime_type)
|
||||
self.log.debug(f"Parsing {self.filename}...")
|
||||
|
||||
except ParseError as e:
|
||||
_parser_cleanup(document_parser)
|
||||
if tempdir:
|
||||
tempdir.cleanup()
|
||||
self._fail(
|
||||
str(e),
|
||||
f"Error occurred while consuming document {self.filename}: {e}",
|
||||
exc_info=True,
|
||||
exception=e,
|
||||
)
|
||||
except Exception as e:
|
||||
_parser_cleanup(document_parser)
|
||||
if tempdir:
|
||||
tempdir.cleanup()
|
||||
self._fail(
|
||||
str(e),
|
||||
f"Unexpected error while consuming document {self.filename}: {e}",
|
||||
exc_info=True,
|
||||
exception=e,
|
||||
)
|
||||
document_parser.parse(self.working_copy, mime_type)
|
||||
|
||||
# Prepare the document classifier.
|
||||
self.log.debug(f"Generating thumbnail for {self.filename}...")
|
||||
self._send_progress(
|
||||
70,
|
||||
100,
|
||||
ProgressStatusOptions.WORKING,
|
||||
ConsumerStatusShortMessage.GENERATING_THUMBNAIL,
|
||||
)
|
||||
thumbnail = document_parser.get_thumbnail(self.working_copy, mime_type)
|
||||
|
||||
# TODO: I don't really like to do this here, but this way we avoid
|
||||
# reloading the classifier multiple times, since there are multiple
|
||||
# post-consume hooks that all require the classifier.
|
||||
|
||||
classifier = load_classifier()
|
||||
|
||||
self._send_progress(
|
||||
95,
|
||||
100,
|
||||
ProgressStatusOptions.WORKING,
|
||||
ConsumerStatusShortMessage.SAVE_DOCUMENT,
|
||||
)
|
||||
# now that everything is done, we can start to store the document
|
||||
# in the system. This will be a transaction and reasonably fast.
|
||||
try:
|
||||
with transaction.atomic():
|
||||
# store the document.
|
||||
if self.input_doc.root_document_id:
|
||||
# If this is a new version of an existing document, we need
|
||||
# to make sure we're not creating a new document, but updating
|
||||
# the existing one.
|
||||
root_doc = Document.objects.get(
|
||||
pk=self.input_doc.root_document_id,
|
||||
text = document_parser.get_text()
|
||||
date = document_parser.get_date()
|
||||
if date is None:
|
||||
self._send_progress(
|
||||
90,
|
||||
100,
|
||||
ProgressStatusOptions.WORKING,
|
||||
ConsumerStatusShortMessage.PARSE_DATE,
|
||||
)
|
||||
original_document = self._create_version_from_root(
|
||||
root_doc,
|
||||
text=text,
|
||||
page_count=page_count,
|
||||
mime_type=mime_type,
|
||||
)
|
||||
actor = None
|
||||
with get_date_parser() as date_parser:
|
||||
date = next(date_parser.parse(self.filename, text), None)
|
||||
archive_path = document_parser.get_archive_path()
|
||||
page_count = document_parser.get_page_count(
|
||||
self.working_copy,
|
||||
mime_type,
|
||||
)
|
||||
|
||||
# Save the new version, potentially creating an audit log entry for the version addition if enabled.
|
||||
if (
|
||||
settings.AUDIT_LOG_ENABLED
|
||||
and self.metadata.actor_id is not None
|
||||
):
|
||||
actor = User.objects.filter(pk=self.metadata.actor_id).first()
|
||||
if actor is not None:
|
||||
from auditlog.context import ( # type: ignore[import-untyped]
|
||||
set_actor,
|
||||
)
|
||||
except ParseError as e:
|
||||
if tempdir:
|
||||
tempdir.cleanup()
|
||||
self._fail(
|
||||
str(e),
|
||||
f"Error occurred while consuming document {self.filename}: {e}",
|
||||
exc_info=True,
|
||||
exception=e,
|
||||
)
|
||||
except Exception as e:
|
||||
if tempdir:
|
||||
tempdir.cleanup()
|
||||
self._fail(
|
||||
str(e),
|
||||
f"Unexpected error while consuming document {self.filename}: {e}",
|
||||
exc_info=True,
|
||||
exception=e,
|
||||
)
|
||||
|
||||
with set_actor(actor):
|
||||
# Prepare the document classifier.
|
||||
|
||||
# TODO: I don't really like to do this here, but this way we avoid
|
||||
# reloading the classifier multiple times, since there are multiple
|
||||
# post-consume hooks that all require the classifier.
|
||||
|
||||
classifier = load_classifier()
|
||||
|
||||
self._send_progress(
|
||||
95,
|
||||
100,
|
||||
ProgressStatusOptions.WORKING,
|
||||
ConsumerStatusShortMessage.SAVE_DOCUMENT,
|
||||
)
|
||||
# now that everything is done, we can start to store the document
|
||||
# in the system. This will be a transaction and reasonably fast.
|
||||
try:
|
||||
with transaction.atomic():
|
||||
# store the document.
|
||||
if self.input_doc.root_document_id:
|
||||
# If this is a new version of an existing document, we need
|
||||
# to make sure we're not creating a new document, but updating
|
||||
# the existing one.
|
||||
root_doc = Document.objects.get(
|
||||
pk=self.input_doc.root_document_id,
|
||||
)
|
||||
original_document = self._create_version_from_root(
|
||||
root_doc,
|
||||
text=text,
|
||||
page_count=page_count,
|
||||
mime_type=mime_type,
|
||||
)
|
||||
actor = None
|
||||
|
||||
# Save the new version, potentially creating an audit log entry for the version addition if enabled.
|
||||
if (
|
||||
settings.AUDIT_LOG_ENABLED
|
||||
and self.metadata.actor_id is not None
|
||||
):
|
||||
actor = User.objects.filter(
|
||||
pk=self.metadata.actor_id,
|
||||
).first()
|
||||
if actor is not None:
|
||||
from auditlog.context import ( # type: ignore[import-untyped]
|
||||
set_actor,
|
||||
)
|
||||
|
||||
with set_actor(actor):
|
||||
original_document.save()
|
||||
else:
|
||||
original_document.save()
|
||||
else:
|
||||
original_document.save()
|
||||
|
||||
# Create a log entry for the version addition, if enabled
|
||||
if settings.AUDIT_LOG_ENABLED:
|
||||
from auditlog.models import ( # type: ignore[import-untyped]
|
||||
LogEntry,
|
||||
)
|
||||
|
||||
LogEntry.objects.log_create(
|
||||
instance=root_doc,
|
||||
changes={
|
||||
"Version Added": ["None", original_document.id],
|
||||
},
|
||||
action=LogEntry.Action.UPDATE,
|
||||
actor=actor,
|
||||
additional_data={
|
||||
"reason": "Version added",
|
||||
"version_id": original_document.id,
|
||||
},
|
||||
)
|
||||
document = original_document
|
||||
else:
|
||||
original_document.save()
|
||||
|
||||
# Create a log entry for the version addition, if enabled
|
||||
if settings.AUDIT_LOG_ENABLED:
|
||||
from auditlog.models import ( # type: ignore[import-untyped]
|
||||
LogEntry,
|
||||
document = self._store(
|
||||
text=text,
|
||||
date=date,
|
||||
page_count=page_count,
|
||||
mime_type=mime_type,
|
||||
)
|
||||
|
||||
LogEntry.objects.log_create(
|
||||
instance=root_doc,
|
||||
changes={
|
||||
"Version Added": ["None", original_document.id],
|
||||
},
|
||||
action=LogEntry.Action.UPDATE,
|
||||
actor=actor,
|
||||
additional_data={
|
||||
"reason": "Version added",
|
||||
"version_id": original_document.id,
|
||||
},
|
||||
)
|
||||
document = original_document
|
||||
else:
|
||||
document = self._store(
|
||||
text=text,
|
||||
date=date,
|
||||
page_count=page_count,
|
||||
mime_type=mime_type,
|
||||
)
|
||||
# If we get here, it was successful. Proceed with post-consume
|
||||
# hooks. If they fail, nothing will get changed.
|
||||
|
||||
# If we get here, it was successful. Proceed with post-consume
|
||||
# hooks. If they fail, nothing will get changed.
|
||||
|
||||
document_consumption_finished.send(
|
||||
sender=self.__class__,
|
||||
document=document,
|
||||
logging_group=self.logging_group,
|
||||
classifier=classifier,
|
||||
original_file=self.unmodified_original
|
||||
if self.unmodified_original
|
||||
else self.working_copy,
|
||||
)
|
||||
|
||||
# After everything is in the database, copy the files into
|
||||
# place. If this fails, we'll also rollback the transaction.
|
||||
with FileLock(settings.MEDIA_LOCK):
|
||||
generated_filename = generate_unique_filename(document)
|
||||
if (
|
||||
len(str(generated_filename))
|
||||
> Document.MAX_STORED_FILENAME_LENGTH
|
||||
):
|
||||
self.log.warning(
|
||||
"Generated source filename exceeds db path limit, falling back to default naming",
|
||||
)
|
||||
generated_filename = generate_filename(
|
||||
document,
|
||||
use_format=False,
|
||||
)
|
||||
document.filename = generated_filename
|
||||
create_source_path_directory(document.source_path)
|
||||
|
||||
self._write(
|
||||
self.unmodified_original
|
||||
if self.unmodified_original is not None
|
||||
document_consumption_finished.send(
|
||||
sender=self.__class__,
|
||||
document=document,
|
||||
logging_group=self.logging_group,
|
||||
classifier=classifier,
|
||||
original_file=self.unmodified_original
|
||||
if self.unmodified_original
|
||||
else self.working_copy,
|
||||
document.source_path,
|
||||
)
|
||||
|
||||
self._write(
|
||||
thumbnail,
|
||||
document.thumbnail_path,
|
||||
)
|
||||
|
||||
if archive_path and Path(archive_path).is_file():
|
||||
generated_archive_filename = generate_unique_filename(
|
||||
document,
|
||||
archive_filename=True,
|
||||
)
|
||||
# After everything is in the database, copy the files into
|
||||
# place. If this fails, we'll also rollback the transaction.
|
||||
with FileLock(settings.MEDIA_LOCK):
|
||||
generated_filename = generate_unique_filename(document)
|
||||
if (
|
||||
len(str(generated_archive_filename))
|
||||
len(str(generated_filename))
|
||||
> Document.MAX_STORED_FILENAME_LENGTH
|
||||
):
|
||||
self.log.warning(
|
||||
"Generated archive filename exceeds db path limit, falling back to default naming",
|
||||
"Generated source filename exceeds db path limit, falling back to default naming",
|
||||
)
|
||||
generated_archive_filename = generate_filename(
|
||||
generated_filename = generate_filename(
|
||||
document,
|
||||
archive_filename=True,
|
||||
use_format=False,
|
||||
)
|
||||
document.archive_filename = generated_archive_filename
|
||||
create_source_path_directory(document.archive_path)
|
||||
document.filename = generated_filename
|
||||
create_source_path_directory(document.source_path)
|
||||
|
||||
self._write(
|
||||
archive_path,
|
||||
document.archive_path,
|
||||
self.unmodified_original
|
||||
if self.unmodified_original is not None
|
||||
else self.working_copy,
|
||||
document.source_path,
|
||||
)
|
||||
|
||||
with Path(archive_path).open("rb") as f:
|
||||
document.archive_checksum = hashlib.md5(
|
||||
f.read(),
|
||||
).hexdigest()
|
||||
self._write(
|
||||
thumbnail,
|
||||
document.thumbnail_path,
|
||||
)
|
||||
|
||||
# Don't save with the lock active. Saving will cause the file
|
||||
# renaming logic to acquire the lock as well.
|
||||
# This triggers things like file renaming
|
||||
document.save()
|
||||
if archive_path and Path(archive_path).is_file():
|
||||
generated_archive_filename = generate_unique_filename(
|
||||
document,
|
||||
archive_filename=True,
|
||||
)
|
||||
if (
|
||||
len(str(generated_archive_filename))
|
||||
> Document.MAX_STORED_FILENAME_LENGTH
|
||||
):
|
||||
self.log.warning(
|
||||
"Generated archive filename exceeds db path limit, falling back to default naming",
|
||||
)
|
||||
generated_archive_filename = generate_filename(
|
||||
document,
|
||||
archive_filename=True,
|
||||
use_format=False,
|
||||
)
|
||||
document.archive_filename = generated_archive_filename
|
||||
create_source_path_directory(document.archive_path)
|
||||
self._write(
|
||||
archive_path,
|
||||
document.archive_path,
|
||||
)
|
||||
|
||||
if document.root_document_id:
|
||||
document_updated.send(
|
||||
sender=self.__class__,
|
||||
document=document.root_document,
|
||||
)
|
||||
with Path(archive_path).open("rb") as f:
|
||||
document.archive_checksum = hashlib.md5(
|
||||
f.read(),
|
||||
).hexdigest()
|
||||
|
||||
# Delete the file only if it was successfully consumed
|
||||
self.log.debug(f"Deleting original file {self.input_doc.original_file}")
|
||||
self.input_doc.original_file.unlink()
|
||||
self.log.debug(f"Deleting working copy {self.working_copy}")
|
||||
self.working_copy.unlink()
|
||||
if self.unmodified_original is not None: # pragma: no cover
|
||||
# Don't save with the lock active. Saving will cause the file
|
||||
# renaming logic to acquire the lock as well.
|
||||
# This triggers things like file renaming
|
||||
document.save()
|
||||
|
||||
if document.root_document_id:
|
||||
document_updated.send(
|
||||
sender=self.__class__,
|
||||
document=document.root_document,
|
||||
)
|
||||
|
||||
# Delete the file only if it was successfully consumed
|
||||
self.log.debug(
|
||||
f"Deleting unmodified original file {self.unmodified_original}",
|
||||
f"Deleting original file {self.input_doc.original_file}",
|
||||
)
|
||||
self.unmodified_original.unlink()
|
||||
self.input_doc.original_file.unlink()
|
||||
self.log.debug(f"Deleting working copy {self.working_copy}")
|
||||
self.working_copy.unlink()
|
||||
if self.unmodified_original is not None: # pragma: no cover
|
||||
self.log.debug(
|
||||
f"Deleting unmodified original file {self.unmodified_original}",
|
||||
)
|
||||
self.unmodified_original.unlink()
|
||||
|
||||
# https://github.com/jonaswinkler/paperless-ng/discussions/1037
|
||||
shadow_file = (
|
||||
Path(self.input_doc.original_file).parent
|
||||
/ f"._{Path(self.input_doc.original_file).name}"
|
||||
# https://github.com/jonaswinkler/paperless-ng/discussions/1037
|
||||
shadow_file = (
|
||||
Path(self.input_doc.original_file).parent
|
||||
/ f"._{Path(self.input_doc.original_file).name}"
|
||||
)
|
||||
|
||||
if Path(shadow_file).is_file():
|
||||
self.log.debug(f"Deleting shadow file {shadow_file}")
|
||||
Path(shadow_file).unlink()
|
||||
|
||||
except Exception as e:
|
||||
self._fail(
|
||||
str(e),
|
||||
f"The following error occurred while storing document "
|
||||
f"{self.filename} after parsing: {e}",
|
||||
exc_info=True,
|
||||
exception=e,
|
||||
)
|
||||
|
||||
if Path(shadow_file).is_file():
|
||||
self.log.debug(f"Deleting shadow file {shadow_file}")
|
||||
Path(shadow_file).unlink()
|
||||
|
||||
except Exception as e:
|
||||
self._fail(
|
||||
str(e),
|
||||
f"The following error occurred while storing document "
|
||||
f"{self.filename} after parsing: {e}",
|
||||
exc_info=True,
|
||||
exception=e,
|
||||
)
|
||||
finally:
|
||||
_parser_cleanup(document_parser)
|
||||
tempdir.cleanup()
|
||||
finally:
|
||||
tempdir.cleanup()
|
||||
|
||||
self.run_post_consume_script(document)
|
||||
|
||||
|
||||
@@ -3,14 +3,18 @@ import shutil
|
||||
|
||||
from documents.management.commands.base import PaperlessCommand
|
||||
from documents.models import Document
|
||||
from documents.parsers import get_parser_class_for_mime_type
|
||||
from paperless.parsers.registry import get_parser_registry
|
||||
|
||||
logger = logging.getLogger("paperless.management.thumbnails")
|
||||
|
||||
|
||||
def _process_document(doc_id: int) -> None:
|
||||
document: Document = Document.objects.get(id=doc_id)
|
||||
parser_class = get_parser_class_for_mime_type(document.mime_type)
|
||||
parser_class = get_parser_registry().get_parser_for_file(
|
||||
document.mime_type,
|
||||
document.original_filename or "",
|
||||
document.source_path,
|
||||
)
|
||||
|
||||
if parser_class is None:
|
||||
logger.warning(
|
||||
@@ -20,18 +24,9 @@ def _process_document(doc_id: int) -> None:
|
||||
)
|
||||
return
|
||||
|
||||
parser = parser_class(logging_group=None)
|
||||
|
||||
try:
|
||||
thumb = parser.get_thumbnail(
|
||||
document.source_path,
|
||||
document.mime_type,
|
||||
document.get_public_filename(),
|
||||
)
|
||||
with parser_class() as parser:
|
||||
thumb = parser.get_thumbnail(document.source_path, document.mime_type)
|
||||
shutil.move(thumb, document.thumbnail_path)
|
||||
finally:
|
||||
# TODO(stumpylog): Cleanup once all parsers are handled
|
||||
parser.cleanup()
|
||||
|
||||
|
||||
class Command(PaperlessCommand):
|
||||
|
||||
@@ -3,84 +3,47 @@ from __future__ import annotations
|
||||
import logging
|
||||
import mimetypes
|
||||
import os
|
||||
import re
|
||||
import shutil
|
||||
import subprocess
|
||||
import tempfile
|
||||
from functools import lru_cache
|
||||
from pathlib import Path
|
||||
from typing import TYPE_CHECKING
|
||||
|
||||
from django.conf import settings
|
||||
|
||||
from documents.loggers import LoggingMixin
|
||||
from documents.signals import document_consumer_declaration
|
||||
from documents.utils import copy_file_with_basic_stats
|
||||
from documents.utils import run_subprocess
|
||||
from paperless.parsers.registry import get_parser_registry
|
||||
|
||||
if TYPE_CHECKING:
|
||||
import datetime
|
||||
|
||||
# This regular expression will try to find dates in the document at
|
||||
# hand and will match the following formats:
|
||||
# - XX.YY.ZZZZ with XX + YY being 1 or 2 and ZZZZ being 2 or 4 digits
|
||||
# - XX/YY/ZZZZ with XX + YY being 1 or 2 and ZZZZ being 2 or 4 digits
|
||||
# - XX-YY-ZZZZ with XX + YY being 1 or 2 and ZZZZ being 2 or 4 digits
|
||||
# - ZZZZ.XX.YY with XX + YY being 1 or 2 and ZZZZ being 2 or 4 digits
|
||||
# - ZZZZ/XX/YY with XX + YY being 1 or 2 and ZZZZ being 2 or 4 digits
|
||||
# - ZZZZ-XX-YY with XX + YY being 1 or 2 and ZZZZ being 2 or 4 digits
|
||||
# - XX. MONTH ZZZZ with XX being 1 or 2 and ZZZZ being 2 or 4 digits
|
||||
# - MONTH ZZZZ, with ZZZZ being 4 digits
|
||||
# - MONTH XX, ZZZZ with XX being 1 or 2 and ZZZZ being 4 digits
|
||||
# - XX MON ZZZZ with XX being 1 or 2 and ZZZZ being 4 digits. MONTH is 3 letters
|
||||
# - XXPP MONTH ZZZZ with XX being 1 or 2 and PP being 2 letters and ZZZZ being 4 digits
|
||||
|
||||
# TODO: isn't there a date parsing library for this?
|
||||
|
||||
DATE_REGEX = re.compile(
|
||||
r"(\b|(?!=([_-])))(\d{1,2})[\.\/-](\d{1,2})[\.\/-](\d{4}|\d{2})(\b|(?=([_-])))|"
|
||||
r"(\b|(?!=([_-])))(\d{4}|\d{2})[\.\/-](\d{1,2})[\.\/-](\d{1,2})(\b|(?=([_-])))|"
|
||||
r"(\b|(?!=([_-])))(\d{1,2}[\. ]+[a-zéûäëčžúřěáíóńźçŞğü]{3,9} \d{4}|[a-zéûäëčžúřěáíóńźçŞğü]{3,9} \d{1,2}, \d{4})(\b|(?=([_-])))|"
|
||||
r"(\b|(?!=([_-])))([^\W\d_]{3,9} \d{1,2}, (\d{4}))(\b|(?=([_-])))|"
|
||||
r"(\b|(?!=([_-])))([^\W\d_]{3,9} \d{4})(\b|(?=([_-])))|"
|
||||
r"(\b|(?!=([_-])))(\d{1,2}[^ 0-9]{2}[\. ]+[^ ]{3,9}[ \.\/-]\d{4})(\b|(?=([_-])))|"
|
||||
r"(\b|(?!=([_-])))(\b\d{1,2}[ \.\/-][a-zéûäëčžúřěáíóńźçŞğü]{3}[ \.\/-]\d{4})(\b|(?=([_-])))",
|
||||
re.IGNORECASE,
|
||||
)
|
||||
|
||||
|
||||
logger = logging.getLogger("paperless.parsing")
|
||||
|
||||
|
||||
@lru_cache(maxsize=8)
|
||||
def is_mime_type_supported(mime_type: str) -> bool:
|
||||
"""
|
||||
Returns True if the mime type is supported, False otherwise
|
||||
"""
|
||||
return get_parser_class_for_mime_type(mime_type) is not None
|
||||
return get_parser_registry().get_parser_for_file(mime_type, "") is not None
|
||||
|
||||
|
||||
@lru_cache(maxsize=8)
|
||||
def get_default_file_extension(mime_type: str) -> str:
|
||||
"""
|
||||
Returns the default file extension for a mimetype, or
|
||||
an empty string if it could not be determined
|
||||
"""
|
||||
for response in document_consumer_declaration.send(None):
|
||||
parser_declaration = response[1]
|
||||
supported_mime_types = parser_declaration["mime_types"]
|
||||
|
||||
if mime_type in supported_mime_types:
|
||||
return supported_mime_types[mime_type]
|
||||
parser_class = get_parser_registry().get_parser_for_file(mime_type, "")
|
||||
if parser_class is not None:
|
||||
supported = parser_class.supported_mime_types()
|
||||
if mime_type in supported:
|
||||
return supported[mime_type]
|
||||
|
||||
ext = mimetypes.guess_extension(mime_type)
|
||||
if ext:
|
||||
return ext
|
||||
else:
|
||||
return ""
|
||||
return ext if ext else ""
|
||||
|
||||
|
||||
@lru_cache(maxsize=8)
|
||||
def is_file_ext_supported(ext: str) -> bool:
|
||||
"""
|
||||
Returns True if the file extension is supported, False otherwise
|
||||
@@ -94,44 +57,17 @@ def is_file_ext_supported(ext: str) -> bool:
|
||||
|
||||
def get_supported_file_extensions() -> set[str]:
|
||||
extensions = set()
|
||||
for response in document_consumer_declaration.send(None):
|
||||
parser_declaration = response[1]
|
||||
supported_mime_types = parser_declaration["mime_types"]
|
||||
|
||||
for mime_type in supported_mime_types:
|
||||
for parser_class in get_parser_registry().all_parsers():
|
||||
for mime_type, ext in parser_class.supported_mime_types().items():
|
||||
extensions.update(mimetypes.guess_all_extensions(mime_type))
|
||||
# Python's stdlib might be behind, so also add what the parser
|
||||
# says is the default extension
|
||||
# This makes image/webp supported on Python < 3.11
|
||||
extensions.add(supported_mime_types[mime_type])
|
||||
extensions.add(ext)
|
||||
|
||||
return extensions
|
||||
|
||||
|
||||
def get_parser_class_for_mime_type(mime_type: str) -> type[DocumentParser] | None:
|
||||
"""
|
||||
Returns the best parser (by weight) for the given mimetype or
|
||||
None if no parser exists
|
||||
"""
|
||||
|
||||
options = []
|
||||
|
||||
for response in document_consumer_declaration.send(None):
|
||||
parser_declaration = response[1]
|
||||
supported_mime_types = parser_declaration["mime_types"]
|
||||
|
||||
if mime_type in supported_mime_types:
|
||||
options.append(parser_declaration)
|
||||
|
||||
if not options:
|
||||
return None
|
||||
|
||||
best_parser = sorted(options, key=lambda _: _["weight"], reverse=True)[0]
|
||||
|
||||
# Return the parser with the highest weight.
|
||||
return best_parser["parser"]
|
||||
|
||||
|
||||
def run_convert(
|
||||
input_file,
|
||||
output_file,
|
||||
|
||||
@@ -2411,14 +2411,6 @@ class TasksViewSerializer(OwnedObjectSerializer):
|
||||
return list(duplicates.values("id", "title", "deleted_at"))
|
||||
|
||||
|
||||
class RetryTaskSerializer(serializers.Serializer):
|
||||
clean = serializers.BooleanField(
|
||||
default=False,
|
||||
write_only=True,
|
||||
required=False,
|
||||
)
|
||||
|
||||
|
||||
class RunTaskViewSerializer(serializers.Serializer[dict[str, Any]]):
|
||||
task_name = serializers.ChoiceField(
|
||||
choices=PaperlessTask.TaskName.choices,
|
||||
|
||||
@@ -2,5 +2,4 @@ from django.dispatch import Signal
|
||||
|
||||
document_consumption_started = Signal()
|
||||
document_consumption_finished = Signal()
|
||||
document_consumer_declaration = Signal()
|
||||
document_updated = Signal()
|
||||
|
||||
@@ -631,19 +631,6 @@ def update_filename_and_move_files(
|
||||
)
|
||||
|
||||
|
||||
@receiver(models.signals.post_save, sender=PaperlessTask)
|
||||
def cleanup_failed_documents(sender, instance: PaperlessTask, **kwargs):
|
||||
if instance.status != states.FAILURE or not instance.acknowledged:
|
||||
return
|
||||
|
||||
if instance.task_file_name:
|
||||
try:
|
||||
Path(settings.CONSUMPTION_FAILED_DIR / instance.task_file_name).unlink()
|
||||
logger.debug(f"Cleaned up failed file {instance.task_file_name}")
|
||||
except FileNotFoundError:
|
||||
logger.warning(f"Failed to clean up failed file {instance.task_file_name}")
|
||||
|
||||
|
||||
@shared_task
|
||||
def process_cf_select_update(custom_field: CustomField) -> None:
|
||||
"""
|
||||
|
||||
@@ -37,7 +37,6 @@ from documents.consumer import ConsumerPreflightPlugin
|
||||
from documents.consumer import WorkflowTriggerPlugin
|
||||
from documents.data_models import ConsumableDocument
|
||||
from documents.data_models import DocumentMetadataOverrides
|
||||
from documents.data_models import DocumentSource
|
||||
from documents.double_sided import CollatePlugin
|
||||
from documents.file_handling import create_source_path_directory
|
||||
from documents.file_handling import generate_unique_filename
|
||||
@@ -53,8 +52,6 @@ from documents.models import StoragePath
|
||||
from documents.models import Tag
|
||||
from documents.models import WorkflowRun
|
||||
from documents.models import WorkflowTrigger
|
||||
from documents.parsers import DocumentParser
|
||||
from documents.parsers import get_parser_class_for_mime_type
|
||||
from documents.plugins.base import ConsumeTaskPlugin
|
||||
from documents.plugins.base import ProgressManager
|
||||
from documents.plugins.base import StopConsumeTaskError
|
||||
@@ -64,10 +61,10 @@ from documents.signals import document_updated
|
||||
from documents.signals.handlers import cleanup_document_deletion
|
||||
from documents.signals.handlers import run_workflows
|
||||
from documents.signals.handlers import send_websocket_document_updated
|
||||
from documents.utils import copy_file_with_basic_stats
|
||||
from documents.utils import run_subprocess
|
||||
from documents.workflows.utils import get_workflows_for_trigger
|
||||
from paperless.config import AIConfig
|
||||
from paperless.parsers import ParserContext
|
||||
from paperless.parsers.registry import get_parser_registry
|
||||
from paperless_ai.indexing import llm_index_add_or_update_document
|
||||
from paperless_ai.indexing import llm_index_remove_document
|
||||
from paperless_ai.indexing import update_llm_index
|
||||
@@ -75,6 +72,7 @@ from paperless_ai.indexing import update_llm_index
|
||||
_T = TypeVar("_T")
|
||||
IterWrapper = Callable[[Iterable[_T]], Iterable[_T]]
|
||||
|
||||
|
||||
if settings.AUDIT_LOG_ENABLED:
|
||||
from auditlog.models import LogEntry
|
||||
logger = logging.getLogger("paperless.tasks")
|
||||
@@ -241,48 +239,6 @@ def consume_file(
|
||||
return msg
|
||||
|
||||
|
||||
@shared_task
|
||||
def retry_failed_file(task_id: str, clean: bool = False, skip_ocr: bool = False):
|
||||
task = PaperlessTask.objects.get(task_id=task_id, status=states.FAILURE)
|
||||
if task:
|
||||
failed_file = settings.CONSUMPTION_FAILED_DIR / task.task_file_name
|
||||
if not failed_file.exists():
|
||||
logger.error(f"File {failed_file} not found")
|
||||
raise FileNotFoundError(f"File {failed_file} not found")
|
||||
working_copy = settings.SCRATCH_DIR / failed_file.name
|
||||
copy_file_with_basic_stats(failed_file, working_copy)
|
||||
|
||||
if clean:
|
||||
try:
|
||||
result = run_subprocess(
|
||||
[
|
||||
"qpdf",
|
||||
"--replace-input",
|
||||
"--warning-exit-0",
|
||||
working_copy,
|
||||
],
|
||||
logger=logger,
|
||||
)
|
||||
if result.returncode != 0:
|
||||
raise Exception(
|
||||
f"qpdf failed with exit code {result.returncode}, error: {result.stderr}",
|
||||
)
|
||||
else:
|
||||
logger.debug("PDF cleaned successfully")
|
||||
except Exception as e:
|
||||
logger.error(f"Error while cleaning PDF: {e}")
|
||||
raise e
|
||||
|
||||
task = consume_file.delay(
|
||||
ConsumableDocument(
|
||||
source=DocumentSource.ConsumeFolder,
|
||||
original_file=working_copy,
|
||||
),
|
||||
)
|
||||
|
||||
return task.id
|
||||
|
||||
|
||||
@shared_task
|
||||
def sanity_check(*, scheduled=True, raise_on_error=True):
|
||||
messages = sanity_checker.check_sanity(scheduled=scheduled)
|
||||
@@ -348,7 +304,11 @@ def update_document_content_maybe_archive_file(document_id) -> None:
|
||||
|
||||
mime_type = document.mime_type
|
||||
|
||||
parser_class: type[DocumentParser] = get_parser_class_for_mime_type(mime_type)
|
||||
parser_class = get_parser_registry().get_parser_for_file(
|
||||
mime_type,
|
||||
document.original_filename or "",
|
||||
document.source_path,
|
||||
)
|
||||
|
||||
if not parser_class:
|
||||
logger.error(
|
||||
@@ -357,98 +317,92 @@ def update_document_content_maybe_archive_file(document_id) -> None:
|
||||
)
|
||||
return
|
||||
|
||||
parser: DocumentParser = parser_class(logging_group=uuid.uuid4())
|
||||
with parser_class() as parser:
|
||||
parser.configure(ParserContext())
|
||||
|
||||
try:
|
||||
parser.parse(document.source_path, mime_type, document.get_public_filename())
|
||||
try:
|
||||
parser.parse(document.source_path, mime_type)
|
||||
|
||||
thumbnail = parser.get_thumbnail(
|
||||
document.source_path,
|
||||
mime_type,
|
||||
document.get_public_filename(),
|
||||
)
|
||||
thumbnail = parser.get_thumbnail(document.source_path, mime_type)
|
||||
|
||||
with transaction.atomic():
|
||||
oldDocument = Document.objects.get(pk=document.pk)
|
||||
if parser.get_archive_path():
|
||||
with Path(parser.get_archive_path()).open("rb") as f:
|
||||
checksum = hashlib.md5(f.read()).hexdigest()
|
||||
# I'm going to save first so that in case the file move
|
||||
# fails, the database is rolled back.
|
||||
# We also don't use save() since that triggers the filehandling
|
||||
# logic, and we don't want that yet (file not yet in place)
|
||||
document.archive_filename = generate_unique_filename(
|
||||
document,
|
||||
archive_filename=True,
|
||||
)
|
||||
Document.objects.filter(pk=document.pk).update(
|
||||
archive_checksum=checksum,
|
||||
content=parser.get_text(),
|
||||
archive_filename=document.archive_filename,
|
||||
)
|
||||
newDocument = Document.objects.get(pk=document.pk)
|
||||
if settings.AUDIT_LOG_ENABLED:
|
||||
LogEntry.objects.log_create(
|
||||
instance=oldDocument,
|
||||
changes={
|
||||
"content": [oldDocument.content, newDocument.content],
|
||||
"archive_checksum": [
|
||||
oldDocument.archive_checksum,
|
||||
newDocument.archive_checksum,
|
||||
],
|
||||
"archive_filename": [
|
||||
oldDocument.archive_filename,
|
||||
newDocument.archive_filename,
|
||||
],
|
||||
},
|
||||
additional_data={
|
||||
"reason": "Update document content",
|
||||
},
|
||||
action=LogEntry.Action.UPDATE,
|
||||
)
|
||||
else:
|
||||
Document.objects.filter(pk=document.pk).update(
|
||||
content=parser.get_text(),
|
||||
)
|
||||
|
||||
if settings.AUDIT_LOG_ENABLED:
|
||||
LogEntry.objects.log_create(
|
||||
instance=oldDocument,
|
||||
changes={
|
||||
"content": [oldDocument.content, parser.get_text()],
|
||||
},
|
||||
additional_data={
|
||||
"reason": "Update document content",
|
||||
},
|
||||
action=LogEntry.Action.UPDATE,
|
||||
)
|
||||
|
||||
with FileLock(settings.MEDIA_LOCK):
|
||||
with transaction.atomic():
|
||||
oldDocument = Document.objects.get(pk=document.pk)
|
||||
if parser.get_archive_path():
|
||||
create_source_path_directory(document.archive_path)
|
||||
shutil.move(parser.get_archive_path(), document.archive_path)
|
||||
shutil.move(thumbnail, document.thumbnail_path)
|
||||
with Path(parser.get_archive_path()).open("rb") as f:
|
||||
checksum = hashlib.md5(f.read()).hexdigest()
|
||||
# I'm going to save first so that in case the file move
|
||||
# fails, the database is rolled back.
|
||||
# We also don't use save() since that triggers the filehandling
|
||||
# logic, and we don't want that yet (file not yet in place)
|
||||
document.archive_filename = generate_unique_filename(
|
||||
document,
|
||||
archive_filename=True,
|
||||
)
|
||||
Document.objects.filter(pk=document.pk).update(
|
||||
archive_checksum=checksum,
|
||||
content=parser.get_text(),
|
||||
archive_filename=document.archive_filename,
|
||||
)
|
||||
newDocument = Document.objects.get(pk=document.pk)
|
||||
if settings.AUDIT_LOG_ENABLED:
|
||||
LogEntry.objects.log_create(
|
||||
instance=oldDocument,
|
||||
changes={
|
||||
"content": [oldDocument.content, newDocument.content],
|
||||
"archive_checksum": [
|
||||
oldDocument.archive_checksum,
|
||||
newDocument.archive_checksum,
|
||||
],
|
||||
"archive_filename": [
|
||||
oldDocument.archive_filename,
|
||||
newDocument.archive_filename,
|
||||
],
|
||||
},
|
||||
additional_data={
|
||||
"reason": "Update document content",
|
||||
},
|
||||
action=LogEntry.Action.UPDATE,
|
||||
)
|
||||
else:
|
||||
Document.objects.filter(pk=document.pk).update(
|
||||
content=parser.get_text(),
|
||||
)
|
||||
|
||||
document.refresh_from_db()
|
||||
logger.info(
|
||||
f"Updating index for document {document_id} ({document.archive_checksum})",
|
||||
)
|
||||
with index.open_index_writer() as writer:
|
||||
index.update_document(writer, document)
|
||||
if settings.AUDIT_LOG_ENABLED:
|
||||
LogEntry.objects.log_create(
|
||||
instance=oldDocument,
|
||||
changes={
|
||||
"content": [oldDocument.content, parser.get_text()],
|
||||
},
|
||||
additional_data={
|
||||
"reason": "Update document content",
|
||||
},
|
||||
action=LogEntry.Action.UPDATE,
|
||||
)
|
||||
|
||||
ai_config = AIConfig()
|
||||
if ai_config.llm_index_enabled:
|
||||
llm_index_add_or_update_document(document)
|
||||
with FileLock(settings.MEDIA_LOCK):
|
||||
if parser.get_archive_path():
|
||||
create_source_path_directory(document.archive_path)
|
||||
shutil.move(parser.get_archive_path(), document.archive_path)
|
||||
shutil.move(thumbnail, document.thumbnail_path)
|
||||
|
||||
clear_document_caches(document.pk)
|
||||
document.refresh_from_db()
|
||||
logger.info(
|
||||
f"Updating index for document {document_id} ({document.archive_checksum})",
|
||||
)
|
||||
with index.open_index_writer() as writer:
|
||||
index.update_document(writer, document)
|
||||
|
||||
except Exception:
|
||||
logger.exception(
|
||||
f"Error while parsing document {document} (ID: {document_id})",
|
||||
)
|
||||
finally:
|
||||
# TODO(stumpylog): Cleanup once all parsers are handled
|
||||
parser.cleanup()
|
||||
ai_config = AIConfig()
|
||||
if ai_config.llm_index_enabled:
|
||||
llm_index_add_or_update_document(document)
|
||||
|
||||
clear_document_caches(document.pk)
|
||||
|
||||
except Exception:
|
||||
logger.exception(
|
||||
f"Error while parsing document {document} (ID: {document_id})",
|
||||
)
|
||||
|
||||
|
||||
@shared_task
|
||||
|
||||
@@ -101,13 +101,17 @@ class TestSystemStatus(APITestCase):
|
||||
- The response contains the correct install type
|
||||
"""
|
||||
self.client.force_login(self.user)
|
||||
os.environ["PNGX_CONTAINERIZED"] = "1"
|
||||
response = self.client.get(self.ENDPOINT)
|
||||
self.assertEqual(response.status_code, status.HTTP_200_OK)
|
||||
self.assertEqual(response.data["install_type"], "docker")
|
||||
os.environ["KUBERNETES_SERVICE_HOST"] = "http://localhost"
|
||||
response = self.client.get(self.ENDPOINT)
|
||||
self.assertEqual(response.data["install_type"], "kubernetes")
|
||||
with mock.patch.dict(os.environ, {"PNGX_CONTAINERIZED": "1"}, clear=False):
|
||||
response = self.client.get(self.ENDPOINT)
|
||||
self.assertEqual(response.status_code, status.HTTP_200_OK)
|
||||
self.assertEqual(response.data["install_type"], "docker")
|
||||
with mock.patch.dict(
|
||||
os.environ,
|
||||
{"PNGX_CONTAINERIZED": "1", "KUBERNETES_SERVICE_HOST": "http://localhost"},
|
||||
clear=False,
|
||||
):
|
||||
response = self.client.get(self.ENDPOINT)
|
||||
self.assertEqual(response.data["install_type"], "kubernetes")
|
||||
|
||||
@mock.patch("redis.Redis.execute_command")
|
||||
def test_system_status_redis_ping(self, mock_ping) -> None:
|
||||
|
||||
@@ -13,8 +13,10 @@ class TestDocumentChecks(TestCase):
|
||||
def test_parser_check(self) -> None:
|
||||
self.assertEqual(parser_check(None), [])
|
||||
|
||||
with mock.patch("documents.checks.document_consumer_declaration.send") as m:
|
||||
m.return_value = []
|
||||
with mock.patch("documents.checks.get_parser_registry") as mock_registry_fn:
|
||||
mock_registry = mock.MagicMock()
|
||||
mock_registry.all_parsers.return_value = []
|
||||
mock_registry_fn.return_value = mock_registry
|
||||
|
||||
self.assertEqual(
|
||||
parser_check(None),
|
||||
|
||||
@@ -27,7 +27,6 @@ from documents.models import Document
|
||||
from documents.models import DocumentType
|
||||
from documents.models import StoragePath
|
||||
from documents.models import Tag
|
||||
from documents.parsers import DocumentParser
|
||||
from documents.parsers import ParseError
|
||||
from documents.plugins.helpers import ProgressStatusOptions
|
||||
from documents.tasks import sanity_check
|
||||
@@ -36,65 +35,106 @@ from documents.tests.utils import DummyProgressManager
|
||||
from documents.tests.utils import FileSystemAssertsMixin
|
||||
from documents.tests.utils import GetConsumerMixin
|
||||
from paperless_mail.models import MailRule
|
||||
from paperless_mail.parsers import MailDocumentParser
|
||||
|
||||
|
||||
class _BaseTestParser(DocumentParser):
|
||||
def get_settings(self) -> None:
|
||||
"""
|
||||
This parser does not implement additional settings yet
|
||||
"""
|
||||
class _BaseNewStyleParser:
|
||||
"""Minimal ParserProtocol implementation for use in consumer tests."""
|
||||
|
||||
name: str = "test-parser"
|
||||
version: str = "0.1"
|
||||
author: str = "test"
|
||||
url: str = "test"
|
||||
|
||||
@classmethod
|
||||
def supported_mime_types(cls) -> dict:
|
||||
return {
|
||||
"application/pdf": ".pdf",
|
||||
"image/png": ".png",
|
||||
"message/rfc822": ".eml",
|
||||
}
|
||||
|
||||
@classmethod
|
||||
def score(cls, mime_type: str, filename: str, path=None):
|
||||
return 0 if mime_type in cls.supported_mime_types() else None
|
||||
|
||||
@property
|
||||
def can_produce_archive(self) -> bool:
|
||||
return True
|
||||
|
||||
@property
|
||||
def requires_pdf_rendition(self) -> bool:
|
||||
return False
|
||||
|
||||
def __init__(self) -> None:
|
||||
self._tmpdir: Path | None = None
|
||||
self._text: str | None = None
|
||||
self._archive: Path | None = None
|
||||
self._thumb: Path | None = None
|
||||
|
||||
def __enter__(self):
|
||||
self._tmpdir = Path(
|
||||
tempfile.mkdtemp(prefix="paperless-test-", dir=settings.SCRATCH_DIR),
|
||||
)
|
||||
_, thumb = tempfile.mkstemp(suffix=".webp", dir=self._tmpdir)
|
||||
self._thumb = Path(thumb)
|
||||
return self
|
||||
|
||||
def __exit__(self, exc_type, exc_val, exc_tb) -> None:
|
||||
if self._tmpdir and self._tmpdir.exists():
|
||||
shutil.rmtree(self._tmpdir, ignore_errors=True)
|
||||
|
||||
def configure(self, context) -> None:
|
||||
pass
|
||||
|
||||
def parse(self, document_path, mime_type, *, produce_archive: bool = True) -> None:
|
||||
raise NotImplementedError
|
||||
|
||||
def get_text(self) -> str | None:
|
||||
return self._text
|
||||
|
||||
def get_date(self):
|
||||
return None
|
||||
|
||||
def get_archive_path(self):
|
||||
return self._archive
|
||||
|
||||
class DummyParser(_BaseTestParser):
|
||||
def __init__(self, logging_group, scratch_dir, archive_path) -> None:
|
||||
super().__init__(logging_group, None)
|
||||
_, self.fake_thumb = tempfile.mkstemp(suffix=".webp", dir=scratch_dir)
|
||||
self.archive_path = archive_path
|
||||
def get_thumbnail(self, document_path, mime_type) -> Path:
|
||||
return self._thumb
|
||||
|
||||
def get_thumbnail(self, document_path, mime_type, file_name=None):
|
||||
return self.fake_thumb
|
||||
def get_page_count(self, document_path, mime_type):
|
||||
return None
|
||||
|
||||
def parse(self, document_path, mime_type, file_name=None) -> None:
|
||||
self.text = "The Text"
|
||||
def extract_metadata(self, document_path, mime_type) -> list:
|
||||
return []
|
||||
|
||||
|
||||
class CopyParser(_BaseTestParser):
|
||||
def get_thumbnail(self, document_path, mime_type, file_name=None):
|
||||
return self.fake_thumb
|
||||
class DummyParser(_BaseNewStyleParser):
|
||||
_ARCHIVE_SRC = (
|
||||
Path(__file__).parent / "samples" / "documents" / "archive" / "0000001.pdf"
|
||||
)
|
||||
|
||||
def __init__(self, logging_group, progress_callback=None) -> None:
|
||||
super().__init__(logging_group, progress_callback)
|
||||
_, self.fake_thumb = tempfile.mkstemp(suffix=".webp", dir=self.tempdir)
|
||||
|
||||
def parse(self, document_path, mime_type, file_name=None) -> None:
|
||||
self.text = "The text"
|
||||
self.archive_path = Path(self.tempdir / "archive.pdf")
|
||||
shutil.copy(document_path, self.archive_path)
|
||||
def parse(self, document_path, mime_type, *, produce_archive: bool = True) -> None:
|
||||
self._text = "The Text"
|
||||
if produce_archive and self._tmpdir:
|
||||
self._archive = self._tmpdir / "archive.pdf"
|
||||
shutil.copy(self._ARCHIVE_SRC, self._archive)
|
||||
|
||||
|
||||
class FaultyParser(_BaseTestParser):
|
||||
def __init__(self, logging_group, scratch_dir) -> None:
|
||||
super().__init__(logging_group)
|
||||
_, self.fake_thumb = tempfile.mkstemp(suffix=".webp", dir=scratch_dir)
|
||||
class CopyParser(_BaseNewStyleParser):
|
||||
def parse(self, document_path, mime_type, *, produce_archive: bool = True) -> None:
|
||||
self._text = "The text"
|
||||
if produce_archive and self._tmpdir:
|
||||
self._archive = self._tmpdir / "archive.pdf"
|
||||
shutil.copy(document_path, self._archive)
|
||||
|
||||
def get_thumbnail(self, document_path, mime_type, file_name=None):
|
||||
return self.fake_thumb
|
||||
|
||||
def parse(self, document_path, mime_type, file_name=None):
|
||||
class FaultyParser(_BaseNewStyleParser):
|
||||
def parse(self, document_path, mime_type, *, produce_archive: bool = True) -> None:
|
||||
raise ParseError("Does not compute.")
|
||||
|
||||
|
||||
class FaultyGenericExceptionParser(_BaseTestParser):
|
||||
def __init__(self, logging_group, scratch_dir) -> None:
|
||||
super().__init__(logging_group)
|
||||
_, self.fake_thumb = tempfile.mkstemp(suffix=".webp", dir=scratch_dir)
|
||||
|
||||
def get_thumbnail(self, document_path, mime_type, file_name=None):
|
||||
return self.fake_thumb
|
||||
|
||||
def parse(self, document_path, mime_type, file_name=None):
|
||||
class FaultyGenericExceptionParser(_BaseNewStyleParser):
|
||||
def parse(self, document_path, mime_type, *, produce_archive: bool = True) -> None:
|
||||
raise Exception("Generic exception.")
|
||||
|
||||
|
||||
@@ -148,38 +188,12 @@ class TestConsumer(
|
||||
self.assertEqual(payload["data"]["max_progress"], last_progress_max)
|
||||
self.assertEqual(payload["data"]["status"], last_status)
|
||||
|
||||
def make_dummy_parser(self, logging_group, progress_callback=None):
|
||||
return DummyParser(
|
||||
logging_group,
|
||||
self.dirs.scratch_dir,
|
||||
self.get_test_archive_file(),
|
||||
)
|
||||
|
||||
def make_faulty_parser(self, logging_group, progress_callback=None):
|
||||
return FaultyParser(logging_group, self.dirs.scratch_dir)
|
||||
|
||||
def make_faulty_generic_exception_parser(
|
||||
self,
|
||||
logging_group,
|
||||
progress_callback=None,
|
||||
):
|
||||
return FaultyGenericExceptionParser(logging_group, self.dirs.scratch_dir)
|
||||
|
||||
def setUp(self) -> None:
|
||||
super().setUp()
|
||||
|
||||
patcher = mock.patch("documents.parsers.document_consumer_declaration.send")
|
||||
m = patcher.start()
|
||||
m.return_value = [
|
||||
(
|
||||
None,
|
||||
{
|
||||
"parser": self.make_dummy_parser,
|
||||
"mime_types": {"application/pdf": ".pdf"},
|
||||
"weight": 0,
|
||||
},
|
||||
),
|
||||
]
|
||||
patcher = mock.patch("documents.consumer.get_parser_registry")
|
||||
mock_registry = patcher.start()
|
||||
mock_registry.return_value.get_parser_for_file.return_value = DummyParser
|
||||
self.addCleanup(patcher.stop)
|
||||
|
||||
def get_test_file(self):
|
||||
@@ -548,9 +562,9 @@ class TestConsumer(
|
||||
) as consumer:
|
||||
consumer.run()
|
||||
|
||||
@mock.patch("documents.parsers.document_consumer_declaration.send")
|
||||
@mock.patch("documents.consumer.get_parser_registry")
|
||||
def testNoParsers(self, m) -> None:
|
||||
m.return_value = []
|
||||
m.return_value.get_parser_for_file.return_value = None
|
||||
|
||||
with self.assertRaisesMessage(
|
||||
ConsumerError,
|
||||
@@ -561,18 +575,9 @@ class TestConsumer(
|
||||
|
||||
self._assert_first_last_send_progress(last_status="FAILED")
|
||||
|
||||
@mock.patch("documents.parsers.document_consumer_declaration.send")
|
||||
@mock.patch("documents.consumer.get_parser_registry")
|
||||
def testFaultyParser(self, m) -> None:
|
||||
m.return_value = [
|
||||
(
|
||||
None,
|
||||
{
|
||||
"parser": self.make_faulty_parser,
|
||||
"mime_types": {"application/pdf": ".pdf"},
|
||||
"weight": 0,
|
||||
},
|
||||
),
|
||||
]
|
||||
m.return_value.get_parser_for_file.return_value = FaultyParser
|
||||
|
||||
with self.get_consumer(self.get_test_file()) as consumer:
|
||||
with self.assertRaisesMessage(
|
||||
@@ -583,18 +588,9 @@ class TestConsumer(
|
||||
|
||||
self._assert_first_last_send_progress(last_status="FAILED")
|
||||
|
||||
@mock.patch("documents.parsers.document_consumer_declaration.send")
|
||||
@mock.patch("documents.consumer.get_parser_registry")
|
||||
def testGenericParserException(self, m) -> None:
|
||||
m.return_value = [
|
||||
(
|
||||
None,
|
||||
{
|
||||
"parser": self.make_faulty_generic_exception_parser,
|
||||
"mime_types": {"application/pdf": ".pdf"},
|
||||
"weight": 0,
|
||||
},
|
||||
),
|
||||
]
|
||||
m.return_value.get_parser_for_file.return_value = FaultyGenericExceptionParser
|
||||
|
||||
with self.get_consumer(self.get_test_file()) as consumer:
|
||||
with self.assertRaisesMessage(
|
||||
@@ -1018,7 +1014,7 @@ class TestConsumer(
|
||||
self._assert_first_last_send_progress()
|
||||
|
||||
@override_settings(FILENAME_FORMAT="{title}")
|
||||
@mock.patch("documents.parsers.document_consumer_declaration.send")
|
||||
@mock.patch("documents.consumer.get_parser_registry")
|
||||
def test_similar_filenames(self, m) -> None:
|
||||
shutil.copy(
|
||||
Path(__file__).parent / "samples" / "simple.pdf",
|
||||
@@ -1032,16 +1028,7 @@ class TestConsumer(
|
||||
Path(__file__).parent / "samples" / "simple-noalpha.png",
|
||||
settings.CONSUMPTION_DIR / "simple.png.pdf",
|
||||
)
|
||||
m.return_value = [
|
||||
(
|
||||
None,
|
||||
{
|
||||
"parser": CopyParser,
|
||||
"mime_types": {"application/pdf": ".pdf", "image/png": ".png"},
|
||||
"weight": 0,
|
||||
},
|
||||
),
|
||||
]
|
||||
m.return_value.get_parser_for_file.return_value = CopyParser
|
||||
|
||||
with self.get_consumer(settings.CONSUMPTION_DIR / "simple.png") as consumer:
|
||||
consumer.run()
|
||||
@@ -1069,8 +1056,10 @@ class TestConsumer(
|
||||
|
||||
sanity_check()
|
||||
|
||||
@mock.patch("documents.consumer.get_parser_registry")
|
||||
@mock.patch("documents.consumer.run_subprocess")
|
||||
def test_try_to_clean_invalid_pdf(self, m) -> None:
|
||||
def test_try_to_clean_invalid_pdf(self, m, mock_registry) -> None:
|
||||
mock_registry.return_value.get_parser_for_file.return_value = None
|
||||
shutil.copy(
|
||||
Path(__file__).parent / "samples" / "invalid_pdf.pdf",
|
||||
settings.CONSUMPTION_DIR / "invalid_pdf.pdf",
|
||||
@@ -1091,11 +1080,11 @@ class TestConsumer(
|
||||
self.assertEqual(command[1], "--replace-input")
|
||||
|
||||
@mock.patch("paperless_mail.models.MailRule.objects.get")
|
||||
@mock.patch("paperless_mail.parsers.MailDocumentParser.parse")
|
||||
@mock.patch("documents.parsers.document_consumer_declaration.send")
|
||||
@mock.patch("paperless.parsers.mail.MailDocumentParser.parse")
|
||||
@mock.patch("documents.consumer.get_parser_registry")
|
||||
def test_mail_parser_receives_mailrule(
|
||||
self,
|
||||
mock_consumer_declaration_send: mock.Mock,
|
||||
mock_get_parser_registry: mock.Mock,
|
||||
mock_mail_parser_parse: mock.Mock,
|
||||
mock_mailrule_get: mock.Mock,
|
||||
) -> None:
|
||||
@@ -1107,25 +1096,21 @@ class TestConsumer(
|
||||
THEN:
|
||||
- The mail parser should receive the mail rule
|
||||
"""
|
||||
mock_consumer_declaration_send.return_value = [
|
||||
(
|
||||
None,
|
||||
{
|
||||
"parser": MailDocumentParser,
|
||||
"mime_types": {"message/rfc822": ".eml"},
|
||||
"weight": 0,
|
||||
},
|
||||
),
|
||||
]
|
||||
from paperless.parsers.mail import MailDocumentParser
|
||||
|
||||
mock_get_parser_registry.return_value.get_parser_for_file.return_value = (
|
||||
MailDocumentParser
|
||||
)
|
||||
mock_mailrule_get.return_value = mock.Mock(
|
||||
pdf_layout=MailRule.PdfLayout.HTML_ONLY,
|
||||
)
|
||||
with self.get_consumer(
|
||||
filepath=(
|
||||
Path(__file__).parent.parent.parent
|
||||
/ Path("paperless_mail")
|
||||
/ Path("paperless")
|
||||
/ Path("tests")
|
||||
/ Path("samples")
|
||||
/ Path("mail")
|
||||
).resolve()
|
||||
/ "html.eml",
|
||||
source=DocumentSource.MailFetch,
|
||||
@@ -1136,12 +1121,10 @@ class TestConsumer(
|
||||
ConsumerError,
|
||||
):
|
||||
consumer.run()
|
||||
mock_mail_parser_parse.assert_called_once_with(
|
||||
consumer.working_copy,
|
||||
"message/rfc822",
|
||||
file_name="sample.pdf",
|
||||
mailrule=mock_mailrule_get.return_value,
|
||||
)
|
||||
mock_mail_parser_parse.assert_called_once_with(
|
||||
consumer.working_copy,
|
||||
"message/rfc822",
|
||||
)
|
||||
|
||||
|
||||
@mock.patch("documents.consumer.magic.from_file", fake_magic_from_file)
|
||||
|
||||
@@ -1,130 +1,14 @@
|
||||
from tempfile import TemporaryDirectory
|
||||
from unittest import mock
|
||||
|
||||
from django.apps import apps
|
||||
from django.test import TestCase
|
||||
from django.test import override_settings
|
||||
|
||||
from documents.parsers import get_default_file_extension
|
||||
from documents.parsers import get_parser_class_for_mime_type
|
||||
from documents.parsers import get_supported_file_extensions
|
||||
from documents.parsers import is_file_ext_supported
|
||||
from paperless.parsers.registry import get_parser_registry
|
||||
from paperless.parsers.registry import reset_parser_registry
|
||||
from paperless.parsers.tesseract import RasterisedDocumentParser
|
||||
from paperless.parsers.text import TextDocumentParser
|
||||
from paperless_tesseract.parsers import RasterisedDocumentParser
|
||||
from paperless_tika.parsers import TikaDocumentParser
|
||||
|
||||
|
||||
class TestParserDiscovery(TestCase):
|
||||
@mock.patch("documents.parsers.document_consumer_declaration.send")
|
||||
def test_get_parser_class_1_parser(self, m, *args) -> None:
|
||||
"""
|
||||
GIVEN:
|
||||
- Parser declared for a given mimetype
|
||||
WHEN:
|
||||
- Attempt to get parser for the mimetype
|
||||
THEN:
|
||||
- Declared parser class is returned
|
||||
"""
|
||||
|
||||
class DummyParser:
|
||||
pass
|
||||
|
||||
m.return_value = (
|
||||
(
|
||||
None,
|
||||
{
|
||||
"weight": 0,
|
||||
"parser": DummyParser,
|
||||
"mime_types": {"application/pdf": ".pdf"},
|
||||
},
|
||||
),
|
||||
)
|
||||
|
||||
self.assertEqual(get_parser_class_for_mime_type("application/pdf"), DummyParser)
|
||||
|
||||
@mock.patch("documents.parsers.document_consumer_declaration.send")
|
||||
def test_get_parser_class_n_parsers(self, m, *args) -> None:
|
||||
"""
|
||||
GIVEN:
|
||||
- Two parsers declared for a given mimetype
|
||||
- Second parser has a higher weight
|
||||
WHEN:
|
||||
- Attempt to get parser for the mimetype
|
||||
THEN:
|
||||
- Second parser class is returned
|
||||
"""
|
||||
|
||||
class DummyParser1:
|
||||
pass
|
||||
|
||||
class DummyParser2:
|
||||
pass
|
||||
|
||||
m.return_value = (
|
||||
(
|
||||
None,
|
||||
{
|
||||
"weight": 0,
|
||||
"parser": DummyParser1,
|
||||
"mime_types": {"application/pdf": ".pdf"},
|
||||
},
|
||||
),
|
||||
(
|
||||
None,
|
||||
{
|
||||
"weight": 1,
|
||||
"parser": DummyParser2,
|
||||
"mime_types": {"application/pdf": ".pdf"},
|
||||
},
|
||||
),
|
||||
)
|
||||
|
||||
self.assertEqual(
|
||||
get_parser_class_for_mime_type("application/pdf"),
|
||||
DummyParser2,
|
||||
)
|
||||
|
||||
@mock.patch("documents.parsers.document_consumer_declaration.send")
|
||||
def test_get_parser_class_0_parsers(self, m, *args) -> None:
|
||||
"""
|
||||
GIVEN:
|
||||
- No parsers are declared
|
||||
WHEN:
|
||||
- Attempt to get parser for the mimetype
|
||||
THEN:
|
||||
- No parser class is returned
|
||||
"""
|
||||
m.return_value = []
|
||||
with TemporaryDirectory():
|
||||
self.assertIsNone(get_parser_class_for_mime_type("application/pdf"))
|
||||
|
||||
@mock.patch("documents.parsers.document_consumer_declaration.send")
|
||||
def test_get_parser_class_no_valid_parser(self, m, *args) -> None:
|
||||
"""
|
||||
GIVEN:
|
||||
- No parser declared for a given mimetype
|
||||
- Parser declared for a different mimetype
|
||||
WHEN:
|
||||
- Attempt to get parser for the given mimetype
|
||||
THEN:
|
||||
- No parser class is returned
|
||||
"""
|
||||
|
||||
class DummyParser:
|
||||
pass
|
||||
|
||||
m.return_value = (
|
||||
(
|
||||
None,
|
||||
{
|
||||
"weight": 0,
|
||||
"parser": DummyParser,
|
||||
"mime_types": {"application/pdf": ".pdf"},
|
||||
},
|
||||
),
|
||||
)
|
||||
|
||||
self.assertIsNone(get_parser_class_for_mime_type("image/tiff"))
|
||||
from paperless.parsers.tika import TikaDocumentParser
|
||||
|
||||
|
||||
class TestParserAvailability(TestCase):
|
||||
@@ -151,7 +35,7 @@ class TestParserAvailability(TestCase):
|
||||
self.assertIn(ext, supported_exts)
|
||||
self.assertEqual(get_default_file_extension(mime_type), ext)
|
||||
self.assertIsInstance(
|
||||
get_parser_class_for_mime_type(mime_type)(logging_group=None),
|
||||
get_parser_registry().get_parser_for_file(mime_type, "")(),
|
||||
RasterisedDocumentParser,
|
||||
)
|
||||
|
||||
@@ -175,7 +59,7 @@ class TestParserAvailability(TestCase):
|
||||
self.assertIn(ext, supported_exts)
|
||||
self.assertEqual(get_default_file_extension(mime_type), ext)
|
||||
self.assertIsInstance(
|
||||
get_parser_class_for_mime_type(mime_type)(logging_group=None),
|
||||
get_parser_registry().get_parser_for_file(mime_type, "")(),
|
||||
TextDocumentParser,
|
||||
)
|
||||
|
||||
@@ -198,22 +82,23 @@ class TestParserAvailability(TestCase):
|
||||
),
|
||||
]
|
||||
|
||||
# Force the app ready to notice the settings override
|
||||
with override_settings(TIKA_ENABLED=True, INSTALLED_APPS=["paperless_tika"]):
|
||||
app = apps.get_app_config("paperless_tika")
|
||||
app.ready()
|
||||
self.addCleanup(reset_parser_registry)
|
||||
|
||||
# Reset and rebuild the registry with Tika enabled.
|
||||
with override_settings(TIKA_ENABLED=True):
|
||||
reset_parser_registry()
|
||||
supported_exts = get_supported_file_extensions()
|
||||
|
||||
for mime_type, ext in supported_mimes_and_exts:
|
||||
self.assertIn(ext, supported_exts)
|
||||
self.assertEqual(get_default_file_extension(mime_type), ext)
|
||||
self.assertIsInstance(
|
||||
get_parser_class_for_mime_type(mime_type)(logging_group=None),
|
||||
TikaDocumentParser,
|
||||
)
|
||||
for mime_type, ext in supported_mimes_and_exts:
|
||||
self.assertIn(ext, supported_exts)
|
||||
self.assertEqual(get_default_file_extension(mime_type), ext)
|
||||
self.assertIsInstance(
|
||||
get_parser_registry().get_parser_for_file(mime_type, "")(),
|
||||
TikaDocumentParser,
|
||||
)
|
||||
|
||||
def test_no_parser_for_mime(self) -> None:
|
||||
self.assertIsNone(get_parser_class_for_mime_type("text/sdgsdf"))
|
||||
self.assertIsNone(get_parser_registry().get_parser_for_file("text/sdgsdf", ""))
|
||||
|
||||
def test_default_extension(self) -> None:
|
||||
# Test no parser declared still returns a an extension
|
||||
|
||||
@@ -1,5 +1,4 @@
|
||||
import shutil
|
||||
import uuid
|
||||
from datetime import timedelta
|
||||
from pathlib import Path
|
||||
from unittest import mock
|
||||
@@ -22,7 +21,6 @@ from documents.sanity_checker import SanityCheckMessages
|
||||
from documents.tests.test_classifier import dummy_preprocess
|
||||
from documents.tests.utils import DirectoriesMixin
|
||||
from documents.tests.utils import FileSystemAssertsMixin
|
||||
from documents.tests.utils import SampleDirMixin
|
||||
|
||||
|
||||
class TestIndexReindex(DirectoriesMixin, TestCase):
|
||||
@@ -234,44 +232,6 @@ class TestEmptyTrashTask(DirectoriesMixin, FileSystemAssertsMixin, TestCase):
|
||||
self.assertEqual(Document.global_objects.count(), 0)
|
||||
|
||||
|
||||
class TestRetryConsumeTask(
|
||||
DirectoriesMixin,
|
||||
SampleDirMixin,
|
||||
FileSystemAssertsMixin,
|
||||
TestCase,
|
||||
):
|
||||
def do_failed_task(self, test_file: Path) -> PaperlessTask:
|
||||
failed_file = settings.CONSUMPTION_FAILED_DIR / test_file.name
|
||||
shutil.copy(test_file, failed_file)
|
||||
|
||||
task = PaperlessTask.objects.create(
|
||||
type=PaperlessTask.TaskType.AUTO,
|
||||
task_id=str(uuid.uuid4()),
|
||||
task_file_name=failed_file.name,
|
||||
task_name=PaperlessTask.TaskName.CONSUME_FILE,
|
||||
status=states.FAILURE,
|
||||
date_created=timezone.now(),
|
||||
date_done=timezone.now(),
|
||||
)
|
||||
self.assertIsFile(settings.CONSUMPTION_FAILED_DIR / task.task_file_name)
|
||||
return task
|
||||
|
||||
@mock.patch("documents.tasks.consume_file.delay")
|
||||
@mock.patch("documents.tasks.run_subprocess")
|
||||
def test_retry_consume_clean(self, m_subprocess, m_consume_file) -> None:
|
||||
task = self.do_failed_task(self.SAMPLE_DIR / "corrupted.pdf")
|
||||
m_subprocess.return_value.returncode = 0
|
||||
task_id = tasks.retry_failed_file(task_id=task.task_id, clean=True)
|
||||
self.assertIsNotNone(task_id)
|
||||
m_consume_file.assert_called_once()
|
||||
|
||||
def test_cleanup(self) -> None:
|
||||
task = self.do_failed_task(self.SAMPLE_DIR / "corrupted.pdf")
|
||||
task.acknowledged = True
|
||||
task.save()
|
||||
self.assertIsNotFile(settings.CONSUMPTION_FAILED_DIR / task.task_file_name)
|
||||
|
||||
|
||||
class TestUpdateContent(DirectoriesMixin, TestCase):
|
||||
def test_update_content_maybe_archive_file(self) -> None:
|
||||
"""
|
||||
|
||||
@@ -37,7 +37,6 @@ def setup_directories():
|
||||
dirs.scratch_dir = Path(tempfile.mkdtemp()).resolve()
|
||||
dirs.media_dir = Path(tempfile.mkdtemp()).resolve()
|
||||
dirs.consumption_dir = Path(tempfile.mkdtemp()).resolve()
|
||||
dirs.consumption_failed_dir = Path(tempfile.mkdtemp("failed")).resolve()
|
||||
dirs.static_dir = Path(tempfile.mkdtemp()).resolve()
|
||||
dirs.index_dir = dirs.data_dir / "index"
|
||||
dirs.originals_dir = dirs.media_dir / "documents" / "originals"
|
||||
@@ -59,7 +58,6 @@ def setup_directories():
|
||||
THUMBNAIL_DIR=dirs.thumbnail_dir,
|
||||
ARCHIVE_DIR=dirs.archive_dir,
|
||||
CONSUMPTION_DIR=dirs.consumption_dir,
|
||||
CONSUMPTION_FAILED_DIR=dirs.consumption_failed_dir,
|
||||
LOGGING_DIR=dirs.logging_dir,
|
||||
INDEX_DIR=dirs.index_dir,
|
||||
STATIC_ROOT=dirs.static_dir,
|
||||
@@ -76,7 +74,6 @@ def remove_dirs(dirs) -> None:
|
||||
shutil.rmtree(dirs.data_dir, ignore_errors=True)
|
||||
shutil.rmtree(dirs.scratch_dir, ignore_errors=True)
|
||||
shutil.rmtree(dirs.consumption_dir, ignore_errors=True)
|
||||
shutil.rmtree(dirs.consumption_failed_dir, ignore_errors=True)
|
||||
shutil.rmtree(dirs.static_dir, ignore_errors=True)
|
||||
dirs.settings_override.disable()
|
||||
|
||||
|
||||
@@ -157,7 +157,6 @@ from documents.models import UiSettings
|
||||
from documents.models import Workflow
|
||||
from documents.models import WorkflowAction
|
||||
from documents.models import WorkflowTrigger
|
||||
from documents.parsers import get_parser_class_for_mime_type
|
||||
from documents.permissions import AcknowledgeTasksPermissions
|
||||
from documents.permissions import PaperlessAdminPermissions
|
||||
from documents.permissions import PaperlessNotePermissions
|
||||
@@ -189,7 +188,6 @@ from documents.serialisers import NotesSerializer
|
||||
from documents.serialisers import PostDocumentSerializer
|
||||
from documents.serialisers import RemovePasswordDocumentsSerializer
|
||||
from documents.serialisers import ReprocessDocumentsSerializer
|
||||
from documents.serialisers import RetryTaskSerializer
|
||||
from documents.serialisers import RotateDocumentsSerializer
|
||||
from documents.serialisers import RunTaskViewSerializer
|
||||
from documents.serialisers import SavedViewSerializer
|
||||
@@ -212,7 +210,6 @@ from documents.tasks import consume_file
|
||||
from documents.tasks import empty_trash
|
||||
from documents.tasks import index_optimize
|
||||
from documents.tasks import llmindex_index
|
||||
from documents.tasks import retry_failed_file
|
||||
from documents.tasks import sanity_check
|
||||
from documents.tasks import train_classifier
|
||||
from documents.tasks import update_document_parent_tags
|
||||
@@ -227,6 +224,7 @@ from paperless.celery import app as celery_app
|
||||
from paperless.config import AIConfig
|
||||
from paperless.config import GeneralConfig
|
||||
from paperless.models import ApplicationConfiguration
|
||||
from paperless.parsers.registry import get_parser_registry
|
||||
from paperless.serialisers import GroupSerializer
|
||||
from paperless.serialisers import UserSerializer
|
||||
from paperless.views import StandardPagination
|
||||
@@ -1083,15 +1081,17 @@ class DocumentViewSet(
|
||||
if not Path(file).is_file():
|
||||
return None
|
||||
|
||||
parser_class = get_parser_class_for_mime_type(mime_type)
|
||||
parser_class = get_parser_registry().get_parser_for_file(
|
||||
mime_type,
|
||||
Path(file).name,
|
||||
Path(file),
|
||||
)
|
||||
if parser_class:
|
||||
parser = parser_class(progress_callback=None, logging_group=None)
|
||||
|
||||
try:
|
||||
return parser.extract_metadata(file, mime_type)
|
||||
with parser_class() as parser:
|
||||
return parser.extract_metadata(file, mime_type)
|
||||
except Exception: # pragma: no cover
|
||||
logger.exception(f"Issue getting metadata for {file}")
|
||||
# TODO: cover GPG errors, remove later.
|
||||
return []
|
||||
else: # pragma: no cover
|
||||
logger.warning(f"No parser for {mime_type}")
|
||||
@@ -3469,25 +3469,6 @@ class TasksViewSet(ReadOnlyModelViewSet):
|
||||
queryset = PaperlessTask.objects.filter(task_id=task_id)
|
||||
return queryset
|
||||
|
||||
@action(methods=["post"], detail=True)
|
||||
def retry(self, request, pk=None):
|
||||
task = self.get_object()
|
||||
|
||||
serializer = RetryTaskSerializer(data=request.data)
|
||||
serializer.is_valid(raise_exception=True)
|
||||
clean = serializer.validated_data.get("clean")
|
||||
|
||||
try:
|
||||
new_task_id = retry_failed_file(task.task_id, clean)
|
||||
return Response({"task_id": new_task_id})
|
||||
except FileNotFoundError:
|
||||
return HttpResponseBadRequest("Original file not found")
|
||||
except Exception as e:
|
||||
logger.warning(f"An error occurred retrying task: {e!s}")
|
||||
return HttpResponseBadRequest(
|
||||
"Error retrying task, check logs for more detail.",
|
||||
)
|
||||
|
||||
@action(
|
||||
methods=["post"],
|
||||
detail=False,
|
||||
|
||||
@@ -2,7 +2,7 @@ msgid ""
|
||||
msgstr ""
|
||||
"Project-Id-Version: paperless-ngx\n"
|
||||
"Report-Msgid-Bugs-To: \n"
|
||||
"POT-Creation-Date: 2026-03-12 15:43+0000\n"
|
||||
"POT-Creation-Date: 2026-03-17 22:44+0000\n"
|
||||
"PO-Revision-Date: 2022-02-17 04:17\n"
|
||||
"Last-Translator: \n"
|
||||
"Language-Team: English\n"
|
||||
@@ -1339,7 +1339,7 @@ msgstr ""
|
||||
msgid "Duplicate document identifiers are not allowed."
|
||||
msgstr ""
|
||||
|
||||
#: documents/serialisers.py:2556 documents/views.py:3565
|
||||
#: documents/serialisers.py:2556 documents/views.py:3569
|
||||
#, python-format
|
||||
msgid "Documents not found: %(ids)s"
|
||||
msgstr ""
|
||||
@@ -1603,20 +1603,20 @@ msgstr ""
|
||||
msgid "Unable to parse URI {value}"
|
||||
msgstr ""
|
||||
|
||||
#: documents/views.py:3577
|
||||
#: documents/views.py:3581
|
||||
#, python-format
|
||||
msgid "Insufficient permissions to share document %(id)s."
|
||||
msgstr ""
|
||||
|
||||
#: documents/views.py:3620
|
||||
#: documents/views.py:3624
|
||||
msgid "Bundle is already being processed."
|
||||
msgstr ""
|
||||
|
||||
#: documents/views.py:3677
|
||||
#: documents/views.py:3681
|
||||
msgid "The share link bundle is still being prepared. Please try again later."
|
||||
msgstr ""
|
||||
|
||||
#: documents/views.py:3687
|
||||
#: documents/views.py:3691
|
||||
msgid "The share link bundle is unavailable."
|
||||
msgstr ""
|
||||
|
||||
|
||||
@@ -3,6 +3,7 @@ import os
|
||||
import pwd
|
||||
import shutil
|
||||
import stat
|
||||
import subprocess
|
||||
from pathlib import Path
|
||||
|
||||
from django.conf import settings
|
||||
@@ -68,10 +69,6 @@ def paths_check(app_configs, **kwargs) -> list[Error]:
|
||||
+ path_check("PAPERLESS_EMPTY_TRASH_DIR", settings.EMPTY_TRASH_DIR)
|
||||
+ path_check("PAPERLESS_MEDIA_ROOT", settings.MEDIA_ROOT)
|
||||
+ path_check("PAPERLESS_CONSUMPTION_DIR", settings.CONSUMPTION_DIR)
|
||||
+ path_check(
|
||||
"PAPERLESS_CONSUMPTION_FAILED_DIR",
|
||||
settings.CONSUMPTION_FAILED_DIR,
|
||||
)
|
||||
)
|
||||
|
||||
|
||||
@@ -303,3 +300,62 @@ def check_deprecated_db_settings(
|
||||
)
|
||||
|
||||
return warnings
|
||||
|
||||
|
||||
@register()
|
||||
def check_remote_parser_configured(app_configs, **kwargs) -> list[Error]:
|
||||
if settings.REMOTE_OCR_ENGINE == "azureai" and not (
|
||||
settings.REMOTE_OCR_ENDPOINT and settings.REMOTE_OCR_API_KEY
|
||||
):
|
||||
return [
|
||||
Error(
|
||||
"Azure AI remote parser requires endpoint and API key to be configured.",
|
||||
),
|
||||
]
|
||||
|
||||
return []
|
||||
|
||||
|
||||
def get_tesseract_langs():
|
||||
proc = subprocess.run(
|
||||
[shutil.which("tesseract"), "--list-langs"],
|
||||
capture_output=True,
|
||||
)
|
||||
|
||||
# Decode bytes to string, split on newlines, trim out the header
|
||||
proc_lines = proc.stdout.decode("utf8", errors="ignore").strip().split("\n")[1:]
|
||||
|
||||
return [x.strip() for x in proc_lines]
|
||||
|
||||
|
||||
@register()
|
||||
def check_default_language_available(app_configs, **kwargs):
|
||||
errs = []
|
||||
|
||||
if not settings.OCR_LANGUAGE:
|
||||
errs.append(
|
||||
Warning(
|
||||
"No OCR language has been specified with PAPERLESS_OCR_LANGUAGE. "
|
||||
"This means that tesseract will fallback to english.",
|
||||
),
|
||||
)
|
||||
return errs
|
||||
|
||||
# binaries_check in paperless will check and report if this doesn't exist
|
||||
# So skip trying to do anything here and let that handle missing binaries
|
||||
if shutil.which("tesseract") is not None:
|
||||
installed_langs = get_tesseract_langs()
|
||||
|
||||
specified_langs = [x.strip() for x in settings.OCR_LANGUAGE.split("+")]
|
||||
|
||||
for lang in specified_langs:
|
||||
if lang not in installed_langs:
|
||||
errs.append(
|
||||
Error(
|
||||
f"The selected ocr language {lang} is "
|
||||
f"not installed. Paperless cannot OCR your documents "
|
||||
f"without it. Please fix PAPERLESS_OCR_LANGUAGE.",
|
||||
),
|
||||
)
|
||||
|
||||
return errs
|
||||
|
||||
@@ -35,6 +35,7 @@ Usage example (third-party parser)::
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
from dataclasses import dataclass
|
||||
from typing import TYPE_CHECKING
|
||||
from typing import Protocol
|
||||
from typing import Self
|
||||
@@ -48,6 +49,7 @@ if TYPE_CHECKING:
|
||||
|
||||
__all__ = [
|
||||
"MetadataEntry",
|
||||
"ParserContext",
|
||||
"ParserProtocol",
|
||||
]
|
||||
|
||||
@@ -73,6 +75,44 @@ class MetadataEntry(TypedDict):
|
||||
"""String representation of the field value."""
|
||||
|
||||
|
||||
@dataclass(frozen=True, slots=True)
|
||||
class ParserContext:
|
||||
"""Immutable context passed to a parser before parse().
|
||||
|
||||
The consumer assembles this from the ingestion event and Django
|
||||
settings, then calls ``parser.configure(context)`` before
|
||||
``parser.parse()``. Parsers read only the fields relevant to them;
|
||||
unneeded fields are ignored.
|
||||
|
||||
``frozen=True`` prevents accidental mutation after the consumer
|
||||
hands the context off. ``slots=True`` keeps instances lightweight.
|
||||
|
||||
Fields
|
||||
------
|
||||
mailrule_id : int | None
|
||||
Primary key of the ``MailRule`` that triggered this ingestion,
|
||||
or ``None`` when the document did not arrive via a mail rule.
|
||||
Used by ``MailDocumentParser`` to select the PDF layout.
|
||||
|
||||
Notes
|
||||
-----
|
||||
Future fields (not yet implemented):
|
||||
|
||||
* ``output_type`` — PDF/A variant for archive generation
|
||||
(replaces ``settings.OCR_OUTPUT_TYPE`` reads inside parsers).
|
||||
* ``ocr_mode`` — skip-text, redo, force, etc.
|
||||
(replaces ``settings.OCR_MODE`` reads inside parsers).
|
||||
* ``ocr_language`` — Tesseract language string.
|
||||
(replaces ``settings.OCR_LANGUAGE`` reads inside parsers).
|
||||
|
||||
When those fields are added the consumer will read from Django
|
||||
settings once and populate them here, decoupling parsers from
|
||||
``settings.*`` entirely.
|
||||
"""
|
||||
|
||||
mailrule_id: int | None = None
|
||||
|
||||
|
||||
@runtime_checkable
|
||||
class ParserProtocol(Protocol):
|
||||
"""Structural contract for all Paperless-ngx document parsers.
|
||||
@@ -191,6 +231,21 @@ class ParserProtocol(Protocol):
|
||||
# Core parsing interface
|
||||
# ------------------------------------------------------------------
|
||||
|
||||
def configure(self, context: ParserContext) -> None:
|
||||
"""Apply source context before parse().
|
||||
|
||||
Called by the consumer after instantiation and before parse().
|
||||
The default implementation is a no-op; parsers override only the
|
||||
fields they need.
|
||||
|
||||
Parameters
|
||||
----------
|
||||
context:
|
||||
Immutable context assembled by the consumer for this
|
||||
specific ingestion event.
|
||||
"""
|
||||
...
|
||||
|
||||
def parse(
|
||||
self,
|
||||
document_path: Path,
|
||||
|
||||
834
src/paperless/parsers/mail.py
Normal file
@@ -0,0 +1,834 @@
|
||||
"""
|
||||
Built-in mail document parser.
|
||||
|
||||
Handles message/rfc822 (EML) MIME type by:
|
||||
- Parsing the email using imap_tools
|
||||
- Generating a PDF via Gotenberg (for display and archive)
|
||||
- Extracting text via Tika for HTML content
|
||||
- Extracting metadata from email headers
|
||||
|
||||
The parser always produces a PDF because EML files cannot be rendered
|
||||
natively in a browser (requires_pdf_rendition=True).
|
||||
"""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
import logging
|
||||
import re
|
||||
import shutil
|
||||
import tempfile
|
||||
from html import escape
|
||||
from pathlib import Path
|
||||
from typing import TYPE_CHECKING
|
||||
from typing import Self
|
||||
|
||||
from bleach import clean
|
||||
from bleach import linkify
|
||||
from django.conf import settings
|
||||
from django.utils import timezone
|
||||
from django.utils.timezone import is_naive
|
||||
from django.utils.timezone import make_aware
|
||||
from gotenberg_client import GotenbergClient
|
||||
from gotenberg_client.constants import A4
|
||||
from gotenberg_client.options import Measurement
|
||||
from gotenberg_client.options import MeasurementUnitType
|
||||
from gotenberg_client.options import PageMarginsType
|
||||
from gotenberg_client.options import PdfAFormat
|
||||
from humanize import naturalsize
|
||||
from imap_tools import MailAttachment
|
||||
from imap_tools import MailMessage
|
||||
from tika_client import TikaClient
|
||||
|
||||
from documents.parsers import ParseError
|
||||
from documents.parsers import make_thumbnail_from_pdf
|
||||
from paperless.models import OutputTypeChoices
|
||||
from paperless.version import __full_version_str__
|
||||
from paperless_mail.models import MailRule
|
||||
|
||||
if TYPE_CHECKING:
|
||||
import datetime
|
||||
from types import TracebackType
|
||||
|
||||
from paperless.parsers import MetadataEntry
|
||||
from paperless.parsers import ParserContext
|
||||
|
||||
logger = logging.getLogger("paperless.parsing.mail")
|
||||
|
||||
_SUPPORTED_MIME_TYPES: dict[str, str] = {
|
||||
"message/rfc822": ".eml",
|
||||
}
|
||||
|
||||
|
||||
class MailDocumentParser:
|
||||
"""Parse .eml email files for Paperless-ngx.
|
||||
|
||||
Uses imap_tools to parse .eml files, generates a PDF using Gotenberg,
|
||||
and sends the HTML part to a Tika server for text extraction. Because
|
||||
EML files cannot be rendered natively in a browser, the parser always
|
||||
produces a PDF rendition (requires_pdf_rendition=True).
|
||||
|
||||
Pass a ``ParserContext`` to ``configure()`` before ``parse()`` to
|
||||
apply mail-rule-specific PDF layout options:
|
||||
|
||||
parser.configure(ParserContext(mailrule_id=rule.pk))
|
||||
parser.parse(path, mime_type)
|
||||
|
||||
Class attributes
|
||||
----------------
|
||||
name : str
|
||||
Human-readable parser name.
|
||||
version : str
|
||||
Semantic version string, kept in sync with Paperless-ngx releases.
|
||||
author : str
|
||||
Maintainer name.
|
||||
url : str
|
||||
Issue tracker / source URL.
|
||||
"""
|
||||
|
||||
name: str = "Paperless-ngx Mail Parser"
|
||||
version: str = __full_version_str__
|
||||
author: str = "Paperless-ngx Contributors"
|
||||
url: str = "https://github.com/paperless-ngx/paperless-ngx"
|
||||
|
||||
# ------------------------------------------------------------------
|
||||
# Class methods
|
||||
# ------------------------------------------------------------------
|
||||
|
||||
@classmethod
|
||||
def supported_mime_types(cls) -> dict[str, str]:
|
||||
"""Return the MIME types this parser handles.
|
||||
|
||||
Returns
|
||||
-------
|
||||
dict[str, str]
|
||||
Mapping of MIME type to preferred file extension.
|
||||
"""
|
||||
return _SUPPORTED_MIME_TYPES
|
||||
|
||||
@classmethod
|
||||
def score(
|
||||
cls,
|
||||
mime_type: str,
|
||||
filename: str,
|
||||
path: Path | None = None,
|
||||
) -> int | None:
|
||||
"""Return the priority score for handling this file.
|
||||
|
||||
Parameters
|
||||
----------
|
||||
mime_type:
|
||||
Detected MIME type of the file.
|
||||
filename:
|
||||
Original filename including extension.
|
||||
path:
|
||||
Optional filesystem path. Not inspected by this parser.
|
||||
|
||||
Returns
|
||||
-------
|
||||
int | None
|
||||
10 if the MIME type is supported, otherwise None.
|
||||
"""
|
||||
if mime_type in _SUPPORTED_MIME_TYPES:
|
||||
return 10
|
||||
return None
|
||||
|
||||
# ------------------------------------------------------------------
|
||||
# Properties
|
||||
# ------------------------------------------------------------------
|
||||
|
||||
@property
|
||||
def can_produce_archive(self) -> bool:
|
||||
"""Whether this parser can produce a searchable PDF archive copy.
|
||||
|
||||
Returns
|
||||
-------
|
||||
bool
|
||||
Always False — the mail parser produces a display PDF
|
||||
(requires_pdf_rendition=True), not an optional OCR archive.
|
||||
"""
|
||||
return False
|
||||
|
||||
@property
|
||||
def requires_pdf_rendition(self) -> bool:
|
||||
"""Whether the parser must produce a PDF for the frontend to display.
|
||||
|
||||
Returns
|
||||
-------
|
||||
bool
|
||||
Always True — EML files cannot be rendered natively in a browser,
|
||||
so a PDF conversion is always required for display.
|
||||
"""
|
||||
return True
|
||||
|
||||
# ------------------------------------------------------------------
|
||||
# Lifecycle
|
||||
# ------------------------------------------------------------------
|
||||
|
||||
def __init__(self, logging_group: object = None) -> None:
|
||||
settings.SCRATCH_DIR.mkdir(parents=True, exist_ok=True)
|
||||
self._tempdir = Path(
|
||||
tempfile.mkdtemp(prefix="paperless-", dir=settings.SCRATCH_DIR),
|
||||
)
|
||||
self._text: str | None = None
|
||||
self._date: datetime.datetime | None = None
|
||||
self._archive_path: Path | None = None
|
||||
self._mailrule_id: int | None = None
|
||||
|
||||
def __enter__(self) -> Self:
|
||||
return self
|
||||
|
||||
def __exit__(
|
||||
self,
|
||||
exc_type: type[BaseException] | None,
|
||||
exc_val: BaseException | None,
|
||||
exc_tb: TracebackType | None,
|
||||
) -> None:
|
||||
logger.debug("Cleaning up temporary directory %s", self._tempdir)
|
||||
shutil.rmtree(self._tempdir, ignore_errors=True)
|
||||
|
||||
# ------------------------------------------------------------------
|
||||
# Core parsing interface
|
||||
# ------------------------------------------------------------------
|
||||
|
||||
def configure(self, context: ParserContext) -> None:
|
||||
self._mailrule_id = context.mailrule_id
|
||||
|
||||
def parse(
|
||||
self,
|
||||
document_path: Path,
|
||||
mime_type: str,
|
||||
*,
|
||||
produce_archive: bool = True,
|
||||
) -> None:
|
||||
"""Parse the given .eml into formatted text and a PDF archive.
|
||||
|
||||
Call ``configure(ParserContext(mailrule_id=...))`` before this method
|
||||
to apply mail-rule-specific PDF layout options. The ``produce_archive``
|
||||
flag is accepted for protocol compatibility but is always honoured —
|
||||
the mail parser always produces a PDF since EML files cannot be
|
||||
displayed natively.
|
||||
|
||||
Parameters
|
||||
----------
|
||||
document_path:
|
||||
Absolute path to the .eml file.
|
||||
mime_type:
|
||||
Detected MIME type of the document (should be "message/rfc822").
|
||||
produce_archive:
|
||||
Accepted for protocol compatibility. The PDF rendition is always
|
||||
produced since EML files cannot be displayed natively in a browser.
|
||||
|
||||
Raises
|
||||
------
|
||||
documents.parsers.ParseError
|
||||
If the file cannot be parsed or PDF generation fails.
|
||||
"""
|
||||
|
||||
def strip_text(text: str) -> str:
|
||||
"""Reduces the spacing of the given text string."""
|
||||
text = re.sub(r"\s+", " ", text)
|
||||
text = re.sub(r"(\n *)+", "\n", text)
|
||||
return text.strip()
|
||||
|
||||
def build_formatted_text(mail_message: MailMessage) -> str:
|
||||
"""Constructs a formatted string based on the given email."""
|
||||
fmt_text = f"Subject: {mail_message.subject}\n\n"
|
||||
fmt_text += f"From: {mail_message.from_values.full if mail_message.from_values else ''}\n\n"
|
||||
to_list = [address.full for address in mail_message.to_values]
|
||||
fmt_text += f"To: {', '.join(to_list)}\n\n"
|
||||
if mail_message.cc_values:
|
||||
fmt_text += (
|
||||
f"CC: {', '.join(address.full for address in mail.cc_values)}\n\n"
|
||||
)
|
||||
if mail_message.bcc_values:
|
||||
fmt_text += (
|
||||
f"BCC: {', '.join(address.full for address in mail.bcc_values)}\n\n"
|
||||
)
|
||||
if mail_message.attachments:
|
||||
att = []
|
||||
for a in mail.attachments:
|
||||
attachment_size = naturalsize(a.size, binary=True, format="%.2f")
|
||||
att.append(
|
||||
f"{a.filename} ({attachment_size})",
|
||||
)
|
||||
fmt_text += f"Attachments: {', '.join(att)}\n\n"
|
||||
|
||||
if mail.html:
|
||||
fmt_text += "HTML content: " + strip_text(self.tika_parse(mail.html))
|
||||
|
||||
fmt_text += f"\n\n{strip_text(mail.text)}"
|
||||
|
||||
return fmt_text
|
||||
|
||||
logger.debug("Parsing file %s into an email", document_path.name)
|
||||
mail = self.parse_file_to_message(document_path)
|
||||
|
||||
logger.debug("Building formatted text from email")
|
||||
self._text = build_formatted_text(mail)
|
||||
|
||||
if is_naive(mail.date):
|
||||
self._date = make_aware(mail.date)
|
||||
else:
|
||||
self._date = mail.date
|
||||
|
||||
logger.debug("Creating a PDF from the email")
|
||||
if self._mailrule_id:
|
||||
rule = MailRule.objects.get(pk=self._mailrule_id)
|
||||
self._archive_path = self.generate_pdf(
|
||||
mail,
|
||||
MailRule.PdfLayout(rule.pdf_layout),
|
||||
)
|
||||
else:
|
||||
self._archive_path = self.generate_pdf(mail)
|
||||
|
||||
# ------------------------------------------------------------------
|
||||
# Result accessors
|
||||
# ------------------------------------------------------------------
|
||||
|
||||
def get_text(self) -> str | None:
|
||||
"""Return the plain-text content extracted during parse.
|
||||
|
||||
Returns
|
||||
-------
|
||||
str | None
|
||||
Extracted text, or None if parse has not been called yet.
|
||||
"""
|
||||
return self._text
|
||||
|
||||
def get_date(self) -> datetime.datetime | None:
|
||||
"""Return the document date detected during parse.
|
||||
|
||||
Returns
|
||||
-------
|
||||
datetime.datetime | None
|
||||
Date from the email headers, or None if not detected.
|
||||
"""
|
||||
return self._date
|
||||
|
||||
def get_archive_path(self) -> Path | None:
|
||||
"""Return the path to the generated archive PDF, or None.
|
||||
|
||||
Returns
|
||||
-------
|
||||
Path | None
|
||||
Path to the PDF produced by Gotenberg, or None if parse has not
|
||||
been called yet.
|
||||
"""
|
||||
return self._archive_path
|
||||
|
||||
# ------------------------------------------------------------------
|
||||
# Thumbnail and metadata
|
||||
# ------------------------------------------------------------------
|
||||
|
||||
def get_thumbnail(
|
||||
self,
|
||||
document_path: Path,
|
||||
mime_type: str,
|
||||
file_name: str | None = None,
|
||||
) -> Path:
|
||||
"""Generate a thumbnail from the PDF rendition of the email.
|
||||
|
||||
Converts the document to PDF first if not already done.
|
||||
|
||||
Parameters
|
||||
----------
|
||||
document_path:
|
||||
Absolute path to the source document.
|
||||
mime_type:
|
||||
Detected MIME type of the document.
|
||||
file_name:
|
||||
Kept for backward compatibility; not used.
|
||||
|
||||
Returns
|
||||
-------
|
||||
Path
|
||||
Path to the generated WebP thumbnail inside the temporary directory.
|
||||
"""
|
||||
if not self._archive_path:
|
||||
self._archive_path = self.generate_pdf(
|
||||
self.parse_file_to_message(document_path),
|
||||
)
|
||||
|
||||
return make_thumbnail_from_pdf(
|
||||
self._archive_path,
|
||||
self._tempdir,
|
||||
)
|
||||
|
||||
def get_page_count(
|
||||
self,
|
||||
document_path: Path,
|
||||
mime_type: str,
|
||||
) -> int | None:
|
||||
"""Return the number of pages in the document.
|
||||
|
||||
Counts pages in the archive PDF produced by a preceding parse()
|
||||
call. Returns ``None`` if parse() has not been called yet or if
|
||||
no archive was produced.
|
||||
|
||||
Returns
|
||||
-------
|
||||
int | None
|
||||
Page count of the archive PDF, or ``None``.
|
||||
"""
|
||||
if self._archive_path is not None:
|
||||
from paperless.parsers.utils import get_page_count_for_pdf
|
||||
|
||||
return get_page_count_for_pdf(self._archive_path, log=logger)
|
||||
return None
|
||||
|
||||
def extract_metadata(
|
||||
self,
|
||||
document_path: Path,
|
||||
mime_type: str,
|
||||
) -> list[MetadataEntry]:
|
||||
"""Extract metadata from the email headers.
|
||||
|
||||
Returns email headers as metadata entries with prefix "header",
|
||||
plus summary entries for attachments and date.
|
||||
|
||||
Returns
|
||||
-------
|
||||
list[MetadataEntry]
|
||||
Sorted list of metadata entries, or ``[]`` on parse failure.
|
||||
"""
|
||||
result: list[MetadataEntry] = []
|
||||
|
||||
try:
|
||||
mail = self.parse_file_to_message(document_path)
|
||||
except ParseError as e:
|
||||
logger.warning(
|
||||
"Error while fetching document metadata for %s: %s",
|
||||
document_path,
|
||||
e,
|
||||
)
|
||||
return result
|
||||
|
||||
for key, header_values in mail.headers.items():
|
||||
value = ", ".join(header_values)
|
||||
try:
|
||||
value.encode("utf-8")
|
||||
except UnicodeEncodeError as e: # pragma: no cover
|
||||
logger.debug("Skipping header %s: %s", key, e)
|
||||
continue
|
||||
|
||||
result.append(
|
||||
{
|
||||
"namespace": "",
|
||||
"prefix": "header",
|
||||
"key": key,
|
||||
"value": value,
|
||||
},
|
||||
)
|
||||
|
||||
result.append(
|
||||
{
|
||||
"namespace": "",
|
||||
"prefix": "",
|
||||
"key": "attachments",
|
||||
"value": ", ".join(
|
||||
f"{attachment.filename}"
|
||||
f"({naturalsize(attachment.size, binary=True, format='%.2f')})"
|
||||
for attachment in mail.attachments
|
||||
),
|
||||
},
|
||||
)
|
||||
|
||||
result.append(
|
||||
{
|
||||
"namespace": "",
|
||||
"prefix": "",
|
||||
"key": "date",
|
||||
"value": mail.date.strftime("%Y-%m-%d %H:%M:%S %Z"),
|
||||
},
|
||||
)
|
||||
|
||||
result.sort(key=lambda item: (item["prefix"], item["key"]))
|
||||
return result
|
||||
|
||||
# ------------------------------------------------------------------
|
||||
# Email-specific methods
|
||||
# ------------------------------------------------------------------
|
||||
|
||||
def _settings_to_gotenberg_pdfa(self) -> PdfAFormat | None:
|
||||
"""Convert the OCR output type setting to a Gotenberg PdfAFormat."""
|
||||
if settings.OCR_OUTPUT_TYPE in {
|
||||
OutputTypeChoices.PDF_A,
|
||||
OutputTypeChoices.PDF_A2,
|
||||
}:
|
||||
return PdfAFormat.A2b
|
||||
elif settings.OCR_OUTPUT_TYPE == OutputTypeChoices.PDF_A1: # pragma: no cover
|
||||
logger.warning(
|
||||
"Gotenberg does not support PDF/A-1a, choosing PDF/A-2b instead",
|
||||
)
|
||||
return PdfAFormat.A2b
|
||||
elif settings.OCR_OUTPUT_TYPE == OutputTypeChoices.PDF_A3: # pragma: no cover
|
||||
return PdfAFormat.A3b
|
||||
return None
|
||||
|
||||
@staticmethod
|
||||
def parse_file_to_message(filepath: Path) -> MailMessage:
|
||||
"""Parse the given .eml file into a MailMessage object.
|
||||
|
||||
Parameters
|
||||
----------
|
||||
filepath:
|
||||
Path to the .eml file.
|
||||
|
||||
Returns
|
||||
-------
|
||||
MailMessage
|
||||
Parsed mail message.
|
||||
|
||||
Raises
|
||||
------
|
||||
documents.parsers.ParseError
|
||||
If the file cannot be parsed or is missing required fields.
|
||||
"""
|
||||
try:
|
||||
with filepath.open("rb") as eml:
|
||||
parsed = MailMessage.from_bytes(eml.read())
|
||||
if parsed.from_values is None:
|
||||
raise ParseError(
|
||||
f"Could not parse {filepath}: Missing 'from'",
|
||||
)
|
||||
except Exception as err:
|
||||
raise ParseError(
|
||||
f"Could not parse {filepath}: {err}",
|
||||
) from err
|
||||
|
||||
return parsed
|
||||
|
||||
def tika_parse(self, html: str) -> str:
|
||||
"""Send HTML content to the Tika server for text extraction.
|
||||
|
||||
Parameters
|
||||
----------
|
||||
html:
|
||||
HTML string to parse.
|
||||
|
||||
Returns
|
||||
-------
|
||||
str
|
||||
Extracted plain text.
|
||||
|
||||
Raises
|
||||
------
|
||||
documents.parsers.ParseError
|
||||
If the Tika server cannot be reached or returns an error.
|
||||
"""
|
||||
logger.info("Sending content to Tika server")
|
||||
|
||||
try:
|
||||
with TikaClient(tika_url=settings.TIKA_ENDPOINT) as client:
|
||||
parsed = client.tika.as_text.from_buffer(html, "text/html")
|
||||
|
||||
if parsed.content is not None:
|
||||
return parsed.content.strip()
|
||||
return ""
|
||||
except Exception as err:
|
||||
raise ParseError(
|
||||
f"Could not parse content with tika server at "
|
||||
f"{settings.TIKA_ENDPOINT}: {err}",
|
||||
) from err
|
||||
|
||||
def generate_pdf(
|
||||
self,
|
||||
mail_message: MailMessage,
|
||||
pdf_layout: MailRule.PdfLayout | None = None,
|
||||
) -> Path:
|
||||
"""Generate a PDF from the email message.
|
||||
|
||||
Creates separate PDFs for the email body and HTML content, then
|
||||
merges them according to the requested layout.
|
||||
|
||||
Parameters
|
||||
----------
|
||||
mail_message:
|
||||
Parsed email message.
|
||||
pdf_layout:
|
||||
Layout option for the PDF. Falls back to the
|
||||
EMAIL_PARSE_DEFAULT_LAYOUT setting if not provided.
|
||||
|
||||
Returns
|
||||
-------
|
||||
Path
|
||||
Path to the generated PDF inside the temporary directory.
|
||||
"""
|
||||
archive_path = Path(self._tempdir) / "merged.pdf"
|
||||
|
||||
mail_pdf_file = self.generate_pdf_from_mail(mail_message)
|
||||
|
||||
if pdf_layout is None:
|
||||
pdf_layout = MailRule.PdfLayout(settings.EMAIL_PARSE_DEFAULT_LAYOUT)
|
||||
|
||||
# If no HTML content, create the PDF from the message.
|
||||
# Otherwise, create 2 PDFs and merge them with Gotenberg.
|
||||
if not mail_message.html:
|
||||
archive_path.write_bytes(mail_pdf_file.read_bytes())
|
||||
else:
|
||||
pdf_of_html_content = self.generate_pdf_from_html(
|
||||
mail_message.html,
|
||||
mail_message.attachments,
|
||||
)
|
||||
|
||||
logger.debug("Merging email text and HTML content into single PDF")
|
||||
|
||||
with (
|
||||
GotenbergClient(
|
||||
host=settings.TIKA_GOTENBERG_ENDPOINT,
|
||||
timeout=settings.CELERY_TASK_TIME_LIMIT,
|
||||
) as client,
|
||||
client.merge.merge() as route,
|
||||
):
|
||||
# Configure requested PDF/A formatting, if any
|
||||
pdf_a_format = self._settings_to_gotenberg_pdfa()
|
||||
if pdf_a_format is not None:
|
||||
route.pdf_format(pdf_a_format)
|
||||
|
||||
match pdf_layout:
|
||||
case MailRule.PdfLayout.HTML_TEXT:
|
||||
route.merge([pdf_of_html_content, mail_pdf_file])
|
||||
case MailRule.PdfLayout.HTML_ONLY:
|
||||
route.merge([pdf_of_html_content])
|
||||
case MailRule.PdfLayout.TEXT_ONLY:
|
||||
route.merge([mail_pdf_file])
|
||||
case MailRule.PdfLayout.TEXT_HTML | _:
|
||||
route.merge([mail_pdf_file, pdf_of_html_content])
|
||||
|
||||
try:
|
||||
response = route.run()
|
||||
archive_path.write_bytes(response.content)
|
||||
except Exception as err:
|
||||
raise ParseError(
|
||||
f"Error while merging email HTML into PDF: {err}",
|
||||
) from err
|
||||
|
||||
return archive_path
|
||||
|
||||
def mail_to_html(self, mail: MailMessage) -> Path:
|
||||
"""Convert the given email into an HTML file using a template.
|
||||
|
||||
Parameters
|
||||
----------
|
||||
mail:
|
||||
Parsed mail message.
|
||||
|
||||
Returns
|
||||
-------
|
||||
Path
|
||||
Path to the rendered HTML file inside the temporary directory.
|
||||
"""
|
||||
|
||||
def clean_html(text: str) -> str:
|
||||
"""Attempt to clean, escape, and linkify the given HTML string."""
|
||||
if isinstance(text, list):
|
||||
text = "\n".join([str(e) for e in text])
|
||||
if not isinstance(text, str):
|
||||
text = str(text)
|
||||
text = escape(text)
|
||||
text = clean(text)
|
||||
text = linkify(text, parse_email=True)
|
||||
text = text.replace("\n", "<br>")
|
||||
return text
|
||||
|
||||
data = {}
|
||||
|
||||
data["subject"] = clean_html(mail.subject)
|
||||
if data["subject"]:
|
||||
data["subject_label"] = "Subject"
|
||||
data["from"] = clean_html(mail.from_values.full if mail.from_values else "")
|
||||
if data["from"]:
|
||||
data["from_label"] = "From"
|
||||
data["to"] = clean_html(", ".join(address.full for address in mail.to_values))
|
||||
if data["to"]:
|
||||
data["to_label"] = "To"
|
||||
data["cc"] = clean_html(", ".join(address.full for address in mail.cc_values))
|
||||
if data["cc"]:
|
||||
data["cc_label"] = "CC"
|
||||
data["bcc"] = clean_html(", ".join(address.full for address in mail.bcc_values))
|
||||
if data["bcc"]:
|
||||
data["bcc_label"] = "BCC"
|
||||
|
||||
att = []
|
||||
for a in mail.attachments:
|
||||
att.append(
|
||||
f"{a.filename} ({naturalsize(a.size, binary=True, format='%.2f')})",
|
||||
)
|
||||
data["attachments"] = clean_html(", ".join(att))
|
||||
if data["attachments"]:
|
||||
data["attachments_label"] = "Attachments"
|
||||
|
||||
data["date"] = clean_html(
|
||||
timezone.localtime(mail.date).strftime("%Y-%m-%d %H:%M"),
|
||||
)
|
||||
data["content"] = clean_html(mail.text.strip())
|
||||
|
||||
from django.template.loader import render_to_string
|
||||
|
||||
html_file = Path(self._tempdir) / "email_as_html.html"
|
||||
html_file.write_text(render_to_string("email_msg_template.html", context=data))
|
||||
|
||||
return html_file
|
||||
|
||||
def generate_pdf_from_mail(self, mail: MailMessage) -> Path:
|
||||
"""Create a PDF from the email body using an HTML template and Gotenberg.
|
||||
|
||||
Parameters
|
||||
----------
|
||||
mail:
|
||||
Parsed mail message.
|
||||
|
||||
Returns
|
||||
-------
|
||||
Path
|
||||
Path to the generated PDF inside the temporary directory.
|
||||
|
||||
Raises
|
||||
------
|
||||
documents.parsers.ParseError
|
||||
If Gotenberg returns an error.
|
||||
"""
|
||||
logger.info("Converting mail to PDF")
|
||||
|
||||
css_file = (
|
||||
Path(__file__).parent.parent.parent
|
||||
/ "paperless_mail"
|
||||
/ "templates"
|
||||
/ "output.css"
|
||||
)
|
||||
email_html_file = self.mail_to_html(mail)
|
||||
|
||||
with (
|
||||
GotenbergClient(
|
||||
host=settings.TIKA_GOTENBERG_ENDPOINT,
|
||||
timeout=settings.CELERY_TASK_TIME_LIMIT,
|
||||
) as client,
|
||||
client.chromium.html_to_pdf() as route,
|
||||
):
|
||||
# Configure requested PDF/A formatting, if any
|
||||
pdf_a_format = self._settings_to_gotenberg_pdfa()
|
||||
if pdf_a_format is not None:
|
||||
route.pdf_format(pdf_a_format)
|
||||
|
||||
try:
|
||||
response = (
|
||||
route.index(email_html_file)
|
||||
.resource(css_file)
|
||||
.margins(
|
||||
PageMarginsType(
|
||||
top=Measurement(0.1, MeasurementUnitType.Inches),
|
||||
bottom=Measurement(0.1, MeasurementUnitType.Inches),
|
||||
left=Measurement(0.1, MeasurementUnitType.Inches),
|
||||
right=Measurement(0.1, MeasurementUnitType.Inches),
|
||||
),
|
||||
)
|
||||
.size(A4)
|
||||
.scale(1.0)
|
||||
.run()
|
||||
)
|
||||
except Exception as err:
|
||||
raise ParseError(
|
||||
f"Error while converting email to PDF: {err}",
|
||||
) from err
|
||||
|
||||
email_as_pdf_file = Path(self._tempdir) / "email_as_pdf.pdf"
|
||||
email_as_pdf_file.write_bytes(response.content)
|
||||
|
||||
return email_as_pdf_file
|
||||
|
||||
def generate_pdf_from_html(
|
||||
self,
|
||||
orig_html: str,
|
||||
attachments: list[MailAttachment],
|
||||
) -> Path:
|
||||
"""Generate a PDF from the HTML content of the email.
|
||||
|
||||
Parameters
|
||||
----------
|
||||
orig_html:
|
||||
Raw HTML string from the email body.
|
||||
attachments:
|
||||
List of email attachments (used as inline resources).
|
||||
|
||||
Returns
|
||||
-------
|
||||
Path
|
||||
Path to the generated PDF inside the temporary directory.
|
||||
|
||||
Raises
|
||||
------
|
||||
documents.parsers.ParseError
|
||||
If Gotenberg returns an error.
|
||||
"""
|
||||
|
||||
def clean_html_script(text: str) -> str:
|
||||
compiled_open = re.compile(re.escape("<script"), re.IGNORECASE)
|
||||
text = compiled_open.sub("<div hidden ", text)
|
||||
|
||||
compiled_close = re.compile(re.escape("</script"), re.IGNORECASE)
|
||||
text = compiled_close.sub("</div", text)
|
||||
return text
|
||||
|
||||
logger.info("Converting message html to PDF")
|
||||
|
||||
tempdir = Path(self._tempdir)
|
||||
|
||||
html_clean = clean_html_script(orig_html)
|
||||
html_clean_file = tempdir / "index.html"
|
||||
html_clean_file.write_text(html_clean)
|
||||
|
||||
with (
|
||||
GotenbergClient(
|
||||
host=settings.TIKA_GOTENBERG_ENDPOINT,
|
||||
timeout=settings.CELERY_TASK_TIME_LIMIT,
|
||||
) as client,
|
||||
client.chromium.html_to_pdf() as route,
|
||||
):
|
||||
# Configure requested PDF/A formatting, if any
|
||||
pdf_a_format = self._settings_to_gotenberg_pdfa()
|
||||
if pdf_a_format is not None:
|
||||
route.pdf_format(pdf_a_format)
|
||||
|
||||
# Add attachments as resources, cleaning the filename and replacing
|
||||
# it in the index file for inclusion
|
||||
for attachment in attachments:
|
||||
# Clean the attachment name to be valid
|
||||
name_cid = f"cid:{attachment.content_id}"
|
||||
name_clean = "".join(e for e in name_cid if e.isalnum())
|
||||
|
||||
# Write attachment payload to a temp file
|
||||
temp_file = tempdir / name_clean
|
||||
temp_file.write_bytes(attachment.payload)
|
||||
|
||||
route.resource(temp_file)
|
||||
|
||||
# Replace as needed the name with the clean name
|
||||
html_clean = html_clean.replace(name_cid, name_clean)
|
||||
|
||||
# Now store the cleaned up HTML version
|
||||
html_clean_file = tempdir / "index.html"
|
||||
html_clean_file.write_text(html_clean)
|
||||
# This is our index file, the main page basically
|
||||
route.index(html_clean_file)
|
||||
|
||||
# Set page size, margins
|
||||
route.margins(
|
||||
PageMarginsType(
|
||||
top=Measurement(0.1, MeasurementUnitType.Inches),
|
||||
bottom=Measurement(0.1, MeasurementUnitType.Inches),
|
||||
left=Measurement(0.1, MeasurementUnitType.Inches),
|
||||
right=Measurement(0.1, MeasurementUnitType.Inches),
|
||||
),
|
||||
).size(A4).scale(1.0)
|
||||
|
||||
try:
|
||||
response = route.run()
|
||||
|
||||
except Exception as err:
|
||||
raise ParseError(
|
||||
f"Error while converting document to PDF: {err}",
|
||||
) from err
|
||||
|
||||
html_pdf = tempdir / "html.pdf"
|
||||
html_pdf.write_bytes(response.content)
|
||||
return html_pdf
|
||||
@@ -193,9 +193,17 @@ class ParserRegistry:
|
||||
that log output is predictable; scoring determines which parser wins
|
||||
at runtime regardless of registration order.
|
||||
"""
|
||||
from paperless.parsers.mail import MailDocumentParser
|
||||
from paperless.parsers.remote import RemoteDocumentParser
|
||||
from paperless.parsers.tesseract import RasterisedDocumentParser
|
||||
from paperless.parsers.text import TextDocumentParser
|
||||
from paperless.parsers.tika import TikaDocumentParser
|
||||
|
||||
self.register_builtin(TextDocumentParser)
|
||||
self.register_builtin(RemoteDocumentParser)
|
||||
self.register_builtin(TikaDocumentParser)
|
||||
self.register_builtin(MailDocumentParser)
|
||||
self.register_builtin(RasterisedDocumentParser)
|
||||
|
||||
# ------------------------------------------------------------------
|
||||
# Discovery
|
||||
@@ -296,6 +304,23 @@ class ParserRegistry:
|
||||
getattr(cls, "url", "unknown"),
|
||||
)
|
||||
|
||||
# ------------------------------------------------------------------
|
||||
# Inspection helpers
|
||||
# ------------------------------------------------------------------
|
||||
|
||||
def all_parsers(self) -> list[type[ParserProtocol]]:
|
||||
"""Return all registered parser classes (external first, then builtins).
|
||||
|
||||
Used by compatibility wrappers that need to iterate every parser to
|
||||
compute the full set of supported MIME types and file extensions.
|
||||
|
||||
Returns
|
||||
-------
|
||||
list[type[ParserProtocol]]
|
||||
External parsers followed by built-in parsers.
|
||||
"""
|
||||
return [*self._external, *self._builtins]
|
||||
|
||||
# ------------------------------------------------------------------
|
||||
# Parser resolution
|
||||
# ------------------------------------------------------------------
|
||||
@@ -326,7 +351,7 @@ class ParserRegistry:
|
||||
mime_type:
|
||||
The detected MIME type of the file.
|
||||
filename:
|
||||
The original filename, including extension.
|
||||
The original filename, including extension. May be empty in some cases
|
||||
path:
|
||||
Optional filesystem path to the file. Forwarded to each
|
||||
parser's score method.
|
||||
|
||||
433
src/paperless/parsers/remote.py
Normal file
@@ -0,0 +1,433 @@
|
||||
"""
|
||||
Built-in remote-OCR document parser.
|
||||
|
||||
Handles documents by sending them to a configured remote OCR engine
|
||||
(currently Azure AI Vision / Document Intelligence) and retrieving both
|
||||
the extracted text and a searchable PDF with an embedded text layer.
|
||||
|
||||
When no engine is configured, ``score()`` returns ``None`` so the parser
|
||||
is effectively invisible to the registry — the tesseract parser handles
|
||||
these MIME types instead.
|
||||
"""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
import logging
|
||||
import shutil
|
||||
import tempfile
|
||||
from pathlib import Path
|
||||
from typing import TYPE_CHECKING
|
||||
from typing import Self
|
||||
|
||||
from django.conf import settings
|
||||
|
||||
from paperless.version import __full_version_str__
|
||||
|
||||
if TYPE_CHECKING:
|
||||
import datetime
|
||||
from types import TracebackType
|
||||
|
||||
from paperless.parsers import MetadataEntry
|
||||
from paperless.parsers import ParserContext
|
||||
|
||||
logger = logging.getLogger("paperless.parsing.remote")
|
||||
|
||||
_SUPPORTED_MIME_TYPES: dict[str, str] = {
|
||||
"application/pdf": ".pdf",
|
||||
"image/png": ".png",
|
||||
"image/jpeg": ".jpg",
|
||||
"image/tiff": ".tiff",
|
||||
"image/bmp": ".bmp",
|
||||
"image/gif": ".gif",
|
||||
"image/webp": ".webp",
|
||||
}
|
||||
|
||||
|
||||
class RemoteEngineConfig:
|
||||
"""Holds and validates the remote OCR engine configuration."""
|
||||
|
||||
def __init__(
|
||||
self,
|
||||
engine: str | None,
|
||||
api_key: str | None = None,
|
||||
endpoint: str | None = None,
|
||||
) -> None:
|
||||
self.engine = engine
|
||||
self.api_key = api_key
|
||||
self.endpoint = endpoint
|
||||
|
||||
def engine_is_valid(self) -> bool:
|
||||
"""Return True when the engine is known and fully configured."""
|
||||
return (
|
||||
self.engine in ("azureai",)
|
||||
and self.api_key is not None
|
||||
and not (self.engine == "azureai" and self.endpoint is None)
|
||||
)
|
||||
|
||||
|
||||
class RemoteDocumentParser:
|
||||
"""Parse documents via a remote OCR API (currently Azure AI Vision).
|
||||
|
||||
This parser sends documents to a remote engine that returns both
|
||||
extracted text and a searchable PDF with an embedded text layer.
|
||||
It does not depend on Tesseract or ocrmypdf.
|
||||
|
||||
Class attributes
|
||||
----------------
|
||||
name : str
|
||||
Human-readable parser name.
|
||||
version : str
|
||||
Semantic version string, kept in sync with Paperless-ngx releases.
|
||||
author : str
|
||||
Maintainer name.
|
||||
url : str
|
||||
Issue tracker / source URL.
|
||||
"""
|
||||
|
||||
name: str = "Paperless-ngx Remote OCR Parser"
|
||||
version: str = __full_version_str__
|
||||
author: str = "Paperless-ngx Contributors"
|
||||
url: str = "https://github.com/paperless-ngx/paperless-ngx"
|
||||
|
||||
# ------------------------------------------------------------------
|
||||
# Class methods
|
||||
# ------------------------------------------------------------------
|
||||
|
||||
@classmethod
|
||||
def supported_mime_types(cls) -> dict[str, str]:
|
||||
"""Return the MIME types this parser can handle.
|
||||
|
||||
The full set is always returned regardless of whether a remote
|
||||
engine is configured. The ``score()`` method handles the
|
||||
"am I active?" logic by returning ``None`` when not configured.
|
||||
|
||||
Returns
|
||||
-------
|
||||
dict[str, str]
|
||||
Mapping of MIME type to preferred file extension.
|
||||
"""
|
||||
return _SUPPORTED_MIME_TYPES
|
||||
|
||||
@classmethod
|
||||
def score(
|
||||
cls,
|
||||
mime_type: str,
|
||||
filename: str,
|
||||
path: Path | None = None,
|
||||
) -> int | None:
|
||||
"""Return the priority score for handling this file, or None.
|
||||
|
||||
Returns ``None`` when no valid remote engine is configured,
|
||||
making the parser invisible to the registry for this file.
|
||||
When configured, returns 20 — higher than the Tesseract parser's
|
||||
default of 10 — so the remote engine takes priority.
|
||||
|
||||
Parameters
|
||||
----------
|
||||
mime_type:
|
||||
Detected MIME type of the file.
|
||||
filename:
|
||||
Original filename including extension.
|
||||
path:
|
||||
Optional filesystem path. Not inspected by this parser.
|
||||
|
||||
Returns
|
||||
-------
|
||||
int | None
|
||||
20 when the remote engine is configured and the MIME type is
|
||||
supported, otherwise None.
|
||||
"""
|
||||
config = RemoteEngineConfig(
|
||||
engine=settings.REMOTE_OCR_ENGINE,
|
||||
api_key=settings.REMOTE_OCR_API_KEY,
|
||||
endpoint=settings.REMOTE_OCR_ENDPOINT,
|
||||
)
|
||||
if not config.engine_is_valid():
|
||||
return None
|
||||
if mime_type not in _SUPPORTED_MIME_TYPES:
|
||||
return None
|
||||
return 20
|
||||
|
||||
# ------------------------------------------------------------------
|
||||
# Properties
|
||||
# ------------------------------------------------------------------
|
||||
|
||||
@property
|
||||
def can_produce_archive(self) -> bool:
|
||||
"""Whether this parser can produce a searchable PDF archive copy.
|
||||
|
||||
Returns
|
||||
-------
|
||||
bool
|
||||
Always True — the remote engine always returns a PDF with an
|
||||
embedded text layer that serves as the archive copy.
|
||||
"""
|
||||
return True
|
||||
|
||||
@property
|
||||
def requires_pdf_rendition(self) -> bool:
|
||||
"""Whether the parser must produce a PDF for the frontend to display.
|
||||
|
||||
Returns
|
||||
-------
|
||||
bool
|
||||
Always False — all supported originals are displayable by
|
||||
the browser (PDF) or handled via the archive copy (images).
|
||||
"""
|
||||
return False
|
||||
|
||||
# ------------------------------------------------------------------
|
||||
# Lifecycle
|
||||
# ------------------------------------------------------------------
|
||||
|
||||
def __init__(self, logging_group: object = None) -> None:
|
||||
settings.SCRATCH_DIR.mkdir(parents=True, exist_ok=True)
|
||||
self._tempdir = Path(
|
||||
tempfile.mkdtemp(prefix="paperless-", dir=settings.SCRATCH_DIR),
|
||||
)
|
||||
self._logging_group = logging_group
|
||||
self._text: str | None = None
|
||||
self._archive_path: Path | None = None
|
||||
|
||||
def __enter__(self) -> Self:
|
||||
return self
|
||||
|
||||
def __exit__(
|
||||
self,
|
||||
exc_type: type[BaseException] | None,
|
||||
exc_val: BaseException | None,
|
||||
exc_tb: TracebackType | None,
|
||||
) -> None:
|
||||
logger.debug("Cleaning up temporary directory %s", self._tempdir)
|
||||
shutil.rmtree(self._tempdir, ignore_errors=True)
|
||||
|
||||
# ------------------------------------------------------------------
|
||||
# Core parsing interface
|
||||
# ------------------------------------------------------------------
|
||||
|
||||
def configure(self, context: ParserContext) -> None:
|
||||
pass
|
||||
|
||||
def parse(
|
||||
self,
|
||||
document_path: Path,
|
||||
mime_type: str,
|
||||
*,
|
||||
produce_archive: bool = True,
|
||||
) -> None:
|
||||
"""Send the document to the remote engine and store results.
|
||||
|
||||
Parameters
|
||||
----------
|
||||
document_path:
|
||||
Absolute path to the document file to parse.
|
||||
mime_type:
|
||||
Detected MIME type of the document.
|
||||
produce_archive:
|
||||
Ignored — the remote engine always returns a searchable PDF,
|
||||
which is stored as the archive copy regardless of this flag.
|
||||
"""
|
||||
config = RemoteEngineConfig(
|
||||
engine=settings.REMOTE_OCR_ENGINE,
|
||||
api_key=settings.REMOTE_OCR_API_KEY,
|
||||
endpoint=settings.REMOTE_OCR_ENDPOINT,
|
||||
)
|
||||
|
||||
if not config.engine_is_valid():
|
||||
logger.warning(
|
||||
"No valid remote parser engine is configured, content will be empty.",
|
||||
)
|
||||
self._text = ""
|
||||
return
|
||||
|
||||
if config.engine == "azureai":
|
||||
self._text = self._azure_ai_vision_parse(document_path, config)
|
||||
|
||||
# ------------------------------------------------------------------
|
||||
# Result accessors
|
||||
# ------------------------------------------------------------------
|
||||
|
||||
def get_text(self) -> str | None:
|
||||
"""Return the plain-text content extracted during parse."""
|
||||
return self._text
|
||||
|
||||
def get_date(self) -> datetime.datetime | None:
|
||||
"""Return the document date detected during parse.
|
||||
|
||||
Returns
|
||||
-------
|
||||
datetime.datetime | None
|
||||
Always None — the remote parser does not detect dates.
|
||||
"""
|
||||
return None
|
||||
|
||||
def get_archive_path(self) -> Path | None:
|
||||
"""Return the path to the generated archive PDF, or None."""
|
||||
return self._archive_path
|
||||
|
||||
# ------------------------------------------------------------------
|
||||
# Thumbnail and metadata
|
||||
# ------------------------------------------------------------------
|
||||
|
||||
def get_thumbnail(self, document_path: Path, mime_type: str) -> Path:
|
||||
"""Generate a thumbnail image for the document.
|
||||
|
||||
Uses the archive PDF produced by the remote engine when available,
|
||||
otherwise falls back to the original document path (PDF inputs).
|
||||
|
||||
Parameters
|
||||
----------
|
||||
document_path:
|
||||
Absolute path to the source document.
|
||||
mime_type:
|
||||
Detected MIME type of the document.
|
||||
|
||||
Returns
|
||||
-------
|
||||
Path
|
||||
Path to the generated WebP thumbnail inside the temp directory.
|
||||
"""
|
||||
# make_thumbnail_from_pdf lives in documents.parsers for now;
|
||||
# it will move to paperless.parsers.utils when the tesseract
|
||||
# parser is migrated in a later phase.
|
||||
from documents.parsers import make_thumbnail_from_pdf
|
||||
|
||||
return make_thumbnail_from_pdf(
|
||||
self._archive_path or document_path,
|
||||
self._tempdir,
|
||||
self._logging_group,
|
||||
)
|
||||
|
||||
def get_page_count(
|
||||
self,
|
||||
document_path: Path,
|
||||
mime_type: str,
|
||||
) -> int | None:
|
||||
"""Return the number of pages in a PDF document.
|
||||
|
||||
Parameters
|
||||
----------
|
||||
document_path:
|
||||
Absolute path to the source document.
|
||||
mime_type:
|
||||
Detected MIME type of the document.
|
||||
|
||||
Returns
|
||||
-------
|
||||
int | None
|
||||
Page count for PDF inputs, or ``None`` for other MIME types.
|
||||
"""
|
||||
if mime_type != "application/pdf":
|
||||
return None
|
||||
|
||||
from paperless.parsers.utils import get_page_count_for_pdf
|
||||
|
||||
return get_page_count_for_pdf(document_path, log=logger)
|
||||
|
||||
def extract_metadata(
|
||||
self,
|
||||
document_path: Path,
|
||||
mime_type: str,
|
||||
) -> list[MetadataEntry]:
|
||||
"""Extract format-specific metadata from the document.
|
||||
|
||||
Delegates to the shared pikepdf-based extractor for PDF files.
|
||||
Returns ``[]`` for all other MIME types.
|
||||
|
||||
Parameters
|
||||
----------
|
||||
document_path:
|
||||
Absolute path to the file to extract metadata from.
|
||||
mime_type:
|
||||
MIME type of the file. May be ``"application/pdf"`` when
|
||||
called for the archive version of an image original.
|
||||
|
||||
Returns
|
||||
-------
|
||||
list[MetadataEntry]
|
||||
Zero or more metadata entries.
|
||||
"""
|
||||
if mime_type != "application/pdf":
|
||||
return []
|
||||
|
||||
from paperless.parsers.utils import extract_pdf_metadata
|
||||
|
||||
return extract_pdf_metadata(document_path, log=logger)
|
||||
|
||||
# ------------------------------------------------------------------
|
||||
# Private helpers
|
||||
# ------------------------------------------------------------------
|
||||
|
||||
def _azure_ai_vision_parse(
|
||||
self,
|
||||
file: Path,
|
||||
config: RemoteEngineConfig,
|
||||
) -> str | None:
|
||||
"""Send ``file`` to Azure AI Document Intelligence and return text.
|
||||
|
||||
Downloads the searchable PDF output from Azure and stores it at
|
||||
``self._archive_path``. Returns the extracted text content, or
|
||||
``None`` on failure (the error is logged).
|
||||
|
||||
Parameters
|
||||
----------
|
||||
file:
|
||||
Absolute path to the document to analyse.
|
||||
config:
|
||||
Validated remote engine configuration.
|
||||
|
||||
Returns
|
||||
-------
|
||||
str | None
|
||||
Extracted text, or None if the Azure call failed.
|
||||
"""
|
||||
if TYPE_CHECKING:
|
||||
# Callers must have already validated config via engine_is_valid():
|
||||
# engine_is_valid() asserts api_key is not None and (for azureai)
|
||||
# endpoint is not None, so these casts are provably safe.
|
||||
assert config.endpoint is not None
|
||||
assert config.api_key is not None
|
||||
|
||||
from azure.ai.documentintelligence import DocumentIntelligenceClient
|
||||
from azure.ai.documentintelligence.models import AnalyzeDocumentRequest
|
||||
from azure.ai.documentintelligence.models import AnalyzeOutputOption
|
||||
from azure.ai.documentintelligence.models import DocumentContentFormat
|
||||
from azure.core.credentials import AzureKeyCredential
|
||||
|
||||
client = DocumentIntelligenceClient(
|
||||
endpoint=config.endpoint,
|
||||
credential=AzureKeyCredential(config.api_key),
|
||||
)
|
||||
|
||||
try:
|
||||
with file.open("rb") as f:
|
||||
analyze_request = AnalyzeDocumentRequest(bytes_source=f.read())
|
||||
poller = client.begin_analyze_document(
|
||||
model_id="prebuilt-read",
|
||||
body=analyze_request,
|
||||
output_content_format=DocumentContentFormat.TEXT,
|
||||
output=[AnalyzeOutputOption.PDF],
|
||||
content_type="application/json",
|
||||
)
|
||||
|
||||
poller.wait()
|
||||
result_id = poller.details["operation_id"]
|
||||
result = poller.result()
|
||||
|
||||
self._archive_path = self._tempdir / "archive.pdf"
|
||||
with self._archive_path.open("wb") as f:
|
||||
for chunk in client.get_analyze_result_pdf(
|
||||
model_id="prebuilt-read",
|
||||
result_id=result_id,
|
||||
):
|
||||
f.write(chunk)
|
||||
|
||||
return result.content
|
||||
|
||||
except Exception as e:
|
||||
logger.error("Azure AI Vision parsing failed: %s", e)
|
||||
|
||||
finally:
|
||||
client.close()
|
||||
|
||||
return None
|
||||
@@ -1,13 +1,18 @@
|
||||
from __future__ import annotations
|
||||
|
||||
import logging
|
||||
import os
|
||||
import re
|
||||
import shutil
|
||||
import tempfile
|
||||
from pathlib import Path
|
||||
from typing import TYPE_CHECKING
|
||||
from typing import Any
|
||||
from typing import Self
|
||||
|
||||
from django.conf import settings
|
||||
from PIL import Image
|
||||
|
||||
from documents.parsers import DocumentParser
|
||||
from documents.parsers import ParseError
|
||||
from documents.parsers import make_thumbnail_from_pdf
|
||||
from documents.utils import maybe_override_pixel_limit
|
||||
@@ -16,6 +21,28 @@ from paperless.config import OcrConfig
|
||||
from paperless.models import ArchiveFileChoices
|
||||
from paperless.models import CleanChoices
|
||||
from paperless.models import ModeChoices
|
||||
from paperless.parsers.utils import read_file_handle_unicode_errors
|
||||
from paperless.version import __full_version_str__
|
||||
|
||||
if TYPE_CHECKING:
|
||||
import datetime
|
||||
from types import TracebackType
|
||||
|
||||
from paperless.parsers import MetadataEntry
|
||||
from paperless.parsers import ParserContext
|
||||
|
||||
logger = logging.getLogger("paperless.parsing.tesseract")
|
||||
|
||||
_SUPPORTED_MIME_TYPES: dict[str, str] = {
|
||||
"application/pdf": ".pdf",
|
||||
"image/jpeg": ".jpg",
|
||||
"image/png": ".png",
|
||||
"image/tiff": ".tif",
|
||||
"image/gif": ".gif",
|
||||
"image/bmp": ".bmp",
|
||||
"image/webp": ".webp",
|
||||
"image/heic": ".heic",
|
||||
}
|
||||
|
||||
|
||||
class NoTextFoundException(Exception):
|
||||
@@ -26,81 +53,125 @@ class RtlLanguageException(Exception):
|
||||
pass
|
||||
|
||||
|
||||
class RasterisedDocumentParser(DocumentParser):
|
||||
class RasterisedDocumentParser:
|
||||
"""
|
||||
This parser uses Tesseract to try and get some text out of a rasterised
|
||||
image, whether it's a PDF, or other graphical format (JPEG, TIFF, etc.)
|
||||
"""
|
||||
|
||||
logging_name = "paperless.parsing.tesseract"
|
||||
name: str = "Paperless-ngx Tesseract OCR Parser"
|
||||
version: str = __full_version_str__
|
||||
author: str = "Paperless-ngx Contributors"
|
||||
url: str = "https://github.com/paperless-ngx/paperless-ngx"
|
||||
|
||||
def get_settings(self) -> OcrConfig:
|
||||
"""
|
||||
This parser uses the OCR configuration settings to parse documents
|
||||
"""
|
||||
return OcrConfig()
|
||||
# ------------------------------------------------------------------
|
||||
# Class methods
|
||||
# ------------------------------------------------------------------
|
||||
|
||||
def get_page_count(self, document_path, mime_type):
|
||||
page_count = None
|
||||
if mime_type == "application/pdf":
|
||||
try:
|
||||
import pikepdf
|
||||
@classmethod
|
||||
def supported_mime_types(cls) -> dict[str, str]:
|
||||
return _SUPPORTED_MIME_TYPES
|
||||
|
||||
with pikepdf.Pdf.open(document_path) as pdf:
|
||||
page_count = len(pdf.pages)
|
||||
except Exception as e:
|
||||
self.log.warning(
|
||||
f"Unable to determine PDF page count {document_path}: {e}",
|
||||
)
|
||||
return page_count
|
||||
@classmethod
|
||||
def score(
|
||||
cls,
|
||||
mime_type: str,
|
||||
filename: str,
|
||||
path: Path | None = None,
|
||||
) -> int | None:
|
||||
if mime_type in _SUPPORTED_MIME_TYPES:
|
||||
return 10
|
||||
return None
|
||||
|
||||
def extract_metadata(self, document_path, mime_type):
|
||||
result = []
|
||||
if mime_type == "application/pdf":
|
||||
import pikepdf
|
||||
# ------------------------------------------------------------------
|
||||
# Properties
|
||||
# ------------------------------------------------------------------
|
||||
|
||||
namespace_pattern = re.compile(r"\{(.*)\}(.*)")
|
||||
@property
|
||||
def can_produce_archive(self) -> bool:
|
||||
return True
|
||||
|
||||
pdf = pikepdf.open(document_path)
|
||||
meta = pdf.open_metadata()
|
||||
for key, value in meta.items():
|
||||
if isinstance(value, list):
|
||||
value = " ".join([str(e) for e in value])
|
||||
value = str(value)
|
||||
try:
|
||||
m = namespace_pattern.match(key)
|
||||
if m is None: # pragma: no cover
|
||||
continue
|
||||
namespace = m.group(1)
|
||||
key_value = m.group(2)
|
||||
try:
|
||||
namespace.encode("utf-8")
|
||||
key_value.encode("utf-8")
|
||||
except UnicodeEncodeError as e: # pragma: no cover
|
||||
self.log.debug(f"Skipping metadata key {key}: {e}")
|
||||
continue
|
||||
result.append(
|
||||
{
|
||||
"namespace": namespace,
|
||||
"prefix": meta.REVERSE_NS[namespace],
|
||||
"key": key_value,
|
||||
"value": value,
|
||||
},
|
||||
)
|
||||
except Exception as e:
|
||||
self.log.warning(
|
||||
f"Error while reading metadata {key}: {value}. Error: {e}",
|
||||
)
|
||||
return result
|
||||
@property
|
||||
def requires_pdf_rendition(self) -> bool:
|
||||
return False
|
||||
|
||||
def get_thumbnail(self, document_path, mime_type, file_name=None):
|
||||
# ------------------------------------------------------------------
|
||||
# Lifecycle
|
||||
# ------------------------------------------------------------------
|
||||
|
||||
def __init__(self, logging_group: object = None) -> None:
|
||||
settings.SCRATCH_DIR.mkdir(parents=True, exist_ok=True)
|
||||
self.tempdir = Path(
|
||||
tempfile.mkdtemp(prefix="paperless-", dir=settings.SCRATCH_DIR),
|
||||
)
|
||||
self.settings = OcrConfig()
|
||||
self.archive_path: Path | None = None
|
||||
self.text: str | None = None
|
||||
self.date: datetime.datetime | None = None
|
||||
self.log = logger
|
||||
|
||||
def __enter__(self) -> Self:
|
||||
return self
|
||||
|
||||
def __exit__(
|
||||
self,
|
||||
exc_type: type[BaseException] | None,
|
||||
exc_val: BaseException | None,
|
||||
exc_tb: TracebackType | None,
|
||||
) -> None:
|
||||
logger.debug("Cleaning up temporary directory %s", self.tempdir)
|
||||
shutil.rmtree(self.tempdir, ignore_errors=True)
|
||||
|
||||
# ------------------------------------------------------------------
|
||||
# Core parsing interface
|
||||
# ------------------------------------------------------------------
|
||||
|
||||
def configure(self, context: ParserContext) -> None:
|
||||
pass
|
||||
|
||||
# ------------------------------------------------------------------
|
||||
# Result accessors
|
||||
# ------------------------------------------------------------------
|
||||
|
||||
def get_text(self) -> str | None:
|
||||
return self.text
|
||||
|
||||
def get_date(self) -> datetime.datetime | None:
|
||||
return self.date
|
||||
|
||||
def get_archive_path(self) -> Path | None:
|
||||
return self.archive_path
|
||||
|
||||
# ------------------------------------------------------------------
|
||||
# Thumbnail, page count, and metadata
|
||||
# ------------------------------------------------------------------
|
||||
|
||||
def get_thumbnail(self, document_path: Path, mime_type: str) -> Path:
|
||||
return make_thumbnail_from_pdf(
|
||||
self.archive_path or document_path,
|
||||
self.archive_path or Path(document_path),
|
||||
self.tempdir,
|
||||
self.logging_group,
|
||||
)
|
||||
|
||||
def is_image(self, mime_type) -> bool:
|
||||
def get_page_count(self, document_path: Path, mime_type: str) -> int | None:
|
||||
if mime_type == "application/pdf":
|
||||
from paperless.parsers.utils import get_page_count_for_pdf
|
||||
|
||||
return get_page_count_for_pdf(Path(document_path), log=self.log)
|
||||
return None
|
||||
|
||||
def extract_metadata(
|
||||
self,
|
||||
document_path: Path,
|
||||
mime_type: str,
|
||||
) -> list[MetadataEntry]:
|
||||
if mime_type != "application/pdf":
|
||||
return []
|
||||
|
||||
from paperless.parsers.utils import extract_pdf_metadata
|
||||
|
||||
return extract_pdf_metadata(Path(document_path), log=self.log)
|
||||
|
||||
def is_image(self, mime_type: str) -> bool:
|
||||
return mime_type in [
|
||||
"image/png",
|
||||
"image/jpeg",
|
||||
@@ -111,25 +182,25 @@ class RasterisedDocumentParser(DocumentParser):
|
||||
"image/heic",
|
||||
]
|
||||
|
||||
def has_alpha(self, image) -> bool:
|
||||
def has_alpha(self, image: Path) -> bool:
|
||||
with Image.open(image) as im:
|
||||
return im.mode in ("RGBA", "LA")
|
||||
|
||||
def remove_alpha(self, image_path: str) -> Path:
|
||||
def remove_alpha(self, image_path: Path) -> Path:
|
||||
no_alpha_image = Path(self.tempdir) / "image-no-alpha"
|
||||
run_subprocess(
|
||||
[
|
||||
settings.CONVERT_BINARY,
|
||||
"-alpha",
|
||||
"off",
|
||||
image_path,
|
||||
no_alpha_image,
|
||||
str(image_path),
|
||||
str(no_alpha_image),
|
||||
],
|
||||
logger=self.log,
|
||||
)
|
||||
return no_alpha_image
|
||||
|
||||
def get_dpi(self, image) -> int | None:
|
||||
def get_dpi(self, image: Path) -> int | None:
|
||||
try:
|
||||
with Image.open(image) as im:
|
||||
x, _ = im.info["dpi"]
|
||||
@@ -138,7 +209,7 @@ class RasterisedDocumentParser(DocumentParser):
|
||||
self.log.warning(f"Error while getting DPI from image {image}: {e}")
|
||||
return None
|
||||
|
||||
def calculate_a4_dpi(self, image) -> int | None:
|
||||
def calculate_a4_dpi(self, image: Path) -> int | None:
|
||||
try:
|
||||
with Image.open(image) as im:
|
||||
width, _ = im.size
|
||||
@@ -156,6 +227,7 @@ class RasterisedDocumentParser(DocumentParser):
|
||||
sidecar_file: Path | None,
|
||||
pdf_file: Path,
|
||||
) -> str | None:
|
||||
text: str | None = None
|
||||
# When re-doing OCR, the sidecar contains ONLY the new text, not
|
||||
# the whole text, so do not utilize it in that case
|
||||
if (
|
||||
@@ -163,7 +235,7 @@ class RasterisedDocumentParser(DocumentParser):
|
||||
and sidecar_file.is_file()
|
||||
and self.settings.mode != "redo"
|
||||
):
|
||||
text = self.read_file_handle_unicode_errors(sidecar_file)
|
||||
text = read_file_handle_unicode_errors(sidecar_file)
|
||||
|
||||
if "[OCR skipped on page" not in text:
|
||||
# This happens when there's already text in the input file.
|
||||
@@ -191,12 +263,12 @@ class RasterisedDocumentParser(DocumentParser):
|
||||
"-layout",
|
||||
"-enc",
|
||||
"UTF-8",
|
||||
pdf_file,
|
||||
str(pdf_file),
|
||||
tmp.name,
|
||||
],
|
||||
logger=self.log,
|
||||
)
|
||||
text = self.read_file_handle_unicode_errors(Path(tmp.name))
|
||||
text = read_file_handle_unicode_errors(Path(tmp.name))
|
||||
|
||||
return post_process_text(text)
|
||||
|
||||
@@ -211,16 +283,14 @@ class RasterisedDocumentParser(DocumentParser):
|
||||
|
||||
def construct_ocrmypdf_parameters(
|
||||
self,
|
||||
input_file,
|
||||
mime_type,
|
||||
output_file,
|
||||
sidecar_file,
|
||||
input_file: Path,
|
||||
mime_type: str,
|
||||
output_file: Path,
|
||||
sidecar_file: Path,
|
||||
*,
|
||||
safe_fallback=False,
|
||||
):
|
||||
if TYPE_CHECKING:
|
||||
assert isinstance(self.settings, OcrConfig)
|
||||
ocrmypdf_args = {
|
||||
safe_fallback: bool = False,
|
||||
) -> dict[str, Any]:
|
||||
ocrmypdf_args: dict[str, Any] = {
|
||||
"input_file_or_options": input_file,
|
||||
"output_file": output_file,
|
||||
# need to use threads, since this will be run in daemonized
|
||||
@@ -330,7 +400,13 @@ class RasterisedDocumentParser(DocumentParser):
|
||||
|
||||
return ocrmypdf_args
|
||||
|
||||
def parse(self, document_path: Path, mime_type, file_name=None) -> None:
|
||||
def parse(
|
||||
self,
|
||||
document_path: Path,
|
||||
mime_type: str,
|
||||
*,
|
||||
produce_archive: bool = True,
|
||||
) -> None:
|
||||
# This forces tesseract to use one core per page.
|
||||
os.environ["OMP_THREAD_LIMIT"] = "1"
|
||||
VALID_TEXT_LENGTH = 50
|
||||
@@ -458,7 +534,7 @@ class RasterisedDocumentParser(DocumentParser):
|
||||
self.text = ""
|
||||
|
||||
|
||||
def post_process_text(text):
|
||||
def post_process_text(text: str | None) -> str | None:
|
||||
if not text:
|
||||
return None
|
||||
|
||||
@@ -27,6 +27,7 @@ if TYPE_CHECKING:
|
||||
from types import TracebackType
|
||||
|
||||
from paperless.parsers import MetadataEntry
|
||||
from paperless.parsers import ParserContext
|
||||
|
||||
logger = logging.getLogger("paperless.parsing.text")
|
||||
|
||||
@@ -156,6 +157,9 @@ class TextDocumentParser:
|
||||
# Core parsing interface
|
||||
# ------------------------------------------------------------------
|
||||
|
||||
def configure(self, context: ParserContext) -> None:
|
||||
pass
|
||||
|
||||
def parse(
|
||||
self,
|
||||
document_path: Path,
|
||||
|
||||
452
src/paperless/parsers/tika.py
Normal file
@@ -0,0 +1,452 @@
|
||||
"""
|
||||
Built-in Tika document parser.
|
||||
|
||||
Handles Office documents (DOCX, ODT, XLS, XLSX, PPT, PPTX, RTF, etc.) by
|
||||
sending them to an Apache Tika server for text extraction and a Gotenberg
|
||||
server for PDF conversion. Because the source formats cannot be rendered by
|
||||
a browser natively, the parser always produces a PDF rendition for display.
|
||||
"""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
import logging
|
||||
import shutil
|
||||
import tempfile
|
||||
from contextlib import ExitStack
|
||||
from pathlib import Path
|
||||
from typing import TYPE_CHECKING
|
||||
from typing import Self
|
||||
|
||||
import httpx
|
||||
from django.conf import settings
|
||||
from django.utils import timezone
|
||||
from gotenberg_client import GotenbergClient
|
||||
from gotenberg_client.options import PdfAFormat
|
||||
from tika_client import TikaClient
|
||||
|
||||
from documents.parsers import ParseError
|
||||
from documents.parsers import make_thumbnail_from_pdf
|
||||
from paperless.config import OutputTypeConfig
|
||||
from paperless.models import OutputTypeChoices
|
||||
from paperless.version import __full_version_str__
|
||||
|
||||
if TYPE_CHECKING:
|
||||
import datetime
|
||||
from types import TracebackType
|
||||
|
||||
from paperless.parsers import MetadataEntry
|
||||
from paperless.parsers import ParserContext
|
||||
|
||||
logger = logging.getLogger("paperless.parsing.tika")
|
||||
|
||||
_SUPPORTED_MIME_TYPES: dict[str, str] = {
|
||||
"application/msword": ".doc",
|
||||
"application/vnd.openxmlformats-officedocument.wordprocessingml.document": ".docx",
|
||||
"application/vnd.ms-excel": ".xls",
|
||||
"application/vnd.openxmlformats-officedocument.spreadsheetml.sheet": ".xlsx",
|
||||
"application/vnd.ms-powerpoint": ".ppt",
|
||||
"application/vnd.openxmlformats-officedocument.presentationml.presentation": ".pptx",
|
||||
"application/vnd.openxmlformats-officedocument.presentationml.slideshow": ".ppsx",
|
||||
"application/vnd.oasis.opendocument.presentation": ".odp",
|
||||
"application/vnd.oasis.opendocument.spreadsheet": ".ods",
|
||||
"application/vnd.oasis.opendocument.text": ".odt",
|
||||
"application/vnd.oasis.opendocument.graphics": ".odg",
|
||||
"text/rtf": ".rtf",
|
||||
}
|
||||
|
||||
|
||||
class TikaDocumentParser:
|
||||
"""Parse Office documents via Apache Tika and Gotenberg for Paperless-ngx.
|
||||
|
||||
Text extraction is handled by the Tika server. PDF conversion for display
|
||||
is handled by Gotenberg (LibreOffice route). Because the source formats
|
||||
cannot be rendered by a browser natively, ``requires_pdf_rendition`` is
|
||||
True and the PDF is always produced regardless of the ``produce_archive``
|
||||
flag passed to ``parse``.
|
||||
|
||||
Both ``TikaClient`` and ``GotenbergClient`` are opened once in
|
||||
``__enter__`` via an ``ExitStack`` and shared across ``parse``,
|
||||
``extract_metadata``, and ``_convert_to_pdf`` calls, then closed via
|
||||
``ExitStack.close()`` in ``__exit__``. The parser must always be used
|
||||
as a context manager.
|
||||
|
||||
Class attributes
|
||||
----------------
|
||||
name : str
|
||||
Human-readable parser name.
|
||||
version : str
|
||||
Semantic version string, kept in sync with Paperless-ngx releases.
|
||||
author : str
|
||||
Maintainer name.
|
||||
url : str
|
||||
Issue tracker / source URL.
|
||||
"""
|
||||
|
||||
name: str = "Paperless-ngx Tika Parser"
|
||||
version: str = __full_version_str__
|
||||
author: str = "Paperless-ngx Contributors"
|
||||
url: str = "https://github.com/paperless-ngx/paperless-ngx"
|
||||
|
||||
# ------------------------------------------------------------------
|
||||
# Class methods
|
||||
# ------------------------------------------------------------------
|
||||
|
||||
@classmethod
|
||||
def supported_mime_types(cls) -> dict[str, str]:
|
||||
"""Return the MIME types this parser handles.
|
||||
|
||||
Returns
|
||||
-------
|
||||
dict[str, str]
|
||||
Mapping of MIME type to preferred file extension.
|
||||
"""
|
||||
return _SUPPORTED_MIME_TYPES
|
||||
|
||||
@classmethod
|
||||
def score(
|
||||
cls,
|
||||
mime_type: str,
|
||||
filename: str,
|
||||
path: Path | None = None,
|
||||
) -> int | None:
|
||||
"""Return the priority score for handling this file.
|
||||
|
||||
Returns ``None`` when Tika integration is disabled so the registry
|
||||
skips this parser entirely.
|
||||
|
||||
Parameters
|
||||
----------
|
||||
mime_type:
|
||||
Detected MIME type of the file.
|
||||
filename:
|
||||
Original filename including extension.
|
||||
path:
|
||||
Optional filesystem path. Not inspected by this parser.
|
||||
|
||||
Returns
|
||||
-------
|
||||
int | None
|
||||
10 if TIKA_ENABLED and the MIME type is supported, otherwise None.
|
||||
"""
|
||||
if not settings.TIKA_ENABLED:
|
||||
return None
|
||||
if mime_type in _SUPPORTED_MIME_TYPES:
|
||||
return 10
|
||||
return None
|
||||
|
||||
# ------------------------------------------------------------------
|
||||
# Properties
|
||||
# ------------------------------------------------------------------
|
||||
|
||||
@property
|
||||
def can_produce_archive(self) -> bool:
|
||||
"""Whether this parser can produce a searchable PDF archive copy.
|
||||
|
||||
Returns
|
||||
-------
|
||||
bool
|
||||
Always False — Tika produces a display PDF, not an OCR archive.
|
||||
"""
|
||||
return False
|
||||
|
||||
@property
|
||||
def requires_pdf_rendition(self) -> bool:
|
||||
"""Whether the parser must produce a PDF for the frontend to display.
|
||||
|
||||
Returns
|
||||
-------
|
||||
bool
|
||||
Always True — Office formats cannot be rendered natively in a
|
||||
browser, so a PDF conversion is always required for display.
|
||||
"""
|
||||
return True
|
||||
|
||||
# ------------------------------------------------------------------
|
||||
# Lifecycle
|
||||
# ------------------------------------------------------------------
|
||||
|
||||
def __init__(self, logging_group: object = None) -> None:
|
||||
settings.SCRATCH_DIR.mkdir(parents=True, exist_ok=True)
|
||||
self._tempdir = Path(
|
||||
tempfile.mkdtemp(prefix="paperless-", dir=settings.SCRATCH_DIR),
|
||||
)
|
||||
self._text: str | None = None
|
||||
self._date: datetime.datetime | None = None
|
||||
self._archive_path: Path | None = None
|
||||
self._exit_stack = ExitStack()
|
||||
self._tika_client: TikaClient | None = None
|
||||
self._gotenberg_client: GotenbergClient | None = None
|
||||
|
||||
def __enter__(self) -> Self:
|
||||
self._tika_client = self._exit_stack.enter_context(
|
||||
TikaClient(
|
||||
tika_url=settings.TIKA_ENDPOINT,
|
||||
timeout=settings.CELERY_TASK_TIME_LIMIT,
|
||||
),
|
||||
)
|
||||
self._gotenberg_client = self._exit_stack.enter_context(
|
||||
GotenbergClient(
|
||||
host=settings.TIKA_GOTENBERG_ENDPOINT,
|
||||
timeout=settings.CELERY_TASK_TIME_LIMIT,
|
||||
),
|
||||
)
|
||||
return self
|
||||
|
||||
def __exit__(
|
||||
self,
|
||||
exc_type: type[BaseException] | None,
|
||||
exc_val: BaseException | None,
|
||||
exc_tb: TracebackType | None,
|
||||
) -> None:
|
||||
self._exit_stack.close()
|
||||
logger.debug("Cleaning up temporary directory %s", self._tempdir)
|
||||
shutil.rmtree(self._tempdir, ignore_errors=True)
|
||||
|
||||
# ------------------------------------------------------------------
|
||||
# Core parsing interface
|
||||
# ------------------------------------------------------------------
|
||||
|
||||
def configure(self, context: ParserContext) -> None:
|
||||
pass
|
||||
|
||||
def parse(
|
||||
self,
|
||||
document_path: Path,
|
||||
mime_type: str,
|
||||
*,
|
||||
produce_archive: bool = True,
|
||||
) -> None:
|
||||
"""Send the document to Tika for text extraction and Gotenberg for PDF.
|
||||
|
||||
Because ``requires_pdf_rendition`` is True the PDF conversion is
|
||||
always performed — the ``produce_archive`` flag is intentionally
|
||||
ignored.
|
||||
|
||||
Parameters
|
||||
----------
|
||||
document_path:
|
||||
Absolute path to the document file to parse.
|
||||
mime_type:
|
||||
Detected MIME type of the document.
|
||||
produce_archive:
|
||||
Accepted for protocol compatibility but ignored; the PDF rendition
|
||||
is always produced since the source format cannot be displayed
|
||||
natively in the browser.
|
||||
|
||||
Raises
|
||||
------
|
||||
documents.parsers.ParseError
|
||||
If Tika or Gotenberg returns an error.
|
||||
"""
|
||||
if TYPE_CHECKING:
|
||||
assert self._tika_client is not None
|
||||
|
||||
logger.info("Sending %s to Tika server", document_path)
|
||||
|
||||
try:
|
||||
try:
|
||||
parsed = self._tika_client.tika.as_text.from_file(
|
||||
document_path,
|
||||
mime_type,
|
||||
)
|
||||
except httpx.HTTPStatusError as err:
|
||||
# Workaround https://issues.apache.org/jira/browse/TIKA-4110
|
||||
# Tika fails with some files as multi-part form data
|
||||
if err.response.status_code == httpx.codes.INTERNAL_SERVER_ERROR:
|
||||
parsed = self._tika_client.tika.as_text.from_buffer(
|
||||
document_path.read_bytes(),
|
||||
mime_type,
|
||||
)
|
||||
else: # pragma: no cover
|
||||
raise
|
||||
except Exception as err:
|
||||
raise ParseError(
|
||||
f"Could not parse {document_path} with tika server at "
|
||||
f"{settings.TIKA_ENDPOINT}: {err}",
|
||||
) from err
|
||||
|
||||
self._text = parsed.content
|
||||
if self._text is not None:
|
||||
self._text = self._text.strip()
|
||||
|
||||
self._date = parsed.created
|
||||
if self._date is not None and timezone.is_naive(self._date):
|
||||
self._date = timezone.make_aware(self._date)
|
||||
|
||||
# Always convert — requires_pdf_rendition=True means the browser
|
||||
# cannot display the source format natively.
|
||||
self._archive_path = self._convert_to_pdf(document_path)
|
||||
|
||||
# ------------------------------------------------------------------
|
||||
# Result accessors
|
||||
# ------------------------------------------------------------------
|
||||
|
||||
def get_text(self) -> str | None:
|
||||
"""Return the plain-text content extracted during parse.
|
||||
|
||||
Returns
|
||||
-------
|
||||
str | None
|
||||
Extracted text, or None if parse has not been called yet.
|
||||
"""
|
||||
return self._text
|
||||
|
||||
def get_date(self) -> datetime.datetime | None:
|
||||
"""Return the document date detected during parse.
|
||||
|
||||
Returns
|
||||
-------
|
||||
datetime.datetime | None
|
||||
Creation date from Tika metadata, or None if not detected.
|
||||
"""
|
||||
return self._date
|
||||
|
||||
def get_archive_path(self) -> Path | None:
|
||||
"""Return the path to the generated PDF rendition, or None.
|
||||
|
||||
Returns
|
||||
-------
|
||||
Path | None
|
||||
Path to the PDF produced by Gotenberg, or None if parse has not
|
||||
been called yet.
|
||||
"""
|
||||
return self._archive_path
|
||||
|
||||
# ------------------------------------------------------------------
|
||||
# Thumbnail and metadata
|
||||
# ------------------------------------------------------------------
|
||||
|
||||
def get_thumbnail(self, document_path: Path, mime_type: str) -> Path:
|
||||
"""Generate a thumbnail from the PDF rendition of the document.
|
||||
|
||||
Converts the document to PDF first if not already done.
|
||||
|
||||
Parameters
|
||||
----------
|
||||
document_path:
|
||||
Absolute path to the source document.
|
||||
mime_type:
|
||||
Detected MIME type of the document.
|
||||
|
||||
Returns
|
||||
-------
|
||||
Path
|
||||
Path to the generated WebP thumbnail inside the temporary directory.
|
||||
"""
|
||||
if self._archive_path is None:
|
||||
self._archive_path = self._convert_to_pdf(document_path)
|
||||
return make_thumbnail_from_pdf(self._archive_path, self._tempdir)
|
||||
|
||||
def get_page_count(
|
||||
self,
|
||||
document_path: Path,
|
||||
mime_type: str,
|
||||
) -> int | None:
|
||||
"""Return the number of pages in the document.
|
||||
|
||||
Counts pages in the archive PDF produced by a preceding parse()
|
||||
call. Returns ``None`` if parse() has not been called yet or if
|
||||
no archive was produced.
|
||||
|
||||
Returns
|
||||
-------
|
||||
int | None
|
||||
Page count of the archive PDF, or ``None``.
|
||||
"""
|
||||
if self._archive_path is not None:
|
||||
from paperless.parsers.utils import get_page_count_for_pdf
|
||||
|
||||
return get_page_count_for_pdf(self._archive_path, log=logger)
|
||||
return None
|
||||
|
||||
def extract_metadata(
|
||||
self,
|
||||
document_path: Path,
|
||||
mime_type: str,
|
||||
) -> list[MetadataEntry]:
|
||||
"""Extract format-specific metadata via the Tika metadata endpoint.
|
||||
|
||||
Returns
|
||||
-------
|
||||
list[MetadataEntry]
|
||||
All key/value pairs returned by Tika, or ``[]`` on error.
|
||||
"""
|
||||
if TYPE_CHECKING:
|
||||
assert self._tika_client is not None
|
||||
|
||||
try:
|
||||
parsed = self._tika_client.metadata.from_file(document_path, mime_type)
|
||||
return [
|
||||
{
|
||||
"namespace": "",
|
||||
"prefix": "",
|
||||
"key": key,
|
||||
"value": parsed.data[key],
|
||||
}
|
||||
for key in parsed.data
|
||||
]
|
||||
except Exception as e:
|
||||
logger.warning(
|
||||
"Error while fetching document metadata for %s: %s",
|
||||
document_path,
|
||||
e,
|
||||
)
|
||||
return []
|
||||
|
||||
# ------------------------------------------------------------------
|
||||
# Private helpers
|
||||
# ------------------------------------------------------------------
|
||||
|
||||
def _convert_to_pdf(self, document_path: Path) -> Path:
|
||||
"""Convert the document to PDF using Gotenberg's LibreOffice route.
|
||||
|
||||
Parameters
|
||||
----------
|
||||
document_path:
|
||||
Absolute path to the source document.
|
||||
|
||||
Returns
|
||||
-------
|
||||
Path
|
||||
Path to the generated PDF inside the temporary directory.
|
||||
|
||||
Raises
|
||||
------
|
||||
documents.parsers.ParseError
|
||||
If Gotenberg returns an error.
|
||||
"""
|
||||
if TYPE_CHECKING:
|
||||
assert self._gotenberg_client is not None
|
||||
|
||||
pdf_path = self._tempdir / "convert.pdf"
|
||||
|
||||
logger.info("Converting %s to PDF as %s", document_path, pdf_path)
|
||||
|
||||
with self._gotenberg_client.libre_office.to_pdf() as route:
|
||||
# Set the output format of the resulting PDF.
|
||||
# OutputTypeConfig reads the database-stored ApplicationConfiguration
|
||||
# first, then falls back to the PAPERLESS_OCR_OUTPUT_TYPE env var.
|
||||
output_type = OutputTypeConfig().output_type
|
||||
if output_type in {
|
||||
OutputTypeChoices.PDF_A,
|
||||
OutputTypeChoices.PDF_A2,
|
||||
}:
|
||||
route.pdf_format(PdfAFormat.A2b)
|
||||
elif output_type == OutputTypeChoices.PDF_A1:
|
||||
logger.warning(
|
||||
"Gotenberg does not support PDF/A-1a, choosing PDF/A-2b instead",
|
||||
)
|
||||
route.pdf_format(PdfAFormat.A2b)
|
||||
elif output_type == OutputTypeChoices.PDF_A3:
|
||||
route.pdf_format(PdfAFormat.A3b)
|
||||
|
||||
route.convert(document_path)
|
||||
|
||||
try:
|
||||
response = route.run()
|
||||
pdf_path.write_bytes(response.content)
|
||||
return pdf_path
|
||||
except Exception as err:
|
||||
raise ParseError(
|
||||
f"Error while converting document to PDF: {err}",
|
||||
) from err
|
||||
158
src/paperless/parsers/utils.py
Normal file
@@ -0,0 +1,158 @@
|
||||
"""
|
||||
Shared utilities for Paperless-ngx document parsers.
|
||||
|
||||
Functions here are format-neutral helpers that multiple parsers need.
|
||||
Keeping them here avoids parsers inheriting from each other just to
|
||||
share implementation.
|
||||
"""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
import logging
|
||||
import re
|
||||
from typing import TYPE_CHECKING
|
||||
|
||||
if TYPE_CHECKING:
|
||||
from pathlib import Path
|
||||
|
||||
from paperless.parsers import MetadataEntry
|
||||
|
||||
logger = logging.getLogger("paperless.parsers.utils")
|
||||
|
||||
|
||||
def read_file_handle_unicode_errors(
|
||||
filepath: Path,
|
||||
log: logging.Logger | None = None,
|
||||
) -> str:
|
||||
"""Read a file as UTF-8 text, replacing invalid bytes rather than raising.
|
||||
|
||||
Parameters
|
||||
----------
|
||||
filepath:
|
||||
Absolute path to the file to read.
|
||||
log:
|
||||
Logger to use for warnings. Falls back to the module-level logger
|
||||
when omitted.
|
||||
|
||||
Returns
|
||||
-------
|
||||
str
|
||||
File content as a string, with any invalid UTF-8 sequences replaced
|
||||
by the Unicode replacement character.
|
||||
"""
|
||||
_log = log or logger
|
||||
try:
|
||||
return filepath.read_text(encoding="utf-8")
|
||||
except UnicodeDecodeError as e:
|
||||
_log.warning("Unicode error during text reading, continuing: %s", e)
|
||||
return filepath.read_bytes().decode("utf-8", errors="replace")
|
||||
|
||||
|
||||
def get_page_count_for_pdf(
|
||||
document_path: Path,
|
||||
log: logging.Logger | None = None,
|
||||
) -> int | None:
|
||||
"""Return the number of pages in a PDF file using pikepdf.
|
||||
|
||||
Parameters
|
||||
----------
|
||||
document_path:
|
||||
Absolute path to the PDF file.
|
||||
log:
|
||||
Logger to use for warnings. Falls back to the module-level logger
|
||||
when omitted.
|
||||
|
||||
Returns
|
||||
-------
|
||||
int | None
|
||||
Page count, or ``None`` if the file cannot be opened or is not a
|
||||
valid PDF.
|
||||
"""
|
||||
import pikepdf
|
||||
|
||||
_log = log or logger
|
||||
|
||||
try:
|
||||
with pikepdf.Pdf.open(document_path) as pdf:
|
||||
return len(pdf.pages)
|
||||
except Exception as e:
|
||||
_log.warning("Unable to determine PDF page count for %s: %s", document_path, e)
|
||||
return None
|
||||
|
||||
|
||||
def extract_pdf_metadata(
|
||||
document_path: Path,
|
||||
log: logging.Logger | None = None,
|
||||
) -> list[MetadataEntry]:
|
||||
"""Extract XMP/PDF metadata from a PDF file using pikepdf.
|
||||
|
||||
Reads all XMP metadata entries from the document and returns them as a
|
||||
list of ``MetadataEntry`` dicts. The method never raises — any failure
|
||||
to open the file or read a specific key is logged and skipped.
|
||||
|
||||
Parameters
|
||||
----------
|
||||
document_path:
|
||||
Absolute path to the PDF file.
|
||||
log:
|
||||
Logger to use for warnings and debug messages. Falls back to the
|
||||
module-level logger when omitted.
|
||||
|
||||
Returns
|
||||
-------
|
||||
list[MetadataEntry]
|
||||
Zero or more metadata entries. Returns ``[]`` if the file cannot
|
||||
be opened or contains no readable XMP metadata.
|
||||
"""
|
||||
import pikepdf
|
||||
|
||||
from paperless.parsers import MetadataEntry
|
||||
|
||||
_log = log or logger
|
||||
result: list[MetadataEntry] = []
|
||||
namespace_pattern = re.compile(r"\{(.*)\}(.*)")
|
||||
|
||||
try:
|
||||
pdf = pikepdf.open(document_path)
|
||||
meta = pdf.open_metadata()
|
||||
except Exception as e:
|
||||
_log.warning("Could not open PDF metadata for %s: %s", document_path, e)
|
||||
return []
|
||||
|
||||
for key, value in meta.items():
|
||||
if isinstance(value, list):
|
||||
value = " ".join(str(e) for e in value)
|
||||
value = str(value)
|
||||
|
||||
try:
|
||||
m = namespace_pattern.match(key)
|
||||
if m is None:
|
||||
continue
|
||||
|
||||
namespace = m.group(1)
|
||||
key_value = m.group(2)
|
||||
|
||||
try:
|
||||
namespace.encode("utf-8")
|
||||
key_value.encode("utf-8")
|
||||
except UnicodeEncodeError as enc_err:
|
||||
_log.debug("Skipping metadata key %s: %s", key, enc_err)
|
||||
continue
|
||||
|
||||
result.append(
|
||||
MetadataEntry(
|
||||
namespace=namespace,
|
||||
prefix=meta.REVERSE_NS[namespace],
|
||||
key=key_value,
|
||||
value=value,
|
||||
),
|
||||
)
|
||||
except Exception as e:
|
||||
_log.warning(
|
||||
"Error reading metadata key %s value %s: %s",
|
||||
key,
|
||||
value,
|
||||
e,
|
||||
)
|
||||
|
||||
return result
|
||||
@@ -248,7 +248,9 @@ class ApplicationConfigurationSerializer(serializers.ModelSerializer):
|
||||
allow_internal=settings.LLM_ALLOW_INTERNAL_ENDPOINTS,
|
||||
)
|
||||
except ValueError as e:
|
||||
raise serializers.ValidationError(str(e)) from e
|
||||
raise serializers.ValidationError(
|
||||
f"Invalid LLM endpoint: {e.args[0]}, see logs for details",
|
||||
) from e
|
||||
|
||||
return value
|
||||
|
||||
|
||||
@@ -98,11 +98,6 @@ CONSUMPTION_DIR = get_path_from_env(
|
||||
BASE_DIR.parent / "consume",
|
||||
)
|
||||
|
||||
CONSUMPTION_FAILED_DIR = get_path_from_env(
|
||||
"PAPERLESS_CONSUMPTION_FAILED_DIR",
|
||||
CONSUMPTION_DIR / "failed",
|
||||
)
|
||||
|
||||
# This will be created if it doesn't exist
|
||||
SCRATCH_DIR = get_path_from_env(
|
||||
"PAPERLESS_SCRATCH_DIR",
|
||||
@@ -126,10 +121,7 @@ INSTALLED_APPS = [
|
||||
"django_extensions",
|
||||
"paperless",
|
||||
"documents.apps.DocumentsConfig",
|
||||
"paperless_tesseract.apps.PaperlessTesseractConfig",
|
||||
"paperless_text.apps.PaperlessTextConfig",
|
||||
"paperless_mail.apps.PaperlessMailConfig",
|
||||
"paperless_remote.apps.PaperlessRemoteParserConfig",
|
||||
"django.contrib.admin",
|
||||
"rest_framework",
|
||||
"rest_framework.authtoken",
|
||||
@@ -787,8 +779,6 @@ CONSUMER_IGNORE_PATTERNS = list(
|
||||
),
|
||||
),
|
||||
)
|
||||
if CONSUMPTION_DIR in CONSUMPTION_FAILED_DIR.parents:
|
||||
CONSUMER_IGNORE_PATTERNS.append(CONSUMPTION_FAILED_DIR.name)
|
||||
|
||||
# Directories to always ignore. These are matched by directory name, not full path
|
||||
CONSUMER_IGNORE_DIRS = list(
|
||||
@@ -981,8 +971,8 @@ TIKA_GOTENBERG_ENDPOINT = os.getenv(
|
||||
"http://localhost:3000",
|
||||
)
|
||||
|
||||
if TIKA_ENABLED:
|
||||
INSTALLED_APPS.append("paperless_tika.apps.PaperlessTikaConfig")
|
||||
# Tika parser is now integrated into the main parser registry
|
||||
# No separate Django app needed
|
||||
|
||||
AUDIT_LOG_ENABLED = get_bool_from_env("PAPERLESS_AUDIT_LOG_ENABLED", "true")
|
||||
if AUDIT_LOG_ENABLED:
|
||||
|
||||
@@ -6,15 +6,29 @@ so it is easy to see which files belong to which test module.
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
from contextlib import contextmanager
|
||||
from typing import TYPE_CHECKING
|
||||
|
||||
import pytest
|
||||
from django.test import override_settings
|
||||
|
||||
from paperless.parsers.mail import MailDocumentParser
|
||||
from paperless.parsers.remote import RemoteDocumentParser
|
||||
from paperless.parsers.tesseract import RasterisedDocumentParser
|
||||
from paperless.parsers.text import TextDocumentParser
|
||||
from paperless.parsers.tika import TikaDocumentParser
|
||||
|
||||
if TYPE_CHECKING:
|
||||
from collections.abc import Callable
|
||||
from collections.abc import Generator
|
||||
from pathlib import Path
|
||||
from unittest.mock import MagicMock
|
||||
|
||||
from pytest_django.fixtures import SettingsWrapper
|
||||
from pytest_mock import MockerFixture
|
||||
|
||||
#: Type for the ``make_tesseract_parser`` fixture factory.
|
||||
MakeTesseractParser = Callable[..., Generator[RasterisedDocumentParser, None, None]]
|
||||
|
||||
|
||||
# ------------------------------------------------------------------
|
||||
@@ -74,3 +88,713 @@ def text_parser() -> Generator[TextDocumentParser, None, None]:
|
||||
"""
|
||||
with TextDocumentParser() as parser:
|
||||
yield parser
|
||||
|
||||
|
||||
# ------------------------------------------------------------------
|
||||
# Remote parser sample files
|
||||
# ------------------------------------------------------------------
|
||||
|
||||
|
||||
@pytest.fixture(scope="session")
|
||||
def remote_samples_dir(samples_dir: Path) -> Path:
|
||||
"""Absolute path to the remote parser sample files directory.
|
||||
|
||||
Returns
|
||||
-------
|
||||
Path
|
||||
``<samples_dir>/remote/``
|
||||
"""
|
||||
return samples_dir / "remote"
|
||||
|
||||
|
||||
@pytest.fixture(scope="session")
|
||||
def sample_pdf_file(remote_samples_dir: Path) -> Path:
|
||||
"""Path to a simple digital PDF sample file.
|
||||
|
||||
Returns
|
||||
-------
|
||||
Path
|
||||
Absolute path to ``remote/simple-digital.pdf``.
|
||||
"""
|
||||
return remote_samples_dir / "simple-digital.pdf"
|
||||
|
||||
|
||||
# ------------------------------------------------------------------
|
||||
# Remote parser instance
|
||||
# ------------------------------------------------------------------
|
||||
|
||||
|
||||
@pytest.fixture()
|
||||
def remote_parser() -> Generator[RemoteDocumentParser, None, None]:
|
||||
"""Yield a RemoteDocumentParser and clean up its temporary directory afterwards.
|
||||
|
||||
Yields
|
||||
------
|
||||
RemoteDocumentParser
|
||||
A ready-to-use parser instance.
|
||||
"""
|
||||
with RemoteDocumentParser() as parser:
|
||||
yield parser
|
||||
|
||||
|
||||
# ------------------------------------------------------------------
|
||||
# Remote parser settings helpers
|
||||
# ------------------------------------------------------------------
|
||||
|
||||
|
||||
@pytest.fixture()
|
||||
def azure_settings(settings: SettingsWrapper) -> SettingsWrapper:
|
||||
"""Configure Django settings for a valid Azure AI OCR engine.
|
||||
|
||||
Sets ``REMOTE_OCR_ENGINE``, ``REMOTE_OCR_API_KEY``, and
|
||||
``REMOTE_OCR_ENDPOINT`` to test values. Settings are restored
|
||||
automatically after the test by pytest-django.
|
||||
|
||||
Returns
|
||||
-------
|
||||
SettingsWrapper
|
||||
The modified settings object (for chaining further overrides).
|
||||
"""
|
||||
settings.REMOTE_OCR_ENGINE = "azureai"
|
||||
settings.REMOTE_OCR_API_KEY = "test-api-key"
|
||||
settings.REMOTE_OCR_ENDPOINT = "https://test.cognitiveservices.azure.com"
|
||||
return settings
|
||||
|
||||
|
||||
@pytest.fixture()
|
||||
def no_engine_settings(settings: SettingsWrapper) -> SettingsWrapper:
|
||||
"""Configure Django settings with no remote engine configured.
|
||||
|
||||
Returns
|
||||
-------
|
||||
SettingsWrapper
|
||||
The modified settings object.
|
||||
"""
|
||||
settings.REMOTE_OCR_ENGINE = None
|
||||
settings.REMOTE_OCR_API_KEY = None
|
||||
settings.REMOTE_OCR_ENDPOINT = None
|
||||
return settings
|
||||
|
||||
|
||||
# ------------------------------------------------------------------
|
||||
# Tika parser sample files
|
||||
# ------------------------------------------------------------------
|
||||
|
||||
|
||||
@pytest.fixture(scope="session")
|
||||
def tika_samples_dir(samples_dir: Path) -> Path:
|
||||
"""Absolute path to the Tika parser sample files directory.
|
||||
|
||||
Returns
|
||||
-------
|
||||
Path
|
||||
``<samples_dir>/tika/``
|
||||
"""
|
||||
return samples_dir / "tika"
|
||||
|
||||
|
||||
@pytest.fixture(scope="session")
|
||||
def sample_odt_file(tika_samples_dir: Path) -> Path:
|
||||
"""Path to a sample ODT file.
|
||||
|
||||
Returns
|
||||
-------
|
||||
Path
|
||||
Absolute path to ``tika/sample.odt``.
|
||||
"""
|
||||
return tika_samples_dir / "sample.odt"
|
||||
|
||||
|
||||
@pytest.fixture(scope="session")
|
||||
def sample_docx_file(tika_samples_dir: Path) -> Path:
|
||||
"""Path to a sample DOCX file.
|
||||
|
||||
Returns
|
||||
-------
|
||||
Path
|
||||
Absolute path to ``tika/sample.docx``.
|
||||
"""
|
||||
return tika_samples_dir / "sample.docx"
|
||||
|
||||
|
||||
@pytest.fixture(scope="session")
|
||||
def sample_doc_file(tika_samples_dir: Path) -> Path:
|
||||
"""Path to a sample DOC file.
|
||||
|
||||
Returns
|
||||
-------
|
||||
Path
|
||||
Absolute path to ``tika/sample.doc``.
|
||||
"""
|
||||
return tika_samples_dir / "sample.doc"
|
||||
|
||||
|
||||
@pytest.fixture(scope="session")
|
||||
def sample_broken_odt(tika_samples_dir: Path) -> Path:
|
||||
"""Path to a broken ODT file that triggers the multi-part fallback.
|
||||
|
||||
Returns
|
||||
-------
|
||||
Path
|
||||
Absolute path to ``tika/multi-part-broken.odt``.
|
||||
"""
|
||||
return tika_samples_dir / "multi-part-broken.odt"
|
||||
|
||||
|
||||
# ------------------------------------------------------------------
|
||||
# Tika parser instance
|
||||
# ------------------------------------------------------------------
|
||||
|
||||
|
||||
@pytest.fixture()
|
||||
def tika_parser() -> Generator[TikaDocumentParser, None, None]:
|
||||
"""Yield a TikaDocumentParser and clean up its temporary directory afterwards.
|
||||
|
||||
Yields
|
||||
------
|
||||
TikaDocumentParser
|
||||
A ready-to-use parser instance.
|
||||
"""
|
||||
with TikaDocumentParser() as parser:
|
||||
yield parser
|
||||
|
||||
|
||||
# ------------------------------------------------------------------
|
||||
# Mail parser sample files
|
||||
# ------------------------------------------------------------------
|
||||
|
||||
|
||||
@pytest.fixture(scope="session")
|
||||
def mail_samples_dir(samples_dir: Path) -> Path:
|
||||
"""Absolute path to the mail parser sample files directory.
|
||||
|
||||
Returns
|
||||
-------
|
||||
Path
|
||||
``<samples_dir>/mail/``
|
||||
"""
|
||||
return samples_dir / "mail"
|
||||
|
||||
|
||||
@pytest.fixture(scope="session")
|
||||
def broken_email_file(mail_samples_dir: Path) -> Path:
|
||||
"""Path to a broken/malformed EML sample file.
|
||||
|
||||
Returns
|
||||
-------
|
||||
Path
|
||||
Absolute path to ``mail/broken.eml``.
|
||||
"""
|
||||
return mail_samples_dir / "broken.eml"
|
||||
|
||||
|
||||
@pytest.fixture(scope="session")
|
||||
def simple_txt_email_file(mail_samples_dir: Path) -> Path:
|
||||
"""Path to a plain-text email sample file.
|
||||
|
||||
Returns
|
||||
-------
|
||||
Path
|
||||
Absolute path to ``mail/simple_text.eml``.
|
||||
"""
|
||||
return mail_samples_dir / "simple_text.eml"
|
||||
|
||||
|
||||
@pytest.fixture(scope="session")
|
||||
def simple_txt_email_pdf_file(mail_samples_dir: Path) -> Path:
|
||||
"""Path to the expected PDF rendition of the plain-text email.
|
||||
|
||||
Returns
|
||||
-------
|
||||
Path
|
||||
Absolute path to ``mail/simple_text.eml.pdf``.
|
||||
"""
|
||||
return mail_samples_dir / "simple_text.eml.pdf"
|
||||
|
||||
|
||||
@pytest.fixture(scope="session")
|
||||
def simple_txt_email_thumbnail_file(mail_samples_dir: Path) -> Path:
|
||||
"""Path to the expected thumbnail for the plain-text email.
|
||||
|
||||
Returns
|
||||
-------
|
||||
Path
|
||||
Absolute path to ``mail/simple_text.eml.pdf.webp``.
|
||||
"""
|
||||
return mail_samples_dir / "simple_text.eml.pdf.webp"
|
||||
|
||||
|
||||
@pytest.fixture(scope="session")
|
||||
def html_email_file(mail_samples_dir: Path) -> Path:
|
||||
"""Path to an HTML email sample file.
|
||||
|
||||
Returns
|
||||
-------
|
||||
Path
|
||||
Absolute path to ``mail/html.eml``.
|
||||
"""
|
||||
return mail_samples_dir / "html.eml"
|
||||
|
||||
|
||||
@pytest.fixture(scope="session")
|
||||
def html_email_pdf_file(mail_samples_dir: Path) -> Path:
|
||||
"""Path to the expected PDF rendition of the HTML email.
|
||||
|
||||
Returns
|
||||
-------
|
||||
Path
|
||||
Absolute path to ``mail/html.eml.pdf``.
|
||||
"""
|
||||
return mail_samples_dir / "html.eml.pdf"
|
||||
|
||||
|
||||
@pytest.fixture(scope="session")
|
||||
def html_email_thumbnail_file(mail_samples_dir: Path) -> Path:
|
||||
"""Path to the expected thumbnail for the HTML email.
|
||||
|
||||
Returns
|
||||
-------
|
||||
Path
|
||||
Absolute path to ``mail/html.eml.pdf.webp``.
|
||||
"""
|
||||
return mail_samples_dir / "html.eml.pdf.webp"
|
||||
|
||||
|
||||
@pytest.fixture(scope="session")
|
||||
def html_email_html_file(mail_samples_dir: Path) -> Path:
|
||||
"""Path to the HTML body of the HTML email sample.
|
||||
|
||||
Returns
|
||||
-------
|
||||
Path
|
||||
Absolute path to ``mail/html.eml.html``.
|
||||
"""
|
||||
return mail_samples_dir / "html.eml.html"
|
||||
|
||||
|
||||
@pytest.fixture(scope="session")
|
||||
def merged_pdf_first(mail_samples_dir: Path) -> Path:
|
||||
"""Path to the first PDF used in PDF-merge tests.
|
||||
|
||||
Returns
|
||||
-------
|
||||
Path
|
||||
Absolute path to ``mail/first.pdf``.
|
||||
"""
|
||||
return mail_samples_dir / "first.pdf"
|
||||
|
||||
|
||||
@pytest.fixture(scope="session")
|
||||
def merged_pdf_second(mail_samples_dir: Path) -> Path:
|
||||
"""Path to the second PDF used in PDF-merge tests.
|
||||
|
||||
Returns
|
||||
-------
|
||||
Path
|
||||
Absolute path to ``mail/second.pdf``.
|
||||
"""
|
||||
return mail_samples_dir / "second.pdf"
|
||||
|
||||
|
||||
# ------------------------------------------------------------------
|
||||
# Mail parser instance
|
||||
# ------------------------------------------------------------------
|
||||
|
||||
|
||||
@pytest.fixture()
|
||||
def mail_parser() -> Generator[MailDocumentParser, None, None]:
|
||||
"""Yield a MailDocumentParser and clean up its temporary directory afterwards.
|
||||
|
||||
Yields
|
||||
------
|
||||
MailDocumentParser
|
||||
A ready-to-use parser instance.
|
||||
"""
|
||||
with MailDocumentParser() as parser:
|
||||
yield parser
|
||||
|
||||
|
||||
@pytest.fixture(scope="session")
|
||||
def nginx_base_url() -> Generator[str, None, None]:
|
||||
"""
|
||||
The base URL for the nginx HTTP server we expect to be alive
|
||||
"""
|
||||
yield "http://localhost:8080"
|
||||
|
||||
|
||||
# ------------------------------------------------------------------
|
||||
# Tesseract parser sample files
|
||||
# ------------------------------------------------------------------
|
||||
|
||||
|
||||
@pytest.fixture(scope="session")
|
||||
def tesseract_samples_dir(samples_dir: Path) -> Path:
|
||||
"""Absolute path to the tesseract parser sample files directory.
|
||||
|
||||
Returns
|
||||
-------
|
||||
Path
|
||||
``<samples_dir>/tesseract/``
|
||||
"""
|
||||
return samples_dir / "tesseract"
|
||||
|
||||
|
||||
@pytest.fixture(scope="session")
|
||||
def document_webp_file(tesseract_samples_dir: Path) -> Path:
|
||||
"""Path to a WebP document sample file.
|
||||
|
||||
Returns
|
||||
-------
|
||||
Path
|
||||
Absolute path to ``tesseract/document.webp``.
|
||||
"""
|
||||
return tesseract_samples_dir / "document.webp"
|
||||
|
||||
|
||||
@pytest.fixture(scope="session")
|
||||
def encrypted_pdf_file(tesseract_samples_dir: Path) -> Path:
|
||||
"""Path to an encrypted PDF sample file.
|
||||
|
||||
Returns
|
||||
-------
|
||||
Path
|
||||
Absolute path to ``tesseract/encrypted.pdf``.
|
||||
"""
|
||||
return tesseract_samples_dir / "encrypted.pdf"
|
||||
|
||||
|
||||
@pytest.fixture(scope="session")
|
||||
def multi_page_digital_pdf_file(tesseract_samples_dir: Path) -> Path:
|
||||
"""Path to a multi-page digital PDF sample file.
|
||||
|
||||
Returns
|
||||
-------
|
||||
Path
|
||||
Absolute path to ``tesseract/multi-page-digital.pdf``.
|
||||
"""
|
||||
return tesseract_samples_dir / "multi-page-digital.pdf"
|
||||
|
||||
|
||||
@pytest.fixture(scope="session")
|
||||
def multi_page_images_alpha_rgb_tiff_file(tesseract_samples_dir: Path) -> Path:
|
||||
"""Path to a multi-page TIFF with alpha channel in RGB.
|
||||
|
||||
Returns
|
||||
-------
|
||||
Path
|
||||
Absolute path to ``tesseract/multi-page-images-alpha-rgb.tiff``.
|
||||
"""
|
||||
return tesseract_samples_dir / "multi-page-images-alpha-rgb.tiff"
|
||||
|
||||
|
||||
@pytest.fixture(scope="session")
|
||||
def multi_page_images_alpha_tiff_file(tesseract_samples_dir: Path) -> Path:
|
||||
"""Path to a multi-page TIFF with alpha channel.
|
||||
|
||||
Returns
|
||||
-------
|
||||
Path
|
||||
Absolute path to ``tesseract/multi-page-images-alpha.tiff``.
|
||||
"""
|
||||
return tesseract_samples_dir / "multi-page-images-alpha.tiff"
|
||||
|
||||
|
||||
@pytest.fixture(scope="session")
|
||||
def multi_page_images_pdf_file(tesseract_samples_dir: Path) -> Path:
|
||||
"""Path to a multi-page PDF with images.
|
||||
|
||||
Returns
|
||||
-------
|
||||
Path
|
||||
Absolute path to ``tesseract/multi-page-images.pdf``.
|
||||
"""
|
||||
return tesseract_samples_dir / "multi-page-images.pdf"
|
||||
|
||||
|
||||
@pytest.fixture(scope="session")
|
||||
def multi_page_images_tiff_file(tesseract_samples_dir: Path) -> Path:
|
||||
"""Path to a multi-page TIFF sample file.
|
||||
|
||||
Returns
|
||||
-------
|
||||
Path
|
||||
Absolute path to ``tesseract/multi-page-images.tiff``.
|
||||
"""
|
||||
return tesseract_samples_dir / "multi-page-images.tiff"
|
||||
|
||||
|
||||
@pytest.fixture(scope="session")
|
||||
def multi_page_mixed_pdf_file(tesseract_samples_dir: Path) -> Path:
|
||||
"""Path to a multi-page mixed PDF sample file.
|
||||
|
||||
Returns
|
||||
-------
|
||||
Path
|
||||
Absolute path to ``tesseract/multi-page-mixed.pdf``.
|
||||
"""
|
||||
return tesseract_samples_dir / "multi-page-mixed.pdf"
|
||||
|
||||
|
||||
@pytest.fixture(scope="session")
|
||||
def no_text_alpha_png_file(tesseract_samples_dir: Path) -> Path:
|
||||
"""Path to a PNG with alpha channel and no text.
|
||||
|
||||
Returns
|
||||
-------
|
||||
Path
|
||||
Absolute path to ``tesseract/no-text-alpha.png``.
|
||||
"""
|
||||
return tesseract_samples_dir / "no-text-alpha.png"
|
||||
|
||||
|
||||
@pytest.fixture(scope="session")
|
||||
def rotated_pdf_file(tesseract_samples_dir: Path) -> Path:
|
||||
"""Path to a rotated PDF sample file.
|
||||
|
||||
Returns
|
||||
-------
|
||||
Path
|
||||
Absolute path to ``tesseract/rotated.pdf``.
|
||||
"""
|
||||
return tesseract_samples_dir / "rotated.pdf"
|
||||
|
||||
|
||||
@pytest.fixture(scope="session")
|
||||
def rtl_test_pdf_file(tesseract_samples_dir: Path) -> Path:
|
||||
"""Path to an RTL test PDF sample file.
|
||||
|
||||
Returns
|
||||
-------
|
||||
Path
|
||||
Absolute path to ``tesseract/rtl-test.pdf``.
|
||||
"""
|
||||
return tesseract_samples_dir / "rtl-test.pdf"
|
||||
|
||||
|
||||
@pytest.fixture(scope="session")
|
||||
def signed_pdf_file(tesseract_samples_dir: Path) -> Path:
|
||||
"""Path to a signed PDF sample file.
|
||||
|
||||
Returns
|
||||
-------
|
||||
Path
|
||||
Absolute path to ``tesseract/signed.pdf``.
|
||||
"""
|
||||
return tesseract_samples_dir / "signed.pdf"
|
||||
|
||||
|
||||
@pytest.fixture(scope="session")
|
||||
def simple_alpha_png_file(tesseract_samples_dir: Path) -> Path:
|
||||
"""Path to a simple PNG with alpha channel.
|
||||
|
||||
Returns
|
||||
-------
|
||||
Path
|
||||
Absolute path to ``tesseract/simple-alpha.png``.
|
||||
"""
|
||||
return tesseract_samples_dir / "simple-alpha.png"
|
||||
|
||||
|
||||
@pytest.fixture(scope="session")
|
||||
def simple_digital_pdf_file(tesseract_samples_dir: Path) -> Path:
|
||||
"""Path to a simple digital PDF sample file.
|
||||
|
||||
Returns
|
||||
-------
|
||||
Path
|
||||
Absolute path to ``tesseract/simple-digital.pdf``.
|
||||
"""
|
||||
return tesseract_samples_dir / "simple-digital.pdf"
|
||||
|
||||
|
||||
@pytest.fixture(scope="session")
|
||||
def simple_no_dpi_png_file(tesseract_samples_dir: Path) -> Path:
|
||||
"""Path to a simple PNG without DPI information.
|
||||
|
||||
Returns
|
||||
-------
|
||||
Path
|
||||
Absolute path to ``tesseract/simple-no-dpi.png``.
|
||||
"""
|
||||
return tesseract_samples_dir / "simple-no-dpi.png"
|
||||
|
||||
|
||||
@pytest.fixture(scope="session")
|
||||
def simple_bmp_file(tesseract_samples_dir: Path) -> Path:
|
||||
"""Path to a simple BMP sample file.
|
||||
|
||||
Returns
|
||||
-------
|
||||
Path
|
||||
Absolute path to ``tesseract/simple.bmp``.
|
||||
"""
|
||||
return tesseract_samples_dir / "simple.bmp"
|
||||
|
||||
|
||||
@pytest.fixture(scope="session")
|
||||
def simple_gif_file(tesseract_samples_dir: Path) -> Path:
|
||||
"""Path to a simple GIF sample file.
|
||||
|
||||
Returns
|
||||
-------
|
||||
Path
|
||||
Absolute path to ``tesseract/simple.gif``.
|
||||
"""
|
||||
return tesseract_samples_dir / "simple.gif"
|
||||
|
||||
|
||||
@pytest.fixture(scope="session")
|
||||
def simple_heic_file(tesseract_samples_dir: Path) -> Path:
|
||||
"""Path to a simple HEIC sample file.
|
||||
|
||||
Returns
|
||||
-------
|
||||
Path
|
||||
Absolute path to ``tesseract/simple.heic``.
|
||||
"""
|
||||
return tesseract_samples_dir / "simple.heic"
|
||||
|
||||
|
||||
@pytest.fixture(scope="session")
|
||||
def simple_jpg_file(tesseract_samples_dir: Path) -> Path:
|
||||
"""Path to a simple JPG sample file.
|
||||
|
||||
Returns
|
||||
-------
|
||||
Path
|
||||
Absolute path to ``tesseract/simple.jpg``.
|
||||
"""
|
||||
return tesseract_samples_dir / "simple.jpg"
|
||||
|
||||
|
||||
@pytest.fixture(scope="session")
|
||||
def simple_png_file(tesseract_samples_dir: Path) -> Path:
|
||||
"""Path to a simple PNG sample file.
|
||||
|
||||
Returns
|
||||
-------
|
||||
Path
|
||||
Absolute path to ``tesseract/simple.png``.
|
||||
"""
|
||||
return tesseract_samples_dir / "simple.png"
|
||||
|
||||
|
||||
@pytest.fixture(scope="session")
|
||||
def simple_tif_file(tesseract_samples_dir: Path) -> Path:
|
||||
"""Path to a simple TIF sample file.
|
||||
|
||||
Returns
|
||||
-------
|
||||
Path
|
||||
Absolute path to ``tesseract/simple.tif``.
|
||||
"""
|
||||
return tesseract_samples_dir / "simple.tif"
|
||||
|
||||
|
||||
@pytest.fixture(scope="session")
|
||||
def single_page_mixed_pdf_file(tesseract_samples_dir: Path) -> Path:
|
||||
"""Path to a single-page mixed PDF sample file.
|
||||
|
||||
Returns
|
||||
-------
|
||||
Path
|
||||
Absolute path to ``tesseract/single-page-mixed.pdf``.
|
||||
"""
|
||||
return tesseract_samples_dir / "single-page-mixed.pdf"
|
||||
|
||||
|
||||
@pytest.fixture(scope="session")
|
||||
def with_form_pdf_file(tesseract_samples_dir: Path) -> Path:
|
||||
"""Path to a PDF with form sample file.
|
||||
|
||||
Returns
|
||||
-------
|
||||
Path
|
||||
Absolute path to ``tesseract/with-form.pdf``.
|
||||
"""
|
||||
return tesseract_samples_dir / "with-form.pdf"
|
||||
|
||||
|
||||
# ------------------------------------------------------------------
|
||||
# Tesseract parser instance and settings helpers
|
||||
# ------------------------------------------------------------------
|
||||
|
||||
|
||||
@pytest.fixture()
|
||||
def null_app_config(mocker: MockerFixture) -> MagicMock:
|
||||
"""Return a MagicMock with all OcrConfig fields set to None.
|
||||
|
||||
This allows the parser to fall back to Django settings instead of
|
||||
hitting the database.
|
||||
|
||||
Returns
|
||||
-------
|
||||
MagicMock
|
||||
Mock config with all fields as None
|
||||
"""
|
||||
return mocker.MagicMock(
|
||||
output_type=None,
|
||||
pages=None,
|
||||
language=None,
|
||||
mode=None,
|
||||
skip_archive_file=None,
|
||||
image_dpi=None,
|
||||
unpaper_clean=None,
|
||||
deskew=None,
|
||||
rotate_pages=None,
|
||||
rotate_pages_threshold=None,
|
||||
max_image_pixels=None,
|
||||
color_conversion_strategy=None,
|
||||
user_args=None,
|
||||
)
|
||||
|
||||
|
||||
@pytest.fixture()
|
||||
def tesseract_parser(
|
||||
mocker: MockerFixture,
|
||||
null_app_config: MagicMock,
|
||||
) -> Generator[RasterisedDocumentParser, None, None]:
|
||||
"""Yield a RasterisedDocumentParser and clean up its temporary directory afterwards.
|
||||
|
||||
Patches the config system to avoid database access.
|
||||
|
||||
Yields
|
||||
------
|
||||
RasterisedDocumentParser
|
||||
A ready-to-use parser instance.
|
||||
"""
|
||||
mocker.patch(
|
||||
"paperless.config.BaseConfig._get_config_instance",
|
||||
return_value=null_app_config,
|
||||
)
|
||||
with RasterisedDocumentParser() as parser:
|
||||
yield parser
|
||||
|
||||
|
||||
@pytest.fixture()
|
||||
def make_tesseract_parser(
|
||||
mocker: MockerFixture,
|
||||
null_app_config: MagicMock,
|
||||
) -> MakeTesseractParser:
|
||||
"""Return a factory for creating RasterisedDocumentParser with Django settings overrides.
|
||||
|
||||
This fixture is useful for tests that need to create parsers with different
|
||||
settings configurations.
|
||||
|
||||
Returns
|
||||
-------
|
||||
Callable[..., contextmanager[RasterisedDocumentParser]]
|
||||
A context manager factory that accepts Django settings overrides
|
||||
"""
|
||||
mocker.patch(
|
||||
"paperless.config.BaseConfig._get_config_instance",
|
||||
return_value=null_app_config,
|
||||
)
|
||||
|
||||
@contextmanager
|
||||
def _make_parser(**django_settings_overrides):
|
||||
with override_settings(**django_settings_overrides):
|
||||
with RasterisedDocumentParser() as parser:
|
||||
yield parser
|
||||
|
||||
return _make_parser
|
||||
|
||||
@@ -12,7 +12,64 @@ from pytest_httpx import HTTPXMock
|
||||
from pytest_mock import MockerFixture
|
||||
|
||||
from documents.parsers import ParseError
|
||||
from paperless_mail.parsers import MailDocumentParser
|
||||
from paperless.parsers import ParserContext
|
||||
from paperless.parsers import ParserProtocol
|
||||
from paperless.parsers.mail import MailDocumentParser
|
||||
|
||||
|
||||
class TestMailParserProtocol:
|
||||
"""Verify that MailDocumentParser satisfies the ParserProtocol contract."""
|
||||
|
||||
def test_isinstance_satisfies_protocol(
|
||||
self,
|
||||
mail_parser: MailDocumentParser,
|
||||
) -> None:
|
||||
assert isinstance(mail_parser, ParserProtocol)
|
||||
|
||||
def test_supported_mime_types(self) -> None:
|
||||
mime_types = MailDocumentParser.supported_mime_types()
|
||||
assert isinstance(mime_types, dict)
|
||||
assert "message/rfc822" in mime_types
|
||||
|
||||
@pytest.mark.parametrize(
|
||||
("mime_type", "expected"),
|
||||
[
|
||||
("message/rfc822", 10),
|
||||
("application/pdf", None),
|
||||
("text/plain", None),
|
||||
],
|
||||
)
|
||||
def test_score(self, mime_type: str, expected: int | None) -> None:
|
||||
assert MailDocumentParser.score(mime_type, "email.eml") == expected
|
||||
|
||||
def test_can_produce_archive_is_false(
|
||||
self,
|
||||
mail_parser: MailDocumentParser,
|
||||
) -> None:
|
||||
assert mail_parser.can_produce_archive is False
|
||||
|
||||
def test_requires_pdf_rendition_is_true(
|
||||
self,
|
||||
mail_parser: MailDocumentParser,
|
||||
) -> None:
|
||||
assert mail_parser.requires_pdf_rendition is True
|
||||
|
||||
def test_get_page_count_returns_none_without_archive(
|
||||
self,
|
||||
mail_parser: MailDocumentParser,
|
||||
html_email_file: Path,
|
||||
) -> None:
|
||||
assert mail_parser.get_page_count(html_email_file, "message/rfc822") is None
|
||||
|
||||
def test_get_page_count_returns_int_with_pdf_archive(
|
||||
self,
|
||||
mail_parser: MailDocumentParser,
|
||||
simple_txt_email_pdf_file: Path,
|
||||
) -> None:
|
||||
mail_parser._archive_path = simple_txt_email_pdf_file
|
||||
count = mail_parser.get_page_count(simple_txt_email_pdf_file, "message/rfc822")
|
||||
assert isinstance(count, int)
|
||||
assert count > 0
|
||||
|
||||
|
||||
class TestEmailFileParsing:
|
||||
@@ -24,7 +81,7 @@ class TestEmailFileParsing:
|
||||
def test_parse_error_missing_file(
|
||||
self,
|
||||
mail_parser: MailDocumentParser,
|
||||
sample_dir: Path,
|
||||
mail_samples_dir: Path,
|
||||
) -> None:
|
||||
"""
|
||||
GIVEN:
|
||||
@@ -35,7 +92,7 @@ class TestEmailFileParsing:
|
||||
- An Exception is thrown
|
||||
"""
|
||||
# Check if exception is raised when parsing fails.
|
||||
test_file = sample_dir / "doesntexist.eml"
|
||||
test_file = mail_samples_dir / "doesntexist.eml"
|
||||
|
||||
assert not test_file.exists()
|
||||
|
||||
@@ -246,12 +303,12 @@ class TestEmailThumbnailGenerate:
|
||||
"""
|
||||
mocked_return = "Passing the return value through.."
|
||||
mock_make_thumbnail_from_pdf = mocker.patch(
|
||||
"paperless_mail.parsers.make_thumbnail_from_pdf",
|
||||
"paperless.parsers.mail.make_thumbnail_from_pdf",
|
||||
)
|
||||
mock_make_thumbnail_from_pdf.return_value = mocked_return
|
||||
|
||||
mock_generate_pdf = mocker.patch(
|
||||
"paperless_mail.parsers.MailDocumentParser.generate_pdf",
|
||||
"paperless.parsers.mail.MailDocumentParser.generate_pdf",
|
||||
)
|
||||
mock_generate_pdf.return_value = "Mocked return value.."
|
||||
|
||||
@@ -260,8 +317,7 @@ class TestEmailThumbnailGenerate:
|
||||
mock_generate_pdf.assert_called_once()
|
||||
mock_make_thumbnail_from_pdf.assert_called_once_with(
|
||||
"Mocked return value..",
|
||||
mail_parser.tempdir,
|
||||
None,
|
||||
mail_parser._tempdir,
|
||||
)
|
||||
|
||||
assert mocked_return == thumb
|
||||
@@ -373,7 +429,7 @@ class TestParser:
|
||||
"""
|
||||
# Validate parsing returns the expected results
|
||||
mock_generate_pdf = mocker.patch(
|
||||
"paperless_mail.parsers.MailDocumentParser.generate_pdf",
|
||||
"paperless.parsers.mail.MailDocumentParser.generate_pdf",
|
||||
)
|
||||
|
||||
mail_parser.parse(simple_txt_email_file, "message/rfc822")
|
||||
@@ -385,7 +441,7 @@ class TestParser:
|
||||
"BCC: fdf@fvf.de\n\n"
|
||||
"\n\nThis is just a simple Text Mail."
|
||||
)
|
||||
assert text_expected == mail_parser.text
|
||||
assert text_expected == mail_parser.get_text()
|
||||
assert (
|
||||
datetime.datetime(
|
||||
2022,
|
||||
@@ -396,7 +452,7 @@ class TestParser:
|
||||
43,
|
||||
tzinfo=datetime.timezone(datetime.timedelta(seconds=7200)),
|
||||
)
|
||||
== mail_parser.date
|
||||
== mail_parser.get_date()
|
||||
)
|
||||
|
||||
# Just check if tried to generate archive, the unittest for generate_pdf() goes deeper.
|
||||
@@ -419,7 +475,7 @@ class TestParser:
|
||||
"""
|
||||
|
||||
mock_generate_pdf = mocker.patch(
|
||||
"paperless_mail.parsers.MailDocumentParser.generate_pdf",
|
||||
"paperless.parsers.mail.MailDocumentParser.generate_pdf",
|
||||
)
|
||||
|
||||
# Validate parsing returns the expected results
|
||||
@@ -443,7 +499,7 @@ class TestParser:
|
||||
mail_parser.parse(html_email_file, "message/rfc822")
|
||||
|
||||
mock_generate_pdf.assert_called_once()
|
||||
assert text_expected == mail_parser.text
|
||||
assert text_expected == mail_parser.get_text()
|
||||
assert (
|
||||
datetime.datetime(
|
||||
2022,
|
||||
@@ -454,7 +510,7 @@ class TestParser:
|
||||
19,
|
||||
tzinfo=datetime.timezone(datetime.timedelta(seconds=7200)),
|
||||
)
|
||||
== mail_parser.date
|
||||
== mail_parser.get_date()
|
||||
)
|
||||
|
||||
def test_generate_pdf_parse_error(
|
||||
@@ -501,7 +557,7 @@ class TestParser:
|
||||
|
||||
mail_parser.parse(simple_txt_email_file, "message/rfc822")
|
||||
|
||||
assert mail_parser.archive_path is not None
|
||||
assert mail_parser.get_archive_path() is not None
|
||||
|
||||
@pytest.mark.httpx_mock(can_send_already_matched_responses=True)
|
||||
def test_generate_pdf_html_email(
|
||||
@@ -542,7 +598,7 @@ class TestParser:
|
||||
)
|
||||
mail_parser.parse(html_email_file, "message/rfc822")
|
||||
|
||||
assert mail_parser.archive_path is not None
|
||||
assert mail_parser.get_archive_path() is not None
|
||||
|
||||
def test_generate_pdf_html_email_html_to_pdf_failure(
|
||||
self,
|
||||
@@ -712,10 +768,10 @@ class TestParser:
|
||||
|
||||
def test_layout_option(layout_option, expected_calls, expected_pdf_names):
|
||||
mock_mailrule_get.return_value = mock.Mock(pdf_layout=layout_option)
|
||||
mail_parser.configure(ParserContext(mailrule_id=1))
|
||||
mail_parser.parse(
|
||||
document_path=html_email_file,
|
||||
mime_type="message/rfc822",
|
||||
mailrule_id=1,
|
||||
)
|
||||
args, _ = mock_merge_route.call_args
|
||||
assert len(args[0]) == expected_calls
|
||||
@@ -11,7 +11,7 @@ from PIL import Image
|
||||
from pytest_mock import MockerFixture
|
||||
|
||||
from documents.tests.utils import util_call_with_backoff
|
||||
from paperless_mail.parsers import MailDocumentParser
|
||||
from paperless.parsers.mail import MailDocumentParser
|
||||
|
||||
|
||||
def extract_text(pdf_path: Path) -> str:
|
||||
@@ -159,7 +159,7 @@ class TestParserLive:
|
||||
- The returned thumbnail image file shall match the expected hash
|
||||
"""
|
||||
mock_generate_pdf = mocker.patch(
|
||||
"paperless_mail.parsers.MailDocumentParser.generate_pdf",
|
||||
"paperless.parsers.mail.MailDocumentParser.generate_pdf",
|
||||
)
|
||||
mock_generate_pdf.return_value = simple_txt_email_pdf_file
|
||||
|
||||
@@ -216,10 +216,10 @@ class TestParserLive:
|
||||
- The merged PDF shall contain text from both source PDFs
|
||||
"""
|
||||
mock_generate_pdf_from_html = mocker.patch(
|
||||
"paperless_mail.parsers.MailDocumentParser.generate_pdf_from_html",
|
||||
"paperless.parsers.mail.MailDocumentParser.generate_pdf_from_html",
|
||||
)
|
||||
mock_generate_pdf_from_mail = mocker.patch(
|
||||
"paperless_mail.parsers.MailDocumentParser.generate_pdf_from_mail",
|
||||
"paperless.parsers.mail.MailDocumentParser.generate_pdf_from_mail",
|
||||
)
|
||||
mock_generate_pdf_from_mail.return_value = merged_pdf_first
|
||||
mock_generate_pdf_from_html.return_value = merged_pdf_second
|
||||
497
src/paperless/tests/parsers/test_remote_parser.py
Normal file
@@ -0,0 +1,497 @@
|
||||
"""
|
||||
Tests for paperless.parsers.remote.RemoteDocumentParser.
|
||||
|
||||
All tests use the context-manager protocol for parser lifecycle.
|
||||
|
||||
Fixture layout
|
||||
--------------
|
||||
make_azure_mock — factory (defined here; specific to this module)
|
||||
azure_client — composes azure_settings + make_azure_mock + patch;
|
||||
use when a test needs the client to succeed
|
||||
failing_azure_client
|
||||
— composes azure_settings + patch with RuntimeError;
|
||||
use when a test needs the client to fail
|
||||
"""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
from typing import TYPE_CHECKING
|
||||
from unittest.mock import Mock
|
||||
|
||||
import pytest
|
||||
|
||||
from paperless.parsers import ParserContext
|
||||
from paperless.parsers import ParserProtocol
|
||||
from paperless.parsers.remote import RemoteDocumentParser
|
||||
|
||||
if TYPE_CHECKING:
|
||||
from collections.abc import Callable
|
||||
from pathlib import Path
|
||||
|
||||
from pytest_django.fixtures import SettingsWrapper
|
||||
from pytest_mock import MockerFixture
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# Module-local fixtures
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
_AZURE_CLIENT_TARGET = "azure.ai.documentintelligence.DocumentIntelligenceClient"
|
||||
_DEFAULT_TEXT = "Extracted text."
|
||||
|
||||
|
||||
@pytest.fixture()
|
||||
def make_azure_mock() -> Callable[[str], Mock]:
|
||||
"""Return a factory that builds a mock Azure DocumentIntelligenceClient.
|
||||
|
||||
Usage::
|
||||
|
||||
mock_client = make_azure_mock() # default extracted text
|
||||
mock_client = make_azure_mock("My text.") # custom extracted text
|
||||
"""
|
||||
|
||||
def _factory(text: str = _DEFAULT_TEXT) -> Mock:
|
||||
mock_client = Mock()
|
||||
mock_poller = Mock()
|
||||
mock_poller.wait.return_value = None
|
||||
mock_poller.details = {"operation_id": "fake-op-id"}
|
||||
mock_poller.result.return_value.content = text
|
||||
mock_client.begin_analyze_document.return_value = mock_poller
|
||||
mock_client.get_analyze_result_pdf.return_value = [b"%PDF-1.4 FAKE"]
|
||||
return mock_client
|
||||
|
||||
return _factory
|
||||
|
||||
|
||||
@pytest.fixture()
|
||||
def azure_client(
|
||||
azure_settings: SettingsWrapper,
|
||||
make_azure_mock: Callable[[str], Mock],
|
||||
mocker: MockerFixture,
|
||||
) -> Mock:
|
||||
"""Patch the Azure DI client with a succeeding mock and return the instance.
|
||||
|
||||
Implicitly applies ``azure_settings`` so tests using this fixture do not
|
||||
also need ``@pytest.mark.usefixtures("azure_settings")``.
|
||||
"""
|
||||
mock_client = make_azure_mock()
|
||||
mocker.patch(_AZURE_CLIENT_TARGET, return_value=mock_client)
|
||||
return mock_client
|
||||
|
||||
|
||||
@pytest.fixture()
|
||||
def failing_azure_client(
|
||||
azure_settings: SettingsWrapper,
|
||||
mocker: MockerFixture,
|
||||
) -> Mock:
|
||||
"""Patch the Azure DI client to raise RuntimeError on every call.
|
||||
|
||||
Implicitly applies ``azure_settings``. Returns the mock instance so
|
||||
tests can assert on calls such as ``close()``.
|
||||
"""
|
||||
mock_client = Mock()
|
||||
mock_client.begin_analyze_document.side_effect = RuntimeError("network failure")
|
||||
mocker.patch(_AZURE_CLIENT_TARGET, return_value=mock_client)
|
||||
return mock_client
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# Protocol contract
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
|
||||
class TestRemoteParserProtocol:
|
||||
"""Verify that RemoteDocumentParser satisfies the ParserProtocol contract."""
|
||||
|
||||
def test_isinstance_satisfies_protocol(
|
||||
self,
|
||||
remote_parser: RemoteDocumentParser,
|
||||
) -> None:
|
||||
assert isinstance(remote_parser, ParserProtocol)
|
||||
|
||||
def test_class_attributes_present(self) -> None:
|
||||
assert isinstance(RemoteDocumentParser.name, str) and RemoteDocumentParser.name
|
||||
assert (
|
||||
isinstance(RemoteDocumentParser.version, str)
|
||||
and RemoteDocumentParser.version
|
||||
)
|
||||
assert (
|
||||
isinstance(RemoteDocumentParser.author, str) and RemoteDocumentParser.author
|
||||
)
|
||||
assert isinstance(RemoteDocumentParser.url, str) and RemoteDocumentParser.url
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# supported_mime_types
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
|
||||
class TestRemoteParserSupportedMimeTypes:
|
||||
"""supported_mime_types() always returns the full set regardless of config."""
|
||||
|
||||
def test_returns_dict(self) -> None:
|
||||
mime_types = RemoteDocumentParser.supported_mime_types()
|
||||
assert isinstance(mime_types, dict)
|
||||
|
||||
def test_includes_all_expected_types(self) -> None:
|
||||
mime_types = RemoteDocumentParser.supported_mime_types()
|
||||
expected = {
|
||||
"application/pdf",
|
||||
"image/png",
|
||||
"image/jpeg",
|
||||
"image/tiff",
|
||||
"image/bmp",
|
||||
"image/gif",
|
||||
"image/webp",
|
||||
}
|
||||
assert expected == set(mime_types.keys())
|
||||
|
||||
@pytest.mark.usefixtures("no_engine_settings")
|
||||
def test_returns_full_set_when_not_configured(self) -> None:
|
||||
"""
|
||||
GIVEN: No remote engine is configured
|
||||
WHEN: supported_mime_types() is called
|
||||
THEN: The full MIME type dict is still returned (score() handles activation)
|
||||
"""
|
||||
mime_types = RemoteDocumentParser.supported_mime_types()
|
||||
assert len(mime_types) == 7
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# score()
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
|
||||
class TestRemoteParserScore:
|
||||
"""score() encodes the activation logic: None when unconfigured, 20 when active."""
|
||||
|
||||
@pytest.mark.usefixtures("azure_settings")
|
||||
@pytest.mark.parametrize(
|
||||
"mime_type",
|
||||
[
|
||||
pytest.param("application/pdf", id="pdf"),
|
||||
pytest.param("image/png", id="png"),
|
||||
pytest.param("image/jpeg", id="jpeg"),
|
||||
pytest.param("image/tiff", id="tiff"),
|
||||
pytest.param("image/bmp", id="bmp"),
|
||||
pytest.param("image/gif", id="gif"),
|
||||
pytest.param("image/webp", id="webp"),
|
||||
],
|
||||
)
|
||||
def test_score_returns_20_when_configured(self, mime_type: str) -> None:
|
||||
result = RemoteDocumentParser.score(mime_type, "doc.pdf")
|
||||
assert result == 20
|
||||
|
||||
@pytest.mark.usefixtures("no_engine_settings")
|
||||
@pytest.mark.parametrize(
|
||||
"mime_type",
|
||||
[
|
||||
pytest.param("application/pdf", id="pdf"),
|
||||
pytest.param("image/png", id="png"),
|
||||
pytest.param("image/jpeg", id="jpeg"),
|
||||
],
|
||||
)
|
||||
def test_score_returns_none_when_no_engine(self, mime_type: str) -> None:
|
||||
result = RemoteDocumentParser.score(mime_type, "doc.pdf")
|
||||
assert result is None
|
||||
|
||||
def test_score_returns_none_when_api_key_missing(
|
||||
self,
|
||||
settings: SettingsWrapper,
|
||||
) -> None:
|
||||
settings.REMOTE_OCR_ENGINE = "azureai"
|
||||
settings.REMOTE_OCR_API_KEY = None
|
||||
settings.REMOTE_OCR_ENDPOINT = "https://test.cognitiveservices.azure.com"
|
||||
result = RemoteDocumentParser.score("application/pdf", "doc.pdf")
|
||||
assert result is None
|
||||
|
||||
def test_score_returns_none_when_endpoint_missing(
|
||||
self,
|
||||
settings: SettingsWrapper,
|
||||
) -> None:
|
||||
settings.REMOTE_OCR_ENGINE = "azureai"
|
||||
settings.REMOTE_OCR_API_KEY = "key"
|
||||
settings.REMOTE_OCR_ENDPOINT = None
|
||||
result = RemoteDocumentParser.score("application/pdf", "doc.pdf")
|
||||
assert result is None
|
||||
|
||||
@pytest.mark.usefixtures("azure_settings")
|
||||
def test_score_returns_none_for_unsupported_mime_type(self) -> None:
|
||||
result = RemoteDocumentParser.score("text/plain", "doc.txt")
|
||||
assert result is None
|
||||
|
||||
@pytest.mark.usefixtures("azure_settings")
|
||||
def test_score_higher_than_tesseract_default(self) -> None:
|
||||
"""Remote parser (20) outranks the tesseract default (10) when configured."""
|
||||
score = RemoteDocumentParser.score("application/pdf", "doc.pdf")
|
||||
assert score is not None and score > 10
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# Properties
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
|
||||
class TestRemoteParserProperties:
|
||||
def test_can_produce_archive_is_true(
|
||||
self,
|
||||
remote_parser: RemoteDocumentParser,
|
||||
) -> None:
|
||||
assert remote_parser.can_produce_archive is True
|
||||
|
||||
def test_requires_pdf_rendition_is_false(
|
||||
self,
|
||||
remote_parser: RemoteDocumentParser,
|
||||
) -> None:
|
||||
assert remote_parser.requires_pdf_rendition is False
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# Lifecycle
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
|
||||
class TestRemoteParserLifecycle:
|
||||
def test_context_manager_cleans_up_tempdir(self) -> None:
|
||||
with RemoteDocumentParser() as parser:
|
||||
tempdir = parser._tempdir
|
||||
assert tempdir.exists()
|
||||
assert not tempdir.exists()
|
||||
|
||||
def test_context_manager_cleans_up_after_exception(self) -> None:
|
||||
tempdir: Path | None = None
|
||||
with pytest.raises(RuntimeError):
|
||||
with RemoteDocumentParser() as parser:
|
||||
tempdir = parser._tempdir
|
||||
raise RuntimeError("boom")
|
||||
assert tempdir is not None
|
||||
assert not tempdir.exists()
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# parse() — happy path
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
|
||||
class TestRemoteParserParse:
|
||||
def test_parse_returns_text_from_azure(
|
||||
self,
|
||||
remote_parser: RemoteDocumentParser,
|
||||
sample_pdf_file: Path,
|
||||
azure_client: Mock,
|
||||
) -> None:
|
||||
remote_parser.parse(sample_pdf_file, "application/pdf")
|
||||
|
||||
assert remote_parser.get_text() == _DEFAULT_TEXT
|
||||
|
||||
def test_parse_sets_archive_path(
|
||||
self,
|
||||
remote_parser: RemoteDocumentParser,
|
||||
sample_pdf_file: Path,
|
||||
azure_client: Mock,
|
||||
) -> None:
|
||||
remote_parser.parse(sample_pdf_file, "application/pdf")
|
||||
|
||||
archive = remote_parser.get_archive_path()
|
||||
assert archive is not None
|
||||
assert archive.exists()
|
||||
assert archive.suffix == ".pdf"
|
||||
|
||||
def test_parse_closes_client_on_success(
|
||||
self,
|
||||
remote_parser: RemoteDocumentParser,
|
||||
sample_pdf_file: Path,
|
||||
azure_client: Mock,
|
||||
) -> None:
|
||||
remote_parser.configure(ParserContext())
|
||||
remote_parser.parse(sample_pdf_file, "application/pdf")
|
||||
|
||||
azure_client.close.assert_called_once()
|
||||
|
||||
@pytest.mark.usefixtures("no_engine_settings")
|
||||
def test_parse_sets_empty_text_when_not_configured(
|
||||
self,
|
||||
remote_parser: RemoteDocumentParser,
|
||||
sample_pdf_file: Path,
|
||||
) -> None:
|
||||
remote_parser.parse(sample_pdf_file, "application/pdf")
|
||||
|
||||
assert remote_parser.get_text() == ""
|
||||
assert remote_parser.get_archive_path() is None
|
||||
|
||||
def test_get_text_none_before_parse(
|
||||
self,
|
||||
remote_parser: RemoteDocumentParser,
|
||||
) -> None:
|
||||
assert remote_parser.get_text() is None
|
||||
|
||||
def test_get_date_always_none(
|
||||
self,
|
||||
remote_parser: RemoteDocumentParser,
|
||||
sample_pdf_file: Path,
|
||||
azure_client: Mock,
|
||||
) -> None:
|
||||
remote_parser.parse(sample_pdf_file, "application/pdf")
|
||||
|
||||
assert remote_parser.get_date() is None
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# parse() — Azure failure path
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
|
||||
class TestRemoteParserParseError:
|
||||
def test_parse_returns_none_on_azure_error(
|
||||
self,
|
||||
remote_parser: RemoteDocumentParser,
|
||||
sample_pdf_file: Path,
|
||||
failing_azure_client: Mock,
|
||||
) -> None:
|
||||
remote_parser.parse(sample_pdf_file, "application/pdf")
|
||||
|
||||
assert remote_parser.get_text() is None
|
||||
|
||||
def test_parse_closes_client_on_error(
|
||||
self,
|
||||
remote_parser: RemoteDocumentParser,
|
||||
sample_pdf_file: Path,
|
||||
failing_azure_client: Mock,
|
||||
) -> None:
|
||||
remote_parser.parse(sample_pdf_file, "application/pdf")
|
||||
|
||||
failing_azure_client.close.assert_called_once()
|
||||
|
||||
def test_parse_logs_error_on_azure_failure(
|
||||
self,
|
||||
remote_parser: RemoteDocumentParser,
|
||||
sample_pdf_file: Path,
|
||||
failing_azure_client: Mock,
|
||||
mocker: MockerFixture,
|
||||
) -> None:
|
||||
mock_log = mocker.patch("paperless.parsers.remote.logger")
|
||||
|
||||
remote_parser.parse(sample_pdf_file, "application/pdf")
|
||||
|
||||
mock_log.error.assert_called_once()
|
||||
assert "Azure AI Vision parsing failed" in mock_log.error.call_args[0][0]
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# get_page_count()
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
|
||||
class TestRemoteParserPageCount:
|
||||
def test_page_count_for_pdf(
|
||||
self,
|
||||
remote_parser: RemoteDocumentParser,
|
||||
sample_pdf_file: Path,
|
||||
) -> None:
|
||||
count = remote_parser.get_page_count(sample_pdf_file, "application/pdf")
|
||||
assert isinstance(count, int)
|
||||
assert count >= 1
|
||||
|
||||
def test_page_count_returns_none_for_image_mime(
|
||||
self,
|
||||
remote_parser: RemoteDocumentParser,
|
||||
sample_pdf_file: Path,
|
||||
) -> None:
|
||||
count = remote_parser.get_page_count(sample_pdf_file, "image/png")
|
||||
assert count is None
|
||||
|
||||
def test_page_count_returns_none_for_invalid_pdf(
|
||||
self,
|
||||
remote_parser: RemoteDocumentParser,
|
||||
tmp_path: Path,
|
||||
) -> None:
|
||||
bad_pdf = tmp_path / "bad.pdf"
|
||||
bad_pdf.write_bytes(b"not a pdf at all")
|
||||
count = remote_parser.get_page_count(bad_pdf, "application/pdf")
|
||||
assert count is None
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# extract_metadata()
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
|
||||
class TestRemoteParserMetadata:
|
||||
def test_extract_metadata_non_pdf_returns_empty(
|
||||
self,
|
||||
remote_parser: RemoteDocumentParser,
|
||||
sample_pdf_file: Path,
|
||||
) -> None:
|
||||
result = remote_parser.extract_metadata(sample_pdf_file, "image/png")
|
||||
assert result == []
|
||||
|
||||
def test_extract_metadata_pdf_returns_list(
|
||||
self,
|
||||
remote_parser: RemoteDocumentParser,
|
||||
sample_pdf_file: Path,
|
||||
) -> None:
|
||||
result = remote_parser.extract_metadata(sample_pdf_file, "application/pdf")
|
||||
assert isinstance(result, list)
|
||||
|
||||
def test_extract_metadata_pdf_entries_have_required_keys(
|
||||
self,
|
||||
remote_parser: RemoteDocumentParser,
|
||||
sample_pdf_file: Path,
|
||||
) -> None:
|
||||
result = remote_parser.extract_metadata(sample_pdf_file, "application/pdf")
|
||||
for entry in result:
|
||||
assert "namespace" in entry
|
||||
assert "prefix" in entry
|
||||
assert "key" in entry
|
||||
assert "value" in entry
|
||||
assert isinstance(entry["value"], str)
|
||||
|
||||
def test_extract_metadata_does_not_raise_on_invalid_pdf(
|
||||
self,
|
||||
remote_parser: RemoteDocumentParser,
|
||||
tmp_path: Path,
|
||||
) -> None:
|
||||
bad_pdf = tmp_path / "bad.pdf"
|
||||
bad_pdf.write_bytes(b"not a pdf at all")
|
||||
result = remote_parser.extract_metadata(bad_pdf, "application/pdf")
|
||||
assert result == []
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# Registry integration
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
|
||||
class TestRemoteParserRegistry:
|
||||
def test_registered_in_defaults(self) -> None:
|
||||
from paperless.parsers.registry import ParserRegistry
|
||||
|
||||
registry = ParserRegistry()
|
||||
registry.register_defaults()
|
||||
|
||||
assert RemoteDocumentParser in registry._builtins
|
||||
|
||||
@pytest.mark.usefixtures("azure_settings")
|
||||
def test_get_parser_returns_remote_when_configured(self) -> None:
|
||||
from paperless.parsers.registry import get_parser_registry
|
||||
|
||||
registry = get_parser_registry()
|
||||
parser_cls = registry.get_parser_for_file("application/pdf", "doc.pdf")
|
||||
|
||||
assert parser_cls is RemoteDocumentParser
|
||||
|
||||
@pytest.mark.usefixtures("no_engine_settings")
|
||||
def test_get_parser_returns_none_for_unsupported_type_when_not_configured(
|
||||
self,
|
||||
) -> None:
|
||||
"""With remote off and a truly unsupported MIME type, registry returns None."""
|
||||
from paperless.parsers.registry import ParserRegistry
|
||||
|
||||
registry = ParserRegistry()
|
||||
registry.register_defaults()
|
||||
parser_cls = registry.get_parser_for_file(
|
||||
"application/x-unknown-format",
|
||||
"doc.xyz",
|
||||
)
|
||||
|
||||
assert parser_cls is None
|
||||
@@ -10,7 +10,7 @@ from paperless.models import CleanChoices
|
||||
from paperless.models import ColorConvertChoices
|
||||
from paperless.models import ModeChoices
|
||||
from paperless.models import OutputTypeChoices
|
||||
from paperless_tesseract.parsers import RasterisedDocumentParser
|
||||
from paperless.parsers.tesseract import RasterisedDocumentParser
|
||||
|
||||
|
||||
class TestParserSettingsFromDb(DirectoriesMixin, FileSystemAssertsMixin, TestCase):
|
||||
1174
src/paperless/tests/parsers/test_tesseract_parser.py
Normal file
@@ -12,6 +12,7 @@ from pathlib import Path
|
||||
|
||||
import pytest
|
||||
|
||||
from paperless.parsers import ParserContext
|
||||
from paperless.parsers import ParserProtocol
|
||||
from paperless.parsers.text import TextDocumentParser
|
||||
|
||||
@@ -93,6 +94,7 @@ class TestTextParserParse:
|
||||
text_parser: TextDocumentParser,
|
||||
sample_txt_file: Path,
|
||||
) -> None:
|
||||
text_parser.configure(ParserContext())
|
||||
text_parser.parse(sample_txt_file, "text/plain")
|
||||
|
||||
assert text_parser.get_text() == "This is a test file.\n"
|
||||
@@ -102,6 +104,7 @@ class TestTextParserParse:
|
||||
text_parser: TextDocumentParser,
|
||||
sample_txt_file: Path,
|
||||
) -> None:
|
||||
text_parser.configure(ParserContext())
|
||||
text_parser.parse(sample_txt_file, "text/plain")
|
||||
|
||||
assert text_parser.get_archive_path() is None
|
||||
@@ -111,6 +114,7 @@ class TestTextParserParse:
|
||||
text_parser: TextDocumentParser,
|
||||
sample_txt_file: Path,
|
||||
) -> None:
|
||||
text_parser.configure(ParserContext())
|
||||
text_parser.parse(sample_txt_file, "text/plain")
|
||||
|
||||
assert text_parser.get_date() is None
|
||||
@@ -129,6 +133,7 @@ class TestTextParserParse:
|
||||
- Parsing succeeds
|
||||
- Invalid bytes are replaced with the Unicode replacement character
|
||||
"""
|
||||
text_parser.configure(ParserContext())
|
||||
text_parser.parse(malformed_txt_file, "text/plain")
|
||||
|
||||
assert text_parser.get_text() == "Pantothens\ufffdure\n"
|
||||
@@ -251,6 +256,9 @@ class TestTextParserRegistry:
|
||||
from paperless.parsers.registry import get_parser_registry
|
||||
|
||||
registry = get_parser_registry()
|
||||
parser_cls = registry.get_parser_for_file("application/pdf", "doc.pdf")
|
||||
parser_cls = registry.get_parser_for_file(
|
||||
"application/x-unknown-format",
|
||||
"doc.xyz",
|
||||
)
|
||||
|
||||
assert parser_cls is None
|
||||
|
||||
@@ -4,7 +4,7 @@ from pathlib import Path
|
||||
import pytest
|
||||
|
||||
from documents.tests.utils import util_call_with_backoff
|
||||
from paperless_tika.parsers import TikaDocumentParser
|
||||
from paperless.parsers.tika import TikaDocumentParser
|
||||
|
||||
|
||||
@pytest.mark.skipif(
|
||||
@@ -42,14 +42,15 @@ class TestTikaParserAgainstServer:
|
||||
)
|
||||
|
||||
assert (
|
||||
tika_parser.text
|
||||
tika_parser.get_text()
|
||||
== "This is an ODT test document, created September 14, 2022"
|
||||
)
|
||||
assert tika_parser.archive_path is not None
|
||||
assert b"PDF-" in tika_parser.archive_path.read_bytes()[:10]
|
||||
archive = tika_parser.get_archive_path()
|
||||
assert archive is not None
|
||||
assert b"PDF-" in archive.read_bytes()[:10]
|
||||
|
||||
# TODO: Unsure what can set the Creation-Date field in a document, enable when possible
|
||||
# self.assertEqual(tika_parser.date, datetime.datetime(2022, 9, 14))
|
||||
# self.assertEqual(tika_parser.get_date(), datetime.datetime(2022, 9, 14))
|
||||
|
||||
def test_basic_parse_docx(
|
||||
self,
|
||||
@@ -74,14 +75,15 @@ class TestTikaParserAgainstServer:
|
||||
)
|
||||
|
||||
assert (
|
||||
tika_parser.text
|
||||
tika_parser.get_text()
|
||||
== "This is an DOCX test document, also made September 14, 2022"
|
||||
)
|
||||
assert tika_parser.archive_path is not None
|
||||
with Path(tika_parser.archive_path).open("rb") as f:
|
||||
archive = tika_parser.get_archive_path()
|
||||
assert archive is not None
|
||||
with archive.open("rb") as f:
|
||||
assert b"PDF-" in f.read()[:10]
|
||||
|
||||
# self.assertEqual(tika_parser.date, datetime.datetime(2022, 9, 14))
|
||||
# self.assertEqual(tika_parser.get_date(), datetime.datetime(2022, 9, 14))
|
||||
|
||||
def test_basic_parse_doc(
|
||||
self,
|
||||
@@ -102,13 +104,12 @@ class TestTikaParserAgainstServer:
|
||||
[sample_doc_file, "application/msword"],
|
||||
)
|
||||
|
||||
assert tika_parser.text is not None
|
||||
assert (
|
||||
"This is a test document, saved in the older .doc format"
|
||||
in tika_parser.text
|
||||
)
|
||||
assert tika_parser.archive_path is not None
|
||||
with Path(tika_parser.archive_path).open("rb") as f:
|
||||
text = tika_parser.get_text()
|
||||
assert text is not None
|
||||
assert "This is a test document, saved in the older .doc format" in text
|
||||
archive = tika_parser.get_archive_path()
|
||||
assert archive is not None
|
||||
with archive.open("rb") as f:
|
||||
assert b"PDF-" in f.read()[:10]
|
||||
|
||||
def test_tika_fails_multi_part(
|
||||
@@ -133,6 +134,7 @@ class TestTikaParserAgainstServer:
|
||||
[sample_broken_odt, "application/vnd.oasis.opendocument.text"],
|
||||
)
|
||||
|
||||
assert tika_parser.archive_path is not None
|
||||
with Path(tika_parser.archive_path).open("rb") as f:
|
||||
archive = tika_parser.get_archive_path()
|
||||
assert archive is not None
|
||||
with archive.open("rb") as f:
|
||||
assert b"PDF-" in f.read()[:10]
|
||||
@@ -9,7 +9,80 @@ from pytest_django.fixtures import SettingsWrapper
|
||||
from pytest_httpx import HTTPXMock
|
||||
|
||||
from documents.parsers import ParseError
|
||||
from paperless_tika.parsers import TikaDocumentParser
|
||||
from paperless.parsers import ParserContext
|
||||
from paperless.parsers import ParserProtocol
|
||||
from paperless.parsers.tika import TikaDocumentParser
|
||||
|
||||
|
||||
class TestTikaParserRegistryInterface:
|
||||
"""Verify that TikaDocumentParser satisfies the ParserProtocol contract."""
|
||||
|
||||
def test_satisfies_parser_protocol(self) -> None:
|
||||
assert isinstance(TikaDocumentParser(), ParserProtocol)
|
||||
|
||||
def test_supported_mime_types_is_classmethod(self) -> None:
|
||||
mime_types = TikaDocumentParser.supported_mime_types()
|
||||
assert isinstance(mime_types, dict)
|
||||
assert len(mime_types) > 0
|
||||
|
||||
def test_score_returns_none_when_tika_disabled(
|
||||
self,
|
||||
settings: SettingsWrapper,
|
||||
) -> None:
|
||||
settings.TIKA_ENABLED = False
|
||||
result = TikaDocumentParser.score(
|
||||
"application/vnd.oasis.opendocument.text",
|
||||
"sample.odt",
|
||||
)
|
||||
assert result is None
|
||||
|
||||
def test_score_returns_int_when_tika_enabled(
|
||||
self,
|
||||
settings: SettingsWrapper,
|
||||
) -> None:
|
||||
settings.TIKA_ENABLED = True
|
||||
result = TikaDocumentParser.score(
|
||||
"application/vnd.oasis.opendocument.text",
|
||||
"sample.odt",
|
||||
)
|
||||
assert isinstance(result, int)
|
||||
|
||||
def test_score_returns_none_for_unsupported_mime(
|
||||
self,
|
||||
settings: SettingsWrapper,
|
||||
) -> None:
|
||||
settings.TIKA_ENABLED = True
|
||||
result = TikaDocumentParser.score("application/pdf", "doc.pdf")
|
||||
assert result is None
|
||||
|
||||
def test_can_produce_archive_is_false(self) -> None:
|
||||
assert TikaDocumentParser().can_produce_archive is False
|
||||
|
||||
def test_requires_pdf_rendition_is_true(self) -> None:
|
||||
assert TikaDocumentParser().requires_pdf_rendition is True
|
||||
|
||||
def test_get_page_count_returns_none_without_archive(
|
||||
self,
|
||||
tika_parser: TikaDocumentParser,
|
||||
sample_odt_file: Path,
|
||||
) -> None:
|
||||
assert (
|
||||
tika_parser.get_page_count(
|
||||
sample_odt_file,
|
||||
"application/vnd.oasis.opendocument.text",
|
||||
)
|
||||
is None
|
||||
)
|
||||
|
||||
def test_get_page_count_returns_int_with_pdf_archive(
|
||||
self,
|
||||
tika_parser: TikaDocumentParser,
|
||||
sample_pdf_file: Path,
|
||||
) -> None:
|
||||
tika_parser._archive_path = sample_pdf_file
|
||||
count = tika_parser.get_page_count(sample_pdf_file, "application/pdf")
|
||||
assert isinstance(count, int)
|
||||
assert count > 0
|
||||
|
||||
|
||||
@pytest.mark.django_db()
|
||||
@@ -34,14 +107,15 @@ class TestTikaParser:
|
||||
# Pretend convert to PDF response
|
||||
httpx_mock.add_response(content=b"PDF document")
|
||||
|
||||
tika_parser.configure(ParserContext())
|
||||
tika_parser.parse(sample_odt_file, "application/vnd.oasis.opendocument.text")
|
||||
|
||||
assert tika_parser.text == "the content"
|
||||
assert tika_parser.archive_path is not None
|
||||
with Path(tika_parser.archive_path).open("rb") as f:
|
||||
assert tika_parser.get_text() == "the content"
|
||||
assert tika_parser.get_archive_path() is not None
|
||||
with Path(tika_parser.get_archive_path()).open("rb") as f:
|
||||
assert f.read() == b"PDF document"
|
||||
|
||||
assert tika_parser.date == datetime.datetime(
|
||||
assert tika_parser.get_date() == datetime.datetime(
|
||||
2020,
|
||||
11,
|
||||
21,
|
||||
@@ -89,7 +163,7 @@ class TestTikaParser:
|
||||
httpx_mock.add_response(status_code=HTTPStatus.INTERNAL_SERVER_ERROR)
|
||||
|
||||
with pytest.raises(ParseError):
|
||||
tika_parser.convert_to_pdf(sample_odt_file, None)
|
||||
tika_parser._convert_to_pdf(sample_odt_file)
|
||||
|
||||
@pytest.mark.parametrize(
|
||||
("setting_value", "expected_form_value"),
|
||||
@@ -106,7 +180,6 @@ class TestTikaParser:
|
||||
expected_form_value: str,
|
||||
httpx_mock: HTTPXMock,
|
||||
settings: SettingsWrapper,
|
||||
tika_parser: TikaDocumentParser,
|
||||
sample_odt_file: Path,
|
||||
) -> None:
|
||||
"""
|
||||
@@ -117,6 +190,8 @@ class TestTikaParser:
|
||||
THEN:
|
||||
- Request to Gotenberg contains the expected PDF/A format string
|
||||
"""
|
||||
# Parser must be created after the setting is changed so that
|
||||
# OutputTypeConfig reads the correct value at __init__ time.
|
||||
settings.OCR_OUTPUT_TYPE = setting_value
|
||||
httpx_mock.add_response(
|
||||
status_code=codes.OK,
|
||||
@@ -124,7 +199,8 @@ class TestTikaParser:
|
||||
method="POST",
|
||||
)
|
||||
|
||||
tika_parser.convert_to_pdf(sample_odt_file, None)
|
||||
with TikaDocumentParser() as parser:
|
||||
parser._convert_to_pdf(sample_odt_file)
|
||||
|
||||
request = httpx_mock.get_request()
|
||||
|
||||
|
Before Width: | Height: | Size: 6.0 KiB After Width: | Height: | Size: 6.0 KiB |
|
Before Width: | Height: | Size: 2.8 KiB After Width: | Height: | Size: 2.8 KiB |
|
Before Width: | Height: | Size: 6.9 KiB After Width: | Height: | Size: 6.9 KiB |
|
Before Width: | Height: | Size: 5.2 KiB After Width: | Height: | Size: 5.2 KiB |
|
Before Width: | Height: | Size: 5.7 KiB After Width: | Height: | Size: 5.7 KiB |
|
Before Width: | Height: | Size: 32 KiB After Width: | Height: | Size: 32 KiB |
|
Before Width: | Height: | Size: 8.2 KiB After Width: | Height: | Size: 8.2 KiB |
|
Before Width: | Height: | Size: 6.8 KiB After Width: | Height: | Size: 6.8 KiB |
|
Before Width: | Height: | Size: 1.7 MiB After Width: | Height: | Size: 1.7 MiB |
|
Before Width: | Height: | Size: 18 KiB After Width: | Height: | Size: 18 KiB |
|
Before Width: | Height: | Size: 19 KiB After Width: | Height: | Size: 19 KiB |
|
Before Width: | Height: | Size: 7.2 KiB After Width: | Height: | Size: 7.2 KiB |
@@ -5,6 +5,7 @@ from pathlib import Path
|
||||
from unittest import mock
|
||||
|
||||
import pytest
|
||||
from django.core.checks import ERROR
|
||||
from django.core.checks import Error
|
||||
from django.core.checks import Warning
|
||||
from pytest_django.fixtures import SettingsWrapper
|
||||
@@ -12,7 +13,9 @@ from pytest_mock import MockerFixture
|
||||
|
||||
from paperless.checks import audit_log_check
|
||||
from paperless.checks import binaries_check
|
||||
from paperless.checks import check_default_language_available
|
||||
from paperless.checks import check_deprecated_db_settings
|
||||
from paperless.checks import check_remote_parser_configured
|
||||
from paperless.checks import check_v3_minimum_upgrade_version
|
||||
from paperless.checks import debug_mode_check
|
||||
from paperless.checks import paths_check
|
||||
@@ -24,7 +27,6 @@ class PaperlessTestDirs:
|
||||
data_dir: Path
|
||||
media_dir: Path
|
||||
consumption_dir: Path
|
||||
consumption_failed_dir: Path
|
||||
|
||||
|
||||
# TODO: consolidate with documents/tests/conftest.py PaperlessDirs/paperless_dirs
|
||||
@@ -34,21 +36,18 @@ def directories(tmp_path: Path, settings: SettingsWrapper) -> PaperlessTestDirs:
|
||||
data_dir = tmp_path / "data"
|
||||
media_dir = tmp_path / "media"
|
||||
consumption_dir = tmp_path / "consumption"
|
||||
consumption_failed_dir = tmp_path / "consumption_failed"
|
||||
|
||||
for d in (data_dir, media_dir, consumption_dir, consumption_failed_dir):
|
||||
for d in (data_dir, media_dir, consumption_dir):
|
||||
d.mkdir()
|
||||
|
||||
settings.DATA_DIR = data_dir
|
||||
settings.MEDIA_ROOT = media_dir
|
||||
settings.CONSUMPTION_DIR = consumption_dir
|
||||
settings.CONSUMPTION_FAILED_DIR = consumption_failed_dir
|
||||
|
||||
return PaperlessTestDirs(
|
||||
data_dir=data_dir,
|
||||
media_dir=media_dir,
|
||||
consumption_dir=consumption_dir,
|
||||
consumption_failed_dir=consumption_failed_dir,
|
||||
)
|
||||
|
||||
|
||||
@@ -68,11 +67,10 @@ class TestChecks:
|
||||
settings.MEDIA_ROOT = Path("uuh")
|
||||
settings.DATA_DIR = Path("whatever")
|
||||
settings.CONSUMPTION_DIR = Path("idontcare")
|
||||
settings.CONSUMPTION_FAILED_DIR = Path("nope")
|
||||
|
||||
msgs = paths_check(None)
|
||||
|
||||
assert len(msgs) == 4, str(msgs)
|
||||
assert len(msgs) == 3, str(msgs)
|
||||
for msg in msgs:
|
||||
assert msg.msg.endswith("is set but doesn't exist.")
|
||||
|
||||
@@ -80,7 +78,6 @@ class TestChecks:
|
||||
directories.data_dir.chmod(0o000)
|
||||
directories.media_dir.chmod(0o000)
|
||||
directories.consumption_dir.chmod(0o000)
|
||||
directories.consumption_failed_dir.chmod(0o000)
|
||||
|
||||
try:
|
||||
msgs = paths_check(None)
|
||||
@@ -88,9 +85,8 @@ class TestChecks:
|
||||
directories.data_dir.chmod(0o777)
|
||||
directories.media_dir.chmod(0o777)
|
||||
directories.consumption_dir.chmod(0o777)
|
||||
directories.consumption_failed_dir.chmod(0o777)
|
||||
|
||||
assert len(msgs) == 4
|
||||
assert len(msgs) == 3
|
||||
for msg in msgs:
|
||||
assert msg.msg.endswith("is not writeable")
|
||||
|
||||
@@ -633,3 +629,116 @@ class TestV3MinimumUpgradeVersionCheck:
|
||||
conn.introspection.table_names.side_effect = OperationalError("DB unavailable")
|
||||
mocker.patch.dict("paperless.checks.connections", {"default": conn})
|
||||
assert check_v3_minimum_upgrade_version(None) == []
|
||||
|
||||
|
||||
class TestRemoteParserChecks:
|
||||
def test_no_engine(self, settings: SettingsWrapper) -> None:
|
||||
settings.REMOTE_OCR_ENGINE = None
|
||||
msgs = check_remote_parser_configured(None)
|
||||
|
||||
assert len(msgs) == 0
|
||||
|
||||
def test_azure_no_endpoint(self, settings: SettingsWrapper) -> None:
|
||||
|
||||
settings.REMOTE_OCR_ENGINE = "azureai"
|
||||
settings.REMOTE_OCR_API_KEY = "somekey"
|
||||
settings.REMOTE_OCR_ENDPOINT = None
|
||||
|
||||
msgs = check_remote_parser_configured(None)
|
||||
|
||||
assert len(msgs) == 1
|
||||
|
||||
msg = msgs[0]
|
||||
|
||||
assert (
|
||||
"Azure AI remote parser requires endpoint and API key to be configured."
|
||||
in msg.msg
|
||||
)
|
||||
|
||||
|
||||
class TestTesseractChecks:
|
||||
def test_default_language(self) -> None:
|
||||
check_default_language_available(None)
|
||||
|
||||
def test_no_language(self, settings: SettingsWrapper) -> None:
|
||||
|
||||
settings.OCR_LANGUAGE = ""
|
||||
|
||||
msgs = check_default_language_available(None)
|
||||
|
||||
assert len(msgs) == 1
|
||||
msg = msgs[0]
|
||||
|
||||
assert (
|
||||
"No OCR language has been specified with PAPERLESS_OCR_LANGUAGE" in msg.msg
|
||||
)
|
||||
|
||||
def test_invalid_language(
|
||||
self,
|
||||
settings: SettingsWrapper,
|
||||
mocker: MockerFixture,
|
||||
) -> None:
|
||||
|
||||
settings.OCR_LANGUAGE = "ita"
|
||||
|
||||
tesser_lang_mock = mocker.patch("paperless.checks.get_tesseract_langs")
|
||||
tesser_lang_mock.return_value = ["deu", "eng"]
|
||||
|
||||
msgs = check_default_language_available(None)
|
||||
|
||||
assert len(msgs) == 1
|
||||
msg = msgs[0]
|
||||
|
||||
assert msg.level == ERROR
|
||||
assert "The selected ocr language ita is not installed" in msg.msg
|
||||
|
||||
def test_multi_part_language(
|
||||
self,
|
||||
settings: SettingsWrapper,
|
||||
mocker: MockerFixture,
|
||||
) -> None:
|
||||
"""
|
||||
GIVEN:
|
||||
- An OCR language which is multi part (ie chi-sim)
|
||||
- The language is correctly formatted
|
||||
WHEN:
|
||||
- Installed packages are checked
|
||||
THEN:
|
||||
- No errors are reported
|
||||
"""
|
||||
|
||||
settings.OCR_LANGUAGE = "chi_sim"
|
||||
|
||||
tesser_lang_mock = mocker.patch("paperless.checks.get_tesseract_langs")
|
||||
tesser_lang_mock.return_value = ["chi_sim", "eng"]
|
||||
|
||||
msgs = check_default_language_available(None)
|
||||
|
||||
assert len(msgs) == 0
|
||||
|
||||
def test_multi_part_language_bad_format(
|
||||
self,
|
||||
settings: SettingsWrapper,
|
||||
mocker: MockerFixture,
|
||||
) -> None:
|
||||
"""
|
||||
GIVEN:
|
||||
- An OCR language which is multi part (ie chi-sim)
|
||||
- The language is correctly NOT formatted
|
||||
WHEN:
|
||||
- Installed packages are checked
|
||||
THEN:
|
||||
- No errors are reported
|
||||
"""
|
||||
settings.OCR_LANGUAGE = "chi-sim"
|
||||
|
||||
tesser_lang_mock = mocker.patch("paperless.checks.get_tesseract_langs")
|
||||
tesser_lang_mock.return_value = ["chi_sim", "eng"]
|
||||
|
||||
msgs = check_default_language_available(None)
|
||||
|
||||
assert len(msgs) == 1
|
||||
msg = msgs[0]
|
||||
|
||||
assert msg.level == ERROR
|
||||
assert "The selected ocr language chi-sim is not installed" in msg.msg
|
||||
|
||||
@@ -18,6 +18,7 @@ from unittest.mock import patch
|
||||
|
||||
import pytest
|
||||
|
||||
from paperless.parsers import ParserContext
|
||||
from paperless.parsers import ParserProtocol
|
||||
from paperless.parsers.registry import ParserRegistry
|
||||
from paperless.parsers.registry import get_parser_registry
|
||||
@@ -103,6 +104,11 @@ def dummy_parser_cls() -> type:
|
||||
) -> list:
|
||||
return []
|
||||
|
||||
def configure(self, context: ParserContext) -> None:
|
||||
"""
|
||||
Required to exist, but doesn't need to do anything
|
||||
"""
|
||||
|
||||
def __enter__(self) -> Self:
|
||||
return self
|
||||
|
||||
@@ -144,6 +150,7 @@ class TestParserProtocol:
|
||||
@pytest.mark.parametrize(
|
||||
"missing_method",
|
||||
[
|
||||
pytest.param("configure", id="missing-configure"),
|
||||
pytest.param("parse", id="missing-parse"),
|
||||
pytest.param("get_text", id="missing-get_text"),
|
||||
pytest.param("get_thumbnail", id="missing-get_thumbnail"),
|
||||
|
||||
10
src/paperless_ai/tests/conftest.py
Normal file
@@ -0,0 +1,10 @@
|
||||
from pathlib import Path
|
||||
|
||||
import pytest
|
||||
from pytest_django.fixtures import SettingsWrapper
|
||||
|
||||
|
||||
@pytest.fixture
|
||||
def temp_llm_index_dir(tmp_path: Path, settings: SettingsWrapper):
|
||||
settings.LLM_INDEX_DIR = tmp_path
|
||||
return tmp_path
|
||||
@@ -13,14 +13,6 @@ from documents.models import PaperlessTask
|
||||
from paperless_ai import indexing
|
||||
|
||||
|
||||
@pytest.fixture
|
||||
def temp_llm_index_dir(tmp_path):
|
||||
original_dir = indexing.settings.LLM_INDEX_DIR
|
||||
indexing.settings.LLM_INDEX_DIR = tmp_path
|
||||
yield tmp_path
|
||||
indexing.settings.LLM_INDEX_DIR = original_dir
|
||||
|
||||
|
||||
@pytest.fixture
|
||||
def real_document(db):
|
||||
return Document.objects.create(
|
||||
|
||||
@@ -3,7 +3,6 @@ from unittest.mock import MagicMock
|
||||
from unittest.mock import patch
|
||||
|
||||
import pytest
|
||||
from django.conf import settings
|
||||
|
||||
from documents.models import Document
|
||||
from paperless.models import LLMEmbeddingBackend
|
||||
@@ -19,14 +18,6 @@ def mock_ai_config():
|
||||
yield MockAIConfig
|
||||
|
||||
|
||||
@pytest.fixture
|
||||
def temp_llm_index_dir(tmp_path):
|
||||
original_dir = settings.LLM_INDEX_DIR
|
||||
settings.LLM_INDEX_DIR = tmp_path
|
||||
yield tmp_path
|
||||
settings.LLM_INDEX_DIR = original_dir
|
||||
|
||||
|
||||
@pytest.fixture
|
||||
def mock_document():
|
||||
doc = MagicMock(spec=Document)
|
||||
|
||||
@@ -1,18 +1,8 @@
|
||||
from django.apps import AppConfig
|
||||
from django.conf import settings
|
||||
from django.utils.translation import gettext_lazy as _
|
||||
|
||||
from paperless_mail.signals import mail_consumer_declaration
|
||||
|
||||
|
||||
class PaperlessMailConfig(AppConfig):
|
||||
name = "paperless_mail"
|
||||
|
||||
verbose_name = _("Paperless mail")
|
||||
|
||||
def ready(self) -> None:
|
||||
from documents.signals import document_consumer_declaration
|
||||
|
||||
if settings.TIKA_ENABLED:
|
||||
document_consumer_declaration.connect(mail_consumer_declaration)
|
||||
AppConfig.ready(self)
|
||||
|
||||