mirror of
https://github.com/paperless-ngx/paperless-ngx.git
synced 2026-06-28 16:24:19 +00:00
Feature: OCR Templates (#13043)
[skip ci] Signed-off-by: dependabot[bot] <support@github.com> Co-Authored-By: dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com> Co-Authored-By: stumpylog <797416+stumpylog@users.noreply.github.com> Co-Authored-By: GitHub Actions <41898282+github-actions[bot]@users.noreply.github.com> Co-Authored-By: shamoon <4887959+shamoon@users.noreply.github.com>
This commit is contained in:
committed by
shamoon
parent
bf70e597ee
commit
bf73b5b1d1
@@ -13,6 +13,8 @@ import { DocumentDetailComponent } from './components/document-detail/document-d
|
||||
import { DocumentListComponent } from './components/document-list/document-list.component'
|
||||
import { DocumentAttributesComponent } from './components/manage/document-attributes/document-attributes.component'
|
||||
import { MailComponent } from './components/manage/mail/mail.component'
|
||||
import { OcrTemplateEditorComponent } from './components/manage/ocr-templates/ocr-template-editor/ocr-template-editor.component'
|
||||
import { OcrTemplatesComponent } from './components/manage/ocr-templates/ocr-templates.component'
|
||||
import { SavedViewsComponent } from './components/manage/saved-views/saved-views.component'
|
||||
import { WorkflowsComponent } from './components/manage/workflows/workflows.component'
|
||||
import { NotFoundComponent } from './components/not-found/not-found.component'
|
||||
@@ -274,6 +276,30 @@ export const routes: Routes = [
|
||||
componentName: 'WorkflowsComponent',
|
||||
},
|
||||
},
|
||||
{
|
||||
path: 'ocr-templates',
|
||||
component: OcrTemplatesComponent,
|
||||
canActivate: [PermissionsGuard],
|
||||
data: {
|
||||
requiredPermission: {
|
||||
action: PermissionAction.View,
|
||||
type: PermissionType.OcrTemplate,
|
||||
},
|
||||
componentName: 'OcrTemplatesComponent',
|
||||
},
|
||||
},
|
||||
{
|
||||
path: 'ocr-templates/:id',
|
||||
component: OcrTemplateEditorComponent,
|
||||
canActivate: [PermissionsGuard],
|
||||
data: {
|
||||
requiredPermission: {
|
||||
action: PermissionAction.Change,
|
||||
type: PermissionType.OcrTemplate,
|
||||
},
|
||||
componentName: 'OcrTemplateEditorComponent',
|
||||
},
|
||||
},
|
||||
{
|
||||
path: 'mail',
|
||||
component: MailComponent,
|
||||
|
||||
@@ -243,6 +243,14 @@
|
||||
<i-bs class="me-2" name="boxes"></i-bs><span><ng-container i18n>Workflows</ng-container></span>
|
||||
</a>
|
||||
</li>
|
||||
<li class="nav-item app-link"
|
||||
*pngxIfPermissions="{ action: PermissionAction.View, type: PermissionType.OcrTemplate }">
|
||||
<a class="nav-link" routerLink="ocr-templates" routerLinkActive="active" (click)="closeMenu()"
|
||||
ngbPopover="OCR Templates" i18n-ngbPopover [disablePopover]="!slimSidebarEnabled" placement="end"
|
||||
container="body" triggers="mouseenter:mouseleave" popoverClass="popover-slim">
|
||||
<i-bs class="me-2" name="file-earmark-break"></i-bs><span><ng-container i18n>OCR Templates</ng-container></span>
|
||||
</a>
|
||||
</li>
|
||||
<li class="nav-item app-link" *pngxIfPermissions="{ action: PermissionAction.View, type: PermissionType.MailAccount }"
|
||||
tourAnchor="tour.mail">
|
||||
<a class="nav-link" routerLink="mail" routerLinkActive="active" (click)="closeMenu()" ngbPopover="Mail"
|
||||
|
||||
@@ -82,6 +82,14 @@
|
||||
<i-bs name="pencil" class="me-1"></i-bs><ng-container i18n>PDF Editor</ng-container>
|
||||
</button>
|
||||
|
||||
<button ngbDropdownItem (click)="runZoneOcr()" [disabled]="!userCanEdit || !document?.document_type">
|
||||
<i-bs width="1em" height="1em" name="file-earmark-ruled" class="me-1"></i-bs><span i18n>Run Zone OCR</span>
|
||||
</button>
|
||||
|
||||
<button ngbDropdownItem (click)="createOcrTemplate()">
|
||||
<i-bs width="1em" height="1em" name="file-earmark-medical" class="me-1"></i-bs><span i18n>Create OCR Template</span>
|
||||
</button>
|
||||
|
||||
@if (userIsOwner && (requiresPassword || password)) {
|
||||
<button ngbDropdownItem (click)="removePassword()" [disabled]="!password">
|
||||
<i-bs name="unlock" class="me-1"></i-bs><ng-container i18n>Remove Password</ng-container>
|
||||
|
||||
@@ -1405,6 +1405,48 @@ export class DocumentDetailComponent
|
||||
})
|
||||
}
|
||||
|
||||
/**
 * Runs zone OCR for the current document and reports the outcome in a toast:
 * how many zones produced a value and, if any did not, the names of those
 * zones. After a successful response the document is fetched again and the
 * component is refreshed with it. Request failures are shown as an error toast.
 */
runZoneOcr() {
  // A zone is treated as unmatched when its value is null, undefined or
  // stringifies to only whitespace.
  const isBlank = (value: unknown): boolean =>
    value === null || value === undefined || `${value}`.trim() === ''

  this.documentsService.runZoneOcr(this.document.id).subscribe({
    error: (error) => {
      this.toastService.showError($localize`Zone OCR failed`, error)
    },
    next: (res) => {
      const results = res.results ?? []
      if (!results.length) {
        this.toastService.showInfo(
          $localize`Zone OCR ran but no results extracted.`
        )
      } else {
        const failed = results.filter((r) => isBlank(r.value))
        const filled = results.length - failed.length
        const parts = [$localize`Filled ${filled} of ${results.length} fields`]
        if (failed.length) {
          const names = failed.map((r) => r.zone).join(', ')
          parts.push($localize`Failed to match zones: ${names}`)
        }
        this.toastService.showInfo(parts.join('. '))
      }
      // Reload the document so the view reflects whatever the OCR run wrote.
      this.documentsService
        .get(this.documentId)
        .subscribe((doc) => this.updateComponent(doc))
    },
  })
}
|
||||
|
||||
/**
 * Navigates to the "new OCR template" editor route, passing the current
 * document's type and id as the `document_type` and `sample_document`
 * query parameters.
 */
createOcrTemplate() {
  const { document_type, id } = this.document
  this.router.navigate(['/ocr-templates', 'new'], {
    queryParams: { document_type, sample_document: id },
  })
}
|
||||
|
||||
private getSelectedNonLatestVersionId(): number | null {
|
||||
const versions = this.document?.versions ?? []
|
||||
if (!versions.length || !this.selectedVersionId) {
|
||||
|
||||
@@ -95,6 +95,9 @@
|
||||
<button ngbDropdownItem (click)="mergeSelected()" [disabled]="!userCanAdd || list.allSelected || list.selectedCount < 2">
|
||||
<i-bs name="journals" class="me-1"></i-bs><ng-container i18n>Merge</ng-container>
|
||||
</button>
|
||||
<button ngbDropdownItem (click)="runZoneOcrSelected()" [disabled]="!userCanEditAll || list.allSelected">
|
||||
<i-bs name="file-earmark-ruled" class="me-1"></i-bs><ng-container i18n>Run Zone OCR</ng-container>
|
||||
</button>
|
||||
</div>
|
||||
</div>
|
||||
</div>
|
||||
|
||||
@@ -12,7 +12,15 @@ import {
|
||||
} from '@ng-bootstrap/ng-bootstrap'
|
||||
import { saveAs } from 'file-saver'
|
||||
import { NgxBootstrapIconsModule } from 'ngx-bootstrap-icons'
|
||||
import { first, map, Observable, Subject, switchMap, takeUntil } from 'rxjs'
|
||||
import {
|
||||
first,
|
||||
forkJoin,
|
||||
map,
|
||||
Observable,
|
||||
Subject,
|
||||
switchMap,
|
||||
takeUntil,
|
||||
} from 'rxjs'
|
||||
import { ConfirmDialogComponent } from 'src/app/components/common/confirm-dialog/confirm-dialog.component'
|
||||
import { CustomField } from 'src/app/data/custom-field'
|
||||
import { MatchingModel } from 'src/app/data/matching-model'
|
||||
@@ -908,6 +916,27 @@ export class BulkEditorComponent
|
||||
})
|
||||
}
|
||||
|
||||
/**
 * Asks for confirmation and then runs zone OCR on every selected document.
 * One request per selected id is issued and the combined observable is
 * handed to `executeDocumentAction` together with the dialog. Does nothing
 * when the selection is empty.
 */
runZoneOcrSelected() {
  const ids = Array.from(this.list.selected)
  if (ids.length === 0) {
    return
  }

  const modal = this.modalService.open(ConfirmDialogComponent, {
    backdrop: 'static',
  })
  const dialog = modal.componentInstance
  dialog.title = $localize`Run Zone OCR`
  dialog.messageBold = $localize`Run zone OCR on ${this.getSelectionSize()} selected document(s)?`
  dialog.message = $localize`Each document's type template (if it has one) is applied, overwriting the mapped fields.`
  dialog.btnCaption = $localize`Proceed`

  dialog.confirmClicked
    .pipe(takeUntil(this.unsubscribeNotifier))
    .subscribe(() => {
      // Disable the dialog buttons while the requests are handed off.
      dialog.buttonsEnabled = false
      const requests = ids.map((id) => this.documentService.runZoneOcr(id))
      this.executeDocumentAction(modal, forkJoin(requests))
    })
}
|
||||
|
||||
setPermissions() {
|
||||
let modal = this.modalService.open(PermissionsDialogComponent, {
|
||||
backdrop: 'static',
|
||||
|
||||
+414
@@ -0,0 +1,414 @@
|
||||
<pngx-page-header [title]="pageTitle" [id]="template.id">
|
||||
<div class="input-group input-group-sm me-5 align-items-center">
|
||||
<div class="input-group-text">
|
||||
<i-bs name="file-text"></i-bs>
|
||||
</div>
|
||||
<input
|
||||
type="text"
|
||||
class="form-control"
|
||||
[(ngModel)]="previewDocModel"
|
||||
[ngbTypeahead]="searchDocuments"
|
||||
[inputFormatter]="documentFormatter"
|
||||
[resultFormatter]="documentFormatter"
|
||||
(selectItem)="onPreviewDocSelected($event)"
|
||||
[editable]="false"
|
||||
placeholder="Search documents by title..."
|
||||
i18n-placeholder
|
||||
/>
|
||||
</div>
|
||||
|
||||
<div class="d-flex align-items-center flex-wrap gap-2">
|
||||
<div class="input-group input-group-sm ms-2 d-none d-md-flex">
|
||||
<div class="input-group-text" i18n>Page</div>
|
||||
<input class="form-control flex-grow-0 w-auto" type="number" min="1" [max]="previewPageCount" [(ngModel)]="previewPageDisplay" />
|
||||
<div class="input-group-text" i18n>of {{previewPageCount}}</div>
|
||||
</div>
|
||||
<button type="button" class="btn btn-sm btn-outline-secondary" i18n-title title="Previous" (click)="prevPage()" [disabled]="!pageImageUrl || previewPage <= 0">
|
||||
<i-bs width="1.2em" height="1.2em" name="arrow-left"></i-bs>
|
||||
</button>
|
||||
<button type="button" class="btn btn-sm btn-outline-secondary" i18n-title title="Next" (click)="nextPage()" [disabled]="!pageImageUrl || previewPage >= (previewPageCount ?? 1) - 1">
|
||||
<i-bs width="1.2em" height="1.2em" name="arrow-right"></i-bs>
|
||||
</button>
|
||||
|
||||
<!-- Zoom controls: explicit type="button" and translated titles, matching the
     Previous/Next buttons beside them. The "-" / "+" glyphs themselves are not
     marked for translation. -->
<div class="input-group input-group-sm">
  <button type="button" class="btn btn-outline-secondary" i18n-title title="Zoom out" (click)="zoomOut()">-</button>
  <span class="input-group-text">{{ zoom * 100 | number: '1.0-0' }}%</span>
  <button type="button" class="btn btn-outline-secondary" i18n-title title="Zoom in" (click)="zoomIn()">+</button>
</div>
|
||||
</div>
|
||||
</pngx-page-header>
|
||||
|
||||
<div class="row">
|
||||
<div class="col-md-4">
|
||||
<div class="btn-toolbar mb-1 border-bottom">
|
||||
<div class="btn-group pb-3">
|
||||
<a routerLink="/ocr-templates" class="btn btn-sm btn-outline-secondary">
|
||||
<i-bs width="1.2em" height="1.2em" name="x"></i-bs>
|
||||
<span class="ms-1" i18n>Close</span>
|
||||
</a>
|
||||
</div>
|
||||
<div class="btn-group ms-auto pb-3">
|
||||
<button class="btn btn-sm btn-primary" (click)="save()" [disabled]="saving">
|
||||
@if (saving) {
|
||||
<span class="spinner-border spinner-border-sm me-1"></span>
|
||||
}
|
||||
<span i18n>Save</span>
|
||||
</button>
|
||||
</div>
|
||||
</div>
|
||||
|
||||
<ul ngbNav #nav="ngbNav" [(activeId)]="activeTab" class="nav-underline flex-nowrap flex-md-wrap overflow-auto">
|
||||
<li ngbNavItem="settings">
|
||||
<a ngbNavLink i18n>Settings</a>
|
||||
<ng-template ngbNavContent>
|
||||
<div class="row mb-3">
|
||||
<div class="col-9">
|
||||
<pngx-input-text [(ngModel)]="template.name" title="Template name" i18n-title></pngx-input-text>
|
||||
</div>
|
||||
<div class="col-3">
|
||||
<pngx-input-switch [(ngModel)]="template.enabled" title="Enabled" i18n-title></pngx-input-switch>
|
||||
</div>
|
||||
</div>
|
||||
|
||||
<pngx-input-select [(ngModel)]="template.document_type" [items]="documentTypes" bindLabel="name" bindValue="id" title="Document type" i18n-title></pngx-input-select>
|
||||
|
||||
<small class="text-muted" i18n>
|
||||
Draw rectangles on the preview to define extraction zones. Use the
|
||||
page controls above the preview to add zones on different pages.
|
||||
</small>
|
||||
</ng-template>
|
||||
</li>
|
||||
|
||||
<li ngbNavItem="zones">
|
||||
<a ngbNavLink><ng-container i18n>Zones</ng-container> <span class="badge bg-primary ms-2">{{ template.zones.length }}</span></a>
|
||||
<ng-template ngbNavContent>
|
||||
@if (template.zones.length === 0) {
|
||||
<p class="text-muted" i18n>
|
||||
No zones defined. Load a document preview and draw rectangles to add zones.
|
||||
</p>
|
||||
}
|
||||
<div class="list-group">
|
||||
@for (zone of template.zones; track $index; let i = $index) {
|
||||
<div
|
||||
class="list-group-item list-group-item-action d-flex justify-content-between align-items-center"
|
||||
[style.box-shadow]="selectedZoneIndex === i ? 'inset 3px 0 0 0 var(--bs-primary)' : null"
|
||||
>
|
||||
<div class="flex-grow-1" role="button" style="cursor: pointer;" (click)="selectZone(i)">
|
||||
<div><strong [class.text-primary]="selectedZoneIndex === i">{{ zone.name }}</strong></div>
|
||||
<div class="small text-muted">
|
||||
{{ getZoneTargetName(zone) }} - {{ zone.width }}x{{ zone.height }}px <ng-container i18n>p.</ng-container>{{ zonePage(zone) }}
|
||||
</div>
|
||||
</div>
|
||||
<div class="btn-group">
|
||||
<button class="btn btn-sm btn-outline-secondary" type="button" (click)="selectZone(i)" title="Edit" i18n-title>
|
||||
<i-bs name="pencil"></i-bs>
|
||||
</button>
|
||||
<button class="btn btn-sm btn-outline-danger" type="button" (click)="removeZone(i)" title="Delete" i18n-title>
|
||||
<i-bs name="trash"></i-bs>
|
||||
</button>
|
||||
</div>
|
||||
</div>
|
||||
}
|
||||
</div>
|
||||
</ng-template>
|
||||
</li>
|
||||
|
||||
<li ngbNavItem="zone">
|
||||
<a ngbNavLink i18n>Zone</a>
|
||||
<ng-template ngbNavContent>
|
||||
@if (selectedZone; as zone) {
|
||||
<div class="d-flex justify-content-between align-items-center mb-3">
|
||||
<strong>{{ zone.name }}</strong>
|
||||
<div class="d-flex gap-2">
|
||||
<button class="btn btn-sm btn-primary" (click)="save()" [disabled]="saving">
|
||||
@if (saving) {
|
||||
<span class="spinner-border spinner-border-sm me-1"></span>
|
||||
}
|
||||
<span i18n>Save</span>
|
||||
</button>
|
||||
<button class="btn btn-sm btn-outline-danger" (click)="deleteSelectedZone()">
|
||||
<i-bs name="trash" class="me-1"></i-bs><ng-container i18n>Delete zone</ng-container>
|
||||
</button>
|
||||
</div>
|
||||
</div>
|
||||
|
||||
<div class="mb-3">
|
||||
<label class="form-label" i18n>Zone Name</label>
|
||||
<input
|
||||
type="text"
|
||||
class="form-control"
|
||||
[(ngModel)]="zone.name"
|
||||
(ngModelChange)="redrawCanvas()"
|
||||
/>
|
||||
</div>
|
||||
|
||||
<div class="mb-3">
|
||||
<label class="form-label" i18n>Page</label>
|
||||
<input
|
||||
type="number"
|
||||
class="form-control"
|
||||
[(ngModel)]="zone.page"
|
||||
min="-1"
|
||||
(ngModelChange)="redrawCanvas()"
|
||||
/>
|
||||
<small class="text-muted" i18n>Page this zone is on. Use -1 for the last page. Set automatically when you draw it.</small>
|
||||
</div>
|
||||
|
||||
<div class="mb-3">
|
||||
<label class="form-label" i18n>Field</label>
|
||||
<div class="input-group">
|
||||
<select class="form-select" [ngModel]="zoneFieldValue(zone)" (ngModelChange)="setZoneField(zone, $event)">
|
||||
<optgroup label="Built-in fields" i18n-label>
|
||||
@for (t of builtinTargets; track t.id) {
|
||||
<option [ngValue]="t.id">{{ t.name }}</option>
|
||||
}
|
||||
</optgroup>
|
||||
<optgroup label="Custom fields" i18n-label>
|
||||
@for (cf of customFields; track cf.id) {
|
||||
<option [ngValue]="cf.id">{{ cf.name }} ({{ cf.data_type }})</option>
|
||||
}
|
||||
</optgroup>
|
||||
</select>
|
||||
<button
|
||||
class="btn btn-outline-secondary"
|
||||
type="button"
|
||||
(click)="openQuickCreate(selectedZoneIndex)"
|
||||
title="Create new custom field"
|
||||
i18n-title
|
||||
>
|
||||
<i-bs name="plus"></i-bs>
|
||||
</button>
|
||||
</div>
|
||||
<small class="text-muted" i18n>Write the extracted value to a custom field, or to a built-in field (Title, ASN, Date created).</small>
|
||||
</div>
|
||||
|
||||
@if (isFieldShared(zone)) {
|
||||
<div class="card mb-3 border-info">
|
||||
<div class="card-body">
|
||||
<h6 class="card-title d-flex align-items-center gap-2">
|
||||
<i-bs name="braces"></i-bs>
|
||||
<span i18n>Combine zones into this field</span>
|
||||
</h6>
|
||||
<p class="small text-muted mb-2" i18n>
|
||||
More than one zone writes to this field. Build the combined
|
||||
value below: click a zone to insert its token, and type any
|
||||
separators or literal text between tokens.
|
||||
</p>
|
||||
<div class="d-flex flex-wrap gap-1 mb-2">
|
||||
@for (z of zonesForField(zone); track $index) {
|
||||
<button
|
||||
type="button"
|
||||
class="btn btn-sm btn-outline-info"
|
||||
(click)="insertCombineToken(zone, z)"
|
||||
title="Insert token"
|
||||
i18n-title
|
||||
>
|
||||
+ {{ z.name || 'Zone' }}
|
||||
</button>
|
||||
}
|
||||
</div>
|
||||
<input
|
||||
type="text"
|
||||
class="form-control font-monospace"
|
||||
[ngModel]="getCombineFormat(zone)"
|
||||
(ngModelChange)="setCombineFormat(zone, $event)"
|
||||
placeholder="{Zone 1} - {Zone 2}"
|
||||
/>
|
||||
<small class="text-muted" i18n>
|
||||
Tokens are matched by zone name. An empty zone leaves its
|
||||
token blank and the stray separator is trimmed. Leave empty
|
||||
to just join the zones in order with a space.
|
||||
</small>
|
||||
</div>
|
||||
</div>
|
||||
}
|
||||
|
||||
@if (showQuickCreate) {
|
||||
<div class="card mb-3 border-primary">
|
||||
<div class="card-body">
|
||||
<h6 class="card-title" i18n>Create Custom Field</h6>
|
||||
<div class="mb-2">
|
||||
<label class="form-label small" i18n>Field Name</label>
|
||||
<input type="text" class="form-control form-control-sm"
|
||||
[(ngModel)]="quickCreateName" placeholder="e.g. Invoice Number" />
|
||||
</div>
|
||||
<div class="mb-2">
|
||||
<label class="form-label small" i18n>Field Type</label>
|
||||
<select class="form-select form-select-sm" [(ngModel)]="quickCreateType">
|
||||
@for (t of quickCreateTypes; track t.id) {
|
||||
<option [ngValue]="t.id">{{ t.name }}</option>
|
||||
}
|
||||
</select>
|
||||
</div>
|
||||
<div class="d-flex gap-2">
|
||||
<button class="btn btn-primary btn-sm" (click)="submitQuickCreate()"
|
||||
[disabled]="!quickCreateName.trim()" i18n>
|
||||
Create & Assign
|
||||
</button>
|
||||
<button class="btn btn-outline-secondary btn-sm" (click)="cancelQuickCreate()" i18n>
|
||||
Cancel
|
||||
</button>
|
||||
</div>
|
||||
</div>
|
||||
</div>
|
||||
}
|
||||
|
||||
<div class="mb-3">
|
||||
<label class="form-label" i18n>OCR Language</label>
|
||||
<ng-select
|
||||
[items]="ocrLanguageOptions"
|
||||
bindLabel="name"
|
||||
bindValue="id"
|
||||
[multiple]="true"
|
||||
[closeOnSelect]="false"
|
||||
[ngModel]="ocrLanguageArray(zone)"
|
||||
(ngModelChange)="setOcrLanguages(zone, $event)"
|
||||
placeholder="Select languages"
|
||||
i18n-placeholder
|
||||
></ng-select>
|
||||
</div>
|
||||
|
||||
<div class="mb-3">
|
||||
<label class="form-label" i18n>Transform</label>
|
||||
<select class="form-select" [(ngModel)]="zone.transform">
|
||||
@for (opt of transformOptions; track opt.id) {
|
||||
<option [ngValue]="opt.id">{{ opt.name }}</option>
|
||||
}
|
||||
</select>
|
||||
</div>
|
||||
|
||||
@if (zone.transform === 'date') {
|
||||
<div class="mb-3">
|
||||
<label class="form-label" i18n>Date format</label>
|
||||
<select class="form-select" [ngModel]="dateFormatChoice(zone)" (ngModelChange)="setDateFormatChoice(zone, $event)">
|
||||
@for (opt of dateFormatOptions; track opt.id) {
|
||||
<option [ngValue]="opt.id">{{ opt.name }}</option>
|
||||
}
|
||||
<option [ngValue]="'custom'" i18n>Custom...</option>
|
||||
</select>
|
||||
@if (dateFormatCustom) {
|
||||
<div class="input-group mt-2">
|
||||
<input type="text" class="form-control font-monospace" [(ngModel)]="zone.date_format" placeholder="%d.%m.%Y" />
|
||||
<button class="btn btn-outline-secondary" type="button" [ngbPopover]="dateFmtHelp" [autoClose]="true" title="Date format help" i18n-title>
|
||||
<i-bs name="question-circle"></i-bs>
|
||||
</button>
|
||||
</div>
|
||||
<ng-template #dateFmtHelp>
|
||||
<p class="mb-1" i18n>Python date codes:</p>
|
||||
<ul class="mb-1 ps-3">
|
||||
<li><code>%d</code> <ng-container i18n>day (01-31)</ng-container></li>
|
||||
<li><code>%m</code> <ng-container i18n>month (01-12)</ng-container></li>
|
||||
<li><code>%Y</code> <ng-container i18n>year, 4-digit</ng-container></li>
|
||||
<li><code>%y</code> <ng-container i18n>year, 2-digit</ng-container></li>
|
||||
<li><code>%b</code> <ng-container i18n>month name (Jan)</ng-container></li>
|
||||
</ul>
|
||||
<span i18n>Example:</span> <code>%d.%m.%Y</code> -> 03.03.2026
|
||||
</ng-template>
|
||||
}
|
||||
</div>
|
||||
}
|
||||
|
||||
<div class="mb-3">
|
||||
<label class="form-label" i18n>Validation Regex</label>
|
||||
<input
|
||||
type="text"
|
||||
class="form-control font-monospace"
|
||||
[(ngModel)]="zone.validation_regex"
|
||||
placeholder="e.g. \d{2}\.\d{2}\.\d{4}"
|
||||
>
|
||||
</div>
|
||||
|
||||
<div class="text-muted small">
|
||||
{{ zone.x }}, {{ zone.y }} - {{ zone.width }}x{{ zone.height }}px
|
||||
</div>
|
||||
|
||||
<hr class="my-3" />
|
||||
<h6 i18n>Test</h6>
|
||||
@if (!previewDocId) {
|
||||
<!-- Shown while no preview document is selected; the document search box is in the page header. -->
<p class="text-muted small mb-0" i18n>
  Select a document using the search box in the page header to test this zone.
</p>
|
||||
} @else {
|
||||
<button class="btn btn-sm btn-outline-secondary" (click)="testZone()" [disabled]="zoneTesting">
|
||||
@if (zoneTesting) {
|
||||
<span class="spinner-border spinner-border-sm me-1"></span>
|
||||
}
|
||||
<span i18n>Test this zone</span>
|
||||
</button>
|
||||
@if (zoneTestResult) {
|
||||
@if (zoneTestResult.error) {
|
||||
<div class="alert alert-warning py-2 mt-2 mb-0 small">{{ zoneTestResult.error }}</div>
|
||||
} @else {
|
||||
<dl class="row small mt-2 mb-0">
|
||||
<dt class="col-sm-4" i18n>OCR text</dt>
|
||||
<dd class="col-sm-8"><code>{{ zoneTestResult.raw_text || '(nothing detected)' }}</code></dd>
|
||||
<dt class="col-sm-4" i18n>Value</dt>
|
||||
<dd class="col-sm-8"><code>{{ zoneTestResult.value || '(empty)' }}</code></dd>
|
||||
@if (zoneTestResult.regex) {
|
||||
<dt class="col-sm-4" i18n>Validation</dt>
|
||||
<dd class="col-sm-8">
|
||||
@if (zoneTestResult.regex_match) {
|
||||
<span class="badge bg-success" i18n>Regex matches</span>
|
||||
} @else {
|
||||
<span class="badge bg-danger" i18n>Regex does not match</span>
|
||||
}
|
||||
</dd>
|
||||
}
|
||||
</dl>
|
||||
}
|
||||
}
|
||||
}
|
||||
} @else {
|
||||
<p class="text-muted" i18n>
|
||||
Select a zone from the Zones tab, or draw a rectangle on the document to create one.
|
||||
</p>
|
||||
}
|
||||
</ng-template>
|
||||
</li>
|
||||
</ul>
|
||||
|
||||
<div [ngbNavOutlet]="nav" class="mt-3"></div>
|
||||
</div>
|
||||
|
||||
<!-- Right column: Document preview with zone overlay -->
|
||||
<div class="col-md-8">
|
||||
@if (pageImageUrl) {
|
||||
<div class="border" style="overflow: auto; max-height: 78vh;">
|
||||
<div class="position-relative d-inline-block" [style.width.%]="zoom * 100">
|
||||
<img
|
||||
#pageImage
|
||||
[src]="pageImageUrl"
|
||||
(load)="onImageLoad()"
|
||||
style="width: 100%; display: block;"
|
||||
[style.visibility]="imageLoaded ? 'visible' : 'hidden'"
|
||||
crossorigin="use-credentials"
|
||||
/>
|
||||
@if (imageLoaded) {
|
||||
<canvas
|
||||
#zoneCanvas
|
||||
class="position-absolute top-0 start-0"
|
||||
style="width: 100%; height: 100%; cursor: crosshair;"
|
||||
(mousedown)="onCanvasMouseDown($event)"
|
||||
(mousemove)="onCanvasMouseMove($event)"
|
||||
(mouseup)="onCanvasMouseUp($event)"
|
||||
></canvas>
|
||||
}
|
||||
@if (!imageLoaded) {
|
||||
<div class="d-flex justify-content-center p-5">
|
||||
<div class="spinner-border" role="status">
|
||||
<span class="visually-hidden" i18n>Loading page...</span>
|
||||
</div>
|
||||
</div>
|
||||
}
|
||||
</div>
|
||||
</div>
|
||||
} @else {
|
||||
<!-- Placeholder shown until a preview page image is available. -->
<div class="border rounded p-5 text-center text-muted">
  <i-bs name="file-earmark-image" width="48" height="48"></i-bs>
  <p class="mt-3" i18n>
    Search for a document by title in the page header to preview a page and draw extraction zones.
  </p>
</div>
|
||||
}
|
||||
</div>
|
||||
</div>
|
||||
+3
@@ -0,0 +1,3 @@
|
||||
/* Render the component's host element as a block-level box. */
:host {
|
||||
display: block;
|
||||
}
|
||||
+997
@@ -0,0 +1,997 @@
|
||||
import { CommonModule } from '@angular/common'
|
||||
import {
|
||||
AfterViewInit,
|
||||
Component,
|
||||
ElementRef,
|
||||
inject,
|
||||
OnDestroy,
|
||||
OnInit,
|
||||
ViewChild,
|
||||
} from '@angular/core'
|
||||
import { FormsModule } from '@angular/forms'
|
||||
import { ActivatedRoute, Router, RouterModule } from '@angular/router'
|
||||
import {
|
||||
NgbNavModule,
|
||||
NgbPopoverModule,
|
||||
NgbTypeaheadModule,
|
||||
NgbTypeaheadSelectItemEvent,
|
||||
} from '@ng-bootstrap/ng-bootstrap'
|
||||
import { NgSelectModule } from '@ng-select/ng-select'
|
||||
import { NgxBootstrapIconsModule } from 'ngx-bootstrap-icons'
|
||||
import {
|
||||
catchError,
|
||||
debounceTime,
|
||||
distinctUntilChanged,
|
||||
map,
|
||||
Observable,
|
||||
of,
|
||||
Subject,
|
||||
switchMap,
|
||||
takeUntil,
|
||||
} from 'rxjs'
|
||||
import { SelectComponent } from 'src/app/components/common/input/select/select.component'
|
||||
import { SwitchComponent } from 'src/app/components/common/input/switch/switch.component'
|
||||
import { TextComponent } from 'src/app/components/common/input/text/text.component'
|
||||
import { PageHeaderComponent } from 'src/app/components/common/page-header/page-header.component'
|
||||
import { CustomField } from 'src/app/data/custom-field'
|
||||
import { Document } from 'src/app/data/document'
|
||||
import { DocumentType } from 'src/app/data/document-type'
|
||||
import {
|
||||
DATE_FORMAT_OPTIONS,
|
||||
OCR_BUILTIN_TARGETS,
|
||||
OCR_LANGUAGE_OPTIONS,
|
||||
OcrTemplate,
|
||||
OcrTemplateZone,
|
||||
OcrZoneTestResult,
|
||||
TRANSFORM_OPTIONS,
|
||||
ZoneTestRequest,
|
||||
} from 'src/app/data/ocr-template'
|
||||
import { CorrespondentService } from 'src/app/services/rest/correspondent.service'
|
||||
import { CustomFieldsService } from 'src/app/services/rest/custom-fields.service'
|
||||
import { DocumentTypeService } from 'src/app/services/rest/document-type.service'
|
||||
import { DocumentService } from 'src/app/services/rest/document.service'
|
||||
import { OcrTemplateService } from 'src/app/services/rest/ocr-template.service'
|
||||
import { ToastService } from 'src/app/services/toast.service'
|
||||
|
||||
/**
 * Start and end points of a rectangle, stored as two corner coordinates.
 * NOTE(review): presumably the in-progress drag rectangle on the zone canvas
 * (see `currentRect`); the coordinate space (canvas vs. image pixels) is not
 * visible here — confirm against the mouse handlers.
 */
interface DrawingRect {
|
||||
startX: number
|
||||
startY: number
|
||||
endX: number
|
||||
endY: number
|
||||
}
|
||||
|
||||
/**
 * Identifier of a resize handle on a zone rectangle.
 * NOTE(review): the eight values look like compass directions for the four
 * edges and four corners — confirm against the resize logic.
 */
type ResizeHandle = 'n' | 's' | 'e' | 'w' | 'ne' | 'nw' | 'se' | 'sw'
|
||||
|
||||
/** Ids of the editor's nav tabs; these match the `ngbNavItem` values used in the component template. */
type ActiveTab = 'settings' | 'zones' | 'zone'
|
||||
|
||||
@Component({
|
||||
selector: 'pngx-ocr-template-editor',
|
||||
standalone: true,
|
||||
imports: [
|
||||
PageHeaderComponent,
|
||||
TextComponent,
|
||||
SelectComponent,
|
||||
SwitchComponent,
|
||||
CommonModule,
|
||||
FormsModule,
|
||||
RouterModule,
|
||||
NgbNavModule,
|
||||
NgbPopoverModule,
|
||||
NgbTypeaheadModule,
|
||||
NgSelectModule,
|
||||
NgxBootstrapIconsModule,
|
||||
],
|
||||
templateUrl: './ocr-template-editor.component.html',
|
||||
styleUrls: ['./ocr-template-editor.component.scss'],
|
||||
})
|
||||
export class OcrTemplateEditorComponent
|
||||
implements OnInit, OnDestroy, AfterViewInit
|
||||
{
|
||||
private readonly route = inject(ActivatedRoute)
|
||||
private readonly router = inject(Router)
|
||||
private readonly templateService = inject(OcrTemplateService)
|
||||
private readonly customFieldsService = inject(CustomFieldsService)
|
||||
private readonly documentTypeService = inject(DocumentTypeService)
|
||||
private readonly correspondentService = inject(CorrespondentService)
|
||||
private readonly documentService = inject(DocumentService)
|
||||
private readonly toastService = inject(ToastService)
|
||||
private readonly destroy$ = new Subject<void>()
|
||||
|
||||
@ViewChild('zoneCanvas') canvasRef: ElementRef<HTMLCanvasElement>
|
||||
@ViewChild('pageImage') imageRef: ElementRef<HTMLImageElement>
|
||||
|
||||
template: OcrTemplate = {
|
||||
id: null,
|
||||
name: '',
|
||||
document_type: null,
|
||||
sample_document: null,
|
||||
source_width: 0,
|
||||
source_height: 0,
|
||||
enabled: true,
|
||||
combine_formats: {},
|
||||
zones: [],
|
||||
}
|
||||
|
||||
customFields: CustomField[] = []
|
||||
documentTypes: DocumentType[] = []
|
||||
transformOptions = TRANSFORM_OPTIONS
|
||||
builtinTargets = OCR_BUILTIN_TARGETS
|
||||
dateFormatOptions = DATE_FORMAT_OPTIONS
|
||||
ocrLanguageOptions = OCR_LANGUAGE_OPTIONS
|
||||
dateFormatCustom = false
|
||||
isNew = true
|
||||
saving = false
|
||||
|
||||
previewDocId: number | null = null
|
||||
previewPage = 0
|
||||
previewPageCount: number | null = null
|
||||
private pageCountForDoc: number | null = null
|
||||
pageImageUrl: string | null = null
|
||||
imageLoaded = false
|
||||
zoom = 1
|
||||
previewDocModel: Document | string = ''
|
||||
private correspondentNames = new Map<number, string>()
|
||||
|
||||
/** One-based page number shown in the page input (`previewPage` is zero-based). */
public get previewPageDisplay(): number {
  return this.previewPage + 1
}

/**
 * Sets the preview page from a one-based page number.
 *
 * The value comes from a number input, so it may be null (emptied field),
 * zero, negative or fractional. It is truncated to an integer and clamped to
 * the range 1..previewPageCount (no upper bound while the page count is
 * unknown), so `previewPage` can never become -1 or exceed the last page.
 */
public set previewPageDisplay(value: number) {
  // An empty or non-numeric input falls back to the first page.
  const requested = Number.isFinite(value) ? Math.trunc(value) : 1
  const firstOrLater = Math.max(1, requested)
  const lastPage = Math.max(1, this.previewPageCount ?? firstOrLater)
  this.previewPage = Math.min(firstOrLater, lastPage) - 1
}
|
||||
|
||||
activeTab: ActiveTab = 'settings'
|
||||
|
||||
isDrawing = false
|
||||
currentRect: DrawingRect | null = null
|
||||
selectedZoneIndex: number | null = null
|
||||
|
||||
isResizing = false
|
||||
resizeHandle: ResizeHandle | null = null
|
||||
resizeZoneIndex: number | null = null
|
||||
private readonly HANDLE_SIZE = 8
|
||||
|
||||
isMoving = false
|
||||
moveZoneIndex: number | null = null
|
||||
private moveStart = { mouseX: 0, mouseY: 0, zoneX: 0, zoneY: 0 }
|
||||
|
||||
zoneTestResult: OcrZoneTestResult | null = null
|
||||
zoneTesting = false
|
||||
|
||||
showQuickCreate = false
|
||||
quickCreateName = ''
|
||||
quickCreateType = 'string'
|
||||
quickCreateForZoneIndex: number | null = null
|
||||
quickCreateTypes = [
|
||||
{ id: 'string', name: $localize`String` },
|
||||
{ id: 'integer', name: $localize`Integer` },
|
||||
{ id: 'float', name: $localize`Float` },
|
||||
{ id: 'date', name: $localize`Date` },
|
||||
{ id: 'monetary', name: $localize`Monetary` },
|
||||
{ id: 'boolean', name: $localize`Boolean` },
|
||||
{ id: 'url', name: $localize`URL` },
|
||||
{ id: 'longtext', name: $localize`Long Text` },
|
||||
]
|
||||
|
||||
/**
 * The zone at `selectedZoneIndex`, or null when nothing is selected or the
 * index does not point at an existing zone.
 */
get selectedZone(): OcrTemplateZone | null {
  const index = this.selectedZoneIndex
  if (index === null) {
    return null
  }
  return this.template.zones[index] ?? null
}
|
||||
|
||||
/** Localized page header title, depending on whether a new template is being created. */
get pageTitle(): string {
  if (this.isNew) {
    return $localize`New OCR Template`
  }
  return $localize`Edit OCR Template`
}
|
||||
|
||||
/**
 * Loads the lookup data the editor needs (custom fields, document types,
 * correspondent names) and initialises the template.
 *
 * - For a numeric `:id` route parameter the existing template is fetched;
 *   a failed fetch is reported through an error toast.
 * - For `new` (or no id) the optional `document_type` and `sample_document`
 *   query parameters pre-fill the blank template. Values that do not parse
 *   as integers are ignored rather than stored as NaN.
 *
 * When a sample document is known, its preview is loaded.
 */
ngOnInit() {
  this.customFieldsService
    .listAll()
    .pipe(takeUntil(this.destroy$))
    .subscribe((r) => (this.customFields = r.results))

  this.documentTypeService
    .listAll()
    .pipe(takeUntil(this.destroy$))
    .subscribe((r) => (this.documentTypes = r.results))

  this.correspondentService
    .listAll()
    .pipe(takeUntil(this.destroy$))
    .subscribe((r) => {
      this.correspondentNames = new Map(r.results.map((c) => [c.id, c.name]))
    })

  const id = this.route.snapshot.paramMap.get('id')
  if (id && id !== 'new') {
    this.isNew = false
    this.templateService
      .get(Number.parseInt(id, 10))
      .pipe(takeUntil(this.destroy$))
      .subscribe({
        next: (t) => {
          this.template = t
          this.template.combine_formats ??= {}
          if (t.sample_document) {
            this.previewDocId = t.sample_document
            this.loadPreview()
          }
        },
        error: (error) => {
          // Surface the failure instead of leaving an empty editor.
          this.toastService.showError(
            $localize`Error loading OCR template`,
            error
          )
        },
      })
  } else {
    const qp = this.route.snapshot.queryParams

    const documentType = Number.parseInt(qp['document_type'], 10)
    if (Number.isInteger(documentType)) {
      this.template.document_type = documentType
    }

    const docId = Number.parseInt(qp['sample_document'], 10)
    if (Number.isInteger(docId)) {
      this.template.sample_document = docId
      this.previewDocId = docId
      this.loadPreview()
    }
  }
}
|
||||
|
||||
// No-op lifecycle hook: the body is empty.
ngAfterViewInit() {}
|
||||
|
||||
/**
 * Typeahead source for the preview-document picker. Debounces input, ignores
 * terms shorter than two characters, and searches by title (restricted to the
 * template's document type when one is set). Request errors yield an empty list.
 */
searchDocuments = (text$: Observable<string>): Observable<Document[]> =>
  text$.pipe(
    debounceTime(250),
    distinctUntilChanged(),
    switchMap((term) => {
      const needle = term ? term.trim() : ''
      if (needle.length < 2) {
        return of([])
      }
      const filters: { title__icontains: string; document_type__id?: number } =
        { title__icontains: needle }
      const docType = this.template.document_type
      if (docType) {
        filters.document_type__id = docType
      }
      return this.documentService.list(1, 10, 'created', true, filters).pipe(
        map((page) => page.results),
        catchError(() => of([]))
      )
    })
  )
|
||||
|
||||
/**
 * Renders a typeahead entry as `#id title`, with the correspondent name
 * appended in parentheses when it is known. Plain strings pass through unchanged.
 */
documentFormatter = (doc: Document | string): string => {
  if (typeof doc === 'string') {
    return doc
  }
  const base = `#${doc.id} ${doc.title}`
  const correspondentName = doc.correspondent
    ? this.correspondentNames.get(doc.correspondent)
    : null
  if (correspondentName) {
    return `${base} (${correspondentName})`
  }
  return base
}
|
||||
|
||||
/**
 * Handles a typeahead selection: stores the chosen document as the preview
 * document, adopts its document type if the template has none yet, and loads
 * the first page.
 */
onPreviewDocSelected(event: NgbTypeaheadSelectItemEvent<Document>) {
  event.preventDefault()
  const selected: Document = event.item
  this.previewDocModel = selected
  this.previewDocId = selected.id
  const templateHasType = !!this.template.document_type
  if (!templateHasType && selected.document_type) {
    this.template.document_type = selected.document_type
  }
  this.previewPage = 0
  this.loadPreview()
}
|
||||
|
||||
/** Resets every piece of preview state so that no document is shown. */
clearPreviewDoc() {
  // Selected document
  this.previewDocModel = ''
  this.previewDocId = null
  // Paging
  this.previewPageCount = null
  this.pageCountForDoc = null
  this.previewPage = 0
  // Rendered image
  this.pageImageUrl = null
  this.imageLoaded = false
}
|
||||
|
||||
/**
 * Points the preview image at the current page of the preview document and,
 * the first time a given document is previewed, fetches it to learn its page
 * count (also filling `previewDocModel` if it is still empty).
 */
loadPreview() {
  const docId = this.previewDocId
  if (!docId) return
  if (this.pageCountForDoc !== docId) {
    this.pageCountForDoc = docId
    this.previewPageCount = null
    this.documentService
      .get(docId)
      .pipe(takeUntil(this.destroy$))
      .subscribe({
        next: (doc) => {
          // Ignore responses for a document that is no longer being previewed
          // (the user switched or cleared the preview while this was in flight).
          if (this.pageCountForDoc !== docId) return
          this.previewPageCount = doc?.page_count ?? null
          if (doc && !this.previewDocModel) this.previewDocModel = doc
        },
        error: () => {
          if (this.pageCountForDoc !== docId) return
          this.previewPageCount = null
        },
      })
  }
  this.pageImageUrl = this.templateService.getPageImageUrl(
    docId,
    this.previewPage
  )
  // The canvas overlay is hidden until the new image fires its load event.
  this.imageLoaded = false
}
|
||||
|
||||
/**
 * Switches the preview to the given zero-based page, clamped to the valid
 * range. While the page count is unknown only the lower bound is enforced.
 */
goToPage(page: number) {
  const lastPage = this.previewPageCount ? this.previewPageCount - 1 : page
  const target = Math.max(0, Math.min(page, lastPage))
  if (target !== this.previewPage) {
    this.previewPage = target
    this.loadPreview()
  }
}
|
||||
|
||||
/** Shows the page before the current one (clamping is done by goToPage). */
prevPage() {
  const target = this.previewPage - 1
  this.goToPage(target)
}
|
||||
|
||||
/** Shows the page after the current one (clamping is done by goToPage). */
nextPage() {
  const target = this.previewPage + 1
  this.goToPage(target)
}
|
||||
|
||||
/** Increases the zoom by 0.25 (rounded to two decimals), capped at 4. */
zoomIn() {
  const stepped = Math.round((this.zoom + 0.25) * 100) / 100
  this.zoom = Math.min(4, stepped)
  this.afterZoom()
}
|
||||
|
||||
/** Decreases the zoom by 0.25 (rounded to two decimals), but never below 0.5. */
zoomOut() {
  const stepped = Math.round((this.zoom - 0.25) * 100) / 100
  this.zoom = Math.max(0.5, stepped)
  this.afterZoom()
}
|
||||
|
||||
/** Restores the zoom factor to 1 and repaints the overlay. */
resetZoom() {
  const defaultZoom = 1
  this.zoom = defaultZoom
  this.afterZoom()
}
|
||||
|
||||
/** Schedules a canvas repaint after a zoom change. */
private afterZoom() {
  // Deferred so the wrapper has reflowed to its new width before the canvas is resized.
  setTimeout(() => {
    this.redrawCanvas()
  })
}
|
||||
|
||||
/**
 * Resolves a zone's page setting to a 1-based page number. An unset page
 * counts as 1; -1 resolves to the preview's page count, or to the currently
 * shown page while the count is unknown; any other value below 1 becomes 1.
 */
zonePage(zone: OcrTemplateZone): number {
  const page = zone.page ?? 1
  if (page === -1) {
    return this.previewPageCount ?? this.previewPage + 1
  }
  if (page >= 1) {
    return page
  }
  return 1
}
|
||||
|
||||
/** True when the zone's resolved page is the page currently shown (previewPage is zero-based). */
private isOnCurrentPage(zone: OcrTemplateZone): boolean {
  const shownPage = this.previewPage + 1
  return this.zonePage(zone) === shownPage
}
|
||||
|
||||
/** Records the loaded image's natural size on the template and repaints the overlay. */
onImageLoad() {
  this.imageLoaded = true
  const { naturalWidth, naturalHeight } = this.imageRef.nativeElement
  this.template.source_width = naturalWidth
  this.template.source_height = naturalHeight
  // The canvas is only rendered once @if(imageLoaded) has run, so draw on the next tick.
  setTimeout(() => {
    this.redrawCanvas()
  })
}
|
||||
|
||||
/**
 * Starts one of three interactions depending on what is under the pointer:
 * resizing (a handle of the selected zone), moving (an existing zone, without
 * Shift), or drawing a new rectangle (empty area, or Shift held).
 */
onCanvasMouseDown(event: MouseEvent) {
  const bounds = this.canvasRef.nativeElement.getBoundingClientRect()
  const px = event.clientX - bounds.left
  const py = event.clientY - bounds.top

  // 1) Resize: only the currently selected zone exposes handles.
  const grabbed =
    this.selectedZoneIndex !== null
      ? this.findHandleAt(px, py, this.selectedZoneIndex)
      : null
  if (grabbed) {
    this.isResizing = true
    this.resizeHandle = grabbed
    this.resizeZoneIndex = this.selectedZoneIndex
    return
  }

  const hitIndex = this.findZoneAt(px, py)

  // 2) Draw: Shift+click or a click on an empty area starts a new zone.
  if (hitIndex === null || event.shiftKey) {
    this.isDrawing = true
    this.currentRect = { startX: px, startY: py, endX: px, endY: py }
    this.selectedZoneIndex = null
    return
  }

  // 3) Move: select the clicked zone and remember where the drag started.
  this.selectZone(hitIndex)
  const zone = this.template.zones[hitIndex]
  this.isMoving = true
  this.moveZoneIndex = hitIndex
  this.moveStart = { mouseX: px, mouseY: py, zoneX: zone.x, zoneY: zone.y }
}
|
||||
|
||||
/**
 * Continues the active interaction (resize, move or draw). When none is
 * active, only updates the cursor to reflect what a click would do.
 */
onCanvasMouseMove(event: MouseEvent) {
  const canvasEl = this.canvasRef.nativeElement
  const bounds = canvasEl.getBoundingClientRect()
  const px = event.clientX - bounds.left
  const py = event.clientY - bounds.top

  if (this.isResizing && this.resizeZoneIndex !== null && this.resizeHandle) {
    this.applyResize(px, py)
    this.redrawCanvas()
    return
  }

  if (this.isMoving && this.moveZoneIndex !== null) {
    const zone = this.template.zones[this.moveZoneIndex]
    const imageEl = this.imageRef.nativeElement
    const sourceW = zone.zone_source_width || imageEl.naturalWidth
    const sourceH = zone.zone_source_height || imageEl.naturalHeight
    // Convert the on-screen drag distance into source-image pixels.
    const ratioX = sourceW / canvasEl.width
    const ratioY = sourceH / canvasEl.height
    const deltaX = Math.round((px - this.moveStart.mouseX) * ratioX)
    const deltaY = Math.round((py - this.moveStart.mouseY) * ratioY)
    // Keep the zone fully inside the source image.
    const wantedX = Math.min(this.moveStart.zoneX + deltaX, sourceW - zone.width)
    zone.x = Math.max(0, wantedX)
    const wantedY = Math.min(this.moveStart.zoneY + deltaY, sourceH - zone.height)
    zone.y = Math.max(0, wantedY)
    this.redrawCanvas()
    return
  }

  if (this.isDrawing && this.currentRect) {
    this.currentRect.endX = px
    this.currentRect.endY = py
    this.redrawCanvas()
    return
  }

  // Cursor feedback, in priority order: resize handle, move (over a zone), crosshair.
  const hoveredHandle =
    this.selectedZoneIndex !== null
      ? this.findHandleAt(px, py, this.selectedZoneIndex)
      : null
  if (hoveredHandle) {
    const cursors: Record<ResizeHandle, string> = {
      nw: 'nw-resize',
      ne: 'ne-resize',
      sw: 'sw-resize',
      se: 'se-resize',
      n: 'n-resize',
      s: 's-resize',
      w: 'w-resize',
      e: 'e-resize',
    }
    canvasEl.style.cursor = cursors[hoveredHandle] || 'crosshair'
    return
  }
  const overZone = this.findZoneAt(px, py) !== null
  canvasEl.style.cursor = overZone ? 'move' : 'crosshair'
}
|
||||
|
||||
/**
 * Ends the active interaction. A finished move or resize only clears its
 * state. A finished draw converts the rubber-band rectangle to source-image
 * pixels and appends it as a new zone, unless it is smaller than 10px on
 * either side.
 */
onCanvasMouseUp(event: MouseEvent) {
  if (this.isMoving) {
    this.isMoving = false
    this.moveZoneIndex = null
    return
  }

  if (this.isResizing) {
    this.isResizing = false
    this.resizeHandle = null
    this.resizeZoneIndex = null
    return
  }

  if (!this.isDrawing || !this.currentRect) return
  this.isDrawing = false

  const canvasEl = this.canvasRef.nativeElement
  const imageEl = this.imageRef.nativeElement
  const { startX, startY, endX, endY } = this.currentRect
  this.currentRect = null

  // Display pixels -> source-image pixels.
  const ratioX = imageEl.naturalWidth / canvasEl.width
  const ratioY = imageEl.naturalHeight / canvasEl.height
  const left = Math.round(Math.min(startX, endX) * ratioX)
  const top = Math.round(Math.min(startY, endY) * ratioY)
  const zoneWidth = Math.round(Math.abs(endX - startX) * ratioX)
  const zoneHeight = Math.round(Math.abs(endY - startY) * ratioY)

  // Treat very small rectangles as accidental clicks.
  if (zoneWidth < 10 || zoneHeight < 10) {
    this.redrawCanvas()
    return
  }

  const existingCount = this.template.zones.length
  const firstField = this.customFields.length > 0 ? this.customFields[0].id : null
  const created: OcrTemplateZone = {
    name: `Zone ${existingCount + 1}`,
    target: 'custom_field',
    custom_field: firstField,
    x: left,
    y: top,
    width: zoneWidth,
    height: zoneHeight,
    page: this.previewPage + 1,
    ocr_language: 'deu+eng',
    transform: 'strip',
    date_format: '',
    validation_regex: '',
    order: existingCount,
    zone_source_width: imageEl.naturalWidth,
    zone_source_height: imageEl.naturalHeight,
  }

  this.template.zones.push(created)
  this.selectZone(this.template.zones.length - 1)
}
|
||||
|
||||
/**
 * Maps a zone's source-image rectangle to canvas (display) coordinates.
 * Returns null when the view is not ready, the zone does not exist, or the
 * zone is not on the page currently shown.
 */
private getZoneDisplayRect(
  zoneIdx: number
): { x: number; y: number; w: number; h: number } | null {
  const canvasEl = this.canvasRef?.nativeElement
  const imageEl = this.imageRef?.nativeElement
  if (!canvasEl || !imageEl || !imageEl.naturalWidth) return null

  const zone = this.template.zones[zoneIdx]
  if (!zone || !this.isOnCurrentPage(zone)) return null

  // Zones keep the image size they were drawn on; fall back to the current image.
  const sourceW = zone.zone_source_width || imageEl.naturalWidth
  const sourceH = zone.zone_source_height || imageEl.naturalHeight
  const factorX = canvasEl.width / sourceW
  const factorY = canvasEl.height / sourceH
  return {
    x: zone.x * factorX,
    y: zone.y * factorY,
    w: zone.width * factorX,
    h: zone.height * factorY,
  }
}
|
||||
|
||||
/**
 * Returns the resize handle of the given zone located within HANDLE_SIZE of
 * the display point (mx, my), or null when no handle is there. Handles are
 * tested in row order: top, middle, bottom.
 */
private findHandleAt(
  mx: number,
  my: number,
  zoneIdx: number
): ResizeHandle | null {
  const rect = this.getZoneDisplayRect(zoneIdx)
  if (!rect) return null

  const tolerance = this.HANDLE_SIZE
  const left = rect.x
  const centerX = rect.x + rect.w / 2
  const right = rect.x + rect.w
  const top = rect.y
  const centerY = rect.y + rect.h / 2
  const bottom = rect.y + rect.h

  const anchors: [ResizeHandle, number, number][] = [
    ['nw', left, top],
    ['n', centerX, top],
    ['ne', right, top],
    ['w', left, centerY],
    ['e', right, centerY],
    ['sw', left, bottom],
    ['s', centerX, bottom],
    ['se', right, bottom],
  ]
  const hit = anchors.find(
    ([, ax, ay]) =>
      Math.abs(mx - ax) <= tolerance && Math.abs(my - ay) <= tolerance
  )
  return hit ? hit[0] : null
}
|
||||
|
||||
/**
 * Resizes the zone being dragged so that the edge(s) named by the active
 * handle follow the display point (mx, my). Edges are kept inside the zone's
 * source image and the zone never shrinks below 10 source pixels per side.
 */
private applyResize(mx: number, my: number) {
  const handle = this.resizeHandle
  // Nothing to do without an active handle and target zone.
  if (!handle || this.resizeZoneIndex === null) return
  const zone = this.template.zones[this.resizeZoneIndex]
  if (!zone) return

  const canvas = this.canvasRef.nativeElement
  const img = this.imageRef.nativeElement
  const srcW = zone.zone_source_width || img.naturalWidth
  const srcH = zone.zone_source_height || img.naturalHeight
  // Pointer position converted from display pixels to source-image pixels.
  const imgX = Math.round(mx * (srcW / canvas.width))
  const imgY = Math.round(my * (srcH / canvas.height))
  const minSize = 10

  if (handle.includes('w')) {
    // The right edge stays fixed while the left edge follows the pointer.
    const right = zone.x + zone.width
    zone.x = Math.max(0, Math.min(imgX, right - minSize))
    zone.width = right - zone.x
  }
  if (handle.includes('e')) {
    // Clamp to the source width, mirroring the bounds enforced while moving.
    zone.width = Math.max(minSize, Math.min(imgX, srcW) - zone.x)
  }
  if (handle.includes('n')) {
    // The bottom edge stays fixed while the top edge follows the pointer.
    const bottom = zone.y + zone.height
    zone.y = Math.max(0, Math.min(imgY, bottom - minSize))
    zone.height = bottom - zone.y
  }
  if (handle.includes('s')) {
    zone.height = Math.max(minSize, Math.min(imgY, srcH) - zone.y)
  }
}
|
||||
|
||||
/**
 * Returns the index of the top-most zone on the current page that contains
 * the display point, or null when there is none. Zones are tested from last
 * to first, so later zones win where zones overlap.
 */
private findZoneAt(displayX: number, displayY: number): number | null {
  for (let i = this.template.zones.length - 1; i >= 0; i--) {
    // Shared helper: yields null for zones on other pages or when the view is not ready.
    const rect = this.getZoneDisplayRect(i)
    if (!rect) continue
    const insideX = displayX >= rect.x && displayX <= rect.x + rect.w
    const insideY = displayY >= rect.y && displayY <= rect.y + rect.h
    if (insideX && insideY) {
      return i
    }
  }
  return null
}
|
||||
|
||||
/**
 * Repaints the overlay canvas: sizes it to the displayed image, draws every
 * zone of the current page (outline, translucent fill, name pill, and resize
 * handles for the selected zone), then the rubber-band rectangle if a new
 * zone is being drawn.
 */
redrawCanvas() {
  if (!this.canvasRef || !this.imageRef) return
  const canvas = this.canvasRef.nativeElement
  const img = this.imageRef.nativeElement
  const ctx = canvas.getContext('2d')

  canvas.width = img.clientWidth
  canvas.height = img.clientHeight

  // getContext() may return null; without a context there is nothing to draw.
  if (!ctx) return

  ctx.clearRect(0, 0, canvas.width, canvas.height)

  // Zone colors cycle through this palette by zone index.
  const colors = [
    '#4f8ff7',
    '#ff6b6b',
    '#51cf66',
    '#ffd43b',
    '#cc5de8',
    '#ff922b',
    '#20c997',
    '#e599f7',
  ]

  this.template.zones.forEach((zone, idx) => {
    if (!this.isOnCurrentPage(zone)) return
    const color = colors[idx % colors.length]
    const srcW = zone.zone_source_width || img.naturalWidth
    const srcH = zone.zone_source_height || img.naturalHeight
    const scaleX = canvas.width / srcW
    const scaleY = canvas.height / srcH
    const x = zone.x * scaleX
    const y = zone.y * scaleY
    const w = zone.width * scaleX
    const h = zone.height * scaleY
    const isSelected = idx === this.selectedZoneIndex

    // Outline (thicker for the selected zone) and translucent fill.
    ctx.strokeStyle = color
    ctx.lineWidth = isSelected ? 3 : 2
    ctx.strokeRect(x, y, w, h)

    // Appending '20' turns the 6-digit hex color into one with an alpha channel.
    ctx.fillStyle = color + '20'
    ctx.fillRect(x, y, w, h)

    // Name pill above the zone, kept inside the canvas at the top edge.
    const label = zone.name || `Zone ${idx + 1}`
    ctx.font = '12px sans-serif'
    ctx.textBaseline = 'middle'
    const padX = 6
    const pillH = 17
    const pillW = ctx.measureText(label).width + padX * 2
    const pillX = x
    const pillY = Math.max(0, y - pillH - 2)
    const r = 4
    ctx.fillStyle = color
    ctx.beginPath()
    ctx.moveTo(pillX + r, pillY)
    ctx.arcTo(pillX + pillW, pillY, pillX + pillW, pillY + pillH, r)
    ctx.arcTo(pillX + pillW, pillY + pillH, pillX, pillY + pillH, r)
    ctx.arcTo(pillX, pillY + pillH, pillX, pillY, r)
    ctx.arcTo(pillX, pillY, pillX + pillW, pillY, r)
    ctx.closePath()
    ctx.fill()
    ctx.fillStyle = '#ffffff'
    ctx.fillText(label, pillX + padX, pillY + pillH / 2 + 0.5)
    ctx.textBaseline = 'alphabetic'

    if (isSelected) {
      // Eight handles: corners and edge midpoints, each centered on its anchor.
      const hs = this.HANDLE_SIZE
      ctx.fillStyle = color
      const handles = [
        [x, y],
        [x + w / 2, y],
        [x + w, y],
        [x, y + h / 2],
        [x + w, y + h / 2],
        [x, y + h],
        [x + w / 2, y + h],
        [x + w, y + h],
      ]
      for (const [hx, hy] of handles) {
        ctx.fillRect(hx - hs / 2, hy - hs / 2, hs, hs)
      }
    }
  })

  if (this.currentRect) {
    // Rubber-band rectangle for the zone currently being drawn (display coordinates).
    const cw = this.currentRect.endX - this.currentRect.startX
    const ch = this.currentRect.endY - this.currentRect.startY
    ctx.fillStyle = 'rgba(105, 219, 124, 0.25)'
    ctx.fillRect(this.currentRect.startX, this.currentRect.startY, cw, ch)
    ctx.strokeStyle = '#69db7c'
    ctx.lineWidth = 2
    ctx.setLineDash([5, 5])
    ctx.strokeRect(this.currentRect.startX, this.currentRect.startY, cw, ch)
    ctx.setLineDash([])
  }
}
|
||||
|
||||
/**
 * Removes the zone at `index` and keeps the index-based state pointing at the
 * same zones afterwards: the selection and the quick-create target are cleared
 * if they referred to the removed zone, and shifted down by one if they
 * referred to a later zone.
 */
removeZone(index: number) {
  this.template.zones.splice(index, 1)

  if (this.selectedZoneIndex === index) {
    this.selectedZoneIndex = null
  } else if (this.selectedZoneIndex !== null && this.selectedZoneIndex > index) {
    this.selectedZoneIndex--
  }

  // submitQuickCreate() assigns the new field by index, so a stale index here
  // would attach it to the wrong zone.
  if (this.quickCreateForZoneIndex === index) {
    this.quickCreateForZoneIndex = null
  } else if (
    this.quickCreateForZoneIndex !== null &&
    this.quickCreateForZoneIndex > index
  ) {
    this.quickCreateForZoneIndex--
  }

  this.redrawCanvas()
}
|
||||
|
||||
/**
 * Selects the zone at `index`: opens the zone tab, discards the previous test
 * result, syncs the date-format and combine-format state for the zone, and
 * shows the page the zone lives on.
 */
selectZone(index: number) {
  this.selectedZoneIndex = index
  this.activeTab = 'zone'
  this.zoneTestResult = null

  const zone = this.template.zones[index]
  if (zone) {
    // A non-empty format that matches no preset is treated as a custom format.
    this.dateFormatCustom = zone.date_format
      ? !this.dateFormatOptions.some((option) => option.id === zone.date_format)
      : false
    this.seedCombineDefault(zone)
    const zeroBasedPage = this.zonePage(zone) - 1
    this.goToPage(zeroBasedPage)
  }
  this.redrawCanvas()
}
|
||||
|
||||
/**
 * Sends the selected zone's settings to the service for a test against the
 * preview document and stores the outcome in `zoneTestResult`. Does nothing
 * when no zone is selected or no preview document is set.
 */
testZone() {
  const zone = this.selectedZone
  if (!zone || !this.previewDocId) return

  this.zoneTesting = true
  this.zoneTestResult = null

  const {
    name,
    x,
    y,
    width,
    height,
    ocr_language,
    transform,
    date_format,
    validation_regex,
    zone_source_width,
    zone_source_height,
  } = zone
  const payload: ZoneTestRequest = {
    name,
    x,
    y,
    width,
    height,
    page: zone.page ?? 1,
    ocr_language,
    transform,
    date_format,
    validation_regex,
    zone_source_width,
    zone_source_height,
  }

  this.templateService
    .testZone(this.previewDocId, payload)
    .pipe(takeUntil(this.destroy$))
    .subscribe({
      next: (res) => {
        this.zoneTesting = false
        this.zoneTestResult = res
      },
      error: (err) => {
        this.zoneTesting = false
        // Prefer the server-supplied message when there is one.
        this.zoneTestResult = {
          error: err.error?.error || $localize`Test failed`,
        }
      },
    })
}
|
||||
|
||||
/** Removes the selected zone, if any, and returns to the zones list tab. */
deleteSelectedZone() {
  const selected = this.selectedZoneIndex
  if (selected !== null) {
    this.removeZone(selected)
    this.activeTab = 'zones'
  }
}
|
||||
|
||||
/**
 * Persists the template (create when new, update otherwise) after pruning
 * unused combine formats and recording the preview document as the sample
 * document. On success the saved template replaces the local one.
 */
save() {
  this.saving = true
  this.pruneCombineFormats()
  this.template.sample_document = this.previewDocId

  const request$ = this.isNew
    ? this.templateService.create(this.template)
    : this.templateService.update(this.template)

  request$.pipe(takeUntil(this.destroy$)).subscribe({
    next: (saved) => {
      // Keep the same zone selected across the template swap.
      const previousSelection = this.selectedZoneIndex
      this.template = saved
      this.isNew = false
      this.selectedZoneIndex = previousSelection
      this.saving = false
      this.toastService.showInfo($localize`OCR template saved.`)
      this.redrawCanvas()
    },
    error: (e) => {
      this.saving = false
      this.toastService.showError($localize`Error saving OCR template.`, e)
    },
  })
}
|
||||
|
||||
/**
 * Per-zone cache of the split OCR language list, keyed by zone object.
 * `src` is the `ocr_language` string that `arr` was derived from.
 */
private ocrLangCache = new WeakMap<OcrTemplateZone, { src: string; arr: string[] }>()
|
||||
|
||||
/**
 * Returns the zone's '+'-separated OCR languages as an array. The same array
 * instance is returned for as long as `ocr_language` is unchanged.
 */
ocrLanguageArray(zone: OcrTemplateZone): string[] {
  const raw = zone.ocr_language || ''
  const hit = this.ocrLangCache.get(zone)
  if (hit !== undefined && hit.src === raw) {
    return hit.arr
  }
  // An empty string splits to [''], which the filter reduces to [].
  const codes = raw.split('+').filter(Boolean)
  this.ocrLangCache.set(zone, { src: raw, arr: codes })
  return codes
}
|
||||
|
||||
/** Stores the given languages on the zone as a '+'-joined string and refreshes the cache entry. */
setOcrLanguages(zone: OcrTemplateZone, langs: string[]) {
  const selected = langs || []
  zone.ocr_language = selected.join('+')
  // Cache a copy so later changes to the caller's array do not leak in.
  this.ocrLangCache.set(zone, {
    src: zone.ocr_language,
    arr: [...selected],
  })
}
|
||||
|
||||
/** Name of the custom field with the given id, or a `Field #id` placeholder when it is not loaded. */
getCustomFieldName(id: number): string {
  const match = this.customFields.find((field) => field.id === id)
  if (match) {
    return match.name
  }
  return `Field #${id}`
}
|
||||
|
||||
/**
 * Value bound to the field select: the custom-field id when the zone targets a
 * custom field (also the case when `target` is unset), otherwise the built-in
 * target id string.
 */
zoneFieldValue(zone: OcrTemplateZone): number | string | null {
  const target = zone.target || 'custom_field'
  if (target === 'custom_field') {
    return zone.custom_field
  }
  return target
}
|
||||
|
||||
/**
 * Applies a field-select value to the zone. Built-in ids ('title', 'asn',
 * 'created') become the target; anything else selects the custom-field
 * target, with non-numeric values clearing the custom field.
 */
setZoneField(zone: OcrTemplateZone, value: number | string) {
  switch (value) {
    case 'title':
    case 'asn':
    case 'created':
      zone.target = value
      zone.custom_field = null
      break
    default:
      zone.target = 'custom_field'
      zone.custom_field = typeof value === 'number' ? value : null
  }
  this.seedCombineDefault(zone)
}
|
||||
|
||||
/** String key identifying the field a zone writes to, or null when the zone has no field. */
fieldKeyFor(zone: OcrTemplateZone): string | null {
  const value = this.zoneFieldValue(zone)
  if (value === null || value === undefined || value === '') {
    return null
  }
  return String(value)
}
|
||||
|
||||
/** All zones (including `zone` itself) that write to the same field; empty when the zone has no field. */
zonesForField(zone: OcrTemplateZone): OcrTemplateZone[] {
  const key = this.fieldKeyFor(zone)
  if (key) {
    return this.template.zones.filter((other) => this.fieldKeyFor(other) === key)
  }
  return []
}
|
||||
|
||||
/** True when at least one other zone writes to the same field as `zone`. */
isFieldShared(zone: OcrTemplateZone): boolean {
  const sameFieldZones = this.zonesForField(zone)
  return sameFieldZones.length > 1
}
|
||||
|
||||
/** Combine format stored for the zone's field, or '' when the zone has no field or no format is set. */
getCombineFormat(zone: OcrTemplateZone): string {
  const key = this.fieldKeyFor(zone)
  if (!key) {
    return ''
  }
  return this.template.combine_formats?.[key] || ''
}
|
||||
|
||||
/** Stores the combine format for the zone's field; does nothing when the zone has no field. */
setCombineFormat(zone: OcrTemplateZone, value: string) {
  const key = this.fieldKeyFor(zone)
  if (key) {
    // Create the map on first use.
    const formats = (this.template.combine_formats ??= {})
    formats[key] = value
  }
}
|
||||
|
||||
/**
 * Appends a `{zone name}` placeholder for `tokenZone` to the combine format of
 * `zone`'s field, separated by a space unless the format is empty or already
 * ends with one.
 */
insertCombineToken(zone: OcrTemplateZone, tokenZone: OcrTemplateZone) {
  const existing = this.getCombineFormat(zone)
  const needsSpace = !!existing && !existing.endsWith(' ')
  const separator = needsSpace ? ' ' : ''
  const placeholder = `{${tokenZone.name}}`
  this.setCombineFormat(zone, `${existing}${separator}${placeholder}`)
}
|
||||
|
||||
/**
 * When the zone's field is shared by several zones and has no combine format
 * yet, seeds one that lists every sharing zone's `{name}` placeholder,
 * separated by spaces.
 */
private seedCombineDefault(zone: OcrTemplateZone) {
  const key = this.fieldKeyFor(zone)
  if (!key) return

  const sameFieldZones = this.zonesForField(zone)
  if (sameFieldZones.length <= 1) return

  this.template.combine_formats ??= {}
  // An existing (non-empty) format is never overwritten.
  if (this.template.combine_formats[key]) return

  const placeholders = sameFieldZones.map((z) => `{${z.name}}`)
  this.template.combine_formats[key] = placeholders.join(' ')
}
|
||||
|
||||
/** Drops combine formats whose field is used by fewer than two zones. */
private pruneCombineFormats() {
  const formats = this.template.combine_formats
  if (!formats) return

  // Count how many zones write to each field key.
  const usage = new Map<string, number>()
  this.template.zones.forEach((zone) => {
    const key = this.fieldKeyFor(zone)
    if (key) {
      usage.set(key, (usage.get(key) ?? 0) + 1)
    }
  })

  Object.keys(formats)
    .filter((key) => (usage.get(key) ?? 0) <= 1)
    .forEach((key) => {
      delete formats[key]
    })
}
|
||||
|
||||
/** Value bound to the date-format select: 'custom' in custom mode, otherwise the zone's format ('' means auto). */
dateFormatChoice(zone: OcrTemplateZone): string {
  return this.dateFormatCustom ? 'custom' : zone.date_format || ''
}
|
||||
|
||||
/**
 * Applies a date-format select value. 'custom' only switches to custom mode
 * and leaves the zone's format untouched; any other value leaves custom mode
 * and is stored on the zone.
 */
setDateFormatChoice(zone: OcrTemplateZone, value: string) {
  const wantsCustom = value === 'custom'
  this.dateFormatCustom = wantsCustom
  if (!wantsCustom) {
    zone.date_format = value
  }
}
|
||||
|
||||
/**
 * Display name of the zone's target: the custom field's name (or a localized
 * "(no field)" placeholder), or the built-in target's name, falling back to
 * the raw target id.
 */
getZoneTargetName(zone: OcrTemplateZone): string {
  const target = zone.target || 'custom_field'
  if (target !== 'custom_field') {
    const builtin = this.builtinTargets.find((t) => t.id === target)
    return builtin?.name ?? target
  }
  if (zone.custom_field) {
    return this.getCustomFieldName(zone.custom_field)
  }
  return $localize`(no field)`
}
|
||||
|
||||
/** Name of the document type with the given id, or a `Type #id` placeholder when it is not loaded. */
getDocumentTypeName(id: number): string {
  const match = this.documentTypes.find((docType) => docType.id === id)
  if (match) {
    return match.name
  }
  return `Type #${id}`
}
|
||||
|
||||
/**
 * Opens the quick-create form for the given zone, pre-filling the field name
 * with the zone's name and the type with 'string'. Ignored for a null index.
 */
openQuickCreate(zoneIndex: number | null) {
  if (zoneIndex === null) return
  const zone = this.template.zones[zoneIndex]
  this.quickCreateForZoneIndex = zoneIndex
  this.quickCreateName = zone?.name || ''
  this.quickCreateType = 'string'
  this.showQuickCreate = true
}
|
||||
|
||||
/** Closes the quick-create form and forgets which zone it was opened for. */
cancelQuickCreate() {
  this.quickCreateForZoneIndex = null
  this.showQuickCreate = false
}
|
||||
|
||||
/**
 * Creates a custom field from the quick-create form, reloads the custom field
 * list, and assigns the new field to the zone the form was opened for (if
 * that zone still exists). Failures are reported through the toast service.
 */
submitQuickCreate() {
  const name = this.quickCreateName.trim()
  if (!name) return

  this.templateService
    .quickCreateField(name, this.quickCreateType)
    .pipe(takeUntil(this.destroy$))
    .subscribe({
      next: (result) => {
        // Drop the cached list so listAll() includes the new field.
        this.customFieldsService.clearCache()
        this.customFieldsService
          .listAll()
          .pipe(takeUntil(this.destroy$))
          .subscribe((r) => {
            this.customFields = r.results
            // The zone may have been removed while the request was in flight.
            const zone =
              this.quickCreateForZoneIndex !== null
                ? this.template.zones[this.quickCreateForZoneIndex]
                : undefined
            if (zone) {
              zone.custom_field = result.id
              zone.target = 'custom_field'
              // Same follow-up as setZoneField(): the field may now be shared.
              this.seedCombineDefault(zone)
            }
            this.showQuickCreate = false
            this.quickCreateForZoneIndex = null
          })
      },
      error: (err) => {
        // Non-blocking toast, consistent with save(), instead of window.alert().
        this.toastService.showError(
          err.error?.error || $localize`Failed to create custom field`,
          err
        )
      },
    })
}
|
||||
|
||||
/** Emits on and completes `destroy$`, ending every subscription piped through takeUntil(destroy$). */
ngOnDestroy() {
  const notifier = this.destroy$
  notifier.next()
  notifier.complete()
}
|
||||
}
|
||||
@@ -0,0 +1,75 @@
|
||||
<!-- Page header; the create button is rendered only with add permission. -->
<pngx-page-header
  title="OCR Templates"
  i18n-title
  info="Define extraction zones on document types to automatically populate custom fields via OCR."
  i18n-info
  >
  <button
    type="button"
    class="btn btn-sm btn-outline-primary"
    (click)="createTemplate()"
    *pngxIfPermissions="{ action: PermissionAction.Add, type: PermissionType.OcrTemplate }">
    <i-bs name="plus-circle" class="me-1"></i-bs><ng-container i18n>Create Template</ng-container>
  </button>
</pngx-page-header>

<ul class="list-group">

  <!-- Column headings -->
  <li class="list-group-item">
    <div class="row">
      <div class="col" i18n>Name</div>
      <div class="col d-none d-sm-flex" i18n>Document Type</div>
      <div class="col d-none d-sm-flex" i18n>Zones</div>
      <div class="col" i18n>Status</div>
      <div class="col" i18n>Actions</div>
    </div>
  </li>

  <!-- Spinner, shown only while the first load is still running -->
  @if (loading && templates.length === 0) {
    <li class="list-group-item">
      <div class="spinner-border spinner-border-sm me-2" role="status"></div>
      <ng-container i18n>Loading...</ng-container>
    </li>
  }

  <!-- One row per template -->
  @for (t of templates; track t.id) {
    <li class="list-group-item">
      <div class="row fade" [class.show]="show">
        <div class="col d-flex align-items-center"><button
          class="btn btn-link p-0 text-start"
          type="button"
          (click)="editTemplate(t)"
          [disabled]="!permissionsService.currentUserCan(PermissionAction.Change, PermissionType.OcrTemplate)">{{t.name}}</button></div>
        <div class="col d-flex align-items-center d-none d-sm-flex">{{getDocumentTypeName(t)}}</div>
        <div class="col d-flex align-items-center d-none d-sm-flex"><code>{{t.zones?.length || 0}}</code></div>
        <div class="col d-flex align-items-center">
          <!-- Enabled switch; the checkbox itself requires change permission -->
          <div class="form-check form-switch mb-0">
            <input
              type="checkbox"
              class="form-check-input cursor-pointer"
              [id]="t.id+'_enable'"
              [(ngModel)]="t.enabled"
              (change)="toggleTemplate(t)"
              *pngxIfPermissions="{ action: PermissionAction.Change, type: PermissionType.OcrTemplate }">
            <label class="form-check-label cursor-pointer" [for]="t.id+'_enable'">
              <code> @if(t.enabled) { <ng-container i18n>Enabled</ng-container> } @else { <span i18n class="text-muted">Disabled</span> }</code>
            </label>
          </div>
        </div>
        <div class="col">

          <!-- Small screens: actions collapsed into a dropdown -->
          <div class="btn-group d-block d-sm-none">
            <div ngbDropdown container="body" class="d-inline-block">
              <button
                type="button"
                class="btn btn-link"
                id="actionsMenuMobile{{t.id}}"
                (click)="$event.stopPropagation()"
                ngbDropdownToggle>
                <i-bs name="three-dots-vertical"></i-bs>
              </button>
              <div ngbDropdownMenu aria-labelledby="actionsMenuMobile{{t.id}}">
                <button
                  (click)="editTemplate(t)"
                  *pngxIfPermissions="{ action: PermissionAction.Change, type: PermissionType.OcrTemplate }"
                  ngbDropdownItem
                  i18n>Edit</button>
                <button
                  (click)="deleteTemplate(t)"
                  *pngxIfPermissions="{ action: PermissionAction.Delete, type: PermissionType.OcrTemplate }"
                  ngbDropdownItem
                  i18n>Delete</button>
              </div>
            </div>
          </div>

          <!-- Larger screens: inline action buttons -->
          <div class="btn-toolbar d-none d-sm-flex gap-2" role="toolbar">
            <div class="btn-group">
              <button
                *pngxIfPermissions="{ action: PermissionAction.Change, type: PermissionType.OcrTemplate }"
                class="btn btn-sm btn-outline-secondary"
                type="button"
                (click)="editTemplate(t)">
                <i-bs width="1em" height="1em" name="pencil" class="me-1"></i-bs><ng-container i18n>Edit</ng-container>
              </button>
              <button
                *pngxIfPermissions="{ action: PermissionAction.Delete, type: PermissionType.OcrTemplate }"
                class="btn btn-sm btn-outline-danger"
                type="button"
                (click)="deleteTemplate(t)">
                <i-bs width="1em" height="1em" name="trash" class="me-1"></i-bs><ng-container i18n>Delete</ng-container>
              </button>
            </div>
          </div>
        </div>
      </div>
    </li>
  }

  <!-- Empty state once loading has finished -->
  @if (!loading && templates.length === 0) {
    <li class="list-group-item" [class.show]="show" i18n>No OCR templates defined.</li>
  }
</ul>
|
||||
@@ -0,0 +1,98 @@
|
||||
import { Component, OnInit, inject } from '@angular/core'
|
||||
import { FormsModule } from '@angular/forms'
|
||||
import { Router } from '@angular/router'
|
||||
import { NgbDropdownModule, NgbModal } from '@ng-bootstrap/ng-bootstrap'
|
||||
import { NgxBootstrapIconsModule } from 'ngx-bootstrap-icons'
|
||||
import { delay, takeUntil, tap } from 'rxjs'
|
||||
import { OcrTemplate } from 'src/app/data/ocr-template'
|
||||
import { IfPermissionsDirective } from 'src/app/directives/if-permissions.directive'
|
||||
import { PermissionsService } from 'src/app/services/permissions.service'
|
||||
import { DocumentTypeService } from 'src/app/services/rest/document-type.service'
|
||||
import { OcrTemplateService } from 'src/app/services/rest/ocr-template.service'
|
||||
import { ConfirmDialogComponent } from '../../common/confirm-dialog/confirm-dialog.component'
|
||||
import { PageHeaderComponent } from '../../common/page-header/page-header.component'
|
||||
import { LoadingComponentWithPermissions } from '../../loading-component/loading.component'
|
||||
|
||||
/**
 * Management list of OCR templates: shows each template with its document
 * type, zone count and enabled state, and offers create, edit, toggle and
 * delete actions.
 */
@Component({
  selector: 'pngx-ocr-templates',
  templateUrl: './ocr-templates.component.html',
  imports: [
    PageHeaderComponent,
    IfPermissionsDirective,
    FormsModule,
    NgbDropdownModule,
    NgxBootstrapIconsModule,
  ],
})
export class OcrTemplatesComponent
  extends LoadingComponentWithPermissions
  implements OnInit
{
  private readonly service = inject(OcrTemplateService)
  private readonly documentTypeService = inject(DocumentTypeService)
  private readonly router = inject(Router)
  private readonly modalService = inject(NgbModal)
  permissionsService = inject(PermissionsService)

  /** Templates currently listed. */
  public templates: OcrTemplate[] = []
  /** Document type id -> name, used to label each row. */
  private documentTypeNames: Map<number, string> = new Map()

  /** Loads the document type names and the template list. */
  ngOnInit() {
    this.documentTypeService
      .listAll()
      .pipe(takeUntil(this.unsubscribeNotifier))
      .subscribe((r) => {
        this.documentTypeNames = new Map(
          r.results.map((dt) => [dt.id, dt.name])
        )
      })
    this.reload()
  }

  /** (Re)loads the template list; rows are revealed 100ms after the data arrives. */
  reload() {
    this.loading = true
    this.service
      .listAll()
      .pipe(
        takeUntil(this.unsubscribeNotifier),
        tap((r) => (this.templates = r.results)),
        delay(100)
      )
      .subscribe({
        next: () => {
          this.show = true
          this.loading = false
        },
        // Without this the loading flag would stay set forever after a failed request.
        error: () => {
          this.loading = false
        },
      })
  }

  /** Name of the template's document type, falling back to the raw id (or '' when unset). */
  getDocumentTypeName(t: OcrTemplate): string {
    return (
      this.documentTypeNames.get(t.document_type) ?? `${t.document_type ?? ''}`
    )
  }

  /** Opens the editor for a new template. */
  createTemplate() {
    this.router.navigate(['/ocr-templates', 'new'])
  }

  /** Opens the editor for an existing template. */
  editTemplate(t: OcrTemplate) {
    this.router.navigate(['/ocr-templates', t.id])
  }

  /** Persists the enabled flag after the switch was toggled. */
  toggleTemplate(t: OcrTemplate) {
    // ngModel has already flipped t.enabled; persist it, and flip it back if
    // the server rejects the change so the switch shows the stored state.
    this.service.patch(t).subscribe({
      error: () => {
        t.enabled = !t.enabled
      },
    })
  }

  /** Asks for confirmation, then deletes the template and reloads the list. */
  deleteTemplate(t: OcrTemplate) {
    const modal = this.modalService.open(ConfirmDialogComponent)
    modal.componentInstance.title = $localize`Delete OCR Template`
    modal.componentInstance.messageBoldPart = t.name
    modal.componentInstance.message = $localize`Do you really want to delete this OCR template?`
    modal.componentInstance.btnClass = 'btn-danger'
    modal.componentInstance.btnCaption = $localize`Delete`
    modal.componentInstance.confirmClicked.subscribe(() => {
      modal.close()
      this.service.delete(t).subscribe(() => this.reload())
    })
  }
}
|
||||
@@ -0,0 +1,102 @@
|
||||
import { ObjectWithId } from './object-with-id'
|
||||
|
||||
/** Destination of a zone's extracted value: a custom field or a built-in document field. */
export type OcrZoneTarget =
  | 'custom_field'
  | 'title'
  | 'asn'
  | 'created'
|
||||
|
||||
/**
 * Selectable built-in (non custom-field) zone targets.
 *
 * The ids are typed against `OcrZoneTarget`, so an entry that is not a valid
 * built-in target fails to compile instead of silently being a plain string.
 */
export const OCR_BUILTIN_TARGETS: {
  id: Exclude<OcrZoneTarget, 'custom_field'>
  name: string
}[] = [
  { id: 'title', name: $localize`Title` },
  { id: 'asn', name: $localize`Archive serial number` },
  { id: 'created', name: $localize`Date created` },
]
|
||||
|
||||
/**
 * One rectangular extraction zone of an OCR template, as exchanged with the
 * API.
 */
export interface OcrTemplateZone {
  id?: number
  /** Descriptive zone name. */
  name: string
  /** Where the extracted value is written. */
  target?: OcrZoneTarget
  /** Id of the target custom field, or null when none is set. */
  custom_field: number | null
  page?: number

  // Rectangle in pixels.
  x: number
  y: number
  width: number
  height: number

  ocr_language: string
  /** One of the ids listed in TRANSFORM_OPTIONS. */
  transform: string
  date_format?: string
  validation_regex: string
  order: number

  // Dimensions of the page image the zone was drawn on, when recorded.
  zone_source_width?: number
  zone_source_height?: number
}
|
||||
|
||||
/** Post-processing choices offered for a zone's extracted text (id + label). */
export const TRANSFORM_OPTIONS = [
  { id: 'none', name: $localize`None` },
  { id: 'strip', name: $localize`Strip whitespace` },
  { id: 'uppercase', name: $localize`Uppercase` },
  { id: 'lowercase', name: $localize`Lowercase` },
  { id: 'numeric', name: $localize`Numeric only` },
  { id: 'strip_punctuation', name: $localize`Remove leading/trailing punctuation` },
  { id: 'date', name: $localize`Parse date` },
  { id: 'qr_code', name: $localize`Read QR/barcode` },
]
|
||||
|
||||
/**
 * OCR languages offered for a zone (id + label).
 * NOTE(review): the ids look like Tesseract language codes — confirm.
 */
export const OCR_LANGUAGE_OPTIONS = [
  { name: $localize`English`, id: 'eng' },
  { name: $localize`German`, id: 'deu' },
  { name: $localize`French`, id: 'fra' },
  { name: $localize`Italian`, id: 'ita' },
  { name: $localize`Spanish`, id: 'spa' },
  { name: $localize`Portuguese`, id: 'por' },
  { name: $localize`Dutch`, id: 'nld' },
]
|
||||
|
||||
/**
 * Date formats offered for a zone (id + label).
 * The empty id is the "Auto-detect" entry; the others are strptime-style patterns.
 */
export const DATE_FORMAT_OPTIONS = [
  { name: $localize`Auto-detect`, id: '' },
  { name: 'DD.MM.YYYY', id: '%d.%m.%Y' },
  { name: 'YYYY/MM/DD', id: '%Y/%m/%d' },
  { name: 'DD/MM/YYYY', id: '%d/%m/%Y' },
]
|
||||
|
||||
/** An OCR template: a named set of extraction zones bound to one document type. */
export interface OcrTemplate extends ObjectWithId {
  name: string
  /** Id of the document type this template belongs to. */
  document_type: number
  /** Id of the sample document, or null when none is set. */
  sample_document: number | null

  // Pixel size of the image the zones were drawn on.
  source_width: number
  source_height: number

  enabled: boolean
  /** Format strings keyed by target. */
  combine_formats?: Record<string, string>

  created?: string
  updated?: string

  zones: OcrTemplateZone[]
}
|
||||
|
||||
/** Zone definition sent to the test-zone endpoint to try an extraction. */
export interface ZoneTestRequest {
  name: string

  // Rectangle in pixels, and the page it is on.
  x: number
  y: number
  width: number
  height: number
  page: number

  // Extraction settings.
  ocr_language: string
  transform: string
  date_format?: string
  validation_regex: string

  // Dimensions of the page image the zone was drawn on, when known.
  zone_source_width?: number
  zone_source_height?: number
}
|
||||
|
||||
/** Response of the test-zone endpoint; every member is optional. */
export interface OcrZoneTestResult {
  /** Text as returned before any processing, if present. */
  raw_text?: string | null
  /** Final value, if present. */
  value?: string | null
  regex?: string
  regex_match?: boolean | null
  /** Error description when the test could not be carried out. */
  error?: string
}
|
||||
|
||||
/** One entry of the result list returned when zone OCR is run on a document. */
export interface OcrZoneRunResult {
  /** Template the zone belongs to. */
  template: string
  /** Zone that produced the value. */
  zone: string
  custom_field: string
  /** Extracted value, or null. */
  value: string | number | null
}
|
||||
@@ -28,6 +28,7 @@ export enum PermissionType {
|
||||
ShareLink = '%s_sharelink',
|
||||
CustomField = '%s_customfield',
|
||||
Workflow = '%s_workflow',
|
||||
OcrTemplate = '%s_ocrtemplate',
|
||||
ProcessedMail = '%s_processedmail',
|
||||
GlobalStatistics = '%s_global_statistics',
|
||||
SystemMonitoring = '%s_system_monitoring',
|
||||
|
||||
@@ -12,6 +12,7 @@ import {
|
||||
import { DocumentMetadata } from 'src/app/data/document-metadata'
|
||||
import { DocumentSuggestions } from 'src/app/data/document-suggestions'
|
||||
import { FilterRule } from 'src/app/data/filter-rule'
|
||||
import { OcrZoneRunResult } from 'src/app/data/ocr-template'
|
||||
import { Results, SelectionData } from 'src/app/data/results'
|
||||
import { SETTINGS_KEYS } from 'src/app/data/ui-settings'
|
||||
import { queryParamsFromFilterRules } from '../../utils/query-params'
|
||||
@@ -355,6 +356,13 @@ export class DocumentService extends AbstractPaperlessService<Document> {
|
||||
})
|
||||
}
|
||||
|
||||
/**
 * POSTs an empty body to the document's `run-zone-ocr` action and emits the
 * returned list of per-zone results.
 */
runZoneOcr(id: number): Observable<{ results: OcrZoneRunResult[] }> {
  const url = this.getResourceUrl(id, 'run-zone-ocr')
  return this.http.post<{ results: OcrZoneRunResult[] }>(url, {})
}
|
||||
|
||||
rotateDocuments(
|
||||
selection: DocumentSelectionQuery,
|
||||
degrees: number,
|
||||
|
||||
@@ -0,0 +1,47 @@
|
||||
import { Injectable } from '@angular/core'
|
||||
import { Observable } from 'rxjs'
|
||||
import {
|
||||
OcrTemplate,
|
||||
OcrZoneTestResult,
|
||||
ZoneTestRequest,
|
||||
} from '../../data/ocr-template'
|
||||
import { AbstractPaperlessService } from './abstract-paperless-service'
|
||||
|
||||
/** Response of the quick-create-field endpoint. */
export interface QuickCreateFieldResult {
  /** Id of the custom field. */
  id: number
  name: string
  data_type: string
  // NOTE(review): presumably false when an existing field was reused — confirm against the endpoint.
  created: boolean
}
|
||||
|
||||
/**
 * REST client for the `ocr_templates` resource. Adds the template-specific
 * actions (page image URL, zone test, quick field creation) to the methods
 * inherited from the abstract service.
 */
@Injectable({ providedIn: 'root' })
export class OcrTemplateService extends AbstractPaperlessService<OcrTemplate> {
  constructor() {
    super()
    this.resourceName = 'ocr_templates'
  }

  /** URL of the image for page `page` of document `docId`. */
  getPageImageUrl(docId: number, page: number): string {
    const root = `${this.baseUrl}${this.resourceName}`
    return `${root}/document-page-image/${docId}/${page}/`
  }

  /** Posts a zone definition and a document id to the `test-zone` action. */
  testZone(
    docId: number,
    zone: ZoneTestRequest
  ): Observable<OcrZoneTestResult> {
    const url = `${this.baseUrl}${this.resourceName}/test-zone/`
    const payload = { document: docId, zone }
    return this.http.post<OcrZoneTestResult>(url, payload)
  }

  /** Posts a field name and data type to the `quick-create-field` action. */
  quickCreateField(
    name: string,
    dataType: string
  ): Observable<QuickCreateFieldResult> {
    const url = `${this.baseUrl}${this.resourceName}/quick-create-field/`
    const payload = { name, data_type: dataType }
    return this.http.post<QuickCreateFieldResult>(url, payload)
  }
}
|
||||
@@ -79,13 +79,16 @@ import {
|
||||
exclamationTriangleFill,
|
||||
eye,
|
||||
fileEarmark,
|
||||
fileEarmarkBreak,
|
||||
fileEarmarkCheck,
|
||||
fileEarmarkDiff,
|
||||
fileEarmarkFill,
|
||||
fileEarmarkLock,
|
||||
fileEarmarkMedical,
|
||||
fileEarmarkMinus,
|
||||
fileEarmarkPlus,
|
||||
fileEarmarkRichtext,
|
||||
fileEarmarkRuled,
|
||||
fileText,
|
||||
files,
|
||||
filter,
|
||||
@@ -302,13 +305,16 @@ const icons = {
|
||||
exclamationTriangleFill,
|
||||
eye,
|
||||
fileEarmark,
|
||||
fileEarmarkBreak,
|
||||
fileEarmarkCheck,
|
||||
fileEarmarkDiff,
|
||||
fileEarmarkFill,
|
||||
fileEarmarkLock,
|
||||
fileEarmarkMedical,
|
||||
fileEarmarkMinus,
|
||||
fileEarmarkPlus,
|
||||
fileEarmarkRichtext,
|
||||
fileEarmarkRuled,
|
||||
files,
|
||||
fileText,
|
||||
filter,
|
||||
|
||||
@@ -13,8 +13,11 @@ class DocumentsConfig(AppConfig):
|
||||
from documents.signals.handlers import add_inbox_tags
|
||||
from documents.signals.handlers import add_or_update_document_in_llm_index
|
||||
from documents.signals.handlers import add_to_index
|
||||
from documents.signals.handlers import capture_old_document_type
|
||||
from documents.signals.handlers import run_workflows_added
|
||||
from documents.signals.handlers import run_workflows_updated
|
||||
from documents.signals.handlers import run_zone_ocr_extraction
|
||||
from documents.signals.handlers import run_zone_ocr_on_type_change
|
||||
from documents.signals.handlers import send_websocket_document_updated
|
||||
from documents.signals.handlers import set_correspondent
|
||||
from documents.signals.handlers import set_document_type
|
||||
@@ -29,6 +32,16 @@ class DocumentsConfig(AppConfig):
|
||||
document_consumption_finished.connect(add_to_index)
|
||||
document_consumption_finished.connect(run_workflows_added)
|
||||
document_consumption_finished.connect(add_or_update_document_in_llm_index)
|
||||
document_consumption_finished.connect(run_zone_ocr_extraction)
|
||||
|
||||
from django.db.models.signals import post_save
|
||||
from django.db.models.signals import pre_save
|
||||
|
||||
from documents.models import Document
|
||||
|
||||
pre_save.connect(capture_old_document_type, sender=Document)
|
||||
post_save.connect(run_zone_ocr_on_type_change, sender=Document)
|
||||
|
||||
document_updated.connect(run_workflows_updated)
|
||||
document_updated.connect(send_websocket_document_updated)
|
||||
document_updated.connect(add_or_update_document_in_llm_index)
|
||||
|
||||
@@ -0,0 +1,267 @@
|
||||
# Generated by Django 5.2.14 on 2026-06-16 17:36
|
||||
|
||||
import django.core.validators
|
||||
import django.db.models.deletion
|
||||
import django.utils.timezone
|
||||
from django.db import migrations
|
||||
from django.db import models
|
||||
|
||||
|
||||
class Migration(migrations.Migration):
    # Creates the two tables behind the OCR template feature:
    # OcrTemplate (one per template) and OcrTemplateZone (its zones).
    dependencies = [
        ("documents", "0021_widen_workflow_integer_fields"),
    ]

    operations = [
        # Template: name, owning document type, source image size, optional
        # sample document, enabled flag, combine formats and timestamps.
        migrations.CreateModel(
            name="OcrTemplate",
            fields=[
                (
                    "id",
                    models.AutoField(
                        auto_created=True,
                        primary_key=True,
                        serialize=False,
                        verbose_name="ID",
                    ),
                ),
                ("name", models.CharField(max_length=128, verbose_name="name")),
                (
                    "source_width",
                    models.PositiveIntegerField(
                        help_text="Width of the image the zones were drawn on (px)",
                        validators=[django.core.validators.MinValueValidator(1)],
                        verbose_name="source width",
                    ),
                ),
                (
                    "source_height",
                    models.PositiveIntegerField(
                        help_text="Height of the image the zones were drawn on (px)",
                        validators=[django.core.validators.MinValueValidator(1)],
                        verbose_name="source height",
                    ),
                ),
                ("enabled", models.BooleanField(default=True, verbose_name="enabled")),
                (
                    "combine_formats",
                    models.JSONField(
                        blank=True,
                        default=dict,
                        help_text="Per-target format strings for combining several zones into one field, keyed by target (custom field id, or 'title'/'asn'/'created'). Tokens like {Zone Name} are replaced with that zone's value.",
                        verbose_name="combine formats",
                    ),
                ),
                (
                    "created",
                    models.DateTimeField(
                        db_index=True,
                        default=django.utils.timezone.now,
                        editable=False,
                        verbose_name="created",
                    ),
                ),
                (
                    "updated",
                    models.DateTimeField(auto_now=True, verbose_name="updated"),
                ),
                # Deleting the document type deletes its templates.
                (
                    "document_type",
                    models.ForeignKey(
                        on_delete=django.db.models.deletion.CASCADE,
                        related_name="ocr_templates",
                        to="documents.documenttype",
                        verbose_name="document type",
                    ),
                ),
                # Deleting the sample document only clears this reference.
                (
                    "sample_document",
                    models.ForeignKey(
                        blank=True,
                        help_text="Document used for previewing zones in the editor",
                        null=True,
                        on_delete=django.db.models.deletion.SET_NULL,
                        related_name="+",
                        to="documents.document",
                        verbose_name="sample document",
                    ),
                ),
            ],
            options={
                "verbose_name": "OCR template",
                "verbose_name_plural": "OCR templates",
                "ordering": ("name",),
            },
        ),
        # Zone: rectangle, page, target and extraction settings of one region.
        migrations.CreateModel(
            name="OcrTemplateZone",
            fields=[
                (
                    "id",
                    models.AutoField(
                        auto_created=True,
                        primary_key=True,
                        serialize=False,
                        verbose_name="ID",
                    ),
                ),
                (
                    "name",
                    models.CharField(
                        help_text="Descriptive name for this zone (e.g. 'Invoice Number')",
                        max_length=128,
                        verbose_name="zone name",
                    ),
                ),
                (
                    "target",
                    models.CharField(
                        choices=[
                            ("custom_field", "Custom field"),
                            ("title", "Title"),
                            ("asn", "Archive serial number"),
                            ("created", "Date created"),
                        ],
                        default="custom_field",
                        help_text="Where the extracted value is written: a custom field, or a built-in document field (title, ASN, created date)",
                        max_length=20,
                        verbose_name="target",
                    ),
                ),
                (
                    "page",
                    models.IntegerField(
                        blank=True,
                        help_text="Page (1 = first, -1 = last; blank uses the template default)",
                        null=True,
                        verbose_name="page",
                    ),
                ),
                (
                    "x",
                    models.PositiveIntegerField(
                        help_text="Left edge (px)",
                        verbose_name="x",
                    ),
                ),
                (
                    "y",
                    models.PositiveIntegerField(
                        help_text="Top edge (px)",
                        verbose_name="y",
                    ),
                ),
                (
                    "width",
                    models.PositiveIntegerField(
                        help_text="Zone width (px)",
                        validators=[django.core.validators.MinValueValidator(1)],
                        verbose_name="width",
                    ),
                ),
                (
                    "height",
                    models.PositiveIntegerField(
                        help_text="Zone height (px)",
                        validators=[django.core.validators.MinValueValidator(1)],
                        verbose_name="height",
                    ),
                ),
                (
                    "zone_source_width",
                    models.PositiveIntegerField(
                        blank=True,
                        help_text="Width of the page image this zone was drawn on (px). Falls back to template source_width if unset.",
                        null=True,
                        verbose_name="zone source width",
                    ),
                ),
                (
                    "zone_source_height",
                    models.PositiveIntegerField(
                        blank=True,
                        help_text="Height of the page image this zone was drawn on (px). Falls back to template source_height if unset.",
                        null=True,
                        verbose_name="zone source height",
                    ),
                ),
                (
                    "ocr_language",
                    models.CharField(
                        default="deu+eng",
                        help_text="Tesseract language code(s), e.g. 'deu+eng'",
                        max_length=20,
                        verbose_name="OCR language",
                    ),
                ),
                (
                    "transform",
                    models.CharField(
                        choices=[
                            ("none", "None"),
                            ("strip", "Strip whitespace"),
                            ("uppercase", "Uppercase"),
                            ("lowercase", "Lowercase"),
                            ("numeric", "Numeric only"),
                            (
                                "strip_punctuation",
                                "Remove leading/trailing punctuation",
                            ),
                            ("date", "Parse date"),
                            ("qr_code", "Read QR/barcode"),
                        ],
                        default="strip",
                        max_length=20,
                        verbose_name="transform",
                    ),
                ),
                (
                    "date_format",
                    models.CharField(
                        blank=True,
                        default="",
                        help_text="Python strptime format for the 'Parse date' transform (e.g. %d.%m.%Y). Blank = auto-detect.",
                        max_length=64,
                        verbose_name="date format",
                    ),
                ),
                (
                    "validation_regex",
                    models.CharField(
                        blank=True,
                        default="",
                        help_text="Optional regex pattern — extracted text is only accepted if it matches",
                        max_length=256,
                        verbose_name="validation regex",
                    ),
                ),
                ("order", models.PositiveIntegerField(default=0, verbose_name="order")),
                # Deleting the custom field deletes the zones that target it.
                (
                    "custom_field",
                    models.ForeignKey(
                        blank=True,
                        help_text="Target custom field (only used when target is 'custom_field')",
                        null=True,
                        on_delete=django.db.models.deletion.CASCADE,
                        related_name="ocr_zones",
                        to="documents.customfield",
                        verbose_name="custom field",
                    ),
                ),
                # Zones are removed together with their template.
                (
                    "template",
                    models.ForeignKey(
                        on_delete=django.db.models.deletion.CASCADE,
                        related_name="zones",
                        to="documents.ocrtemplate",
                        verbose_name="template",
                    ),
                ),
            ],
            options={
                "verbose_name": "OCR template zone",
                "verbose_name_plural": "OCR template zones",
                "ordering": ("template", "order"),
            },
        ),
    ]
|
||||
@@ -1894,3 +1894,248 @@ class WorkflowRun(SoftDeleteModel):
|
||||
|
||||
def __str__(self) -> str:
|
||||
return f"WorkflowRun of {self.workflow} at {self.run_at} on {self.document}"
|
||||
|
||||
|
||||
class OcrTemplate(models.Model):
    """
    A named collection of OCR extraction zones bound to one document type.

    For a document of that type, every zone of the template is cropped from
    the document image and OCR'd on its own; the resulting text goes to the
    zone's configured custom field or built-in document field.
    """

    name = models.CharField(_("name"), max_length=128)

    document_type = models.ForeignKey(
        "documents.DocumentType",
        verbose_name=_("document type"),
        related_name="ocr_templates",
        on_delete=models.CASCADE,
        db_index=True,
    )

    # Pixel size of the image the zones were drawn on.
    source_width = models.PositiveIntegerField(
        _("source width"),
        help_text=_("Width of the image the zones were drawn on (px)"),
        validators=[MinValueValidator(1)],
    )
    source_height = models.PositiveIntegerField(
        _("source height"),
        help_text=_("Height of the image the zones were drawn on (px)"),
        validators=[MinValueValidator(1)],
    )

    # Optional; the reference is cleared if the document is deleted.
    sample_document = models.ForeignKey(
        "documents.Document",
        verbose_name=_("sample document"),
        help_text=_("Document used for previewing zones in the editor"),
        related_name="+",
        on_delete=models.SET_NULL,
        blank=True,
        null=True,
    )

    enabled = models.BooleanField(_("enabled"), default=True)

    combine_formats = models.JSONField(
        _("combine formats"),
        blank=True,
        default=dict,
        help_text=_(
            "Per-target format strings for combining several zones into one "
            "field, keyed by target (custom field id, or 'title'/'asn'/'created'). "
            "Tokens like {Zone Name} are replaced with that zone's value.",
        ),
    )

    created = models.DateTimeField(
        _("created"),
        editable=False,
        db_index=True,
        default=timezone.now,
    )

    updated = models.DateTimeField(_("updated"), auto_now=True)

    class Meta:
        verbose_name = _("OCR template")
        verbose_name_plural = _("OCR templates")
        ordering = ("name",)

    def __str__(self) -> str:
        """Return the template name followed by its document type in parentheses."""
        return "{} ({})".format(self.name, self.document_type)
|
||||
|
||||
|
||||
class OcrTemplateZone(models.Model):
    """
    One rectangle on a document page whose OCR result is written to a custom
    field or a built-in document field. The coordinates refer to the source
    image dimensions stored on the template.
    """

    template = models.ForeignKey(
        OcrTemplate,
        verbose_name=_("template"),
        related_name="zones",
        on_delete=models.CASCADE,
    )

    name = models.CharField(
        _("zone name"),
        help_text=_("Descriptive name for this zone (e.g. 'Invoice Number')"),
        max_length=128,
    )

    class TargetType(models.TextChoices):
        """Possible destinations for the extracted value."""

        CUSTOM_FIELD = ("custom_field", _("Custom field"))
        TITLE = ("title", _("Title"))
        ASN = ("asn", _("Archive serial number"))
        CREATED = ("created", _("Date created"))

    target = models.CharField(
        _("target"),
        choices=TargetType.choices,
        default=TargetType.CUSTOM_FIELD,
        max_length=20,
        help_text=_(
            "Where the extracted value is written: a custom field, or a "
            "built-in document field (title, ASN, created date)",
        ),
    )

    custom_field = models.ForeignKey(
        "documents.CustomField",
        verbose_name=_("custom field"),
        help_text=_("Target custom field (only used when target is 'custom_field')"),
        related_name="ocr_zones",
        on_delete=models.CASCADE,
        blank=True,
        null=True,
    )

    page = models.IntegerField(
        _("page"),
        help_text=_("Page (1 = first, -1 = last; blank uses the template default)"),
        blank=True,
        null=True,
    )

    # Zone rectangle in pixels.
    x = models.PositiveIntegerField(_("x"), help_text=_("Left edge (px)"))
    y = models.PositiveIntegerField(_("y"), help_text=_("Top edge (px)"))
    width = models.PositiveIntegerField(
        _("width"),
        help_text=_("Zone width (px)"),
        validators=[MinValueValidator(1)],
    )
    height = models.PositiveIntegerField(
        _("height"),
        help_text=_("Zone height (px)"),
        validators=[MinValueValidator(1)],
    )

    # Size of the page image this particular zone was drawn on, used for
    # coordinate scaling. When null, the template's source_width /
    # source_height apply instead. Needed for PDFs whose pages differ in size
    # (landscape next to portrait, or mixed paper formats).
    zone_source_width = models.PositiveIntegerField(
        _("zone source width"),
        blank=True,
        null=True,
        help_text=_(
            "Width of the page image this zone was drawn on (px). "
            "Falls back to template source_width if unset.",
        ),
    )
    zone_source_height = models.PositiveIntegerField(
        _("zone source height"),
        blank=True,
        null=True,
        help_text=_(
            "Height of the page image this zone was drawn on (px). "
            "Falls back to template source_height if unset.",
        ),
    )

    ocr_language = models.CharField(
        _("OCR language"),
        help_text=_("Tesseract language code(s), e.g. 'deu+eng'"),
        default="deu+eng",
        max_length=20,
    )

    class TransformType(models.TextChoices):
        """Post-processing choices for the extracted text."""

        NONE = ("none", _("None"))
        STRIP = ("strip", _("Strip whitespace"))
        UPPERCASE = ("uppercase", _("Uppercase"))
        LOWERCASE = ("lowercase", _("Lowercase"))
        NUMERIC = ("numeric", _("Numeric only"))
        STRIP_PUNCTUATION = (
            "strip_punctuation",
            _("Remove leading/trailing punctuation"),
        )
        DATE = ("date", _("Parse date"))
        QR_CODE = ("qr_code", _("Read QR/barcode"))

    transform = models.CharField(
        _("transform"),
        choices=TransformType.choices,
        default=TransformType.STRIP,
        max_length=20,
    )

    date_format = models.CharField(
        _("date format"),
        default="",
        blank=True,
        max_length=64,
        help_text=_(
            "Python strptime format for the 'Parse date' transform "
            "(e.g. %d.%m.%Y). Blank = auto-detect.",
        ),
    )

    validation_regex = models.CharField(
        _("validation regex"),
        default="",
        blank=True,
        max_length=256,
        help_text=_(
            "Optional regex pattern — extracted text is only accepted if it matches",
        ),
    )

    order = models.PositiveIntegerField(_("order"), default=0)

    class Meta:
        verbose_name = _("OCR template zone")
        verbose_name_plural = _("OCR template zones")
        ordering = ("template", "order")

    def __str__(self) -> str:
        """Return the template name and the zone name joined by an arrow."""
        return "{} -> {}".format(self.template.name, self.name)
|
||||
|
||||
|
||||
# The custom field data types zone OCR may write to. DOCUMENTLINK and SELECT
# are left out because they point at other objects rather than holding free
# text. Shared by the serializer, the quick-create endpoint and the engine so
# the list is defined only once.
OCR_SUPPORTED_FIELD_TYPES = frozenset(
    (
        CustomField.FieldDataType.STRING,
        CustomField.FieldDataType.LONG_TEXT,
        CustomField.FieldDataType.URL,
        CustomField.FieldDataType.DATE,
        CustomField.FieldDataType.BOOL,
        CustomField.FieldDataType.INT,
        CustomField.FieldDataType.FLOAT,
        CustomField.FieldDataType.MONETARY,
    ),
)
|
||||
|
||||
@@ -57,6 +57,7 @@ if settings.AUDIT_LOG_ENABLED:
|
||||
from documents import bulk_edit
|
||||
from documents.data_models import DocumentSource
|
||||
from documents.filters import CustomFieldQueryParser
|
||||
from documents.models import OCR_SUPPORTED_FIELD_TYPES
|
||||
from documents.models import Correspondent
|
||||
from documents.models import CustomField
|
||||
from documents.models import CustomFieldInstance
|
||||
@@ -64,6 +65,8 @@ from documents.models import Document
|
||||
from documents.models import DocumentType
|
||||
from documents.models import MatchingModel
|
||||
from documents.models import Note
|
||||
from documents.models import OcrTemplate
|
||||
from documents.models import OcrTemplateZone
|
||||
from documents.models import PaperlessTask
|
||||
from documents.models import SavedView
|
||||
from documents.models import SavedViewFilterRule
|
||||
@@ -3501,3 +3504,129 @@ class StoragePathTestSerializer(SerializerWithPerms):
|
||||
"documents.view_document",
|
||||
Document,
|
||||
)
|
||||
|
||||
|
||||
class OcrTemplateZoneSerializer(serializers.ModelSerializer):
    """Serializes a single OCR template zone and validates its size and target field."""

    class Meta:
        model = OcrTemplateZone
        fields = [
            "id",
            "name",
            "target",
            "custom_field",
            "page",
            "x",
            "y",
            "width",
            "height",
            "ocr_language",
            "transform",
            "date_format",
            "order",
            "zone_source_width",
            "zone_source_height",
            "validation_regex",
        ]

    def validate_width(self, value):
        """Reject widths smaller than one pixel."""
        if value >= 1:
            return value
        raise serializers.ValidationError("Width must be at least 1.")

    def validate_height(self, value):
        """Reject heights smaller than one pixel."""
        if value >= 1:
            return value
        raise serializers.ValidationError("Height must be at least 1.")

    def validate_custom_field(self, value):
        """
        Accept no custom field at all (built-in targets such as title, asn or
        created need none) or one whose data type zone OCR supports.
        """
        if value is None or value.data_type in OCR_SUPPORTED_FIELD_TYPES:
            return value
        message = (
            f"Custom field type '{value.data_type}' is not supported for OCR extraction. "
            "Use string, integer, float, date, monetary, boolean, URL, or long text."
        )
        raise serializers.ValidationError(message)
|
||||
|
||||
|
||||
class OcrTemplateSerializer(serializers.ModelSerializer):
    """
    Serializes an OCR template together with its nested zones.

    Zones are written as a whole: on create they are inserted with the
    template, on update a supplied ``zones`` list replaces the existing set.
    Both operations run inside a transaction so a failure cannot leave a
    template with only part of its zones.
    """

    zones = OcrTemplateZoneSerializer(many=True, required=False)

    class Meta:
        model = OcrTemplate
        fields = [
            "id",
            "name",
            "document_type",
            "source_width",
            "source_height",
            "sample_document",
            "enabled",
            "combine_formats",
            "created",
            "updated",
            "zones",
        ]
        read_only_fields = ["created", "updated"]

    def validate_source_width(self, value):
        """Reject source widths smaller than one pixel."""
        if value < 1:
            raise serializers.ValidationError("Source width must be at least 1.")
        return value

    def validate_source_height(self, value):
        """Reject source heights smaller than one pixel."""
        if value < 1:
            raise serializers.ValidationError("Source height must be at least 1.")
        return value

    @staticmethod
    def _as_positive_int(value):
        """Return ``value`` as a positive int, or None if it is missing or not one."""
        try:
            number = int(value)
        except (TypeError, ValueError):
            return None
        return number if number > 0 else None

    def _template_dimension(self, key):
        """
        Look up a template dimension from the request data, falling back to
        the stored instance (the key may be absent during partial updates).

        The request value is raw, unvalidated input; anything that is not a
        positive integer yields None instead of raising, because the field's
        own validation already reports it.
        """
        raw = self.initial_data.get(key) or (
            getattr(self.instance, key) if self.instance else None
        )
        return self._as_positive_int(raw)

    def validate_zones(self, zones_data):
        """
        Validate that each zone lies within the image it was drawn on.

        A zone is checked against its own ``zone_source_width`` /
        ``zone_source_height`` when set, otherwise against the template's
        source dimensions. A dimension that is unknown is not checked.
        """
        template_width = self._template_dimension("source_width")
        template_height = self._template_dimension("source_height")

        for zone in zones_data:
            label = zone.get("name", "?")
            max_width = zone.get("zone_source_width") or template_width
            max_height = zone.get("zone_source_height") or template_height
            right = zone.get("x", 0) + zone.get("width", 0)
            bottom = zone.get("y", 0) + zone.get("height", 0)

            if max_width and right > max_width:
                raise serializers.ValidationError(
                    f"Zone '{label}' extends beyond source width "
                    f"({right} > {max_width}).",
                )
            if max_height and bottom > max_height:
                raise serializers.ValidationError(
                    f"Zone '{label}' extends beyond source height "
                    f"({bottom} > {max_height}).",
                )

        return zones_data

    def create(self, validated_data):
        """Create the template and its zones in one transaction."""
        from django.db import transaction

        zones_data = validated_data.pop("zones", [])
        with transaction.atomic():
            template = OcrTemplate.objects.create(**validated_data)
            for zone_data in zones_data:
                OcrTemplateZone.objects.create(template=template, **zone_data)
        return template

    def update(self, instance, validated_data):
        """
        Update the template; if ``zones`` was supplied, replace all zones.

        Runs in one transaction so that a failure while recreating zones
        rolls back the deletion of the previous ones.
        """
        from django.db import transaction

        zones_data = validated_data.pop("zones", None)

        with transaction.atomic():
            for attr, value in validated_data.items():
                setattr(instance, attr, value)
            instance.save()

            if zones_data is not None:
                # Replace all zones with the new set
                instance.zones.all().delete()
                for zone_data in zones_data:
                    OcrTemplateZone.objects.create(template=instance, **zone_data)

        return instance
|
||||
|
||||
@@ -1340,6 +1340,75 @@ def close_connection_pool_on_worker_init(**kwargs) -> None:
|
||||
conn.close_pool()
|
||||
|
||||
|
||||
def run_zone_ocr_extraction(sender, document, original_file=None, **kwargs):
    """
    Signal handler that runs zone-based OCR extraction for ``document``.

    ``original_file`` is passed on as a ``Path`` when given, otherwise as
    None. Any exception is logged and swallowed so that extraction problems
    do not propagate to the signal sender.
    """
    try:
        from documents.zone_ocr import run_zone_extraction

        source_file = None
        if original_file:
            source_file = Path(original_file)
        run_zone_extraction(document, source_file)
    except Exception:
        logger.exception(
            "Zone OCR extraction failed for document %s",
            document.pk,
        )
|
||||
|
||||
|
||||
def capture_old_document_type(sender, instance, **kwargs):
    """
    pre_save: remember the document's previous type on the instance as
    ``_old_document_type_id`` so the post_save handler can tell whether the
    type actually changed (vs. every other save).

    The database is only queried when the save can change the type:

    * unsaved instances have no previous type (None is stored);
    * ``raw`` saves (fixture loading) must not query other records, and
      saves whose ``update_fields`` omit the document type cannot change it,
      so in both cases the current value is stored, i.e. "unchanged".
    """
    if not instance.pk:
        instance._old_document_type_id = None
        return

    update_fields = kwargs.get("update_fields")
    type_not_written = update_fields is not None and not (
        {"document_type", "document_type_id"} & set(update_fields)
    )
    if kwargs.get("raw") or type_not_written:
        instance._old_document_type_id = instance.document_type_id
        return

    instance._old_document_type_id = (
        Document.objects.filter(pk=instance.pk)
        .values_list("document_type_id", flat=True)
        .first()
    )
|
||||
|
||||
|
||||
def run_zone_ocr_on_type_change(sender, instance, *, created=False, **kwargs):
    """
    Run zone OCR only when a document's TYPE actually changes (and the new type
    has an enabled template). NOT on every save — zone OCR overwrites fields, so
    re-running it on each edit would clobber the user's changes. Newly created
    documents are handled by the consumption signal, and the user can always
    trigger extraction manually via the run-zone-ocr action.

    Nothing happens for:

    * newly created instances, or instances without a pk or document type;
    * ``raw`` saves (fixture loading), where the database may not be in a
      consistent state yet;
    * saves whose ``update_fields`` do not include the document type, since
      such a save cannot have changed it;
    * saves where the type equals the value recorded in
      ``_old_document_type_id`` before the save.

    Extraction errors are logged and swallowed.
    """
    if created or not instance.pk or not instance.document_type_id:
        return

    if kwargs.get("raw"):
        return

    update_fields = kwargs.get("update_fields")
    if update_fields is not None and not (
        {"document_type", "document_type_id"} & set(update_fields)
    ):
        return

    # Only proceed if the type changed compared to what was in the DB before.
    old_type = getattr(instance, "_old_document_type_id", None)
    if old_type == instance.document_type_id:
        return

    from documents.models import OcrTemplate

    if not OcrTemplate.objects.filter(
        document_type_id=instance.document_type_id,
        enabled=True,
    ).exists():
        return

    try:
        from documents.zone_ocr import run_zone_extraction

        # Only attempt extraction when a file exists on disk.
        doc_path = instance.archive_path or instance.source_path
        if doc_path and Path(doc_path).is_file():
            logger.info(
                "Zone OCR: running extraction for document %d (type %d)",
                instance.pk,
                instance.document_type_id,
            )
            run_zone_extraction(instance, None)
    except Exception:
        logger.exception(
            "Zone OCR extraction failed for document %s",
            instance.pk,
        )
|
||||
|
||||
@worker_process_shutdown.connect
|
||||
def close_connection_pool_on_worker_shutdown(**kwargs) -> None: # pragma: no cover
|
||||
"""
|
||||
|
||||
@@ -0,0 +1,449 @@
|
||||
"""Tests for the OCR Template API."""
|
||||
|
||||
import json
|
||||
|
||||
from django.contrib.auth.models import User
|
||||
from rest_framework import status
|
||||
from rest_framework.test import APITestCase
|
||||
|
||||
from documents.models import CustomField
|
||||
from documents.models import DocumentType
|
||||
from documents.models import OcrTemplate
|
||||
from documents.models import OcrTemplateZone
|
||||
from documents.tests.utils import DirectoriesMixin
|
||||
|
||||
|
||||
class TestOcrTemplatesAPI(DirectoriesMixin, APITestCase):
    """
    API tests for OCR templates: create, validation, list, update, delete,
    patch, authentication and the quick-create-field action.

    Request boilerplate and database fixtures live in the private helpers
    below so each test only states what is specific to it.
    """

    ENDPOINT = "/api/ocr_templates/"

    def setUp(self) -> None:
        self.user = User.objects.create_superuser(username="temp_admin")
        self.client.force_authenticate(user=self.user)

        self.doc_type = DocumentType.objects.create(name="Invoice")
        self.custom_field_text = CustomField.objects.create(
            name="Invoice Number",
            data_type=CustomField.FieldDataType.STRING,
        )
        self.custom_field_date = CustomField.objects.create(
            name="Invoice Date",
            data_type=CustomField.FieldDataType.DATE,
        )
        self.custom_field_int = CustomField.objects.create(
            name="Amount",
            data_type=CustomField.FieldDataType.INT,
        )
        self.custom_field_doclink = CustomField.objects.create(
            name="Related Docs",
            data_type=CustomField.FieldDataType.DOCUMENTLINK,
        )

        super().setUp()

    # --- Helpers ---

    def _make_template_data(self, **overrides):
        """Return a valid template payload, with ``overrides`` applied on top."""
        data = {
            "name": "Invoice Template",
            "document_type": self.doc_type.pk,
            "default_page": 0,
            "source_width": 2480,
            "source_height": 3508,
            "enabled": True,
            "zones": [],
        }
        data.update(overrides)
        return data

    def _make_zone_data(self, **overrides):
        """Return a valid zone payload, with ``overrides`` applied on top."""
        data = {
            "name": "Zone 1",
            "custom_field": self.custom_field_text.pk,
            "x": 100,
            "y": 100,
            "width": 200,
            "height": 50,
            "ocr_language": "deu+eng",
            "transform": "strip",
            "order": 0,
        }
        data.update(overrides)
        return data

    def _send_json(self, verb, url, payload):
        """Issue ``verb`` (post/put/patch) against ``url`` with a JSON body."""
        send = getattr(self.client, verb)
        return send(
            url,
            data=json.dumps(payload),
            content_type="application/json",
        )

    def _detail_url(self, pk):
        """URL of a single template."""
        return f"{self.ENDPOINT}{pk}/"

    def _quick_create_field(self, name, data_type):
        """POST to the quick-create-field action."""
        return self._send_json(
            "post",
            f"{self.ENDPOINT}quick-create-field/",
            {"name": name, "data_type": data_type},
        )

    def _assert_create_rejected(self, data):
        """Creating a template from ``data`` must fail with HTTP 400."""
        resp = self._send_json("post", self.ENDPOINT, data)
        self.assertEqual(resp.status_code, status.HTTP_400_BAD_REQUEST)

    def _create_template(self, name, **extra):
        """Create a template for ``self.doc_type`` directly in the database."""
        return OcrTemplate.objects.create(
            name=name,
            document_type=self.doc_type,
            source_width=2480,
            source_height=3508,
            **extra,
        )

    def _create_zone(self, template, name, *, x=0, y=0, width=100, height=100):
        """Create a zone bound to the text custom field in the database."""
        return OcrTemplateZone.objects.create(
            template=template,
            name=name,
            custom_field=self.custom_field_text,
            x=x,
            y=y,
            width=width,
            height=height,
        )

    # --- Create ---

    def test_create_template(self):
        """
        GIVEN:
            - A document type and custom fields exist
        WHEN:
            - API request to create an OCR template with one zone
        THEN:
            - The template and zone are created
        """
        data = self._make_template_data(
            zones=[
                self._make_zone_data(
                    name="Invoice Number",
                    x=1500,
                    y=200,
                    width=800,
                    height=100,
                ),
            ],
        )
        resp = self._send_json("post", self.ENDPOINT, data)
        self.assertEqual(resp.status_code, status.HTTP_201_CREATED)

        result = resp.json()
        self.assertEqual(result["name"], "Invoice Template")
        self.assertEqual(result["document_type"], self.doc_type.pk)
        self.assertEqual(len(result["zones"]), 1)
        self.assertEqual(result["zones"][0]["name"], "Invoice Number")
        self.assertEqual(OcrTemplate.objects.count(), 1)
        self.assertEqual(OcrTemplateZone.objects.count(), 1)

    def test_create_template_multiple_zones(self):
        """
        GIVEN:
            - Multiple custom fields exist
        WHEN:
            - A template with multiple zones is created
        THEN:
            - All zones are created
        """
        data = self._make_template_data(
            zones=[
                self._make_zone_data(
                    name="Invoice Number",
                    custom_field=self.custom_field_text.pk,
                ),
                self._make_zone_data(
                    name="Invoice Date",
                    custom_field=self.custom_field_date.pk,
                    order=1,
                ),
            ],
        )
        resp = self._send_json("post", self.ENDPOINT, data)
        self.assertEqual(resp.status_code, status.HTTP_201_CREATED)
        self.assertEqual(len(resp.json()["zones"]), 2)
        self.assertEqual(OcrTemplateZone.objects.count(), 2)

    def test_create_template_no_zones(self):
        """
        GIVEN:
            - Valid template data without zones
        WHEN:
            - Template is created
        THEN:
            - Template is created with no zones
        """
        resp = self._send_json("post", self.ENDPOINT, self._make_template_data())
        self.assertEqual(resp.status_code, status.HTTP_201_CREATED)
        self.assertEqual(len(resp.json()["zones"]), 0)

    # --- Validation ---

    def test_create_template_zero_source_width_rejected(self):
        """
        GIVEN:
            - Template data with source_width=0
        WHEN:
            - Create is attempted
        THEN:
            - 400 error is returned
        """
        self._assert_create_rejected(self._make_template_data(source_width=0))

    def test_create_template_zero_source_height_rejected(self):
        self._assert_create_rejected(self._make_template_data(source_height=0))

    def test_create_zone_zero_width_rejected(self):
        self._assert_create_rejected(
            self._make_template_data(zones=[self._make_zone_data(width=0)]),
        )

    def test_create_zone_zero_height_rejected(self):
        self._assert_create_rejected(
            self._make_template_data(zones=[self._make_zone_data(height=0)]),
        )

    def test_create_zone_exceeds_source_width_rejected(self):
        """Zone that extends beyond the source image width should be rejected."""
        self._assert_create_rejected(
            self._make_template_data(
                source_width=1000,
                zones=[self._make_zone_data(x=800, width=300)],  # 800+300 > 1000
            ),
        )

    def test_create_zone_exceeds_source_height_rejected(self):
        """Zone that extends beyond the source image height should be rejected."""
        self._assert_create_rejected(
            self._make_template_data(
                source_height=1000,
                zones=[self._make_zone_data(y=900, height=200)],  # 900+200 > 1000
            ),
        )

    def test_create_zone_unsupported_custom_field_type_rejected(self):
        """A zone targeting a DOCUMENTLINK custom field is rejected."""
        self._assert_create_rejected(
            self._make_template_data(
                zones=[
                    self._make_zone_data(custom_field=self.custom_field_doclink.pk),
                ],
            ),
        )

    # --- List ---

    def test_list_templates(self):
        template = self._create_template("Test Template")
        self._create_zone(template, "Zone 1", x=100, y=100, width=200, height=50)

        resp = self.client.get(self.ENDPOINT)
        self.assertEqual(resp.status_code, status.HTTP_200_OK)
        data = resp.json()
        self.assertEqual(data["count"], 1)
        self.assertEqual(len(data["results"][0]["zones"]), 1)

    def test_list_empty(self):
        resp = self.client.get(self.ENDPOINT)
        self.assertEqual(resp.status_code, status.HTTP_200_OK)
        self.assertEqual(resp.json()["count"], 0)

    # --- Update ---

    def test_update_template_replaces_zones(self):
        """PUT should replace all zones with the new set."""
        template = self._create_template("Old Name")
        self._create_zone(template, "Old Zone")

        data = self._make_template_data(
            name="New Name",
            zones=[
                self._make_zone_data(
                    name="New Zone",
                    custom_field=self.custom_field_date.pk,
                ),
            ],
        )
        resp = self._send_json("put", self._detail_url(template.pk), data)
        self.assertEqual(resp.status_code, status.HTTP_200_OK)

        template.refresh_from_db()
        self.assertEqual(template.name, "New Name")
        self.assertEqual(OcrTemplateZone.objects.count(), 1)
        self.assertEqual(OcrTemplateZone.objects.first().name, "New Zone")

    # --- Delete ---

    def test_delete_template_cascades_zones(self):
        template = self._create_template("To Delete")
        self._create_zone(template, "Zone")

        resp = self.client.delete(self._detail_url(template.pk))
        self.assertEqual(resp.status_code, status.HTTP_204_NO_CONTENT)
        self.assertEqual(OcrTemplate.objects.count(), 0)
        self.assertEqual(OcrTemplateZone.objects.count(), 0)

    def test_delete_nonexistent_returns_404(self):
        resp = self.client.delete(self._detail_url(99999))
        self.assertEqual(resp.status_code, status.HTTP_404_NOT_FOUND)

    # --- Patch ---

    def test_patch_toggle_enabled(self):
        template = self._create_template("Toggle Test", enabled=True)

        resp = self._send_json(
            "patch",
            self._detail_url(template.pk),
            {"enabled": False},
        )
        self.assertEqual(resp.status_code, status.HTTP_200_OK)
        template.refresh_from_db()
        self.assertFalse(template.enabled)

    def test_patch_preserves_zones(self):
        """PATCH without zones field should not delete existing zones."""
        template = self._create_template("Patch Test")
        self._create_zone(template, "Existing Zone")

        resp = self._send_json(
            "patch",
            self._detail_url(template.pk),
            {"name": "Updated Name"},
        )
        self.assertEqual(resp.status_code, status.HTTP_200_OK)
        # The patch itself must have been applied, not just accepted.
        template.refresh_from_db()
        self.assertEqual(template.name, "Updated Name")
        self.assertEqual(OcrTemplateZone.objects.count(), 1)

    # --- Auth ---

    def test_unauthenticated_rejected(self):
        self.client.logout()
        resp = self.client.get(self.ENDPOINT)
        self.assertIn(
            resp.status_code,
            (status.HTTP_401_UNAUTHORIZED, status.HTTP_403_FORBIDDEN),
        )

    # --- Quick create field ---

    def test_quick_create_field(self):
        """Creating a custom field inline from the template editor."""
        resp = self._quick_create_field("New Field", "string")
        self.assertEqual(resp.status_code, status.HTTP_201_CREATED)
        data = resp.json()
        self.assertEqual(data["name"], "New Field")
        self.assertEqual(data["data_type"], "string")
        self.assertTrue(data["created"])
        self.assertTrue(CustomField.objects.filter(name="New Field").exists())

    def test_quick_create_field_existing(self):
        """If a field with the same name exists, return it without creating."""
        resp = self._quick_create_field("Invoice Number", "string")
        self.assertEqual(resp.status_code, status.HTTP_200_OK)
        data = resp.json()
        self.assertEqual(data["id"], self.custom_field_text.pk)
        self.assertFalse(data["created"])

    def test_quick_create_field_empty_name_rejected(self):
        resp = self._quick_create_field("", "string")
        self.assertEqual(resp.status_code, status.HTTP_400_BAD_REQUEST)

    def test_quick_create_field_unsupported_type_rejected(self):
        resp = self._quick_create_field("Bad Field", "documentlink")
        self.assertEqual(resp.status_code, status.HTTP_400_BAD_REQUEST)

    def test_quick_create_field_select_type_rejected(self):
        resp = self._quick_create_field("Bad Field", "select")
        self.assertEqual(resp.status_code, status.HTTP_400_BAD_REQUEST)
|
||||
@@ -0,0 +1,454 @@
|
||||
"""Tests for the zone-based OCR extraction engine."""
|
||||
|
||||
import tempfile
|
||||
from pathlib import Path
|
||||
from unittest.mock import MagicMock
|
||||
from unittest.mock import patch
|
||||
|
||||
from django.test import TestCase
|
||||
|
||||
from documents.models import CustomField
|
||||
from documents.models import CustomFieldInstance
|
||||
from documents.models import Document
|
||||
from documents.models import DocumentType
|
||||
from documents.models import OcrTemplate
|
||||
from documents.models import OcrTemplateZone
|
||||
from documents.zone_ocr import _apply_transform
|
||||
from documents.zone_ocr import _convert_value
|
||||
from documents.zone_ocr import _detect_mime
|
||||
from documents.zone_ocr import _resolve_doc_path
|
||||
from documents.zone_ocr import run_zone_extraction
|
||||
|
||||
|
||||
class TestApplyTransform(TestCase):
    """Tests for the _apply_transform function."""

    def _assert_transform(self, raw, transform, expected):
        """Apply ``transform`` to ``raw`` and compare with ``expected``."""
        self.assertEqual(_apply_transform(raw, transform), expected)

    def test_strip(self):
        self._assert_transform(" hello ", "strip", "hello")

    def test_none_transform(self):
        self._assert_transform(" hello ", "none", "hello")

    def test_uppercase(self):
        self._assert_transform("hello world", "uppercase", "HELLO WORLD")

    def test_lowercase(self):
        self._assert_transform("HELLO WORLD", "lowercase", "hello world")

    def test_numeric_basic(self):
        self._assert_transform("INV-2026-001", "numeric", "2026-001")

    def test_numeric_with_currency(self):
        self._assert_transform("€1,234.56", "numeric", "1,234.56")

    def test_numeric_empty_result_falls_back(self):
        self._assert_transform("abc", "numeric", "abc")

    def test_date_dmy_dots(self):
        self._assert_transform("13.04.2026", "date_dmy", "2026-04-13")

    def test_date_dmy_slashes(self):
        self._assert_transform("01/12/2025", "date_dmy", "2025-12-01")

    def test_date_dmy_two_digit_year(self):
        self._assert_transform("13.04.26", "date_dmy", "2026-04-13")

    def test_date_dmy_with_prefix(self):
        self._assert_transform("Date: 01/12/2025", "date_dmy", "2025-12-01")

    def test_date_dmy_invalid_falls_back(self):
        self._assert_transform("32.13.2026", "date_dmy", "32.13.2026")

    def test_date_dmy_no_match_falls_back(self):
        self._assert_transform("not a date", "date_dmy", "not a date")

    def test_date_ymd_dashes(self):
        self._assert_transform("2026-04-13", "date_ymd", "2026-04-13")

    def test_date_ymd_slashes(self):
        self._assert_transform("2026/04/13", "date_ymd", "2026-04-13")

    def test_date_ymd_invalid_falls_back(self):
        self._assert_transform("2026-13-32", "date_ymd", "2026-13-32")

    def test_empty_string(self):
        self._assert_transform("", "strip", "")

    def test_whitespace_only(self):
        self._assert_transform(" ", "strip", "")

    def test_unknown_transform_strips(self):
        self._assert_transform(" hello ", "unknown", "hello")
|
||||
|
||||
|
||||
class TestConvertValue(TestCase):
    """
    Tests for the _convert_value function.

    Tests that check several inputs wrap each one in ``subTest`` so a failure
    on one value does not hide the results for the remaining values.
    """

    def test_string(self):
        self.assertEqual(
            _convert_value("Hello", CustomField.FieldDataType.STRING),
            "Hello",
        )

    def test_string_truncation(self):
        result = _convert_value("x" * 200, CustomField.FieldDataType.STRING)
        self.assertEqual(len(result), 128)

    def test_url(self):
        self.assertEqual(
            _convert_value("https://example.com", CustomField.FieldDataType.URL),
            "https://example.com",
        )

    def test_long_text(self):
        long = "x" * 500
        self.assertEqual(
            _convert_value(long, CustomField.FieldDataType.LONG_TEXT),
            long,
        )

    def test_int_simple(self):
        self.assertEqual(_convert_value("42", CustomField.FieldDataType.INT), 42)

    def test_int_with_noise(self):
        self.assertEqual(_convert_value("INV-123", CustomField.FieldDataType.INT), 123)

    def test_int_negative(self):
        self.assertEqual(_convert_value("-42", CustomField.FieldDataType.INT), -42)

    def test_int_empty_returns_none(self):
        self.assertIsNone(_convert_value("abc", CustomField.FieldDataType.INT))

    def test_int_only_dash_returns_none(self):
        self.assertIsNone(_convert_value("-", CustomField.FieldDataType.INT))

    def test_float_simple(self):
        self.assertAlmostEqual(
            _convert_value("1234.56", CustomField.FieldDataType.FLOAT),
            1234.56,
        )

    def test_float_european_format(self):
        self.assertAlmostEqual(
            _convert_value("1.234,56", CustomField.FieldDataType.FLOAT),
            1234.56,
        )

    def test_float_us_format(self):
        self.assertAlmostEqual(
            _convert_value("1,234.56", CustomField.FieldDataType.FLOAT),
            1234.56,
        )

    def test_float_comma_only(self):
        self.assertAlmostEqual(
            _convert_value("1234,56", CustomField.FieldDataType.FLOAT),
            1234.56,
        )

    def test_float_empty_returns_none(self):
        self.assertIsNone(_convert_value("abc", CustomField.FieldDataType.FLOAT))

    def test_float_only_separator_returns_none(self):
        self.assertIsNone(_convert_value(",", CustomField.FieldDataType.FLOAT))

    def test_date_iso(self):
        self.assertEqual(
            _convert_value("2026-04-13", CustomField.FieldDataType.DATE),
            "2026-04-13",
        )

    def test_date_invalid_returns_none(self):
        self.assertIsNone(_convert_value("not a date", CustomField.FieldDataType.DATE))

    def test_date_invalid_values_returns_none(self):
        self.assertIsNone(_convert_value("2026-13-32", CustomField.FieldDataType.DATE))

    def test_monetary_simple(self):
        self.assertEqual(
            _convert_value("123.45", CustomField.FieldDataType.MONETARY),
            "123.45",
        )

    def test_monetary_european(self):
        self.assertEqual(
            _convert_value("1.234,56", CustomField.FieldDataType.MONETARY),
            "1234.56",
        )

    def test_monetary_with_currency_symbol(self):
        self.assertEqual(
            _convert_value("€1,234.56", CustomField.FieldDataType.MONETARY),
            "1234.56",
        )

    def test_monetary_empty_returns_none(self):
        self.assertIsNone(_convert_value("CHF", CustomField.FieldDataType.MONETARY))

    def test_bool_true(self):
        for val in ("true", "True", "yes", "1", "ja", "x", "X"):
            with self.subTest(value=val):
                self.assertTrue(
                    _convert_value(val, CustomField.FieldDataType.BOOL),
                    f"Expected True for {val!r}",
                )

    def test_bool_false(self):
        for val in ("false", "False", "no", "0", "nein"):
            with self.subTest(value=val):
                self.assertFalse(
                    _convert_value(val, CustomField.FieldDataType.BOOL),
                    f"Expected False for {val!r}",
                )

    def test_bool_unknown_returns_none(self):
        self.assertIsNone(_convert_value("maybe", CustomField.FieldDataType.BOOL))

    def test_unsupported_type_returns_none(self):
        unsupported = (
            CustomField.FieldDataType.DOCUMENTLINK,
            CustomField.FieldDataType.SELECT,
        )
        for data_type in unsupported:
            with self.subTest(data_type=data_type):
                self.assertIsNone(_convert_value("test", data_type))

    def test_empty_string_returns_none(self):
        self.assertIsNone(_convert_value("", CustomField.FieldDataType.STRING))
|
||||
|
||||
|
||||
class TestDetectMime(TestCase):
    """Tests for _detect_mime."""

    def _mime_of(self, filename):
        """Return the detected mime type for a bare file name."""
        return _detect_mime(Path(filename))

    def test_pdf_extension(self):
        self.assertEqual(self._mime_of("test.pdf"), "application/pdf")

    def test_png_extension(self):
        self.assertEqual(self._mime_of("test.png"), "image/png")

    def test_jpg_extension(self):
        self.assertEqual(self._mime_of("test.jpg"), "image/jpeg")

    def test_unknown_extension(self):
        self.assertIsNone(self._mime_of("test.xyz"))

    def test_webp_extension(self):
        self.assertEqual(self._mime_of("test.webp"), "image/webp")
|
||||
|
||||
|
||||
class TestResolveDocPath(TestCase):
    """Tests for _resolve_doc_path."""

    @staticmethod
    def _doc_without_files():
        """Mock document with no archive version and a missing source file."""
        doc = MagicMock()
        doc.has_archive_version = False
        doc.source_path = Path("/nonexistent/source.pdf")
        return doc

    def test_returns_none_when_no_files_exist(self):
        resolved = _resolve_doc_path(self._doc_without_files(), None)
        self.assertIsNone(resolved)

    def test_returns_original_file_as_fallback(self):
        doc = self._doc_without_files()

        with tempfile.NamedTemporaryFile(suffix=".pdf") as f:
            original = Path(f.name)
            resolved = _resolve_doc_path(doc, original)
            self.assertEqual(resolved, original)

    def test_returns_none_for_none_original_file(self):
        resolved = _resolve_doc_path(self._doc_without_files(), None)
        self.assertIsNone(resolved)
|
||||
|
||||
|
||||
class TestRunZoneExtraction(TestCase):
    """
    Tests for the full extraction pipeline.

    Template, zone and document fixtures are built through the private
    helpers below; every helper creates exactly what the individual tests
    previously created inline.
    """

    def setUp(self):
        self.doc_type = DocumentType.objects.create(name="Invoice")
        self.custom_field = CustomField.objects.create(
            name="Invoice Number",
            data_type=CustomField.FieldDataType.STRING,
        )

    # --- Helpers ---

    def _make_template(self, name, *, enabled, zone_name="Zone"):
        """
        Create a template for ``self.doc_type``; unless ``zone_name`` is
        ``None`` it gets a single zone bound to ``self.custom_field``.
        """
        template = OcrTemplate.objects.create(
            name=name,
            document_type=self.doc_type,
            source_width=2480,
            source_height=3508,
            enabled=enabled,
        )
        if zone_name is not None:
            OcrTemplateZone.objects.create(
                template=template,
                name=zone_name,
                custom_field=self.custom_field,
                x=0,
                y=0,
                width=100,
                height=50,
            )
        return template

    def _make_document(self, title="Test", **extra):
        """Create a PDF document; pass ``document_type`` through ``extra``."""
        return Document.objects.create(
            title=title,
            content="test",
            mime_type="application/pdf",
            **extra,
        )

    def _extract_from_fake_pdf(self, doc):
        """Run extraction against a temporary file that exists on disk."""
        with tempfile.NamedTemporaryFile(suffix=".pdf") as f:
            f.write(b"%PDF-1.4 fake")
            f.flush()
            run_zone_extraction(doc, Path(f.name))

    # --- Tests ---

    def test_skips_document_without_type(self):
        doc = self._make_document(title="No Type")
        run_zone_extraction(doc, Path("/nonexistent"))
        self.assertEqual(CustomFieldInstance.objects.count(), 0)

    def test_skips_document_without_matching_template(self):
        other_type = DocumentType.objects.create(name="Other")
        doc = self._make_document(title="No Template", document_type=other_type)
        run_zone_extraction(doc, Path("/nonexistent"))
        self.assertEqual(CustomFieldInstance.objects.count(), 0)

    def test_skips_disabled_template(self):
        self._make_template("Disabled", enabled=False)
        doc = self._make_document(document_type=self.doc_type)
        run_zone_extraction(doc, Path("/nonexistent"))
        self.assertEqual(CustomFieldInstance.objects.count(), 0)

    def test_skips_template_with_no_zones(self):
        self._make_template("Empty", enabled=True, zone_name=None)
        doc = self._make_document(document_type=self.doc_type)

        self._extract_from_fake_pdf(doc)
        self.assertEqual(CustomFieldInstance.objects.count(), 0)

    @patch("documents.zone_ocr._process_template")
    def test_calls_process_for_enabled_template(self, mock_process):
        self._make_template("Active", enabled=True)
        doc = self._make_document(document_type=self.doc_type)

        self._extract_from_fake_pdf(doc)

        self.assertTrue(mock_process.called)

    @patch("documents.zone_ocr._process_template")
    def test_handles_process_exception_gracefully(self, mock_process):
        """An exception raised while processing a template must not propagate."""
        mock_process.side_effect = RuntimeError("test error")

        self._make_template("Failing", enabled=True)
        doc = self._make_document(document_type=self.doc_type)

        # Should not raise
        self._extract_from_fake_pdf(doc)

        # Without this the test would also pass if extraction exited before
        # ever reaching the failing processor.
        self.assertTrue(mock_process.called)

    def test_handles_none_original_file(self):
        """Should not crash when original_file is None."""
        doc = self._make_document(document_type=self.doc_type)
        # No template, so it exits early — but shouldn't crash on None
        run_zone_extraction(doc, None)

    @patch("documents.zone_ocr._process_template")
    def test_multiple_templates_all_process(self, mock_process):
        """Multiple enabled templates for the same type should all run."""
        for i in range(3):
            self._make_template(
                f"Template {i}",
                enabled=True,
                zone_name=f"Zone {i}",
            )

        doc = self._make_document(document_type=self.doc_type)

        self._extract_from_fake_pdf(doc)

        self.assertEqual(mock_process.call_count, 3)
|
||||
@@ -3,6 +3,7 @@ import logging
|
||||
import os
|
||||
import platform
|
||||
import re
|
||||
import subprocess
|
||||
import tempfile
|
||||
import zipfile
|
||||
from collections import defaultdict
|
||||
@@ -148,12 +149,14 @@ from documents.matching import match_correspondents
|
||||
from documents.matching import match_document_types
|
||||
from documents.matching import match_storage_paths
|
||||
from documents.matching import match_tags
|
||||
from documents.models import OCR_SUPPORTED_FIELD_TYPES
|
||||
from documents.models import Correspondent
|
||||
from documents.models import CustomField
|
||||
from documents.models import CustomFieldInstance
|
||||
from documents.models import Document
|
||||
from documents.models import DocumentType
|
||||
from documents.models import Note
|
||||
from documents.models import OcrTemplate
|
||||
from documents.models import PaperlessTask
|
||||
from documents.models import SavedView
|
||||
from documents.models import ShareLink
|
||||
@@ -195,6 +198,7 @@ from documents.serialisers import EditPdfDocumentsSerializer
|
||||
from documents.serialisers import EmailSerializer
|
||||
from documents.serialisers import MergeDocumentsSerializer
|
||||
from documents.serialisers import NotesSerializer
|
||||
from documents.serialisers import OcrTemplateSerializer
|
||||
from documents.serialisers import PostDocumentSerializer
|
||||
from documents.serialisers import RemovePasswordDocumentsSerializer
|
||||
from documents.serialisers import ReprocessDocumentsSerializer
|
||||
@@ -2029,6 +2033,73 @@ class DocumentViewSet(
|
||||
},
|
||||
),
|
||||
)
|
||||
@action(methods=["post"], detail=True, url_path="run-zone-ocr")
def run_zone_ocr(self, request, pk=None):
    """
    Run zone-based OCR extraction on this document and report the outcome.

    Responses:
        - 404 if the document does not exist, no enabled template exists for
          its type, or no document file is present on disk
        - 403 if the requesting user may not change the document
        - 400 if the document has no type assigned
        - 200 with ``{"results": [...]}`` otherwise; one entry per template
          zone holding the template name, zone name, target field label and
          the value currently stored for that target
    """
    try:
        document = Document.objects.get(pk=pk)
    except Document.DoesNotExist:
        raise Http404

    # The lookup above bypasses the viewset queryset, so object-level access
    # has to be enforced explicitly: extraction overwrites fields and the
    # response echoes their values.
    from documents.permissions import has_perms_owner_aware

    if not has_perms_owner_aware(request.user, "change_document", document):
        return Response(
            {"error": "Insufficient permissions to run zone OCR"},
            status=status.HTTP_403_FORBIDDEN,
        )

    if not document.document_type_id:
        return Response(
            {"error": "Document has no type assigned"},
            status=status.HTTP_400_BAD_REQUEST,
        )

    templates = OcrTemplate.objects.filter(
        document_type_id=document.document_type_id,
        enabled=True,
    )
    if not templates.exists():
        return Response(
            {"error": "No OCR templates found for this document type"},
            status=status.HTTP_404_NOT_FOUND,
        )

    doc_path = document.archive_path or document.source_path
    if not doc_path or not Path(doc_path).is_file():
        return Response(
            {"error": "Document file not found"},
            status=status.HTTP_404_NOT_FOUND,
        )

    from documents.zone_ocr import run_zone_extraction

    run_zone_extraction(document, None)

    # Collect results.
    # Load the document's custom field instances once instead of issuing one
    # query per zone; setdefault keeps the first instance per field, matching
    # what a per-field ``.first()`` lookup would return.
    instances_by_field = {}
    for cf_instance in document.custom_fields.all():
        instances_by_field.setdefault(cf_instance.field_id, cf_instance)

    builtin_labels = {"title": "Title", "asn": "ASN", "created": "Created"}
    builtin_values = {
        "title": document.title,
        "asn": document.archive_serial_number,
        "created": document.created.isoformat() if document.created else None,
    }

    results = []
    for template in templates.prefetch_related("zones", "zones__custom_field"):
        for zone in template.zones.all():
            target = getattr(zone, "target", None) or "custom_field"
            if target == "custom_field" and zone.custom_field_id:
                cf_instance = instances_by_field.get(zone.custom_field_id)
                field_name = zone.custom_field.name
                value = cf_instance.value if cf_instance else None
            else:
                field_name = builtin_labels.get(target, target)
                value = builtin_values.get(target)
            results.append(
                {
                    "template": template.name,
                    "zone": zone.name,
                    "custom_field": field_name,
                    "value": value,
                },
            )

    return Response({"results": results})
|
||||
|
||||
@action(
|
||||
methods=["delete"],
|
||||
detail=True,
|
||||
@@ -5269,3 +5340,224 @@ def serve_logo(request: HttpRequest, filename: str | None = None) -> FileRespons
|
||||
filename=app_logo.name,
|
||||
as_attachment=True,
|
||||
)
|
||||
|
||||
|
||||
class OcrTemplateViewSet(ModelViewSet):
    """CRUD for OCR templates with zone definitions.

    Besides the standard model endpoints this viewset offers three helpers
    used by the template editor: rendering a document page as an image,
    test-running a single ad-hoc zone, and creating a custom field inline.
    """

    queryset = (
        OcrTemplate.objects.all()
        .prefetch_related(
            "zones",
            "zones__custom_field",
        )
        .order_by("name")
    )
    serializer_class = OcrTemplateSerializer
    permission_classes = (IsAuthenticated, PaperlessObjectPermissions)
    pagination_class = StandardPagination

    @staticmethod
    def _get_viewable_document(user, doc_id):
        """Return the document with primary key ``doc_id`` if ``user`` may
        view it, otherwise ``None``.

        The viewset's own permission classes only guard OCR templates, so the
        document-reading helper endpoints must check document access
        themselves. Unknown and inaccessible documents are deliberately
        indistinguishable to the caller.
        """
        from documents.permissions import has_perms_owner_aware

        try:
            document = Document.objects.get(pk=doc_id)
        except (Document.DoesNotExist, ValueError, TypeError):
            return None
        if not has_perms_owner_aware(user, "view_document", document):
            return None
        return document

    @action(
        detail=False,
        methods=["get"],
        url_path=r"document-page-image/(?P<doc_id>[0-9]+)/(?P<page>[0-9]+)",
    )
    def document_page_image(self, request, doc_id=None, page=None):
        """Render a specific page of a document as a PNG image.

        Used by the frontend template editor to display document pages
        as images that users can draw zones on.

        ``page`` is 0-indexed. Image documents are returned as-is with their
        own mime type. Every failure (unknown or inaccessible document, page
        out of range, missing file, render error or timeout) is reported as
        a 404.
        """
        document = self._get_viewable_document(request.user, doc_id)
        if document is None:
            raise Http404("Document not found")

        page_num = int(page)

        # Validate page number
        if document.page_count and page_num >= document.page_count:
            raise Http404(
                f"Page {page_num} out of range (document has {document.page_count} pages)",
            )

        doc_path = document.archive_path or document.source_path
        if not doc_path or not Path(doc_path).is_file():
            raise Http404("Document file not found")

        # Check if document is an image (single page, no PDF rendering needed)
        if document.mime_type and document.mime_type.startswith("image/"):
            content = Path(doc_path).read_bytes()
            return HttpResponse(content, content_type=document.mime_type)

        with tempfile.TemporaryDirectory(dir=settings.SCRATCH_DIR) as tmp_dir:
            output_prefix = Path(tmp_dir) / "page"
            try:
                subprocess.run(
                    [
                        "pdftoppm",
                        "-png",
                        "-r",
                        "150",  # Lower DPI for preview
                        "-f",
                        str(page_num + 1),  # pdftoppm pages are 1-indexed
                        "-l",
                        str(page_num + 1),
                        str(doc_path),
                        str(output_prefix),
                    ],
                    check=True,
                    capture_output=True,
                    timeout=30,
                )
            except subprocess.TimeoutExpired:
                # Previously uncaught, which surfaced as a server error.
                raise Http404("Timed out while rendering page")
            except subprocess.CalledProcessError as e:
                detail = e.stderr.decode(errors="replace")[:200] if e.stderr else ""
                raise Http404(f"Failed to render page: {detail}")
            except FileNotFoundError:
                raise Http404("pdftoppm not available - is poppler-utils installed?")

            rendered = sorted(Path(tmp_dir).glob("page-*.png"))
            if not rendered:
                raise Http404("No rendered page found")

            # Read before the temporary directory is removed.
            content = rendered[0].read_bytes()

        return HttpResponse(content, content_type="image/png")

    @action(detail=False, methods=["post"], url_path="test-zone")
    def test_zone(self, request):
        """Run OCR on a single ad-hoc zone of a document and return what it
        yields: the raw OCR text, the transformed value, and whether the
        validation regex matches. Non-destructive - writes nothing. Used by the
        editor's per-zone test so a user can tune the zone/regex before saving.

        Accepts: {"document": <id>, "zone": {x, y, width, height, page,
        ocr_language, transform, validation_regex, zone_source_width,
        zone_source_height}}.

        Returns 404 when the document is unknown, not viewable by the user or
        has no file on disk, and 400 when the zone definition is malformed or
        too small.
        """
        from documents.models import OcrTemplateZone
        from documents.zone_ocr import extract_zone_preview

        zone_data = request.data.get("zone") or {}
        if not isinstance(zone_data, dict):
            return Response(
                {"error": "Invalid zone definition"},
                status=status.HTTP_400_BAD_REQUEST,
            )

        document = self._get_viewable_document(
            request.user,
            request.data.get("document"),
        )
        if document is None:
            return Response(
                {"error": "Document not found"},
                status=status.HTTP_404_NOT_FOUND,
            )

        doc_path = document.archive_path or document.source_path
        if not doc_path or not Path(doc_path).is_file():
            return Response(
                {"error": "Document file not found"},
                status=status.HTTP_404_NOT_FOUND,
            )

        try:
            # The page is compared numerically further down the line, so a
            # JSON string such as "2" must be converted here.
            raw_page = zone_data.get("page")
            zone = OcrTemplateZone(
                name=zone_data.get("name") or "test",
                x=int(zone_data.get("x", 0)),
                y=int(zone_data.get("y", 0)),
                width=int(zone_data.get("width", 0)),
                height=int(zone_data.get("height", 0)),
                page=None if raw_page in (None, "") else int(raw_page),
                ocr_language=zone_data.get("ocr_language") or "eng",
                transform=zone_data.get("transform") or "strip",
                date_format=zone_data.get("date_format") or "",
                validation_regex=zone_data.get("validation_regex") or "",
            )
            # Parsed inside the try so malformed numbers yield a 400 as well.
            source_width = int(zone_data.get("zone_source_width") or 0)
            source_height = int(zone_data.get("zone_source_height") or 0)
        except (ValueError, TypeError):
            return Response(
                {"error": "Invalid zone definition"},
                status=status.HTTP_400_BAD_REQUEST,
            )

        if zone.width < 2 or zone.height < 2:
            return Response(
                {"error": "Zone is too small to test"},
                status=status.HTTP_400_BAD_REQUEST,
            )

        result = extract_zone_preview(
            Path(doc_path),
            zone,
            source_width,
            source_height,
            document.page_count,
        )

        # None means "not evaluated": no regex, no value, or an invalid regex.
        regex_match = None
        if zone.validation_regex and result.get("value") is not None:
            try:
                regex_match = (
                    re.fullmatch(zone.validation_regex, result["value"]) is not None
                )
            except re.error:
                regex_match = None

        return Response(
            {
                "raw_text": result.get("raw_text"),
                "value": result.get("value"),
                "regex": zone.validation_regex,
                "regex_match": regex_match,
            },
        )

    @action(detail=False, methods=["post"], url_path="quick-create-field")
    def quick_create_field(self, request):
        """Create a custom field inline from the template editor.

        Accepts: {"name": "Invoice Number", "data_type": "string"}
        Returns the created field so the frontend can immediately use it.

        If a field with that name already exists it is returned with
        ``"created": False`` and status 200; a newly created field is returned
        with status 201. Responds 400 for a missing name or unsupported data
        type and 403 when the user may not add custom fields.
        """
        raw_name = request.data.get("name")
        raw_type = request.data.get("data_type")
        # Non-string payload values (null, numbers, lists) are treated as
        # missing rather than crashing on .strip().
        name = raw_name.strip() if isinstance(raw_name, str) else ""
        data_type = raw_type.strip() if isinstance(raw_type, str) else ""

        if not name:
            return Response(
                {"error": "Field name is required"},
                status=status.HTTP_400_BAD_REQUEST,
            )

        if data_type not in OCR_SUPPORTED_FIELD_TYPES:
            return Response(
                {
                    "error": f"Unsupported data type '{data_type}'. "
                    f"Supported: {', '.join(sorted(OCR_SUPPORTED_FIELD_TYPES))}",
                },
                status=status.HTTP_400_BAD_REQUEST,
            )

        # Check if field already exists
        existing = CustomField.objects.filter(name=name).first()
        if existing:
            return Response(
                {
                    "id": existing.pk,
                    "name": existing.name,
                    "data_type": existing.data_type,
                    "created": False,
                },
            )

        # Check user has permission to create custom fields
        if not request.user.has_perm("documents.add_customfield"):
            return Response(
                {"error": "You don't have permission to create custom fields"},
                status=status.HTTP_403_FORBIDDEN,
            )

        field = CustomField.objects.create(name=name, data_type=data_type)
        return Response(
            {
                "id": field.pk,
                "name": field.name,
                "data_type": field.data_type,
                "created": True,
            },
            status=status.HTTP_201_CREATED,
        )
|
||||
|
||||
@@ -0,0 +1,757 @@
|
||||
"""
|
||||
Zone-based OCR extraction engine.
|
||||
|
||||
After a document is consumed, this module checks if the document's type has
|
||||
an active OCR template. If so, it renders the relevant pages as images,
|
||||
crops each zone, runs Tesseract OCR on the crop, applies transforms,
|
||||
and writes the results to the mapped custom fields.
|
||||
"""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
import logging
|
||||
import re
|
||||
import string
|
||||
import subprocess
|
||||
import tempfile
|
||||
from datetime import date
|
||||
from datetime import datetime
|
||||
from pathlib import Path
|
||||
|
||||
from django.conf import settings
|
||||
from PIL import Image
|
||||
|
||||
from documents.models import CustomField
|
||||
from documents.models import CustomFieldInstance
|
||||
from documents.models import Document
|
||||
from documents.models import OcrTemplate
|
||||
from documents.models import OcrTemplateZone
|
||||
|
||||
logger = logging.getLogger("paperless.zone_ocr")
|
||||
|
||||
|
||||
def run_zone_extraction(
    document: Document,
    original_file: Path | None,
) -> None:
    """
    Apply every enabled OCR template of the document's type to the document.

    Does nothing when the document has no type, when no enabled template
    exists for that type, or when no document file can be found on disk.
    A failure inside one template is logged and does not prevent the
    remaining templates from being processed.
    """
    type_id = document.document_type_id
    if not type_id:
        return

    enabled_templates = OcrTemplate.objects.filter(
        document_type_id=type_id,
        enabled=True,
    ).prefetch_related("zones", "zones__custom_field")
    if not enabled_templates.exists():
        return

    # Candidate order is archive version, source file, then the passed-in file.
    doc_path = _resolve_doc_path(document, original_file)
    if doc_path is None:
        logger.warning(
            "Zone OCR: no accessible file for document %d",
            document.pk,
        )
        return

    for template in enabled_templates:
        template_zones = [*template.zones.all()]
        if template_zones:
            logger.info(
                "Zone OCR: processing template '%s' for document %d (%d zones)",
                template.name,
                document.pk,
                len(template_zones),
            )
            try:
                _process_template(document, doc_path, template, template_zones)
            except Exception:
                logger.exception(
                    "Zone OCR: error processing template '%s' for document %d",
                    template.name,
                    document.pk,
                )
|
||||
|
||||
|
||||
def _resolve_doc_path(
|
||||
document: Document,
|
||||
original_file: Path | None,
|
||||
) -> Path | None:
|
||||
"""Find an accessible file for the document."""
|
||||
candidates = []
|
||||
if document.has_archive_version:
|
||||
candidates.append(document.archive_path)
|
||||
candidates.append(document.source_path)
|
||||
if original_file is not None:
|
||||
candidates.append(original_file)
|
||||
|
||||
for path in candidates:
|
||||
if path is not None and Path(path).is_file():
|
||||
return Path(path)
|
||||
return None
|
||||
|
||||
|
||||
def _resolve_page_idx(page_value, page_count) -> int:
|
||||
"""Resolve a 1-indexed page (1 = first, -1 = last) to a 0-indexed image
|
||||
index. A blank page_value defaults to the first page."""
|
||||
if page_value is None:
|
||||
return 0
|
||||
if page_value == -1:
|
||||
return (page_count - 1) if page_count else 0
|
||||
if page_value >= 1:
|
||||
return page_value - 1
|
||||
return 0
|
||||
|
||||
|
||||
def _process_template(
    document: Document,
    doc_path: Path,
    template: OcrTemplate,
    zones: list[OcrTemplateZone],
) -> None:
    """Process all zones in a template against a document.

    Each zone is OCR'd independently, then zones are grouped by their target
    field and each field is written exactly once. When several zones share a
    field, their values are combined via the template's per-field format string
    (or joined in order if none is set) — this avoids the zones overwriting each
    other's value.

    A zone whose value fails its validation regex contributes no value. The
    same applies when the regex itself is invalid: the zone is skipped with a
    warning instead of aborting the whole template.
    """
    pages_needed: set[int] = {
        _resolve_page_idx(zone.page, document.page_count) for zone in zones
    }

    with tempfile.TemporaryDirectory(dir=settings.SCRATCH_DIR) as tmp_dir:
        tmp_path = Path(tmp_dir)

        page_images = _render_pages(
            doc_path,
            pages_needed,
            tmp_path,
            document.page_count,
        )

        # Pass 1: OCR every zone into a value (or None if it failed/was rejected).
        # Keyed by id(zone); the zones list keeps the objects alive throughout.
        zone_values: dict[int, str | None] = {}
        for zone in zones:
            page_idx = _resolve_page_idx(zone.page, document.page_count)

            if page_idx not in page_images:
                logger.warning(
                    "Zone OCR: page %d not available for zone '%s'",
                    page_idx,
                    zone.name,
                )
                continue

            # Zone coordinates are relative to these dimensions; a zone-level
            # size takes precedence over the template-level one.
            src_w = zone.zone_source_width or template.source_width
            src_h = zone.zone_source_height or template.source_height

            extracted = _extract_zone(
                page_images[page_idx],
                zone,
                src_w,
                src_h,
                tmp_path,
            )

            if extracted is not None and zone.validation_regex:
                try:
                    accepted = (
                        re.fullmatch(zone.validation_regex, extracted) is not None
                    )
                except re.error as exc:
                    # A broken pattern cannot confirm the value, so the value
                    # is dropped; the other zones must still be processed.
                    logger.warning(
                        "Zone OCR: '%s' has an invalid validation regex '%s': %s",
                        zone.name,
                        zone.validation_regex,
                        exc,
                    )
                    accepted = False
                else:
                    if not accepted:
                        logger.info(
                            "Zone OCR: '%s' value %r rejected by regex '%s'",
                            zone.name,
                            extracted[:100],
                            zone.validation_regex,
                        )
                if not accepted:
                    extracted = None

            zone_values[id(zone)] = extracted

        # Pass 2: group zones by target field and write each field once.
        grouped: dict[str, list[OcrTemplateZone]] = {}
        for zone in zones:
            grouped.setdefault(_field_key(zone), []).append(zone)

        combine_formats = template.combine_formats or {}
        for key, field_zones in grouped.items():
            value = _combine_field_value(
                combine_formats.get(key, ""),
                field_zones,
                zone_values,
            )
            if not value:
                continue

            # All zones of a group share the target, so the first one stands
            # in for the group when writing.
            target_zone = field_zones[0]
            _write_zone_value(document, target_zone, value)
            logger.info(
                "Zone OCR: %s = %r (from %d zone(s))",
                _zone_target_label(target_zone),
                value[:100],
                len(field_zones),
            )
|
||||
|
||||
|
||||
def _field_key(zone: OcrTemplateZone) -> str:
|
||||
"""Identify a zone's target field. Custom fields key by id, built-in targets
|
||||
by their name. Matches the key used in OcrTemplate.combine_formats and on the
|
||||
frontend field select."""
|
||||
target = getattr(zone, "target", None) or "custom_field"
|
||||
if target == "custom_field" and zone.custom_field_id:
|
||||
return str(zone.custom_field_id)
|
||||
return target
|
||||
|
||||
|
||||
def _combine_field_value(
|
||||
fmt: str,
|
||||
field_zones: list[OcrTemplateZone],
|
||||
zone_values: dict[int, str | None],
|
||||
) -> str:
|
||||
"""Combine the OCR values of all zones targeting one field.
|
||||
|
||||
With a format string, `{Zone Name}` tokens are replaced by that zone's value
|
||||
and literal text is kept; separators left dangling by an empty token are
|
||||
cleaned up. Without a format, the zone values are joined in order by a space.
|
||||
"""
|
||||
values = {z.name: (zone_values.get(id(z)) or "") for z in field_zones}
|
||||
|
||||
if not fmt:
|
||||
parts = [zone_values.get(id(z)) or "" for z in field_zones]
|
||||
return " ".join(p for p in parts if p).strip()
|
||||
|
||||
def _replace(match: re.Match) -> str:
|
||||
return values.get(match.group(1).strip(), "")
|
||||
|
||||
combined = re.sub(r"\{([^{}]+)\}", _replace, fmt)
|
||||
# Tidy up separators an empty token may have left behind.
|
||||
combined = re.sub(r"\s{2,}", " ", combined)
|
||||
combined = re.sub(r"([^\w\s])\s*\1+", r"\1", combined)
|
||||
return combined.strip().strip("-/.,;:| \t")
|
||||
|
||||
|
||||
def _render_pages(
    doc_path: Path,
    pages: set[int],
    tmp_dir: Path,
    page_count: int | None,
) -> dict[int, Path]:
    """Rasterise the requested 0-indexed pages to PNG files with pdftoppm.

    Returns a mapping of page index to image path. For image documents the
    file itself is returned as page 0. Pages that fail to render are logged
    and left out; if pdftoppm is not installed, rendering stops and whatever
    has been rendered so far is returned.
    """
    mime = _detect_mime(doc_path)
    if mime and mime.startswith("image/"):
        # Single-image document — use it directly as page 0.
        return {0: doc_path}

    rendered_pages: dict[int, Path] = {}

    # Callers pass already-resolved 0-indexed page numbers (see _resolve_page_idx).
    for page_idx in pages:
        if page_idx < 0:
            logger.warning("Zone OCR: invalid page index %d", page_idx)
            continue

        one_based = str(page_idx + 1)  # pdftoppm is 1-indexed
        command = [
            "pdftoppm",
            "-png",
            "-r",
            "300",
            "-f",
            one_based,
            "-l",
            one_based,
            str(doc_path),
            str(tmp_dir / f"page_{page_idx}"),
        ]
        try:
            subprocess.run(command, check=True, capture_output=True, timeout=60)
        except subprocess.TimeoutExpired:
            logger.error("Zone OCR: pdftoppm timed out for page %d", page_idx)
            continue
        except subprocess.CalledProcessError as e:
            logger.error(
                "Zone OCR: pdftoppm failed for page %d: %s",
                page_idx,
                e.stderr.decode(errors="replace") if e.stderr else str(e),
            )
            continue
        except FileNotFoundError:
            logger.error("Zone OCR: pdftoppm not found — is poppler-utils installed?")
            return rendered_pages  # No point trying other pages

        # pdftoppm appends the page number to the prefix it was given.
        outputs = sorted(tmp_dir.glob(f"page_{page_idx}-*.png"))
        if outputs:
            rendered_pages[page_idx] = outputs[0]

    return rendered_pages
|
||||
|
||||
|
||||
def _crop_zone(
    page_img: Path,
    zone: OcrTemplateZone,
    source_width: int,
    source_height: int,
    tmp_dir: Path,
) -> Image.Image | None:
    """Cut the zone's rectangle out of the rendered page image.

    Zone coordinates are given relative to ``source_width`` x
    ``source_height`` and are scaled to the actual size of the image. Returns
    an independent copy of the cropped region, or ``None`` when the region is
    smaller than 2 px in either direction or anything goes wrong.
    """
    try:
        with Image.open(page_img) as page:
            page_w, page_h = page.size
            ratio_x = page_w / source_width
            ratio_y = page_h / source_height

            left = int(zone.x * ratio_x)
            top = int(zone.y * ratio_y)
            right = int((zone.x + zone.width) * ratio_x)
            bottom = int((zone.y + zone.height) * ratio_y)

            # Clamp to the image so an oversized zone can't crop out of bounds.
            left = max(0, min(left, page_w))
            top = max(0, min(top, page_h))
            right = max(left + 1, min(right, page_w))
            bottom = max(top + 1, min(bottom, page_h))

            if min(right - left, bottom - top) < 2:
                logger.warning("Zone OCR: crop too small for zone '%s'", zone.name)
                return None

            region = page.crop((left, top, right, bottom))
            return region.copy()
    except Exception:
        logger.exception("Zone OCR: crop failed for zone '%s'", zone.name)
        return None
|
||||
|
||||
|
||||
def _read_barcode(cropped: Image.Image, zone_name: str) -> str | None:
    """Decode a QR code or barcode in the cropped image with zxingcpp.

    Returns the text of the first code found, or ``None`` when no code is
    detected, zxingcpp is not installed, or decoding fails.
    """
    try:
        import zxingcpp

        decoded = zxingcpp.read_barcodes(cropped)
        if not decoded:
            logger.debug("Zone OCR: no barcode found in zone '%s'", zone_name)
            return None

        text = decoded[0].text
        logger.debug(
            "Zone OCR: barcode found in zone '%s': %s",
            zone_name,
            text[:100],
        )
        return text
    except ImportError:
        logger.error("Zone OCR: zxingcpp not available — install zxing-cpp")
        return None
    except Exception:
        logger.exception("Zone OCR: barcode read failed for zone '%s'", zone_name)
        return None
|
||||
|
||||
|
||||
def _ocr_text(cropped: Image.Image, zone: OcrTemplateZone, tmp_dir: Path) -> str | None:
    """Run Tesseract on the cropped image using the zone's OCR language.

    The crop is written to ``tmp_dir`` first because Tesseract is invoked as
    an external process. Returns the stripped output, or ``None`` when the
    output is empty, Tesseract fails or times out, or it is not installed.
    """
    crop_path = tmp_dir / f"zone_{zone.pk}.png"
    cropped.save(crop_path)

    command = [
        "tesseract",
        str(crop_path),
        "stdout",
        "-l",
        zone.ocr_language,
        "--psm",
        "6",  # Assume uniform block of text
    ]
    try:
        completed = subprocess.run(
            command,
            capture_output=True,
            text=True,
            timeout=30,
            check=True,
        )
    except subprocess.TimeoutExpired:
        logger.error("Zone OCR: Tesseract timed out for zone '%s'", zone.name)
        return None
    except subprocess.CalledProcessError as e:
        logger.error(
            "Zone OCR: Tesseract failed for zone '%s': %s",
            zone.name,
            e.stderr[:200] if e.stderr else str(e),
        )
        return None
    except FileNotFoundError:
        logger.error("Zone OCR: Tesseract not found — is tesseract-ocr installed?")
        return None

    return completed.stdout.strip() or None
|
||||
|
||||
|
||||
def _extract_zone(
    page_img: Path,
    zone: OcrTemplateZone,
    source_width: int,
    source_height: int,
    tmp_dir: Path,
) -> str | None:
    """Crop the zone out of the page image, read it and apply its transform.

    Zones with the "qr_code" transform are read with the barcode reader, all
    others with Tesseract. Returns ``None`` if the crop fails or nothing is
    read.
    """
    cropped = _crop_zone(page_img, zone, source_width, source_height, tmp_dir)
    if cropped is None:
        return None

    # QR/barcode zones skip Tesseract entirely
    if zone.transform == "qr_code":
        raw = _read_barcode(cropped, zone.name)
    else:
        raw = _ocr_text(cropped, zone, tmp_dir)

    if not raw:
        return None

    date_format = getattr(zone, "date_format", "") or ""
    return _apply_transform(raw, zone.transform, date_format)
|
||||
|
||||
|
||||
def extract_zone_preview(
    doc_path: Path,
    zone: OcrTemplateZone,
    source_width: int,
    source_height: int,
    page_count: int | None,
) -> dict:
    """Read a single zone without storing anything (editor "test zone").

    Renders the zone's page, crops the zone, reads it with Tesseract (or the
    barcode reader for "qr_code" zones) and applies the zone's transform.
    When no source dimensions are supplied, the rendered image's own size is
    used as the coordinate space.

    Returns a dict with ``raw_text`` (what was read) and ``value`` (after
    the transform); both are ``None`` when the page or crop is unavailable.
    """
    # Same 1-indexed -> 0-indexed page resolution as the regular extraction.
    page_idx = _resolve_page_idx(zone.page, page_count)

    with tempfile.TemporaryDirectory(dir=settings.SCRATCH_DIR) as scratch:
        scratch_dir = Path(scratch)
        rendered = _render_pages(doc_path, {page_idx}, scratch_dir, page_count)
        if page_idx not in rendered:
            return {"raw_text": None, "value": None}
        page_image = rendered[page_idx]

        if not (source_width and source_height):
            with Image.open(page_image) as im:
                source_width, source_height = im.size

        cropped = _crop_zone(
            page_image,
            zone,
            source_width,
            source_height,
            scratch_dir,
        )
        if cropped is None:
            return {"raw_text": None, "value": None}

        if zone.transform == "qr_code":
            raw_text = _read_barcode(cropped, zone.name)
        else:
            raw_text = _ocr_text(cropped, zone, scratch_dir)

        value = None
        if raw_text:
            value = _apply_transform(
                raw_text,
                zone.transform,
                getattr(zone, "date_format", "") or "",
            )
        return {"raw_text": raw_text, "value": value}
|
||||
|
||||
|
||||
def _parse_date(text: str, fmt: str) -> str:
|
||||
"""Parse a date from OCR text. With a Python strptime `fmt`, try that first;
|
||||
otherwise (or on failure) fall back to dateparser auto-detection. Returns an
|
||||
ISO date string, or the original text if nothing parses."""
|
||||
text = text.strip()
|
||||
if not text:
|
||||
return text
|
||||
if fmt:
|
||||
try:
|
||||
return datetime.strptime(text, fmt).date().isoformat()
|
||||
except ValueError:
|
||||
pass
|
||||
try:
|
||||
import dateparser
|
||||
|
||||
parsed = dateparser.parse(
|
||||
text,
|
||||
settings={
|
||||
"PREFER_DAY_OF_MONTH": "first",
|
||||
"RETURN_AS_TIMEZONE_AWARE": False,
|
||||
},
|
||||
)
|
||||
if parsed:
|
||||
return parsed.date().isoformat()
|
||||
except Exception:
|
||||
logger.debug("Zone OCR: dateparser failed for %r", text[:50])
|
||||
return text
|
||||
|
||||
|
||||
def _apply_transform(text: str, transform: str, date_format: str = "") -> str:
|
||||
"""Apply post-processing transform to extracted text."""
|
||||
text = text.strip()
|
||||
if not text:
|
||||
return text
|
||||
|
||||
if transform in ("strip", "none"):
|
||||
return text
|
||||
elif transform == "date":
|
||||
return _parse_date(text, date_format)
|
||||
elif transform == "uppercase":
|
||||
return text.upper()
|
||||
elif transform == "lowercase":
|
||||
return text.lower()
|
||||
elif transform == "numeric":
|
||||
result = re.sub(r"[^\d.,\-]", "", text)
|
||||
return result if result else text
|
||||
elif transform == "strip_punctuation":
|
||||
return text.strip(string.punctuation + " \t\r\n")
|
||||
elif transform == "qr_code":
|
||||
# Barcode/QR content as read by _read_barcode.
|
||||
return text
|
||||
return text
|
||||
|
||||
|
||||
def _zone_target_label(zone: OcrTemplateZone) -> str:
|
||||
"""Human label of a zone's write target (for logging)."""
|
||||
target = getattr(zone, "target", None) or "custom_field"
|
||||
if target == "custom_field":
|
||||
return zone.custom_field.name if zone.custom_field_id else "(no field)"
|
||||
return {"title": "Title", "asn": "ASN", "created": "Created"}.get(target, target)
|
||||
|
||||
|
||||
def _parse_created_datetime(value: str):
    """Convert an extracted value into a timezone-aware datetime suitable for
    ``document.created``.

    An ISO ``YYYY-MM-DD`` date found anywhere in the value wins; otherwise
    dateparser is asked to interpret the value. Returns ``None`` when no
    date can be derived.
    """
    from django.utils import timezone as djtz

    def _ensure_aware(moment):
        return djtz.make_aware(moment) if djtz.is_naive(moment) else moment

    iso = re.search(r"(\d{4})-(\d{2})-(\d{2})", value)
    if iso:
        try:
            return _ensure_aware(datetime(int(iso[1]), int(iso[2]), int(iso[3])))
        except ValueError:
            # e.g. month 13 or day 32 — fall through to dateparser.
            pass

    try:
        import dateparser

        guessed = dateparser.parse(
            value,
            settings={"RETURN_AS_TIMEZONE_AWARE": False},
        )
        if guessed:
            return _ensure_aware(guessed)
    except Exception:
        logger.debug("Zone OCR: dateparser failed for created value %r", value[:50])
    return None
|
||||
|
||||
|
||||
def _write_zone_value(
    document: Document,
    zone: OcrTemplateZone,
    value: str,
) -> None:
    """Write an extracted value to the zone's target — a custom field, or a
    built-in document field (title / archive_serial_number / created).

    Values that cannot be used for the target (no digits for an ASN, no
    parseable date for ``created``, no custom field configured) are skipped
    with a debug log. An ASN the database refuses to store is skipped with a
    warning and the document keeps its previous ASN.
    """
    target = getattr(zone, "target", None) or "custom_field"

    if target == "custom_field":
        if zone.custom_field_id:
            _write_custom_field(document, zone.custom_field, value)
        else:
            logger.debug("Zone OCR: zone '%s' has no custom field set", zone.name)
        return

    if target == "title":
        document.title = value[:128]
        document.save(update_fields=["title"])
    elif target == "asn":
        from django.db import DatabaseError
        from django.db import transaction

        digits = re.sub(r"[^\d]", "", value)
        if not digits:
            logger.debug(
                "Zone OCR: ASN zone '%s' produced no digits (%r)",
                zone.name,
                value[:50],
            )
            return

        # OCR can easily yield a number the database rejects (e.g. one that
        # is already taken or out of range). Save inside a savepoint so such
        # a failure neither poisons an enclosing transaction nor stops the
        # remaining fields from being written, and restore the in-memory
        # value so later saves of this instance do not retry the bad ASN.
        previous_asn = document.archive_serial_number
        document.archive_serial_number = int(digits)
        try:
            with transaction.atomic():
                document.save(update_fields=["archive_serial_number"])
        except (DatabaseError, OverflowError) as exc:
            document.archive_serial_number = previous_asn
            logger.warning(
                "Zone OCR: ASN zone '%s' value %s could not be saved: %s",
                zone.name,
                digits[:50],
                exc,
            )
    elif target == "created":
        parsed = _parse_created_datetime(value)
        if parsed is None:
            logger.debug(
                "Zone OCR: created zone '%s' could not parse a date (%r)",
                zone.name,
                value[:50],
            )
            return
        document.created = parsed
        document.save(update_fields=["created"])
|
||||
|
||||
|
||||
def _write_custom_field(
    document: Document,
    custom_field: CustomField,
    value: str,
) -> None:
    """Store an extracted value in the document's instance of a custom field.

    The string is first converted to the field's data type; if that yields
    ``None`` nothing is written. Otherwise the document's instance of the
    field is created or updated.
    """
    data_type = custom_field.data_type
    converted = _convert_value(value, data_type)
    if converted is None:
        logger.debug(
            "Zone OCR: skipping custom field '%s' — value conversion returned None",
            custom_field.name,
        )
        return

    # The instance attribute that holds the value depends on the data type.
    value_attr = CustomFieldInstance.get_value_field_name(data_type)
    CustomFieldInstance.objects.update_or_create(
        document=document,
        field=custom_field,
        defaults={value_attr: converted},
    )
|
||||
|
||||
|
||||
def _convert_value(value: str, data_type: str) -> object | None:
|
||||
"""Convert an extracted OCR string to the appropriate type for the custom field."""
|
||||
if not value:
|
||||
return None
|
||||
|
||||
try:
|
||||
if data_type in (
|
||||
CustomField.FieldDataType.STRING,
|
||||
CustomField.FieldDataType.URL,
|
||||
):
|
||||
return value[:128]
|
||||
|
||||
elif data_type == CustomField.FieldDataType.LONG_TEXT:
|
||||
return value
|
||||
|
||||
elif data_type == CustomField.FieldDataType.INT:
|
||||
digits = re.sub(r"[^\d\-]", "", value)
|
||||
# Handle edge case: only dashes or empty
|
||||
digits = digits.lstrip("-") or ""
|
||||
if not digits:
|
||||
return None
|
||||
# Restore leading minus if original had one
|
||||
if value.strip().startswith("-"):
|
||||
digits = "-" + digits
|
||||
return int(digits)
|
||||
|
||||
elif data_type == CustomField.FieldDataType.FLOAT:
|
||||
# Handle European format: 1.234,56 → 1234.56
|
||||
cleaned = re.sub(r"[^\d.,\-]", "", value)
|
||||
if not cleaned or cleaned in (".", ",", "-"):
|
||||
return None
|
||||
# If both . and , present, the last one is the decimal separator
|
||||
if "," in cleaned and "." in cleaned:
|
||||
if cleaned.rindex(",") > cleaned.rindex("."):
|
||||
# European: 1.234,56
|
||||
cleaned = cleaned.replace(".", "").replace(",", ".")
|
||||
else:
|
||||
# US: 1,234.56
|
||||
cleaned = cleaned.replace(",", "")
|
||||
elif "," in cleaned:
|
||||
# Only comma — treat as decimal separator
|
||||
cleaned = cleaned.replace(",", ".")
|
||||
return float(cleaned)
|
||||
|
||||
elif data_type == CustomField.FieldDataType.DATE:
|
||||
match = re.search(r"(\d{4})-(\d{2})-(\d{2})", value)
|
||||
if match:
|
||||
y, m, d = match.groups()
|
||||
# Validate the date
|
||||
date(int(y), int(m), int(d))
|
||||
return f"{y}-{m}-{d}"
|
||||
return None
|
||||
|
||||
elif data_type == CustomField.FieldDataType.MONETARY:
|
||||
cleaned = re.sub(r"[^\d.,\-]", "", value)
|
||||
if not cleaned or cleaned in (".", ",", "-"):
|
||||
return None
|
||||
if "," in cleaned and "." in cleaned:
|
||||
if cleaned.rindex(",") > cleaned.rindex("."):
|
||||
cleaned = cleaned.replace(".", "").replace(",", ".")
|
||||
else:
|
||||
cleaned = cleaned.replace(",", "")
|
||||
elif "," in cleaned:
|
||||
cleaned = cleaned.replace(",", ".")
|
||||
# Validate it parses as a number
|
||||
float(cleaned)
|
||||
return cleaned
|
||||
|
||||
elif data_type == CustomField.FieldDataType.BOOL:
|
||||
lower = value.lower().strip()
|
||||
if lower in ("true", "yes", "1", "ja", "oui", "si", "x"):
|
||||
return True
|
||||
elif lower in ("false", "no", "0", "nein", "non"):
|
||||
return False
|
||||
return None
|
||||
|
||||
else:
|
||||
# Unsupported types (DOCUMENTLINK, SELECT) — can't OCR into these
|
||||
logger.debug(
|
||||
"Zone OCR: unsupported custom field type %s for OCR extraction",
|
||||
data_type,
|
||||
)
|
||||
return None
|
||||
|
||||
except (ValueError, TypeError) as e:
|
||||
logger.warning("Zone OCR: could not convert %r to %s: %s", value, data_type, e)
|
||||
return None
|
||||
|
||||
|
||||
def _detect_mime(path: Path) -> str | None:
|
||||
"""Detect MIME type of a file."""
|
||||
try:
|
||||
import magic
|
||||
|
||||
return magic.from_file(str(path), mime=True)
|
||||
except ImportError:
|
||||
pass
|
||||
except Exception:
|
||||
logger.debug("Zone OCR: magic failed for %s, falling back to extension", path)
|
||||
|
||||
suffix = path.suffix.lower()
|
||||
return {
|
||||
".pdf": "application/pdf",
|
||||
".png": "image/png",
|
||||
".jpg": "image/jpeg",
|
||||
".jpeg": "image/jpeg",
|
||||
".tiff": "image/tiff",
|
||||
".tif": "image/tiff",
|
||||
".webp": "image/webp",
|
||||
".bmp": "image/bmp",
|
||||
".gif": "image/gif",
|
||||
}.get(suffix)
|
||||
@@ -28,6 +28,7 @@ from documents.views import GlobalSearchView
|
||||
from documents.views import IndexView
|
||||
from documents.views import LogViewSet
|
||||
from documents.views import MergeDocumentsView
|
||||
from documents.views import OcrTemplateViewSet
|
||||
from documents.views import PostDocumentView
|
||||
from documents.views import RemoteVersionView
|
||||
from documents.views import RemovePasswordDocumentsView
|
||||
@@ -86,6 +87,7 @@ api_router.register(r"workflow_triggers", WorkflowTriggerViewSet)
|
||||
api_router.register(r"workflow_actions", WorkflowActionViewSet)
|
||||
api_router.register(r"workflows", WorkflowViewSet)
|
||||
api_router.register(r"custom_fields", CustomFieldViewSet)
|
||||
api_router.register(r"ocr_templates", OcrTemplateViewSet)
|
||||
api_router.register(r"config", ApplicationConfigurationViewSet)
|
||||
api_router.register(r"processed_mail", ProcessedMailViewSet)
|
||||
|
||||
|
||||
Reference in New Issue
Block a user