Feature: OCR Templates (#13043)

[skip ci]

Signed-off-by: dependabot[bot] <support@github.com>
Co-Authored-By: dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com>
Co-Authored-By: stumpylog <797416+stumpylog@users.noreply.github.com>
Co-Authored-By: GitHub Actions <41898282+github-actions[bot]@users.noreply.github.com>
Co-Authored-By: shamoon <4887959+shamoon@users.noreply.github.com>
This commit is contained in:
Christoph Schlaepfer
2026-06-23 16:24:37 +02:00
committed by shamoon
parent bf70e597ee
commit bf73b5b1d1
26 changed files with 4545 additions and 1 deletions
+26
View File
@@ -13,6 +13,8 @@ import { DocumentDetailComponent } from './components/document-detail/document-d
import { DocumentListComponent } from './components/document-list/document-list.component'
import { DocumentAttributesComponent } from './components/manage/document-attributes/document-attributes.component'
import { MailComponent } from './components/manage/mail/mail.component'
import { OcrTemplateEditorComponent } from './components/manage/ocr-templates/ocr-template-editor/ocr-template-editor.component'
import { OcrTemplatesComponent } from './components/manage/ocr-templates/ocr-templates.component'
import { SavedViewsComponent } from './components/manage/saved-views/saved-views.component'
import { WorkflowsComponent } from './components/manage/workflows/workflows.component'
import { NotFoundComponent } from './components/not-found/not-found.component'
@@ -274,6 +276,30 @@ export const routes: Routes = [
componentName: 'WorkflowsComponent',
},
},
{
path: 'ocr-templates',
component: OcrTemplatesComponent,
canActivate: [PermissionsGuard],
data: {
requiredPermission: {
action: PermissionAction.View,
type: PermissionType.OcrTemplate,
},
componentName: 'OcrTemplatesComponent',
},
},
{
path: 'ocr-templates/:id',
component: OcrTemplateEditorComponent,
canActivate: [PermissionsGuard],
data: {
requiredPermission: {
action: PermissionAction.Change,
type: PermissionType.OcrTemplate,
},
componentName: 'OcrTemplateEditorComponent',
},
},
{
path: 'mail',
component: MailComponent,
@@ -243,6 +243,14 @@
<i-bs class="me-2" name="boxes"></i-bs><span><ng-container i18n>Workflows</ng-container></span>
</a>
</li>
<li class="nav-item app-link"
*pngxIfPermissions="{ action: PermissionAction.View, type: PermissionType.OcrTemplate }">
<a class="nav-link" routerLink="ocr-templates" routerLinkActive="active" (click)="closeMenu()"
ngbPopover="OCR Templates" i18n-ngbPopover [disablePopover]="!slimSidebarEnabled" placement="end"
container="body" triggers="mouseenter:mouseleave" popoverClass="popover-slim">
<i-bs class="me-2" name="file-earmark-break"></i-bs><span><ng-container i18n>OCR Templates</ng-container></span>
</a>
</li>
<li class="nav-item app-link" *pngxIfPermissions="{ action: PermissionAction.View, type: PermissionType.MailAccount }"
tourAnchor="tour.mail">
<a class="nav-link" routerLink="mail" routerLinkActive="active" (click)="closeMenu()" ngbPopover="Mail"
@@ -82,6 +82,14 @@
<i-bs name="pencil" class="me-1"></i-bs><ng-container i18n>PDF Editor</ng-container>
</button>
<button ngbDropdownItem (click)="runZoneOcr()" [disabled]="!userCanEdit || !document?.document_type">
<i-bs width="1em" height="1em" name="file-earmark-ruled" class="me-1"></i-bs><span i18n>Run Zone OCR</span>
</button>
<button ngbDropdownItem (click)="createOcrTemplate()">
<i-bs width="1em" height="1em" name="file-earmark-medical" class="me-1"></i-bs><span i18n>Create OCR Template</span>
</button>
@if (userIsOwner && (requiresPassword || password)) {
<button ngbDropdownItem (click)="removePassword()" [disabled]="!password">
<i-bs name="unlock" class="me-1"></i-bs><ng-container i18n>Remove Password</ng-container>
@@ -1405,6 +1405,48 @@ export class DocumentDetailComponent
})
}
/**
 * Runs zone OCR for the open document, reports how many fields were filled
 * (and which zones produced no value) in a toast, then re-fetches the
 * document so the view reflects the new values.
 */
runZoneOcr() {
  this.documentsService.runZoneOcr(this.document.id).subscribe({
    next: (res) => {
      const results = res.results ?? []
      let msg: string
      if (results.length === 0) {
        msg = $localize`Zone OCR ran but no results extracted.`
      } else {
        // A zone counts as failed when its value is missing or only whitespace.
        const failed = results.filter(
          (r) => r.value == null || `${r.value}`.trim() === ''
        )
        const filled = results.length - failed.length
        msg = $localize`Filled ${filled} of ${results.length} fields`
        if (failed.length > 0) {
          const names = failed.map((r) => r.zone).join(', ')
          msg = `${msg}. ${$localize`Failed to match zones: ${names}`}`
        }
      }
      this.toastService.showInfo(msg)

      // Reload the document after the toast has been queued.
      this.documentsService.get(this.documentId).subscribe((doc) => {
        this.updateComponent(doc)
      })
    },
    error: (error) => {
      this.toastService.showError($localize`Zone OCR failed`, error)
    },
  })
}
/**
 * Opens the editor for a new OCR template, passing the open document's type
 * and id along as query parameters.
 */
createOcrTemplate() {
  const { document_type, id: sample_document } = this.document
  const queryParams = { document_type, sample_document }
  this.router.navigate(['/ocr-templates', 'new'], { queryParams })
}
private getSelectedNonLatestVersionId(): number | null {
const versions = this.document?.versions ?? []
if (!versions.length || !this.selectedVersionId) {
@@ -95,6 +95,9 @@
<button ngbDropdownItem (click)="mergeSelected()" [disabled]="!userCanAdd || list.allSelected || list.selectedCount < 2">
<i-bs name="journals" class="me-1"></i-bs><ng-container i18n>Merge</ng-container>
</button>
<button ngbDropdownItem (click)="runZoneOcrSelected()" [disabled]="!userCanEditAll || list.allSelected">
<i-bs name="file-earmark-ruled" class="me-1"></i-bs><ng-container i18n>Run Zone OCR</ng-container>
</button>
</div>
</div>
</div>
@@ -12,7 +12,15 @@ import {
} from '@ng-bootstrap/ng-bootstrap'
import { saveAs } from 'file-saver'
import { NgxBootstrapIconsModule } from 'ngx-bootstrap-icons'
import { first, map, Observable, Subject, switchMap, takeUntil } from 'rxjs'
import {
first,
forkJoin,
map,
Observable,
Subject,
switchMap,
takeUntil,
} from 'rxjs'
import { ConfirmDialogComponent } from 'src/app/components/common/confirm-dialog/confirm-dialog.component'
import { CustomField } from 'src/app/data/custom-field'
import { MatchingModel } from 'src/app/data/matching-model'
@@ -908,6 +916,27 @@ export class BulkEditorComponent
})
}
/**
 * Asks for confirmation and then runs zone OCR once per selected document,
 * handing the combined request to executeDocumentAction.
 */
runZoneOcrSelected() {
  const selectedIds = Array.from(this.list.selected)
  if (selectedIds.length === 0) {
    return
  }

  const modalRef = this.modalService.open(ConfirmDialogComponent, {
    backdrop: 'static',
  })
  const dialog = modalRef.componentInstance
  dialog.title = $localize`Run Zone OCR`
  dialog.messageBold = $localize`Run zone OCR on ${this.getSelectionSize()} selected document(s)?`
  dialog.message = $localize`Each document's type template (if it has one) is applied, overwriting the mapped fields.`
  dialog.btnCaption = $localize`Proceed`

  dialog.confirmClicked
    .pipe(takeUntil(this.unsubscribeNotifier))
    .subscribe(() => {
      // Lock the dialog buttons while the requests are in flight.
      dialog.buttonsEnabled = false
      const requests = selectedIds.map((id) =>
        this.documentService.runZoneOcr(id)
      )
      this.executeDocumentAction(modalRef, forkJoin(requests))
    })
}
setPermissions() {
let modal = this.modalService.open(PermissionsDialogComponent, {
backdrop: 'static',
@@ -0,0 +1,414 @@
<pngx-page-header [title]="pageTitle" [id]="template.id">
<div class="input-group input-group-sm me-5 align-items-center">
<div class="input-group-text">
<i-bs name="file-text"></i-bs>
</div>
<input
type="text"
class="form-control"
[(ngModel)]="previewDocModel"
[ngbTypeahead]="searchDocuments"
[inputFormatter]="documentFormatter"
[resultFormatter]="documentFormatter"
(selectItem)="onPreviewDocSelected($event)"
[editable]="false"
placeholder="Search documents by title..."
i18n-placeholder
/>
</div>
<div class="d-flex align-items-center flex-wrap gap-2">
<div class="input-group input-group-sm ms-2 d-none d-md-flex">
<div class="input-group-text" i18n>Page</div>
<input class="form-control flex-grow-0 w-auto" type="number" min="1" [max]="previewPageCount" [(ngModel)]="previewPageDisplay" />
<div class="input-group-text" i18n>of {{previewPageCount}}</div>
</div>
<button type="button" class="btn btn-sm btn-outline-secondary" i18n-title title="Previous" (click)="prevPage()" [disabled]="!pageImageUrl || previewPage <= 0">
<i-bs width="1.2em" height="1.2em" name="arrow-left"></i-bs>
</button>
<button type="button" class="btn btn-sm btn-outline-secondary" i18n-title title="Next" (click)="nextPage()" [disabled]="!pageImageUrl || previewPage >= (previewPageCount ?? 1) - 1">
<i-bs width="1.2em" height="1.2em" name="arrow-right"></i-bs>
</button>
<div class="input-group input-group-sm">
<button class="btn btn-outline-secondary" (click)="zoomOut()" i18n>-</button>
<span class="input-group-text">{{ zoom * 100 | number: '1.0-0' }}%</span>
<button class="btn btn-outline-secondary" (click)="zoomIn()" i18n>+</button>
</div>
</div>
</pngx-page-header>
<div class="row">
<div class="col-md-4">
<div class="btn-toolbar mb-1 border-bottom">
<div class="btn-group pb-3">
<a routerLink="/ocr-templates" class="btn btn-sm btn-outline-secondary">
<i-bs width="1.2em" height="1.2em" name="x"></i-bs>
<span class="ms-1" i18n>Close</span>
</a>
</div>
<div class="btn-group ms-auto pb-3">
<button class="btn btn-sm btn-primary" (click)="save()" [disabled]="saving">
@if (saving) {
<span class="spinner-border spinner-border-sm me-1"></span>
}
<span i18n>Save</span>
</button>
</div>
</div>
<ul ngbNav #nav="ngbNav" [(activeId)]="activeTab" class="nav-underline flex-nowrap flex-md-wrap overflow-auto">
<li ngbNavItem="settings">
<a ngbNavLink i18n>Settings</a>
<ng-template ngbNavContent>
<div class="row mb-3">
<div class="col-9">
<pngx-input-text [(ngModel)]="template.name" title="Template name" i18n-title></pngx-input-text>
</div>
<div class="col-3">
<pngx-input-switch [(ngModel)]="template.enabled" title="Enabled" i18n-title></pngx-input-switch>
</div>
</div>
<pngx-input-select [(ngModel)]="template.document_type" [items]="documentTypes" bindLabel="name" bindValue="id" title="Document type" i18n-title></pngx-input-select>
<small class="text-muted" i18n>
Draw rectangles on the preview to define extraction zones. Use the
page controls above the preview to add zones on different pages.
</small>
</ng-template>
</li>
<li ngbNavItem="zones">
<a ngbNavLink><ng-container i18n>Zones</ng-container> <span class="badge bg-primary ms-2">{{ template.zones.length }}</span></a>
<ng-template ngbNavContent>
@if (template.zones.length === 0) {
<p class="text-muted" i18n>
No zones defined. Load a document preview and draw rectangles to add zones.
</p>
}
<div class="list-group">
@for (zone of template.zones; track $index; let i = $index) {
<div
class="list-group-item list-group-item-action d-flex justify-content-between align-items-center"
[style.box-shadow]="selectedZoneIndex === i ? 'inset 3px 0 0 0 var(--bs-primary)' : null"
>
<div class="flex-grow-1" role="button" style="cursor: pointer;" (click)="selectZone(i)">
<div><strong [class.text-primary]="selectedZoneIndex === i">{{ zone.name }}</strong></div>
<div class="small text-muted">
{{ getZoneTargetName(zone) }} - {{ zone.width }}x{{ zone.height }}px <ng-container i18n>p.</ng-container>{{ zonePage(zone) }}
</div>
</div>
<div class="btn-group">
<button class="btn btn-sm btn-outline-secondary" type="button" (click)="selectZone(i)" title="Edit" i18n-title>
<i-bs name="pencil"></i-bs>
</button>
<button class="btn btn-sm btn-outline-danger" type="button" (click)="removeZone(i)" title="Delete" i18n-title>
<i-bs name="trash"></i-bs>
</button>
</div>
</div>
}
</div>
</ng-template>
</li>
<li ngbNavItem="zone">
<a ngbNavLink i18n>Zone</a>
<ng-template ngbNavContent>
@if (selectedZone; as zone) {
<div class="d-flex justify-content-between align-items-center mb-3">
<strong>{{ zone.name }}</strong>
<div class="d-flex gap-2">
<button class="btn btn-sm btn-primary" (click)="save()" [disabled]="saving">
@if (saving) {
<span class="spinner-border spinner-border-sm me-1"></span>
}
<span i18n>Save</span>
</button>
<button class="btn btn-sm btn-outline-danger" (click)="deleteSelectedZone()">
<i-bs name="trash" class="me-1"></i-bs><ng-container i18n>Delete zone</ng-container>
</button>
</div>
</div>
<div class="mb-3">
<label class="form-label" i18n>Zone Name</label>
<input
type="text"
class="form-control"
[(ngModel)]="zone.name"
(ngModelChange)="redrawCanvas()"
/>
</div>
<div class="mb-3">
<label class="form-label" i18n>Page</label>
<input
type="number"
class="form-control"
[(ngModel)]="zone.page"
min="-1"
(ngModelChange)="redrawCanvas()"
/>
<small class="text-muted" i18n>Page this zone is on. Use -1 for the last page. Set automatically when you draw it.</small>
</div>
<div class="mb-3">
<label class="form-label" i18n>Field</label>
<div class="input-group">
<select class="form-select" [ngModel]="zoneFieldValue(zone)" (ngModelChange)="setZoneField(zone, $event)">
<optgroup label="Built-in fields" i18n-label>
@for (t of builtinTargets; track t.id) {
<option [ngValue]="t.id">{{ t.name }}</option>
}
</optgroup>
<optgroup label="Custom fields" i18n-label>
@for (cf of customFields; track cf.id) {
<option [ngValue]="cf.id">{{ cf.name }} ({{ cf.data_type }})</option>
}
</optgroup>
</select>
<button
class="btn btn-outline-secondary"
type="button"
(click)="openQuickCreate(selectedZoneIndex)"
title="Create new custom field"
i18n-title
>
<i-bs name="plus"></i-bs>
</button>
</div>
<small class="text-muted" i18n>Write the extracted value to a custom field, or to a built-in field (Title, ASN, Date created).</small>
</div>
@if (isFieldShared(zone)) {
<div class="card mb-3 border-info">
<div class="card-body">
<h6 class="card-title d-flex align-items-center gap-2">
<i-bs name="braces"></i-bs>
<span i18n>Combine zones into this field</span>
</h6>
<p class="small text-muted mb-2" i18n>
More than one zone writes to this field. Build the combined
value below: click a zone to insert its token, and type any
separators or literal text between tokens.
</p>
<div class="d-flex flex-wrap gap-1 mb-2">
@for (z of zonesForField(zone); track $index) {
<button
type="button"
class="btn btn-sm btn-outline-info"
(click)="insertCombineToken(zone, z)"
title="Insert token"
i18n-title
>
+ {{ z.name || 'Zone' }}
</button>
}
</div>
<input
type="text"
class="form-control font-monospace"
[ngModel]="getCombineFormat(zone)"
(ngModelChange)="setCombineFormat(zone, $event)"
placeholder="{Zone 1} - {Zone 2}"
/>
<small class="text-muted" i18n>
Tokens are matched by zone name. An empty zone leaves its
token blank and the stray separator is trimmed. Leave empty
to just join the zones in order with a space.
</small>
</div>
</div>
}
@if (showQuickCreate) {
<div class="card mb-3 border-primary">
<div class="card-body">
<h6 class="card-title" i18n>Create Custom Field</h6>
<div class="mb-2">
<label class="form-label small" i18n>Field Name</label>
<input type="text" class="form-control form-control-sm"
[(ngModel)]="quickCreateName" placeholder="e.g. Invoice Number" />
</div>
<div class="mb-2">
<label class="form-label small" i18n>Field Type</label>
<select class="form-select form-select-sm" [(ngModel)]="quickCreateType">
@for (t of quickCreateTypes; track t.id) {
<option [ngValue]="t.id">{{ t.name }}</option>
}
</select>
</div>
<div class="d-flex gap-2">
<button class="btn btn-primary btn-sm" (click)="submitQuickCreate()"
[disabled]="!quickCreateName.trim()" i18n>
Create & Assign
</button>
<button class="btn btn-outline-secondary btn-sm" (click)="cancelQuickCreate()" i18n>
Cancel
</button>
</div>
</div>
</div>
}
<div class="mb-3">
<label class="form-label" i18n>OCR Language</label>
<ng-select
[items]="ocrLanguageOptions"
bindLabel="name"
bindValue="id"
[multiple]="true"
[closeOnSelect]="false"
[ngModel]="ocrLanguageArray(zone)"
(ngModelChange)="setOcrLanguages(zone, $event)"
placeholder="Select languages"
i18n-placeholder
></ng-select>
</div>
<div class="mb-3">
<label class="form-label" i18n>Transform</label>
<select class="form-select" [(ngModel)]="zone.transform">
@for (opt of transformOptions; track opt.id) {
<option [ngValue]="opt.id">{{ opt.name }}</option>
}
</select>
</div>
@if (zone.transform === 'date') {
<div class="mb-3">
<label class="form-label" i18n>Date format</label>
<select class="form-select" [ngModel]="dateFormatChoice(zone)" (ngModelChange)="setDateFormatChoice(zone, $event)">
@for (opt of dateFormatOptions; track opt.id) {
<option [ngValue]="opt.id">{{ opt.name }}</option>
}
<option [ngValue]="'custom'" i18n>Custom...</option>
</select>
@if (dateFormatCustom) {
<div class="input-group mt-2">
<input type="text" class="form-control font-monospace" [(ngModel)]="zone.date_format" placeholder="%d.%m.%Y" />
<button class="btn btn-outline-secondary" type="button" [ngbPopover]="dateFmtHelp" [autoClose]="true" title="Date format help" i18n-title>
<i-bs name="question-circle"></i-bs>
</button>
</div>
<ng-template #dateFmtHelp>
<p class="mb-1" i18n>Python date codes:</p>
<ul class="mb-1 ps-3">
<li><code>%d</code> <ng-container i18n>day (01-31)</ng-container></li>
<li><code>%m</code> <ng-container i18n>month (01-12)</ng-container></li>
<li><code>%Y</code> <ng-container i18n>year, 4-digit</ng-container></li>
<li><code>%y</code> <ng-container i18n>year, 2-digit</ng-container></li>
<li><code>%b</code> <ng-container i18n>month name (Jan)</ng-container></li>
</ul>
<span i18n>Example:</span> <code>%d.%m.%Y</code> -> 03.03.2026
</ng-template>
}
</div>
}
<div class="mb-3">
<label class="form-label" i18n>Validation Regex</label>
<input
type="text"
class="form-control font-monospace"
[(ngModel)]="zone.validation_regex"
placeholder="e.g. \d{2}\.\d{2}\.\d{4}"
>
</div>
<div class="text-muted small">
{{ zone.x }}, {{ zone.y }} - {{ zone.width }}x{{ zone.height }}px
</div>
<hr class="my-3" />
<h6 i18n>Test</h6>
@if (!previewDocId) {
<!-- The preview document is chosen with the search box in the page header, not in the Settings tab. -->
<p class="text-muted small mb-0" i18n>
Select a preview document with the search box in the page header to test this zone.
</p>
} @else {
<button class="btn btn-sm btn-outline-secondary" (click)="testZone()" [disabled]="zoneTesting">
@if (zoneTesting) {
<span class="spinner-border spinner-border-sm me-1"></span>
}
<span i18n>Test this zone</span>
</button>
@if (zoneTestResult) {
@if (zoneTestResult.error) {
<div class="alert alert-warning py-2 mt-2 mb-0 small">{{ zoneTestResult.error }}</div>
} @else {
<dl class="row small mt-2 mb-0">
<dt class="col-sm-4" i18n>OCR text</dt>
<dd class="col-sm-8"><code>{{ zoneTestResult.raw_text || '(nothing detected)' }}</code></dd>
<dt class="col-sm-4" i18n>Value</dt>
<dd class="col-sm-8"><code>{{ zoneTestResult.value || '(empty)' }}</code></dd>
@if (zoneTestResult.regex) {
<dt class="col-sm-4" i18n>Validation</dt>
<dd class="col-sm-8">
@if (zoneTestResult.regex_match) {
<span class="badge bg-success" i18n>Regex matches</span>
} @else {
<span class="badge bg-danger" i18n>Regex does not match</span>
}
</dd>
}
</dl>
}
}
}
} @else {
<p class="text-muted" i18n>
Select a zone from the Zones tab, or draw a rectangle on the document to create one.
</p>
}
</ng-template>
</li>
</ul>
<div [ngbNavOutlet]="nav" class="mt-3"></div>
</div>
<!-- Right column: Document preview with zone overlay -->
<div class="col-md-8">
@if (pageImageUrl) {
<div class="border" style="overflow: auto; max-height: 78vh;">
<div class="position-relative d-inline-block" [style.width.%]="zoom * 100">
<img
#pageImage
[src]="pageImageUrl"
(load)="onImageLoad()"
style="width: 100%; display: block;"
[style.visibility]="imageLoaded ? 'visible' : 'hidden'"
crossorigin="use-credentials"
/>
@if (imageLoaded) {
<canvas
#zoneCanvas
class="position-absolute top-0 start-0"
style="width: 100%; height: 100%; cursor: crosshair;"
(mousedown)="onCanvasMouseDown($event)"
(mousemove)="onCanvasMouseMove($event)"
(mouseup)="onCanvasMouseUp($event)"
></canvas>
}
@if (!imageLoaded) {
<div class="d-flex justify-content-center p-5">
<div class="spinner-border" role="status">
<span class="visually-hidden" i18n>Loading page...</span>
</div>
</div>
}
</div>
</div>
} @else {
<div class="border rounded p-5 text-center text-muted">
<i-bs name="file-earmark-image" width="48" height="48"></i-bs>
<!-- Empty state: there is no ID field or "Load" button; documents are picked via the header search. -->
<p class="mt-3" i18n>
Search for a document by title in the page header to preview a page and draw extraction zones.
</p>
</div>
}
</div>
</div>
@@ -0,0 +1,3 @@
:host {
display: block;
}
@@ -0,0 +1,997 @@
import { CommonModule } from '@angular/common'
import {
AfterViewInit,
Component,
ElementRef,
inject,
OnDestroy,
OnInit,
ViewChild,
} from '@angular/core'
import { FormsModule } from '@angular/forms'
import { ActivatedRoute, Router, RouterModule } from '@angular/router'
import {
NgbNavModule,
NgbPopoverModule,
NgbTypeaheadModule,
NgbTypeaheadSelectItemEvent,
} from '@ng-bootstrap/ng-bootstrap'
import { NgSelectModule } from '@ng-select/ng-select'
import { NgxBootstrapIconsModule } from 'ngx-bootstrap-icons'
import {
catchError,
debounceTime,
distinctUntilChanged,
map,
Observable,
of,
Subject,
switchMap,
takeUntil,
} from 'rxjs'
import { SelectComponent } from 'src/app/components/common/input/select/select.component'
import { SwitchComponent } from 'src/app/components/common/input/switch/switch.component'
import { TextComponent } from 'src/app/components/common/input/text/text.component'
import { PageHeaderComponent } from 'src/app/components/common/page-header/page-header.component'
import { CustomField } from 'src/app/data/custom-field'
import { Document } from 'src/app/data/document'
import { DocumentType } from 'src/app/data/document-type'
import {
DATE_FORMAT_OPTIONS,
OCR_BUILTIN_TARGETS,
OCR_LANGUAGE_OPTIONS,
OcrTemplate,
OcrTemplateZone,
OcrZoneTestResult,
TRANSFORM_OPTIONS,
ZoneTestRequest,
} from 'src/app/data/ocr-template'
import { CorrespondentService } from 'src/app/services/rest/correspondent.service'
import { CustomFieldsService } from 'src/app/services/rest/custom-fields.service'
import { DocumentTypeService } from 'src/app/services/rest/document-type.service'
import { DocumentService } from 'src/app/services/rest/document.service'
import { OcrTemplateService } from 'src/app/services/rest/ocr-template.service'
import { ToastService } from 'src/app/services/toast.service'
/**
 * Rubber-band rectangle for a zone that is being drawn. The start point is
 * where the mouse went down and the end point follows the pointer; both are
 * offsets from the canvas' bounding rectangle.
 */
interface DrawingRect {
startX: number
startY: number
endX: number
endY: number
}
/** Edge or corner of a selected zone that can be grabbed, named by compass direction. */
type ResizeHandle = 'n' | 's' | 'e' | 'w' | 'ne' | 'nw' | 'se' | 'sw'
/** Ids of the tabs in the editor's left column (matches the ngbNavItem ids in the template). */
type ActiveTab = 'settings' | 'zones' | 'zone'
@Component({
selector: 'pngx-ocr-template-editor',
standalone: true,
imports: [
PageHeaderComponent,
TextComponent,
SelectComponent,
SwitchComponent,
CommonModule,
FormsModule,
RouterModule,
NgbNavModule,
NgbPopoverModule,
NgbTypeaheadModule,
NgSelectModule,
NgxBootstrapIconsModule,
],
templateUrl: './ocr-template-editor.component.html',
styleUrls: ['./ocr-template-editor.component.scss'],
})
export class OcrTemplateEditorComponent
implements OnInit, OnDestroy, AfterViewInit
{
// Injected collaborators.
private readonly route = inject(ActivatedRoute)
private readonly router = inject(Router)
private readonly templateService = inject(OcrTemplateService)
private readonly customFieldsService = inject(CustomFieldsService)
private readonly documentTypeService = inject(DocumentTypeService)
private readonly correspondentService = inject(CorrespondentService)
private readonly documentService = inject(DocumentService)
private readonly toastService = inject(ToastService)
// Notifier passed to takeUntil() on this component's subscriptions.
// NOTE(review): presumably emitted in ngOnDestroy, which is outside this view - confirm.
private readonly destroy$ = new Subject<void>()
// Overlay canvas for zones; the template renders it only while imageLoaded is true.
@ViewChild('zoneCanvas') canvasRef: ElementRef<HTMLCanvasElement>
// The <img> element that shows the current preview page.
@ViewChild('pageImage') imageRef: ElementRef<HTMLImageElement>
// Template being edited. Starts as an empty, enabled template; ngOnInit
// replaces it with the fetched one when the route carries an existing id.
template: OcrTemplate = {
id: null,
name: '',
document_type: null,
sample_document: null,
source_width: 0,
source_height: 0,
enabled: true,
combine_formats: {},
zones: [],
}
// Lookup lists filled in ngOnInit.
customFields: CustomField[] = []
documentTypes: DocumentType[] = []
// Static option lists exposed to the template.
transformOptions = TRANSFORM_OPTIONS
builtinTargets = OCR_BUILTIN_TARGETS
dateFormatOptions = DATE_FORMAT_OPTIONS
ocrLanguageOptions = OCR_LANGUAGE_OPTIONS
// When true the template shows the free-text date format input.
dateFormatCustom = false
// True unless ngOnInit finds an existing template id in the route.
isNew = true
// Drives the disabled state and spinner of the Save buttons.
saving = false
// Id of the document whose pages are previewed, or null when none is chosen.
previewDocId: number | null = null
// Zero-based index of the previewed page (shown one-based in the header).
previewPage = 0
// Page count of the preview document, or null while unknown.
previewPageCount: number | null = null
// Document id for which the page count was last requested.
private pageCountForDoc: number | null = null
// URL of the rendered page image, or null when there is nothing to show.
pageImageUrl: string | null = null
// Set by onImageLoad once the current page image has finished loading.
imageLoaded = false
// Zoom factor of the preview; 1 means 100% of the container width.
zoom = 1
// Model of the document typeahead: the selected document or the typed text.
previewDocModel: Document | string = ''
// Correspondent id -> name, used when formatting typeahead entries.
private correspondentNames = new Map<number, string>()
/** One-based number of the previewed page, as shown in the header's page input. */
public get previewPageDisplay(): number {
  return this.previewPage + 1
}

/**
 * Jumps to the given one-based page number.
 *
 * Delegates to goToPage so the value is clamped to the valid range and the
 * preview image is reloaded; previously only the index changed (and could
 * become -1), leaving the old page on screen while new zones were attributed
 * to the new page number.
 *
 * @param value one-based page number coming from the number input
 */
public set previewPageDisplay(value: number) {
  // A cleared number input is expected to deliver null; ignore it (and any
  // other non-numeric value) rather than jumping to another page.
  if (value == null) return
  const requested = Number(value)
  if (!Number.isFinite(requested)) return
  this.goToPage(Math.trunc(requested) - 1)
}
// Currently shown tab of the left column (bound to the nav's activeId).
activeTab: ActiveTab = 'settings'
// Rubber-band state while a new zone is being drawn.
isDrawing = false
currentRect: DrawingRect | null = null
// Index into template.zones of the selected zone, or null for none.
selectedZoneIndex: number | null = null
// Resize-drag state: which handle of which zone is being dragged.
isResizing = false
resizeHandle: ResizeHandle | null = null
resizeZoneIndex: number | null = null
// Size used for hit-testing resize handles.
// NOTE(review): presumably canvas pixels - confirm against findHandleAt/redrawCanvas.
private readonly HANDLE_SIZE = 8
// Move-drag state: the dragged zone plus the mouse position and zone origin
// recorded when the drag started.
isMoving = false
moveZoneIndex: number | null = null
private moveStart = { mouseX: 0, mouseY: 0, zoneX: 0, zoneY: 0 }
// Result of the last "Test this zone" run and its in-progress flag.
zoneTestResult: OcrZoneTestResult | null = null
zoneTesting = false
// State of the inline "Create Custom Field" card.
showQuickCreate = false
quickCreateName = ''
quickCreateType = 'string'
quickCreateForZoneIndex: number | null = null
// Field types offered by the quick-create card.
quickCreateTypes = [
{ id: 'string', name: $localize`String` },
{ id: 'integer', name: $localize`Integer` },
{ id: 'float', name: $localize`Float` },
{ id: 'date', name: $localize`Date` },
{ id: 'monetary', name: $localize`Monetary` },
{ id: 'boolean', name: $localize`Boolean` },
{ id: 'url', name: $localize`URL` },
{ id: 'longtext', name: $localize`Long Text` },
]
/** The zone at selectedZoneIndex, or null when nothing (or a stale index) is selected. */
get selectedZone(): OcrTemplateZone | null {
  if (this.selectedZoneIndex === null) {
    return null
  }
  return this.template.zones[this.selectedZoneIndex] ?? null
}
/** Page header title, depending on whether a new or an existing template is edited. */
get pageTitle(): string {
  if (this.isNew) {
    return $localize`New OCR Template`
  }
  return $localize`Edit OCR Template`
}
/**
 * Loads the lookup lists (custom fields, document types, correspondents) and
 * then either fetches the template named by the `id` route parameter or, for
 * a new template, seeds the model from the `document_type` and
 * `sample_document` query parameters.
 */
ngOnInit() {
  this.customFieldsService
    .listAll()
    .pipe(takeUntil(this.destroy$))
    .subscribe((r) => (this.customFields = r.results))
  this.documentTypeService
    .listAll()
    .pipe(takeUntil(this.destroy$))
    .subscribe((r) => (this.documentTypes = r.results))
  this.correspondentService
    .listAll()
    .pipe(takeUntil(this.destroy$))
    .subscribe((r) => {
      this.correspondentNames = new Map(r.results.map((c) => [c.id, c.name]))
    })

  const id = this.route.snapshot.paramMap.get('id')
  if (id && id !== 'new') {
    this.isNew = false
    this.templateService
      .get(parseInt(id, 10))
      .pipe(takeUntil(this.destroy$))
      .subscribe({
        next: (t) => {
          this.template = t
          this.template.combine_formats ??= {}
          if (t.sample_document) {
            this.previewDocId = t.sample_document
            this.loadPreview()
          }
        },
        // Surface load failures (unknown or malformed id, network error)
        // instead of silently leaving an empty editor behind.
        error: (error) => {
          this.toastService.showError(
            $localize`Error loading OCR template`,
            error
          )
        },
      })
    return
  }

  // New template: accept only query parameters that parse as numbers so NaN
  // never ends up in the template model.
  const qp = this.route.snapshot.queryParams
  const documentType = parseInt(qp['document_type'], 10)
  if (!Number.isNaN(documentType)) {
    this.template.document_type = documentType
  }
  const sampleDocument = parseInt(qp['sample_document'], 10)
  if (!Number.isNaN(sampleDocument)) {
    this.template.sample_document = sampleDocument
    this.previewDocId = sampleDocument
    this.loadPreview()
  }
}
// No-op: present only because the class declares AfterViewInit.
ngAfterViewInit() {}
/**
 * Typeahead source for the preview document picker: debounces the typed
 * text, then searches documents by title (restricted to the template's
 * document type when one is set). Terms shorter than two characters and
 * failed requests both yield an empty list.
 */
searchDocuments = (text$: Observable<string>): Observable<Document[]> => {
  const lookup = (term: string): Observable<Document[]> => {
    if (!term || term.trim().length < 2) {
      return of([])
    }
    const params: { title__icontains: string; document_type__id?: number } =
      { title__icontains: term.trim() }
    if (this.template.document_type) {
      params['document_type__id'] = this.template.document_type
    }
    const request$ = this.documentService.list(1, 10, 'created', true, params)
    return request$.pipe(
      map((page) => page.results),
      catchError(() => of([]))
    )
  }

  return text$.pipe(
    debounceTime(250),
    distinctUntilChanged(),
    switchMap(lookup)
  )
}
/**
 * Renders a typeahead entry as "#<id> <title>", followed by the
 * correspondent's name in parentheses when it is known. Plain strings are
 * returned unchanged.
 */
documentFormatter = (doc: Document | string): string => {
  if (typeof doc === 'string') {
    return doc
  }
  const label = `#${doc.id} ${doc.title}`
  const correspondentName = doc.correspondent
    ? this.correspondentNames.get(doc.correspondent)
    : null
  if (correspondentName) {
    return `${label} (${correspondentName})`
  }
  return label
}
/**
 * Handles a typeahead selection: makes the chosen document the preview
 * document, adopts its document type if the template has none yet, and shows
 * its first page.
 */
onPreviewDocSelected(event: NgbTypeaheadSelectItemEvent<Document>) {
  // The model is assigned manually below, so suppress the default update.
  event.preventDefault()

  const chosen: Document = event.item
  this.previewDocModel = chosen
  this.previewDocId = chosen.id

  const templateHasType = !!this.template.document_type
  if (!templateHasType && chosen.document_type) {
    this.template.document_type = chosen.document_type
  }

  this.previewPage = 0
  this.loadPreview()
}
/** Forgets the preview document and resets all paging and image state. */
clearPreviewDoc() {
  // Document selection
  this.previewDocId = null
  this.previewDocModel = ''

  // Paging
  this.previewPage = 0
  this.previewPageCount = null
  this.pageCountForDoc = null

  // Image
  this.imageLoaded = false
  this.pageImageUrl = null
}
/**
 * Points the preview image at the current page of the preview document and,
 * the first time a document is shown, looks up its page count.
 *
 * Does nothing when no preview document is set.
 */
loadPreview() {
  const docId = this.previewDocId
  if (!docId) return

  if (this.pageCountForDoc !== docId) {
    this.pageCountForDoc = docId
    this.previewPageCount = null
    this.documentService
      .get(docId)
      .pipe(takeUntil(this.destroy$))
      .subscribe({
        next: (doc) => {
          // A different document may have been picked while this request was
          // in flight; its page count must not be overwritten.
          if (this.previewDocId !== docId) return
          this.previewPageCount = doc?.page_count ?? null
          if (doc && !this.previewDocModel) this.previewDocModel = doc
        },
        error: () => {
          if (this.previewDocId !== docId) return
          this.previewPageCount = null
          // Forget the failed lookup so the next call retries it.
          this.pageCountForDoc = null
        },
      })
  }

  this.pageImageUrl = this.templateService.getPageImageUrl(
    docId,
    this.previewPage
  )
  // Hide the image until its load event fires.
  this.imageLoaded = false
}
/**
 * Shows the given zero-based page, clamped to the known page range. When the
 * page count is unknown only the lower bound is enforced. No reload happens
 * if the page does not change.
 */
goToPage(page: number) {
  const lastIndex = this.previewPageCount ? this.previewPageCount - 1 : page
  const target = Math.max(0, Math.min(page, lastIndex))
  if (target !== this.previewPage) {
    this.previewPage = target
    this.loadPreview()
  }
}
/** Steps the preview back by one page. */
prevPage() {
  const previous = this.previewPage - 1
  this.goToPage(previous)
}
/** Steps the preview forward by one page. */
nextPage() {
  const next = this.previewPage + 1
  this.goToPage(next)
}
/** Increases the zoom by 0.25 (rounded to two decimals), capped at 4. */
zoomIn() {
  const stepped = Math.round((this.zoom + 0.25) * 100) / 100
  this.zoom = Math.min(4, stepped)
  this.afterZoom()
}
/** Decreases the zoom by 0.25 (rounded to two decimals), never below 0.5. */
zoomOut() {
  const stepped = Math.round((this.zoom - 0.25) * 100) / 100
  this.zoom = Math.max(0.5, stepped)
  this.afterZoom()
}
/** Restores the default zoom factor of 1. */
resetZoom() {
  const defaultZoom = 1
  this.zoom = defaultZoom
  this.afterZoom()
}
/** Schedules a canvas redraw after a zoom change. */
private afterZoom() {
  // Run on a later tick so the wrapper has taken its new width before the
  // canvas is resized and repainted.
  setTimeout(() => {
    this.redrawCanvas()
  })
}
/**
 * Resolves a zone's page to a one-based page number. A missing page counts
 * as 1; -1 means the last page (falling back to the currently shown page
 * while the page count is unknown); anything else below 1 is treated as 1.
 */
zonePage(zone: OcrTemplateZone): number {
  const page = zone.page ?? 1
  if (page === -1) {
    return this.previewPageCount ?? this.previewPage + 1
  }
  if (page >= 1) {
    return page
  }
  return 1
}
/** Whether the zone belongs to the page currently shown in the preview. */
private isOnCurrentPage(zone: OcrTemplateZone): boolean {
  const shownPage = this.previewPage + 1
  return this.zonePage(zone) === shownPage
}
/**
 * Load handler of the page image: records the image's natural size as the
 * template's source size and triggers the first canvas draw.
 */
onImageLoad() {
  this.imageLoaded = true
  const { naturalWidth, naturalHeight } = this.imageRef.nativeElement
  this.template.source_width = naturalWidth
  this.template.source_height = naturalHeight
  // The canvas is rendered only once imageLoaded is true, so draw on a later
  // tick when it exists.
  setTimeout(() => {
    this.redrawCanvas()
  })
}
/**
 * Starts one of three interactions, in priority order: resizing the selected
 * zone (a handle was hit), moving a zone (a zone was hit without Shift), or
 * drawing a new zone.
 */
onCanvasMouseDown(event: MouseEvent) {
  const bounds = this.canvasRef.nativeElement.getBoundingClientRect()
  const px = event.clientX - bounds.left
  const py = event.clientY - bounds.top

  // 1. Resize: only the currently selected zone exposes handles.
  const selected = this.selectedZoneIndex
  const grabbed = selected !== null ? this.findHandleAt(px, py, selected) : null
  if (grabbed) {
    this.isResizing = true
    this.resizeHandle = grabbed
    this.resizeZoneIndex = selected
    return
  }

  // 2. Move: plain click on an existing zone selects it and starts a drag.
  const hitIndex = this.findZoneAt(px, py)
  if (hitIndex !== null && !event.shiftKey) {
    this.selectZone(hitIndex)
    const { x: zoneX, y: zoneY } = this.template.zones[hitIndex]
    this.isMoving = true
    this.moveZoneIndex = hitIndex
    this.moveStart = { mouseX: px, mouseY: py, zoneX, zoneY }
    return
  }

  // 3. Draw: Shift+click or a click on empty space begins a new rectangle.
  this.isDrawing = true
  this.currentRect = { startX: px, startY: py, endX: px, endY: py }
  this.selectedZoneIndex = null
}
/**
 * Continues the active interaction (resize, move or draw) and repaints the
 * canvas; when idle it only updates the cursor to hint at what a click would
 * do.
 */
onCanvasMouseMove(event: MouseEvent) {
  const canvas = this.canvasRef.nativeElement
  const bounds = canvas.getBoundingClientRect()
  const mx = event.clientX - bounds.left
  const my = event.clientY - bounds.top

  if (this.isResizing && this.resizeZoneIndex !== null && this.resizeHandle) {
    this.applyResize(mx, my)
    this.redrawCanvas()
    return
  }

  if (this.isMoving && this.moveZoneIndex !== null) {
    const zone = this.template.zones[this.moveZoneIndex]
    const img = this.imageRef.nativeElement
    // Zones are stored in the coordinate space they were drawn in; fall back
    // to the image's natural size when a zone carries no source size.
    const srcW = zone.zone_source_width || img.naturalWidth
    const srcH = zone.zone_source_height || img.naturalHeight
    const scaleX = srcW / canvas.width
    const scaleY = srcH / canvas.height
    const dx = Math.round((mx - this.moveStart.mouseX) * scaleX)
    const dy = Math.round((my - this.moveStart.mouseY) * scaleY)
    // Keep the zone inside the source bounds.
    const maxX = srcW - zone.width
    const maxY = srcH - zone.height
    zone.x = Math.max(0, Math.min(this.moveStart.zoneX + dx, maxX))
    zone.y = Math.max(0, Math.min(this.moveStart.zoneY + dy, maxY))
    this.redrawCanvas()
    return
  }

  if (this.isDrawing && this.currentRect) {
    this.currentRect.endX = mx
    this.currentRect.endY = my
    this.redrawCanvas()
    return
  }

  // Idle: a resize handle wins over a zone body, which wins over empty space.
  if (this.selectedZoneIndex !== null) {
    const handle = this.findHandleAt(mx, my, this.selectedZoneIndex)
    if (handle) {
      // Every handle name maps to the CSS cursor "<handle>-resize".
      canvas.style.cursor = `${handle}-resize`
      return
    }
  }
  const overZone = this.findZoneAt(mx, my) !== null
  canvas.style.cursor = overZone ? 'move' : 'crosshair'
}
/**
 * Ends the active interaction. Finishing a move or resize just clears the
 * drag state; finishing a draw converts the rubber-band rectangle to image
 * coordinates and appends it as a new, selected zone (rectangles smaller
 * than 10px in either dimension are discarded).
 */
onCanvasMouseUp(event: MouseEvent) {
  if (this.isMoving) {
    this.isMoving = false
    this.moveZoneIndex = null
    return
  }
  if (this.isResizing) {
    this.isResizing = false
    this.resizeHandle = null
    this.resizeZoneIndex = null
    return
  }
  if (!this.isDrawing || !this.currentRect) {
    return
  }
  this.isDrawing = false

  const canvas = this.canvasRef.nativeElement
  const img = this.imageRef.nativeElement
  const scaleX = img.naturalWidth / canvas.width
  const scaleY = img.naturalHeight / canvas.height
  const { startX, startY, endX, endY } = this.currentRect

  // Normalise so the rectangle is valid whichever direction it was dragged.
  const x = Math.round(Math.min(startX, endX) * scaleX)
  const y = Math.round(Math.min(startY, endY) * scaleY)
  const w = Math.round(Math.abs(endX - startX) * scaleX)
  const h = Math.round(Math.abs(endY - startY) * scaleY)

  // Treat very small rectangles as accidental clicks.
  if (w < 10 || h < 10) {
    this.currentRect = null
    this.redrawCanvas()
    return
  }

  const newIndex = this.template.zones.length
  const firstField = this.customFields.length > 0 ? this.customFields[0].id : null
  const zone: OcrTemplateZone = {
    name: `Zone ${newIndex + 1}`,
    target: 'custom_field',
    custom_field: firstField,
    x,
    y,
    width: w,
    height: h,
    page: this.previewPage + 1,
    ocr_language: 'deu+eng',
    transform: 'strip',
    date_format: '',
    validation_regex: '',
    order: newIndex,
    zone_source_width: img.naturalWidth,
    zone_source_height: img.naturalHeight,
  }
  this.template.zones.push(zone)
  this.currentRect = null
  this.selectZone(newIndex)
}
/**
 * Computes where a zone appears on the canvas, in display pixels.
 *
 * @param zoneIdx Index into `template.zones`.
 * @returns The scaled rectangle, or `null` when the canvas/image is not
 *   available, the image has no natural width yet, the index is invalid, or
 *   the zone is not on the page currently shown.
 */
private getZoneDisplayRect(
  zoneIdx: number
): { x: number; y: number; w: number; h: number } | null {
  const canvas = this.canvasRef?.nativeElement
  const img = this.imageRef?.nativeElement
  if (!(canvas && img && img.naturalWidth)) return null

  const zone = this.template.zones[zoneIdx]
  if (!zone || !this.isOnCurrentPage(zone)) return null

  // Source-image px -> display px, using the size the zone was drawn against.
  const kx = canvas.width / (zone.zone_source_width || img.naturalWidth)
  const ky = canvas.height / (zone.zone_source_height || img.naturalHeight)
  const x = zone.x * kx
  const y = zone.y * ky
  const w = zone.width * kx
  const h = zone.height * ky
  return { x, y, w, h }
}
/**
 * Finds the resize handle of a zone under the given canvas position.
 *
 * @param mx Pointer x in display pixels, relative to the canvas.
 * @param my Pointer y in display pixels, relative to the canvas.
 * @param zoneIdx Index of the zone whose handles are tested.
 * @returns The first handle within `HANDLE_SIZE` of the pointer on both
 *   axes, or `null` when none matches or the zone has no display rectangle.
 */
private findHandleAt(
  mx: number,
  my: number,
  zoneIdx: number
): ResizeHandle | null {
  const r = this.getZoneDisplayRect(zoneIdx)
  if (!r) return null

  const reach = this.HANDLE_SIZE
  const midX = r.x + r.w / 2
  const midY = r.y + r.h / 2
  const right = r.x + r.w
  const bottom = r.y + r.h
  // Order matters: the first anchor within reach wins.
  const anchors: [ResizeHandle, number, number][] = [
    ['nw', r.x, r.y],
    ['n', midX, r.y],
    ['ne', right, r.y],
    ['w', r.x, midY],
    ['e', right, midY],
    ['sw', r.x, bottom],
    ['s', midX, bottom],
    ['se', right, bottom],
  ]
  const hit = anchors.find(
    ([, ax, ay]) => Math.abs(mx - ax) <= reach && Math.abs(my - ay) <= reach
  )
  return hit ? hit[0] : null
}
/**
 * Resizes the zone being dragged by `resizeHandle` so the dragged edge(s)
 * follow the pointer.
 *
 * Coordinates are converted from display pixels to the zone's source-image
 * pixels. Each dragged edge is kept inside the source image and the zone is
 * never made smaller than 10 source pixels per axis.
 *
 * @param mx Pointer x in display pixels, relative to the canvas.
 * @param my Pointer y in display pixels, relative to the canvas.
 */
private applyResize(mx: number, my: number) {
  const handle = this.resizeHandle
  // Nothing to do without an active handle/zone (also avoids calling
  // `.includes` on a null handle).
  if (!handle || this.resizeZoneIndex === null) return
  const zone = this.template.zones[this.resizeZoneIndex]
  if (!zone) return

  const canvas = this.canvasRef.nativeElement
  const img = this.imageRef.nativeElement
  const srcW = zone.zone_source_width || img.naturalWidth
  const srcH = zone.zone_source_height || img.naturalHeight
  const scaleX = srcW / canvas.width
  const scaleY = srcH / canvas.height
  const imgX = Math.round(mx * scaleX)
  const imgY = Math.round(my * scaleY)
  const minSize = 10

  if (handle.includes('w')) {
    // Move the left edge; the right edge stays where it was.
    const right = zone.x + zone.width
    zone.x = Math.max(0, Math.min(imgX, right - minSize))
    zone.width = right - zone.x
  }
  if (handle.includes('e')) {
    // Right edge follows the pointer but may not leave the source image.
    zone.width = Math.max(minSize, Math.min(imgX, srcW) - zone.x)
  }
  if (handle.includes('n')) {
    // Move the top edge; the bottom edge stays where it was.
    const bottom = zone.y + zone.height
    zone.y = Math.max(0, Math.min(imgY, bottom - minSize))
    zone.height = bottom - zone.y
  }
  if (handle.includes('s')) {
    // Bottom edge follows the pointer but may not leave the source image.
    zone.height = Math.max(minSize, Math.min(imgY, srcH) - zone.y)
  }
}
/**
 * Returns the index of the top-most zone under a canvas position.
 *
 * Zones are tested from last to first so that a later zone wins when zones
 * overlap. The display geometry comes from `getZoneDisplayRect`, which
 * already skips zones on other pages and returns `null` while the preview
 * image is not available.
 *
 * @param displayX Pointer x in display pixels, relative to the canvas.
 * @param displayY Pointer y in display pixels, relative to the canvas.
 * @returns The zone index, or `null` when no visible zone contains the point.
 */
private findZoneAt(displayX: number, displayY: number): number | null {
  for (let i = this.template.zones.length - 1; i >= 0; i--) {
    const rect = this.getZoneDisplayRect(i)
    if (!rect) continue
    const insideX = displayX >= rect.x && displayX <= rect.x + rect.w
    const insideY = displayY >= rect.y && displayY <= rect.y + rect.h
    if (insideX && insideY) return i
  }
  return null
}
/**
 * Repaints the overlay canvas: every zone on the current page (outline,
 * translucent fill and a name label), resize handles for the selected zone,
 * and the dashed rubber-band rectangle while a new zone is being drawn.
 *
 * The canvas is resized to the displayed image size on every call. Does
 * nothing when the canvas/image references are missing or no 2D context can
 * be obtained.
 */
redrawCanvas() {
  if (!this.canvasRef || !this.imageRef) return
  const canvas = this.canvasRef.nativeElement
  const img = this.imageRef.nativeElement
  const ctx = canvas.getContext('2d')
  // Match the overlay to the displayed image; hit-testing and scaling in
  // the other handlers rely on canvas.width/height.
  canvas.width = img.clientWidth
  canvas.height = img.clientHeight
  // getContext() may return null; skip painting instead of throwing.
  if (!ctx) return
  ctx.clearRect(0, 0, canvas.width, canvas.height)
  // Zone colours cycle by zone index.
  const colors = [
    '#4f8ff7',
    '#ff6b6b',
    '#51cf66',
    '#ffd43b',
    '#cc5de8',
    '#ff922b',
    '#20c997',
    '#e599f7',
  ]
  this.template.zones.forEach((zone, idx) => {
    if (!this.isOnCurrentPage(zone)) return
    const color = colors[idx % colors.length]
    // Source-image px -> display px.
    const srcW = zone.zone_source_width || img.naturalWidth
    const srcH = zone.zone_source_height || img.naturalHeight
    const scaleX = canvas.width / srcW
    const scaleY = canvas.height / srcH
    const x = zone.x * scaleX
    const y = zone.y * scaleY
    const w = zone.width * scaleX
    const h = zone.height * scaleY
    // Outline (thicker when selected) plus a translucent fill.
    ctx.strokeStyle = color
    ctx.lineWidth = idx === this.selectedZoneIndex ? 3 : 2
    ctx.strokeRect(x, y, w, h)
    // '20' is appended as the alpha byte of an 8-digit hex colour.
    ctx.fillStyle = color + '20'
    ctx.fillRect(x, y, w, h)
    // Name label: a rounded pill above the zone, kept inside the canvas top.
    const label = zone.name || `Zone ${idx + 1}`
    ctx.font = '12px sans-serif'
    ctx.textBaseline = 'middle'
    const padX = 6
    const pillH = 17
    const pillW = ctx.measureText(label).width + padX * 2
    const pillX = x
    const pillY = Math.max(0, y - pillH - 2)
    const r = 4
    ctx.fillStyle = color
    ctx.beginPath()
    ctx.moveTo(pillX + r, pillY)
    ctx.arcTo(pillX + pillW, pillY, pillX + pillW, pillY + pillH, r)
    ctx.arcTo(pillX + pillW, pillY + pillH, pillX, pillY + pillH, r)
    ctx.arcTo(pillX, pillY + pillH, pillX, pillY, r)
    ctx.arcTo(pillX, pillY, pillX + pillW, pillY, r)
    ctx.closePath()
    ctx.fill()
    ctx.fillStyle = '#ffffff'
    ctx.fillText(label, pillX + padX, pillY + pillH / 2 + 0.5)
    ctx.textBaseline = 'alphabetic'
    if (idx === this.selectedZoneIndex) {
      // Eight square handles: corners and edge midpoints.
      const hs = this.HANDLE_SIZE
      ctx.fillStyle = color
      const handles = [
        [x, y],
        [x + w / 2, y],
        [x + w, y],
        [x, y + h / 2],
        [x + w, y + h / 2],
        [x, y + h],
        [x + w / 2, y + h],
        [x + w, y + h],
      ]
      for (const [hx, hy] of handles) {
        ctx.fillRect(hx - hs / 2, hy - hs / 2, hs, hs)
      }
    }
  })
  if (this.currentRect) {
    // Rubber-band rectangle of the zone currently being drawn (display px).
    const cw = this.currentRect.endX - this.currentRect.startX
    const ch = this.currentRect.endY - this.currentRect.startY
    ctx.fillStyle = 'rgba(105, 219, 124, 0.25)'
    ctx.fillRect(this.currentRect.startX, this.currentRect.startY, cw, ch)
    ctx.strokeStyle = '#69db7c'
    ctx.lineWidth = 2
    ctx.setLineDash([5, 5])
    ctx.strokeRect(this.currentRect.startX, this.currentRect.startY, cw, ch)
    ctx.setLineDash([])
  }
}
/**
 * Removes the zone at `index` and keeps the selection pointing at the same
 * zone: the selection is cleared when the selected zone itself was removed,
 * and shifted down by one when a zone before it was removed.
 *
 * An index outside the zone list is ignored (a negative index would
 * otherwise make `splice` remove a zone counted from the end).
 *
 * @param index Position of the zone in `template.zones`.
 */
removeZone(index: number) {
  const inRange =
    Number.isInteger(index) && index >= 0 && index < this.template.zones.length
  if (!inRange) return

  this.template.zones.splice(index, 1)

  const selected = this.selectedZoneIndex
  if (selected != null) {
    if (selected === index) {
      this.selectedZoneIndex = null
    } else if (selected > index) {
      this.selectedZoneIndex = selected - 1
    }
  }
  this.redrawCanvas()
}
/**
 * Makes the zone at `index` the selected zone, switches to the zone tab and
 * discards the previous test result. When the zone exists, the custom
 * date-format flag is derived from it, a default combine format is seeded,
 * and the preview jumps to the zone's page. The canvas is repainted at the
 * end.
 */
selectZone(index: number) {
  this.zoneTestResult = null
  this.activeTab = 'zone'
  this.selectedZoneIndex = index

  const picked = this.template.zones[index]
  if (picked) {
    // "Custom" means a non-empty format that is not one of the presets.
    const matchesPreset = this.dateFormatOptions.some(
      (option) => option.id === picked.date_format
    )
    this.dateFormatCustom = !!picked.date_format && !matchesPreset
    this.seedCombineDefault(picked)
    this.goToPage(this.zonePage(picked) - 1)
  }
  this.redrawCanvas()
}
/**
 * Sends the selected zone to the backend test endpoint for the preview
 * document and stores the outcome in `zoneTestResult`. Does nothing when no
 * zone is selected or no preview document is set. `zoneTesting` is true
 * while the request is in flight.
 */
testZone() {
  const zone = this.selectedZone
  if (!zone || !this.previewDocId) return

  this.zoneTestResult = null
  this.zoneTesting = true

  const {
    name,
    x,
    y,
    width,
    height,
    ocr_language,
    transform,
    date_format,
    validation_regex,
    zone_source_width,
    zone_source_height,
  } = zone
  const payload: ZoneTestRequest = {
    name,
    x,
    y,
    width,
    height,
    // A zone without an explicit page is tested against page 1.
    page: zone.page ?? 1,
    ocr_language,
    transform,
    date_format,
    validation_regex,
    zone_source_width,
    zone_source_height,
  }

  this.templateService
    .testZone(this.previewDocId, payload)
    .pipe(takeUntil(this.destroy$))
    .subscribe({
      next: (outcome) => {
        this.zoneTestResult = outcome
        this.zoneTesting = false
      },
      error: (failure) => {
        const message = failure.error?.error || $localize`Test failed`
        this.zoneTestResult = { error: message }
        this.zoneTesting = false
      },
    })
}
/** Removes the currently selected zone, if any, and returns to the zones tab. */
deleteSelectedZone() {
  const selected = this.selectedZoneIndex
  if (selected !== null) {
    this.removeZone(selected)
    this.activeTab = 'zones'
  }
}
/**
 * Persists the template: creates it when new, updates it otherwise.
 *
 * Before sending, unused combine formats are pruned and the preview
 * document is stored as the sample document. On success the server's copy
 * replaces the local template while the zone selection is preserved.
 */
save() {
  this.saving = true
  this.pruneCombineFormats()
  this.template.sample_document = this.previewDocId

  const request$ = this.isNew
    ? this.templateService.create(this.template)
    : this.templateService.update(this.template)

  request$.pipe(takeUntil(this.destroy$)).subscribe({
    next: (persisted) => {
      // Remember the selection so it survives swapping in the saved copy.
      const keepSelected = this.selectedZoneIndex
      this.template = persisted
      this.isNew = false
      this.selectedZoneIndex = keepSelected
      this.saving = false
      this.toastService.showInfo($localize`OCR template saved.`)
      this.redrawCanvas()
    },
    error: (failure) => {
      this.saving = false
      this.toastService.showError(
        $localize`Error saving OCR template.`,
        failure
      )
    },
  })
}
/**
 * Per-zone cache for `ocrLanguageArray`: remembers the `ocr_language`
 * string that was split (`src`) and the resulting array (`arr`), so the
 * same array instance is returned while the string is unchanged.
 */
private ocrLangCache = new WeakMap<
OcrTemplateZone,
{ src: string; arr: string[] }
>()
/**
 * Returns the zone's `+`-separated `ocr_language` string as an array of
 * language codes. The array is cached per zone and reused until the string
 * changes.
 */
ocrLanguageArray(zone: OcrTemplateZone): string[] {
  const raw = zone.ocr_language || ''
  const hit = this.ocrLangCache.get(zone)
  if (hit?.src === raw) return hit.arr

  // Splitting '' yields [''], which the filter reduces to [].
  const codes = raw.split('+').filter((code) => Boolean(code))
  this.ocrLangCache.set(zone, { src: raw, arr: codes })
  return codes
}
/**
 * Stores the given language codes on the zone as a `+`-joined string and
 * refreshes the per-zone cache with a copy of the list. A missing list is
 * treated as empty.
 */
setOcrLanguages(zone: OcrTemplateZone, langs: string[]) {
  const chosen = langs || []
  const joined = chosen.join('+')
  zone.ocr_language = joined
  this.ocrLangCache.set(zone, { src: joined, arr: [...chosen] })
}
/** Name of the loaded custom field with this id, or `Field #<id>` when it is not in the list. */
getCustomFieldName(id: number): string {
  const match = this.customFields.find((field) => field.id === id)
  if (match) return match.name
  return `Field #${id}`
}
/**
 * Value bound to the field select: the custom-field id when the zone targets
 * a custom field (the default when no target is set), otherwise the built-in
 * target id string.
 */
zoneFieldValue(zone: OcrTemplateZone): number | string | null {
  const target = zone.target || 'custom_field'
  if (target === 'custom_field') return zone.custom_field
  return target
}
/**
 * Applies a value chosen in the field select to the zone.
 *
 * A built-in id (`title`, `asn`, `created`) becomes the zone's target and
 * clears the custom field. Anything else targets a custom field, using the
 * value as its id when it is a number and `null` otherwise. Afterwards a
 * default combine format is seeded if the field is shared.
 */
setZoneField(zone: OcrTemplateZone, value: number | string) {
  switch (value) {
    case 'title':
    case 'asn':
    case 'created':
      zone.target = value
      zone.custom_field = null
      break
    default:
      zone.target = 'custom_field'
      zone.custom_field = typeof value === 'number' ? value : null
  }
  this.seedCombineDefault(zone)
}
/**
 * String key identifying the field a zone writes to (the custom-field id or
 * the built-in target id), or `null` when the zone has no field assigned.
 */
fieldKeyFor(zone: OcrTemplateZone): string | null {
  const bound = this.zoneFieldValue(zone)
  if (bound == null || bound === '') return null
  return String(bound)
}
/** All template zones (including `zone`) that write to the same field as `zone`; empty when it has no field. */
zonesForField(zone: OcrTemplateZone): OcrTemplateZone[] {
  const wanted = this.fieldKeyFor(zone)
  return wanted
    ? this.template.zones.filter(
        (candidate) => this.fieldKeyFor(candidate) === wanted
      )
    : []
}
/** Whether at least one other zone writes to the same field as `zone`. */
isFieldShared(zone: OcrTemplateZone): boolean {
  const sameField = this.zonesForField(zone)
  return sameField.length >= 2
}
/** Combine format stored for the zone's field, or `''` when there is none or the zone has no field. */
getCombineFormat(zone: OcrTemplateZone): string {
  const key = this.fieldKeyFor(zone)
  if (!key) return ''
  return this.template.combine_formats?.[key] || ''
}
/** Stores `value` as the combine format for the zone's field; ignored when the zone has no field. */
setCombineFormat(zone: OcrTemplateZone, value: string) {
  const key = this.fieldKeyFor(zone)
  if (key) {
    // Create the map lazily on first write.
    const formats = (this.template.combine_formats ??= {})
    formats[key] = value
  }
}
/**
 * Appends a `{Zone Name}` token for `tokenZone` to the combine format of
 * `zone`'s field, separated by a space unless the format is empty or
 * already ends with one.
 */
insertCombineToken(zone: OcrTemplateZone, tokenZone: OcrTemplateZone) {
  const existing = this.getCombineFormat(zone)
  const needsSpace = Boolean(existing) && !existing.endsWith(' ')
  const token = '{' + tokenZone.name + '}'
  this.setCombineFormat(zone, existing + (needsSpace ? ' ' : '') + token)
}
/**
 * Gives a shared field a default combine format when it has none yet: the
 * `{Zone Name}` tokens of all zones writing to that field, joined by
 * spaces. Does nothing for zones without a field or fields used by a single
 * zone.
 */
private seedCombineDefault(zone: OcrTemplateZone) {
  const key = this.fieldKeyFor(zone)
  if (!key) return

  const siblings = this.zonesForField(zone)
  if (siblings.length <= 1) return

  const formats = (this.template.combine_formats ??= {})
  if (formats[key]) return
  formats[key] = siblings.map((z) => '{' + z.name + '}').join(' ')
}
/**
 * Drops combine formats whose field is used by at most one zone, since
 * there is nothing to combine for them.
 */
private pruneCombineFormats() {
  const formats = this.template.combine_formats
  if (!formats) return

  // How many zones write to each field key.
  const usage = new Map<string, number>()
  this.template.zones.forEach((z) => {
    const key = this.fieldKeyFor(z)
    if (!key) return
    usage.set(key, (usage.get(key) ?? 0) + 1)
  })

  Object.keys(formats)
    .filter((key) => (usage.get(key) ?? 0) <= 1)
    .forEach((key) => {
      delete formats[key]
    })
}
/**
 * Value bound to the date-format select: `'custom'` while the custom flag
 * is set, otherwise the zone's format (a preset, or `''` for auto).
 */
dateFormatChoice(zone: OcrTemplateZone): string {
  return this.dateFormatCustom ? 'custom' : zone.date_format || ''
}
/**
 * Applies a choice from the date-format select. `'custom'` only raises the
 * custom flag and leaves the zone's format untouched; any other value
 * clears the flag and becomes the zone's format.
 */
setDateFormatChoice(zone: OcrTemplateZone, value: string) {
  const wantsCustom = value === 'custom'
  this.dateFormatCustom = wantsCustom
  if (!wantsCustom) {
    zone.date_format = value
  }
}
/**
 * Display name of the field a zone writes to: the custom field's name, a
 * localized placeholder when no custom field is chosen, or the built-in
 * target's name (falling back to the raw target id).
 */
getZoneTargetName(zone: OcrTemplateZone): string {
  const target = zone.target || 'custom_field'
  if (target !== 'custom_field') {
    const builtin = this.builtinTargets.find((entry) => entry.id === target)
    return builtin?.name ?? target
  }
  if (zone.custom_field) {
    return this.getCustomFieldName(zone.custom_field)
  }
  return $localize`(no field)`
}
/** Name of the loaded document type with this id, or `Type #<id>` when it is not in the list. */
getDocumentTypeName(id: number): string {
  const match = this.documentTypes.find((candidate) => candidate.id === id)
  if (match) return match.name
  return `Type #${id}`
}
/**
 * Opens the quick-create form for a new custom field on behalf of the given
 * zone, pre-filling the field name with the zone's name and the type with
 * `'string'`. Ignored when no zone index is given.
 */
openQuickCreate(zoneIndex: number | null) {
  if (zoneIndex === null) return
  const zone = this.template.zones[zoneIndex]
  this.quickCreateForZoneIndex = zoneIndex
  this.quickCreateName = zone?.name || ''
  this.quickCreateType = 'string'
  this.showQuickCreate = true
}
/** Closes the quick-create form and forgets which zone it was opened for. */
cancelQuickCreate() {
  this.quickCreateForZoneIndex = null
  this.showQuickCreate = false
}
/**
 * Creates a custom field from the quick-create form, reloads the custom
 * field list and assigns the new field to the zone the form was opened for.
 *
 * Does nothing when the name is blank. Failures are reported through the
 * toast service, like the other errors in this component, instead of a
 * blocking browser `alert()`.
 */
submitQuickCreate() {
  const name = this.quickCreateName.trim()
  if (!name) return

  this.templateService
    .quickCreateField(name, this.quickCreateType)
    .pipe(takeUntil(this.destroy$))
    .subscribe({
      next: (result) => {
        // The cached list does not contain the new field yet.
        this.customFieldsService.clearCache()
        this.customFieldsService
          .listAll()
          .pipe(takeUntil(this.destroy$))
          .subscribe((r) => {
            this.customFields = r.results
            // The zone may have been removed while the request was running.
            const zone =
              this.quickCreateForZoneIndex !== null
                ? this.template.zones[this.quickCreateForZoneIndex]
                : undefined
            if (zone) {
              zone.custom_field = result.id
              zone.target = 'custom_field'
            }
            this.showQuickCreate = false
            this.quickCreateForZoneIndex = null
          })
      },
      error: (err) => {
        this.toastService.showError(
          err.error?.error || $localize`Failed to create custom field`,
          err
        )
      },
    })
}
/**
 * Emits on and completes `destroy$`, which ends every subscription in this
 * component that is piped through `takeUntil(this.destroy$)`.
 */
ngOnDestroy() {
this.destroy$.next()
this.destroy$.complete()
}
}
@@ -0,0 +1,75 @@
<!-- OCR template list: page header with a create button, then one row per template. -->
<pngx-page-header
title="OCR Templates"
i18n-title
info="Define extraction zones on document types to automatically populate custom fields via OCR."
i18n-info
>
<!-- Only rendered for users with the Add permission on OCR templates. -->
<button type="button" class="btn btn-sm btn-outline-primary" (click)="createTemplate()" *pngxIfPermissions="{ action: PermissionAction.Add, type: PermissionType.OcrTemplate }">
<i-bs name="plus-circle" class="me-1"></i-bs><ng-container i18n>Create Template</ng-container>
</button>
</pngx-page-header>
<ul class="list-group">
<!-- Column headings; document type and zone count are hidden below the sm breakpoint. -->
<li class="list-group-item">
<div class="row">
<div class="col" i18n>Name</div>
<div class="col d-none d-sm-flex" i18n>Document Type</div>
<div class="col d-none d-sm-flex" i18n>Zones</div>
<div class="col" i18n>Status</div>
<div class="col" i18n>Actions</div>
</div>
</li>
<!-- Spinner row, shown only while loading with no templates to display yet. -->
@if (loading && templates.length === 0) {
<li class="list-group-item">
<div class="spinner-border spinner-border-sm me-2" role="status"></div>
<ng-container i18n>Loading...</ng-container>
</li>
}
@for (t of templates; track t.id) {
<li class="list-group-item">
<div class="row fade" [class.show]="show">
<!-- The name opens the editor; disabled without the Change permission. -->
<div class="col d-flex align-items-center"><button class="btn btn-link p-0 text-start" type="button" (click)="editTemplate(t)" [disabled]="!permissionsService.currentUserCan(PermissionAction.Change, PermissionType.OcrTemplate)">{{t.name}}</button></div>
<div class="col d-flex align-items-center d-none d-sm-flex">{{getDocumentTypeName(t)}}</div>
<div class="col d-flex align-items-center d-none d-sm-flex"><code>{{t.zones?.length || 0}}</code></div>
<!-- Enabled switch: ngModel flips t.enabled, then (change) persists it via toggleTemplate. -->
<div class="col d-flex align-items-center">
<div class="form-check form-switch mb-0">
<input type="checkbox" class="form-check-input cursor-pointer" [id]="t.id+'_enable'" [(ngModel)]="t.enabled" (change)="toggleTemplate(t)" *pngxIfPermissions="{ action: PermissionAction.Change, type: PermissionType.OcrTemplate }">
<label class="form-check-label cursor-pointer" [for]="t.id+'_enable'">
<code> @if(t.enabled) { <ng-container i18n>Enabled</ng-container> } @else { <span i18n class="text-muted">Disabled</span> }</code>
</label>
</div>
</div>
<div class="col">
<!-- Below the sm breakpoint: actions collapsed into a dropdown menu. -->
<div class="btn-group d-block d-sm-none">
<div ngbDropdown container="body" class="d-inline-block">
<button type="button" class="btn btn-link" id="actionsMenuMobile{{t.id}}" (click)="$event.stopPropagation()" ngbDropdownToggle>
<i-bs name="three-dots-vertical"></i-bs>
</button>
<div ngbDropdownMenu aria-labelledby="actionsMenuMobile{{t.id}}">
<button (click)="editTemplate(t)" *pngxIfPermissions="{ action: PermissionAction.Change, type: PermissionType.OcrTemplate }" ngbDropdownItem i18n>Edit</button>
<button (click)="deleteTemplate(t)" *pngxIfPermissions="{ action: PermissionAction.Delete, type: PermissionType.OcrTemplate }" ngbDropdownItem i18n>Delete</button>
</div>
</div>
</div>
<!-- From the sm breakpoint up: inline Edit / Delete buttons. -->
<div class="btn-toolbar d-none d-sm-flex gap-2" role="toolbar">
<div class="btn-group">
<button *pngxIfPermissions="{ action: PermissionAction.Change, type: PermissionType.OcrTemplate }" class="btn btn-sm btn-outline-secondary" type="button" (click)="editTemplate(t)">
<i-bs width="1em" height="1em" name="pencil" class="me-1"></i-bs><ng-container i18n>Edit</ng-container>
</button>
<button *pngxIfPermissions="{ action: PermissionAction.Delete, type: PermissionType.OcrTemplate }" class="btn btn-sm btn-outline-danger" type="button" (click)="deleteTemplate(t)">
<i-bs width="1em" height="1em" name="trash" class="me-1"></i-bs><ng-container i18n>Delete</ng-container>
</button>
</div>
</div>
</div>
</div>
</li>
}
<!-- Empty state once loading has finished. -->
@if (!loading && templates.length === 0) {
<li class="list-group-item" [class.show]="show" i18n>No OCR templates defined.</li>
}
</ul>
@@ -0,0 +1,98 @@
import { Component, OnInit, inject } from '@angular/core'
import { FormsModule } from '@angular/forms'
import { Router } from '@angular/router'
import { NgbDropdownModule, NgbModal } from '@ng-bootstrap/ng-bootstrap'
import { NgxBootstrapIconsModule } from 'ngx-bootstrap-icons'
import { delay, takeUntil, tap } from 'rxjs'
import { OcrTemplate } from 'src/app/data/ocr-template'
import { IfPermissionsDirective } from 'src/app/directives/if-permissions.directive'
import { PermissionsService } from 'src/app/services/permissions.service'
import { DocumentTypeService } from 'src/app/services/rest/document-type.service'
import { OcrTemplateService } from 'src/app/services/rest/ocr-template.service'
import { ConfirmDialogComponent } from '../../common/confirm-dialog/confirm-dialog.component'
import { PageHeaderComponent } from '../../common/page-header/page-header.component'
import { LoadingComponentWithPermissions } from '../../loading-component/loading.component'
/**
 * List page for OCR templates: shows every template with its document type,
 * zone count and enabled state, and lets permitted users create, edit,
 * enable/disable and delete templates.
 */
@Component({
  selector: 'pngx-ocr-templates',
  templateUrl: './ocr-templates.component.html',
  imports: [
    PageHeaderComponent,
    IfPermissionsDirective,
    FormsModule,
    NgbDropdownModule,
    NgxBootstrapIconsModule,
  ],
})
export class OcrTemplatesComponent
  extends LoadingComponentWithPermissions
  implements OnInit
{
  private readonly service = inject(OcrTemplateService)
  private readonly documentTypeService = inject(DocumentTypeService)
  private readonly router = inject(Router)
  private readonly modalService = inject(NgbModal)
  permissionsService = inject(PermissionsService)

  /** Templates currently shown in the list. */
  public templates: OcrTemplate[] = []
  /** Document type id -> name, used to label each template's type. */
  private documentTypeNames: Map<number, string> = new Map()

  /** Loads the document type names and the template list. */
  ngOnInit() {
    this.documentTypeService
      .listAll()
      .pipe(takeUntil(this.unsubscribeNotifier))
      .subscribe((r) => {
        this.documentTypeNames = new Map(
          r.results.map((dt) => [dt.id, dt.name])
        )
      })
    this.reload()
  }

  /**
   * Fetches all templates. The list is replaced as soon as the response
   * arrives; `show` is set and `loading` is cleared 100 ms later.
   */
  reload() {
    this.loading = true
    this.service
      .listAll()
      .pipe(
        takeUntil(this.unsubscribeNotifier),
        tap((r) => (this.templates = r.results)),
        delay(100)
      )
      .subscribe(() => {
        this.show = true
        this.loading = false
      })
  }

  /**
   * Name of the template's document type, falling back to the raw id (or an
   * empty string when the template has none) while the name is unknown.
   */
  getDocumentTypeName(t: OcrTemplate): string {
    return (
      this.documentTypeNames.get(t.document_type) ?? `${t.document_type ?? ''}`
    )
  }

  /** Navigates to the editor for a new template. */
  createTemplate() {
    this.router.navigate(['/ocr-templates', 'new'])
  }

  /** Navigates to the editor for an existing template. */
  editTemplate(t: OcrTemplate) {
    this.router.navigate(['/ocr-templates', t.id])
  }

  /**
   * Persists the enabled state after the switch was toggled. If the request
   * fails the local flag is flipped back so the switch keeps showing the
   * state that is actually stored on the server.
   */
  toggleTemplate(t: OcrTemplate) {
    // ngModel has already flipped t.enabled — just persist it.
    this.service.patch(t).subscribe({
      error: () => {
        t.enabled = !t.enabled
      },
    })
  }

  /** Asks for confirmation, then deletes the template and reloads the list. */
  deleteTemplate(t: OcrTemplate) {
    const modal = this.modalService.open(ConfirmDialogComponent)
    modal.componentInstance.title = $localize`Delete OCR Template`
    modal.componentInstance.messageBoldPart = t.name
    modal.componentInstance.message = $localize`Do you really want to delete this OCR template?`
    modal.componentInstance.btnClass = 'btn-danger'
    modal.componentInstance.btnCaption = $localize`Delete`
    modal.componentInstance.confirmClicked.subscribe(() => {
      modal.close()
      this.service.delete(t).subscribe(() => this.reload())
    })
  }
}
+102
View File
@@ -0,0 +1,102 @@
import { ObjectWithId } from './object-with-id'
/** Where a zone's extracted value is written: a custom field or a built-in document field. */
export type OcrZoneTarget = 'custom_field' | 'title' | 'asn' | 'created'
/** Built-in document fields a zone can target, with localized display names. */
export const OCR_BUILTIN_TARGETS = [
{ id: 'title', name: $localize`Title` },
{ id: 'asn', name: $localize`Archive serial number` },
{ id: 'created', name: $localize`Date created` },
]
/** One rectangular extraction zone of an OCR template. */
export interface OcrTemplateZone {
id?: number
/** Descriptive zone name; also used in `{Zone Name}` combine-format tokens. */
name: string
target?: OcrZoneTarget
/** Target custom field id; only used when `target` is `'custom_field'`. */
custom_field: number | null
/** Page number (1 = first, -1 = last). */
page?: number
/** Left edge (px). */
x: number
/** Top edge (px). */
y: number
/** Zone width (px). */
width: number
/** Zone height (px). */
height: number
/** Tesseract language code(s) joined with `+`, e.g. `deu+eng`. */
ocr_language: string
/** Id of one of `TRANSFORM_OPTIONS`. */
transform: string
/** strptime-style format for the date transform; blank means auto-detect. */
date_format?: string
/** Optional regex the extracted text must match to be accepted. */
validation_regex: string
order: number
/** Width of the page image this zone was drawn on (px). */
zone_source_width?: number
/** Height of the page image this zone was drawn on (px). */
zone_source_height?: number
}
/** Transforms that can be applied to a zone's extracted text, with localized names. */
export const TRANSFORM_OPTIONS = [
{ id: 'none', name: $localize`None` },
{ id: 'strip', name: $localize`Strip whitespace` },
{ id: 'uppercase', name: $localize`Uppercase` },
{ id: 'lowercase', name: $localize`Lowercase` },
{ id: 'numeric', name: $localize`Numeric only` },
{
id: 'strip_punctuation',
name: $localize`Remove leading/trailing punctuation`,
},
{ id: 'date', name: $localize`Parse date` },
{ id: 'qr_code', name: $localize`Read QR/barcode` },
]
/** OCR languages offered for selection; ids are the codes stored in `ocr_language`. */
export const OCR_LANGUAGE_OPTIONS = [
{ id: 'eng', name: $localize`English` },
{ id: 'deu', name: $localize`German` },
{ id: 'fra', name: $localize`French` },
{ id: 'ita', name: $localize`Italian` },
{ id: 'spa', name: $localize`Spanish` },
{ id: 'por', name: $localize`Portuguese` },
{ id: 'nld', name: $localize`Dutch` },
]
/** Preset date formats; the empty id stands for auto-detection. */
export const DATE_FORMAT_OPTIONS = [
{ id: '', name: $localize`Auto-detect` },
{ id: '%d.%m.%Y', name: 'DD.MM.YYYY' },
{ id: '%Y/%m/%d', name: 'YYYY/MM/DD' },
{ id: '%d/%m/%Y', name: 'DD/MM/YYYY' },
]
/** A set of extraction zones defined for one document type. */
export interface OcrTemplate extends ObjectWithId {
name: string
/** Id of the document type this template applies to. */
document_type: number
/** Id of the document used for previewing zones in the editor. */
sample_document: number | null
/** Width of the image the zones were drawn on (px). */
source_width: number
/** Height of the image the zones were drawn on (px). */
source_height: number
enabled: boolean
/**
 * Format strings for combining several zones into one field, keyed by
 * target (custom field id, or 'title'/'asn'/'created'). Tokens like
 * `{Zone Name}` stand for that zone's value.
 */
combine_formats?: Record<string, string>
created?: string
updated?: string
zones: OcrTemplateZone[]
}
/** Zone definition sent to the test-zone endpoint. */
export interface ZoneTestRequest {
name: string
x: number
y: number
width: number
height: number
page: number
ocr_language: string
transform: string
date_format?: string
validation_regex: string
zone_source_width?: number
zone_source_height?: number
}
/** Response of the test-zone endpoint. NOTE(review): field meanings are inferred from their names; confirm against the backend. */
export interface OcrZoneTestResult {
raw_text?: string | null
value?: string | null
regex?: string
regex_match?: boolean | null
error?: string
}
/** One entry of the `results` list returned by the run-zone-ocr document action. */
export interface OcrZoneRunResult {
template: string
zone: string
custom_field: string
value: string | number | null
}
@@ -28,6 +28,7 @@ export enum PermissionType {
ShareLink = '%s_sharelink',
CustomField = '%s_customfield',
Workflow = '%s_workflow',
OcrTemplate = '%s_ocrtemplate',
ProcessedMail = '%s_processedmail',
GlobalStatistics = '%s_global_statistics',
SystemMonitoring = '%s_system_monitoring',
@@ -12,6 +12,7 @@ import {
import { DocumentMetadata } from 'src/app/data/document-metadata'
import { DocumentSuggestions } from 'src/app/data/document-suggestions'
import { FilterRule } from 'src/app/data/filter-rule'
import { OcrZoneRunResult } from 'src/app/data/ocr-template'
import { Results, SelectionData } from 'src/app/data/results'
import { SETTINGS_KEYS } from 'src/app/data/ui-settings'
import { queryParamsFromFilterRules } from '../../utils/query-params'
@@ -355,6 +356,13 @@ export class DocumentService extends AbstractPaperlessService<Document> {
})
}
/**
 * Triggers the `run-zone-ocr` action for a document.
 *
 * @param id Document id.
 * @returns An observable of the response, whose `results` list holds one
 *   entry per reported zone result.
 */
runZoneOcr(id: number): Observable<{ results: OcrZoneRunResult[] }> {
  const url = this.getResourceUrl(id, 'run-zone-ocr')
  // The action takes no parameters, so an empty body is posted.
  const emptyBody = {}
  return this.http.post<{ results: OcrZoneRunResult[] }>(url, emptyBody)
}
rotateDocuments(
selection: DocumentSelectionQuery,
degrees: number,
@@ -0,0 +1,47 @@
import { Injectable } from '@angular/core'
import { Observable } from 'rxjs'
import {
OcrTemplate,
OcrZoneTestResult,
ZoneTestRequest,
} from '../../data/ocr-template'
import { AbstractPaperlessService } from './abstract-paperless-service'
/**
 * Response of the quick-create-field endpoint.
 * NOTE(review): `created` presumably tells whether a new field was created
 * rather than an existing one returned — confirm against the backend.
 */
export interface QuickCreateFieldResult {
id: number
name: string
data_type: string
created: boolean
}
/**
 * REST client for OCR templates (`ocr_templates` resource) plus the extra
 * endpoints used by the template editor.
 */
@Injectable({ providedIn: 'root' })
export class OcrTemplateService extends AbstractPaperlessService<OcrTemplate> {
  constructor() {
    super()
    this.resourceName = 'ocr_templates'
  }

  /** URL of the rendered image for one page of a document. */
  getPageImageUrl(docId: number, page: number): string {
    const root = `${this.baseUrl}${this.resourceName}`
    // The trailing empty segment produces the closing slash.
    return [root, 'document-page-image', docId, page, ''].join('/')
  }

  /**
   * Posts a zone definition to the test-zone endpoint for the given document.
   *
   * @param docId Id of the document to test against.
   * @param zone Zone definition to test.
   */
  testZone(
    docId: number,
    zone: ZoneTestRequest
  ): Observable<OcrZoneTestResult> {
    const url = `${this.baseUrl}${this.resourceName}/test-zone/`
    const body = { document: docId, zone }
    return this.http.post<OcrZoneTestResult>(url, body)
  }

  /**
   * Posts a name and data type to the quick-create-field endpoint.
   *
   * @param name Name of the custom field.
   * @param dataType Data type, sent as `data_type`.
   */
  quickCreateField(
    name: string,
    dataType: string
  ): Observable<QuickCreateFieldResult> {
    const url = `${this.baseUrl}${this.resourceName}/quick-create-field/`
    const body = { name, data_type: dataType }
    return this.http.post<QuickCreateFieldResult>(url, body)
  }
}
+6
View File
@@ -79,13 +79,16 @@ import {
exclamationTriangleFill,
eye,
fileEarmark,
fileEarmarkBreak,
fileEarmarkCheck,
fileEarmarkDiff,
fileEarmarkFill,
fileEarmarkLock,
fileEarmarkMedical,
fileEarmarkMinus,
fileEarmarkPlus,
fileEarmarkRichtext,
fileEarmarkRuled,
fileText,
files,
filter,
@@ -302,13 +305,16 @@ const icons = {
exclamationTriangleFill,
eye,
fileEarmark,
fileEarmarkBreak,
fileEarmarkCheck,
fileEarmarkDiff,
fileEarmarkFill,
fileEarmarkLock,
fileEarmarkMedical,
fileEarmarkMinus,
fileEarmarkPlus,
fileEarmarkRichtext,
fileEarmarkRuled,
files,
fileText,
filter,
+13
View File
@@ -13,8 +13,11 @@ class DocumentsConfig(AppConfig):
from documents.signals.handlers import add_inbox_tags
from documents.signals.handlers import add_or_update_document_in_llm_index
from documents.signals.handlers import add_to_index
from documents.signals.handlers import capture_old_document_type
from documents.signals.handlers import run_workflows_added
from documents.signals.handlers import run_workflows_updated
from documents.signals.handlers import run_zone_ocr_extraction
from documents.signals.handlers import run_zone_ocr_on_type_change
from documents.signals.handlers import send_websocket_document_updated
from documents.signals.handlers import set_correspondent
from documents.signals.handlers import set_document_type
@@ -29,6 +32,16 @@ class DocumentsConfig(AppConfig):
document_consumption_finished.connect(add_to_index)
document_consumption_finished.connect(run_workflows_added)
document_consumption_finished.connect(add_or_update_document_in_llm_index)
document_consumption_finished.connect(run_zone_ocr_extraction)
from django.db.models.signals import post_save
from django.db.models.signals import pre_save
from documents.models import Document
pre_save.connect(capture_old_document_type, sender=Document)
post_save.connect(run_zone_ocr_on_type_change, sender=Document)
document_updated.connect(run_workflows_updated)
document_updated.connect(send_websocket_document_updated)
document_updated.connect(add_or_update_document_in_llm_index)
@@ -0,0 +1,267 @@
# Generated by Django 5.2.14 on 2026-06-16 17:36
import django.core.validators
import django.db.models.deletion
import django.utils.timezone
from django.db import migrations
from django.db import models
# Schema migration that creates the two OCR template tables: OcrTemplate
# (one per document type template) and OcrTemplateZone (the zones of a
# template). Auto-generated by Django; the field definitions mirror the
# models in documents/models.py.
class Migration(migrations.Migration):
dependencies = [
("documents", "0021_widen_workflow_integer_fields"),
]
operations = [
# OcrTemplate: a named, enable-able set of zones bound to a document type.
migrations.CreateModel(
name="OcrTemplate",
fields=[
(
"id",
models.AutoField(
auto_created=True,
primary_key=True,
serialize=False,
verbose_name="ID",
),
),
("name", models.CharField(max_length=128, verbose_name="name")),
(
"source_width",
models.PositiveIntegerField(
help_text="Width of the image the zones were drawn on (px)",
validators=[django.core.validators.MinValueValidator(1)],
verbose_name="source width",
),
),
(
"source_height",
models.PositiveIntegerField(
help_text="Height of the image the zones were drawn on (px)",
validators=[django.core.validators.MinValueValidator(1)],
verbose_name="source height",
),
),
("enabled", models.BooleanField(default=True, verbose_name="enabled")),
(
"combine_formats",
models.JSONField(
blank=True,
default=dict,
help_text="Per-target format strings for combining several zones into one field, keyed by target (custom field id, or 'title'/'asn'/'created'). Tokens like {Zone Name} are replaced with that zone's value.",
verbose_name="combine formats",
),
),
(
"created",
models.DateTimeField(
db_index=True,
default=django.utils.timezone.now,
editable=False,
verbose_name="created",
),
),
(
"updated",
models.DateTimeField(auto_now=True, verbose_name="updated"),
),
# Deleting the document type deletes its templates (CASCADE).
(
"document_type",
models.ForeignKey(
on_delete=django.db.models.deletion.CASCADE,
related_name="ocr_templates",
to="documents.documenttype",
verbose_name="document type",
),
),
# Deleting the sample document only clears this reference (SET_NULL).
(
"sample_document",
models.ForeignKey(
blank=True,
help_text="Document used for previewing zones in the editor",
null=True,
on_delete=django.db.models.deletion.SET_NULL,
related_name="+",
to="documents.document",
verbose_name="sample document",
),
),
],
options={
"verbose_name": "OCR template",
"verbose_name_plural": "OCR templates",
"ordering": ("name",),
},
),
# OcrTemplateZone: one rectangular extraction zone belonging to a template.
migrations.CreateModel(
name="OcrTemplateZone",
fields=[
(
"id",
models.AutoField(
auto_created=True,
primary_key=True,
serialize=False,
verbose_name="ID",
),
),
(
"name",
models.CharField(
help_text="Descriptive name for this zone (e.g. 'Invoice Number')",
max_length=128,
verbose_name="zone name",
),
),
(
"target",
models.CharField(
choices=[
("custom_field", "Custom field"),
("title", "Title"),
("asn", "Archive serial number"),
("created", "Date created"),
],
default="custom_field",
help_text="Where the extracted value is written: a custom field, or a built-in document field (title, ASN, created date)",
max_length=20,
verbose_name="target",
),
),
# Signed integer on purpose: -1 addresses the last page.
(
"page",
models.IntegerField(
blank=True,
help_text="Page (1 = first, -1 = last; blank uses the template default)",
null=True,
verbose_name="page",
),
),
(
"x",
models.PositiveIntegerField(
help_text="Left edge (px)",
verbose_name="x",
),
),
(
"y",
models.PositiveIntegerField(
help_text="Top edge (px)",
verbose_name="y",
),
),
(
"width",
models.PositiveIntegerField(
help_text="Zone width (px)",
validators=[django.core.validators.MinValueValidator(1)],
verbose_name="width",
),
),
(
"height",
models.PositiveIntegerField(
help_text="Zone height (px)",
validators=[django.core.validators.MinValueValidator(1)],
verbose_name="height",
),
),
(
"zone_source_width",
models.PositiveIntegerField(
blank=True,
help_text="Width of the page image this zone was drawn on (px). Falls back to template source_width if unset.",
null=True,
verbose_name="zone source width",
),
),
(
"zone_source_height",
models.PositiveIntegerField(
blank=True,
help_text="Height of the page image this zone was drawn on (px). Falls back to template source_height if unset.",
null=True,
verbose_name="zone source height",
),
),
(
"ocr_language",
models.CharField(
default="deu+eng",
help_text="Tesseract language code(s), e.g. 'deu+eng'",
max_length=20,
verbose_name="OCR language",
),
),
(
"transform",
models.CharField(
choices=[
("none", "None"),
("strip", "Strip whitespace"),
("uppercase", "Uppercase"),
("lowercase", "Lowercase"),
("numeric", "Numeric only"),
(
"strip_punctuation",
"Remove leading/trailing punctuation",
),
("date", "Parse date"),
("qr_code", "Read QR/barcode"),
],
default="strip",
max_length=20,
verbose_name="transform",
),
),
(
"date_format",
models.CharField(
blank=True,
default="",
help_text="Python strptime format for the 'Parse date' transform (e.g. %d.%m.%Y). Blank = auto-detect.",
max_length=64,
verbose_name="date format",
),
),
(
"validation_regex",
models.CharField(
blank=True,
default="",
help_text="Optional regex pattern — extracted text is only accepted if it matches",
max_length=256,
verbose_name="validation regex",
),
),
("order", models.PositiveIntegerField(default=0, verbose_name="order")),
# Deleting the custom field deletes the zones that target it (CASCADE).
(
"custom_field",
models.ForeignKey(
blank=True,
help_text="Target custom field (only used when target is 'custom_field')",
null=True,
on_delete=django.db.models.deletion.CASCADE,
related_name="ocr_zones",
to="documents.customfield",
verbose_name="custom field",
),
),
# Zones are deleted together with their template (CASCADE).
(
"template",
models.ForeignKey(
on_delete=django.db.models.deletion.CASCADE,
related_name="zones",
to="documents.ocrtemplate",
verbose_name="template",
),
),
],
options={
"verbose_name": "OCR template zone",
"verbose_name_plural": "OCR template zones",
"ordering": ("template", "order"),
},
),
]
+245
View File
@@ -1894,3 +1894,248 @@ class WorkflowRun(SoftDeleteModel):
def __str__(self) -> str:
    """Return a readable label built from the run's workflow, run time and document."""
    return f"WorkflowRun of {self.workflow} at {self.run_at} on {self.document}"
class OcrTemplate(models.Model):
    """
    A named set of OCR extraction zones attached to one document type.

    For a consumed document of that type, every zone of the template is cropped
    out of the document image and OCR'd on its own; the recognised text is then
    written to the zone's custom field or built-in document field.
    """

    name = models.CharField(verbose_name=_("name"), max_length=128)

    document_type = models.ForeignKey(
        to="documents.DocumentType",
        related_name="ocr_templates",
        on_delete=models.CASCADE,
        db_index=True,
        verbose_name=_("document type"),
    )

    # Pixel size of the image the zones were drawn on. Zones may carry their
    # own per-zone size (see OcrTemplateZone.zone_source_width/height).
    source_width = models.PositiveIntegerField(
        verbose_name=_("source width"),
        help_text=_("Width of the image the zones were drawn on (px)"),
        validators=[MinValueValidator(1)],
    )
    source_height = models.PositiveIntegerField(
        verbose_name=_("source height"),
        help_text=_("Height of the image the zones were drawn on (px)"),
        validators=[MinValueValidator(1)],
    )

    # Optional preview document; the reference is cleared (SET_NULL) when the
    # document is deleted, and no reverse accessor is created ("+").
    sample_document = models.ForeignKey(
        to="documents.Document",
        related_name="+",
        on_delete=models.SET_NULL,
        blank=True,
        null=True,
        verbose_name=_("sample document"),
        help_text=_("Document used for previewing zones in the editor"),
    )

    enabled = models.BooleanField(verbose_name=_("enabled"), default=True)

    combine_formats = models.JSONField(
        verbose_name=_("combine formats"),
        blank=True,
        default=dict,
        help_text=_(
            "Per-target format strings for combining several zones into one "
            "field, keyed by target (custom field id, or 'title'/'asn'/'created'). "
            "Tokens like {Zone Name} are replaced with that zone's value.",
        ),
    )

    created = models.DateTimeField(
        verbose_name=_("created"),
        editable=False,
        db_index=True,
        default=timezone.now,
    )
    updated = models.DateTimeField(verbose_name=_("updated"), auto_now=True)

    class Meta:
        verbose_name = _("OCR template")
        verbose_name_plural = _("OCR templates")
        ordering = ("name",)

    def __str__(self) -> str:
        """Return the template name followed by its document type in parentheses."""
        return f"{self.name} ({self.document_type})"
class OcrTemplateZone(models.Model):
    """
    One rectangular page region that is OCR'd and stored in a custom field or a
    built-in document field.

    The rectangle (x, y, width, height) is expressed in pixels of a source
    image: the zone's own zone_source_width/zone_source_height when set,
    otherwise the source dimensions stored on the owning template.
    """

    class TargetType(models.TextChoices):
        """Destinations an extracted value can be written to."""

        CUSTOM_FIELD = "custom_field", _("Custom field")
        TITLE = "title", _("Title")
        ASN = "asn", _("Archive serial number")
        CREATED = "created", _("Date created")

    class TransformType(models.TextChoices):
        """Post-processing choices offered for the OCR'd text."""

        NONE = "none", _("None")
        STRIP = "strip", _("Strip whitespace")
        UPPERCASE = "uppercase", _("Uppercase")
        LOWERCASE = "lowercase", _("Lowercase")
        NUMERIC = "numeric", _("Numeric only")
        STRIP_PUNCTUATION = (
            "strip_punctuation",
            _("Remove leading/trailing punctuation"),
        )
        DATE = "date", _("Parse date")
        QR_CODE = "qr_code", _("Read QR/barcode")

    template = models.ForeignKey(
        to=OcrTemplate,
        related_name="zones",
        on_delete=models.CASCADE,
        verbose_name=_("template"),
    )

    name = models.CharField(
        verbose_name=_("zone name"),
        max_length=128,
        help_text=_("Descriptive name for this zone (e.g. 'Invoice Number')"),
    )

    target = models.CharField(
        verbose_name=_("target"),
        max_length=20,
        default=TargetType.CUSTOM_FIELD,
        choices=TargetType.choices,
        help_text=_(
            "Where the extracted value is written: a custom field, or a "
            "built-in document field (title, ASN, created date)",
        ),
    )

    custom_field = models.ForeignKey(
        to="documents.CustomField",
        related_name="ocr_zones",
        on_delete=models.CASCADE,
        blank=True,
        null=True,
        verbose_name=_("custom field"),
        help_text=_("Target custom field (only used when target is 'custom_field')"),
    )

    # NOTE(review): the help text mentions a "template default" page, but
    # OcrTemplate declares no default-page field in this file - confirm where
    # that default comes from.
    page = models.IntegerField(
        verbose_name=_("page"),
        blank=True,
        null=True,
        help_text=_("Page (1 = first, -1 = last; blank uses the template default)"),
    )

    x = models.PositiveIntegerField(verbose_name=_("x"), help_text=_("Left edge (px)"))
    y = models.PositiveIntegerField(verbose_name=_("y"), help_text=_("Top edge (px)"))
    width = models.PositiveIntegerField(
        verbose_name=_("width"),
        help_text=_("Zone width (px)"),
        validators=[MinValueValidator(1)],
    )
    height = models.PositiveIntegerField(
        verbose_name=_("height"),
        help_text=_("Zone height (px)"),
        validators=[MinValueValidator(1)],
    )

    # Size of the page image this particular zone was drawn on, used when
    # scaling its coordinates. Left null, the template's
    # source_width/source_height apply instead. Keeping the size per zone
    # covers PDFs whose pages differ in size or orientation.
    zone_source_width = models.PositiveIntegerField(
        verbose_name=_("zone source width"),
        blank=True,
        null=True,
        help_text=_(
            "Width of the page image this zone was drawn on (px). "
            "Falls back to template source_width if unset.",
        ),
    )
    zone_source_height = models.PositiveIntegerField(
        verbose_name=_("zone source height"),
        blank=True,
        null=True,
        help_text=_(
            "Height of the page image this zone was drawn on (px). "
            "Falls back to template source_height if unset.",
        ),
    )

    ocr_language = models.CharField(
        verbose_name=_("OCR language"),
        default="deu+eng",
        max_length=20,
        help_text=_("Tesseract language code(s), e.g. 'deu+eng'"),
    )

    transform = models.CharField(
        verbose_name=_("transform"),
        max_length=20,
        default=TransformType.STRIP,
        choices=TransformType.choices,
    )

    date_format = models.CharField(
        verbose_name=_("date format"),
        blank=True,
        default="",
        max_length=64,
        help_text=_(
            "Python strptime format for the 'Parse date' transform "
            "(e.g. %d.%m.%Y). Blank = auto-detect.",
        ),
    )

    validation_regex = models.CharField(
        verbose_name=_("validation regex"),
        blank=True,
        default="",
        max_length=256,
        help_text=_(
            "Optional regex pattern — extracted text is only accepted if it matches",
        ),
    )

    order = models.PositiveIntegerField(verbose_name=_("order"), default=0)

    class Meta:
        verbose_name = _("OCR template zone")
        verbose_name_plural = _("OCR template zones")
        ordering = ("template", "order")

    def __str__(self) -> str:
        """Return "<template name> -> <zone name>"."""
        return f"{self.template.name} -> {self.name}"
# The custom field data types zone OCR is able to fill. DOCUMENTLINK and SELECT
# are left out on purpose: they refer to other objects instead of holding free
# text. Meant as the one shared definition for the serializer, the
# quick-create endpoint and the engine.
OCR_SUPPORTED_FIELD_TYPES = frozenset(
    (
        CustomField.FieldDataType.STRING,
        CustomField.FieldDataType.URL,
        CustomField.FieldDataType.DATE,
        CustomField.FieldDataType.INT,
        CustomField.FieldDataType.FLOAT,
        CustomField.FieldDataType.MONETARY,
        CustomField.FieldDataType.LONG_TEXT,
        CustomField.FieldDataType.BOOL,
    ),
)
+129
View File
@@ -57,6 +57,7 @@ if settings.AUDIT_LOG_ENABLED:
from documents import bulk_edit
from documents.data_models import DocumentSource
from documents.filters import CustomFieldQueryParser
from documents.models import OCR_SUPPORTED_FIELD_TYPES
from documents.models import Correspondent
from documents.models import CustomField
from documents.models import CustomFieldInstance
@@ -64,6 +65,8 @@ from documents.models import Document
from documents.models import DocumentType
from documents.models import MatchingModel
from documents.models import Note
from documents.models import OcrTemplate
from documents.models import OcrTemplateZone
from documents.models import PaperlessTask
from documents.models import SavedView
from documents.models import SavedViewFilterRule
@@ -3501,3 +3504,129 @@ class StoragePathTestSerializer(SerializerWithPerms):
"documents.view_document",
Document,
)
class OcrTemplateZoneSerializer(serializers.ModelSerializer):
    """
    Serializes a single OCR template zone.

    Besides the model-derived field validation it rejects zero-sized zones,
    custom fields whose data type zone OCR cannot fill, and validation
    patterns that are not compilable regular expressions.
    """

    class Meta:
        model = OcrTemplateZone
        fields = [
            "id",
            "name",
            "target",
            "custom_field",
            "page",
            "x",
            "y",
            "width",
            "height",
            "ocr_language",
            "transform",
            "date_format",
            "order",
            "zone_source_width",
            "zone_source_height",
            "validation_regex",
        ]

    def validate_width(self, value):
        """Require a zone width of at least one pixel."""
        if value < 1:
            raise serializers.ValidationError("Width must be at least 1.")
        return value

    def validate_height(self, value):
        """Require a zone height of at least one pixel."""
        if value < 1:
            raise serializers.ValidationError("Height must be at least 1.")
        return value

    def validate_custom_field(self, value):
        """Accept no custom field, or one whose data type is OCR-supported."""
        if value is None:
            # Built-in target (title/asn/created) — no custom field required.
            return value
        if value.data_type not in OCR_SUPPORTED_FIELD_TYPES:
            raise serializers.ValidationError(
                f"Custom field type '{value.data_type}' is not supported for OCR extraction. "
                f"Use string, integer, float, date, monetary, boolean, URL, or long text.",
            )
        return value

    def validate_validation_regex(self, value):
        """
        Reject a non-empty pattern that does not compile.

        Catching the error here turns a broken pattern into a 400 response at
        save time instead of a failure while a document is being processed.
        Assumes the extraction engine evaluates the pattern with Python's
        ``re`` module - TODO confirm against documents.zone_ocr.
        """
        if value:
            import re

            try:
                re.compile(value)
            except re.error as exc:
                raise serializers.ValidationError(
                    f"Invalid regular expression: {exc}",
                ) from exc
        return value
class OcrTemplateSerializer(serializers.ModelSerializer):
    """
    Serializes an OCR template together with its nested zones.

    Zones are written as a whole: on create they are inserted with the
    template, on update a supplied ``zones`` list replaces every existing
    zone, while an omitted ``zones`` key leaves the existing zones untouched.
    """

    zones = OcrTemplateZoneSerializer(many=True, required=False)

    class Meta:
        model = OcrTemplate
        fields = [
            "id",
            "name",
            "document_type",
            "source_width",
            "source_height",
            "sample_document",
            "enabled",
            "combine_formats",
            "created",
            "updated",
            "zones",
        ]
        read_only_fields = ["created", "updated"]

    def validate_source_width(self, value):
        """Require a source width of at least one pixel."""
        if value < 1:
            raise serializers.ValidationError("Source width must be at least 1.")
        return value

    def validate_source_height(self, value):
        """Require a source height of at least one pixel."""
        if value < 1:
            raise serializers.ValidationError("Source height must be at least 1.")
        return value

    def _effective_source_dimension(self, field_name):
        """
        Return the template dimension the zones should be checked against.

        The raw request value wins when it is a positive integer; otherwise
        the value stored on the instance being updated is used (partial
        updates may omit the dimensions). Returns None when neither is
        available, or when the request value is not numeric - in that case the
        field's own validation reports the error, so no bounds check is made.
        """
        raw = self.initial_data.get(field_name)
        if raw not in (None, ""):
            try:
                requested = int(raw)
            except (TypeError, ValueError):
                return None
            if requested > 0:
                return requested
        if self.instance is not None:
            return getattr(self.instance, field_name)
        return None

    def validate_zones(self, zones_data):
        """
        Validate that every zone lies inside the image it was drawn on.

        A zone's own zone_source_width/zone_source_height take precedence over
        the template dimensions, mirroring the fallback the model documents
        for documents with mixed page sizes.
        """
        template_width = self._effective_source_dimension("source_width")
        template_height = self._effective_source_dimension("source_height")

        for zone in zones_data:
            label = zone.get("name", "?")
            max_width = zone.get("zone_source_width") or template_width
            max_height = zone.get("zone_source_height") or template_height
            right = zone.get("x", 0) + zone.get("width", 0)
            bottom = zone.get("y", 0) + zone.get("height", 0)

            if max_width and right > max_width:
                raise serializers.ValidationError(
                    f"Zone '{label}' extends beyond source width "
                    f"({right} > {max_width}).",
                )
            if max_height and bottom > max_height:
                raise serializers.ValidationError(
                    f"Zone '{label}' extends beyond source height "
                    f"({bottom} > {max_height}).",
                )
        return zones_data

    def create(self, validated_data):
        """Create the template and its zones in a single transaction."""
        from django.db import transaction

        zones_data = validated_data.pop("zones", [])
        # Atomic so a failing zone insert does not leave a zone-less template.
        with transaction.atomic():
            template = OcrTemplate.objects.create(**validated_data)
            for zone_data in zones_data:
                OcrTemplateZone.objects.create(template=template, **zone_data)
        return template

    def update(self, instance, validated_data):
        """Update the template; replace all zones when ``zones`` was supplied."""
        from django.db import transaction

        zones_data = validated_data.pop("zones", None)
        # Atomic so the delete-then-recreate below can never end with the old
        # zones gone and the new ones only partly written.
        with transaction.atomic():
            for attr, value in validated_data.items():
                setattr(instance, attr, value)
            instance.save()

            if zones_data is not None:
                # Replace all zones with the new set
                instance.zones.all().delete()
                for zone_data in zones_data:
                    OcrTemplateZone.objects.create(template=instance, **zone_data)
        return instance
+69
View File
@@ -1340,6 +1340,75 @@ def close_connection_pool_on_worker_init(**kwargs) -> None:
conn.close_pool()
def run_zone_ocr_extraction(sender, document, original_file=None, **kwargs):
    """
    Hand a document to the zone OCR engine.

    ``original_file`` is converted to a ``Path`` when given and passed along
    with the document. Any exception raised on the way is logged with its
    traceback and swallowed, so a zone OCR problem never propagates to the
    caller.
    """
    try:
        from documents.zone_ocr import run_zone_extraction

        if original_file:
            source_file = Path(original_file)
        else:
            source_file = None
        run_zone_extraction(document, source_file)
    except Exception:
        logger.exception(
            "Zone OCR extraction failed for document %s",
            document.pk,
        )
def capture_old_document_type(sender, instance, **kwargs):
    """pre_save: stash the document type id currently stored in the database on
    the instance (as ``_old_document_type_id``) so the post_save handler can see
    whether the type really changed. Unsaved instances get ``None``."""
    previous_type_id = None
    if instance.pk:
        stored_row = Document.objects.filter(pk=instance.pk)
        previous_type_id = stored_row.values_list(
            "document_type_id",
            flat=True,
        ).first()
    instance._old_document_type_id = previous_type_id
def run_zone_ocr_on_type_change(sender, instance, *, created=False, **kwargs):
    """
    Re-run zone OCR when - and only when - a saved document's TYPE changed and
    the new type has an enabled template.

    Ordinary saves are ignored on purpose: zone OCR overwrites fields, so
    running it on every edit would undo the user's changes. Freshly created
    documents are left to the consumption signal, and a manual run stays
    possible through the run-zone-ocr action.
    """
    new_type_id = instance.document_type_id
    if created or not instance.pk or not new_type_id:
        return

    # Compare with the value the pre_save handler read from the database.
    previous_type_id = getattr(instance, "_old_document_type_id", None)
    if previous_type_id == new_type_id:
        return

    from documents.models import OcrTemplate

    has_enabled_template = OcrTemplate.objects.filter(
        document_type_id=new_type_id,
        enabled=True,
    ).exists()
    if not has_enabled_template:
        return

    try:
        from documents.zone_ocr import run_zone_extraction

        # Prefer the archive version; skip silently when no file is on disk.
        candidate = instance.archive_path or instance.source_path
        if not candidate or not Path(candidate).is_file():
            return
        logger.info(
            "Zone OCR: running extraction for document %d (type %d)",
            instance.pk,
            instance.document_type_id,
        )
        run_zone_extraction(instance, None)
    except Exception:
        logger.exception(
            "Zone OCR extraction failed for document %s",
            instance.pk,
        )
@worker_process_shutdown.connect
def close_connection_pool_on_worker_shutdown(**kwargs) -> None: # pragma: no cover
"""
@@ -0,0 +1,449 @@
"""Tests for the OCR Template API."""
import json
from django.contrib.auth.models import User
from rest_framework import status
from rest_framework.test import APITestCase
from documents.models import CustomField
from documents.models import DocumentType
from documents.models import OcrTemplate
from documents.models import OcrTemplateZone
from documents.tests.utils import DirectoriesMixin
class TestOcrTemplatesAPI(DirectoriesMixin, APITestCase):
    """Create/list/update/delete, validation, auth and quick-create tests for
    the OCR template endpoint."""

    ENDPOINT = "/api/ocr_templates/"

    def setUp(self) -> None:
        self.user = User.objects.create_superuser(username="temp_admin")
        self.client.force_authenticate(user=self.user)

        self.doc_type = DocumentType.objects.create(name="Invoice")
        self.custom_field_text = CustomField.objects.create(
            name="Invoice Number",
            data_type=CustomField.FieldDataType.STRING,
        )
        self.custom_field_date = CustomField.objects.create(
            name="Invoice Date",
            data_type=CustomField.FieldDataType.DATE,
        )
        self.custom_field_int = CustomField.objects.create(
            name="Amount",
            data_type=CustomField.FieldDataType.INT,
        )
        self.custom_field_doclink = CustomField.objects.create(
            name="Related Docs",
            data_type=CustomField.FieldDataType.DOCUMENTLINK,
        )
        return super().setUp()

    # --- Helpers ---

    def _make_template_data(self, **overrides):
        """Return a valid template payload; keyword arguments override keys."""
        data = {
            "name": "Invoice Template",
            "document_type": self.doc_type.pk,
            "source_width": 2480,
            "source_height": 3508,
            "enabled": True,
            "zones": [],
        }
        data.update(overrides)
        return data

    def _make_zone_data(self, **overrides):
        """Return a valid zone payload; keyword arguments override keys."""
        data = {
            "name": "Zone 1",
            "custom_field": self.custom_field_text.pk,
            "x": 100,
            "y": 100,
            "width": 200,
            "height": 50,
            "ocr_language": "deu+eng",
            "transform": "strip",
            "order": 0,
        }
        data.update(overrides)
        return data

    def _send_json(self, method, url, payload):
        """Send ``payload`` as a JSON body using the given client method name."""
        return getattr(self.client, method)(
            url,
            data=json.dumps(payload),
            content_type="application/json",
        )

    def _post_template(self, payload):
        """POST a template payload to the list endpoint."""
        return self._send_json("post", self.ENDPOINT, payload)

    def _quick_create(self, name, data_type):
        """POST to the quick-create-field action."""
        return self._send_json(
            "post",
            f"{self.ENDPOINT}quick-create-field/",
            {"name": name, "data_type": data_type},
        )

    def _create_template(self, name, **extra):
        """Create a template for the test document type directly in the DB."""
        return OcrTemplate.objects.create(
            name=name,
            document_type=self.doc_type,
            source_width=2480,
            source_height=3508,
            **extra,
        )

    def _create_zone(self, template, name, *, x=0, y=0, width=100, height=100):
        """Create a zone targeting the string custom field directly in the DB."""
        return OcrTemplateZone.objects.create(
            template=template,
            name=name,
            custom_field=self.custom_field_text,
            x=x,
            y=y,
            width=width,
            height=height,
        )

    # --- Create ---

    def test_create_template(self):
        """
        GIVEN:
            - A document type and custom fields exist
        WHEN:
            - API request to create an OCR template with one zone
        THEN:
            - The template and zone are created
        """
        data = self._make_template_data(
            zones=[
                self._make_zone_data(
                    name="Invoice Number",
                    x=1500,
                    y=200,
                    width=800,
                    height=100,
                ),
            ],
        )
        resp = self._post_template(data)
        self.assertEqual(resp.status_code, status.HTTP_201_CREATED)
        result = resp.json()
        self.assertEqual(result["name"], "Invoice Template")
        self.assertEqual(result["document_type"], self.doc_type.pk)
        self.assertEqual(len(result["zones"]), 1)
        self.assertEqual(result["zones"][0]["name"], "Invoice Number")
        self.assertEqual(OcrTemplate.objects.count(), 1)
        self.assertEqual(OcrTemplateZone.objects.count(), 1)

    def test_create_template_multiple_zones(self):
        """
        GIVEN:
            - Multiple custom fields exist
        WHEN:
            - A template with multiple zones is created
        THEN:
            - All zones are created
        """
        data = self._make_template_data(
            zones=[
                self._make_zone_data(
                    name="Invoice Number",
                    custom_field=self.custom_field_text.pk,
                ),
                self._make_zone_data(
                    name="Invoice Date",
                    custom_field=self.custom_field_date.pk,
                    order=1,
                ),
            ],
        )
        resp = self._post_template(data)
        self.assertEqual(resp.status_code, status.HTTP_201_CREATED)
        self.assertEqual(len(resp.json()["zones"]), 2)
        self.assertEqual(OcrTemplateZone.objects.count(), 2)

    def test_create_template_no_zones(self):
        """
        GIVEN:
            - Valid template data without zones
        WHEN:
            - Template is created
        THEN:
            - Template is created with no zones
        """
        resp = self._post_template(self._make_template_data())
        self.assertEqual(resp.status_code, status.HTTP_201_CREATED)
        self.assertEqual(len(resp.json()["zones"]), 0)

    # --- Validation ---

    def test_create_template_zero_source_width_rejected(self):
        """
        GIVEN:
            - Template data with source_width=0
        WHEN:
            - Create is attempted
        THEN:
            - 400 error is returned
        """
        resp = self._post_template(self._make_template_data(source_width=0))
        self.assertEqual(resp.status_code, status.HTTP_400_BAD_REQUEST)

    def test_create_template_zero_source_height_rejected(self):
        resp = self._post_template(self._make_template_data(source_height=0))
        self.assertEqual(resp.status_code, status.HTTP_400_BAD_REQUEST)

    def test_create_zone_zero_width_rejected(self):
        data = self._make_template_data(
            zones=[self._make_zone_data(width=0)],
        )
        resp = self._post_template(data)
        self.assertEqual(resp.status_code, status.HTTP_400_BAD_REQUEST)

    def test_create_zone_zero_height_rejected(self):
        data = self._make_template_data(
            zones=[self._make_zone_data(height=0)],
        )
        resp = self._post_template(data)
        self.assertEqual(resp.status_code, status.HTTP_400_BAD_REQUEST)

    def test_create_zone_exceeds_source_width_rejected(self):
        """Zone that extends beyond the source image width should be rejected."""
        data = self._make_template_data(
            source_width=1000,
            zones=[self._make_zone_data(x=800, width=300)],  # 800+300 > 1000
        )
        resp = self._post_template(data)
        self.assertEqual(resp.status_code, status.HTTP_400_BAD_REQUEST)

    def test_create_zone_exceeds_source_height_rejected(self):
        data = self._make_template_data(
            source_height=1000,
            zones=[self._make_zone_data(y=900, height=200)],  # 900+200 > 1000
        )
        resp = self._post_template(data)
        self.assertEqual(resp.status_code, status.HTTP_400_BAD_REQUEST)

    def test_create_zone_unsupported_custom_field_type_rejected(self):
        """DOCUMENTLINK and SELECT fields can't be populated via OCR."""
        data = self._make_template_data(
            zones=[self._make_zone_data(custom_field=self.custom_field_doclink.pk)],
        )
        resp = self._post_template(data)
        self.assertEqual(resp.status_code, status.HTTP_400_BAD_REQUEST)

    # --- List ---

    def test_list_templates(self):
        template = self._create_template("Test Template")
        self._create_zone(template, "Zone 1", x=100, y=100, width=200, height=50)

        resp = self.client.get(self.ENDPOINT)
        self.assertEqual(resp.status_code, status.HTTP_200_OK)
        data = resp.json()
        self.assertEqual(data["count"], 1)
        self.assertEqual(len(data["results"][0]["zones"]), 1)

    def test_list_empty(self):
        resp = self.client.get(self.ENDPOINT)
        self.assertEqual(resp.status_code, status.HTTP_200_OK)
        self.assertEqual(resp.json()["count"], 0)

    # --- Update ---

    def test_update_template_replaces_zones(self):
        """PUT should replace all zones with the new set."""
        template = self._create_template("Old Name")
        self._create_zone(template, "Old Zone")

        data = self._make_template_data(
            name="New Name",
            zones=[
                self._make_zone_data(
                    name="New Zone",
                    custom_field=self.custom_field_date.pk,
                ),
            ],
        )
        resp = self._send_json("put", f"{self.ENDPOINT}{template.pk}/", data)
        self.assertEqual(resp.status_code, status.HTTP_200_OK)
        template.refresh_from_db()
        self.assertEqual(template.name, "New Name")
        self.assertEqual(OcrTemplateZone.objects.count(), 1)
        self.assertEqual(OcrTemplateZone.objects.first().name, "New Zone")

    # --- Delete ---

    def test_delete_template_cascades_zones(self):
        template = self._create_template("To Delete")
        self._create_zone(template, "Zone")

        resp = self.client.delete(f"{self.ENDPOINT}{template.pk}/")
        self.assertEqual(resp.status_code, status.HTTP_204_NO_CONTENT)
        self.assertEqual(OcrTemplate.objects.count(), 0)
        self.assertEqual(OcrTemplateZone.objects.count(), 0)

    def test_delete_nonexistent_returns_404(self):
        resp = self.client.delete(f"{self.ENDPOINT}99999/")
        self.assertEqual(resp.status_code, status.HTTP_404_NOT_FOUND)

    # --- Patch ---

    def test_patch_toggle_enabled(self):
        template = self._create_template("Toggle Test", enabled=True)

        resp = self._send_json(
            "patch",
            f"{self.ENDPOINT}{template.pk}/",
            {"enabled": False},
        )
        self.assertEqual(resp.status_code, status.HTTP_200_OK)
        template.refresh_from_db()
        self.assertFalse(template.enabled)

    def test_patch_preserves_zones(self):
        """PATCH without zones field should not delete existing zones."""
        template = self._create_template("Patch Test")
        self._create_zone(template, "Existing Zone")

        resp = self._send_json(
            "patch",
            f"{self.ENDPOINT}{template.pk}/",
            {"name": "Updated Name"},
        )
        self.assertEqual(resp.status_code, status.HTTP_200_OK)
        self.assertEqual(OcrTemplateZone.objects.count(), 1)

    # --- Auth ---

    def test_unauthenticated_rejected(self):
        self.client.logout()
        resp = self.client.get(self.ENDPOINT)
        self.assertIn(
            resp.status_code,
            (status.HTTP_401_UNAUTHORIZED, status.HTTP_403_FORBIDDEN),
        )

    # --- Quick create field ---

    def test_quick_create_field(self):
        """Creating a custom field inline from the template editor."""
        resp = self._quick_create("New Field", "string")
        self.assertEqual(resp.status_code, status.HTTP_201_CREATED)
        data = resp.json()
        self.assertEqual(data["name"], "New Field")
        self.assertEqual(data["data_type"], "string")
        self.assertTrue(data["created"])
        self.assertTrue(CustomField.objects.filter(name="New Field").exists())

    def test_quick_create_field_existing(self):
        """If a field with the same name exists, return it without creating."""
        resp = self._quick_create("Invoice Number", "string")
        self.assertEqual(resp.status_code, status.HTTP_200_OK)
        data = resp.json()
        self.assertEqual(data["id"], self.custom_field_text.pk)
        self.assertFalse(data["created"])

    def test_quick_create_field_empty_name_rejected(self):
        resp = self._quick_create("", "string")
        self.assertEqual(resp.status_code, status.HTTP_400_BAD_REQUEST)

    def test_quick_create_field_unsupported_type_rejected(self):
        resp = self._quick_create("Bad Field", "documentlink")
        self.assertEqual(resp.status_code, status.HTTP_400_BAD_REQUEST)

    def test_quick_create_field_select_type_rejected(self):
        resp = self._quick_create("Bad Field", "select")
        self.assertEqual(resp.status_code, status.HTTP_400_BAD_REQUEST)
+454
View File
@@ -0,0 +1,454 @@
"""Tests for the zone-based OCR extraction engine."""
import tempfile
from pathlib import Path
from unittest.mock import MagicMock
from unittest.mock import patch
from django.test import TestCase
from documents.models import CustomField
from documents.models import CustomFieldInstance
from documents.models import Document
from documents.models import DocumentType
from documents.models import OcrTemplate
from documents.models import OcrTemplateZone
from documents.zone_ocr import _apply_transform
from documents.zone_ocr import _convert_value
from documents.zone_ocr import _detect_mime
from documents.zone_ocr import _resolve_doc_path
from documents.zone_ocr import run_zone_extraction
class TestApplyTransform(TestCase):
    """Checks the text transforms applied by _apply_transform.

    NOTE(review): the "date_dmy" and "date_ymd" names used below are not
    members of OcrTemplateZone.TransformType (which offers "date"), while
    "strip_punctuation", "date" and "qr_code" have no test here - confirm
    which transform names _apply_transform actually supports.
    """

    def _check(self, raw, transform, expected):
        """Assert that applying ``transform`` to ``raw`` yields ``expected``."""
        actual = _apply_transform(raw, transform)
        self.assertEqual(actual, expected)

    def test_strip(self):
        self._check("  hello  ", "strip", "hello")

    def test_none_transform(self):
        self._check("  hello  ", "none", "hello")

    def test_uppercase(self):
        self._check("hello world", "uppercase", "HELLO WORLD")

    def test_lowercase(self):
        self._check("HELLO WORLD", "lowercase", "hello world")

    def test_numeric_basic(self):
        self._check("INV-2026-001", "numeric", "2026-001")

    def test_numeric_with_currency(self):
        self._check("€1,234.56", "numeric", "1,234.56")

    def test_numeric_empty_result_falls_back(self):
        self._check("abc", "numeric", "abc")

    def test_date_dmy_dots(self):
        self._check("13.04.2026", "date_dmy", "2026-04-13")

    def test_date_dmy_slashes(self):
        self._check("01/12/2025", "date_dmy", "2025-12-01")

    def test_date_dmy_two_digit_year(self):
        self._check("13.04.26", "date_dmy", "2026-04-13")

    def test_date_dmy_with_prefix(self):
        self._check("Date: 01/12/2025", "date_dmy", "2025-12-01")

    def test_date_dmy_invalid_falls_back(self):
        self._check("32.13.2026", "date_dmy", "32.13.2026")

    def test_date_dmy_no_match_falls_back(self):
        self._check("not a date", "date_dmy", "not a date")

    def test_date_ymd_dashes(self):
        self._check("2026-04-13", "date_ymd", "2026-04-13")

    def test_date_ymd_slashes(self):
        self._check("2026/04/13", "date_ymd", "2026-04-13")

    def test_date_ymd_invalid_falls_back(self):
        self._check("2026-13-32", "date_ymd", "2026-13-32")

    def test_empty_string(self):
        self._check("", "strip", "")

    def test_whitespace_only(self):
        self._check("   ", "strip", "")

    def test_unknown_transform_strips(self):
        self._check("  hello  ", "unknown", "hello")
class TestConvertValue(TestCase):
    """Checks how _convert_value turns OCR text into a value for each custom
    field data type, and that unusable text yields None."""

    def test_string(self):
        self.assertEqual(
            _convert_value("Hello", CustomField.FieldDataType.STRING),
            "Hello",
        )

    def test_string_truncation(self):
        result = _convert_value("x" * 200, CustomField.FieldDataType.STRING)
        self.assertEqual(len(result), 128)

    def test_url(self):
        self.assertEqual(
            _convert_value("https://example.com", CustomField.FieldDataType.URL),
            "https://example.com",
        )

    def test_long_text(self):
        long = "x" * 500
        self.assertEqual(
            _convert_value(long, CustomField.FieldDataType.LONG_TEXT),
            long,
        )

    def test_int_simple(self):
        self.assertEqual(_convert_value("42", CustomField.FieldDataType.INT), 42)

    def test_int_with_noise(self):
        self.assertEqual(_convert_value("INV-123", CustomField.FieldDataType.INT), 123)

    def test_int_negative(self):
        self.assertEqual(_convert_value("-42", CustomField.FieldDataType.INT), -42)

    def test_int_empty_returns_none(self):
        self.assertIsNone(_convert_value("abc", CustomField.FieldDataType.INT))

    def test_int_only_dash_returns_none(self):
        self.assertIsNone(_convert_value("-", CustomField.FieldDataType.INT))

    def test_float_simple(self):
        self.assertAlmostEqual(
            _convert_value("1234.56", CustomField.FieldDataType.FLOAT),
            1234.56,
        )

    def test_float_european_format(self):
        self.assertAlmostEqual(
            _convert_value("1.234,56", CustomField.FieldDataType.FLOAT),
            1234.56,
        )

    def test_float_us_format(self):
        self.assertAlmostEqual(
            _convert_value("1,234.56", CustomField.FieldDataType.FLOAT),
            1234.56,
        )

    def test_float_comma_only(self):
        self.assertAlmostEqual(
            _convert_value("1234,56", CustomField.FieldDataType.FLOAT),
            1234.56,
        )

    def test_float_empty_returns_none(self):
        self.assertIsNone(_convert_value("abc", CustomField.FieldDataType.FLOAT))

    def test_float_only_separator_returns_none(self):
        self.assertIsNone(_convert_value(",", CustomField.FieldDataType.FLOAT))

    def test_date_iso(self):
        self.assertEqual(
            _convert_value("2026-04-13", CustomField.FieldDataType.DATE),
            "2026-04-13",
        )

    def test_date_invalid_returns_none(self):
        self.assertIsNone(_convert_value("not a date", CustomField.FieldDataType.DATE))

    def test_date_invalid_values_returns_none(self):
        self.assertIsNone(_convert_value("2026-13-32", CustomField.FieldDataType.DATE))

    def test_monetary_simple(self):
        self.assertEqual(
            _convert_value("123.45", CustomField.FieldDataType.MONETARY),
            "123.45",
        )

    def test_monetary_european(self):
        self.assertEqual(
            _convert_value("1.234,56", CustomField.FieldDataType.MONETARY),
            "1234.56",
        )

    def test_monetary_with_currency_symbol(self):
        self.assertEqual(
            _convert_value("€1,234.56", CustomField.FieldDataType.MONETARY),
            "1234.56",
        )

    def test_monetary_empty_returns_none(self):
        self.assertIsNone(_convert_value("CHF", CustomField.FieldDataType.MONETARY))

    def test_bool_true(self):
        for val in ("true", "True", "yes", "1", "ja", "x", "X"):
            self.assertTrue(
                _convert_value(val, CustomField.FieldDataType.BOOL),
                f"Expected True for {val!r}",
            )

    def test_bool_false(self):
        for val in ("false", "False", "no", "0", "nein"):
            result = _convert_value(val, CustomField.FieldDataType.BOOL)
            # None is the "not recognised" result (see
            # test_bool_unknown_returns_none) and is falsy too, so it has to
            # be ruled out before the falsiness check means anything.
            self.assertIsNotNone(result, f"Expected False, got None for {val!r}")
            self.assertFalse(
                result,
                f"Expected False for {val!r}",
            )

    def test_bool_unknown_returns_none(self):
        self.assertIsNone(_convert_value("maybe", CustomField.FieldDataType.BOOL))

    def test_unsupported_type_returns_none(self):
        self.assertIsNone(
            _convert_value("test", CustomField.FieldDataType.DOCUMENTLINK),
        )
        self.assertIsNone(
            _convert_value("test", CustomField.FieldDataType.SELECT),
        )

    def test_empty_string_returns_none(self):
        self.assertIsNone(_convert_value("", CustomField.FieldDataType.STRING))
class TestDetectMime(TestCase):
    """Checks the MIME type _detect_mime reports for a file name."""

    def _mime_of(self, file_name):
        """Run _detect_mime on a Path built from ``file_name``."""
        return _detect_mime(Path(file_name))

    def test_pdf_extension(self):
        self.assertEqual(self._mime_of("test.pdf"), "application/pdf")

    def test_png_extension(self):
        self.assertEqual(self._mime_of("test.png"), "image/png")

    def test_jpg_extension(self):
        self.assertEqual(self._mime_of("test.jpg"), "image/jpeg")

    def test_unknown_extension(self):
        self.assertIsNone(self._mime_of("test.xyz"))

    def test_webp_extension(self):
        self.assertEqual(self._mime_of("test.webp"), "image/webp")
class TestResolveDocPath(TestCase):
    """Checks which file _resolve_doc_path picks for a document.

    NOTE(review): test_returns_none_when_no_files_exist and
    test_returns_none_for_none_original_file run exactly the same steps -
    one of them was probably meant to cover a different case.
    """

    @staticmethod
    def _doc_without_files():
        """Build a mock document with no archive and a missing source file."""
        doc = MagicMock()
        doc.has_archive_version = False
        doc.source_path = Path("/nonexistent/source.pdf")
        return doc

    def test_returns_none_when_no_files_exist(self):
        resolved = _resolve_doc_path(self._doc_without_files(), None)
        self.assertIsNone(resolved)

    def test_returns_original_file_as_fallback(self):
        doc = self._doc_without_files()
        with tempfile.NamedTemporaryFile(suffix=".pdf") as handle:
            original = Path(handle.name)
            resolved = _resolve_doc_path(doc, original)
            self.assertEqual(resolved, Path(handle.name))

    def test_returns_none_for_none_original_file(self):
        resolved = _resolve_doc_path(self._doc_without_files(), None)
        self.assertIsNone(resolved)
class TestRunZoneExtraction(TestCase):
"""Tests for the full extraction pipeline."""
def setUp(self):
    """Create the document type and the string custom field the tests share."""
    string_type = CustomField.FieldDataType.STRING
    self.doc_type = DocumentType.objects.create(name="Invoice")
    self.custom_field = CustomField.objects.create(
        name="Invoice Number",
        data_type=string_type,
    )
def test_skips_document_without_type(self):
doc = Document.objects.create(
title="No Type",
content="test",
mime_type="application/pdf",
)
run_zone_extraction(doc, Path("/nonexistent"))
self.assertEqual(CustomFieldInstance.objects.count(), 0)
def test_skips_document_without_matching_template(self):
other_type = DocumentType.objects.create(name="Other")
doc = Document.objects.create(
title="No Template",
content="test",
mime_type="application/pdf",
document_type=other_type,
)
run_zone_extraction(doc, Path("/nonexistent"))
self.assertEqual(CustomFieldInstance.objects.count(), 0)
def test_skips_disabled_template(self):
template = OcrTemplate.objects.create(
name="Disabled",
document_type=self.doc_type,
source_width=2480,
source_height=3508,
enabled=False,
)
OcrTemplateZone.objects.create(
template=template,
name="Zone",
custom_field=self.custom_field,
x=0,
y=0,
width=100,
height=50,
)
doc = Document.objects.create(
title="Test",
content="test",
mime_type="application/pdf",
document_type=self.doc_type,
)
run_zone_extraction(doc, Path("/nonexistent"))
self.assertEqual(CustomFieldInstance.objects.count(), 0)
def test_skips_template_with_no_zones(self):
OcrTemplate.objects.create(
name="Empty",
document_type=self.doc_type,
source_width=2480,
source_height=3508,
enabled=True,
)
doc = Document.objects.create(
title="Test",
content="test",
mime_type="application/pdf",
document_type=self.doc_type,
)
with tempfile.NamedTemporaryFile(suffix=".pdf") as f:
f.write(b"%PDF-1.4 fake")
f.flush()
run_zone_extraction(doc, Path(f.name))
self.assertEqual(CustomFieldInstance.objects.count(), 0)
@patch("documents.zone_ocr._process_template")
def test_calls_process_for_enabled_template(self, mock_process):
template = OcrTemplate.objects.create(
name="Active",
document_type=self.doc_type,
source_width=2480,
source_height=3508,
enabled=True,
)
OcrTemplateZone.objects.create(
template=template,
name="Zone",
custom_field=self.custom_field,
x=0,
y=0,
width=100,
height=50,
)
doc = Document.objects.create(
title="Test",
content="test",
mime_type="application/pdf",
document_type=self.doc_type,
)
with tempfile.NamedTemporaryFile(suffix=".pdf") as f:
f.write(b"%PDF-1.4 fake")
f.flush()
run_zone_extraction(doc, Path(f.name))
self.assertTrue(mock_process.called)
@patch("documents.zone_ocr._process_template")
def test_handles_process_exception_gracefully(self, mock_process):
"""A failing template should not prevent other templates from running."""
mock_process.side_effect = RuntimeError("test error")
template = OcrTemplate.objects.create(
name="Failing",
document_type=self.doc_type,
source_width=2480,
source_height=3508,
enabled=True,
)
OcrTemplateZone.objects.create(
template=template,
name="Zone",
custom_field=self.custom_field,
x=0,
y=0,
width=100,
height=50,
)
doc = Document.objects.create(
title="Test",
content="test",
mime_type="application/pdf",
document_type=self.doc_type,
)
with tempfile.NamedTemporaryFile(suffix=".pdf") as f:
f.write(b"%PDF-1.4 fake")
f.flush()
# Should not raise
run_zone_extraction(doc, Path(f.name))
def test_handles_none_original_file(self):
"""Should not crash when original_file is None."""
doc = Document.objects.create(
title="Test",
content="test",
mime_type="application/pdf",
document_type=self.doc_type,
)
# No template, so it exits early — but shouldn't crash on None
run_zone_extraction(doc, None)
@patch("documents.zone_ocr._process_template")
def test_multiple_templates_all_process(self, mock_process):
"""Multiple enabled templates for the same type should all run."""
for i in range(3):
template = OcrTemplate.objects.create(
name=f"Template {i}",
document_type=self.doc_type,
source_width=2480,
source_height=3508,
enabled=True,
)
OcrTemplateZone.objects.create(
template=template,
name=f"Zone {i}",
custom_field=self.custom_field,
x=0,
y=0,
width=100,
height=50,
)
doc = Document.objects.create(
title="Test",
content="test",
mime_type="application/pdf",
document_type=self.doc_type,
)
with tempfile.NamedTemporaryFile(suffix=".pdf") as f:
f.write(b"%PDF-1.4 fake")
f.flush()
run_zone_extraction(doc, Path(f.name))
self.assertEqual(mock_process.call_count, 3)
+292
View File
@@ -3,6 +3,7 @@ import logging
import os
import platform
import re
import subprocess
import tempfile
import zipfile
from collections import defaultdict
@@ -148,12 +149,14 @@ from documents.matching import match_correspondents
from documents.matching import match_document_types
from documents.matching import match_storage_paths
from documents.matching import match_tags
from documents.models import OCR_SUPPORTED_FIELD_TYPES
from documents.models import Correspondent
from documents.models import CustomField
from documents.models import CustomFieldInstance
from documents.models import Document
from documents.models import DocumentType
from documents.models import Note
from documents.models import OcrTemplate
from documents.models import PaperlessTask
from documents.models import SavedView
from documents.models import ShareLink
@@ -195,6 +198,7 @@ from documents.serialisers import EditPdfDocumentsSerializer
from documents.serialisers import EmailSerializer
from documents.serialisers import MergeDocumentsSerializer
from documents.serialisers import NotesSerializer
from documents.serialisers import OcrTemplateSerializer
from documents.serialisers import PostDocumentSerializer
from documents.serialisers import RemovePasswordDocumentsSerializer
from documents.serialisers import ReprocessDocumentsSerializer
@@ -2029,6 +2033,73 @@ class DocumentViewSet(
},
),
)
@action(methods=["post"], detail=True, url_path="run-zone-ocr")
def run_zone_ocr(self, request, pk=None):
    """Run zone-based OCR extraction on this document.

    Responds with 400 when the document has no type, 404 when the document,
    an enabled template for its type, or a readable file is missing, and
    otherwise with ``{"results": [...]}`` holding one entry per zone of every
    enabled template (template name, zone name, target field label and the
    value stored on the document after extraction).
    """
    # NOTE(review): the document is fetched straight from the manager, so no
    # per-object permission check is visible here — confirm access control is
    # enforced elsewhere for this action.
    try:
        document = Document.objects.get(pk=pk)
    except Document.DoesNotExist:
        raise Http404

    if not document.document_type_id:
        return Response(
            {"error": "Document has no type assigned"},
            status=status.HTTP_400_BAD_REQUEST,
        )

    # Evaluate once: the same rows are needed for the emptiness check and for
    # collecting the results below.
    templates = list(
        OcrTemplate.objects.filter(
            document_type_id=document.document_type_id,
            enabled=True,
        ).prefetch_related("zones", "zones__custom_field"),
    )
    if not templates:
        return Response(
            {"error": "No OCR templates found for this document type"},
            status=status.HTTP_404_NOT_FOUND,
        )

    # Accept either file: the extraction itself falls back from the archive to
    # the source file, so a missing archive alone must not produce a 404.
    candidates = (document.archive_path, document.source_path)
    if not any(path and Path(path).is_file() for path in candidates):
        return Response(
            {"error": "Document file not found"},
            status=status.HTTP_404_NOT_FOUND,
        )

    from documents.zone_ocr import run_zone_extraction

    run_zone_extraction(document, None)

    # Collect results
    builtin_labels = {"title": "Title", "asn": "ASN", "created": "Created"}
    builtin_values = {
        "title": document.title,
        "asn": document.archive_serial_number,
        "created": document.created.isoformat() if document.created else None,
    }
    results = []
    for template in templates:
        for zone in template.zones.all():
            target = getattr(zone, "target", None) or "custom_field"
            if target == "custom_field" and zone.custom_field_id:
                cf_instance = document.custom_fields.filter(
                    field=zone.custom_field,
                ).first()
                field_name = zone.custom_field.name
                value = cf_instance.value if cf_instance else None
            else:
                field_name = builtin_labels.get(target, target)
                value = builtin_values.get(target)
            results.append(
                {
                    "template": template.name,
                    "zone": zone.name,
                    "custom_field": field_name,
                    "value": value,
                },
            )

    return Response({"results": results})
@action(
methods=["delete"],
detail=True,
@@ -5269,3 +5340,224 @@ def serve_logo(request: HttpRequest, filename: str | None = None) -> FileRespons
filename=app_logo.name,
as_attachment=True,
)
class OcrTemplateViewSet(ModelViewSet):
    """CRUD for OCR templates with zone definitions.

    Besides the standard model endpoints this exposes three editor helpers:
    rendering a document page as an image, test-running a single zone, and
    creating a custom field inline.
    """

    queryset = (
        OcrTemplate.objects.all()
        .prefetch_related(
            "zones",
            "zones__custom_field",
        )
        .order_by("name")
    )
    serializer_class = OcrTemplateSerializer
    permission_classes = (IsAuthenticated, PaperlessObjectPermissions)
    pagination_class = StandardPagination

    @action(
        detail=False,
        methods=["get"],
        url_path=r"document-page-image/(?P<doc_id>[0-9]+)/(?P<page>[0-9]+)",
    )
    def document_page_image(self, request, doc_id=None, page=None):
        """Render a specific page of a document as a PNG image.

        Used by the frontend template editor to display document pages
        as images that users can draw zones on. ``page`` is 0-indexed.
        Image documents are returned as-is with their own MIME type.
        Raises Http404 when the document, page or file is missing or the
        page cannot be rendered.
        """
        # NOTE(review): no per-document permission check is visible here; any
        # caller allowed on this viewset can request any document id — confirm.
        try:
            document = Document.objects.get(pk=doc_id)
        except Document.DoesNotExist:
            raise Http404("Document not found")

        page_num = int(page)

        # Validate page number
        if document.page_count and page_num >= document.page_count:
            raise Http404(
                f"Page {page_num} out of range (document has {document.page_count} pages)",
            )

        doc_path = document.archive_path or document.source_path
        if not doc_path or not Path(doc_path).is_file():
            raise Http404("Document file not found")

        # Check if document is an image (single page, no PDF rendering needed)
        if document.mime_type and document.mime_type.startswith("image/"):
            content = Path(doc_path).read_bytes()
            return HttpResponse(content, content_type=document.mime_type)

        with tempfile.TemporaryDirectory(dir=settings.SCRATCH_DIR) as tmp_dir:
            output_prefix = Path(tmp_dir) / "page"
            try:
                subprocess.run(
                    [
                        "pdftoppm",
                        "-png",
                        "-r",
                        "150",  # Lower DPI for preview
                        "-f",
                        str(page_num + 1),  # pdftoppm pages are 1-indexed
                        "-l",
                        str(page_num + 1),
                        str(doc_path),
                        str(output_prefix),
                    ],
                    check=True,
                    capture_output=True,
                    timeout=30,
                )
            except subprocess.TimeoutExpired:
                # Without this the timeout escapes as an unhandled error.
                raise Http404("Timed out rendering page")
            except subprocess.CalledProcessError as e:
                raise Http404(
                    f"Failed to render page: {e.stderr.decode(errors='replace')[:200]}",
                )
            except FileNotFoundError:
                raise Http404("pdftoppm not available - is poppler-utils installed?")

            rendered = sorted(Path(tmp_dir).glob("page-*.png"))
            if not rendered:
                raise Http404("No rendered page found")
            # Read before the temporary directory is removed.
            content = rendered[0].read_bytes()

        return HttpResponse(content, content_type="image/png")

    @action(detail=False, methods=["post"], url_path="test-zone")
    def test_zone(self, request):
        """Run OCR on a single ad-hoc zone of a document and return what it
        yields: the raw OCR text, the transformed value, and whether the
        validation regex matches. Non-destructive - writes nothing. Used by the
        editor's per-zone test so a user can tune the zone/regex before saving.

        Accepts: {"document": <id>, "zone": {x, y, width, height, page,
        ocr_language, transform, validation_regex, zone_source_width,
        zone_source_height}}.

        Responds with 404 when the document or its file is missing and 400
        when the zone definition is malformed or too small.
        """
        from documents.models import OcrTemplateZone
        from documents.zone_ocr import extract_zone_preview

        zone_data = request.data.get("zone") or {}

        # NOTE(review): as in document_page_image, no per-document permission
        # check is visible here — confirm.
        try:
            document = Document.objects.get(pk=request.data.get("document"))
        except (Document.DoesNotExist, ValueError, TypeError):
            return Response(
                {"error": "Document not found"},
                status=status.HTTP_404_NOT_FOUND,
            )

        doc_path = document.archive_path or document.source_path
        if not doc_path or not Path(doc_path).is_file():
            return Response(
                {"error": "Document file not found"},
                status=status.HTTP_404_NOT_FOUND,
            )

        invalid_zone = Response(
            {"error": "Invalid zone definition"},
            status=status.HTTP_400_BAD_REQUEST,
        )
        if not isinstance(zone_data, dict):
            return invalid_zone

        try:
            raw_page = zone_data.get("page")
            zone = OcrTemplateZone(
                name=zone_data.get("name") or "test",
                x=int(zone_data.get("x", 0)),
                y=int(zone_data.get("y", 0)),
                width=int(zone_data.get("width", 0)),
                height=int(zone_data.get("height", 0)),
                # Coerce so a page sent as "2" compares like a number later.
                page=None if raw_page in (None, "") else int(raw_page),
                ocr_language=zone_data.get("ocr_language") or "eng",
                transform=zone_data.get("transform") or "strip",
                date_format=zone_data.get("date_format") or "",
                validation_regex=zone_data.get("validation_regex") or "",
            )
            source_width = int(zone_data.get("zone_source_width") or 0)
            source_height = int(zone_data.get("zone_source_height") or 0)
        except (ValueError, TypeError):
            return invalid_zone

        if zone.width < 2 or zone.height < 2:
            return Response(
                {"error": "Zone is too small to test"},
                status=status.HTTP_400_BAD_REQUEST,
            )

        result = extract_zone_preview(
            Path(doc_path),
            zone,
            source_width,
            source_height,
            document.page_count,
        )

        # None means "not evaluated": no regex, no value, or an invalid regex.
        regex_match = None
        if zone.validation_regex and result.get("value") is not None:
            try:
                regex_match = (
                    re.fullmatch(zone.validation_regex, result["value"]) is not None
                )
            except re.error:
                regex_match = None

        return Response(
            {
                "raw_text": result.get("raw_text"),
                "value": result.get("value"),
                "regex": zone.validation_regex,
                "regex_match": regex_match,
            },
        )

    @action(detail=False, methods=["post"], url_path="quick-create-field")
    def quick_create_field(self, request):
        """Create a custom field inline from the template editor.

        Accepts: {"name": "Invoice Number", "data_type": "string"}
        Returns the created field so the frontend can immediately use it.
        An existing field with the same name is returned instead of creating
        a duplicate (``"created": False``).
        """
        raw_name = request.data.get("name")
        raw_type = request.data.get("data_type")
        # Null or non-string input is treated as missing and rejected below
        # instead of crashing on ``.strip()``.
        name = raw_name.strip() if isinstance(raw_name, str) else ""
        data_type = raw_type.strip() if isinstance(raw_type, str) else ""

        if not name:
            return Response(
                {"error": "Field name is required"},
                status=status.HTTP_400_BAD_REQUEST,
            )

        if data_type not in OCR_SUPPORTED_FIELD_TYPES:
            return Response(
                {
                    "error": f"Unsupported data type '{data_type}'. "
                    f"Supported: {', '.join(sorted(OCR_SUPPORTED_FIELD_TYPES))}",
                },
                status=status.HTTP_400_BAD_REQUEST,
            )

        # Check if field already exists
        existing = CustomField.objects.filter(name=name).first()
        if existing:
            return Response(
                {
                    "id": existing.pk,
                    "name": existing.name,
                    "data_type": existing.data_type,
                    "created": False,
                },
            )

        # Check user has permission to create custom fields
        if not request.user.has_perm("documents.add_customfield"):
            return Response(
                {"error": "You don't have permission to create custom fields"},
                status=status.HTTP_403_FORBIDDEN,
            )

        field = CustomField.objects.create(name=name, data_type=data_type)
        return Response(
            {
                "id": field.pk,
                "name": field.name,
                "data_type": field.data_type,
                "created": True,
            },
            status=status.HTTP_201_CREATED,
        )
+757
View File
@@ -0,0 +1,757 @@
"""
Zone-based OCR extraction engine.
After a document is consumed, this module checks if the document's type has
an active OCR template. If so, it renders the relevant pages as images,
crops each zone, runs Tesseract OCR on the crop, applies transforms,
and writes the results to the mapped custom fields.
"""
from __future__ import annotations
import logging
import re
import string
import subprocess
import tempfile
from datetime import date
from datetime import datetime
from pathlib import Path
from django.conf import settings
from PIL import Image
from documents.models import CustomField
from documents.models import CustomFieldInstance
from documents.models import Document
from documents.models import OcrTemplate
from documents.models import OcrTemplateZone
logger = logging.getLogger("paperless.zone_ocr")
def run_zone_extraction(
    document: Document,
    original_file: Path | None,
) -> None:
    """
    Run zone-based OCR extraction for a document if its type has an active template.

    Does nothing when the document has no type, no enabled template exists for
    that type, or no readable file can be found. Templates are processed one
    by one; an exception in one template is logged and does not stop the
    remaining templates.

    Called from the document_consumption_finished signal handler.
    """
    if not document.document_type_id:
        return

    # Evaluate the queryset once; it serves both the emptiness check and the
    # loop below, instead of an extra ``exists()`` query.
    templates = list(
        OcrTemplate.objects.filter(
            document_type_id=document.document_type_id,
            enabled=True,
        ).prefetch_related("zones", "zones__custom_field"),
    )
    if not templates:
        return

    # Resolve the document file: prefer archive (PDF/A), then source, then signal arg
    doc_path = _resolve_doc_path(document, original_file)
    if doc_path is None:
        logger.warning(
            "Zone OCR: no accessible file for document %d",
            document.pk,
        )
        return

    for template in templates:
        zones = list(template.zones.all())
        if not zones:
            continue

        logger.info(
            "Zone OCR: processing template '%s' for document %d (%d zones)",
            template.name,
            document.pk,
            len(zones),
        )
        try:
            _process_template(document, doc_path, template, zones)
        except Exception:
            # Best effort: keep going with the next template.
            logger.exception(
                "Zone OCR: error processing template '%s' for document %d",
                template.name,
                document.pk,
            )
def _resolve_doc_path(
document: Document,
original_file: Path | None,
) -> Path | None:
"""Find an accessible file for the document."""
candidates = []
if document.has_archive_version:
candidates.append(document.archive_path)
candidates.append(document.source_path)
if original_file is not None:
candidates.append(original_file)
for path in candidates:
if path is not None and Path(path).is_file():
return Path(path)
return None
def _resolve_page_idx(page_value, page_count) -> int:
"""Resolve a 1-indexed page (1 = first, -1 = last) to a 0-indexed image
index. A blank page_value defaults to the first page."""
if page_value is None:
return 0
if page_value == -1:
return (page_count - 1) if page_count else 0
if page_value >= 1:
return page_value - 1
return 0
def _process_template(
    document: Document,
    doc_path: Path,
    template: OcrTemplate,
    zones: list[OcrTemplateZone],
) -> None:
    """Process all zones in a template against a document.

    Each zone is OCR'd independently, then zones are grouped by their target
    field and each field is written exactly once. When several zones share a
    field, their values are combined via the template's per-field format string
    (or joined in order if none is set); this avoids the zones overwriting each
    other's value.

    A zone whose value fails its validation regex, or whose regex does not
    compile, contributes no value; the other zones are still processed.
    """
    pages_needed: set[int] = {
        _resolve_page_idx(zone.page, document.page_count) for zone in zones
    }

    # Keyed by id(zone): the zone objects live in ``zones`` for the whole call.
    zone_values: dict[int, str | None] = {}

    with tempfile.TemporaryDirectory(dir=settings.SCRATCH_DIR) as tmp_dir:
        tmp_path = Path(tmp_dir)
        page_images = _render_pages(
            doc_path,
            pages_needed,
            tmp_path,
            document.page_count,
        )

        # Pass 1: OCR every zone into a value (or None if it failed/was rejected).
        for zone in zones:
            page_idx = _resolve_page_idx(zone.page, document.page_count)
            if page_idx not in page_images:
                logger.warning(
                    "Zone OCR: page %d not available for zone '%s'",
                    page_idx,
                    zone.name,
                )
                continue

            # A zone may carry its own reference size; otherwise use the template's.
            src_w = zone.zone_source_width or template.source_width
            src_h = zone.zone_source_height or template.source_height
            extracted = _extract_zone(
                page_images[page_idx],
                zone,
                src_w,
                src_h,
                tmp_path,
            )

            if extracted is not None and zone.validation_regex:
                try:
                    accepted = (
                        re.fullmatch(zone.validation_regex, extracted) is not None
                    )
                except re.error as exc:
                    # A broken pattern must only cost this zone its value, not
                    # abort every other zone of the template.
                    logger.warning(
                        "Zone OCR: invalid validation regex %r on zone '%s': %s",
                        zone.validation_regex,
                        zone.name,
                        exc,
                    )
                    accepted = False
                else:
                    if not accepted:
                        logger.info(
                            "Zone OCR: '%s' value %r rejected by regex '%s'",
                            zone.name,
                            extracted[:100],
                            zone.validation_regex,
                        )
                if not accepted:
                    extracted = None

            zone_values[id(zone)] = extracted

    # Pass 2: group zones by target field and write each field once.
    grouped: dict[str, list[OcrTemplateZone]] = {}
    for zone in zones:
        grouped.setdefault(_field_key(zone), []).append(zone)

    combine_formats = template.combine_formats or {}
    for key, field_zones in grouped.items():
        value = _combine_field_value(
            combine_formats.get(key, ""),
            field_zones,
            zone_values,
        )
        if not value:
            continue
        # All zones in the group share one target, so the first one stands in.
        target_zone = field_zones[0]
        _write_zone_value(document, target_zone, value)
        logger.info(
            "Zone OCR: %s = %r (from %d zone(s))",
            _zone_target_label(target_zone),
            value[:100],
            len(field_zones),
        )
def _field_key(zone: OcrTemplateZone) -> str:
"""Identify a zone's target field. Custom fields key by id, built-in targets
by their name. Matches the key used in OcrTemplate.combine_formats and on the
frontend field select."""
target = getattr(zone, "target", None) or "custom_field"
if target == "custom_field" and zone.custom_field_id:
return str(zone.custom_field_id)
return target
def _combine_field_value(
fmt: str,
field_zones: list[OcrTemplateZone],
zone_values: dict[int, str | None],
) -> str:
"""Combine the OCR values of all zones targeting one field.
With a format string, `{Zone Name}` tokens are replaced by that zone's value
and literal text is kept; separators left dangling by an empty token are
cleaned up. Without a format, the zone values are joined in order by a space.
"""
values = {z.name: (zone_values.get(id(z)) or "") for z in field_zones}
if not fmt:
parts = [zone_values.get(id(z)) or "" for z in field_zones]
return " ".join(p for p in parts if p).strip()
def _replace(match: re.Match) -> str:
return values.get(match.group(1).strip(), "")
combined = re.sub(r"\{([^{}]+)\}", _replace, fmt)
# Tidy up separators an empty token may have left behind.
combined = re.sub(r"\s{2,}", " ", combined)
combined = re.sub(r"([^\w\s])\s*\1+", r"\1", combined)
return combined.strip().strip("-/.,;:| \t")
def _render_pages(
    doc_path: Path,
    pages: set[int],
    tmp_dir: Path,
    page_count: int | None,
) -> dict[int, Path]:
    """Render the requested 0-indexed pages to PNG files with pdftoppm.

    Returns a mapping of page index to image path containing only the pages
    that rendered successfully. A document detected as an image is returned
    unrendered as page 0. ``page_count`` is accepted but not used here.
    """
    mime = _detect_mime(doc_path)
    if mime and mime.startswith("image/"):
        # Single-image document — use it directly as page 0.
        return {0: doc_path}

    images: dict[int, Path] = {}

    # Callers pass already-resolved 0-indexed page numbers (see _resolve_page_idx).
    for page_idx in pages:
        if page_idx < 0:
            logger.warning("Zone OCR: invalid page index %d", page_idx)
            continue

        prefix = tmp_dir / f"page_{page_idx}"
        one_based = str(page_idx + 1)  # pdftoppm is 1-indexed
        command = [
            "pdftoppm",
            "-png",
            "-r",
            "300",
            "-f",
            one_based,
            "-l",
            one_based,
            str(doc_path),
            str(prefix),
        ]

        try:
            subprocess.run(command, check=True, capture_output=True, timeout=60)
        except subprocess.TimeoutExpired:
            logger.error("Zone OCR: pdftoppm timed out for page %d", page_idx)
            continue
        except subprocess.CalledProcessError as e:
            logger.error(
                "Zone OCR: pdftoppm failed for page %d: %s",
                page_idx,
                e.stderr.decode(errors="replace") if e.stderr else str(e),
            )
            continue
        except FileNotFoundError:
            logger.error("Zone OCR: pdftoppm not found — is poppler-utils installed?")
            return images  # No point trying other pages

        # pdftoppm names output as prefix-NNNN.png
        produced = sorted(tmp_dir.glob(f"page_{page_idx}-*.png"))
        if produced:
            images[page_idx] = produced[0]

    return images
def _crop_zone(
    page_img: Path,
    zone: OcrTemplateZone,
    source_width: int,
    source_height: int,
    tmp_dir: Path,
) -> Image.Image | None:
    """Cut the zone's rectangle out of a page image.

    Zone coordinates are given relative to ``source_width`` x
    ``source_height`` and are scaled to the actual image size. Returns the
    cropped PIL image, or None when the crop is smaller than 2 pixels in
    either direction or anything fails (the failure is logged).
    ``tmp_dir`` is accepted but not used here.
    """
    try:
        with Image.open(page_img) as img:
            img_width, img_height = img.size
            ratio_x = img_width / source_width
            ratio_y = img_height / source_height

            left = int(zone.x * ratio_x)
            top = int(zone.y * ratio_y)
            right = int((zone.x + zone.width) * ratio_x)
            bottom = int((zone.y + zone.height) * ratio_y)

            # Clamp to the image so an oversized zone can't crop out of bounds.
            left = max(0, min(left, img_width))
            top = max(0, min(top, img_height))
            right = max(left + 1, min(right, img_width))
            bottom = max(top + 1, min(bottom, img_height))

            too_narrow = right - left < 2
            too_short = bottom - top < 2
            if too_narrow or too_short:
                logger.warning("Zone OCR: crop too small for zone '%s'", zone.name)
                return None

            box = (left, top, right, bottom)
            return img.crop(box).copy()
    except Exception:
        logger.exception("Zone OCR: crop failed for zone '%s'", zone.name)
        return None
def _read_barcode(cropped: Image.Image, zone_name: str) -> str | None:
    """Decode a QR code or barcode from a cropped image with zxingcpp.

    Returns the text of the first result, or None when nothing is found,
    zxingcpp is not importable, or decoding raises (all cases are logged).
    """
    try:
        import zxingcpp

        decoded = zxingcpp.read_barcodes(cropped)
        if not decoded:
            logger.debug("Zone OCR: no barcode found in zone '%s'", zone_name)
            return None

        text = decoded[0].text
        logger.debug(
            "Zone OCR: barcode found in zone '%s': %s",
            zone_name,
            text[:100],
        )
        return text
    except ImportError:
        logger.error("Zone OCR: zxingcpp not available — install zxing-cpp")
    except Exception:
        logger.exception("Zone OCR: barcode read failed for zone '%s'", zone_name)
    return None
def _ocr_text(cropped: Image.Image, zone: OcrTemplateZone, tmp_dir: Path) -> str | None:
    """Run Tesseract on a cropped image and return the recognised text.

    The crop is saved as ``zone_<pk>.png`` inside ``tmp_dir`` and passed to
    the ``tesseract`` executable with the zone's language. Returns the
    stripped output, or None when it is empty or Tesseract times out, fails
    or is not installed (each failure is logged).
    """
    crop_path = tmp_dir / f"zone_{zone.pk}.png"
    cropped.save(crop_path)

    command = [
        "tesseract",
        str(crop_path),
        "stdout",
        "-l",
        zone.ocr_language,
        "--psm",
        "6",  # Assume uniform block of text
    ]
    try:
        completed = subprocess.run(
            command,
            capture_output=True,
            text=True,
            timeout=30,
            check=True,
        )
    except subprocess.TimeoutExpired:
        logger.error("Zone OCR: Tesseract timed out for zone '%s'", zone.name)
        return None
    except subprocess.CalledProcessError as e:
        logger.error(
            "Zone OCR: Tesseract failed for zone '%s': %s",
            zone.name,
            e.stderr[:200] if e.stderr else str(e),
        )
        return None
    except FileNotFoundError:
        logger.error("Zone OCR: Tesseract not found — is tesseract-ocr installed?")
        return None

    recognised = completed.stdout.strip()
    return recognised or None
def _extract_zone(
    page_img: Path,
    zone: OcrTemplateZone,
    source_width: int,
    source_height: int,
    tmp_dir: Path,
) -> str | None:
    """Crop a zone out of a page image, read it and apply the zone's transform.

    Zones whose transform is "qr_code" go through the barcode reader, all
    others through Tesseract. Returns None when cropping fails or nothing
    was read.
    """
    cropped = _crop_zone(page_img, zone, source_width, source_height, tmp_dir)
    if cropped is None:
        return None

    # QR/barcode zones skip Tesseract entirely
    if zone.transform == "qr_code":
        raw = _read_barcode(cropped, zone.name)
    else:
        raw = _ocr_text(cropped, zone, tmp_dir)

    if not raw:
        return None

    date_format = getattr(zone, "date_format", "") or ""
    return _apply_transform(raw, zone.transform, date_format)
def extract_zone_preview(
    doc_path: Path,
    zone: OcrTemplateZone,
    source_width: int,
    source_height: int,
    page_count: int | None,
) -> dict:
    """Extract a single zone without writing anything, for the editor's test.

    Renders the zone's page, crops the zone, reads it (barcode reader for
    "qr_code" zones, Tesseract otherwise) and applies the transform. Returns
    ``{"raw_text": ..., "value": ...}``; both are None when the page cannot be
    rendered or the crop fails. When either source dimension is missing or
    zero, the rendered image's own size is used as the reference size.
    """
    # zone.page is 1-indexed (1 = first, -1 = last); resolve to a 0-indexed
    # image index exactly like the production extraction path does.
    page_idx = _resolve_page_idx(zone.page, page_count)

    with tempfile.TemporaryDirectory(dir=settings.SCRATCH_DIR) as tmp_dir:
        scratch = Path(tmp_dir)
        rendered = _render_pages(doc_path, {page_idx}, scratch, page_count)
        page_image = rendered.get(page_idx) if page_idx in rendered else None
        if page_idx not in rendered:
            return {"raw_text": None, "value": None}

        if not (source_width and source_height):
            with Image.open(page_image) as im:
                source_width, source_height = im.size

        cropped = _crop_zone(
            page_image,
            zone,
            source_width,
            source_height,
            scratch,
        )
        if cropped is None:
            return {"raw_text": None, "value": None}

        reader_is_barcode = zone.transform == "qr_code"
        raw_text = (
            _read_barcode(cropped, zone.name)
            if reader_is_barcode
            else _ocr_text(cropped, zone, scratch)
        )

    value = None
    if raw_text:
        value = _apply_transform(
            raw_text,
            zone.transform,
            getattr(zone, "date_format", "") or "",
        )
    return {"raw_text": raw_text, "value": value}
def _parse_date(text: str, fmt: str) -> str:
"""Parse a date from OCR text. With a Python strptime `fmt`, try that first;
otherwise (or on failure) fall back to dateparser auto-detection. Returns an
ISO date string, or the original text if nothing parses."""
text = text.strip()
if not text:
return text
if fmt:
try:
return datetime.strptime(text, fmt).date().isoformat()
except ValueError:
pass
try:
import dateparser
parsed = dateparser.parse(
text,
settings={
"PREFER_DAY_OF_MONTH": "first",
"RETURN_AS_TIMEZONE_AWARE": False,
},
)
if parsed:
return parsed.date().isoformat()
except Exception:
logger.debug("Zone OCR: dateparser failed for %r", text[:50])
return text
def _apply_transform(text: str, transform: str, date_format: str = "") -> str:
"""Apply post-processing transform to extracted text."""
text = text.strip()
if not text:
return text
if transform in ("strip", "none"):
return text
elif transform == "date":
return _parse_date(text, date_format)
elif transform == "uppercase":
return text.upper()
elif transform == "lowercase":
return text.lower()
elif transform == "numeric":
result = re.sub(r"[^\d.,\-]", "", text)
return result if result else text
elif transform == "strip_punctuation":
return text.strip(string.punctuation + " \t\r\n")
elif transform == "qr_code":
# Barcode/QR content as read by _read_barcode.
return text
return text
def _zone_target_label(zone: OcrTemplateZone) -> str:
"""Human label of a zone's write target (for logging)."""
target = getattr(zone, "target", None) or "custom_field"
if target == "custom_field":
return zone.custom_field.name if zone.custom_field_id else "(no field)"
return {"title": "Title", "asn": "ASN", "created": "Created"}.get(target, target)
def _parse_created_datetime(value: str):
    """Convert an extracted value into an aware datetime for document.created.

    The first ``YYYY-MM-DD`` occurrence in the value is used when it forms a
    valid date; otherwise dateparser is tried. Naive results are made aware
    with Django's timezone helpers. Returns None when no date can be parsed.
    """
    from django.utils import timezone as djtz

    def _ensure_aware(moment):
        return djtz.make_aware(moment) if djtz.is_naive(moment) else moment

    iso = re.search(r"(\d{4})-(\d{2})-(\d{2})", value)
    if iso:
        try:
            year, month, day = (int(part) for part in iso.groups())
            return _ensure_aware(datetime(year, month, day))
        except ValueError:
            # Looks like a date but is not a valid one; try dateparser instead.
            pass

    try:
        import dateparser

        guessed = dateparser.parse(
            value,
            settings={"RETURN_AS_TIMEZONE_AWARE": False},
        )
        if guessed:
            return _ensure_aware(guessed)
    except Exception:
        logger.debug("Zone OCR: dateparser failed for created value %r", value[:50])
    return None
def _write_zone_value(
document: Document,
zone: OcrTemplateZone,
value: str,
) -> None:
"""Write an extracted value to the zone's target — a custom field, or a
built-in document field (title / archive_serial_number / created)."""
target = getattr(zone, "target", None) or "custom_field"
if target == "custom_field":
if zone.custom_field_id:
_write_custom_field(document, zone.custom_field, value)
else:
logger.debug("Zone OCR: zone '%s' has no custom field set", zone.name)
return
if target == "title":
document.title = value[:128]
document.save(update_fields=["title"])
elif target == "asn":
digits = re.sub(r"[^\d]", "", value)
if not digits:
logger.debug(
"Zone OCR: ASN zone '%s' produced no digits (%r)",
zone.name,
value[:50],
)
return
document.archive_serial_number = int(digits)
document.save(update_fields=["archive_serial_number"])
elif target == "created":
parsed = _parse_created_datetime(value)
if parsed is None:
logger.debug(
"Zone OCR: created zone '%s' could not parse a date (%r)",
zone.name,
value[:50],
)
return
document.created = parsed
document.save(update_fields=["created"])
def _write_custom_field(
    document: Document,
    custom_field: CustomField,
    value: str,
) -> None:
    """Convert an extracted string to the field's data type and store it as
    the document's instance of that custom field (created or updated).
    Nothing is written when the conversion yields None."""
    converted = _convert_value(value, custom_field.data_type)
    if converted is None:
        logger.debug(
            "Zone OCR: skipping custom field '%s' — value conversion returned None",
            custom_field.name,
        )
        return

    # The column that holds the value depends on the field's data type.
    column = CustomFieldInstance.get_value_field_name(custom_field.data_type)
    CustomFieldInstance.objects.update_or_create(
        document=document,
        field=custom_field,
        defaults={column: converted},
    )
def _convert_value(value: str, data_type: str) -> object | None:
"""Convert an extracted OCR string to the appropriate type for the custom field."""
if not value:
return None
try:
if data_type in (
CustomField.FieldDataType.STRING,
CustomField.FieldDataType.URL,
):
return value[:128]
elif data_type == CustomField.FieldDataType.LONG_TEXT:
return value
elif data_type == CustomField.FieldDataType.INT:
digits = re.sub(r"[^\d\-]", "", value)
# Handle edge case: only dashes or empty
digits = digits.lstrip("-") or ""
if not digits:
return None
# Restore leading minus if original had one
if value.strip().startswith("-"):
digits = "-" + digits
return int(digits)
elif data_type == CustomField.FieldDataType.FLOAT:
# Handle European format: 1.234,56 → 1234.56
cleaned = re.sub(r"[^\d.,\-]", "", value)
if not cleaned or cleaned in (".", ",", "-"):
return None
# If both . and , present, the last one is the decimal separator
if "," in cleaned and "." in cleaned:
if cleaned.rindex(",") > cleaned.rindex("."):
# European: 1.234,56
cleaned = cleaned.replace(".", "").replace(",", ".")
else:
# US: 1,234.56
cleaned = cleaned.replace(",", "")
elif "," in cleaned:
# Only comma — treat as decimal separator
cleaned = cleaned.replace(",", ".")
return float(cleaned)
elif data_type == CustomField.FieldDataType.DATE:
match = re.search(r"(\d{4})-(\d{2})-(\d{2})", value)
if match:
y, m, d = match.groups()
# Validate the date
date(int(y), int(m), int(d))
return f"{y}-{m}-{d}"
return None
elif data_type == CustomField.FieldDataType.MONETARY:
cleaned = re.sub(r"[^\d.,\-]", "", value)
if not cleaned or cleaned in (".", ",", "-"):
return None
if "," in cleaned and "." in cleaned:
if cleaned.rindex(",") > cleaned.rindex("."):
cleaned = cleaned.replace(".", "").replace(",", ".")
else:
cleaned = cleaned.replace(",", "")
elif "," in cleaned:
cleaned = cleaned.replace(",", ".")
# Validate it parses as a number
float(cleaned)
return cleaned
elif data_type == CustomField.FieldDataType.BOOL:
lower = value.lower().strip()
if lower in ("true", "yes", "1", "ja", "oui", "si", "x"):
return True
elif lower in ("false", "no", "0", "nein", "non"):
return False
return None
else:
# Unsupported types (DOCUMENTLINK, SELECT) — can't OCR into these
logger.debug(
"Zone OCR: unsupported custom field type %s for OCR extraction",
data_type,
)
return None
except (ValueError, TypeError) as e:
logger.warning("Zone OCR: could not convert %r to %s: %s", value, data_type, e)
return None
def _detect_mime(path: Path) -> str | None:
"""Detect MIME type of a file."""
try:
import magic
return magic.from_file(str(path), mime=True)
except ImportError:
pass
except Exception:
logger.debug("Zone OCR: magic failed for %s, falling back to extension", path)
suffix = path.suffix.lower()
return {
".pdf": "application/pdf",
".png": "image/png",
".jpg": "image/jpeg",
".jpeg": "image/jpeg",
".tiff": "image/tiff",
".tif": "image/tiff",
".webp": "image/webp",
".bmp": "image/bmp",
".gif": "image/gif",
}.get(suffix)
+2
View File
@@ -28,6 +28,7 @@ from documents.views import GlobalSearchView
from documents.views import IndexView
from documents.views import LogViewSet
from documents.views import MergeDocumentsView
from documents.views import OcrTemplateViewSet
from documents.views import PostDocumentView
from documents.views import RemoteVersionView
from documents.views import RemovePasswordDocumentsView
@@ -86,6 +87,7 @@ api_router.register(r"workflow_triggers", WorkflowTriggerViewSet)
api_router.register(r"workflow_actions", WorkflowActionViewSet)
api_router.register(r"workflows", WorkflowViewSet)
api_router.register(r"custom_fields", CustomFieldViewSet)
api_router.register(r"ocr_templates", OcrTemplateViewSet)
api_router.register(r"config", ApplicationConfigurationViewSet)
api_router.register(r"processed_mail", ProcessedMailViewSet)