Compare commits

..

1 Commits

Author SHA1 Message Date
Christoph Schlaepfer bf73b5b1d1 Feature: OCR Templates (#13043)
[skip ci]

Signed-off-by: dependabot[bot] <support@github.com>
Co-Authored-By: dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com>
Co-Authored-By: stumpylog <797416+stumpylog@users.noreply.github.com>
Co-Authored-By: GitHub Actions <41898282+github-actions[bot]@users.noreply.github.com>
Co-Authored-By: shamoon <4887959+shamoon@users.noreply.github.com>
2026-06-23 07:32:58 -07:00
36 changed files with 4711 additions and 330 deletions
-1
View File
@@ -63,7 +63,6 @@ The following are not generally considered vulnerabilities unless accompanied by
- optional webhook, mail, AI, OCR, or integration behavior described without a product-level vulnerability
- missing limits or hardening settings presented without concrete impact
- generic AI or static-analysis output that is not confirmed against the current codebase and a real deployment scenario
- the ability to attach objects that a user cannot access to a document by ID is an intentional design choice, and not considered a vulnerability
## Transparency
+88 -131
View File
@@ -768,7 +768,7 @@
</context-group>
<context-group purpose="location">
<context context-type="sourcefile">src/app/components/admin/tasks/tasks.component.html</context>
<context context-type="linenumber">19</context>
<context context-type="linenumber">16</context>
</context-group>
</trans-unit>
<trans-unit id="3894950702316166331" datatype="html">
@@ -783,7 +783,7 @@
</context-group>
<context-group purpose="location">
<context context-type="sourcefile">src/app/components/admin/tasks/tasks.component.html</context>
<context context-type="linenumber">26</context>
<context context-type="linenumber">23</context>
</context-group>
<context-group purpose="location">
<context context-type="sourcefile">src/app/components/admin/trash/trash.component.html</context>
@@ -1700,7 +1700,7 @@
</context-group>
<context-group purpose="location">
<context context-type="sourcefile">src/app/components/common/input/tags/tags.component.ts</context>
<context context-type="linenumber">81</context>
<context context-type="linenumber">80</context>
</context-group>
<context-group purpose="location">
<context context-type="sourcefile">src/app/components/common/suggestions-dropdown/suggestions-dropdown.component.html</context>
@@ -1830,18 +1830,11 @@
<context context-type="linenumber">147</context>
</context-group>
</trans-unit>
<trans-unit id="8829078752502782653" datatype="html">
<source>Dismiss all</source>
<context-group purpose="location">
<context context-type="sourcefile">src/app/components/admin/tasks/tasks.component.html</context>
<context context-type="linenumber">15</context>
</context-group>
</trans-unit>
<trans-unit id="1616102757855967475" datatype="html">
<source>All</source>
<context-group purpose="location">
<context context-type="sourcefile">src/app/components/admin/tasks/tasks.component.html</context>
<context context-type="linenumber">39</context>
<context context-type="linenumber">36</context>
</context-group>
<context-group purpose="location">
<context context-type="sourcefile">src/app/components/admin/tasks/tasks.component.ts</context>
@@ -1849,7 +1842,7 @@
</context-group>
<context-group purpose="location">
<context context-type="sourcefile">src/app/components/common/custom-fields-query-dropdown/custom-fields-query-dropdown.component.html</context>
<context context-type="linenumber">154</context>
<context context-type="linenumber">151</context>
</context-group>
<context-group purpose="location">
<context context-type="sourcefile">src/app/components/common/filterable-dropdown/filterable-dropdown.component.html</context>
@@ -1880,36 +1873,36 @@
<source>Filter by</source>
<context-group purpose="location">
<context context-type="sourcefile">src/app/components/admin/tasks/tasks.component.html</context>
<context context-type="linenumber">59</context>
<context context-type="linenumber">56</context>
</context-group>
</trans-unit>
<trans-unit id="424356320420294719" datatype="html">
<source>All types</source>
<context-group purpose="location">
<context context-type="sourcefile">src/app/components/admin/tasks/tasks.component.html</context>
<context context-type="linenumber">64</context>
<context context-type="linenumber">61</context>
</context-group>
<context-group purpose="location">
<context context-type="sourcefile">src/app/components/admin/tasks/tasks.component.ts</context>
<context context-type="linenumber">215</context>
<context context-type="linenumber">209</context>
</context-group>
</trans-unit>
<trans-unit id="131016739441837046" datatype="html">
<source>All sources</source>
<context-group purpose="location">
<context context-type="sourcefile">src/app/components/admin/tasks/tasks.component.html</context>
<context context-type="linenumber">73</context>
<context context-type="linenumber">70</context>
</context-group>
<context-group purpose="location">
<context context-type="sourcefile">src/app/components/admin/tasks/tasks.component.ts</context>
<context context-type="linenumber">227</context>
<context context-type="linenumber">221</context>
</context-group>
</trans-unit>
<trans-unit id="6849725902312323996" datatype="html">
<source>Reset filters</source>
<context-group purpose="location">
<context context-type="sourcefile">src/app/components/admin/tasks/tasks.component.html</context>
<context context-type="linenumber">104</context>
<context context-type="linenumber">101</context>
</context-group>
<context-group purpose="location">
<context context-type="sourcefile">src/app/components/document-list/document-list.component.html</context>
@@ -1924,14 +1917,14 @@
<source>{VAR_PLURAL, plural, =1 {1 task} other {<x id="INTERPOLATION"/> tasks}}</source>
<context-group purpose="location">
<context context-type="sourcefile">src/app/components/admin/tasks/tasks.component.html</context>
<context context-type="linenumber">125</context>
<context context-type="linenumber">122</context>
</context-group>
</trans-unit>
<trans-unit id="8953033926734869941" datatype="html">
<source>Name</source>
<context-group purpose="location">
<context context-type="sourcefile">src/app/components/admin/tasks/tasks.component.html</context>
<context context-type="linenumber">147</context>
<context context-type="linenumber">144</context>
</context-group>
<context-group purpose="location">
<context context-type="sourcefile">src/app/components/admin/tasks/tasks.component.ts</context>
@@ -2042,7 +2035,7 @@
<source>Created</source>
<context-group purpose="location">
<context context-type="sourcefile">src/app/components/admin/tasks/tasks.component.html</context>
<context context-type="linenumber">148</context>
<context context-type="linenumber">145</context>
</context-group>
<context-group purpose="location">
<context context-type="sourcefile">src/app/components/common/dates-dropdown/dates-dropdown.component.html</context>
@@ -2073,21 +2066,21 @@
<source>Results</source>
<context-group purpose="location">
<context context-type="sourcefile">src/app/components/admin/tasks/tasks.component.html</context>
<context context-type="linenumber">150</context>
<context context-type="linenumber">147</context>
</context-group>
</trans-unit>
<trans-unit id="314315645942131479" datatype="html">
<source>Info</source>
<context-group purpose="location">
<context context-type="sourcefile">src/app/components/admin/tasks/tasks.component.html</context>
<context context-type="linenumber">152</context>
<context context-type="linenumber">149</context>
</context-group>
</trans-unit>
<trans-unit id="3193976279273491157" datatype="html">
<source>Actions</source>
<context-group purpose="location">
<context context-type="sourcefile">src/app/components/admin/tasks/tasks.component.html</context>
<context context-type="linenumber">153</context>
<context context-type="linenumber">150</context>
</context-group>
<context-group purpose="location">
<context context-type="sourcefile">src/app/components/admin/trash/trash.component.html</context>
@@ -2158,22 +2151,18 @@
<source>click for full output</source>
<context-group purpose="location">
<context context-type="sourcefile">src/app/components/admin/tasks/tasks.component.html</context>
<context context-type="linenumber">204</context>
<context context-type="linenumber">201</context>
</context-group>
</trans-unit>
<trans-unit id="1536087519743707362" datatype="html">
<source>Dismiss</source>
<context-group purpose="location">
<context context-type="sourcefile">src/app/components/admin/tasks/tasks.component.html</context>
<context context-type="linenumber">217</context>
<context context-type="linenumber">214</context>
</context-group>
<context-group purpose="location">
<context context-type="sourcefile">src/app/components/admin/tasks/tasks.component.ts</context>
<context context-type="linenumber">317</context>
</context-group>
<context-group purpose="location">
<context context-type="sourcefile">src/app/components/admin/tasks/tasks.component.ts</context>
<context context-type="linenumber">351</context>
<context context-type="linenumber">310</context>
</context-group>
<context-group purpose="location">
<context context-type="sourcefile">src/app/components/document-detail/document-detail.component.ts</context>
@@ -2188,28 +2177,28 @@
<source>Open Document</source>
<context-group purpose="location">
<context context-type="sourcefile">src/app/components/admin/tasks/tasks.component.html</context>
<context context-type="linenumber">222</context>
<context context-type="linenumber">219</context>
</context-group>
</trans-unit>
<trans-unit id="5404759957685833020" datatype="html">
<source>Result message</source>
<context-group purpose="location">
<context context-type="sourcefile">src/app/components/admin/tasks/tasks.component.html</context>
<context context-type="linenumber">235</context>
<context context-type="linenumber">232</context>
</context-group>
</trans-unit>
<trans-unit id="6621329748219109148" datatype="html">
<source>Duplicate</source>
<context-group purpose="location">
<context context-type="sourcefile">src/app/components/admin/tasks/tasks.component.html</context>
<context context-type="linenumber">242</context>
<context context-type="linenumber">239</context>
</context-group>
</trans-unit>
<trans-unit id="7593555694782789615" datatype="html">
<source>Open</source>
<context-group purpose="location">
<context context-type="sourcefile">src/app/components/admin/tasks/tasks.component.html</context>
<context context-type="linenumber">250</context>
<context context-type="linenumber">247</context>
</context-group>
<context-group purpose="location">
<context context-type="sourcefile">src/app/components/app-frame/global-search/global-search.component.html</context>
@@ -2240,21 +2229,21 @@
<source>Input data</source>
<context-group purpose="location">
<context context-type="sourcefile">src/app/components/admin/tasks/tasks.component.html</context>
<context context-type="linenumber">260</context>
<context context-type="linenumber">257</context>
</context-group>
</trans-unit>
<trans-unit id="1585185618099050920" datatype="html">
<source>Result data</source>
<context-group purpose="location">
<context context-type="sourcefile">src/app/components/admin/tasks/tasks.component.html</context>
<context context-type="linenumber">266</context>
<context context-type="linenumber">263</context>
</context-group>
</trans-unit>
<trans-unit id="7976920528153858271" datatype="html">
<source>No tasks match the current filters.</source>
<context-group purpose="location">
<context context-type="sourcefile">src/app/components/admin/tasks/tasks.component.html</context>
<context context-type="linenumber">288</context>
<context context-type="linenumber">285</context>
</context-group>
</trans-unit>
<trans-unit id="2525230676386818985" datatype="html">
@@ -2431,78 +2420,60 @@
<source>Dismiss selected</source>
<context-group purpose="location">
<context context-type="sourcefile">src/app/components/admin/tasks/tasks.component.ts</context>
<context context-type="linenumber">239</context>
<context context-type="linenumber">233</context>
</context-group>
</trans-unit>
<trans-unit id="9169677036332103838" datatype="html">
<source>Dismiss visible</source>
<context-group purpose="location">
<context context-type="sourcefile">src/app/components/admin/tasks/tasks.component.ts</context>
<context context-type="linenumber">240</context>
<context context-type="linenumber">234</context>
</context-group>
</trans-unit>
<trans-unit id="3169751690815214293" datatype="html">
<source>Confirm Dismiss</source>
<context-group purpose="location">
<context context-type="sourcefile">src/app/components/admin/tasks/tasks.component.ts</context>
<context context-type="linenumber">314</context>
<context context-type="linenumber">307</context>
</context-group>
</trans-unit>
<trans-unit id="5029621907742319073" datatype="html">
<source>Dismiss <x id="PH" equiv-text="tasks.size"/> tasks?</source>
<context-group purpose="location">
<context context-type="sourcefile">src/app/components/admin/tasks/tasks.component.ts</context>
<context context-type="linenumber">315</context>
<context context-type="linenumber">308</context>
</context-group>
</trans-unit>
<trans-unit id="3597309129998924778" datatype="html">
<source>Error dismissing tasks</source>
<context-group purpose="location">
<context context-type="sourcefile">src/app/components/admin/tasks/tasks.component.ts</context>
<context context-type="linenumber">326</context>
</context-group>
<context-group purpose="location">
<context context-type="sourcefile">src/app/components/admin/tasks/tasks.component.ts</context>
<context context-type="linenumber">360</context>
<context context-type="linenumber">319</context>
</context-group>
</trans-unit>
<trans-unit id="2132179171926568807" datatype="html">
<source>Error dismissing task</source>
<context-group purpose="location">
<context context-type="sourcefile">src/app/components/admin/tasks/tasks.component.ts</context>
<context context-type="linenumber">338</context>
</context-group>
</trans-unit>
<trans-unit id="1323591410517879795" datatype="html">
<source>Confirm Dismiss All</source>
<context-group purpose="location">
<context context-type="sourcefile">src/app/components/admin/tasks/tasks.component.ts</context>
<context context-type="linenumber">348</context>
</context-group>
</trans-unit>
<trans-unit id="4157200209636243740" datatype="html">
<source>Dismiss all <x id="PH" equiv-text="this.totalTasks"/> tasks?</source>
<context-group purpose="location">
<context context-type="sourcefile">src/app/components/admin/tasks/tasks.component.ts</context>
<context context-type="linenumber">349</context>
<context context-type="linenumber">331</context>
</context-group>
</trans-unit>
<trans-unit id="8149502458056418229" datatype="html">
<source>Success. New document id <x id="PH" equiv-text="documentId"/> created</source>
<context-group purpose="location">
<context context-type="sourcefile">src/app/components/admin/tasks/tasks.component.ts</context>
<context context-type="linenumber">408</context>
<context context-type="linenumber">377</context>
</context-group>
</trans-unit>
<trans-unit id="8760066891202884337" datatype="html">
<source>Duplicate of document #<x id="PH" equiv-text="duplicateOf"/></source>
<context-group purpose="location">
<context context-type="sourcefile">src/app/components/admin/tasks/tasks.component.ts</context>
<context context-type="linenumber">418</context>
<context context-type="linenumber">387</context>
</context-group>
<context-group purpose="location">
<context context-type="sourcefile">src/app/components/admin/tasks/tasks.component.ts</context>
<context context-type="linenumber">452</context>
<context context-type="linenumber">421</context>
</context-group>
</trans-unit>
<trans-unit id="3418677553313974490" datatype="html">
@@ -3693,42 +3664,42 @@
<source>{VAR_PLURAL, plural, =1 {One page} other {<x id="INTERPOLATION"/> pages}}</source>
<context-group purpose="location">
<context context-type="sourcefile">src/app/components/common/confirm-dialog/merge-confirm-dialog/merge-confirm-dialog.component.html</context>
<context context-type="linenumber">28</context>
<context context-type="linenumber">25</context>
</context-group>
</trans-unit>
<trans-unit id="7508164375697837821" datatype="html">
<source>Use metadata from:</source>
<context-group purpose="location">
<context context-type="sourcefile">src/app/components/common/confirm-dialog/merge-confirm-dialog/merge-confirm-dialog.component.html</context>
<context context-type="linenumber">38</context>
<context context-type="linenumber">34</context>
</context-group>
</trans-unit>
<trans-unit id="2020403212524346652" datatype="html">
<source>Regenerate all metadata</source>
<context-group purpose="location">
<context context-type="sourcefile">src/app/components/common/confirm-dialog/merge-confirm-dialog/merge-confirm-dialog.component.html</context>
<context context-type="linenumber">40</context>
<context context-type="linenumber">36</context>
</context-group>
</trans-unit>
<trans-unit id="2710430925353472741" datatype="html">
<source>Try to include archive version in merge for non-PDF files</source>
<context-group purpose="location">
<context context-type="sourcefile">src/app/components/common/confirm-dialog/merge-confirm-dialog/merge-confirm-dialog.component.html</context>
<context context-type="linenumber">48</context>
<context context-type="linenumber">44</context>
</context-group>
</trans-unit>
<trans-unit id="5612366187076076264" datatype="html">
<source>Delete original documents after successful merge</source>
<context-group purpose="location">
<context context-type="sourcefile">src/app/components/common/confirm-dialog/merge-confirm-dialog/merge-confirm-dialog.component.html</context>
<context context-type="linenumber">52</context>
<context context-type="linenumber">48</context>
</context-group>
</trans-unit>
<trans-unit id="5138283234724909648" datatype="html">
<source>Note that only PDFs will be included.</source>
<context-group purpose="location">
<context context-type="sourcefile">src/app/components/common/confirm-dialog/merge-confirm-dialog/merge-confirm-dialog.component.html</context>
<context context-type="linenumber">55</context>
<context context-type="linenumber">51</context>
</context-group>
</trans-unit>
<trans-unit id="1309641780471803652" datatype="html">
@@ -3843,7 +3814,7 @@
<source>Saved field &quot;<x id="PH" equiv-text="newField.name"/>&quot;.</source>
<context-group purpose="location">
<context context-type="sourcefile">src/app/components/common/custom-fields-dropdown/custom-fields-dropdown.component.ts</context>
<context context-type="linenumber">129</context>
<context context-type="linenumber">130</context>
</context-group>
<context-group purpose="location">
<context context-type="sourcefile">src/app/components/manage/document-attributes/custom-fields/custom-fields.component.ts</context>
@@ -3854,7 +3825,7 @@
<source>Error saving field.</source>
<context-group purpose="location">
<context context-type="sourcefile">src/app/components/common/custom-fields-dropdown/custom-fields-dropdown.component.ts</context>
<context context-type="linenumber">138</context>
<context context-type="linenumber">139</context>
</context-group>
<context-group purpose="location">
<context context-type="sourcefile">src/app/components/manage/document-attributes/custom-fields/custom-fields.component.ts</context>
@@ -3939,11 +3910,11 @@
</context-group>
<context-group purpose="location">
<context context-type="sourcefile">src/app/components/common/custom-fields-query-dropdown/custom-fields-query-dropdown.component.html</context>
<context context-type="linenumber">96</context>
<context context-type="linenumber">94</context>
</context-group>
<context-group purpose="location">
<context context-type="sourcefile">src/app/components/common/custom-fields-query-dropdown/custom-fields-query-dropdown.component.html</context>
<context context-type="linenumber">102</context>
<context context-type="linenumber">100</context>
</context-group>
</trans-unit>
<trans-unit id="3800326155195149498" datatype="html">
@@ -3954,29 +3925,29 @@
</context-group>
<context-group purpose="location">
<context context-type="sourcefile">src/app/components/common/custom-fields-query-dropdown/custom-fields-query-dropdown.component.html</context>
<context context-type="linenumber">97</context>
<context context-type="linenumber">95</context>
</context-group>
<context-group purpose="location">
<context context-type="sourcefile">src/app/components/common/custom-fields-query-dropdown/custom-fields-query-dropdown.component.html</context>
<context context-type="linenumber">103</context>
<context context-type="linenumber">101</context>
</context-group>
</trans-unit>
<trans-unit id="7551700625201096185" datatype="html">
<source>Search docs...</source>
<context-group purpose="location">
<context context-type="sourcefile">src/app/components/common/custom-fields-query-dropdown/custom-fields-query-dropdown.component.html</context>
<context context-type="linenumber">70</context>
<context context-type="linenumber">69</context>
</context-group>
<context-group purpose="location">
<context context-type="sourcefile">src/app/components/common/custom-fields-query-dropdown/custom-fields-query-dropdown.component.html</context>
<context context-type="linenumber">119</context>
<context context-type="linenumber">117</context>
</context-group>
</trans-unit>
<trans-unit id="3184700926171002527" datatype="html">
<source>Any</source>
<context-group purpose="location">
<context context-type="sourcefile">src/app/components/common/custom-fields-query-dropdown/custom-fields-query-dropdown.component.html</context>
<context context-type="linenumber">152</context>
<context context-type="linenumber">149</context>
</context-group>
<context-group purpose="location">
<context context-type="sourcefile">src/app/components/common/filterable-dropdown/filterable-dropdown.component.html</context>
@@ -3987,21 +3958,21 @@
<source>Not</source>
<context-group purpose="location">
<context context-type="sourcefile">src/app/components/common/custom-fields-query-dropdown/custom-fields-query-dropdown.component.html</context>
<context context-type="linenumber">157</context>
<context context-type="linenumber">154</context>
</context-group>
</trans-unit>
<trans-unit id="6548676277933116532" datatype="html">
<source>Add query</source>
<context-group purpose="location">
<context context-type="sourcefile">src/app/components/common/custom-fields-query-dropdown/custom-fields-query-dropdown.component.html</context>
<context context-type="linenumber">176</context>
<context context-type="linenumber">173</context>
</context-group>
</trans-unit>
<trans-unit id="5599577087865387184" datatype="html">
<source>Add expression</source>
<context-group purpose="location">
<context context-type="sourcefile">src/app/components/common/custom-fields-query-dropdown/custom-fields-query-dropdown.component.html</context>
<context context-type="linenumber">179</context>
<context context-type="linenumber">176</context>
</context-group>
</trans-unit>
<trans-unit id="6312759212949884929" datatype="html">
@@ -4670,23 +4641,23 @@
</context-group>
<context-group purpose="location">
<context context-type="sourcefile">src/app/components/common/system-status-dialog/system-status-dialog.component.html</context>
<context context-type="linenumber">199</context>
<context context-type="linenumber">197</context>
</context-group>
<context-group purpose="location">
<context context-type="sourcefile">src/app/components/common/system-status-dialog/system-status-dialog.component.html</context>
<context context-type="linenumber">233</context>
<context context-type="linenumber">231</context>
</context-group>
<context-group purpose="location">
<context context-type="sourcefile">src/app/components/common/system-status-dialog/system-status-dialog.component.html</context>
<context context-type="linenumber">267</context>
<context context-type="linenumber">265</context>
</context-group>
<context-group purpose="location">
<context context-type="sourcefile">src/app/components/common/system-status-dialog/system-status-dialog.component.html</context>
<context context-type="linenumber">277</context>
<context context-type="linenumber">275</context>
</context-group>
<context-group purpose="location">
<context context-type="sourcefile">src/app/components/common/system-status-dialog/system-status-dialog.component.html</context>
<context context-type="linenumber">315</context>
<context context-type="linenumber">313</context>
</context-group>
<context-group purpose="location">
<context context-type="sourcefile">src/app/components/common/toast/toast.component.html</context>
@@ -6008,11 +5979,11 @@
</context-group>
<context-group purpose="location">
<context context-type="sourcefile">src/app/components/common/input/select/select.component.html</context>
<context context-type="linenumber">62</context>
<context context-type="linenumber">61</context>
</context-group>
<context-group purpose="location">
<context context-type="sourcefile">src/app/components/common/input/tags/tags.component.html</context>
<context context-type="linenumber">66</context>
<context context-type="linenumber">65</context>
</context-group>
</trans-unit>
<trans-unit id="6344437738844463465" datatype="html">
@@ -6023,7 +5994,7 @@
</context-group>
<context-group purpose="location">
<context context-type="sourcefile">src/app/components/common/input/select/select.component.ts</context>
<context context-type="linenumber">176</context>
<context context-type="linenumber">172</context>
</context-group>
</trans-unit>
<trans-unit id="1880237574877817137" datatype="html">
@@ -6127,7 +6098,7 @@
<source>Private</source>
<context-group purpose="location">
<context context-type="sourcefile">src/app/components/common/input/select/select.component.ts</context>
<context context-type="linenumber">72</context>
<context context-type="linenumber">71</context>
</context-group>
<context-group purpose="location">
<context context-type="sourcefile">src/app/components/common/tag/tag.component.html</context>
@@ -6150,7 +6121,7 @@
<source>No items found</source>
<context-group purpose="location">
<context context-type="sourcefile">src/app/components/common/input/select/select.component.ts</context>
<context context-type="linenumber">110</context>
<context context-type="linenumber">106</context>
</context-group>
</trans-unit>
<trans-unit id="6541407358060244620" datatype="html">
@@ -6164,21 +6135,21 @@
<source>Add tag</source>
<context-group purpose="location">
<context context-type="sourcefile">src/app/components/common/input/tags/tags.component.html</context>
<context context-type="linenumber">18</context>
<context context-type="linenumber">17</context>
</context-group>
</trans-unit>
<trans-unit id="3392754525167799121" datatype="html">
<source>Remove tag</source>
<context-group purpose="location">
<context context-type="sourcefile">src/app/components/common/input/tags/tags.component.html</context>
<context context-type="linenumber">24</context>
<context context-type="linenumber">23</context>
</context-group>
</trans-unit>
<trans-unit id="2561408369057364131" datatype="html">
<source>Filter documents with these Tags</source>
<context-group purpose="location">
<context context-type="sourcefile">src/app/components/common/input/tags/tags.component.html</context>
<context context-type="linenumber">56</context>
<context context-type="linenumber">55</context>
</context-group>
</trans-unit>
<trans-unit id="1400555558847223243" datatype="html">
@@ -6470,7 +6441,7 @@
</context-group>
<context-group purpose="location">
<context context-type="sourcefile">src/app/components/common/system-status-dialog/system-status-dialog.component.html</context>
<context context-type="linenumber">334</context>
<context context-type="linenumber">332</context>
</context-group>
<context-group purpose="location">
<context context-type="sourcefile">src/app/components/manage/mail/mail.component.html</context>
@@ -7179,28 +7150,28 @@
<source>Recent Task Activity <x id="START_TAG_SPAN" ctype="x-span" equiv-text="&lt;span class=&quot;small text-muted fw-light&quot;&gt;"/>(<x id="INTERPOLATION" equiv-text="{{status.tasks.summary.days}}"/> days)<x id="CLOSE_TAG_SPAN" ctype="x-span" equiv-text="&lt;/span&gt;"/></source>
<context-group purpose="location">
<context context-type="sourcefile">src/app/components/common/system-status-dialog/system-status-dialog.component.html</context>
<context context-type="linenumber">147</context>
<context context-type="linenumber">145</context>
</context-group>
</trans-unit>
<trans-unit id="3448462145758383019" datatype="html">
<source>Total</source>
<context-group purpose="location">
<context context-type="sourcefile">src/app/components/common/system-status-dialog/system-status-dialog.component.html</context>
<context context-type="linenumber">152</context>
<context context-type="linenumber">150</context>
</context-group>
</trans-unit>
<trans-unit id="3521084103654700903" datatype="html">
<source>Successful</source>
<context-group purpose="location">
<context context-type="sourcefile">src/app/components/common/system-status-dialog/system-status-dialog.component.html</context>
<context context-type="linenumber">156</context>
<context context-type="linenumber">154</context>
</context-group>
</trans-unit>
<trans-unit id="7256395947475975935" datatype="html">
<source>Failed</source>
<context-group purpose="location">
<context context-type="sourcefile">src/app/components/common/system-status-dialog/system-status-dialog.component.html</context>
<context context-type="linenumber">160</context>
<context context-type="linenumber">158</context>
</context-group>
<context-group purpose="location">
<context context-type="sourcefile">src/app/data/share-link-bundle.ts</context>
@@ -7211,7 +7182,7 @@
<source>Pending</source>
<context-group purpose="location">
<context context-type="sourcefile">src/app/components/common/system-status-dialog/system-status-dialog.component.html</context>
<context context-type="linenumber">164</context>
<context context-type="linenumber">162</context>
</context-group>
<context-group purpose="location">
<context context-type="sourcefile">src/app/data/share-link-bundle.ts</context>
@@ -7222,96 +7193,96 @@
<source>No recent tasks</source>
<context-group purpose="location">
<context context-type="sourcefile">src/app/components/common/system-status-dialog/system-status-dialog.component.html</context>
<context context-type="linenumber">169</context>
<context context-type="linenumber">167</context>
</context-group>
</trans-unit>
<trans-unit id="2041675390931385838" datatype="html">
<source>Health</source>
<context-group purpose="location">
<context context-type="sourcefile">src/app/components/common/system-status-dialog/system-status-dialog.component.html</context>
<context context-type="linenumber">180</context>
<context context-type="linenumber">178</context>
</context-group>
</trans-unit>
<trans-unit id="31377277941774469" datatype="html">
<source>Search Index</source>
<context-group purpose="location">
<context context-type="sourcefile">src/app/components/common/system-status-dialog/system-status-dialog.component.html</context>
<context context-type="linenumber">184</context>
<context context-type="linenumber">182</context>
</context-group>
</trans-unit>
<trans-unit id="4089509911694721896" datatype="html">
<source>Last Updated</source>
<context-group purpose="location">
<context context-type="sourcefile">src/app/components/common/system-status-dialog/system-status-dialog.component.html</context>
<context context-type="linenumber">197</context>
<context context-type="linenumber">195</context>
</context-group>
</trans-unit>
<trans-unit id="46628344485199198" datatype="html">
<source>Classifier</source>
<context-group purpose="location">
<context context-type="sourcefile">src/app/components/common/system-status-dialog/system-status-dialog.component.html</context>
<context context-type="linenumber">202</context>
<context context-type="linenumber">200</context>
</context-group>
</trans-unit>
<trans-unit id="9127131074422113272" datatype="html">
<source>Run Task</source>
<context-group purpose="location">
<context context-type="sourcefile">src/app/components/common/system-status-dialog/system-status-dialog.component.html</context>
<context context-type="linenumber">224</context>
<context context-type="linenumber">222</context>
</context-group>
<context-group purpose="location">
<context context-type="sourcefile">src/app/components/common/system-status-dialog/system-status-dialog.component.html</context>
<context context-type="linenumber">258</context>
<context context-type="linenumber">256</context>
</context-group>
<context-group purpose="location">
<context context-type="sourcefile">src/app/components/common/system-status-dialog/system-status-dialog.component.html</context>
<context context-type="linenumber">306</context>
<context context-type="linenumber">304</context>
</context-group>
</trans-unit>
<trans-unit id="6096684179126491743" datatype="html">
<source>Last Trained</source>
<context-group purpose="location">
<context context-type="sourcefile">src/app/components/common/system-status-dialog/system-status-dialog.component.html</context>
<context context-type="linenumber">231</context>
<context context-type="linenumber">229</context>
</context-group>
</trans-unit>
<trans-unit id="6427836860962380759" datatype="html">
<source>Sanity Checker</source>
<context-group purpose="location">
<context context-type="sourcefile">src/app/components/common/system-status-dialog/system-status-dialog.component.html</context>
<context context-type="linenumber">236</context>
<context context-type="linenumber">234</context>
</context-group>
</trans-unit>
<trans-unit id="6578747070254776938" datatype="html">
<source>Last Run</source>
<context-group purpose="location">
<context context-type="sourcefile">src/app/components/common/system-status-dialog/system-status-dialog.component.html</context>
<context context-type="linenumber">265</context>
<context context-type="linenumber">263</context>
</context-group>
<context-group purpose="location">
<context context-type="sourcefile">src/app/components/common/system-status-dialog/system-status-dialog.component.html</context>
<context context-type="linenumber">313</context>
<context context-type="linenumber">311</context>
</context-group>
</trans-unit>
<trans-unit id="5921685253729220446" datatype="html">
<source>WebSocket Connection</source>
<context-group purpose="location">
<context context-type="sourcefile">src/app/components/common/system-status-dialog/system-status-dialog.component.html</context>
<context context-type="linenumber">270</context>
<context context-type="linenumber">268</context>
</context-group>
</trans-unit>
<trans-unit id="8998179362936748717" datatype="html">
<source>OK</source>
<context-group purpose="location">
<context context-type="sourcefile">src/app/components/common/system-status-dialog/system-status-dialog.component.html</context>
<context context-type="linenumber">274</context>
<context context-type="linenumber">272</context>
</context-group>
</trans-unit>
<trans-unit id="3804349597565969872" datatype="html">
<source>AI Index</source>
<context-group purpose="location">
<context context-type="sourcefile">src/app/components/common/system-status-dialog/system-status-dialog.component.html</context>
<context context-type="linenumber">283</context>
<context context-type="linenumber">281</context>
</context-group>
</trans-unit>
<trans-unit id="6732151329960766506" datatype="html">
@@ -10939,20 +10910,6 @@
<context context-type="linenumber">361</context>
</context-group>
</trans-unit>
<trans-unit id="4493921125434706859" datatype="html">
<source>LLM Request Timeout</source>
<context-group purpose="location">
<context context-type="sourcefile">src/app/data/paperless-config.ts</context>
<context context-type="linenumber">365</context>
</context-group>
</trans-unit>
<trans-unit id="483994032066441287" datatype="html">
<source>Timeout in seconds for LLM requests.</source>
<context-group purpose="location">
<context context-type="sourcefile">src/app/data/paperless-config.ts</context>
<context context-type="linenumber">369</context>
</context-group>
</trans-unit>
<trans-unit id="9155387182259025015" datatype="html">
<source>Processing</source>
<context-group purpose="location">
+26
View File
@@ -13,6 +13,8 @@ import { DocumentDetailComponent } from './components/document-detail/document-d
import { DocumentListComponent } from './components/document-list/document-list.component'
import { DocumentAttributesComponent } from './components/manage/document-attributes/document-attributes.component'
import { MailComponent } from './components/manage/mail/mail.component'
import { OcrTemplateEditorComponent } from './components/manage/ocr-templates/ocr-template-editor/ocr-template-editor.component'
import { OcrTemplatesComponent } from './components/manage/ocr-templates/ocr-templates.component'
import { SavedViewsComponent } from './components/manage/saved-views/saved-views.component'
import { WorkflowsComponent } from './components/manage/workflows/workflows.component'
import { NotFoundComponent } from './components/not-found/not-found.component'
@@ -274,6 +276,30 @@ export const routes: Routes = [
componentName: 'WorkflowsComponent',
},
},
{
path: 'ocr-templates',
component: OcrTemplatesComponent,
canActivate: [PermissionsGuard],
data: {
requiredPermission: {
action: PermissionAction.View,
type: PermissionType.OcrTemplate,
},
componentName: 'OcrTemplatesComponent',
},
},
{
path: 'ocr-templates/:id',
component: OcrTemplateEditorComponent,
canActivate: [PermissionsGuard],
data: {
requiredPermission: {
action: PermissionAction.Change,
type: PermissionType.OcrTemplate,
},
componentName: 'OcrTemplateEditorComponent',
},
},
{
path: 'mail',
component: MailComponent,
@@ -243,6 +243,14 @@
<i-bs class="me-2" name="boxes"></i-bs><span><ng-container i18n>Workflows</ng-container></span>
</a>
</li>
<li class="nav-item app-link"
*pngxIfPermissions="{ action: PermissionAction.View, type: PermissionType.OcrTemplate }">
<a class="nav-link" routerLink="ocr-templates" routerLinkActive="active" (click)="closeMenu()"
ngbPopover="OCR Templates" i18n-ngbPopover [disablePopover]="!slimSidebarEnabled" placement="end"
container="body" triggers="mouseenter:mouseleave" popoverClass="popover-slim">
<i-bs class="me-2" name="file-earmark-break"></i-bs><span><ng-container i18n>OCR Templates</ng-container></span>
</a>
</li>
<li class="nav-item app-link" *pngxIfPermissions="{ action: PermissionAction.View, type: PermissionType.MailAccount }"
tourAnchor="tour.mail">
<a class="nav-link" routerLink="mail" routerLinkActive="active" (click)="closeMenu()" ngbPopover="Mail"
@@ -82,6 +82,14 @@
<i-bs name="pencil" class="me-1"></i-bs><ng-container i18n>PDF Editor</ng-container>
</button>
<button ngbDropdownItem (click)="runZoneOcr()" [disabled]="!userCanEdit || !document?.document_type">
<i-bs width="1em" height="1em" name="file-earmark-ruled" class="me-1"></i-bs><span i18n>Run Zone OCR</span>
</button>
<button ngbDropdownItem (click)="createOcrTemplate()">
<i-bs width="1em" height="1em" name="file-earmark-medical" class="me-1"></i-bs><span i18n>Create OCR Template</span>
</button>
@if (userIsOwner && (requiresPassword || password)) {
<button ngbDropdownItem (click)="removePassword()" [disabled]="!password">
<i-bs name="unlock" class="me-1"></i-bs><ng-container i18n>Remove Password</ng-container>
@@ -1405,6 +1405,48 @@ export class DocumentDetailComponent
})
}
/**
 * Runs zone OCR for the current document and reports the outcome in a toast.
 *
 * A result counts as "failed" when its value is null, undefined or blank
 * once stringified and trimmed. After a successful call the document is
 * fetched again and handed to `updateComponent`.
 */
runZoneOcr() {
  // True when a zone produced nothing usable.
  const isBlank = (value: unknown): boolean =>
    value === null || value === undefined || `${value}`.trim() === ''

  this.documentsService.runZoneOcr(this.document.id).subscribe({
    error: (error) => {
      this.toastService.showError($localize`Zone OCR failed`, error)
    },
    next: (res) => {
      const results = res.results ?? []
      if (!results.length) {
        this.toastService.showInfo(
          $localize`Zone OCR ran but no results extracted.`
        )
      } else {
        const unmatched = results.filter((r) => isBlank(r.value))
        const filled = results.length - unmatched.length
        let msg = $localize`Filled ${filled} of ${results.length} fields`
        if (unmatched.length) {
          const names = unmatched.map((r) => r.zone).join(', ')
          msg = `${msg}. ${$localize`Failed to match zones: ${names}`}`
        }
        this.toastService.showInfo(msg)
      }
      // Refresh the view with whatever the server now holds for this document.
      this.documentsService
        .get(this.documentId)
        .subscribe((doc) => this.updateComponent(doc))
    },
  })
}
/**
 * Navigates to the "new OCR template" editor route, passing the current
 * document's type and id as query parameters.
 */
createOcrTemplate() {
  const { document_type, id: sample_document } = this.document
  const queryParams = { document_type, sample_document }
  this.router.navigate(['/ocr-templates', 'new'], { queryParams })
}
private getSelectedNonLatestVersionId(): number | null {
const versions = this.document?.versions ?? []
if (!versions.length || !this.selectedVersionId) {
@@ -95,6 +95,9 @@
<button ngbDropdownItem (click)="mergeSelected()" [disabled]="!userCanAdd || list.allSelected || list.selectedCount < 2">
<i-bs name="journals" class="me-1"></i-bs><ng-container i18n>Merge</ng-container>
</button>
<button ngbDropdownItem (click)="runZoneOcrSelected()" [disabled]="!userCanEditAll || list.allSelected">
<i-bs name="file-earmark-ruled" class="me-1"></i-bs><ng-container i18n>Run Zone OCR</ng-container>
</button>
</div>
</div>
</div>
@@ -12,7 +12,15 @@ import {
} from '@ng-bootstrap/ng-bootstrap'
import { saveAs } from 'file-saver'
import { NgxBootstrapIconsModule } from 'ngx-bootstrap-icons'
import { first, map, Observable, Subject, switchMap, takeUntil } from 'rxjs'
import {
first,
forkJoin,
map,
Observable,
Subject,
switchMap,
takeUntil,
} from 'rxjs'
import { ConfirmDialogComponent } from 'src/app/components/common/confirm-dialog/confirm-dialog.component'
import { CustomField } from 'src/app/data/custom-field'
import { MatchingModel } from 'src/app/data/matching-model'
@@ -908,6 +916,27 @@ export class BulkEditorComponent
})
}
/**
 * Asks for confirmation, then runs zone OCR once per selected document.
 *
 * Does nothing when the selection is empty. The per-document requests are
 * combined with `forkJoin` and handed to `executeDocumentAction` together
 * with the confirmation modal.
 */
runZoneOcrSelected() {
  const selectedIds = Array.from(this.list.selected)
  if (selectedIds.length === 0) {
    return
  }

  const modal = this.modalService.open(ConfirmDialogComponent, {
    backdrop: 'static',
  })
  const dialog = modal.componentInstance
  dialog.title = $localize`Run Zone OCR`
  dialog.messageBold = $localize`Run zone OCR on ${this.getSelectionSize()} selected document(s)?`
  dialog.message = $localize`Each document's type template (if it has one) is applied, overwriting the mapped fields.`
  dialog.btnCaption = $localize`Proceed`

  dialog.confirmClicked
    .pipe(takeUntil(this.unsubscribeNotifier))
    .subscribe(() => {
      // Disable the dialog buttons while the requests are handed off.
      dialog.buttonsEnabled = false
      const requests = selectedIds.map((id) =>
        this.documentService.runZoneOcr(id)
      )
      this.executeDocumentAction(modal, forkJoin(requests))
    })
}
setPermissions() {
let modal = this.modalService.open(PermissionsDialogComponent, {
backdrop: 'static',
@@ -0,0 +1,414 @@
<pngx-page-header [title]="pageTitle" [id]="template.id">
<div class="input-group input-group-sm me-5 align-items-center">
<div class="input-group-text">
<i-bs name="file-text"></i-bs>
</div>
<input
type="text"
class="form-control"
[(ngModel)]="previewDocModel"
[ngbTypeahead]="searchDocuments"
[inputFormatter]="documentFormatter"
[resultFormatter]="documentFormatter"
(selectItem)="onPreviewDocSelected($event)"
[editable]="false"
placeholder="Search documents by title..."
i18n-placeholder
/>
</div>
<div class="d-flex align-items-center flex-wrap gap-2">
<div class="input-group input-group-sm ms-2 d-none d-md-flex">
<div class="input-group-text" i18n>Page</div>
<input class="form-control flex-grow-0 w-auto" type="number" min="1" [max]="previewPageCount" [(ngModel)]="previewPageDisplay" />
<div class="input-group-text" i18n>of {{previewPageCount}}</div>
</div>
<button type="button" class="btn btn-sm btn-outline-secondary" i18n-title title="Previous" (click)="prevPage()" [disabled]="!pageImageUrl || previewPage <= 0">
<i-bs width="1.2em" height="1.2em" name="arrow-left"></i-bs>
</button>
<button type="button" class="btn btn-sm btn-outline-secondary" i18n-title title="Next" (click)="nextPage()" [disabled]="!pageImageUrl || previewPage >= (previewPageCount ?? 1) - 1">
<i-bs width="1.2em" height="1.2em" name="arrow-right"></i-bs>
</button>
<div class="input-group input-group-sm">
<button class="btn btn-outline-secondary" (click)="zoomOut()" i18n>-</button>
<span class="input-group-text">{{ zoom * 100 | number: '1.0-0' }}%</span>
<button class="btn btn-outline-secondary" (click)="zoomIn()" i18n>+</button>
</div>
</div>
</pngx-page-header>
<div class="row">
<div class="col-md-4">
<div class="btn-toolbar mb-1 border-bottom">
<div class="btn-group pb-3">
<a routerLink="/ocr-templates" class="btn btn-sm btn-outline-secondary">
<i-bs width="1.2em" height="1.2em" name="x"></i-bs>
<span class="ms-1" i18n>Close</span>
</a>
</div>
<div class="btn-group ms-auto pb-3">
<button class="btn btn-sm btn-primary" (click)="save()" [disabled]="saving">
@if (saving) {
<span class="spinner-border spinner-border-sm me-1"></span>
}
<span i18n>Save</span>
</button>
</div>
</div>
<ul ngbNav #nav="ngbNav" [(activeId)]="activeTab" class="nav-underline flex-nowrap flex-md-wrap overflow-auto">
<li ngbNavItem="settings">
<a ngbNavLink i18n>Settings</a>
<ng-template ngbNavContent>
<div class="row mb-3">
<div class="col-9">
<pngx-input-text [(ngModel)]="template.name" title="Template name" i18n-title></pngx-input-text>
</div>
<div class="col-3">
<pngx-input-switch [(ngModel)]="template.enabled" title="Enabled" i18n-title></pngx-input-switch>
</div>
</div>
<pngx-input-select [(ngModel)]="template.document_type" [items]="documentTypes" bindLabel="name" bindValue="id" title="Document type" i18n-title></pngx-input-select>
<small class="text-muted" i18n>
Draw rectangles on the preview to define extraction zones. Use the
page controls above the preview to add zones on different pages.
</small>
</ng-template>
</li>
<li ngbNavItem="zones">
<a ngbNavLink><ng-container i18n>Zones</ng-container> <span class="badge bg-primary ms-2">{{ template.zones.length }}</span></a>
<ng-template ngbNavContent>
@if (template.zones.length === 0) {
<p class="text-muted" i18n>
No zones defined. Load a document preview and draw rectangles to add zones.
</p>
}
<div class="list-group">
@for (zone of template.zones; track $index; let i = $index) {
<div
class="list-group-item list-group-item-action d-flex justify-content-between align-items-center"
[style.box-shadow]="selectedZoneIndex === i ? 'inset 3px 0 0 0 var(--bs-primary)' : null"
>
<div class="flex-grow-1" role="button" style="cursor: pointer;" (click)="selectZone(i)">
<div><strong [class.text-primary]="selectedZoneIndex === i">{{ zone.name }}</strong></div>
<div class="small text-muted">
{{ getZoneTargetName(zone) }} - {{ zone.width }}x{{ zone.height }}px <ng-container i18n>p.</ng-container>{{ zonePage(zone) }}
</div>
</div>
<div class="btn-group">
<button class="btn btn-sm btn-outline-secondary" type="button" (click)="selectZone(i)" title="Edit" i18n-title>
<i-bs name="pencil"></i-bs>
</button>
<button class="btn btn-sm btn-outline-danger" type="button" (click)="removeZone(i)" title="Delete" i18n-title>
<i-bs name="trash"></i-bs>
</button>
</div>
</div>
}
</div>
</ng-template>
</li>
<li ngbNavItem="zone">
<a ngbNavLink i18n>Zone</a>
<ng-template ngbNavContent>
@if (selectedZone; as zone) {
<div class="d-flex justify-content-between align-items-center mb-3">
<strong>{{ zone.name }}</strong>
<div class="d-flex gap-2">
<button class="btn btn-sm btn-primary" (click)="save()" [disabled]="saving">
@if (saving) {
<span class="spinner-border spinner-border-sm me-1"></span>
}
<span i18n>Save</span>
</button>
<button class="btn btn-sm btn-outline-danger" (click)="deleteSelectedZone()">
<i-bs name="trash" class="me-1"></i-bs><ng-container i18n>Delete zone</ng-container>
</button>
</div>
</div>
<div class="mb-3">
<label class="form-label" i18n>Zone Name</label>
<input
type="text"
class="form-control"
[(ngModel)]="zone.name"
(ngModelChange)="redrawCanvas()"
/>
</div>
<div class="mb-3">
<label class="form-label" i18n>Page</label>
<input
type="number"
class="form-control"
[(ngModel)]="zone.page"
min="-1"
(ngModelChange)="redrawCanvas()"
/>
<small class="text-muted" i18n>Page this zone is on. Use -1 for the last page. Set automatically when you draw it.</small>
</div>
<div class="mb-3">
<label class="form-label" i18n>Field</label>
<div class="input-group">
<select class="form-select" [ngModel]="zoneFieldValue(zone)" (ngModelChange)="setZoneField(zone, $event)">
<optgroup label="Built-in fields" i18n-label>
@for (t of builtinTargets; track t.id) {
<option [ngValue]="t.id">{{ t.name }}</option>
}
</optgroup>
<optgroup label="Custom fields" i18n-label>
@for (cf of customFields; track cf.id) {
<option [ngValue]="cf.id">{{ cf.name }} ({{ cf.data_type }})</option>
}
</optgroup>
</select>
<button
class="btn btn-outline-secondary"
type="button"
(click)="openQuickCreate(selectedZoneIndex)"
title="Create new custom field"
i18n-title
>
<i-bs name="plus"></i-bs>
</button>
</div>
<small class="text-muted" i18n>Write the extracted value to a custom field, or to a built-in field (Title, ASN, Date created).</small>
</div>
@if (isFieldShared(zone)) {
<div class="card mb-3 border-info">
<div class="card-body">
<h6 class="card-title d-flex align-items-center gap-2">
<i-bs name="braces"></i-bs>
<span i18n>Combine zones into this field</span>
</h6>
<p class="small text-muted mb-2" i18n>
More than one zone writes to this field. Build the combined
value below: click a zone to insert its token, and type any
separators or literal text between tokens.
</p>
<div class="d-flex flex-wrap gap-1 mb-2">
@for (z of zonesForField(zone); track $index) {
<button
type="button"
class="btn btn-sm btn-outline-info"
(click)="insertCombineToken(zone, z)"
title="Insert token"
i18n-title
>
+ {{ z.name || 'Zone' }}
</button>
}
</div>
<input
type="text"
class="form-control font-monospace"
[ngModel]="getCombineFormat(zone)"
(ngModelChange)="setCombineFormat(zone, $event)"
placeholder="{Zone 1} - {Zone 2}"
/>
<small class="text-muted" i18n>
Tokens are matched by zone name. An empty zone leaves its
token blank and the stray separator is trimmed. Leave empty
to just join the zones in order with a space.
</small>
</div>
</div>
}
@if (showQuickCreate) {
<div class="card mb-3 border-primary">
<div class="card-body">
<h6 class="card-title" i18n>Create Custom Field</h6>
<div class="mb-2">
<label class="form-label small" i18n>Field Name</label>
<input type="text" class="form-control form-control-sm"
[(ngModel)]="quickCreateName" placeholder="e.g. Invoice Number" />
</div>
<div class="mb-2">
<label class="form-label small" i18n>Field Type</label>
<select class="form-select form-select-sm" [(ngModel)]="quickCreateType">
@for (t of quickCreateTypes; track t.id) {
<option [ngValue]="t.id">{{ t.name }}</option>
}
</select>
</div>
<div class="d-flex gap-2">
<button class="btn btn-primary btn-sm" (click)="submitQuickCreate()"
[disabled]="!quickCreateName.trim()" i18n>
Create & Assign
</button>
<button class="btn btn-outline-secondary btn-sm" (click)="cancelQuickCreate()" i18n>
Cancel
</button>
</div>
</div>
</div>
}
<div class="mb-3">
<label class="form-label" i18n>OCR Language</label>
<ng-select
[items]="ocrLanguageOptions"
bindLabel="name"
bindValue="id"
[multiple]="true"
[closeOnSelect]="false"
[ngModel]="ocrLanguageArray(zone)"
(ngModelChange)="setOcrLanguages(zone, $event)"
placeholder="Select languages"
i18n-placeholder
></ng-select>
</div>
<div class="mb-3">
<label class="form-label" i18n>Transform</label>
<select class="form-select" [(ngModel)]="zone.transform">
@for (opt of transformOptions; track opt.id) {
<option [ngValue]="opt.id">{{ opt.name }}</option>
}
</select>
</div>
@if (zone.transform === 'date') {
<div class="mb-3">
<label class="form-label" i18n>Date format</label>
<select class="form-select" [ngModel]="dateFormatChoice(zone)" (ngModelChange)="setDateFormatChoice(zone, $event)">
@for (opt of dateFormatOptions; track opt.id) {
<option [ngValue]="opt.id">{{ opt.name }}</option>
}
<option [ngValue]="'custom'" i18n>Custom...</option>
</select>
@if (dateFormatCustom) {
<div class="input-group mt-2">
<input type="text" class="form-control font-monospace" [(ngModel)]="zone.date_format" placeholder="%d.%m.%Y" />
<button class="btn btn-outline-secondary" type="button" [ngbPopover]="dateFmtHelp" [autoClose]="true" title="Date format help" i18n-title>
<i-bs name="question-circle"></i-bs>
</button>
</div>
<ng-template #dateFmtHelp>
<p class="mb-1" i18n>Python date codes:</p>
<ul class="mb-1 ps-3">
<li><code>%d</code> <ng-container i18n>day (01-31)</ng-container></li>
<li><code>%m</code> <ng-container i18n>month (01-12)</ng-container></li>
<li><code>%Y</code> <ng-container i18n>year, 4-digit</ng-container></li>
<li><code>%y</code> <ng-container i18n>year, 2-digit</ng-container></li>
<li><code>%b</code> <ng-container i18n>month name (Jan)</ng-container></li>
</ul>
<span i18n>Example:</span> <code>%d.%m.%Y</code> -> 03.03.2026
</ng-template>
}
</div>
}
<div class="mb-3">
<label class="form-label" i18n>Validation Regex</label>
<input
type="text"
class="form-control font-monospace"
[(ngModel)]="zone.validation_regex"
placeholder="e.g. \d{2}\.\d{2}\.\d{4}"
>
</div>
<div class="text-muted small">
{{ zone.x }}, {{ zone.y }} - {{ zone.width }}x{{ zone.height }}px
</div>
<hr class="my-3" />
<h6 i18n>Test</h6>
@if (!previewDocId) {
<p class="text-muted small mb-0" i18n>
Select a preview document using the search box above to test this zone.
</p>
} @else {
<button class="btn btn-sm btn-outline-secondary" (click)="testZone()" [disabled]="zoneTesting">
@if (zoneTesting) {
<span class="spinner-border spinner-border-sm me-1"></span>
}
<span i18n>Test this zone</span>
</button>
@if (zoneTestResult) {
@if (zoneTestResult.error) {
<div class="alert alert-warning py-2 mt-2 mb-0 small">{{ zoneTestResult.error }}</div>
} @else {
<dl class="row small mt-2 mb-0">
<dt class="col-sm-4" i18n>OCR text</dt>
<dd class="col-sm-8"><code>{{ zoneTestResult.raw_text || '(nothing detected)' }}</code></dd>
<dt class="col-sm-4" i18n>Value</dt>
<dd class="col-sm-8"><code>{{ zoneTestResult.value || '(empty)' }}</code></dd>
@if (zoneTestResult.regex) {
<dt class="col-sm-4" i18n>Validation</dt>
<dd class="col-sm-8">
@if (zoneTestResult.regex_match) {
<span class="badge bg-success" i18n>Regex matches</span>
} @else {
<span class="badge bg-danger" i18n>Regex does not match</span>
}
</dd>
}
</dl>
}
}
}
} @else {
<p class="text-muted" i18n>
Select a zone from the Zones tab, or draw a rectangle on the document to create one.
</p>
}
</ng-template>
</li>
</ul>
<div [ngbNavOutlet]="nav" class="mt-3"></div>
</div>
<!-- Right column: Document preview with zone overlay -->
<div class="col-md-8">
@if (pageImageUrl) {
<div class="border" style="overflow: auto; max-height: 78vh;">
<div class="position-relative d-inline-block" [style.width.%]="zoom * 100">
<img
#pageImage
[src]="pageImageUrl"
(load)="onImageLoad()"
style="width: 100%; display: block;"
[style.visibility]="imageLoaded ? 'visible' : 'hidden'"
crossorigin="use-credentials"
/>
@if (imageLoaded) {
<canvas
#zoneCanvas
class="position-absolute top-0 start-0"
style="width: 100%; height: 100%; cursor: crosshair;"
(mousedown)="onCanvasMouseDown($event)"
(mousemove)="onCanvasMouseMove($event)"
(mouseup)="onCanvasMouseUp($event)"
></canvas>
}
@if (!imageLoaded) {
<div class="d-flex justify-content-center p-5">
<div class="spinner-border" role="status">
<span class="visually-hidden" i18n>Loading page...</span>
</div>
</div>
}
</div>
</div>
} @else {
<div class="border rounded p-5 text-center text-muted">
<i-bs name="file-earmark-image" width="48" height="48"></i-bs>
<p class="mt-3" i18n>
Search for a document by title above to preview a page and draw extraction zones.
</p>
</div>
}
</div>
</div>
@@ -0,0 +1,3 @@
:host {
display: block;
}
@@ -0,0 +1,997 @@
import { CommonModule } from '@angular/common'
import {
AfterViewInit,
Component,
ElementRef,
inject,
OnDestroy,
OnInit,
ViewChild,
} from '@angular/core'
import { FormsModule } from '@angular/forms'
import { ActivatedRoute, Router, RouterModule } from '@angular/router'
import {
NgbNavModule,
NgbPopoverModule,
NgbTypeaheadModule,
NgbTypeaheadSelectItemEvent,
} from '@ng-bootstrap/ng-bootstrap'
import { NgSelectModule } from '@ng-select/ng-select'
import { NgxBootstrapIconsModule } from 'ngx-bootstrap-icons'
import {
catchError,
debounceTime,
distinctUntilChanged,
map,
Observable,
of,
Subject,
switchMap,
takeUntil,
} from 'rxjs'
import { SelectComponent } from 'src/app/components/common/input/select/select.component'
import { SwitchComponent } from 'src/app/components/common/input/switch/switch.component'
import { TextComponent } from 'src/app/components/common/input/text/text.component'
import { PageHeaderComponent } from 'src/app/components/common/page-header/page-header.component'
import { CustomField } from 'src/app/data/custom-field'
import { Document } from 'src/app/data/document'
import { DocumentType } from 'src/app/data/document-type'
import {
DATE_FORMAT_OPTIONS,
OCR_BUILTIN_TARGETS,
OCR_LANGUAGE_OPTIONS,
OcrTemplate,
OcrTemplateZone,
OcrZoneTestResult,
TRANSFORM_OPTIONS,
ZoneTestRequest,
} from 'src/app/data/ocr-template'
import { CorrespondentService } from 'src/app/services/rest/correspondent.service'
import { CustomFieldsService } from 'src/app/services/rest/custom-fields.service'
import { DocumentTypeService } from 'src/app/services/rest/document-type.service'
import { DocumentService } from 'src/app/services/rest/document.service'
import { OcrTemplateService } from 'src/app/services/rest/ocr-template.service'
import { ToastService } from 'src/app/services/toast.service'
// Rectangle described by two corner points, (startX, startY) and (endX, endY).
// Used as the type of the component's `currentRect` field.
// NOTE(review): presumably the drag origin and the current pointer position
// while a zone is being drawn; the coordinate space is not visible here — confirm.
interface DrawingRect {
startX: number
startY: number
endX: number
endY: number
}
// Compass-style identifier for one of eight handles: four edges (n, s, e, w)
// and four corners (ne, nw, se, sw). Type of the component's `resizeHandle` field.
type ResizeHandle = 'n' | 's' | 'e' | 'w' | 'ne' | 'nw' | 'se' | 'sw'
// Identifier of the editor's left-hand tabs; type of the `activeTab` field.
// The values match the `ngbNavItem` ids used in the component template.
type ActiveTab = 'settings' | 'zones' | 'zone'
@Component({
selector: 'pngx-ocr-template-editor',
standalone: true,
imports: [
PageHeaderComponent,
TextComponent,
SelectComponent,
SwitchComponent,
CommonModule,
FormsModule,
RouterModule,
NgbNavModule,
NgbPopoverModule,
NgbTypeaheadModule,
NgSelectModule,
NgxBootstrapIconsModule,
],
templateUrl: './ocr-template-editor.component.html',
styleUrls: ['./ocr-template-editor.component.scss'],
})
export class OcrTemplateEditorComponent
implements OnInit, OnDestroy, AfterViewInit
{
private readonly route = inject(ActivatedRoute)
private readonly router = inject(Router)
private readonly templateService = inject(OcrTemplateService)
private readonly customFieldsService = inject(CustomFieldsService)
private readonly documentTypeService = inject(DocumentTypeService)
private readonly correspondentService = inject(CorrespondentService)
private readonly documentService = inject(DocumentService)
private readonly toastService = inject(ToastService)
private readonly destroy$ = new Subject<void>()
@ViewChild('zoneCanvas') canvasRef: ElementRef<HTMLCanvasElement>
@ViewChild('pageImage') imageRef: ElementRef<HTMLImageElement>
template: OcrTemplate = {
id: null,
name: '',
document_type: null,
sample_document: null,
source_width: 0,
source_height: 0,
enabled: true,
combine_formats: {},
zones: [],
}
customFields: CustomField[] = []
documentTypes: DocumentType[] = []
transformOptions = TRANSFORM_OPTIONS
builtinTargets = OCR_BUILTIN_TARGETS
dateFormatOptions = DATE_FORMAT_OPTIONS
ocrLanguageOptions = OCR_LANGUAGE_OPTIONS
dateFormatCustom = false
isNew = true
saving = false
previewDocId: number | null = null
previewPage = 0
previewPageCount: number | null = null
private pageCountForDoc: number | null = null
pageImageUrl: string | null = null
imageLoaded = false
zoom = 1
previewDocModel: Document | string = ''
private correspondentNames = new Map<number, string>()
/**
 * One-based page number shown in the page input.
 * `previewPage` itself is zero-based.
 */
public get previewPageDisplay(): number {
  return this.previewPage + 1
}

/**
 * Applies a one-based page number entered in the page input.
 *
 * The value is converted to a zero-based index and routed through
 * `goToPage`, which clamps it to the known page range and reloads the
 * preview image when the page actually changes.
 */
public set previewPageDisplay(value: number) {
  // A cleared number input yields null; ignore it instead of jumping pages.
  if (
    value === null ||
    value === undefined ||
    !Number.isFinite(Number(value))
  ) {
    return
  }
  // Displayed pages start at 1, so the zero-based index must never drop below 0.
  const page = Math.max(1, Math.trunc(Number(value))) - 1
  this.goToPage(page)
}
activeTab: ActiveTab = 'settings'
isDrawing = false
currentRect: DrawingRect | null = null
selectedZoneIndex: number | null = null
isResizing = false
resizeHandle: ResizeHandle | null = null
resizeZoneIndex: number | null = null
private readonly HANDLE_SIZE = 8
isMoving = false
moveZoneIndex: number | null = null
private moveStart = { mouseX: 0, mouseY: 0, zoneX: 0, zoneY: 0 }
zoneTestResult: OcrZoneTestResult | null = null
zoneTesting = false
showQuickCreate = false
quickCreateName = ''
quickCreateType = 'string'
quickCreateForZoneIndex: number | null = null
quickCreateTypes = [
{ id: 'string', name: $localize`String` },
{ id: 'integer', name: $localize`Integer` },
{ id: 'float', name: $localize`Float` },
{ id: 'date', name: $localize`Date` },
{ id: 'monetary', name: $localize`Monetary` },
{ id: 'boolean', name: $localize`Boolean` },
{ id: 'url', name: $localize`URL` },
{ id: 'longtext', name: $localize`Long Text` },
]
/**
 * The zone at `selectedZoneIndex`, or null when nothing is selected or the
 * index does not point at an existing zone.
 */
get selectedZone(): OcrTemplateZone | null {
  if (this.selectedZoneIndex === null) {
    return null
  }
  return this.template.zones[this.selectedZoneIndex] ?? null
}
/** Page header title; depends on whether a new template is being created. */
get pageTitle(): string {
  if (this.isNew) {
    return $localize`New OCR Template`
  }
  return $localize`Edit OCR Template`
}
/**
 * Loads lookup data (custom fields, document types, correspondent names)
 * and initialises the template being edited.
 *
 * For an existing template (route param `id` other than `new`) the template
 * is fetched and, if it has a sample document, that document's preview is
 * loaded. For a new template the optional `document_type` and
 * `sample_document` query parameters pre-fill the form; values that do not
 * parse as integers are ignored.
 */
ngOnInit() {
  // Parses a decimal integer; returns null instead of NaN for malformed input.
  const parseId = (raw: unknown): number | null => {
    const parsed = parseInt(String(raw ?? ''), 10)
    return Number.isNaN(parsed) ? null : parsed
  }

  this.customFieldsService
    .listAll()
    .pipe(takeUntil(this.destroy$))
    .subscribe((r) => (this.customFields = r.results))
  this.documentTypeService
    .listAll()
    .pipe(takeUntil(this.destroy$))
    .subscribe((r) => (this.documentTypes = r.results))
  this.correspondentService
    .listAll()
    .pipe(takeUntil(this.destroy$))
    .subscribe((r) => {
      this.correspondentNames = new Map(r.results.map((c) => [c.id, c.name]))
    })

  const id = this.route.snapshot.paramMap.get('id')
  if (id && id !== 'new') {
    this.isNew = false
    this.templateService
      .get(parseInt(id, 10))
      .pipe(takeUntil(this.destroy$))
      .subscribe((t) => {
        this.template = t
        // Guarantee the map exists even if the loaded template lacks it.
        this.template.combine_formats ??= {}
        if (t.sample_document) {
          this.previewDocId = t.sample_document
          this.loadPreview()
        }
      })
  } else {
    const qp = this.route.snapshot.queryParams
    if (qp['document_type']) {
      const documentType = parseId(qp['document_type'])
      // Skip garbage such as ?document_type=abc rather than storing NaN.
      if (documentType !== null) {
        this.template.document_type = documentType
      }
    }
    if (qp['sample_document']) {
      const docId = parseId(qp['sample_document'])
      if (docId !== null) {
        this.template.sample_document = docId
        this.previewDocId = docId
        this.loadPreview()
      }
    }
  }
}
// Empty lifecycle hook: no work is done after view initialisation; it only
// satisfies the AfterViewInit interface the class declares.
ngAfterViewInit() {}
/**
 * Typeahead search function for the preview-document input.
 *
 * Debounces the typed text, skips terms shorter than two characters after
 * trimming, and queries documents whose title contains the term. When the
 * template has a document type, results are restricted to that type.
 * Request failures yield an empty result list.
 */
searchDocuments = (text$: Observable<string>): Observable<Document[]> => {
  const runQuery = (term: string): Observable<Document[]> => {
    if (!term) {
      return of([])
    }
    const needle = term.trim()
    if (needle.length < 2) {
      return of([])
    }

    const params: { title__icontains: string; document_type__id?: number } = {
      title__icontains: needle,
    }
    // Read at query time so a changed document type applies to new searches.
    const docType = this.template.document_type
    if (docType) {
      params['document_type__id'] = docType
    }

    return this.documentService.list(1, 10, 'created', true, params).pipe(
      map((page) => page.results),
      catchError(() => of([]))
    )
  }

  return text$.pipe(
    debounceTime(250),
    distinctUntilChanged(),
    switchMap(runQuery)
  )
}
/**
 * Renders a typeahead entry as `#id title`, followed by the correspondent's
 * name in parentheses when it is known. Plain strings are passed through.
 */
documentFormatter = (doc: Document | string): string => {
  if (typeof doc === 'string') {
    return doc
  }
  const base = `#${doc.id} ${doc.title}`
  const correspondentName = doc.correspondent
    ? this.correspondentNames.get(doc.correspondent)
    : null
  if (correspondentName) {
    return `${base} (${correspondentName})`
  }
  return base
}
/**
 * Handles a typeahead selection: stores the chosen document as the preview
 * document, adopts its document type if the template has none yet, and loads
 * the first page.
 */
onPreviewDocSelected(event: NgbTypeaheadSelectItemEvent<Document>) {
  // The model is assigned manually below instead of by the typeahead.
  event.preventDefault()
  const picked: Document = event.item
  this.previewDocModel = picked
  this.previewDocId = picked.id
  const templateHasType = !!this.template.document_type
  if (!templateHasType && picked.document_type) {
    this.template.document_type = picked.document_type
  }
  this.previewPage = 0
  this.loadPreview()
}
/** Resets every piece of preview state back to "no document selected". */
clearPreviewDoc() {
  // Selected document.
  this.previewDocModel = ''
  this.previewDocId = null

  // Page bookkeeping.
  this.previewPageCount = null
  this.pageCountForDoc = null
  this.previewPage = 0

  // Rendered image.
  this.pageImageUrl = null
  this.imageLoaded = false
}
/**
 * Points the preview image at the current page of the preview document.
 * The first time a given document is previewed, its page count is fetched
 * as well (and the typeahead model is filled in if it is still empty).
 */
loadPreview() {
  const docId = this.previewDocId
  if (!docId) {
    return
  }

  const pageCountIsStale = this.pageCountForDoc !== docId
  if (pageCountIsStale) {
    // Remember the document up front so the fetch runs once per document.
    this.pageCountForDoc = docId
    this.previewPageCount = null
    this.documentService
      .get(docId)
      .pipe(takeUntil(this.destroy$))
      .subscribe({
        next: (doc) => {
          this.previewPageCount = doc?.page_count ?? null
          if (doc && !this.previewDocModel) {
            this.previewDocModel = doc
          }
        },
        error: () => {
          this.previewPageCount = null
        },
      })
  }

  this.pageImageUrl = this.templateService.getPageImageUrl(
    docId,
    this.previewPage
  )
  this.imageLoaded = false
}
/**
 * Switches the preview to the given zero-based page, clamped to the known
 * page range. Without a known page count only the lower bound is enforced.
 */
goToPage(page: number) {
  const lastPage = this.previewPageCount ? this.previewPageCount - 1 : page
  const target = Math.max(0, Math.min(page, lastPage))
  if (target !== this.previewPage) {
    this.previewPage = target
    this.loadPreview()
  }
}
/** Steps the preview one page back (goToPage clamps at the first page). */
prevPage() {
  const target = this.previewPage - 1
  this.goToPage(target)
}
/** Steps the preview one page forward (goToPage clamps at the last known page). */
nextPage() {
  const target = this.previewPage + 1
  this.goToPage(target)
}
/** Increases the zoom factor by 0.25 (rounded to two decimals), capped at 4. */
zoomIn() {
  const stepped = Math.round((this.zoom + 0.25) * 100) / 100
  this.zoom = Math.min(4, stepped)
  this.afterZoom()
}
/** Decreases the zoom factor by 0.25 (rounded to two decimals), floored at 0.5. */
zoomOut() {
  const stepped = Math.round((this.zoom - 0.25) * 100) / 100
  this.zoom = Math.max(0.5, stepped)
  this.afterZoom()
}
/** Returns the zoom factor to 1 and redraws. */
resetZoom() {
  const defaultZoom = 1
  this.zoom = defaultZoom
  this.afterZoom()
}
/** Schedules a canvas redraw after a zoom change. */
private afterZoom() {
  // Deferred so the wrapper has reflowed to its new width before the canvas
  // is resized to match the image.
  setTimeout(() => {
    this.redrawCanvas()
  })
}
/**
 * Resolves a zone's one-based page number. A missing page counts as 1;
 * -1 resolves to the preview's page count (falling back to the page being
 * shown when the count is unknown); any other value below 1 becomes 1.
 */
zonePage(zone: OcrTemplateZone): number {
  const raw = zone.page ?? 1
  if (raw === -1) {
    const shownPage = this.previewPage + 1
    return this.previewPageCount ?? shownPage
  }
  if (raw >= 1) {
    return raw
  }
  return 1
}
/** True when the zone's resolved page is the page currently previewed. */
private isOnCurrentPage(zone: OcrTemplateZone): boolean {
  const shownPage = this.previewPage + 1 // previewPage is zero-based
  return this.zonePage(zone) === shownPage
}
/**
 * Called when the preview image has loaded: records its natural size as the
 * template's source size and schedules the first overlay draw.
 */
onImageLoad() {
  this.imageLoaded = true
  const { naturalWidth, naturalHeight } = this.imageRef.nativeElement
  this.template.source_width = naturalWidth
  this.template.source_height = naturalHeight
  // The canvas is only rendered once imageLoaded is true, so draw on the
  // next tick instead of immediately.
  setTimeout(() => {
    this.redrawCanvas()
  })
}
/**
 * Starts a gesture on the overlay canvas. Priority: resize (pointer on a
 * handle of the selected zone), then select-and-move (pointer on a zone,
 * no Shift), otherwise draw a new rectangle.
 */
onCanvasMouseDown(event: MouseEvent) {
  const bounds = this.canvasRef.nativeElement.getBoundingClientRect()
  const px = event.clientX - bounds.left
  const py = event.clientY - bounds.top

  // 1) Resize: only handles of the currently selected zone are grabbable.
  const selected = this.selectedZoneIndex
  if (selected !== null) {
    const grabbed = this.findHandleAt(px, py, selected)
    if (grabbed) {
      this.isResizing = true
      this.resizeHandle = grabbed
      this.resizeZoneIndex = selected
      return
    }
  }

  // 2) Move: a plain click on a zone selects it and begins dragging it.
  const hitIndex = this.findZoneAt(px, py)
  const startMove = hitIndex !== null && !event.shiftKey
  if (startMove) {
    this.selectZone(hitIndex)
    const hitZone = this.template.zones[hitIndex]
    this.isMoving = true
    this.moveZoneIndex = hitIndex
    this.moveStart = {
      mouseX: px,
      mouseY: py,
      zoneX: hitZone.x,
      zoneY: hitZone.y,
    }
    return
  }

  // 3) Draw: Shift+click or a click on empty space starts a new zone.
  this.isDrawing = true
  this.currentRect = { startX: px, startY: py, endX: px, endY: py }
  this.selectedZoneIndex = null
}
/** CSS cursor shown while hovering each resize handle (built once, not per event). */
private readonly handleCursors: Record<ResizeHandle, string> = {
  nw: 'nw-resize',
  ne: 'ne-resize',
  sw: 'sw-resize',
  se: 'se-resize',
  n: 'n-resize',
  s: 's-resize',
  w: 'w-resize',
  e: 'e-resize',
}

/**
 * Continues the active gesture (resize, move or draw) with the new pointer
 * position; when no gesture is active, only updates the cursor as hover
 * feedback.
 */
onCanvasMouseMove(event: MouseEvent) {
  const rect = this.canvasRef.nativeElement.getBoundingClientRect()
  const mx = event.clientX - rect.left
  const my = event.clientY - rect.top

  if (this.isResizing && this.resizeZoneIndex !== null && this.resizeHandle) {
    this.applyResize(mx, my)
    this.redrawCanvas()
    return
  }
  if (this.isMoving && this.moveZoneIndex !== null) {
    this.dragMoveZone(mx, my)
    this.redrawCanvas()
    return
  }
  if (this.isDrawing && this.currentRect) {
    this.currentRect.endX = mx
    this.currentRect.endY = my
    this.redrawCanvas()
    return
  }
  this.updateHoverCursor(mx, my)
}

/** Moves the dragged zone by the pointer delta since mouse-down, kept inside the source image. */
private dragMoveZone(mx: number, my: number) {
  const zone = this.template.zones[this.moveZoneIndex]
  // The zone may have been removed while the drag was in progress.
  if (!zone) return
  const canvas = this.canvasRef.nativeElement
  const img = this.imageRef.nativeElement
  const srcW = zone.zone_source_width || img.naturalWidth
  const srcH = zone.zone_source_height || img.naturalHeight
  // Display pixels -> source-image pixels.
  const scaleX = srcW / canvas.width
  const scaleY = srcH / canvas.height
  const dx = Math.round((mx - this.moveStart.mouseX) * scaleX)
  const dy = Math.round((my - this.moveStart.mouseY) * scaleY)
  zone.x = Math.max(0, Math.min(this.moveStart.zoneX + dx, srcW - zone.width))
  zone.y = Math.max(0, Math.min(this.moveStart.zoneY + dy, srcH - zone.height))
}

/** Cursor feedback while idle: resize handle > move (over a zone) > crosshair. */
private updateHoverCursor(mx: number, my: number) {
  const canvas = this.canvasRef.nativeElement
  if (this.selectedZoneIndex !== null) {
    const handle = this.findHandleAt(mx, my, this.selectedZoneIndex)
    if (handle) {
      canvas.style.cursor = this.handleCursors[handle] || 'crosshair'
      return
    }
  }
  canvas.style.cursor =
    this.findZoneAt(mx, my) !== null ? 'move' : 'crosshair'
}
/**
 * Ends the active gesture. Move and resize simply clear their state. A draw
 * gesture converts the dragged rectangle from display pixels to source-image
 * pixels and appends it as a new zone on the previewed page; rectangles
 * smaller than 10 source pixels in either dimension are discarded as
 * accidental clicks.
 */
onCanvasMouseUp(event: MouseEvent) {
  if (this.isMoving) {
    this.isMoving = false
    this.moveZoneIndex = null
    return
  }
  if (this.isResizing) {
    this.isResizing = false
    this.resizeHandle = null
    this.resizeZoneIndex = null
    return
  }
  if (!this.isDrawing || !this.currentRect) return
  this.isDrawing = false

  const canvas = this.canvasRef.nativeElement
  const img = this.imageRef.nativeElement
  // Display pixels -> natural image pixels.
  const scaleX = img.naturalWidth / canvas.width
  const scaleY = img.naturalHeight / canvas.height
  const drawn = this.currentRect
  // min/abs normalise the rectangle regardless of drag direction.
  const x = Math.round(Math.min(drawn.startX, drawn.endX) * scaleX)
  const y = Math.round(Math.min(drawn.startY, drawn.endY) * scaleY)
  const w = Math.round(Math.abs(drawn.endX - drawn.startX) * scaleX)
  const h = Math.round(Math.abs(drawn.endY - drawn.startY) * scaleY)

  // Ignore tiny accidental clicks.
  if (w < 10 || h < 10) {
    this.currentRect = null
    this.redrawCanvas()
    return
  }

  const zone: OcrTemplateZone = {
    // Zone names double as {tokens} in combine formats, so keep them unique.
    name: this.nextDefaultZoneName(),
    target: 'custom_field',
    custom_field:
      this.customFields.length > 0 ? this.customFields[0].id : null,
    x,
    y,
    width: w,
    height: h,
    page: this.previewPage + 1,
    ocr_language: 'deu+eng',
    transform: 'strip',
    date_format: '',
    validation_regex: '',
    order: this.template.zones.length,
    zone_source_width: img.naturalWidth,
    zone_source_height: img.naturalHeight,
  }
  this.template.zones.push(zone)
  this.currentRect = null
  this.selectZone(this.template.zones.length - 1)
}

/**
 * Returns a default `Zone N` name not used by any existing zone. N starts at
 * `zones.length + 1` and counts up, so a deletion followed by a new zone
 * cannot produce a duplicate name.
 */
private nextDefaultZoneName(): string {
  const taken = new Set(this.template.zones.map((z) => z.name))
  let n = this.template.zones.length + 1
  while (taken.has(`Zone ${n}`)) {
    n++
  }
  return `Zone ${n}`
}
/**
 * Converts a zone's source-pixel rectangle to canvas (display) pixels.
 * Returns null when the canvas or image is not ready, the index has no
 * zone, or the zone is not on the page being previewed.
 */
private getZoneDisplayRect(
  zoneIdx: number
): { x: number; y: number; w: number; h: number } | null {
  const canvas = this.canvasRef?.nativeElement
  const img = this.imageRef?.nativeElement
  const ready = !!canvas && !!img && !!img.naturalWidth
  if (!ready) {
    return null
  }
  const zone = this.template.zones[zoneIdx]
  if (!zone || !this.isOnCurrentPage(zone)) {
    return null
  }
  // Source pixels -> display pixels; zones remember the size they were drawn on.
  const kx = canvas.width / (zone.zone_source_width || img.naturalWidth)
  const ky = canvas.height / (zone.zone_source_height || img.naturalHeight)
  return {
    x: zone.x * kx,
    y: zone.y * ky,
    w: zone.width * kx,
    h: zone.height * ky,
  }
}
/**
 * Returns the resize handle of the given zone lying within HANDLE_SIZE
 * pixels (per axis) of the point, or null. Handles are tested in the order
 * nw, n, ne, w, e, sw, s, se and the first match wins.
 */
private findHandleAt(
  mx: number,
  my: number,
  zoneIdx: number
): ResizeHandle | null {
  const rect = this.getZoneDisplayRect(zoneIdx)
  if (!rect) {
    return null
  }
  const tolerance = this.HANDLE_SIZE
  const left = rect.x
  const top = rect.y
  const right = rect.x + rect.w
  const bottom = rect.y + rect.h
  const midX = rect.x + rect.w / 2
  const midY = rect.y + rect.h / 2
  const candidates: [ResizeHandle, number, number][] = [
    ['nw', left, top],
    ['n', midX, top],
    ['ne', right, top],
    ['w', left, midY],
    ['e', right, midY],
    ['sw', left, bottom],
    ['s', midX, bottom],
    ['se', right, bottom],
  ]
  const hit = candidates.find(
    ([, hx, hy]) =>
      Math.abs(mx - hx) <= tolerance && Math.abs(my - hy) <= tolerance
  )
  return hit ? hit[0] : null
}
/**
 * Resizes the zone being dragged so the grabbed edge(s) follow the pointer.
 * West/north edges move the origin while keeping the opposite edge fixed;
 * east/south edges only change the size. Each dimension stays at least 10
 * source pixels.
 */
private applyResize(mx: number, my: number) {
  const canvas = this.canvasRef.nativeElement
  const img = this.imageRef.nativeElement
  const zone = this.template.zones[this.resizeZoneIndex]
  if (!zone) {
    return
  }
  const srcW = zone.zone_source_width || img.naturalWidth
  const srcH = zone.zone_source_height || img.naturalHeight
  // Pointer position in source-image pixels.
  const pointerX = Math.round(mx * (srcW / canvas.width))
  const pointerY = Math.round(my * (srcH / canvas.height))
  const grip = this.resizeHandle
  const grabs = (edge: string) => grip.includes(edge)

  if (grabs('w')) {
    const fixedRight = zone.x + zone.width
    zone.x = Math.max(0, Math.min(pointerX, fixedRight - 10))
    zone.width = fixedRight - zone.x
  }
  if (grabs('e')) {
    zone.width = Math.max(10, pointerX - zone.x)
  }
  if (grabs('n')) {
    const fixedBottom = zone.y + zone.height
    zone.y = Math.max(0, Math.min(pointerY, fixedBottom - 10))
    zone.height = fixedBottom - zone.y
  }
  if (grabs('s')) {
    zone.height = Math.max(10, pointerY - zone.y)
  }
}
/**
 * Returns the index of the zone on the current page that contains the given
 * canvas point, or null. Zones are searched from last to first, so the most
 * recently added zone wins where zones overlap.
 */
private findZoneAt(displayX: number, displayY: number): number | null {
  const canvas = this.canvasRef.nativeElement
  const img = this.imageRef.nativeElement
  if (!img.naturalWidth) {
    return null
  }
  const zones = this.template.zones
  let idx = zones.length
  while (idx-- > 0) {
    const zone = zones[idx]
    if (!this.isOnCurrentPage(zone)) {
      continue
    }
    // Source pixels -> display pixels for this zone.
    const kx = canvas.width / (zone.zone_source_width || img.naturalWidth)
    const ky = canvas.height / (zone.zone_source_height || img.naturalHeight)
    const left = zone.x * kx
    const top = zone.y * ky
    const insideX = displayX >= left && displayX <= left + zone.width * kx
    const insideY = displayY >= top && displayY <= top + zone.height * ky
    if (insideX && insideY) {
      return idx
    }
  }
  return null
}
/**
 * Repaints the overlay canvas: every zone on the previewed page (outline,
 * translucent fill, name label, and resize handles for the selected zone)
 * plus the dashed rectangle currently being drawn, if any.
 * Does nothing while the canvas or image reference is not available.
 */
redrawCanvas() {
if (!this.canvasRef || !this.imageRef) return
const canvas = this.canvasRef.nativeElement
const img = this.imageRef.nativeElement
const ctx = canvas.getContext('2d')
// Match the canvas to the image's rendered size, then start from a blank surface.
canvas.width = img.clientWidth
canvas.height = img.clientHeight
ctx.clearRect(0, 0, canvas.width, canvas.height)
// Palette cycled by zone index, so a zone keeps its colour while its index is unchanged.
const colors = [
'#4f8ff7',
'#ff6b6b',
'#51cf66',
'#ffd43b',
'#cc5de8',
'#ff922b',
'#20c997',
'#e599f7',
]
this.template.zones.forEach((zone, idx) => {
// Zones belonging to other pages are not drawn.
if (!this.isOnCurrentPage(zone)) return
const color = colors[idx % colors.length]
// Source pixels -> display pixels, using the size the zone was drawn on when recorded.
const srcW = zone.zone_source_width || img.naturalWidth
const srcH = zone.zone_source_height || img.naturalHeight
const scaleX = canvas.width / srcW
const scaleY = canvas.height / srcH
const x = zone.x * scaleX
const y = zone.y * scaleY
const w = zone.width * scaleX
const h = zone.height * scaleY
// Outline (thicker for the selected zone) and translucent fill.
ctx.strokeStyle = color
ctx.lineWidth = idx === this.selectedZoneIndex ? 3 : 2
ctx.strokeRect(x, y, w, h)
// Appending '20' turns '#rrggbb' into an 8-digit hex colour with a low alpha.
ctx.fillStyle = color + '20'
ctx.fillRect(x, y, w, h)
// Name label: a rounded "pill" above the zone's top-left corner.
const label = zone.name || `Zone ${idx + 1}`
ctx.font = '12px sans-serif'
ctx.textBaseline = 'middle'
const padX = 6
const pillH = 17
const pillW = ctx.measureText(label).width + padX * 2
const pillX = x
// Clamp so the pill stays on the canvas for zones touching the top edge.
const pillY = Math.max(0, y - pillH - 2)
const r = 4
ctx.fillStyle = color
ctx.beginPath()
ctx.moveTo(pillX + r, pillY)
ctx.arcTo(pillX + pillW, pillY, pillX + pillW, pillY + pillH, r)
ctx.arcTo(pillX + pillW, pillY + pillH, pillX, pillY + pillH, r)
ctx.arcTo(pillX, pillY + pillH, pillX, pillY, r)
ctx.arcTo(pillX, pillY, pillX + pillW, pillY, r)
ctx.closePath()
ctx.fill()
ctx.fillStyle = '#ffffff'
ctx.fillText(label, pillX + padX, pillY + pillH / 2 + 0.5)
ctx.textBaseline = 'alphabetic'
// Eight square handles (corners and edge midpoints) on the selected zone only.
if (idx === this.selectedZoneIndex) {
const hs = this.HANDLE_SIZE
ctx.fillStyle = color
const handles = [
[x, y],
[x + w / 2, y],
[x + w, y],
[x, y + h / 2],
[x + w, y + h / 2],
[x, y + h],
[x + w / 2, y + h],
[x + w, y + h],
]
for (const [hx, hy] of handles) {
ctx.fillRect(hx - hs / 2, hy - hs / 2, hs, hs)
}
}
})
// Rubber-band rectangle for the zone currently being drawn (display pixels).
if (this.currentRect) {
const cw = this.currentRect.endX - this.currentRect.startX
const ch = this.currentRect.endY - this.currentRect.startY
ctx.fillStyle = 'rgba(105, 219, 124, 0.25)'
ctx.fillRect(this.currentRect.startX, this.currentRect.startY, cw, ch)
ctx.strokeStyle = '#69db7c'
ctx.lineWidth = 2
ctx.setLineDash([5, 5])
ctx.strokeRect(this.currentRect.startX, this.currentRect.startY, cw, ch)
// Restore solid lines for subsequent drawing.
ctx.setLineDash([])
}
}
/**
 * Deletes the zone at `index` and keeps the selection pointing at the same
 * zone object: cleared if that zone was removed, shifted down by one if it
 * came after the removed zone.
 */
removeZone(index: number) {
  const selected = this.selectedZoneIndex
  this.template.zones.splice(index, 1)
  if (selected === index) {
    this.selectedZoneIndex = null
  } else if (selected > index) {
    this.selectedZoneIndex = selected - 1
  }
  this.redrawCanvas()
}
/**
 * Makes the zone at `index` the active one: opens the zone tab, clears the
 * previous test result, derives whether its date format is a custom one,
 * seeds a combine format if its field is shared, and shows the zone's page.
 */
selectZone(index: number) {
  this.selectedZoneIndex = index
  this.activeTab = 'zone'
  this.zoneTestResult = null

  const zone = this.template.zones[index]
  if (zone) {
    const format = zone.date_format
    const isPreset = this.dateFormatOptions.some((o) => o.id === format)
    // "Custom" means: a format is set and it is not one of the presets.
    this.dateFormatCustom = !!format && !isPreset
    this.seedCombineDefault(zone)
    const zeroBasedPage = this.zonePage(zone) - 1
    this.goToPage(zeroBasedPage)
  }
  this.redrawCanvas()
}
/**
 * Runs the selected zone against the preview document on the server and
 * stores the outcome in `zoneTestResult`. A failed request is stored as an
 * `{ error }` result. No-op without a selected zone or preview document.
 */
testZone() {
  const zone = this.selectedZone
  const docId = this.previewDocId
  if (!zone || !docId) {
    return
  }
  this.zoneTesting = true
  this.zoneTestResult = null

  const {
    name,
    x,
    y,
    width,
    height,
    ocr_language,
    transform,
    date_format,
    validation_regex,
    zone_source_width,
    zone_source_height,
  } = zone
  const payload: ZoneTestRequest = {
    name,
    x,
    y,
    width,
    height,
    page: zone.page ?? 1,
    ocr_language,
    transform,
    date_format,
    validation_regex,
    zone_source_width,
    zone_source_height,
  }

  this.templateService
    .testZone(docId, payload)
    .pipe(takeUntil(this.destroy$))
    .subscribe({
      next: (outcome) => {
        this.zoneTestResult = outcome
        this.zoneTesting = false
      },
      error: (failure) => {
        const message = failure.error?.error || $localize`Test failed`
        this.zoneTestResult = { error: message }
        this.zoneTesting = false
      },
    })
}
/** Removes the selected zone (if any) and switches back to the zone list tab. */
deleteSelectedZone() {
  const selected = this.selectedZoneIndex
  if (selected !== null) {
    this.removeZone(selected)
    this.activeTab = 'zones'
  }
}
/**
 * Persists the template (create when new, update otherwise). Unused combine
 * formats are pruned first and the preview document is stored as the
 * template's sample document. On success the server's copy replaces the
 * local one and the current zone selection is kept.
 */
save() {
  this.saving = true
  this.pruneCombineFormats()
  this.template.sample_document = this.previewDocId
  const obs = this.isNew
    ? this.templateService.create(this.template)
    : this.templateService.update(this.template)
  obs.pipe(takeUntil(this.destroy$)).subscribe({
    next: (saved) => {
      const idx = this.selectedZoneIndex
      this.template = saved
      // Same normalisation ngOnInit applies after loading, so the template
      // always carries a combine_formats object regardless of its origin.
      this.template.combine_formats ??= {}
      this.isNew = false
      this.selectedZoneIndex = idx
      this.saving = false
      this.toastService.showInfo($localize`OCR template saved.`)
      this.redrawCanvas()
    },
    error: (e) => {
      this.saving = false
      this.toastService.showError($localize`Error saving OCR template.`, e)
    },
  })
}
// Per-zone memo of the last split of `ocr_language`, so the same array
// instance is returned while the string is unchanged.
// NOTE(review): presumably this keeps the bound value referentially stable for the template — confirm.
private ocrLangCache = new WeakMap<
  OcrTemplateZone,
  { src: string; arr: string[] }
>()

/** Returns the zone's `+`-separated OCR languages as an array (empty parts dropped). */
ocrLanguageArray(zone: OcrTemplateZone): string[] {
  const source = zone.ocr_language || ''
  const memo = this.ocrLangCache.get(zone)
  const memoIsFresh = !!memo && memo.src === source
  if (memoIsFresh) {
    return memo.arr
  }
  let parts: string[] = []
  if (source) {
    parts = source.split('+').filter(Boolean)
  }
  this.ocrLangCache.set(zone, { src: source, arr: parts })
  return parts
}
/** Stores the chosen languages on the zone as a `+`-joined string and refreshes the memo. */
setOcrLanguages(zone: OcrTemplateZone, langs: string[]) {
  const joined = (langs || []).join('+')
  zone.ocr_language = joined
  // Memoise a copy so later changes to the caller's array do not leak in.
  const snapshot = langs ? [...langs] : []
  this.ocrLangCache.set(zone, { src: joined, arr: snapshot })
}
/** Name of the custom field with the given id, or `Field #<id>` when it is not in the loaded list. */
getCustomFieldName(id: number): string {
  const match = this.customFields.find((field) => field.id === id)
  if (!match) {
    return `Field #${id}`
  }
  return match.name
}
/**
 * Value bound to the field select: the custom-field id when the zone targets
 * a custom field (the default), otherwise the built-in target's id string.
 */
zoneFieldValue(zone: OcrTemplateZone): number | string | null {
  const target = zone.target || 'custom_field'
  if (target === 'custom_field') {
    return zone.custom_field
  }
  return target
}
/**
 * Applies a choice from the field select. Built-in ids become the zone's
 * target (clearing the custom field); anything else selects the
 * custom-field target, keeping the value only when it is a number.
 */
setZoneField(zone: OcrTemplateZone, value: number | string) {
  switch (value) {
    case 'title':
    case 'asn':
    case 'created':
      zone.target = value
      zone.custom_field = null
      break
    default:
      zone.target = 'custom_field'
      zone.custom_field = typeof value === 'number' ? value : null
  }
  this.seedCombineDefault(zone)
}
/** Key of the zone's target inside `combine_formats` (stringified field value), or null when no field is chosen. */
fieldKeyFor(zone: OcrTemplateZone): string | null {
  const value = this.zoneFieldValue(zone)
  if (value == null || value === '') {
    return null
  }
  return String(value)
}
/** All zones of the template (including `zone` itself) that write to the same target as `zone`. */
zonesForField(zone: OcrTemplateZone): OcrTemplateZone[] {
  const wanted = this.fieldKeyFor(zone)
  if (!wanted) {
    return []
  }
  return this.template.zones.filter(
    (candidate) => this.fieldKeyFor(candidate) === wanted
  )
}
/** True when at least one other zone writes to the same target as `zone`. */
isFieldShared(zone: OcrTemplateZone): boolean {
  const sharing = this.zonesForField(zone)
  return sharing.length > 1
}
/** Combine format stored for the zone's target, or '' when there is none. */
getCombineFormat(zone: OcrTemplateZone): string {
  const key = this.fieldKeyFor(zone)
  if (!key) {
    return ''
  }
  return this.template.combine_formats?.[key] || ''
}
/** Stores `value` as the combine format for the zone's target; ignored when the zone has no target key. */
setCombineFormat(zone: OcrTemplateZone, value: string) {
  const key = this.fieldKeyFor(zone)
  if (key) {
    const formats = (this.template.combine_formats ??= {})
    formats[key] = value
  }
}
/**
 * Appends `{<tokenZone name>}` to the combine format of `zone`'s target,
 * inserting one space first unless the format is empty or already ends in a space.
 */
insertCombineToken(zone: OcrTemplateZone, tokenZone: OcrTemplateZone) {
  const existing = this.getCombineFormat(zone)
  const separator = !existing || existing.endsWith(' ') ? '' : ' '
  const token = `{${tokenZone.name}}`
  this.setCombineFormat(zone, `${existing}${separator}${token}`)
}
/**
 * When the zone's target is shared by several zones and has no combine
 * format yet, seeds one listing every sharing zone's token separated by spaces.
 */
private seedCombineDefault(zone: OcrTemplateZone) {
  const key = this.fieldKeyFor(zone)
  if (!key) {
    return
  }
  const siblings = this.zonesForField(zone)
  if (siblings.length <= 1) {
    return
  }
  const formats = (this.template.combine_formats ??= {})
  if (formats[key]) {
    // Never overwrite a format that already exists.
    return
  }
  const tokens = siblings.map((z) => `{${z.name}}`)
  formats[key] = tokens.join(' ')
}
/** Drops combine formats whose target is used by fewer than two zones. */
private pruneCombineFormats() {
  const formats = this.template.combine_formats
  if (!formats) {
    return
  }
  // How many zones write to each target key.
  const usage = new Map<string, number>()
  this.template.zones.forEach((zone) => {
    const key = this.fieldKeyFor(zone)
    if (!key) {
      return
    }
    usage.set(key, (usage.get(key) ?? 0) + 1)
  })
  Object.keys(formats)
    .filter((key) => (usage.get(key) ?? 0) <= 1)
    .forEach((key) => {
      delete formats[key]
    })
}
/** Value bound to the date-format select: 'custom', a preset format, or '' (auto). */
dateFormatChoice(zone: OcrTemplateZone): string {
  return this.dateFormatCustom ? 'custom' : zone.date_format || ''
}
/**
 * Applies a choice from the date-format select. 'custom' only switches to
 * custom mode and leaves the zone's format untouched; any other value leaves
 * custom mode and is stored as the zone's format.
 */
setDateFormatChoice(zone: OcrTemplateZone, value: string) {
  const wantsCustom = value === 'custom'
  this.dateFormatCustom = wantsCustom
  if (!wantsCustom) {
    zone.date_format = value
  }
}
/**
 * Display name of the zone's target: the built-in target's label (or its raw
 * id if unknown), the custom field's name, or a "(no field)" placeholder.
 */
getZoneTargetName(zone: OcrTemplateZone): string {
  const target = zone.target || 'custom_field'
  if (target !== 'custom_field') {
    const builtin = this.builtinTargets.find((t) => t.id === target)
    return builtin?.name ?? target
  }
  if (zone.custom_field) {
    return this.getCustomFieldName(zone.custom_field)
  }
  return $localize`(no field)`
}
/** Name of the document type with the given id, or `Type #<id>` when it is not in the loaded list. */
getDocumentTypeName(id: number): string {
  const match = this.documentTypes.find((type) => type.id === id)
  if (!match) {
    return `Type #${id}`
  }
  return match.name
}
/** Opens the quick-create form for the given zone, pre-filled with the zone's name and the 'string' type. */
openQuickCreate(zoneIndex: number | null) {
  if (zoneIndex === null) {
    return
  }
  const zone = this.template.zones[zoneIndex]
  this.quickCreateForZoneIndex = zoneIndex
  this.quickCreateName = zone?.name || ''
  this.quickCreateType = 'string'
  this.showQuickCreate = true
}
/** Closes the quick-create form and forgets which zone it was opened for. */
cancelQuickCreate() {
  this.quickCreateForZoneIndex = null
  this.showQuickCreate = false
}
/**
 * Creates a custom field from the quick-create form, reloads the custom
 * field list, and points the originating zone at the new field. Failures are
 * reported through the toast service, like the other errors in this component.
 */
submitQuickCreate() {
  const name = this.quickCreateName.trim()
  if (!name) return
  this.templateService
    .quickCreateField(name, this.quickCreateType)
    .pipe(takeUntil(this.destroy$))
    .subscribe({
      next: (result) => {
        // The cached list does not contain the new field yet.
        this.customFieldsService.clearCache()
        this.customFieldsService
          .listAll()
          .pipe(takeUntil(this.destroy$))
          .subscribe((r) => {
            this.customFields = r.results
            // The zone may have been removed while the requests were in
            // flight; only assign when it still exists.
            const zone =
              this.quickCreateForZoneIndex !== null
                ? this.template.zones[this.quickCreateForZoneIndex]
                : undefined
            if (zone) {
              zone.custom_field = result.id
              zone.target = 'custom_field'
            }
            this.showQuickCreate = false
            this.quickCreateForZoneIndex = null
          })
      },
      error: (err) => {
        // Prefer the server's message; fall back to a localized generic one.
        this.toastService.showError(
          err.error?.error || $localize`Failed to create custom field`,
          err
        )
      },
    })
}
/** Signals `destroy$` so every `takeUntil(this.destroy$)` subscription ends, then completes it. */
ngOnDestroy() {
  const notifier = this.destroy$
  notifier.next()
  notifier.complete()
}
}
@@ -0,0 +1,75 @@
<!--
OCR template list: page header with a permission-gated "Create Template"
button, then a list group with a header row, a loading row, one row per
template (name link, document type, zone count, enable switch, actions)
and an empty-state row. Edit/delete/toggle controls are only rendered for
users holding the matching OcrTemplate permission.
-->
<pngx-page-header
title="OCR Templates"
i18n-title
info="Define extraction zones on document types to automatically populate custom fields via OCR."
i18n-info
>
<button type="button" class="btn btn-sm btn-outline-primary" (click)="createTemplate()" *pngxIfPermissions="{ action: PermissionAction.Add, type: PermissionType.OcrTemplate }">
<i-bs name="plus-circle" class="me-1"></i-bs><ng-container i18n>Create Template</ng-container>
</button>
</pngx-page-header>
<ul class="list-group">
<!-- Column headings; document type and zone count are hidden below the sm breakpoint. -->
<li class="list-group-item">
<div class="row">
<div class="col" i18n>Name</div>
<div class="col d-none d-sm-flex" i18n>Document Type</div>
<div class="col d-none d-sm-flex" i18n>Zones</div>
<div class="col" i18n>Status</div>
<div class="col" i18n>Actions</div>
</div>
</li>
<!-- Spinner row: only while loading with nothing to show yet. -->
@if (loading && templates.length === 0) {
<li class="list-group-item">
<div class="spinner-border spinner-border-sm me-2" role="status"></div>
<ng-container i18n>Loading...</ng-container>
</li>
}
@for (t of templates; track t.id) {
<li class="list-group-item">
<div class="row fade" [class.show]="show">
<div class="col d-flex align-items-center"><button class="btn btn-link p-0 text-start" type="button" (click)="editTemplate(t)" [disabled]="!permissionsService.currentUserCan(PermissionAction.Change, PermissionType.OcrTemplate)">{{t.name}}</button></div>
<div class="col d-flex align-items-center d-none d-sm-flex">{{getDocumentTypeName(t)}}</div>
<div class="col d-flex align-items-center d-none d-sm-flex"><code>{{t.zones?.length || 0}}</code></div>
<div class="col d-flex align-items-center">
<!-- Enable switch: ngModel flips t.enabled, (change) persists it via toggleTemplate. -->
<div class="form-check form-switch mb-0">
<input type="checkbox" class="form-check-input cursor-pointer" [id]="t.id+'_enable'" [(ngModel)]="t.enabled" (change)="toggleTemplate(t)" *pngxIfPermissions="{ action: PermissionAction.Change, type: PermissionType.OcrTemplate }">
<label class="form-check-label cursor-pointer" [for]="t.id+'_enable'">
<code> @if(t.enabled) { <ng-container i18n>Enabled</ng-container> } @else { <span i18n class="text-muted">Disabled</span> }</code>
</label>
</div>
</div>
<div class="col">
<!-- Actions below the sm breakpoint: dropdown menu. -->
<div class="btn-group d-block d-sm-none">
<div ngbDropdown container="body" class="d-inline-block">
<button type="button" class="btn btn-link" id="actionsMenuMobile{{t.id}}" (click)="$event.stopPropagation()" ngbDropdownToggle>
<i-bs name="three-dots-vertical"></i-bs>
</button>
<div ngbDropdownMenu aria-labelledby="actionsMenuMobile{{t.id}}">
<button (click)="editTemplate(t)" *pngxIfPermissions="{ action: PermissionAction.Change, type: PermissionType.OcrTemplate }" ngbDropdownItem i18n>Edit</button>
<button (click)="deleteTemplate(t)" *pngxIfPermissions="{ action: PermissionAction.Delete, type: PermissionType.OcrTemplate }" ngbDropdownItem i18n>Delete</button>
</div>
</div>
</div>
<!-- Actions from the sm breakpoint up: button toolbar. -->
<div class="btn-toolbar d-none d-sm-flex gap-2" role="toolbar">
<div class="btn-group">
<button *pngxIfPermissions="{ action: PermissionAction.Change, type: PermissionType.OcrTemplate }" class="btn btn-sm btn-outline-secondary" type="button" (click)="editTemplate(t)">
<i-bs width="1em" height="1em" name="pencil" class="me-1"></i-bs><ng-container i18n>Edit</ng-container>
</button>
<button *pngxIfPermissions="{ action: PermissionAction.Delete, type: PermissionType.OcrTemplate }" class="btn btn-sm btn-outline-danger" type="button" (click)="deleteTemplate(t)">
<i-bs width="1em" height="1em" name="trash" class="me-1"></i-bs><ng-container i18n>Delete</ng-container>
</button>
</div>
</div>
</div>
</div>
</li>
}
<!-- Empty state once loading has finished. -->
@if (!loading && templates.length === 0) {
<li class="list-group-item" [class.show]="show" i18n>No OCR templates defined.</li>
}
</ul>
@@ -0,0 +1,98 @@
import { Component, OnInit, inject } from '@angular/core'
import { FormsModule } from '@angular/forms'
import { Router } from '@angular/router'
import { NgbDropdownModule, NgbModal } from '@ng-bootstrap/ng-bootstrap'
import { NgxBootstrapIconsModule } from 'ngx-bootstrap-icons'
import { delay, takeUntil, tap } from 'rxjs'
import { OcrTemplate } from 'src/app/data/ocr-template'
import { IfPermissionsDirective } from 'src/app/directives/if-permissions.directive'
import { PermissionsService } from 'src/app/services/permissions.service'
import { DocumentTypeService } from 'src/app/services/rest/document-type.service'
import { OcrTemplateService } from 'src/app/services/rest/ocr-template.service'
import { ConfirmDialogComponent } from '../../common/confirm-dialog/confirm-dialog.component'
import { PageHeaderComponent } from '../../common/page-header/page-header.component'
import { LoadingComponentWithPermissions } from '../../loading-component/loading.component'
/**
 * Management list of OCR templates: shows every template with its document
 * type, zone count and enabled state, and lets permitted users create, edit,
 * enable/disable and delete templates.
 */
@Component({
  selector: 'pngx-ocr-templates',
  templateUrl: './ocr-templates.component.html',
  imports: [
    PageHeaderComponent,
    IfPermissionsDirective,
    FormsModule,
    NgbDropdownModule,
    NgxBootstrapIconsModule,
  ],
})
export class OcrTemplatesComponent
  extends LoadingComponentWithPermissions
  implements OnInit
{
  private readonly service = inject(OcrTemplateService)
  private readonly documentTypeService = inject(DocumentTypeService)
  private readonly router = inject(Router)
  private readonly modalService = inject(NgbModal)
  // Public because the template calls currentUserCan() on it.
  permissionsService = inject(PermissionsService)

  // Templates currently listed.
  public templates: OcrTemplate[] = []
  // Document type id -> name, used to label each row.
  private documentTypeNames: Map<number, string> = new Map()

  /** Loads the document type names and the template list. */
  ngOnInit() {
    this.documentTypeService
      .listAll()
      .pipe(takeUntil(this.unsubscribeNotifier))
      .subscribe((r) => {
        this.documentTypeNames = new Map(
          r.results.map((dt) => [dt.id, dt.name])
        )
      })
    this.reload()
  }

  /**
   * (Re)loads the template list. The rows are assigned immediately; `show`
   * and `loading` are updated 100 ms later. A failed request also ends the
   * loading state so the spinner cannot stay on screen forever.
   */
  reload() {
    this.loading = true
    const finished = () => {
      this.show = true
      this.loading = false
    }
    this.service
      .listAll()
      .pipe(
        takeUntil(this.unsubscribeNotifier),
        tap((r) => (this.templates = r.results)),
        delay(100)
      )
      .subscribe({
        next: finished,
        error: finished,
      })
  }

  /** Name of the template's document type, falling back to the raw id (or '' when unset). */
  getDocumentTypeName(t: OcrTemplate): string {
    return (
      this.documentTypeNames.get(t.document_type) ?? `${t.document_type ?? ''}`
    )
  }

  /** Opens the editor for a new template. */
  createTemplate() {
    this.router.navigate(['/ocr-templates', 'new'])
  }

  /** Opens the editor for an existing template. */
  editTemplate(t: OcrTemplate) {
    this.router.navigate(['/ocr-templates', t.id])
  }

  /**
   * Persists the enabled flag after the switch was toggled. If the request
   * fails the flag is flipped back so the switch does not show a state the
   * server never stored.
   */
  toggleTemplate(t: OcrTemplate) {
    // ngModel has already flipped t.enabled — just persist it.
    this.service.patch(t).subscribe({
      error: () => {
        t.enabled = !t.enabled
      },
    })
  }

  /** Asks for confirmation, then deletes the template and reloads the list. */
  deleteTemplate(t: OcrTemplate) {
    const modal = this.modalService.open(ConfirmDialogComponent)
    modal.componentInstance.title = $localize`Delete OCR Template`
    modal.componentInstance.messageBoldPart = t.name
    modal.componentInstance.message = $localize`Do you really want to delete this OCR template?`
    modal.componentInstance.btnClass = 'btn-danger'
    modal.componentInstance.btnCaption = $localize`Delete`
    modal.componentInstance.confirmClicked.subscribe(() => {
      modal.close()
      this.service.delete(t).subscribe(() => this.reload())
    })
  }
}
+102
View File
@@ -0,0 +1,102 @@
import { ObjectWithId } from './object-with-id'
/** Where a zone's value is written: a custom field, or one of the built-in targets listed in OCR_BUILTIN_TARGETS. */
export type OcrZoneTarget = 'custom_field' | 'title' | 'asn' | 'created'
/** Built-in (non custom-field) zone targets with their localized labels. */
export const OCR_BUILTIN_TARGETS = [
{ id: 'title', name: $localize`Title` },
{ id: 'asn', name: $localize`Archive serial number` },
{ id: 'created', name: $localize`Date created` },
]
/** A rectangular region of a page whose OCR result is written to one target field. */
export interface OcrTemplateZone {
id?: number
// Display name; also used as the `{name}` token in the template's combine formats.
name: string
// Target kind; the editor treats a missing value as 'custom_field'.
target?: OcrZoneTarget
// Id of the custom field written to when the target is 'custom_field'; null otherwise.
custom_field: number | null
// One-based page number; the editor treats a missing value as 1 and resolves -1 to the document's page count.
page?: number
// Rectangle in pixels of the image the zone was drawn on (see zone_source_width / zone_source_height).
x: number
y: number
width: number
height: number
// OCR language codes joined with '+', e.g. 'deu+eng'.
ocr_language: string
// Id of one of TRANSFORM_OPTIONS.
transform: string
// Date format string (see DATE_FORMAT_OPTIONS for presets); '' is the "Auto-detect" choice.
date_format?: string
// Regular expression used for validation. NOTE(review): the exact semantics are applied server-side — confirm there.
validation_regex: string
// Position of the zone within the template (the editor assigns the zone count at creation time).
order: number
// Natural size of the image the zone was drawn on; the editor falls back to the current image's size when missing.
zone_source_width?: number
zone_source_height?: number
}
/** Selectable values for OcrTemplateZone.transform with their localized labels. */
export const TRANSFORM_OPTIONS = [
{ id: 'none', name: $localize`None` },
{ id: 'strip', name: $localize`Strip whitespace` },
{ id: 'uppercase', name: $localize`Uppercase` },
{ id: 'lowercase', name: $localize`Lowercase` },
{ id: 'numeric', name: $localize`Numeric only` },
{
id: 'strip_punctuation',
name: $localize`Remove leading/trailing punctuation`,
},
{ id: 'date', name: $localize`Parse date` },
{ id: 'qr_code', name: $localize`Read QR/barcode` },
]
/** OCR languages offered per zone: language code plus localized label. Selected codes are joined with '+' in OcrTemplateZone.ocr_language. */
export const OCR_LANGUAGE_OPTIONS = [
{ id: 'eng', name: $localize`English` },
{ id: 'deu', name: $localize`German` },
{ id: 'fra', name: $localize`French` },
{ id: 'ita', name: $localize`Italian` },
{ id: 'spa', name: $localize`Spanish` },
{ id: 'por', name: $localize`Portuguese` },
{ id: 'nld', name: $localize`Dutch` },
]
/** Preset values for OcrTemplateZone.date_format; the id is the stored format string, '' meaning auto-detect. */
export const DATE_FORMAT_OPTIONS = [
{ id: '', name: $localize`Auto-detect` },
{ id: '%d.%m.%Y', name: 'DD.MM.YYYY' },
{ id: '%Y/%m/%d', name: 'YYYY/MM/DD' },
{ id: '%d/%m/%Y', name: 'DD/MM/YYYY' },
]
/** A named set of extraction zones associated with a document type. */
export interface OcrTemplate extends ObjectWithId {
name: string
// Id of the document type the template belongs to.
document_type: number
// Id of the document used as preview in the editor, or null.
sample_document: number | null
// Natural size (px) of the page image the zones were drawn on; set by the editor when the preview image loads.
source_width: number
source_height: number
enabled: boolean
// Format strings for targets fed by several zones, keyed by target (custom field id as string, or a built-in target id); `{Zone Name}` tokens stand for zone values.
combine_formats?: Record<string, string>
// NOTE(review): presumably server-maintained timestamps — confirm against the API.
created?: string
updated?: string
zones: OcrTemplateZone[]
}
/** Zone description sent by OcrTemplateService.testZone; carries the OcrTemplateZone geometry and processing fields, with `page` always set. */
export interface ZoneTestRequest {
name: string
x: number
y: number
width: number
height: number
// The editor sends the zone's page, or 1 when the zone has none.
page: number
ocr_language: string
transform: string
date_format?: string
validation_regex: string
zone_source_width?: number
zone_source_height?: number
}
/** Result of OcrTemplateService.testZone; the editor also builds an `{ error }` instance itself when the request fails. */
export interface OcrZoneTestResult {
// NOTE(review): presumably the unprocessed OCR text and the value after the transform — confirm against the API.
raw_text?: string | null
value?: string | null
// NOTE(review): presumably the validation regex that was applied and whether the value matched — confirm against the API.
regex?: string
regex_match?: boolean | null
// Error message when the test could not be completed.
error?: string
}
/** One entry of the `results` array returned by DocumentService.runZoneOcr. */
export interface OcrZoneRunResult {
// NOTE(review): template, zone and custom_field are strings — presumably display names rather than ids; confirm against the API.
template: string
zone: string
custom_field: string
value: string | number | null
}
@@ -28,6 +28,7 @@ export enum PermissionType {
ShareLink = '%s_sharelink',
CustomField = '%s_customfield',
Workflow = '%s_workflow',
OcrTemplate = '%s_ocrtemplate',
ProcessedMail = '%s_processedmail',
GlobalStatistics = '%s_global_statistics',
SystemMonitoring = '%s_system_monitoring',
@@ -12,6 +12,7 @@ import {
import { DocumentMetadata } from 'src/app/data/document-metadata'
import { DocumentSuggestions } from 'src/app/data/document-suggestions'
import { FilterRule } from 'src/app/data/filter-rule'
import { OcrZoneRunResult } from 'src/app/data/ocr-template'
import { Results, SelectionData } from 'src/app/data/results'
import { SETTINGS_KEYS } from 'src/app/data/ui-settings'
import { queryParamsFromFilterRules } from '../../utils/query-params'
@@ -355,6 +356,13 @@ export class DocumentService extends AbstractPaperlessService<Document> {
})
}
/** POSTs an empty body to the document's `run-zone-ocr` endpoint and returns the per-zone results. */
runZoneOcr(id: number): Observable<{ results: OcrZoneRunResult[] }> {
  const url = this.getResourceUrl(id, 'run-zone-ocr')
  const emptyBody = {}
  return this.http.post<{ results: OcrZoneRunResult[] }>(url, emptyBody)
}
rotateDocuments(
selection: DocumentSelectionQuery,
degrees: number,
@@ -0,0 +1,47 @@
import { Injectable } from '@angular/core'
import { Observable } from 'rxjs'
import {
OcrTemplate,
OcrZoneTestResult,
ZoneTestRequest,
} from '../../data/ocr-template'
import { AbstractPaperlessService } from './abstract-paperless-service'
/** Response of OcrTemplateService.quickCreateField: the custom field's id, name and data type. */
export interface QuickCreateFieldResult {
id: number
name: string
data_type: string
// NOTE(review): presumably false when an existing field was returned instead of a new one — confirm against the API.
created: boolean
}
/** REST client for OCR templates plus the template-specific helper endpoints. */
@Injectable({ providedIn: 'root' })
export class OcrTemplateService extends AbstractPaperlessService<OcrTemplate> {
  constructor() {
    super()
    this.resourceName = 'ocr_templates'
  }

  /** Builds `<baseUrl><resourceName>/<path>/` for a helper endpoint of this resource. */
  private ocrActionUrl(path: string): string {
    return `${this.baseUrl}${this.resourceName}/${path}/`
  }

  /** URL of the rendered image of one page of a document, as used by the editor preview. */
  getPageImageUrl(docId: number, page: number): string {
    return this.ocrActionUrl(`document-page-image/${docId}/${page}`)
  }

  /** Runs a single zone definition against a document and returns the outcome. */
  testZone(
    docId: number,
    zone: ZoneTestRequest
  ): Observable<OcrZoneTestResult> {
    const body = { document: docId, zone }
    return this.http.post<OcrZoneTestResult>(
      this.ocrActionUrl('test-zone'),
      body
    )
  }

  /** Requests a custom field with the given name and data type via the quick-create endpoint. */
  quickCreateField(
    name: string,
    dataType: string
  ): Observable<QuickCreateFieldResult> {
    const body = { name, data_type: dataType }
    return this.http.post<QuickCreateFieldResult>(
      this.ocrActionUrl('quick-create-field'),
      body
    )
  }
}
+6
View File
@@ -79,13 +79,16 @@ import {
exclamationTriangleFill,
eye,
fileEarmark,
fileEarmarkBreak,
fileEarmarkCheck,
fileEarmarkDiff,
fileEarmarkFill,
fileEarmarkLock,
fileEarmarkMedical,
fileEarmarkMinus,
fileEarmarkPlus,
fileEarmarkRichtext,
fileEarmarkRuled,
fileText,
files,
filter,
@@ -302,13 +305,16 @@ const icons = {
exclamationTriangleFill,
eye,
fileEarmark,
fileEarmarkBreak,
fileEarmarkCheck,
fileEarmarkDiff,
fileEarmarkFill,
fileEarmarkLock,
fileEarmarkMedical,
fileEarmarkMinus,
fileEarmarkPlus,
fileEarmarkRichtext,
fileEarmarkRuled,
files,
fileText,
filter,
+13
View File
@@ -13,8 +13,11 @@ class DocumentsConfig(AppConfig):
from documents.signals.handlers import add_inbox_tags
from documents.signals.handlers import add_or_update_document_in_llm_index
from documents.signals.handlers import add_to_index
from documents.signals.handlers import capture_old_document_type
from documents.signals.handlers import run_workflows_added
from documents.signals.handlers import run_workflows_updated
from documents.signals.handlers import run_zone_ocr_extraction
from documents.signals.handlers import run_zone_ocr_on_type_change
from documents.signals.handlers import send_websocket_document_updated
from documents.signals.handlers import set_correspondent
from documents.signals.handlers import set_document_type
@@ -29,6 +32,16 @@ class DocumentsConfig(AppConfig):
document_consumption_finished.connect(add_to_index)
document_consumption_finished.connect(run_workflows_added)
document_consumption_finished.connect(add_or_update_document_in_llm_index)
document_consumption_finished.connect(run_zone_ocr_extraction)
from django.db.models.signals import post_save
from django.db.models.signals import pre_save
from documents.models import Document
pre_save.connect(capture_old_document_type, sender=Document)
post_save.connect(run_zone_ocr_on_type_change, sender=Document)
document_updated.connect(run_workflows_updated)
document_updated.connect(send_websocket_document_updated)
document_updated.connect(add_or_update_document_in_llm_index)
@@ -0,0 +1,267 @@
# Generated by Django 5.2.14 on 2026-06-16 17:36
import django.core.validators
import django.db.models.deletion
import django.utils.timezone
from django.db import migrations
from django.db import models
class Migration(migrations.Migration):
    """
    Create the ``OcrTemplate`` and ``OcrTemplateZone`` tables.

    This migration was generated by Django; the field definitions below must
    keep matching the model declarations in ``documents.models``.
    """

    # Must run after the previous migration of the documents app.
    dependencies = [
        ("documents", "0021_widen_workflow_integer_fields"),
    ]

    operations = [
        # A template: a named set of zones bound to one document type.
        migrations.CreateModel(
            name="OcrTemplate",
            fields=[
                (
                    "id",
                    models.AutoField(
                        auto_created=True,
                        primary_key=True,
                        serialize=False,
                        verbose_name="ID",
                    ),
                ),
                ("name", models.CharField(max_length=128, verbose_name="name")),
                (
                    "source_width",
                    models.PositiveIntegerField(
                        help_text="Width of the image the zones were drawn on (px)",
                        validators=[django.core.validators.MinValueValidator(1)],
                        verbose_name="source width",
                    ),
                ),
                (
                    "source_height",
                    models.PositiveIntegerField(
                        help_text="Height of the image the zones were drawn on (px)",
                        validators=[django.core.validators.MinValueValidator(1)],
                        verbose_name="source height",
                    ),
                ),
                ("enabled", models.BooleanField(default=True, verbose_name="enabled")),
                (
                    "combine_formats",
                    models.JSONField(
                        blank=True,
                        default=dict,
                        help_text="Per-target format strings for combining several zones into one field, keyed by target (custom field id, or 'title'/'asn'/'created'). Tokens like {Zone Name} are replaced with that zone's value.",
                        verbose_name="combine formats",
                    ),
                ),
                (
                    "created",
                    models.DateTimeField(
                        db_index=True,
                        default=django.utils.timezone.now,
                        editable=False,
                        verbose_name="created",
                    ),
                ),
                (
                    "updated",
                    models.DateTimeField(auto_now=True, verbose_name="updated"),
                ),
                (
                    # Deleting a document type removes its templates as well.
                    "document_type",
                    models.ForeignKey(
                        on_delete=django.db.models.deletion.CASCADE,
                        related_name="ocr_templates",
                        to="documents.documenttype",
                        verbose_name="document type",
                    ),
                ),
                (
                    # Deleting the sample document only clears this reference.
                    "sample_document",
                    models.ForeignKey(
                        blank=True,
                        help_text="Document used for previewing zones in the editor",
                        null=True,
                        on_delete=django.db.models.deletion.SET_NULL,
                        related_name="+",
                        to="documents.document",
                        verbose_name="sample document",
                    ),
                ),
            ],
            options={
                "verbose_name": "OCR template",
                "verbose_name_plural": "OCR templates",
                "ordering": ("name",),
            },
        ),
        # A zone: one rectangle of a template plus its extraction settings.
        migrations.CreateModel(
            name="OcrTemplateZone",
            fields=[
                (
                    "id",
                    models.AutoField(
                        auto_created=True,
                        primary_key=True,
                        serialize=False,
                        verbose_name="ID",
                    ),
                ),
                (
                    "name",
                    models.CharField(
                        help_text="Descriptive name for this zone (e.g. 'Invoice Number')",
                        max_length=128,
                        verbose_name="zone name",
                    ),
                ),
                (
                    "target",
                    models.CharField(
                        choices=[
                            ("custom_field", "Custom field"),
                            ("title", "Title"),
                            ("asn", "Archive serial number"),
                            ("created", "Date created"),
                        ],
                        default="custom_field",
                        help_text="Where the extracted value is written: a custom field, or a built-in document field (title, ASN, created date)",
                        max_length=20,
                        verbose_name="target",
                    ),
                ),
                (
                    "page",
                    models.IntegerField(
                        blank=True,
                        help_text="Page (1 = first, -1 = last; blank uses the template default)",
                        null=True,
                        verbose_name="page",
                    ),
                ),
                (
                    "x",
                    models.PositiveIntegerField(
                        help_text="Left edge (px)",
                        verbose_name="x",
                    ),
                ),
                (
                    "y",
                    models.PositiveIntegerField(
                        help_text="Top edge (px)",
                        verbose_name="y",
                    ),
                ),
                (
                    "width",
                    models.PositiveIntegerField(
                        help_text="Zone width (px)",
                        validators=[django.core.validators.MinValueValidator(1)],
                        verbose_name="width",
                    ),
                ),
                (
                    "height",
                    models.PositiveIntegerField(
                        help_text="Zone height (px)",
                        validators=[django.core.validators.MinValueValidator(1)],
                        verbose_name="height",
                    ),
                ),
                (
                    "zone_source_width",
                    models.PositiveIntegerField(
                        blank=True,
                        help_text="Width of the page image this zone was drawn on (px). Falls back to template source_width if unset.",
                        null=True,
                        verbose_name="zone source width",
                    ),
                ),
                (
                    "zone_source_height",
                    models.PositiveIntegerField(
                        blank=True,
                        help_text="Height of the page image this zone was drawn on (px). Falls back to template source_height if unset.",
                        null=True,
                        verbose_name="zone source height",
                    ),
                ),
                (
                    "ocr_language",
                    models.CharField(
                        default="deu+eng",
                        help_text="Tesseract language code(s), e.g. 'deu+eng'",
                        max_length=20,
                        verbose_name="OCR language",
                    ),
                ),
                (
                    "transform",
                    models.CharField(
                        choices=[
                            ("none", "None"),
                            ("strip", "Strip whitespace"),
                            ("uppercase", "Uppercase"),
                            ("lowercase", "Lowercase"),
                            ("numeric", "Numeric only"),
                            (
                                "strip_punctuation",
                                "Remove leading/trailing punctuation",
                            ),
                            ("date", "Parse date"),
                            ("qr_code", "Read QR/barcode"),
                        ],
                        default="strip",
                        max_length=20,
                        verbose_name="transform",
                    ),
                ),
                (
                    "date_format",
                    models.CharField(
                        blank=True,
                        default="",
                        help_text="Python strptime format for the 'Parse date' transform (e.g. %d.%m.%Y). Blank = auto-detect.",
                        max_length=64,
                        verbose_name="date format",
                    ),
                ),
                (
                    "validation_regex",
                    models.CharField(
                        blank=True,
                        default="",
                        help_text="Optional regex pattern — extracted text is only accepted if it matches",
                        max_length=256,
                        verbose_name="validation regex",
                    ),
                ),
                ("order", models.PositiveIntegerField(default=0, verbose_name="order")),
                (
                    # Deleting a custom field removes the zones that target it.
                    "custom_field",
                    models.ForeignKey(
                        blank=True,
                        help_text="Target custom field (only used when target is 'custom_field')",
                        null=True,
                        on_delete=django.db.models.deletion.CASCADE,
                        related_name="ocr_zones",
                        to="documents.customfield",
                        verbose_name="custom field",
                    ),
                ),
                (
                    # Zones are owned by their template and deleted with it.
                    "template",
                    models.ForeignKey(
                        on_delete=django.db.models.deletion.CASCADE,
                        related_name="zones",
                        to="documents.ocrtemplate",
                        verbose_name="template",
                    ),
                ),
            ],
            options={
                "verbose_name": "OCR template zone",
                "verbose_name_plural": "OCR template zones",
                "ordering": ("template", "order"),
            },
        ),
    ]
+245
View File
@@ -1894,3 +1894,248 @@ class WorkflowRun(SoftDeleteModel):
def __str__(self) -> str:
return f"WorkflowRun of {self.workflow} at {self.run_at} on {self.document}"
class OcrTemplate(models.Model):
    """
    Defines a set of OCR extraction zones for a specific document type.

    When a document of that type is consumed, each zone in the template is
    cropped from the document image and OCR'd separately. The extracted text
    is written to the configured custom field or built-in document field.

    The zones themselves are stored as ``OcrTemplateZone`` rows and are
    reachable through the ``zones`` reverse relation.
    """

    name = models.CharField(
        _("name"),
        max_length=128,
    )

    # Templates belong to exactly one document type and are deleted with it.
    document_type = models.ForeignKey(
        "documents.DocumentType",
        on_delete=models.CASCADE,
        related_name="ocr_templates",
        verbose_name=_("document type"),
        db_index=True,
    )

    # Pixel size of the image the zones were drawn on. Zones without their own
    # zone_source_width/zone_source_height fall back to these values.
    source_width = models.PositiveIntegerField(
        _("source width"),
        validators=[MinValueValidator(1)],
        help_text=_("Width of the image the zones were drawn on (px)"),
    )
    source_height = models.PositiveIntegerField(
        _("source height"),
        validators=[MinValueValidator(1)],
        help_text=_("Height of the image the zones were drawn on (px)"),
    )

    # Optional preview document; deleting it only clears this reference.
    # related_name="+" means Document gets no reverse accessor for it.
    sample_document = models.ForeignKey(
        "documents.Document",
        on_delete=models.SET_NULL,
        null=True,
        blank=True,
        related_name="+",
        verbose_name=_("sample document"),
        help_text=_("Document used for previewing zones in the editor"),
    )

    enabled = models.BooleanField(_("enabled"), default=True)

    # Mapping of target -> format string used to merge several zone values
    # into a single field (see help_text for the key and token conventions).
    combine_formats = models.JSONField(
        _("combine formats"),
        default=dict,
        blank=True,
        help_text=_(
            "Per-target format strings for combining several zones into one "
            "field, keyed by target (custom field id, or 'title'/'asn'/'created'). "
            "Tokens like {Zone Name} are replaced with that zone's value.",
        ),
    )

    created = models.DateTimeField(
        _("created"),
        default=timezone.now,
        db_index=True,
        editable=False,
    )

    # Refreshed automatically on every save (auto_now).
    updated = models.DateTimeField(
        _("updated"),
        auto_now=True,
    )

    class Meta:
        ordering = ("name",)
        verbose_name = _("OCR template")
        verbose_name_plural = _("OCR templates")

    def __str__(self) -> str:
        return f"{self.name} ({self.document_type})"
class OcrTemplateZone(models.Model):
    """
    A rectangular region within a document page to OCR and extract into a custom
    field or built-in document field.

    Coordinates are in pixels of the page image the zone was drawn on: the
    zone's own ``zone_source_width``/``zone_source_height`` when set, otherwise
    the source dimensions stored on the template.
    """

    # Owning template; zones are deleted together with it.
    template = models.ForeignKey(
        OcrTemplate,
        on_delete=models.CASCADE,
        related_name="zones",
        verbose_name=_("template"),
    )

    name = models.CharField(
        _("zone name"),
        max_length=128,
        help_text=_("Descriptive name for this zone (e.g. 'Invoice Number')"),
    )

    class TargetType(models.TextChoices):
        """Destinations an extracted value can be written to."""

        CUSTOM_FIELD = ("custom_field", _("Custom field"))
        TITLE = ("title", _("Title"))
        ASN = ("asn", _("Archive serial number"))
        CREATED = ("created", _("Date created"))

    target = models.CharField(
        _("target"),
        max_length=20,
        choices=TargetType.choices,
        default=TargetType.CUSTOM_FIELD,
        help_text=_(
            "Where the extracted value is written: a custom field, or a "
            "built-in document field (title, ASN, created date)",
        ),
    )

    # Nullable because built-in targets (title/asn/created) need no custom
    # field. Deleting the custom field deletes the zones that point at it.
    custom_field = models.ForeignKey(
        "documents.CustomField",
        on_delete=models.CASCADE,
        related_name="ocr_zones",
        verbose_name=_("custom field"),
        null=True,
        blank=True,
        help_text=_("Target custom field (only used when target is 'custom_field')"),
    )

    # NOTE(review): the help text mentions a "template default" page, but
    # OcrTemplate as declared in this file has no default-page field — confirm
    # what a blank value resolves to in the extraction engine.
    page = models.IntegerField(
        _("page"),
        null=True,
        blank=True,
        help_text=_("Page (1 = first, -1 = last; blank uses the template default)"),
    )

    # Rectangle of the zone, in pixels of the source image.
    x = models.PositiveIntegerField(_("x"), help_text=_("Left edge (px)"))
    y = models.PositiveIntegerField(_("y"), help_text=_("Top edge (px)"))
    width = models.PositiveIntegerField(
        _("width"),
        validators=[MinValueValidator(1)],
        help_text=_("Zone width (px)"),
    )
    height = models.PositiveIntegerField(
        _("height"),
        validators=[MinValueValidator(1)],
        help_text=_("Zone height (px)"),
    )

    # Per-zone source dimensions for coordinate scaling.
    # Stored from the page image the zone was drawn on.
    # If null, falls back to the template's source_width/source_height.
    # This handles PDFs with mixed page sizes (e.g. landscape + portrait,
    # or different paper formats across pages).
    zone_source_width = models.PositiveIntegerField(
        _("zone source width"),
        null=True,
        blank=True,
        help_text=_(
            "Width of the page image this zone was drawn on (px). "
            "Falls back to template source_width if unset.",
        ),
    )
    zone_source_height = models.PositiveIntegerField(
        _("zone source height"),
        null=True,
        blank=True,
        help_text=_(
            "Height of the page image this zone was drawn on (px). "
            "Falls back to template source_height if unset.",
        ),
    )

    # Tesseract language string; several codes are joined with "+".
    ocr_language = models.CharField(
        _("OCR language"),
        max_length=20,
        default="deu+eng",
        help_text=_("Tesseract language code(s), e.g. 'deu+eng'"),
    )

    class TransformType(models.TextChoices):
        """Post-processing options that can be selected for a zone."""

        NONE = ("none", _("None"))
        STRIP = ("strip", _("Strip whitespace"))
        UPPERCASE = ("uppercase", _("Uppercase"))
        LOWERCASE = ("lowercase", _("Lowercase"))
        NUMERIC = ("numeric", _("Numeric only"))
        STRIP_PUNCTUATION = (
            "strip_punctuation",
            _("Remove leading/trailing punctuation"),
        )
        DATE = ("date", _("Parse date"))
        QR_CODE = ("qr_code", _("Read QR/barcode"))

    transform = models.CharField(
        _("transform"),
        max_length=20,
        choices=TransformType.choices,
        default=TransformType.STRIP,
    )

    # Only meaningful for the DATE transform; empty means auto-detect.
    date_format = models.CharField(
        _("date format"),
        max_length=64,
        blank=True,
        default="",
        help_text=_(
            "Python strptime format for the 'Parse date' transform "
            "(e.g. %d.%m.%Y). Blank = auto-detect.",
        ),
    )

    # Empty string means "no validation".
    validation_regex = models.CharField(
        _("validation regex"),
        max_length=256,
        blank=True,
        default="",
        help_text=_(
            "Optional regex pattern — extracted text is only accepted if it matches",
        ),
    )

    # Sort position of the zone within its template (see Meta.ordering).
    order = models.PositiveIntegerField(_("order"), default=0)

    class Meta:
        ordering = ("template", "order")
        verbose_name = _("OCR template zone")
        verbose_name_plural = _("OCR template zones")

    def __str__(self) -> str:
        return f"{self.template.name} -> {self.name}"
# Data types of custom fields that a zone may be extracted into. This is the
# single list shared by the serializer, the quick-create endpoint and the
# engine. DOCUMENTLINK and SELECT are left out on purpose: they point at other
# objects instead of holding free text.
OCR_SUPPORTED_FIELD_TYPES = frozenset(
    (
        CustomField.FieldDataType.STRING,
        CustomField.FieldDataType.URL,
        CustomField.FieldDataType.DATE,
        CustomField.FieldDataType.INT,
        CustomField.FieldDataType.FLOAT,
        CustomField.FieldDataType.MONETARY,
        CustomField.FieldDataType.LONG_TEXT,
        CustomField.FieldDataType.BOOL,
    ),
)
+8 -29
View File
@@ -45,12 +45,6 @@ class SanityCheckMessages:
def __init__(self) -> None:
self._messages: dict[int | None, list[MessageEntry]] = defaultdict(list)
self._document_pks: set[int] = set()
self._document_error_pks: set[int] = set()
self._document_warning_pks: set[int] = set()
self._document_info_pks: set[int] = set()
self._document_error_issue_count: int = 0
self._document_warning_issue_count: int = 0
self.has_error: bool = False
self.has_warning: bool = False
self.has_info: bool = False
@@ -62,33 +56,20 @@ class SanityCheckMessages:
# -- Recording ----------------------------------------------------------
def _add_document_issue(self, doc_pk: int, document_pks: set[int]) -> bool:
if doc_pk not in self._document_pks:
self._document_pks.add(doc_pk)
self.document_count += 1
if doc_pk in document_pks:
return False
document_pks.add(doc_pk)
return True
def error(self, doc_pk: int | None, message: str) -> None:
self._messages[doc_pk].append({"level": logging.ERROR, "message": message})
self.has_error = True
if doc_pk is not None:
self._document_error_issue_count += 1
if self._add_document_issue(doc_pk, self._document_error_pks):
self.document_error_count += 1
self.document_count += 1
self.document_error_count += 1
def warning(self, doc_pk: int | None, message: str) -> None:
self._messages[doc_pk].append({"level": logging.WARNING, "message": message})
self.has_warning = True
if doc_pk is not None:
self._document_warning_issue_count += 1
if self._add_document_issue(doc_pk, self._document_warning_pks):
self.document_warning_count += 1
self.document_count += 1
self.document_warning_count += 1
else:
# This is the only type of global message we do right now
self.global_warning_count += 1
@@ -97,10 +78,8 @@ class SanityCheckMessages:
self._messages[doc_pk].append({"level": logging.INFO, "message": message})
self.has_info = True
if doc_pk is not None and self._add_document_issue(
doc_pk,
self._document_info_pks,
):
if doc_pk is not None:
self.document_count += 1
self.document_info_count += 1
# -- Iteration / query --------------------------------------------------
@@ -126,8 +105,8 @@ class SanityCheckMessages:
def total_issue_count(self) -> int:
"""Total number of error and warning messages across all documents and global."""
return (
self._document_error_issue_count
+ self._document_warning_issue_count
self.document_error_count
+ self.document_warning_count
+ self.global_warning_count
)
+4 -7
View File
@@ -48,9 +48,6 @@ _LANGUAGE_MAP: dict[str, str] = {
}
SUPPORTED_LANGUAGES: frozenset[str] = frozenset(_LANGUAGE_MAP)
# Document.title is max_length=128, so use 129 as the limit for
# Tantivy's remove_long filter
_TOKEN_REMOVE_LONG_LIMIT: Final[int] = 129
def register_tokenizers(index: tantivy.Index, language: str | None) -> None:
@@ -80,10 +77,10 @@ def register_tokenizers(index: tantivy.Index, language: str | None) -> None:
def _paperless_text(language: str | None) -> tantivy.TextAnalyzer:
"""Main full-text tokenizer for content, title, etc: simple -> remove_long(129) -> lowercase -> ascii_fold [-> stemmer]"""
"""Main full-text tokenizer for content, title, etc: simple -> remove_long(65) -> lowercase -> ascii_fold [-> stemmer]"""
builder = (
tantivy.TextAnalyzerBuilder(tantivy.Tokenizer.simple())
.filter(tantivy.Filter.remove_long(_TOKEN_REMOVE_LONG_LIMIT))
.filter(tantivy.Filter.remove_long(65))
.filter(tantivy.Filter.lowercase())
.filter(tantivy.Filter.ascii_fold())
)
@@ -122,12 +119,12 @@ def _bigram_analyzer() -> tantivy.TextAnalyzer:
def _simple_search_analyzer() -> tantivy.TextAnalyzer:
"""Tokenizer for simple substring search fields: non-whitespace chunks -> remove_long(129) -> lowercase -> ascii_fold."""
"""Tokenizer for simple substring search fields: non-whitespace chunks -> remove_long(65) -> lowercase -> ascii_fold."""
return (
tantivy.TextAnalyzerBuilder(
tantivy.Tokenizer.regex(r"\S+"),
)
.filter(tantivy.Filter.remove_long(_TOKEN_REMOVE_LONG_LIMIT))
.filter(tantivy.Filter.remove_long(65))
.filter(tantivy.Filter.lowercase())
.filter(tantivy.Filter.ascii_fold())
.build()
+129
View File
@@ -57,6 +57,7 @@ if settings.AUDIT_LOG_ENABLED:
from documents import bulk_edit
from documents.data_models import DocumentSource
from documents.filters import CustomFieldQueryParser
from documents.models import OCR_SUPPORTED_FIELD_TYPES
from documents.models import Correspondent
from documents.models import CustomField
from documents.models import CustomFieldInstance
@@ -64,6 +65,8 @@ from documents.models import Document
from documents.models import DocumentType
from documents.models import MatchingModel
from documents.models import Note
from documents.models import OcrTemplate
from documents.models import OcrTemplateZone
from documents.models import PaperlessTask
from documents.models import SavedView
from documents.models import SavedViewFilterRule
@@ -3501,3 +3504,129 @@ class StoragePathTestSerializer(SerializerWithPerms):
"documents.view_document",
Document,
)
class OcrTemplateZoneSerializer(serializers.ModelSerializer):
    """
    Serializer for a single OCR zone, used nested inside ``OcrTemplateSerializer``.

    Besides the model-level validation it rejects zero-sized rectangles,
    custom fields whose data type cannot hold OCR output, and validation
    patterns that are not valid regular expressions.
    """

    class Meta:
        model = OcrTemplateZone
        fields = [
            "id",
            "name",
            "target",
            "custom_field",
            "page",
            "x",
            "y",
            "width",
            "height",
            "ocr_language",
            "transform",
            "date_format",
            "order",
            "zone_source_width",
            "zone_source_height",
            "validation_regex",
        ]

    def validate_width(self, value):
        """Reject zones that are less than one pixel wide."""
        if value < 1:
            raise serializers.ValidationError("Width must be at least 1.")
        return value

    def validate_height(self, value):
        """Reject zones that are less than one pixel high."""
        if value < 1:
            raise serializers.ValidationError("Height must be at least 1.")
        return value

    def validate_custom_field(self, value):
        """Only allow custom fields whose data type is in OCR_SUPPORTED_FIELD_TYPES."""
        if value is None:
            # Built-in target (title/asn/created) — no custom field required.
            return value
        if value.data_type not in OCR_SUPPORTED_FIELD_TYPES:
            raise serializers.ValidationError(
                f"Custom field type '{value.data_type}' is not supported for OCR extraction. "
                f"Use string, integer, float, date, monetary, boolean, URL, or long text.",
            )
        return value

    def validate_validation_regex(self, value):
        """
        Reject patterns that do not compile.

        Without this check a broken pattern is stored as-is and only surfaces
        later, when a document is processed. An empty value means "no
        validation" and is accepted unchanged.
        """
        if not value:
            return value

        import re

        try:
            re.compile(value)
        except re.error as exc:
            raise serializers.ValidationError(
                f"Invalid regular expression: {exc}",
            ) from exc
        return value
class OcrTemplateSerializer(serializers.ModelSerializer):
    """
    Serializer for an OCR template including its nested zones.

    Zones are written as a whole: on create they are inserted together with
    the template, on update a supplied ``zones`` list replaces every existing
    zone, while omitting ``zones`` leaves the stored zones untouched.
    """

    zones = OcrTemplateZoneSerializer(many=True, required=False)

    class Meta:
        model = OcrTemplate
        fields = [
            "id",
            "name",
            "document_type",
            "source_width",
            "source_height",
            "sample_document",
            "enabled",
            "combine_formats",
            "created",
            "updated",
            "zones",
        ]
        read_only_fields = ["created", "updated"]

    def validate_source_width(self, value):
        """Reject a source image width below one pixel."""
        if value < 1:
            raise serializers.ValidationError("Source width must be at least 1.")
        return value

    def validate_source_height(self, value):
        """Reject a source image height below one pixel."""
        if value < 1:
            raise serializers.ValidationError("Source height must be at least 1.")
        return value

    def _template_dimension(self, key):
        """
        Return the template-level ``source_width``/``source_height`` to check
        zones against, or ``None`` when no usable value is known.

        The request value wins; during partial updates it may be absent, in
        which case the stored instance value is used. The raw request value
        has not been validated at this point, so anything that is not a
        positive integer yields ``None`` instead of raising — the field's own
        validation reports the actual error.
        """
        raw = self.initial_data.get(key)
        if raw is None or raw == "":
            return getattr(self.instance, key, None) if self.instance else None
        try:
            value = int(raw)
        except (TypeError, ValueError):
            return None
        return value if value > 0 else None

    def validate_zones(self, zones_data):
        """
        Validate that every zone lies within its source image.

        A zone is measured against its own ``zone_source_width`` /
        ``zone_source_height`` when set (pages of a document may differ in
        size) and against the template dimensions otherwise. A dimension that
        is unknown is not checked.
        """
        template_width = self._template_dimension("source_width")
        template_height = self._template_dimension("source_height")

        for zone in zones_data:
            label = zone.get("name", "?")
            right = zone.get("x", 0) + zone.get("width", 0)
            bottom = zone.get("y", 0) + zone.get("height", 0)
            max_width = zone.get("zone_source_width") or template_width
            max_height = zone.get("zone_source_height") or template_height

            if max_width and right > max_width:
                raise serializers.ValidationError(
                    f"Zone '{label}' extends beyond source width "
                    f"({right} > {max_width}).",
                )
            if max_height and bottom > max_height:
                raise serializers.ValidationError(
                    f"Zone '{label}' extends beyond source height "
                    f"({bottom} > {max_height}).",
                )
        return zones_data

    @staticmethod
    def _create_zones(template, zones_data):
        """Insert one OcrTemplateZone row per entry of ``zones_data``."""
        for zone_data in zones_data:
            OcrTemplateZone.objects.create(template=template, **zone_data)

    def create(self, validated_data):
        """Create the template and its zones in a single transaction."""
        from django.db import transaction

        zones_data = validated_data.pop("zones", [])
        # Atomic so a failing zone does not leave a template without zones.
        with transaction.atomic():
            template = OcrTemplate.objects.create(**validated_data)
            self._create_zones(template, zones_data)
        return template

    def update(self, instance, validated_data):
        """
        Update the template; when ``zones`` was supplied, replace all zones.

        The whole operation is atomic: if recreating a zone fails, the
        deletion of the previous zones is rolled back as well.
        """
        from django.db import transaction

        zones_data = validated_data.pop("zones", None)
        with transaction.atomic():
            for attr, value in validated_data.items():
                setattr(instance, attr, value)
            instance.save()

            if zones_data is not None:
                # Replace all zones with the new set
                instance.zones.all().delete()
                self._create_zones(instance, zones_data)
        return instance
+69
View File
@@ -1340,6 +1340,75 @@ def close_connection_pool_on_worker_init(**kwargs) -> None:
conn.close_pool()
def run_zone_ocr_extraction(sender, document, original_file=None, **kwargs):
    """
    Signal handler that hands a freshly consumed document to zone OCR.

    Delegates to ``documents.zone_ocr.run_zone_extraction`` with the document
    and, when given, ``original_file`` converted to a ``Path``. Any exception
    is logged and swallowed so a zone OCR failure never propagates to the
    signal sender.
    """
    try:
        from documents.zone_ocr import run_zone_extraction

        source_path = None
        if original_file:
            source_path = Path(original_file)
        run_zone_extraction(document, source_path)
    except Exception:
        logger.exception("Zone OCR extraction failed for document %s", document.pk)
def capture_old_document_type(sender, instance, **kwargs):
    """
    pre_save handler: stash the document type currently stored in the database.

    The value is kept on the instance as ``_old_document_type_id`` so the
    post_save handler can tell a real type change from any other save.
    Instances without a primary key get ``None``.
    """
    previous_type_id = None
    if instance.pk:
        stored = Document.objects.filter(pk=instance.pk)
        previous_type_id = stored.values_list("document_type_id", flat=True).first()
    instance._old_document_type_id = previous_type_id
def run_zone_ocr_on_type_change(sender, instance, *, created=False, **kwargs):
    """
    post_save handler: run zone OCR only when a document's TYPE actually
    changes (and the new type has an enabled template).

    It must NOT run on every save: zone OCR overwrites fields, so re-running
    it on each edit would clobber the user's changes. Newly created documents
    are handled by the consumption signal, and the user can always trigger
    extraction manually via the run-zone-ocr action.

    The handler does nothing when:
    - the save is a raw (fixture) load,
    - the document was just created, has no primary key or has no type,
    - ``update_fields`` was given and does not include the document type, so
      the type cannot have been written by this save,
    - the type equals the one captured by the pre_save handler,
    - no enabled template exists for the new type,
    - neither the archive file nor the source file exists on disk.

    Extraction errors are logged and swallowed.
    """
    # Fixture loading must not trigger side effects on other records.
    if kwargs.get("raw"):
        return

    if created or not instance.pk or not instance.document_type_id:
        return

    # A save restricted to other fields did not persist a type change, even
    # if the in-memory instance differs from the database row.
    update_fields = kwargs.get("update_fields")
    if update_fields is not None and not (
        {"document_type", "document_type_id"} & set(update_fields)
    ):
        return

    # Only proceed if the type changed compared to what was in the DB before.
    old_type = getattr(instance, "_old_document_type_id", None)
    if old_type == instance.document_type_id:
        return

    from documents.models import OcrTemplate

    if not OcrTemplate.objects.filter(
        document_type_id=instance.document_type_id,
        enabled=True,
    ).exists():
        return

    try:
        from documents.zone_ocr import run_zone_extraction

        doc_path = instance.archive_path or instance.source_path
        if doc_path and Path(doc_path).is_file():
            logger.info(
                "Zone OCR: running extraction for document %d (type %d)",
                instance.pk,
                instance.document_type_id,
            )
            run_zone_extraction(instance, None)
    except Exception:
        logger.exception(
            "Zone OCR extraction failed for document %s",
            instance.pk,
        )
@worker_process_shutdown.connect
def close_connection_pool_on_worker_shutdown(**kwargs) -> None: # pragma: no cover
"""
@@ -261,36 +261,6 @@ class TestSearch:
== 1
)
@pytest.mark.parametrize(
("search_mode", "query"),
[
pytest.param(SearchMode.TITLE, "12345", id="title_search"),
pytest.param(SearchMode.TEXT, "12345", id="text_search"),
pytest.param(SearchMode.QUERY, None, id="query_title_exact"),
],
)
def test_search_modes_match_model_limit_title_tokens(
self,
backend: TantivyBackend,
search_mode: SearchMode,
query: str | None,
) -> None:
"""Search must keep filename-like title tokens up to the model limit."""
long_title = "1234567890" * 12 + "12345678"
doc = Document.objects.create(
title=long_title,
content="ordinary content",
checksum="TXT12",
pk=18,
)
backend.add_or_update(doc)
assert backend.search_ids(
query or f"title:{long_title}",
user=None,
search_mode=search_mode,
) == [doc.pk]
@pytest.mark.parametrize(
("mode", "title", "content", "hits", "misses"),
[
@@ -99,25 +99,6 @@ class TestTokenizers:
)
assert simple_search_index.searcher().search(q, limit=5).count == 1
def test_simple_search_analyzer_supports_model_limit_token_substrings(
self,
simple_search_index: tantivy.Index,
) -> None:
"""Simple substring search keeps tokens up to Document.title's model limit."""
long_token = "abcdefghij" * 12 + "abcdefgh"
writer = simple_search_index.writer()
doc = tantivy.Document()
doc.add_text("simple_content", long_token)
writer.add_document(doc)
writer.commit()
simple_search_index.reload()
q = tantivy.Query.regex_query(
simple_search_index.schema,
"simple_content",
".*cdefg.*",
)
assert simple_search_index.searcher().search(q, limit=5).count == 1
def test_unsupported_language_logs_warning(self, caplog: LogCaptureFixture) -> None:
"""Unsupported language codes should log a warning and disable stemming gracefully."""
sb = tantivy.SchemaBuilder()
@@ -0,0 +1,449 @@
"""Tests for the OCR Template API."""
import json
from django.contrib.auth.models import User
from rest_framework import status
from rest_framework.test import APITestCase
from documents.models import CustomField
from documents.models import DocumentType
from documents.models import OcrTemplate
from documents.models import OcrTemplateZone
from documents.tests.utils import DirectoriesMixin
class TestOcrTemplatesAPI(DirectoriesMixin, APITestCase):
ENDPOINT = "/api/ocr_templates/"
def setUp(self) -> None:
self.user = User.objects.create_superuser(username="temp_admin")
self.client.force_authenticate(user=self.user)
self.doc_type = DocumentType.objects.create(name="Invoice")
self.custom_field_text = CustomField.objects.create(
name="Invoice Number",
data_type=CustomField.FieldDataType.STRING,
)
self.custom_field_date = CustomField.objects.create(
name="Invoice Date",
data_type=CustomField.FieldDataType.DATE,
)
self.custom_field_int = CustomField.objects.create(
name="Amount",
data_type=CustomField.FieldDataType.INT,
)
self.custom_field_doclink = CustomField.objects.create(
name="Related Docs",
data_type=CustomField.FieldDataType.DOCUMENTLINK,
)
return super().setUp()
def _make_template_data(self, **overrides):
data = {
"name": "Invoice Template",
"document_type": self.doc_type.pk,
"default_page": 0,
"source_width": 2480,
"source_height": 3508,
"enabled": True,
"zones": [],
}
data.update(overrides)
return data
def _make_zone_data(self, **overrides):
data = {
"name": "Zone 1",
"custom_field": self.custom_field_text.pk,
"x": 100,
"y": 100,
"width": 200,
"height": 50,
"ocr_language": "deu+eng",
"transform": "strip",
"order": 0,
}
data.update(overrides)
return data
# --- Create ---
def test_create_template(self):
"""
GIVEN:
- A document type and custom fields exist
WHEN:
- API request to create an OCR template with one zone
THEN:
- The template and zone are created
"""
data = self._make_template_data(
zones=[
self._make_zone_data(
name="Invoice Number",
x=1500,
y=200,
width=800,
height=100,
),
],
)
resp = self.client.post(
self.ENDPOINT,
data=json.dumps(data),
content_type="application/json",
)
self.assertEqual(resp.status_code, status.HTTP_201_CREATED)
result = resp.json()
self.assertEqual(result["name"], "Invoice Template")
self.assertEqual(result["document_type"], self.doc_type.pk)
self.assertEqual(len(result["zones"]), 1)
self.assertEqual(result["zones"][0]["name"], "Invoice Number")
self.assertEqual(OcrTemplate.objects.count(), 1)
self.assertEqual(OcrTemplateZone.objects.count(), 1)
def test_create_template_multiple_zones(self):
"""
GIVEN:
- Multiple custom fields exist
WHEN:
- A template with multiple zones is created
THEN:
- All zones are created
"""
data = self._make_template_data(
zones=[
self._make_zone_data(
name="Invoice Number",
custom_field=self.custom_field_text.pk,
),
self._make_zone_data(
name="Invoice Date",
custom_field=self.custom_field_date.pk,
order=1,
),
],
)
resp = self.client.post(
self.ENDPOINT,
data=json.dumps(data),
content_type="application/json",
)
self.assertEqual(resp.status_code, status.HTTP_201_CREATED)
self.assertEqual(len(resp.json()["zones"]), 2)
self.assertEqual(OcrTemplateZone.objects.count(), 2)
def test_create_template_no_zones(self):
"""
GIVEN:
- Valid template data without zones
WHEN:
- Template is created
THEN:
- Template is created with no zones
"""
data = self._make_template_data()
resp = self.client.post(
self.ENDPOINT,
data=json.dumps(data),
content_type="application/json",
)
self.assertEqual(resp.status_code, status.HTTP_201_CREATED)
self.assertEqual(len(resp.json()["zones"]), 0)
# --- Validation ---
def test_create_template_zero_source_width_rejected(self):
"""
GIVEN:
- Template data with source_width=0
WHEN:
- Create is attempted
THEN:
- 400 error is returned
"""
data = self._make_template_data(source_width=0)
resp = self.client.post(
self.ENDPOINT,
data=json.dumps(data),
content_type="application/json",
)
self.assertEqual(resp.status_code, status.HTTP_400_BAD_REQUEST)
def test_create_template_zero_source_height_rejected(self):
data = self._make_template_data(source_height=0)
resp = self.client.post(
self.ENDPOINT,
data=json.dumps(data),
content_type="application/json",
)
self.assertEqual(resp.status_code, status.HTTP_400_BAD_REQUEST)
def test_create_zone_zero_width_rejected(self):
data = self._make_template_data(
zones=[self._make_zone_data(width=0)],
)
resp = self.client.post(
self.ENDPOINT,
data=json.dumps(data),
content_type="application/json",
)
self.assertEqual(resp.status_code, status.HTTP_400_BAD_REQUEST)
def test_create_zone_zero_height_rejected(self):
data = self._make_template_data(
zones=[self._make_zone_data(height=0)],
)
resp = self.client.post(
self.ENDPOINT,
data=json.dumps(data),
content_type="application/json",
)
self.assertEqual(resp.status_code, status.HTTP_400_BAD_REQUEST)
def test_create_zone_exceeds_source_width_rejected(self):
"""Zone that extends beyond the source image width should be rejected."""
data = self._make_template_data(
source_width=1000,
zones=[self._make_zone_data(x=800, width=300)], # 800+300 > 1000
)
resp = self.client.post(
self.ENDPOINT,
data=json.dumps(data),
content_type="application/json",
)
self.assertEqual(resp.status_code, status.HTTP_400_BAD_REQUEST)
def test_create_zone_exceeds_source_height_rejected(self):
data = self._make_template_data(
source_height=1000,
zones=[self._make_zone_data(y=900, height=200)], # 900+200 > 1000
)
resp = self.client.post(
self.ENDPOINT,
data=json.dumps(data),
content_type="application/json",
)
self.assertEqual(resp.status_code, status.HTTP_400_BAD_REQUEST)
def test_create_zone_unsupported_custom_field_type_rejected(self):
"""DOCUMENTLINK and SELECT fields can't be populated via OCR."""
data = self._make_template_data(
zones=[self._make_zone_data(custom_field=self.custom_field_doclink.pk)],
)
resp = self.client.post(
self.ENDPOINT,
data=json.dumps(data),
content_type="application/json",
)
self.assertEqual(resp.status_code, status.HTTP_400_BAD_REQUEST)
# --- List ---
def test_list_templates(self):
template = OcrTemplate.objects.create(
name="Test Template",
document_type=self.doc_type,
source_width=2480,
source_height=3508,
)
OcrTemplateZone.objects.create(
template=template,
name="Zone 1",
custom_field=self.custom_field_text,
x=100,
y=100,
width=200,
height=50,
)
resp = self.client.get(self.ENDPOINT)
self.assertEqual(resp.status_code, status.HTTP_200_OK)
data = resp.json()
self.assertEqual(data["count"], 1)
self.assertEqual(len(data["results"][0]["zones"]), 1)
def test_list_empty(self):
resp = self.client.get(self.ENDPOINT)
self.assertEqual(resp.status_code, status.HTTP_200_OK)
self.assertEqual(resp.json()["count"], 0)
# --- Update ---
def test_update_template_replaces_zones(self):
"""PUT should replace all zones with the new set."""
template = OcrTemplate.objects.create(
name="Old Name",
document_type=self.doc_type,
source_width=2480,
source_height=3508,
)
OcrTemplateZone.objects.create(
template=template,
name="Old Zone",
custom_field=self.custom_field_text,
x=0,
y=0,
width=100,
height=100,
)
data = self._make_template_data(
name="New Name",
zones=[
self._make_zone_data(
name="New Zone",
custom_field=self.custom_field_date.pk,
),
],
)
resp = self.client.put(
f"{self.ENDPOINT}{template.pk}/",
data=json.dumps(data),
content_type="application/json",
)
self.assertEqual(resp.status_code, status.HTTP_200_OK)
template.refresh_from_db()
self.assertEqual(template.name, "New Name")
self.assertEqual(OcrTemplateZone.objects.count(), 1)
self.assertEqual(OcrTemplateZone.objects.first().name, "New Zone")
# --- Delete ---
def test_delete_template_cascades_zones(self):
    """Deleting a template leaves neither the template nor its zone behind."""
    doomed = OcrTemplate.objects.create(
        name="To Delete",
        document_type=self.doc_type,
        source_width=2480,
        source_height=3508,
    )
    OcrTemplateZone.objects.create(
        template=doomed,
        name="Zone",
        custom_field=self.custom_field_text,
        x=0,
        y=0,
        width=100,
        height=100,
    )
    detail_url = f"{self.ENDPOINT}{doomed.pk}/"

    response = self.client.delete(detail_url)

    self.assertEqual(response.status_code, status.HTTP_204_NO_CONTENT)
    for model in (OcrTemplate, OcrTemplateZone):
        self.assertEqual(model.objects.count(), 0)
def test_delete_nonexistent_returns_404(self):
    """A DELETE for an id with no stored template answers HTTP 404."""
    missing_url = f"{self.ENDPOINT}99999/"
    response = self.client.delete(missing_url)
    self.assertEqual(response.status_code, status.HTTP_404_NOT_FOUND)
# --- Patch ---
def test_patch_toggle_enabled(self):
    """A PATCH carrying only ``enabled`` flips the stored flag to False."""
    toggled = OcrTemplate.objects.create(
        name="Toggle Test",
        document_type=self.doc_type,
        source_width=2480,
        source_height=3508,
        enabled=True,
    )
    body = json.dumps({"enabled": False})

    response = self.client.patch(
        f"{self.ENDPOINT}{toggled.pk}/",
        data=body,
        content_type="application/json",
    )

    self.assertEqual(response.status_code, status.HTTP_200_OK)
    toggled.refresh_from_db()
    self.assertFalse(toggled.enabled)
def test_patch_preserves_zones(self):
    """A PATCH that omits ``zones`` keeps the already stored zone."""
    patched = OcrTemplate.objects.create(
        name="Patch Test",
        document_type=self.doc_type,
        source_width=2480,
        source_height=3508,
    )
    OcrTemplateZone.objects.create(
        template=patched,
        name="Existing Zone",
        custom_field=self.custom_field_text,
        x=0,
        y=0,
        width=100,
        height=100,
    )
    body = json.dumps({"name": "Updated Name"})

    response = self.client.patch(
        f"{self.ENDPOINT}{patched.pk}/",
        data=body,
        content_type="application/json",
    )

    self.assertEqual(response.status_code, status.HTTP_200_OK)
    zone_total = OcrTemplateZone.objects.count()
    self.assertEqual(zone_total, 1)
# --- Auth ---
def test_unauthenticated_rejected(self):
    """After logging out, the list endpoint answers with 401 or 403."""
    rejected_codes = (status.HTTP_401_UNAUTHORIZED, status.HTTP_403_FORBIDDEN)

    self.client.logout()
    response = self.client.get(self.ENDPOINT)

    self.assertIn(response.status_code, rejected_codes)
# --- Quick create field ---
def test_quick_create_field(self):
    """The quick-create action stores a new custom field and echoes it back."""
    url = f"{self.ENDPOINT}quick-create-field/"
    request_body = json.dumps({"name": "New Field", "data_type": "string"})

    response = self.client.post(
        url,
        data=request_body,
        content_type="application/json",
    )

    self.assertEqual(response.status_code, status.HTTP_201_CREATED)
    body = response.json()
    self.assertEqual(body["name"], "New Field")
    self.assertEqual(body["data_type"], "string")
    self.assertTrue(body["created"])
    field_stored = CustomField.objects.filter(name="New Field").exists()
    self.assertTrue(field_stored)
def test_quick_create_field_existing(self):
    """A name that is already taken returns the existing field, flagged as not created."""
    url = f"{self.ENDPOINT}quick-create-field/"
    request_body = json.dumps({"name": "Invoice Number", "data_type": "string"})

    response = self.client.post(
        url,
        data=request_body,
        content_type="application/json",
    )

    self.assertEqual(response.status_code, status.HTTP_200_OK)
    body = response.json()
    self.assertEqual(body["id"], self.custom_field_text.pk)
    self.assertFalse(body["created"])
def test_quick_create_field_empty_name_rejected(self):
    """An empty field name is answered with HTTP 400."""
    url = f"{self.ENDPOINT}quick-create-field/"
    request_body = json.dumps({"name": "", "data_type": "string"})

    response = self.client.post(
        url,
        data=request_body,
        content_type="application/json",
    )

    self.assertEqual(response.status_code, status.HTTP_400_BAD_REQUEST)
def test_quick_create_field_unsupported_type_rejected(self):
    """Requesting the ``documentlink`` data type is answered with HTTP 400."""
    url = f"{self.ENDPOINT}quick-create-field/"
    request_body = json.dumps({"name": "Bad Field", "data_type": "documentlink"})

    response = self.client.post(
        url,
        data=request_body,
        content_type="application/json",
    )

    self.assertEqual(response.status_code, status.HTTP_400_BAD_REQUEST)
def test_quick_create_field_select_type_rejected(self):
    """Requesting the ``select`` data type is answered with HTTP 400."""
    url = f"{self.ENDPOINT}quick-create-field/"
    request_body = json.dumps({"name": "Bad Field", "data_type": "select"})

    response = self.client.post(
        url,
        data=request_body,
        content_type="application/json",
    )

    self.assertEqual(response.status_code, status.HTTP_400_BAD_REQUEST)
-21
View File
@@ -12,7 +12,6 @@ from typing import TYPE_CHECKING
import pytest
from documents.sanity_checker import SanityCheckMessages
from documents.sanity_checker import check_sanity
if TYPE_CHECKING:
@@ -22,26 +21,6 @@ if TYPE_CHECKING:
from documents.tests.conftest import PaperlessDirs
class TestSanityCheckMessages:
    """Checks the per-severity counters exposed by SanityCheckMessages."""

    def test_document_counts_are_unique_per_severity(self) -> None:
        """Two messages per severity on one document count that document once each."""
        messages = SanityCheckMessages()

        # Record a "first" and a "second" message against document 1 for
        # every severity, in error / warning / info order.
        recorders = (
            ("error", messages.error),
            ("warning", messages.warning),
            ("info", messages.info),
        )
        for severity, record in recorders:
            for ordinal in ("first", "second"):
                record(1, f"{ordinal} {severity}")
        # One warning that is not attached to any document.
        messages.warning(None, "global warning")

        assert messages.document_count == 1
        assert messages.document_error_count == 1
        assert messages.document_warning_count == 1
        assert messages.document_info_count == 1
        assert messages.global_warning_count == 1
        assert messages.total_issue_count == 5
@pytest.mark.django_db
class TestCheckSanityNoDocuments:
"""Sanity checks against an empty archive."""
+454
View File
@@ -0,0 +1,454 @@
"""Tests for the zone-based OCR extraction engine."""
import tempfile
from pathlib import Path
from unittest.mock import MagicMock
from unittest.mock import patch
from django.test import TestCase
from documents.models import CustomField
from documents.models import CustomFieldInstance
from documents.models import Document
from documents.models import DocumentType
from documents.models import OcrTemplate
from documents.models import OcrTemplateZone
from documents.zone_ocr import _apply_transform
from documents.zone_ocr import _convert_value
from documents.zone_ocr import _detect_mime
from documents.zone_ocr import _resolve_doc_path
from documents.zone_ocr import run_zone_extraction
class TestApplyTransform(TestCase):
    """Exercises _apply_transform for every transform name used by zones."""

    def _check(self, raw, transform, expected):
        """Assert that applying *transform* to *raw* yields *expected*."""
        self.assertEqual(_apply_transform(raw, transform), expected)

    def test_strip(self):
        self._check(" hello ", "strip", "hello")

    def test_none_transform(self):
        self._check(" hello ", "none", "hello")

    def test_uppercase(self):
        self._check("hello world", "uppercase", "HELLO WORLD")

    def test_lowercase(self):
        self._check("HELLO WORLD", "lowercase", "hello world")

    def test_numeric_basic(self):
        self._check("INV-2026-001", "numeric", "2026-001")

    def test_numeric_with_currency(self):
        self._check("€1,234.56", "numeric", "1,234.56")

    def test_numeric_empty_result_falls_back(self):
        self._check("abc", "numeric", "abc")

    def test_date_dmy_dots(self):
        self._check("13.04.2026", "date_dmy", "2026-04-13")

    def test_date_dmy_slashes(self):
        self._check("01/12/2025", "date_dmy", "2025-12-01")

    def test_date_dmy_two_digit_year(self):
        self._check("13.04.26", "date_dmy", "2026-04-13")

    def test_date_dmy_with_prefix(self):
        self._check("Date: 01/12/2025", "date_dmy", "2025-12-01")

    def test_date_dmy_invalid_falls_back(self):
        self._check("32.13.2026", "date_dmy", "32.13.2026")

    def test_date_dmy_no_match_falls_back(self):
        self._check("not a date", "date_dmy", "not a date")

    def test_date_ymd_dashes(self):
        self._check("2026-04-13", "date_ymd", "2026-04-13")

    def test_date_ymd_slashes(self):
        self._check("2026/04/13", "date_ymd", "2026-04-13")

    def test_date_ymd_invalid_falls_back(self):
        self._check("2026-13-32", "date_ymd", "2026-13-32")

    def test_empty_string(self):
        self._check("", "strip", "")

    def test_whitespace_only(self):
        self._check(" ", "strip", "")

    def test_unknown_transform_strips(self):
        self._check(" hello ", "unknown", "hello")
class TestConvertValue(TestCase):
    """Exercises _convert_value for each custom field data type."""

    # Short alias for the data type enum used by every test below.
    _Kind = CustomField.FieldDataType

    def _assert_converts(self, raw, kind, expected):
        """Assert an exact conversion result."""
        self.assertEqual(_convert_value(raw, kind), expected)

    def _assert_converts_approx(self, raw, kind, expected):
        """Assert a float conversion result with assertAlmostEqual."""
        self.assertAlmostEqual(_convert_value(raw, kind), expected)

    def _assert_unconvertible(self, raw, kind):
        """Assert that the conversion gives up and returns None."""
        self.assertIsNone(_convert_value(raw, kind))

    def test_string(self):
        self._assert_converts("Hello", self._Kind.STRING, "Hello")

    def test_string_truncation(self):
        converted = _convert_value("x" * 200, self._Kind.STRING)
        self.assertEqual(len(converted), 128)

    def test_url(self):
        self._assert_converts(
            "https://example.com",
            self._Kind.URL,
            "https://example.com",
        )

    def test_long_text(self):
        text = "x" * 500
        self._assert_converts(text, self._Kind.LONG_TEXT, text)

    def test_int_simple(self):
        self._assert_converts("42", self._Kind.INT, 42)

    def test_int_with_noise(self):
        self._assert_converts("INV-123", self._Kind.INT, 123)

    def test_int_negative(self):
        self._assert_converts("-42", self._Kind.INT, -42)

    def test_int_empty_returns_none(self):
        self._assert_unconvertible("abc", self._Kind.INT)

    def test_int_only_dash_returns_none(self):
        self._assert_unconvertible("-", self._Kind.INT)

    def test_float_simple(self):
        self._assert_converts_approx("1234.56", self._Kind.FLOAT, 1234.56)

    def test_float_european_format(self):
        self._assert_converts_approx("1.234,56", self._Kind.FLOAT, 1234.56)

    def test_float_us_format(self):
        self._assert_converts_approx("1,234.56", self._Kind.FLOAT, 1234.56)

    def test_float_comma_only(self):
        self._assert_converts_approx("1234,56", self._Kind.FLOAT, 1234.56)

    def test_float_empty_returns_none(self):
        self._assert_unconvertible("abc", self._Kind.FLOAT)

    def test_float_only_separator_returns_none(self):
        self._assert_unconvertible(",", self._Kind.FLOAT)

    def test_date_iso(self):
        self._assert_converts("2026-04-13", self._Kind.DATE, "2026-04-13")

    def test_date_invalid_returns_none(self):
        self._assert_unconvertible("not a date", self._Kind.DATE)

    def test_date_invalid_values_returns_none(self):
        self._assert_unconvertible("2026-13-32", self._Kind.DATE)

    def test_monetary_simple(self):
        self._assert_converts("123.45", self._Kind.MONETARY, "123.45")

    def test_monetary_european(self):
        self._assert_converts("1.234,56", self._Kind.MONETARY, "1234.56")

    def test_monetary_with_currency_symbol(self):
        self._assert_converts("€1,234.56", self._Kind.MONETARY, "1234.56")

    def test_monetary_empty_returns_none(self):
        self._assert_unconvertible("CHF", self._Kind.MONETARY)

    def test_bool_true(self):
        truthy_inputs = ("true", "True", "yes", "1", "ja", "x", "X")
        for val in truthy_inputs:
            converted = _convert_value(val, self._Kind.BOOL)
            self.assertTrue(converted, f"Expected True for {val!r}")

    def test_bool_false(self):
        falsy_inputs = ("false", "False", "no", "0", "nein")
        for val in falsy_inputs:
            converted = _convert_value(val, self._Kind.BOOL)
            self.assertFalse(converted, f"Expected False for {val!r}")

    def test_bool_unknown_returns_none(self):
        self._assert_unconvertible("maybe", self._Kind.BOOL)

    def test_unsupported_type_returns_none(self):
        self._assert_unconvertible("test", self._Kind.DOCUMENTLINK)
        self._assert_unconvertible("test", self._Kind.SELECT)

    def test_empty_string_returns_none(self):
        self._assert_unconvertible("", self._Kind.STRING)
class TestDetectMime(TestCase):
    """Exercises _detect_mime against file names with various extensions."""

    def _mime_of(self, filename):
        """Return what _detect_mime reports for *filename*."""
        return _detect_mime(Path(filename))

    def test_pdf_extension(self):
        self.assertEqual(self._mime_of("test.pdf"), "application/pdf")

    def test_png_extension(self):
        self.assertEqual(self._mime_of("test.png"), "image/png")

    def test_jpg_extension(self):
        self.assertEqual(self._mime_of("test.jpg"), "image/jpeg")

    def test_unknown_extension(self):
        self.assertIsNone(self._mime_of("test.xyz"))

    def test_webp_extension(self):
        self.assertEqual(self._mime_of("test.webp"), "image/webp")
class TestResolveDocPath(TestCase):
    """Tests for _resolve_doc_path.

    Documents are stand-in MagicMock objects that only provide the three
    attributes the resolver reads: ``has_archive_version``, ``archive_path``
    and ``source_path``.
    """

    @staticmethod
    def _make_doc(*, archive_path=None, source_path="/nonexistent/source.pdf"):
        """Build a mock document; an archive version exists iff a path is given."""
        doc = MagicMock()
        doc.has_archive_version = archive_path is not None
        doc.archive_path = archive_path
        doc.source_path = Path(source_path)
        return doc

    def test_returns_none_when_no_files_exist(self):
        # Every candidate, including the explicit original file, is missing.
        # (Previously this was an exact duplicate of the None-original test.)
        result = _resolve_doc_path(
            self._make_doc(),
            Path("/nonexistent/original.pdf"),
        )
        self.assertIsNone(result)

    def test_returns_original_file_as_fallback(self):
        doc = self._make_doc()
        with tempfile.NamedTemporaryFile(suffix=".pdf") as f:
            result = _resolve_doc_path(doc, Path(f.name))
            self.assertEqual(result, Path(f.name))

    def test_returns_none_for_none_original_file(self):
        result = _resolve_doc_path(self._make_doc(), None)
        self.assertIsNone(result)

    def test_prefers_existing_archive_version(self):
        # The archive file is checked before the source and the original file.
        with (
            tempfile.NamedTemporaryFile(suffix=".pdf") as archive,
            tempfile.NamedTemporaryFile(suffix=".pdf") as original,
        ):
            doc = self._make_doc(archive_path=Path(archive.name))
            result = _resolve_doc_path(doc, Path(original.name))
            self.assertEqual(result, Path(archive.name))
class TestRunZoneExtraction(TestCase):
    """Tests for the full extraction pipeline.

    The "skip" tests hand run_zone_extraction a real temporary file and patch
    ``_process_template``. A skip is thereby proven by the template processor
    never being invoked, rather than by the run failing for lack of a file.
    """

    def setUp(self):
        self.doc_type = DocumentType.objects.create(name="Invoice")
        self.custom_field = CustomField.objects.create(
            name="Invoice Number",
            data_type=CustomField.FieldDataType.STRING,
        )

    def _make_template(self, name, *, enabled=True, with_zone=True):
        """Create a template for ``self.doc_type``, optionally with one zone."""
        template = OcrTemplate.objects.create(
            name=name,
            document_type=self.doc_type,
            source_width=2480,
            source_height=3508,
            enabled=enabled,
        )
        if with_zone:
            OcrTemplateZone.objects.create(
                template=template,
                name=f"{name} Zone",
                custom_field=self.custom_field,
                x=0,
                y=0,
                width=100,
                height=50,
            )
        return template

    def _make_document(self, title="Test", document_type=None):
        """Create a minimal PDF document row with an optional document type."""
        return Document.objects.create(
            title=title,
            content="test",
            mime_type="application/pdf",
            document_type=document_type,
        )

    @staticmethod
    def _extract_with_real_file(doc):
        """Run the extraction with an existing (fake) PDF as original file."""
        with tempfile.NamedTemporaryFile(suffix=".pdf") as f:
            f.write(b"%PDF-1.4 fake")
            f.flush()
            run_zone_extraction(doc, Path(f.name))

    @patch("documents.zone_ocr._process_template")
    def test_skips_document_without_type(self, mock_process):
        # An enabled template exists, but the document has no type to match it.
        self._make_template("Active")
        doc = self._make_document(title="No Type")
        self._extract_with_real_file(doc)
        mock_process.assert_not_called()
        self.assertEqual(CustomFieldInstance.objects.count(), 0)

    @patch("documents.zone_ocr._process_template")
    def test_skips_document_without_matching_template(self, mock_process):
        # The only template belongs to a different document type.
        self._make_template("Active")
        other_type = DocumentType.objects.create(name="Other")
        doc = self._make_document(title="No Template", document_type=other_type)
        self._extract_with_real_file(doc)
        mock_process.assert_not_called()
        self.assertEqual(CustomFieldInstance.objects.count(), 0)

    @patch("documents.zone_ocr._process_template")
    def test_skips_disabled_template(self, mock_process):
        self._make_template("Disabled", enabled=False)
        doc = self._make_document(document_type=self.doc_type)
        self._extract_with_real_file(doc)
        mock_process.assert_not_called()
        self.assertEqual(CustomFieldInstance.objects.count(), 0)

    @patch("documents.zone_ocr._process_template")
    def test_skips_template_with_no_zones(self, mock_process):
        self._make_template("Empty", with_zone=False)
        doc = self._make_document(document_type=self.doc_type)
        self._extract_with_real_file(doc)
        mock_process.assert_not_called()
        self.assertEqual(CustomFieldInstance.objects.count(), 0)

    @patch("documents.zone_ocr._process_template")
    def test_calls_process_for_enabled_template(self, mock_process):
        self._make_template("Active")
        doc = self._make_document(document_type=self.doc_type)
        self._extract_with_real_file(doc)
        self.assertTrue(mock_process.called)

    @patch("documents.zone_ocr._process_template")
    def test_handles_process_exception_gracefully(self, mock_process):
        """A failing template should not prevent other templates from running."""
        mock_process.side_effect = RuntimeError("test error")
        self._make_template("Failing A")
        self._make_template("Failing B")
        doc = self._make_document(document_type=self.doc_type)
        # Should not raise ...
        self._extract_with_real_file(doc)
        # ... and the second template must still be attempted after the
        # first one blew up.
        self.assertEqual(mock_process.call_count, 2)

    def test_handles_none_original_file(self):
        """Should not crash when original_file is None."""
        doc = self._make_document(document_type=self.doc_type)
        # No template, so it exits early — but shouldn't crash on None
        run_zone_extraction(doc, None)

    @patch("documents.zone_ocr._process_template")
    def test_multiple_templates_all_process(self, mock_process):
        """Multiple enabled templates for the same type should all run."""
        for i in range(3):
            self._make_template(f"Template {i}")
        doc = self._make_document(document_type=self.doc_type)
        self._extract_with_real_file(doc)
        self.assertEqual(mock_process.call_count, 3)
+292
View File
@@ -3,6 +3,7 @@ import logging
import os
import platform
import re
import subprocess
import tempfile
import zipfile
from collections import defaultdict
@@ -148,12 +149,14 @@ from documents.matching import match_correspondents
from documents.matching import match_document_types
from documents.matching import match_storage_paths
from documents.matching import match_tags
from documents.models import OCR_SUPPORTED_FIELD_TYPES
from documents.models import Correspondent
from documents.models import CustomField
from documents.models import CustomFieldInstance
from documents.models import Document
from documents.models import DocumentType
from documents.models import Note
from documents.models import OcrTemplate
from documents.models import PaperlessTask
from documents.models import SavedView
from documents.models import ShareLink
@@ -195,6 +198,7 @@ from documents.serialisers import EditPdfDocumentsSerializer
from documents.serialisers import EmailSerializer
from documents.serialisers import MergeDocumentsSerializer
from documents.serialisers import NotesSerializer
from documents.serialisers import OcrTemplateSerializer
from documents.serialisers import PostDocumentSerializer
from documents.serialisers import RemovePasswordDocumentsSerializer
from documents.serialisers import ReprocessDocumentsSerializer
@@ -2029,6 +2033,73 @@ class DocumentViewSet(
},
),
)
# NOTE(review): in this hunk the action sits directly below the closing
# parentheses of an earlier decorator call — confirm that decorator was not
# meant for the action that follows this one.
@action(methods=["post"], detail=True, url_path="run-zone-ocr")
def run_zone_ocr(self, request, pk=None):
    """Run zone-based OCR extraction on this document and report the outcome.

    Responds with ``{"results": [...]}`` holding one entry per zone of every
    enabled template of the document's type. Each entry names the template,
    the zone, the target field label, and the value now stored on the
    document for that target.

    Error responses: 400 when the document has no type; 404 when the document
    is not found, no enabled template exists, or no document file is
    accessible.
    """
    from documents.zone_ocr import _resolve_doc_path
    from documents.zone_ocr import run_zone_extraction

    # Resolve the document through the viewset instead of a raw
    # Document.objects.get(): get_object() applies the viewset's queryset
    # filtering and object-level permission checks, and raises Http404 for
    # an unknown id.
    document = self.get_object()

    if not document.document_type_id:
        return Response(
            {"error": "Document has no type assigned"},
            status=status.HTTP_400_BAD_REQUEST,
        )

    templates = OcrTemplate.objects.filter(
        document_type_id=document.document_type_id,
        enabled=True,
    )
    if not templates.exists():
        return Response(
            {"error": "No OCR templates found for this document type"},
            status=status.HTTP_404_NOT_FOUND,
        )

    # Use the same resolver the extraction itself uses, so this pre-check
    # and the actual run agree on whether a file is available (e.g. a
    # missing archive file with an existing source file).
    if _resolve_doc_path(document, None) is None:
        return Response(
            {"error": "Document file not found"},
            status=status.HTTP_404_NOT_FOUND,
        )

    run_zone_extraction(document, None)

    # Collect results
    results = []
    builtin_labels = {"title": "Title", "asn": "ASN", "created": "Created"}
    for template in templates.prefetch_related("zones", "zones__custom_field"):
        for zone in template.zones.all():
            target = getattr(zone, "target", None) or "custom_field"
            if target == "custom_field" and zone.custom_field_id:
                cf_instance = document.custom_fields.filter(
                    field=zone.custom_field,
                ).first()
                field_name = zone.custom_field.name
                value = cf_instance.value if cf_instance else None
            else:
                # Built-in targets are read straight from the document.
                field_name = builtin_labels.get(target, target)
                value = {
                    "title": document.title,
                    "asn": document.archive_serial_number,
                    "created": document.created.isoformat()
                    if document.created
                    else None,
                }.get(target)
            results.append(
                {
                    "template": template.name,
                    "zone": zone.name,
                    "custom_field": field_name,
                    "value": value,
                },
            )

    return Response({"results": results})
@action(
methods=["delete"],
detail=True,
@@ -5269,3 +5340,224 @@ def serve_logo(request: HttpRequest, filename: str | None = None) -> FileRespons
filename=app_logo.name,
as_attachment=True,
)
class OcrTemplateViewSet(ModelViewSet):
    """CRUD for OCR templates with zone definitions.

    Besides the standard model endpoints, three helper actions serve the
    template editor: rendering a document page as an image, testing a single
    ad-hoc zone, and creating a custom field inline.
    """

    queryset = (
        OcrTemplate.objects.all()
        .prefetch_related(
            "zones",
            "zones__custom_field",
        )
        .order_by("name")
    )
    serializer_class = OcrTemplateSerializer
    permission_classes = (IsAuthenticated, PaperlessObjectPermissions)
    pagination_class = StandardPagination

    @action(
        detail=False,
        methods=["get"],
        url_path=r"document-page-image/(?P<doc_id>[0-9]+)/(?P<page>[0-9]+)",
    )
    def document_page_image(self, request, doc_id=None, page=None):
        """Render a specific page of a document as a PNG image.

        Used by the frontend template editor to display document pages
        as images that users can draw zones on. ``page`` is 0-indexed.
        Image documents are returned as stored, with their own MIME type.

        Raises Http404 when the document, the page, or the file is missing,
        and when rendering fails or times out.
        """
        # NOTE(review): the document is looked up directly, so only this
        # viewset's template-level permission classes apply here — confirm
        # whether a per-document view permission check is required.
        try:
            document = Document.objects.get(pk=doc_id)
        except Document.DoesNotExist:
            raise Http404("Document not found")

        page_num = int(page)

        # Validate page number
        if document.page_count and page_num >= document.page_count:
            raise Http404(
                f"Page {page_num} out of range (document has {document.page_count} pages)",
            )

        doc_path = document.archive_path or document.source_path
        if not doc_path or not Path(doc_path).is_file():
            raise Http404("Document file not found")

        # Check if document is an image (single page, no PDF rendering needed)
        if document.mime_type and document.mime_type.startswith("image/"):
            content = Path(doc_path).read_bytes()
            return HttpResponse(content, content_type=document.mime_type)

        with tempfile.TemporaryDirectory(dir=settings.SCRATCH_DIR) as tmp_dir:
            output_prefix = Path(tmp_dir) / "page"
            try:
                subprocess.run(
                    [
                        "pdftoppm",
                        "-png",
                        "-r",
                        "150",  # Lower DPI for preview
                        "-f",
                        str(page_num + 1),  # pdftoppm pages are 1-indexed
                        "-l",
                        str(page_num + 1),
                        str(doc_path),
                        str(output_prefix),
                    ],
                    check=True,
                    capture_output=True,
                    timeout=30,
                )
            except subprocess.TimeoutExpired:
                # Without this handler a slow render surfaced as an
                # unhandled exception (HTTP 500).
                raise Http404("Timed out while rendering page")
            except subprocess.CalledProcessError as e:
                raise Http404(
                    f"Failed to render page: {e.stderr.decode(errors='replace')[:200]}",
                )
            except FileNotFoundError:
                raise Http404("pdftoppm not available - is poppler-utils installed?")

            rendered = sorted(Path(tmp_dir).glob("page-*.png"))
            if not rendered:
                raise Http404("No rendered page found")

            # Read before the temporary directory is removed.
            content = rendered[0].read_bytes()

        return HttpResponse(content, content_type="image/png")

    @action(detail=False, methods=["post"], url_path="test-zone")
    def test_zone(self, request):
        """Run OCR on a single ad-hoc zone of a document and return what it
        yields: the raw OCR text, the transformed value, and whether the
        validation regex matches. Non-destructive - writes nothing. Used by the
        editor's per-zone test so a user can tune the zone/regex before saving.

        Accepts: {"document": <id>, "zone": {x, y, width, height, page,
        ocr_language, transform, validation_regex, zone_source_width,
        zone_source_height}}.

        Malformed zone input (non-object zone, non-numeric coordinates, page
        or source dimensions) is answered with HTTP 400.
        """
        from documents.models import OcrTemplateZone
        from documents.zone_ocr import extract_zone_preview

        payload = request.data if isinstance(request.data, dict) else {}
        zone_data = payload.get("zone") or {}
        if not isinstance(zone_data, dict):
            return Response(
                {"error": "Invalid zone definition"},
                status=status.HTTP_400_BAD_REQUEST,
            )

        try:
            document = Document.objects.get(pk=payload.get("document"))
        except (Document.DoesNotExist, ValueError, TypeError):
            return Response(
                {"error": "Document not found"},
                status=status.HTTP_404_NOT_FOUND,
            )

        doc_path = document.archive_path or document.source_path
        if not doc_path or not Path(doc_path).is_file():
            return Response(
                {"error": "Document file not found"},
                status=status.HTTP_404_NOT_FOUND,
            )

        try:
            # The page must be an int (or None) because the page resolver
            # compares it numerically; a JSON string would raise TypeError.
            raw_page = zone_data.get("page")
            page = None if raw_page in (None, "") else int(raw_page)
            # Parsed inside the try so bad values yield 400 instead of 500.
            source_width = int(zone_data.get("zone_source_width") or 0)
            source_height = int(zone_data.get("zone_source_height") or 0)
            zone = OcrTemplateZone(
                name=zone_data.get("name") or "test",
                x=int(zone_data.get("x", 0)),
                y=int(zone_data.get("y", 0)),
                width=int(zone_data.get("width", 0)),
                height=int(zone_data.get("height", 0)),
                page=page,
                ocr_language=zone_data.get("ocr_language") or "eng",
                transform=zone_data.get("transform") or "strip",
                date_format=zone_data.get("date_format") or "",
                validation_regex=zone_data.get("validation_regex") or "",
            )
        except (ValueError, TypeError):
            return Response(
                {"error": "Invalid zone definition"},
                status=status.HTTP_400_BAD_REQUEST,
            )

        if zone.width < 2 or zone.height < 2:
            return Response(
                {"error": "Zone is too small to test"},
                status=status.HTTP_400_BAD_REQUEST,
            )

        result = extract_zone_preview(
            Path(doc_path),
            zone,
            source_width,
            source_height,
            document.page_count,
        )

        # regex_match stays None when there is no regex, no value, or the
        # regex itself does not compile.
        # NOTE(review): the pattern is user-supplied and evaluated with the
        # stdlib `re` engine, which has no timeout — consider whether
        # pathological patterns need guarding.
        regex_match = None
        if zone.validation_regex and result.get("value") is not None:
            try:
                regex_match = (
                    re.fullmatch(zone.validation_regex, result["value"]) is not None
                )
            except re.error:
                regex_match = None

        return Response(
            {
                "raw_text": result.get("raw_text"),
                "value": result.get("value"),
                "regex": zone.validation_regex,
                "regex_match": regex_match,
            },
        )

    @action(detail=False, methods=["post"], url_path="quick-create-field")
    def quick_create_field(self, request):
        """Create a custom field inline from the template editor.

        Accepts: {"name": "Invoice Number", "data_type": "string"}
        Returns the created field so the frontend can immediately use it.
        If a field with that name already exists it is returned with
        ``"created": False`` and HTTP 200; a new field yields HTTP 201.
        """
        payload = request.data if isinstance(request.data, dict) else {}
        raw_name = payload.get("name")
        raw_type = payload.get("data_type")
        # Non-string values (null, numbers, lists) are treated as missing
        # instead of crashing on ``.strip()``.
        name = raw_name.strip() if isinstance(raw_name, str) else ""
        data_type = raw_type.strip() if isinstance(raw_type, str) else ""

        if not name:
            return Response(
                {"error": "Field name is required"},
                status=status.HTTP_400_BAD_REQUEST,
            )

        if data_type not in OCR_SUPPORTED_FIELD_TYPES:
            return Response(
                {
                    "error": f"Unsupported data type '{data_type}'. "
                    f"Supported: {', '.join(sorted(OCR_SUPPORTED_FIELD_TYPES))}",
                },
                status=status.HTTP_400_BAD_REQUEST,
            )

        # Check if field already exists
        existing = CustomField.objects.filter(name=name).first()
        if existing:
            return Response(
                {
                    "id": existing.pk,
                    "name": existing.name,
                    "data_type": existing.data_type,
                    "created": False,
                },
            )

        # Check user has permission to create custom fields
        if not request.user.has_perm("documents.add_customfield"):
            return Response(
                {"error": "You don't have permission to create custom fields"},
                status=status.HTTP_403_FORBIDDEN,
            )

        field = CustomField.objects.create(name=name, data_type=data_type)
        return Response(
            {
                "id": field.pk,
                "name": field.name,
                "data_type": field.data_type,
                "created": True,
            },
            status=status.HTTP_201_CREATED,
        )
+757
View File
@@ -0,0 +1,757 @@
"""
Zone-based OCR extraction engine.
After a document is consumed, this module checks if the document's type has
an active OCR template. If so, it renders the relevant pages as images,
crops each zone, runs Tesseract OCR on the crop, applies transforms,
and writes the results to the mapped custom fields.
"""
from __future__ import annotations
import logging
import re
import string
import subprocess
import tempfile
from datetime import date
from datetime import datetime
from pathlib import Path
from django.conf import settings
from PIL import Image
from documents.models import CustomField
from documents.models import CustomFieldInstance
from documents.models import Document
from documents.models import OcrTemplate
from documents.models import OcrTemplateZone
logger = logging.getLogger("paperless.zone_ocr")
def run_zone_extraction(
    document: Document,
    original_file: Path | None,
) -> None:
    """
    Run zone-based OCR extraction for a document if its type has an active template.

    Every enabled template of the document's type that has at least one zone
    is processed. A failure inside one template is logged and does not stop
    the remaining templates. Nothing is returned; the call is a no-op when
    the document has no type, no enabled template, or no accessible file.

    ``original_file`` is an optional extra candidate path, tried after the
    document's archive and source files; it may be None.
    """
    if not document.document_type_id:
        return

    # Evaluate the queryset exactly once. Checking ``.exists()`` and then
    # iterating would issue a redundant query.
    templates = list(
        OcrTemplate.objects.filter(
            document_type_id=document.document_type_id,
            enabled=True,
        ).prefetch_related("zones", "zones__custom_field"),
    )
    if not templates:
        return

    # Resolve the document file: prefer archive (PDF/A), then source, then signal arg
    doc_path = _resolve_doc_path(document, original_file)
    if doc_path is None:
        logger.warning(
            "Zone OCR: no accessible file for document %d",
            document.pk,
        )
        return

    for template in templates:
        zones = list(template.zones.all())
        if not zones:
            continue

        logger.info(
            "Zone OCR: processing template '%s' for document %d (%d zones)",
            template.name,
            document.pk,
            len(zones),
        )

        try:
            _process_template(document, doc_path, template, zones)
        except Exception:
            # Deliberate best-effort boundary: log with traceback and move
            # on to the next template.
            logger.exception(
                "Zone OCR: error processing template '%s' for document %d",
                template.name,
                document.pk,
            )
def _resolve_doc_path(
    document: Document,
    original_file: Path | None,
) -> Path | None:
    """Return the first candidate path that is an existing file, else None.

    Candidates are tried in this order: the archive path (only when the
    document reports an archive version), the source path, and finally
    ``original_file`` when one was given.
    """
    archive_part = [document.archive_path] if document.has_archive_version else []
    original_part = [] if original_file is None else [original_file]

    for candidate in (*archive_part, document.source_path, *original_part):
        if candidate is None:
            continue
        resolved = Path(candidate)
        if resolved.is_file():
            return resolved
    return None
def _resolve_page_idx(page_value, page_count) -> int:
    """Translate a 1-indexed page setting into a 0-indexed image index.

    ``-1`` selects the last page when ``page_count`` is known (truthy);
    values of 1 or more map to ``page_value - 1``. Everything else — None,
    0, other negatives, or -1 without a page count — resolves to index 0.
    """
    if page_value is not None:
        if page_value == -1 and page_count:
            return page_count - 1
        if page_value >= 1:
            return page_value - 1
    return 0
def _process_template(
    document: Document,
    doc_path: Path,
    template: OcrTemplate,
    zones: list[OcrTemplateZone],
) -> None:
    """Process all zones in a template against a document.

    Each zone is OCR'd independently, then zones are grouped by their target
    field and each field is written exactly once. When several zones share a
    field, their values are combined via the template's per-field format string
    (or joined in order if none is set); this avoids the zones overwriting each
    other's value.

    A zone whose value fails its validation regex contributes no value. The
    same applies when the regex itself does not compile, so one broken
    pattern no longer aborts the whole template.
    """
    page_count = document.page_count
    # Resolve every zone's page once; reused for rendering and for lookup.
    zone_pages = [(zone, _resolve_page_idx(zone.page, page_count)) for zone in zones]
    pages_needed: set[int] = {page_idx for _, page_idx in zone_pages}

    # Pass 1: OCR every zone into a value (or None if it failed/was rejected).
    # Keyed by id(zone); the ``zones`` list keeps those objects alive.
    zone_values: dict[int, str | None] = {}

    with tempfile.TemporaryDirectory(dir=settings.SCRATCH_DIR) as tmp_dir:
        tmp_path = Path(tmp_dir)
        page_images = _render_pages(
            doc_path,
            pages_needed,
            tmp_path,
            page_count,
        )

        for zone, page_idx in zone_pages:
            if page_idx not in page_images:
                logger.warning(
                    "Zone OCR: page %d not available for zone '%s'",
                    page_idx,
                    zone.name,
                )
                continue

            # A zone may carry its own reference size; otherwise the
            # template's source size applies.
            src_w = zone.zone_source_width or template.source_width
            src_h = zone.zone_source_height or template.source_height

            extracted = _extract_zone(
                page_images[page_idx],
                zone,
                src_w,
                src_h,
                tmp_path,
            )

            if extracted is not None and zone.validation_regex:
                try:
                    accepted = (
                        re.fullmatch(zone.validation_regex, extracted) is not None
                    )
                except re.error:
                    # The value cannot be validated, so it is not written.
                    logger.warning(
                        "Zone OCR: '%s' has an invalid validation regex '%s'; "
                        "value skipped",
                        zone.name,
                        zone.validation_regex,
                    )
                    accepted = False
                else:
                    if not accepted:
                        logger.info(
                            "Zone OCR: '%s' value %r rejected by regex '%s'",
                            zone.name,
                            extracted[:100],
                            zone.validation_regex,
                        )
                if not accepted:
                    extracted = None

            zone_values[id(zone)] = extracted

    # Pass 2: group zones by target field and write each field once.
    grouped: dict[str, list[OcrTemplateZone]] = {}
    for zone in zones:
        grouped.setdefault(_field_key(zone), []).append(zone)

    combine_formats = template.combine_formats or {}
    for key, field_zones in grouped.items():
        value = _combine_field_value(
            combine_formats.get(key, ""),
            field_zones,
            zone_values,
        )
        if not value:
            continue
        # The first zone of the group stands in for the shared target.
        target_zone = field_zones[0]
        _write_zone_value(document, target_zone, value)
        logger.info(
            "Zone OCR: %s = %r (from %d zone(s))",
            _zone_target_label(target_zone),
            value[:100],
            len(field_zones),
        )
def _field_key(zone: OcrTemplateZone) -> str:
    """Return the grouping key for the field a zone writes to.

    A zone targeting a custom field with an id set yields that id as a
    string; any other zone yields its target name. A missing or empty
    target counts as "custom_field". The same key indexes
    OcrTemplate.combine_formats.
    """
    target = getattr(zone, "target", None)
    if not target:
        target = "custom_field"

    targets_custom_field = target == "custom_field"
    if targets_custom_field and zone.custom_field_id:
        return str(zone.custom_field_id)
    return target
def _combine_field_value(
    fmt: str,
    field_zones: list[OcrTemplateZone],
    zone_values: dict[int, str | None],
) -> str:
    """Merge the OCR values of every zone that targets the same field.

    Without a format string, the non-empty zone values are joined in zone
    order with single spaces. With one, each `{Zone Name}` token is replaced
    by that zone's value (empty for unknown or missing values) while literal
    text is kept. Runs of whitespace and repeated punctuation are then
    collapsed, and separator characters are stripped from both ends.
    """

    def _value_of(zone: OcrTemplateZone) -> str:
        # Zones without a recorded value (or with None) count as empty.
        return zone_values.get(id(zone)) or ""

    by_name = {zone.name: _value_of(zone) for zone in field_zones}

    if not fmt:
        present = [text for text in map(_value_of, field_zones) if text]
        return " ".join(present).strip()

    def _substitute(token: re.Match) -> str:
        return by_name.get(token.group(1).strip(), "")

    text = re.sub(r"\{([^{}]+)\}", _substitute, fmt)
    # Tidy up what an empty token may have left behind: first whitespace
    # runs, then immediately repeated punctuation.
    cleanup_steps = (
        (r"\s{2,}", " "),
        (r"([^\w\s])\s*\1+", r"\1"),
    )
    for pattern, replacement in cleanup_steps:
        text = re.sub(pattern, replacement, text)
    return text.strip().strip("-/.,;:| \t")
def _render_pages(
    doc_path: Path,
    pages: set[int],
    tmp_dir: Path,
    page_count: int | None,
) -> dict[int, Path]:
    """Render selected PDF pages to PNG files using pdftoppm (poppler-utils).

    Returns a mapping of 0-indexed page number to the rendered image path.
    Pages that fail to render are logged and left out of the mapping. When the
    document itself is detected as an image, it is returned unchanged as page 0
    and nothing is rendered. ``page_count`` is accepted but not used here.
    """
    detected = _detect_mime(doc_path)
    if detected and detected.startswith("image/"):
        # Single-image document — use it directly as page 0.
        return {0: doc_path}

    images: dict[int, Path] = {}
    # Callers pass already-resolved 0-indexed page numbers (see _resolve_page_idx).
    for page_idx in pages:
        if page_idx < 0:
            logger.warning("Zone OCR: invalid page index %d", page_idx)
            continue

        page_no = str(page_idx + 1)  # pdftoppm is 1-indexed
        command = [
            "pdftoppm",
            "-png",
            "-r",
            "300",
            "-f",
            page_no,
            "-l",
            page_no,
            str(doc_path),
            str(tmp_dir / f"page_{page_idx}"),
        ]
        try:
            subprocess.run(command, check=True, capture_output=True, timeout=60)
        except subprocess.TimeoutExpired:
            logger.error("Zone OCR: pdftoppm timed out for page %d", page_idx)
            continue
        except subprocess.CalledProcessError as err:
            detail = err.stderr.decode(errors="replace") if err.stderr else str(err)
            logger.error(
                "Zone OCR: pdftoppm failed for page %d: %s",
                page_idx,
                detail,
            )
            continue
        except FileNotFoundError:
            logger.error("Zone OCR: pdftoppm not found — is poppler-utils installed?")
            return images  # No point trying other pages

        # pdftoppm names output as prefix-NNNN.png
        candidates = sorted(tmp_dir.glob(f"page_{page_idx}-*.png"))
        if candidates:
            images[page_idx] = candidates[0]
    return images
def _crop_zone(
    page_img: Path,
    zone: OcrTemplateZone,
    source_width: int,
    source_height: int,
    tmp_dir: Path,
) -> Image.Image | None:
    """Cut a zone's rectangle out of a page image.

    The zone's x/y/width/height are expressed in the source_width x
    source_height coordinate space and are scaled to the actual image size.
    Returns the cropped PIL image, or None if the crop is smaller than two
    pixels in either direction or anything fails (failures are logged).
    ``tmp_dir`` is not used by this function.
    """
    try:
        with Image.open(page_img) as page:
            page_w, page_h = page.size
            factor_x = page_w / source_width
            factor_y = page_h / source_height

            # Scale each edge, then clamp to the image so an oversized zone
            # can't crop out of bounds.
            left = max(0, min(int(zone.x * factor_x), page_w))
            top = max(0, min(int(zone.y * factor_y), page_h))
            right = max(
                left + 1,
                min(int((zone.x + zone.width) * factor_x), page_w),
            )
            bottom = max(
                top + 1,
                min(int((zone.y + zone.height) * factor_y), page_h),
            )

            if min(right - left, bottom - top) < 2:
                logger.warning("Zone OCR: crop too small for zone '%s'", zone.name)
                return None

            box = (left, top, right, bottom)
            return page.crop(box).copy()
    except Exception:
        logger.exception("Zone OCR: crop failed for zone '%s'", zone.name)
        return None
def _read_barcode(cropped: Image.Image, zone_name: str) -> str | None:
    """Decode a QR code / barcode in a cropped image with zxingcpp.

    Returns the text of the first result reported by zxingcpp, or None when
    nothing is found, zxingcpp cannot be imported, or decoding raises. All
    failures are logged instead of propagated.
    """
    try:
        import zxingcpp

        found = zxingcpp.read_barcodes(cropped)
        if not found:
            logger.debug("Zone OCR: no barcode found in zone '%s'", zone_name)
            return None

        content = found[0].text
        logger.debug(
            "Zone OCR: barcode found in zone '%s': %s",
            zone_name,
            content[:100],
        )
        return content
    except ImportError:
        logger.error("Zone OCR: zxingcpp not available — install zxing-cpp")
        return None
    except Exception:
        logger.exception("Zone OCR: barcode read failed for zone '%s'", zone_name)
        return None
def _ocr_text(cropped: Image.Image, zone: OcrTemplateZone, tmp_dir: Path) -> str | None:
    """OCR a cropped zone image with the Tesseract command-line tool.

    The crop is written to ``tmp_dir`` as ``zone_<pk>.png`` and passed to
    ``tesseract`` using the zone's OCR language and page segmentation mode 6.

    Returns the stripped recognized text, or None when Tesseract produces no
    text or any step fails. Failures are logged, never raised, so one bad zone
    does not abort the extraction of the others.
    """
    crop_path = tmp_dir / f"zone_{zone.pk}.png"
    try:
        # PNG cannot store every Pillow mode (for example CMYK, which a crop
        # can have when the source page is a JPEG used as-is). Convert those
        # to RGB so the save does not raise.
        if cropped.mode not in ("1", "L", "LA", "I", "I;16", "P", "RGB", "RGBA"):
            cropped = cropped.convert("RGB")
        cropped.save(crop_path)
    except (OSError, ValueError):
        logger.exception(
            "Zone OCR: could not write crop image for zone '%s'",
            zone.name,
        )
        return None

    try:
        proc = subprocess.run(
            [
                "tesseract",
                str(crop_path),
                "stdout",
                "-l",
                zone.ocr_language,
                "--psm",
                "6",  # Assume uniform block of text
            ],
            capture_output=True,
            text=True,
            timeout=30,
            check=True,
        )
        return proc.stdout.strip() or None
    except subprocess.TimeoutExpired:
        logger.error("Zone OCR: Tesseract timed out for zone '%s'", zone.name)
        return None
    except subprocess.CalledProcessError as e:
        logger.error(
            "Zone OCR: Tesseract failed for zone '%s': %s",
            zone.name,
            # stderr is already text (text=True); cap it to keep the log short.
            e.stderr[:200] if e.stderr else str(e),
        )
        return None
    except FileNotFoundError:
        logger.error("Zone OCR: Tesseract not found — is tesseract-ocr installed?")
        return None
def _extract_zone(
    page_img: Path,
    zone: OcrTemplateZone,
    source_width: int,
    source_height: int,
    tmp_dir: Path,
) -> str | None:
    """Crop one zone out of a page image and return its transformed text.

    Zones whose transform is "qr_code" are read with the barcode reader; all
    others go through Tesseract. Returns None when the crop fails or the reader
    yields nothing; otherwise the text after the zone's transform is applied.
    """
    crop = _crop_zone(page_img, zone, source_width, source_height, tmp_dir)
    if crop is None:
        return None

    # QR/barcode zones skip Tesseract entirely
    if zone.transform == "qr_code":
        raw = _read_barcode(crop, zone.name)
    else:
        raw = _ocr_text(crop, zone, tmp_dir)
    if not raw:
        return None

    date_format = getattr(zone, "date_format", "") or ""
    return _apply_transform(raw, zone.transform, date_format)
def extract_zone_preview(
    doc_path: Path,
    zone: OcrTemplateZone,
    source_width: int,
    source_height: int,
    page_count: int | None,
) -> dict:
    """Run a single zone against a document without writing anything.

    Used by the editor's per-zone test: the zone's page is rendered into a
    scratch directory, the zone is cropped, read via OCR (or the barcode reader
    for "qr_code" zones) and the zone's transform is applied. No custom field
    or document attribute is written.

    Returns a dict with "raw_text" (what the reader produced) and "value" (the
    transformed text); both are None when the page cannot be rendered or the
    crop fails, and "value" is None when nothing was read.
    """
    # zone.page is 1-indexed (1 = first, -1 = last); resolve to a 0-indexed
    # image index exactly like the production extraction path does.
    page_idx = _resolve_page_idx(zone.page, page_count)

    with tempfile.TemporaryDirectory(dir=settings.SCRATCH_DIR) as scratch:
        work_dir = Path(scratch)
        rendered = _render_pages(doc_path, {page_idx}, work_dir, page_count)
        if page_idx not in rendered:
            return {"raw_text": None, "value": None}
        page_image = rendered[page_idx]

        if not (source_width and source_height):
            # No reference size supplied: use the rendered image's own size,
            # i.e. zone coordinates are taken to be in image pixels.
            with Image.open(page_image) as im:
                source_width, source_height = im.size

        crop = _crop_zone(page_image, zone, source_width, source_height, work_dir)
        if crop is None:
            return {"raw_text": None, "value": None}

        if zone.transform == "qr_code":
            raw_text = _read_barcode(crop, zone.name)
        else:
            raw_text = _ocr_text(crop, zone, work_dir)

        value = None
        if raw_text:
            value = _apply_transform(
                raw_text,
                zone.transform,
                getattr(zone, "date_format", "") or "",
            )
        return {"raw_text": raw_text, "value": value}
def _parse_date(text: str, fmt: str) -> str:
    """Turn OCR text into an ISO date string when possible.

    If ``fmt`` (a Python strptime format) is given it is tried first. When it
    is empty or does not match, dateparser auto-detection is attempted. If
    neither yields a date, the stripped input text is returned unchanged.
    """
    candidate = text.strip()
    if not candidate:
        return candidate

    if fmt:
        try:
            explicit = datetime.strptime(candidate, fmt)
        except ValueError:
            explicit = None
        if explicit is not None:
            return explicit.date().isoformat()

    try:
        import dateparser

        guess = dateparser.parse(
            candidate,
            settings={
                "PREFER_DAY_OF_MONTH": "first",
                "RETURN_AS_TIMEZONE_AWARE": False,
            },
        )
        if guess:
            return guess.date().isoformat()
    except Exception:
        # Best effort: any dateparser problem falls through to the raw text.
        logger.debug("Zone OCR: dateparser failed for %r", candidate[:50])
    return candidate
def _apply_transform(text: str, transform: str, date_format: str = "") -> str:
    """Post-process extracted text according to a zone's transform.

    The text is always stripped first; empty text is returned as-is.
    "date" parses via _parse_date using ``date_format``, "uppercase" and
    "lowercase" change case, "numeric" keeps only digits and ``.,-`` (falling
    back to the stripped text if nothing remains) and "strip_punctuation"
    trims punctuation and whitespace from both ends. "strip", "none",
    "qr_code" and any unrecognized transform return the stripped text.
    """
    cleaned = text.strip()
    if not cleaned:
        return cleaned

    if transform == "date":
        return _parse_date(cleaned, date_format)
    if transform == "uppercase":
        return cleaned.upper()
    if transform == "lowercase":
        return cleaned.lower()
    if transform == "numeric":
        return re.sub(r"[^\d.,\-]", "", cleaned) or cleaned
    if transform == "strip_punctuation":
        return cleaned.strip(string.punctuation + " \t\r\n")

    # "strip" / "none" need nothing beyond the strip above, "qr_code" content
    # is used as read by _read_barcode, and unknown transforms are a no-op.
    return cleaned
def _zone_target_label(zone: OcrTemplateZone) -> str:
    """Return a human-readable name of a zone's write target, for log output.

    Custom-field zones yield the field's name (or "(no field)" when none is
    set); built-in targets yield a fixed label, and any other target name is
    returned unchanged.
    """
    kind = getattr(zone, "target", None) or "custom_field"
    if kind != "custom_field":
        builtin_labels = {"title": "Title", "asn": "ASN", "created": "Created"}
        return builtin_labels.get(kind, kind)
    if zone.custom_field_id:
        return zone.custom_field.name
    return "(no field)"
def _parse_created_datetime(value: str):
    """Convert an extracted value into a tz-aware datetime for document.created.

    A ``YYYY-MM-DD`` date found anywhere in the value is preferred (the zone
    should use a date transform); if none is found, or it is not a valid
    calendar date, dateparser is tried. Returns None when no date can be
    parsed.
    """
    from django.utils import timezone as djtz

    def _aware(moment):
        # Attach the timezone Django's make_aware applies, unless already aware.
        return djtz.make_aware(moment) if djtz.is_naive(moment) else moment

    iso = re.search(r"(\d{4})-(\d{2})-(\d{2})", value)
    if iso:
        try:
            year, month, day = (int(part) for part in iso.groups())
            return _aware(datetime(year, month, day))
        except ValueError:
            # e.g. month 13 — fall through to dateparser.
            pass

    try:
        import dateparser

        guess = dateparser.parse(
            value,
            settings={"RETURN_AS_TIMEZONE_AWARE": False},
        )
        if guess:
            return _aware(guess)
    except Exception:
        logger.debug("Zone OCR: dateparser failed for created value %r", value[:50])
    return None
def _write_zone_value(
    document: Document,
    zone: OcrTemplateZone,
    value: str,
) -> None:
    """Store an extracted value on the zone's target.

    The target is either a custom field (delegated to _write_custom_field) or
    one of the built-in document fields: title, archive_serial_number ("asn")
    or created. Built-in fields are saved immediately with ``update_fields``
    limited to the field that changed. Values that cannot be used for the
    target are logged at debug level and skipped.
    """
    target = getattr(zone, "target", None) or "custom_field"

    if target == "custom_field":
        if not zone.custom_field_id:
            logger.debug("Zone OCR: zone '%s' has no custom field set", zone.name)
            return
        _write_custom_field(document, zone.custom_field, value)
        return

    if target == "title":
        # The title is cut to 128 characters before saving.
        document.title = value[:128]
        document.save(update_fields=["title"])
        return

    if target == "asn":
        # Keep only the digits of the value.
        digits = re.sub(r"[^\d]", "", value)
        if not digits:
            logger.debug(
                "Zone OCR: ASN zone '%s' produced no digits (%r)",
                zone.name,
                value[:50],
            )
            return
        # NOTE(review): no range or uniqueness check happens before this save —
        # confirm the model's constraints cannot make it raise.
        document.archive_serial_number = int(digits)
        document.save(update_fields=["archive_serial_number"])
        return

    if target == "created":
        moment = _parse_created_datetime(value)
        if moment is None:
            logger.debug(
                "Zone OCR: created zone '%s' could not parse a date (%r)",
                zone.name,
                value[:50],
            )
            return
        document.created = moment
        document.save(update_fields=["created"])
def _write_custom_field(
    document: Document,
    custom_field: CustomField,
    value: str,
) -> None:
    """Store an extracted value in one of the document's custom fields.

    The string is first converted to the field's data type; if the conversion
    yields None nothing is written. Otherwise the document's instance of the
    field is created or updated with the converted value.
    """
    converted = _convert_value(value, custom_field.data_type)
    # Only None means "no value" — a converted False is still written.
    if converted is None:
        logger.debug(
            "Zone OCR: skipping custom field '%s' — value conversion returned None",
            custom_field.name,
        )
        return

    # The instance column that holds the value depends on the data type.
    column = CustomFieldInstance.get_value_field_name(custom_field.data_type)
    CustomFieldInstance.objects.update_or_create(
        document=document,
        field=custom_field,
        defaults={column: converted},
    )
def _normalize_decimal_text(value: str) -> str | None:
    """Reduce text to a number string that uses '.' as the decimal separator.

    Everything except digits, '.', ',' and '-' is removed. Returns None when
    nothing, or only a lone separator/minus, remains.
    """
    cleaned = re.sub(r"[^\d.,\-]", "", value)
    if not cleaned or cleaned in (".", ",", "-"):
        return None

    has_comma = "," in cleaned
    has_dot = "." in cleaned
    if has_comma and has_dot:
        # If both . and , are present, the last one is the decimal separator.
        if cleaned.rindex(",") > cleaned.rindex("."):
            # European: 1.234,56
            return cleaned.replace(".", "").replace(",", ".")
        # US: 1,234.56
        return cleaned.replace(",", "")
    if has_comma:
        # Only comma — treat as decimal separator
        return cleaned.replace(",", ".")
    return cleaned


def _convert_value(value: str, data_type: str) -> object | None:
    """Convert an extracted OCR string to the value type of a custom field.

    Returns the converted value, or None when the input is empty, cannot be
    converted, or the data type is not supported for OCR extraction.
    Conversion errors (ValueError/TypeError) are logged as warnings.
    """
    if not value:
        return None

    kinds = CustomField.FieldDataType
    try:
        if data_type in (kinds.STRING, kinds.URL):
            return value[:128]

        if data_type == kinds.LONG_TEXT:
            return value

        if data_type == kinds.INT:
            # Drop everything but digits and dashes, then any leading dashes.
            digits = re.sub(r"[^\d\-]", "", value).lstrip("-")
            if not digits:
                return None
            # Restore leading minus if original had one
            if value.strip().startswith("-"):
                digits = "-" + digits
            return int(digits)

        if data_type == kinds.FLOAT:
            # Handle European format: 1.234,56 → 1234.56
            number_text = _normalize_decimal_text(value)
            if number_text is None:
                return None
            return float(number_text)

        if data_type == kinds.DATE:
            iso = re.search(r"(\d{4})-(\d{2})-(\d{2})", value)
            if not iso:
                return None
            y, m, d = iso.groups()
            # Validate the date (raises ValueError for an impossible one).
            date(int(y), int(m), int(d))
            return f"{y}-{m}-{d}"

        if data_type == kinds.MONETARY:
            number_text = _normalize_decimal_text(value)
            if number_text is None:
                return None
            # Validate it parses as a number; the string form is what's stored.
            float(number_text)
            return number_text

        if data_type == kinds.BOOL:
            token = value.lower().strip()
            if token in ("true", "yes", "1", "ja", "oui", "si", "x"):
                return True
            if token in ("false", "no", "0", "nein", "non"):
                return False
            return None

        # Unsupported types (DOCUMENTLINK, SELECT) — can't OCR into these
        logger.debug(
            "Zone OCR: unsupported custom field type %s for OCR extraction",
            data_type,
        )
        return None
    except (ValueError, TypeError) as e:
        logger.warning("Zone OCR: could not convert %r to %s: %s", value, data_type, e)
        return None
def _detect_mime(path: Path) -> str | None:
    """Return the MIME type of a file, or None if it cannot be determined.

    Content sniffing via the ``magic`` module is tried first. If that module
    is unavailable or raises, the type is guessed from the file extension
    (case-insensitively) using a small table of PDF and image types.
    """
    try:
        import magic

        return magic.from_file(str(path), mime=True)
    except ImportError:
        # No magic module — go straight to the extension table.
        pass
    except Exception:
        logger.debug("Zone OCR: magic failed for %s, falling back to extension", path)

    by_extension = {
        ".pdf": "application/pdf",
        ".png": "image/png",
        ".jpg": "image/jpeg",
        ".jpeg": "image/jpeg",
        ".tiff": "image/tiff",
        ".tif": "image/tiff",
        ".webp": "image/webp",
        ".bmp": "image/bmp",
        ".gif": "image/gif",
    }
    extension = path.suffix.lower()
    return by_extension.get(extension)
+65 -73
View File
@@ -2,7 +2,7 @@ msgid ""
msgstr ""
"Project-Id-Version: paperless-ngx\n"
"Report-Msgid-Bugs-To: \n"
"POT-Creation-Date: 2026-06-23 14:33+0000\n"
"POT-Creation-Date: 2026-06-03 22:14+0000\n"
"PO-Revision-Date: 2022-02-17 04:17\n"
"Last-Translator: \n"
"Language-Team: English\n"
@@ -21,39 +21,39 @@ msgstr ""
msgid "Documents"
msgstr ""
#: documents/filters.py:464
#: documents/filters.py:463
msgid "Value must be valid JSON."
msgstr ""
#: documents/filters.py:483
#: documents/filters.py:482
msgid "Invalid custom field query expression"
msgstr ""
#: documents/filters.py:493
#: documents/filters.py:492
msgid "Invalid expression list. Must be nonempty."
msgstr ""
#: documents/filters.py:514
#: documents/filters.py:513
msgid "Invalid logical operator {op!r}"
msgstr ""
#: documents/filters.py:528
#: documents/filters.py:527
msgid "Maximum number of query conditions exceeded."
msgstr ""
#: documents/filters.py:592
#: documents/filters.py:591
msgid "{name!r} is not a valid custom field."
msgstr ""
#: documents/filters.py:629
#: documents/filters.py:628
msgid "{data_type} does not support query expr {expr!r}."
msgstr ""
#: documents/filters.py:744 documents/models.py:136
#: documents/filters.py:743 documents/models.py:136
msgid "Maximum nesting depth exceeded."
msgstr ""
#: documents/filters.py:1052
#: documents/filters.py:990
msgid "Custom field not found"
msgstr ""
@@ -1351,49 +1351,49 @@ msgstr ""
msgid "workflow runs"
msgstr ""
#: documents/serialisers.py:503 documents/serialisers.py:855
#: documents/serialisers.py:2744 documents/views.py:297 documents/views.py:2482
#: documents/serialisers.py:463 documents/serialisers.py:815
#: documents/serialisers.py:2681 documents/views.py:295 documents/views.py:2468
#: paperless_mail/serialisers.py:155
msgid "Insufficient permissions."
msgstr ""
#: documents/serialisers.py:691
#: documents/serialisers.py:651
msgid "Invalid color."
msgstr ""
#: documents/serialisers.py:2216
#: documents/serialisers.py:2175
#, python-format
msgid "File type %(type)s not supported"
msgstr ""
#: documents/serialisers.py:2260
#: documents/serialisers.py:2219
#, python-format
msgid "Custom field id must be an integer: %(id)s"
msgstr ""
#: documents/serialisers.py:2267
#: documents/serialisers.py:2226
#, python-format
msgid "Custom field with id %(id)s does not exist"
msgstr ""
#: documents/serialisers.py:2284 documents/serialisers.py:2294
#: documents/serialisers.py:2243 documents/serialisers.py:2253
msgid ""
"Custom fields must be a list of integers or an object mapping ids to values."
msgstr ""
#: documents/serialisers.py:2289
#: documents/serialisers.py:2248
msgid "Some custom fields don't exist or were specified twice."
msgstr ""
#: documents/serialisers.py:2436
#: documents/serialisers.py:2395
msgid "Invalid variable detected."
msgstr ""
#: documents/serialisers.py:2800
#: documents/serialisers.py:2737
msgid "Duplicate document identifiers are not allowed."
msgstr ""
#: documents/serialisers.py:2830 documents/views.py:4429
#: documents/serialisers.py:2767 documents/views.py:4345
#, python-format
msgid "Documents not found: %(ids)s"
msgstr ""
@@ -1661,36 +1661,32 @@ msgstr ""
msgid "Unable to parse URI {value}"
msgstr ""
#: documents/views.py:290 documents/views.py:2479
#: documents/views.py:288 documents/views.py:2465
msgid "Invalid more_like_id"
msgstr ""
#: documents/views.py:1513
#: documents/views.py:1511
msgid "Invalid AI configuration."
msgstr ""
#: documents/views.py:1522
msgid "AI backend request timed out."
msgstr ""
#: documents/views.py:2304 documents/views.py:2625
#: documents/views.py:2290 documents/views.py:2606
msgid "Specify only one of text, title_search, query, or more_like_id."
msgstr ""
#: documents/views.py:4441
#: documents/views.py:4357
#, python-format
msgid "Insufficient permissions to share document %(id)s."
msgstr ""
#: documents/views.py:4487
#: documents/views.py:4403
msgid "Bundle is already being processed."
msgstr ""
#: documents/views.py:4547
#: documents/views.py:4463
msgid "The share link bundle is still being prepared. Please try again later."
msgstr ""
#: documents/views.py:4557
#: documents/views.py:4473
msgid "The share link bundle is unavailable."
msgstr ""
@@ -1939,158 +1935,154 @@ msgid "Sets the LLM output language"
msgstr ""
#: paperless/models.py:370
msgid "Sets the LLM timeout in seconds"
msgstr ""
#: paperless/models.py:376
msgid "paperless application settings"
msgstr ""
#: paperless/settings/__init__.py:545
#: paperless/settings/__init__.py:539
msgid "English (US)"
msgstr ""
#: paperless/settings/__init__.py:546
#: paperless/settings/__init__.py:540
msgid "Arabic"
msgstr ""
#: paperless/settings/__init__.py:547
#: paperless/settings/__init__.py:541
msgid "Afrikaans"
msgstr ""
#: paperless/settings/__init__.py:548
#: paperless/settings/__init__.py:542
msgid "Belarusian"
msgstr ""
#: paperless/settings/__init__.py:549
#: paperless/settings/__init__.py:543
msgid "Bulgarian"
msgstr ""
#: paperless/settings/__init__.py:550
#: paperless/settings/__init__.py:544
msgid "Catalan"
msgstr ""
#: paperless/settings/__init__.py:551
#: paperless/settings/__init__.py:545
msgid "Czech"
msgstr ""
#: paperless/settings/__init__.py:552
#: paperless/settings/__init__.py:546
msgid "Danish"
msgstr ""
#: paperless/settings/__init__.py:553
#: paperless/settings/__init__.py:547
msgid "German"
msgstr ""
#: paperless/settings/__init__.py:554
#: paperless/settings/__init__.py:548
msgid "Greek"
msgstr ""
#: paperless/settings/__init__.py:555
#: paperless/settings/__init__.py:549
msgid "English (GB)"
msgstr ""
#: paperless/settings/__init__.py:556
#: paperless/settings/__init__.py:550
msgid "Spanish"
msgstr ""
#: paperless/settings/__init__.py:557
#: paperless/settings/__init__.py:551
msgid "Persian"
msgstr ""
#: paperless/settings/__init__.py:558
#: paperless/settings/__init__.py:552
msgid "Finnish"
msgstr ""
#: paperless/settings/__init__.py:559
#: paperless/settings/__init__.py:553
msgid "French"
msgstr ""
#: paperless/settings/__init__.py:560
#: paperless/settings/__init__.py:554
msgid "Hungarian"
msgstr ""
#: paperless/settings/__init__.py:561
#: paperless/settings/__init__.py:555
msgid "Indonesian"
msgstr ""
#: paperless/settings/__init__.py:562
#: paperless/settings/__init__.py:556
msgid "Italian"
msgstr ""
#: paperless/settings/__init__.py:563
#: paperless/settings/__init__.py:557
msgid "Japanese"
msgstr ""
#: paperless/settings/__init__.py:564
#: paperless/settings/__init__.py:558
msgid "Korean"
msgstr ""
#: paperless/settings/__init__.py:565
#: paperless/settings/__init__.py:559
msgid "Luxembourgish"
msgstr ""
#: paperless/settings/__init__.py:566
#: paperless/settings/__init__.py:560
msgid "Norwegian"
msgstr ""
#: paperless/settings/__init__.py:567
#: paperless/settings/__init__.py:561
msgid "Dutch"
msgstr ""
#: paperless/settings/__init__.py:568
#: paperless/settings/__init__.py:562
msgid "Polish"
msgstr ""
#: paperless/settings/__init__.py:569
#: paperless/settings/__init__.py:563
msgid "Portuguese (Brazil)"
msgstr ""
#: paperless/settings/__init__.py:570
#: paperless/settings/__init__.py:564
msgid "Portuguese"
msgstr ""
#: paperless/settings/__init__.py:571
#: paperless/settings/__init__.py:565
msgid "Romanian"
msgstr ""
#: paperless/settings/__init__.py:572
#: paperless/settings/__init__.py:566
msgid "Russian"
msgstr ""
#: paperless/settings/__init__.py:573
#: paperless/settings/__init__.py:567
msgid "Slovak"
msgstr ""
#: paperless/settings/__init__.py:574
#: paperless/settings/__init__.py:568
msgid "Slovenian"
msgstr ""
#: paperless/settings/__init__.py:575
#: paperless/settings/__init__.py:569
msgid "Serbian"
msgstr ""
#: paperless/settings/__init__.py:576
#: paperless/settings/__init__.py:570
msgid "Swedish"
msgstr ""
#: paperless/settings/__init__.py:577
#: paperless/settings/__init__.py:571
msgid "Turkish"
msgstr ""
#: paperless/settings/__init__.py:578
#: paperless/settings/__init__.py:572
msgid "Ukrainian"
msgstr ""
#: paperless/settings/__init__.py:579
#: paperless/settings/__init__.py:573
msgid "Vietnamese"
msgstr ""
#: paperless/settings/__init__.py:580
#: paperless/settings/__init__.py:574
msgid "Chinese Simplified"
msgstr ""
#: paperless/settings/__init__.py:581
#: paperless/settings/__init__.py:575
msgid "Chinese Traditional"
msgstr ""
+2
View File
@@ -28,6 +28,7 @@ from documents.views import GlobalSearchView
from documents.views import IndexView
from documents.views import LogViewSet
from documents.views import MergeDocumentsView
from documents.views import OcrTemplateViewSet
from documents.views import PostDocumentView
from documents.views import RemoteVersionView
from documents.views import RemovePasswordDocumentsView
@@ -86,6 +87,7 @@ api_router.register(r"workflow_triggers", WorkflowTriggerViewSet)
api_router.register(r"workflow_actions", WorkflowActionViewSet)
api_router.register(r"workflows", WorkflowViewSet)
api_router.register(r"custom_fields", CustomFieldViewSet)
api_router.register(r"ocr_templates", OcrTemplateViewSet)
api_router.register(r"config", ApplicationConfigurationViewSet)
api_router.register(r"processed_mail", ProcessedMailViewSet)
+1 -1
View File
@@ -87,7 +87,7 @@ def build_localization_prompt(suggestions: dict, output_language: str) -> str:
Return the same JSON schema with all fields present.
Suggestions:
{json.dumps(suggestions, ensure_ascii=False)}
{json.dumps(suggestions)}
""".strip()
@@ -239,23 +239,6 @@ def test_get_language_name_falls_back_to_language_code():
assert get_language_name("zz-zz") == "zz-zz"
def test_build_localization_prompt_preserves_unicode_characters():
prompt = build_localization_prompt(
{
"title": "Gebührenbescheid",
"tags": [],
"correspondents": [],
"document_types": [],
"storage_paths": [],
"dates": [],
},
output_language="de-de",
)
assert "Gebührenbescheid" in prompt
assert "\\u00fc" not in prompt
@patch("paperless_ai.ai_classifier.query_similar_documents")
def test_get_context_for_document(
mock_query_similar_documents,